In [106]:
!mkdir module

## preprocessing

In [186]:
%%writefile ./module/preprocessing1st.py

import numpy as np
import pandas as pd
from sklearn.model_selection import (train_test_split, cross_val_score, StratifiedKFold ,GridSearchCV)
import warnings
warnings.filterwarnings('ignore')

cates = {
    'education' : ['Less than high school','High school','University degree','Graduate degree'],
    'urban' : ['Rural','Suburban','Urban'],
    'gender' : ['Male','Female','Other'],
    'engnat' : ['Yes','No'],
    'hand' : ['Right','Left','Both'],
    'religion' : ['Agnostic','Atheist','Buddhist','Christian(Catholic)','Christian(Mormon)','Christian(Protestant)','Christian(othrer)','Hindu','Jewish','Muslim','Sikh','Other'],
    'orientation' : ['Heterosexual','Bisexual','Homosexual','Asexual','Other'],
    'race' : ['Asian','Arab','Black','Indigenous Australian','Native American','White','Other'],
    'voted' : ['1', '0'],
    'married' : ['Never married','Currently married','Previously married'],
}


class Preprocessing1st():
    
    def __init__(self, data, without_comma=False):
        self.data = data
        self.df = self.read_data(without_comma)
        self.df_eda = self.preprocessing_eda()
        
    def read_data(self, without_comma):

        if without_comma==False:
            df = pd.read_csv(self.data, delimiter="\t")
        
        else:
            df = pd.read_csv(self.data)
            
        return df
        
        
    def tran_cate(self, df, cate, x, y):
        self.df[cate] = self.df[cate].astype('str').replace(x, y)
        return self.df[cate]

    def preprocessing_eda(self):
        # 직관적 EDA를 위해 컬럼명 수정
        self.df.rename(columns = {"Q1A" : "Q1_TP_notell_2u", "Q2A" : "Q2_TP_ppl_nd_dangun", "Q3A" : "Q3_TN_do_moral", "Q4A" : "Q4_VN_ppl_good", "Q5A" : "Q5_VP_ppl_bad", "Q6A" : "Q6_TN_hnsty_best", "Q7A" : "Q7_TN_lying_bad", "Q8A" : "Q8_VP_ppl_lazy", "Q9A" : "Q9_MN_humble_hnst", "Q10A" : "Q10_TN_hnstly_ask", "Q11A" : "Q11_VN_leader_clean", "Q12A" : "Q12_TP_trust_trouble", "Q13A" : "Q13_VP_ppl_criminal", "Q14A" : "Q14_VN_ppl_brave", "Q15A" : "Q15_TP_abu_good", "Q16A" : "Q16_TN_ppl_good", "Q17A" : "Q17_VN_ppl_notbad", "Q18A" : "Q18_VP_komsu_better", "Q19A" : "Q19_MP_anrocksa_ok", "Q20A" : "Q20_VP_money_good",
                     "Q1E" : "Q1E_notell_2u", "Q2E" : "Q2E_ppl_nd_dangun", "Q3E" : "Q3E_do_moral", "Q4E" : "Q4E_ppl_good", "Q5E" : "Q5E_ppl_bad", "Q6E" : "Q6E_hnsty_best", "Q7E" : "Q7E_lying_bad", "Q8E" : "Q8E_ppl_lazy", "Q9E" : "Q9E_humble_hnst", "Q10E" : "Q10E_hnstly_ask", "Q11E" : "Q11E_leader_clean", "Q12E" : "Q12E_trust_trouble", "Q13E" : "Q13E_ppl_criminal", "Q14E" : "Q14E_ppl_brave", "Q15E" : "Q15E_abu_good", "Q16E" : "Q16E_ppl_good", "Q17E" : "Q17E_ppl_notbad", "Q18E" : "Q18E_komsu_better", "Q19E" : "Q19E_anrocksa_ok", "Q20E" : "Q20E_money_good",
                     "TIPI1":"TYP_out", "TIPI2":"TYP_fight", "TIPI3":"TYP_depnd", "TIPI4":"TYP_anx", "TIPI5":"TYP_try", "TIPI6":"TYP_quiet", "TIPI7":"TYP_warm", "TIPI8":"TYP_disorg", "TIPI9":"TYP_calm", "TIPI10":"TYP_stable",
                     "VCL6" : "VCL6_F", "VCL9" : "VCL9_F", "VCL12" : "VCL12_F"
                    }, inplace=True)        
        
        # score 컬럼 추가
        col_list = list(self.df.columns)
        pos_col = []
        neg_col = []

        for col in col_list:
            if "P" in col and "Y" not in col:
                pos_col.append(col)
            if "N" in col:
                neg_col.append(col)

        self.df["score"] = self.df[pos_col].sum(axis=1) + self.df[neg_col].apply(lambda x: 6 -x).sum(axis=1)
        
        # V, T, M score 컬럼 추가 
        v_score = []
        t_score = []
        m_score = []

        for col in col_list:
            if "T" in col:
                t_score.append(col)
            if "M" in col:
                m_score.append(col)
            if "V" in col:
                v_score.append(col)
        self.df["v_score"] = self.df[v_score].sum(axis=1)
        self.df["t_score"] = self.df[t_score].sum(axis=1)
        self.df["m_score"] = self.df[m_score].sum(axis=1)
        
                
        # 텍스트 데이터로 변환, 시간 데이터 초단위로 환산
        vcl_col = []
        sec_col = []

        for col in col_list:
            if "VCL" in col:
                vcl_col.append(col)
            if "E" in col:
                sec_col.append(col)
        
        self.df[vcl_col] = self.df[vcl_col].applymap(lambda x: str(x).replace("1", "know") if x==1 
                                           else str(x).replace("0", "n_know"))
        self.df[sec_col] = self.df[sec_col].apply(lambda x: round(x*0.001))
        
        for x in list(cates.keys()):
            for idx, y in enumerate(cates[x]):
                if x == 'race':
                    self.tran_cate(self.df, x, "{}".format((idx+1)*10) ,y)
                else:    
                    self.tran_cate(self.df, x, "{}".format(idx+1) ,y)

        
        # 나이 18세 이하 데이터 drop
        df = self.df[self.df["age"]>17]
        
        
        # EDA용 csv 생성
        df.to_csv("../MACH_data/raw_data_for_EDA.csv", index=False)
        
        return df
                    
                    
    def preprocessing_model(self):                
        # major 컬럼 drop
        self.df_eda.drop(columns = "major", inplace = True)
        col_list2 = list(self.df_eda.columns)

        
        # null, 0 데이터 제거
        self.df_eda.dropna(inplace=True)
        zero_idx = []
        for col in col_list2:
            zero_idx += list((self.df_eda[(self.df_eda[col] == 0)].index))
        zero_idx = list(set(zero_idx))
        self.df_eda.drop(zero_idx, inplace=True)

        
        # train, test로 나누기 
        df_X = self.df_eda.drop('voted', axis=1)
        df_X = pd.get_dummies(df_X)
        
        
        # 추가 확인된 0 데이터 제거
        col_in_0 = [col for col in df_X.columns if '_0' in col]
        df_X.drop(col_in_0, axis=1, inplace=True)
        
        df_y = self.df_eda['voted'].astype('int')
        
                
        X_train, X_test, y_train, y_test=\
        train_test_split(df_X, df_y, test_size=0.2,
                         random_state=13, stratify=df_y)    

        
        return X_train, X_test, y_train, y_test

Overwriting ./module/preprocessing1st.py


## nth

In [172]:
%%writefile ./module/preprocessing_nth.py

import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, classification_report)
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import (train_test_split, cross_val_score, StratifiedKFold ,GridSearchCV)
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

# N차 전처리 클래스(feature 선별, robust scaling, weight 컬럼 추가)
class PreprocessingNth():
    def __init__(self):
        ada = AdaBoostClassifier()
        gbc = GradientBoostingClassifier()
        xgb = XGBClassifier()
        lgbm = LGBMClassifier()
        self.models = [ada, gbc, xgb, lgbm]
        self.model_names = ['Ada', 'GBC', 'XGB', 'LGBM']
        
        
    def feature_selection(self, *xy_train_test):
        X_train, X_test, y_train, y_test = xy_train_test
        
        datas = []
        for model in self.models:
            model.fit(X_train, y_train)
        
        # 모델 fit 이후 feature_importances_ 0인 컬럼을 선별하기 위함.
        ada_fi = self.models[0].feature_importances_
        gbc_fi = self.models[1].feature_importances_
        xgb_fi = self.models[2].feature_importances_
        lgbm_fi = self.models[3].feature_importances_
        
        ada_fm = pd.DataFrame(zip(X_train.columns, ada_fi))
        ada_list1 = list(ada_fm[ada_fm[1]==0][0])
        gbc_fm = pd.DataFrame(zip(X_train.columns, gbc_fi))
        gbc_list1 = list(gbc_fm[gbc_fm[1]==0][0])
        xgb_fm = pd.DataFrame(zip(X_train.columns, xgb_fi))
        xgb_list1 = list(xgb_fm[xgb_fm[1]==0][0])
        lgbm_fm = pd.DataFrame(zip(X_train.columns, lgbm_fi))
        lgbm_list1 = list(lgbm_fm[lgbm_fm[1]==0][0])
        
        # 4가지 모델에서의 feature_importances_가 모두 0인 컬럼들의 교집합 확인. 
        ada_gbc1 = list(set(ada_list1).intersection(gbc_list1))
        ada_gbc_xgb1 = list(set(ada_gbc1).intersection(xgb_list1))
        ada_gbc_xgb_lgbm1 = list(set(ada_gbc_xgb1).intersection(lgbm_list1))
        
        # 해당 컬럼들을 제외한 X데이터를 생성
        X_train = X_train.drop(ada_gbc_xgb_lgbm1, axis=1)
        X_test = X_test.drop(ada_gbc_xgb_lgbm1, axis=1)
        
        # feature_selection 결과 데이터 저장
        self.feature_selection_xy = X_train, X_test, y_train, y_test
        
        return self.feature_selection_xy
    
    

    def scale_robust(self, *xy_train_test):
        X_train, X_test, y_train, y_test = xy_train_test
        
        num_cols = []
        for col in X_train.columns:
            if ("E" in col) | ("age" in col) |("family" in col) |("elapse" in col):
                num_cols.append(col)
                
        rbscale = RobustScaler().fit(X_train[num_cols])
        X_train[num_cols] = rbscale.transform(X_train[num_cols])
        X_test[num_cols] = rbscale.transform(X_test[num_cols])
        
        # scale_robust 결과 데이터 저장
        self.scale_robust_xy = X_train, X_test, y_train, y_test

        return self.scale_robust_xy

       
    def feature_addition(self, *xy_train_test, column="score", voted="voted",col_name="rate"):
        X_train, X_test, y_train, y_test = xy_train_test

        df_tr = pd.concat([X_train, y_train], axis=1)
        df_te = X_test
        add_all_tr = df_tr[[voted, column]].groupby(column).count()
        add_yes_tr = df_tr[[voted, column]].groupby(column).sum()
        add_no_tr = add_all_tr - add_yes_tr
        df_add_tr = round((add_yes_tr - add_no_tr)/ add_all_tr, 4)
        df_add_tr = df_add_tr.rename(columns={voted:col_name})
        
        df1_tr = pd.merge(left=df_tr, right=df_add_tr, how="left", right_index=True, left_on=column)
        df1_te = pd.merge(left=df_te, right=df_add_tr, how="left", right_index=True, left_on=column)
        X_train=df1_tr.drop(voted, axis=1)
        y_train=pd.DataFrame(df1_tr[voted])
        X_test=df1_te.fillna(0)
        
        # feature_addition 결과 데이터 저장
        self.feature_addition_xy = X_train, X_test, y_train, y_test

        return self.feature_addition_xy

Writing ./module/preprocessing_nth.py


In [124]:
## 1차 전처리 데이터 가져오기

In [118]:
from module import preprocessing1st as pre1

In [119]:
pre1 = pre1.Preprocessing1st("./MACH_data/data.csv")

In [120]:
xy = pre1.preprocessing_model()

In [None]:
## Nth 전처리 시도

In [163]:
nth = PreprocessingNth()

In [164]:
fs = nth.feature_selection(*xy)

In [154]:
rb = nth.scale_robust(*nth.feature_selection_xy)

In [168]:
rb[0].tail()

Unnamed: 0,Q1_TP_notell_2u,Q1I,Q1E_notell_2u,Q2_TP_ppl_nd_dangun,Q2I,Q2E_ppl_nd_dangun,Q3_TN_do_moral,Q3I,Q3E_do_moral,Q4_VN_ppl_good,...,orientation_Homosexual,race_Arab,race_Asian,race_Black,race_Native American,race_Other,race_White,married_Currently married,married_Never married,married_Previously married
13302,3.0,4.0,0.0,2.0,17.0,-0.2,1.0,9.0,-0.166667,2.0,...,0,0,0,0,0,0,1,0,1,0
35701,2.0,20.0,-0.166667,2.0,3.0,0.0,4.0,19.0,-0.166667,4.0,...,0,0,0,0,0,0,1,0,1,0
67361,5.0,9.0,1.5,1.0,2.0,0.2,4.0,10.0,0.666667,1.0,...,0,0,0,0,0,0,1,0,0,1
68523,5.0,9.0,0.166667,1.0,11.0,0.2,1.0,20.0,1.666667,5.0,...,0,0,0,1,0,0,0,0,1,0
34740,4.0,4.0,0.0,2.0,17.0,-0.6,3.0,16.0,-0.833333,2.0,...,0,0,0,0,0,0,1,0,1,0


In [165]:
fa = nth.feature_addition(*nth.feature_selection_xy)

In [169]:
fa[0]["rate"].tail()

13302   -0.0509
35701    0.0826
67361    0.0305
68523    0.0305
34740    0.0774
Name: rate, dtype: float64

## modeling


In [188]:
%%writefile ./module/modeling_score.py

import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, classification_report)
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import (train_test_split, cross_val_score, StratifiedKFold ,GridSearchCV)
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

# ada, gbc, xgb, lgbm 모델링 클래스 
class Modeling:    
    def __init__(self, *xy_train_test):
        ada = AdaBoostClassifier()
        gbc = GradientBoostingClassifier()
        xgb = XGBClassifier()
        lgbm = LGBMClassifier()
        self.datas = []
        self.models = [ada, gbc, xgb, lgbm]
        self.model_names = ['Ada', 'GBC', 'XGB', 'LGBM']
        
        self.X_train, self.X_test, self.y_train, self.y_test = xy_train_test
        
        
        # 분류모델 평가지표 계산함수 (AUC, ACC를 우선순위로 사용함)
    def get_score(self, pred):
        acc = accuracy_score(self.y_test, pred)
        pre = precision_score(self.y_test, pred)
        rec = recall_score(self.y_test, pred)
        f1 = f1_score(self.y_test, pred)
        auc = roc_auc_score(self.y_test, pred)
       
        return acc, auc, pre, rec, f1
    
    
    def fit_model(self, model):

        model.fit(self.X_train, self.y_train)
        y_pre_tr = model.predict(self.X_train)
        self.y_pre_test = model.predict(self.X_test)
        total_score = self.get_score(self.y_pre_test)
        
        return total_score

    
    def models_score_df(self):
        cols_names = ['accuracy', 'AUC', 'precision', 'recall', 'f1']

        for model in self.models:
            self.datas.append(self.fit_model(model))

        df = pd.DataFrame(self.datas, columns=cols_names, index=self.model_names)
            
        return print(df) 
    
    
    # 평가지표와 confusion matrix 출력 함수
    def print_score(self):
        datas = []
        for model in self.models:
            datas.append(self.fit_model(model))
        
            acc, auc, pre, rec, f1 = datas[0]
            con = confusion_matrix(self.y_test, self.y_pre_test)
            print('='*20)
            print(model)
            print('confusion matrix')
            print(con)
            print('='*20)

            print('Accuracy: {0:.4f}, AUC: {1:.4f}'.format(acc, auc))
            print('Recall: {0:.4f}, f1_score: {1:.4f}, precision: {2:.4f}'.format(rec, f1, pre))
            print('='*20)

Overwriting ./module/modeling_score.py


In [176]:
md = Modeling(*nth.feature_addition_xy)

In [178]:
mdsc = md.print_score()

AdaBoostClassifier()
confusion matrix
[[3030 2549]
 [1563 4524]]
Accuracy: 0.6475, AUC: 0.6432
Recall: 0.7432, f1_score: 0.6875, precision: 0.6396
GradientBoostingClassifier()
confusion matrix
[[3016 2563]
 [1495 4592]]
Accuracy: 0.6475, AUC: 0.6432
Recall: 0.7432, f1_score: 0.6875, precision: 0.6396
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
confusion matrix
[[3212 2367]
 [1780 4307]]
Accuracy: 0.6475, AUC: 0.6432
Recall: 0.7432, f1_score: 0.6875, precisio

In [179]:
md.models_score_df()

Unnamed: 0,accuracy,AUC,precision,recall,f1
Ada,0.647523,0.643166,0.639615,0.743223,0.687538
GBC,0.652152,0.647497,0.641789,0.754395,0.693551
XGB,0.644523,0.641652,0.64534,0.707574,0.675025
LGBM,0.655666,0.651807,0.64905,0.74043,0.691735


In [189]:
%%writefile ./module/ml_project_result_machia_voted.py

import preprocessing1st as pre1
import preprocessing_nth as prenth
import modeling_score as mdsc

# 1st preprocessing
pre = pre1.Preprocessing1st("../MACH_data/data.csv")
xy = pre.preprocessing_model()


# Nth preprocessing
nth = prenth.PreprocessingNth()

# feature_selection
fs = nth.feature_selection(*xy)

# feature_addition
fa = nth.feature_addition(*fs)


# Modeling & Result
md = mdsc.Modeling(*fa)
md.print_score()
md.models_score_df()

Overwriting ./module/ml_project_result_machia_voted.py
