# 클래스 생성

## x,y 가져와서 훈련

### 자료 가져오기

In [3]:
df1 = pd.read_csv("../MACH_data/data_cutoff_age18.csv")

### x : getdummies, y:labelencoding

In [4]:
y = df1["voted"]
x = df1.drop("voted", axis=1)

In [5]:
x.shape, y.shape

((54718, 107), (54718,))

In [6]:
X = pd.get_dummies(x)

In [7]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
Y = lb.fit_transform(y)
Y   # no = 0, yes = 1

array([[1],
       [0],
       [1],
       ...,
       [0],
       [1],
       [0]])

In [8]:
X.shape, Y.shape

((54718, 339), (54718, 1))

## 클래스 생성

### train_test

In [11]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, classification_report)

from sklearn.model_selection import (train_test_split, cross_val_score, StratifiedKFold ,GridSearchCV)
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings('ignore')
class Train_models:    
    def __init__(self,X,y):
        ada = AdaBoostClassifier()
        gbc = GradientBoostingClassifier()
        xgb = XGBClassifier()
        lgbm = LGBMClassifier()
        self.datas = []
        self.models = [ada, gbc, xgb, lgbm]
        self.model_names = ['Ada', 'GBC', 'XGB', 'LGBM']
        
        self.X_train, self.X_test, self.y_train, self.y_test=\
        train_test_split(X, y, test_size=0.2,
                         random_state=13, stratify=y)    

    def get_score(self, pred):
        acc = accuracy_score(self.y_test, pred)
        pre = precision_score(self.y_test, pred)
        rec = recall_score(self.y_test, pred)
        f1 = f1_score(self.y_test, pred)
        auc = roc_auc_score(self.y_test, pred)
       
        return acc, auc, pre, rec, f1
    
    def fit_model(self, model):

        model.fit(self.X_train, self.y_train)
        y_pre_tr = model.predict(self.X_train)
        self.y_pre_test = model.predict(self.X_test)
        total_score = self.get_score(self.y_pre_test)
        
        return total_score

    def models_score_df(self):
        cols_names = ['accuracy', 'AUC', 'precision', 'recall', 'f1']

        for model in self.models:
            self.datas.append(self.fit_model(model))

        return pd.DataFrame(self.datas, columns=cols_names, index=self.model_names)
    

    def print_score(self):
        datas = []
        for model in self.models:
            datas.append(self.fit_model(model))
        
            acc, auc, pre, rec, f1 = datas[0]
            con = confusion_matrix(self.y_test, self.y_pre_test)
            print('='*20)
            print(model)
            print('confusion matrix')
            print(con)
            print('='*20)

            print('Accuracy: {0:.4f}, AUC: {1:.4f}'.format(acc, auc))
            print('Recall: {0:.4f}, f1_score: {1:.4f}, precision: {2:.4f}'.format(rec, f1, pre))
            print('='*20)

In [12]:
train=Train_models(X,Y)

In [13]:
train.print_score()

AdaBoostClassifier()
confusion matrix
[[2713 2465]
 [1417 4349]]
Accuracy: 0.6453, AUC: 0.6391
Recall: 0.7542, f1_score: 0.6914, precision: 0.6382
GradientBoostingClassifier()
confusion matrix
[[2648 2530]
 [1346 4420]]
Accuracy: 0.6453, AUC: 0.6391
Recall: 0.7542, f1_score: 0.6914, precision: 0.6382
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
confusion matrix
[[2897 2281]
 [1588 4178]]
Accuracy: 0.6453, AUC: 0.6391
Recall: 0.7542, f1_score: 0.6914, precisio

In [14]:
train.models_score_df()

Unnamed: 0,accuracy,AUC,precision,recall,f1
Ada,0.645285,0.639098,0.638245,0.754249,0.691415
GBC,0.645651,0.638785,0.635788,0.766563,0.695078
XGB,0.646473,0.642037,0.646849,0.724592,0.683517
LGBM,0.656067,0.650296,0.648605,0.757718,0.698928


## 추가 컬럼 생성 클래스

In [39]:
class Add_columns:
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        
    def score(self, column="score", voted="voted",col_name="rate"):
        df_tr = pd.concat([self.X_train, self.y_train], axis=1)
        df_te = self.X_test
        add_all_tr = df_tr[[voted, column]].groupby(column).count()
        add_yes_tr = df_tr[[voted, column]].groupby(column).sum()
        add_no_tr = add_all_tr - add_yes_tr
        df_add_tr = (add_yes_tr - add_no_tr)/ add_all_tr
        df_add_tr = df_add_tr.rename(columns={voted:col_name})
        
        df1_tr = pd.merge(left=df_tr, right=df_add_tr, how="left", right_index=True, left_on=column)
        df1_te = pd.merge(left=df_te, right=df_add_tr, how="left", right_index=True, left_on=column)
        self.train_X=df1_tr.drop(voted, axis=1)
        self.train_Y=pd.DataFrame(df1_tr[voted])
        self.test_X=df1_te
        self.test_Y=y_test
        return self.train_X, self.test_X, self.train_Y, self.test_Y

# 연습

In [40]:
X_test = pd.read_csv('../MACH_data/X_test_data_without_0feature.csv' )

In [41]:
X_train = pd.read_csv('../MACH_data/X_train_data_without_0feature.csv' )

In [42]:
y_test = pd.read_csv('../MACH_data/Y_test_data_without_0feature.csv' )

In [43]:
y_train = pd.read_csv('../MACH_data/Y_train_data_without_0feature.csv' )

In [44]:
aaa=Add_columns(X_train, X_test, y_train, y_test)

In [45]:
aaa

<__main__.Add_columns at 0x7fa2a1809190>

In [46]:
train_x, test_x, train_y, test_y = aaa.score(column = "age", voted = "voted", col_name="age_voted")
    

In [47]:
train_x

Unnamed: 0,Q1_TP_notell_2u,Q1I,Q1E_notell_2u,Q2_TP_ppl_nd_dangun,Q2I,Q2E_ppl_nd_dangun,Q3_TN_do_moral,Q3I,Q3E_do_moral,Q4_VN_ppl_good,...,race_Arab,race_Asian,race_Black,race_Native American,race_Other,race_White,married_Currently married,married_Never married,married_Previously married,age_voted
0,5.0,11.0,3.0,5.0,18.0,4.0,1.0,14.0,4.0,1.0,...,0,0,0,0,0,1,0,1,0,0.008451
1,3.0,19.0,8.0,2.0,8.0,6.0,5.0,7.0,5.0,3.0,...,0,0,0,0,0,1,0,1,0,0.076705
2,5.0,7.0,5.0,4.0,18.0,3.0,5.0,11.0,1.0,4.0,...,0,0,0,0,0,1,0,1,0,0.104478
3,2.0,17.0,7.0,1.0,12.0,4.0,4.0,14.0,4.0,4.0,...,0,1,0,0,0,0,0,1,0,-0.700822
4,5.0,13.0,6.0,5.0,20.0,5.0,2.0,4.0,6.0,1.0,...,0,0,0,0,0,1,0,1,0,0.049358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43769,4.0,19.0,6.0,5.0,3.0,19.0,3.0,9.0,10.0,3.0,...,0,1,0,0,0,0,0,1,0,0.030132
43770,5.0,3.0,11.0,5.0,11.0,7.0,5.0,16.0,7.0,3.0,...,0,0,1,0,0,0,0,1,0,0.008451
43771,4.0,7.0,11.0,3.0,12.0,6.0,2.0,13.0,82.0,2.0,...,0,0,0,0,0,1,0,1,0,-0.013829
43772,4.0,17.0,5.0,5.0,19.0,3.0,5.0,13.0,13.0,4.0,...,0,0,0,0,0,1,0,1,0,-0.700822


In [49]:
train_x, test_x, train_y, test_y = aaa.score(column = "score", voted = "voted", column)

In [12]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, classification_report)

from sklearn.model_selection import (train_test_split, cross_val_score, StratifiedKFold ,GridSearchCV)
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings('ignore')
class Train_models:    
    def __init__(self,X_train, X_test, y_train, y_test):
        ada = AdaBoostClassifier()
        gbc = GradientBoostingClassifier()
        xgb = XGBClassifier()
        lgbm = LGBMClassifier()
        self.datas = []
        self.models = [ada, gbc, xgb, lgbm]
        self.model_names = ['Ada', 'GBC', 'XGB', 'LGBM']
        
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def get_score(self, pred):
        acc = accuracy_score(self.y_test, pred)
        pre = precision_score(self.y_test, pred)
        rec = recall_score(self.y_test, pred)
        f1 = f1_score(self.y_test, pred)
        auc = roc_auc_score(self.y_test, pred)
       
        return acc, auc, pre, rec, f1
    
    def fit_model(self, model):

        model.fit(self.X_train, self.y_train)
        y_pre_tr = model.predict(self.X_train)
        self.y_pre_test = model.predict(self.X_test)
        total_score = self.get_score(self.y_pre_test)
        
        return total_score

    def models_score_df(self):
        cols_names = ['accuracy', 'AUC', 'precision', 'recall', 'f1']

        for model in self.models:
            self.datas.append(self.fit_model(model))

        return pd.DataFrame(self.datas, columns=cols_names, index=self.model_names)
    

    def print_score(self):
        datas = []
        for model in self.models:
            datas.append(self.fit_model(model))
        
            acc, auc, pre, rec, f1 = datas[0]
            con = confusion_matrix(self.y_test, self.y_pre_test)
            print('='*20)
            print(model)
            print('confusion matrix')
            print(con)
            print('='*20)

            print('Accuracy: {0:.4f}, AUC: {1:.4f}'.format(acc, auc))
            print('Recall: {0:.4f}, f1_score: {1:.4f}, precision: {2:.4f}'.format(rec, f1, pre))
            print('='*20)

In [None]:
a, b, c, d = aaa.score(column="age", voted="voted",col_name="age_voted")

In [38]:
train=Train_models(aaa.score(column="age", voted="voted",col_name="age_voted"))

TypeError: __init__() missing 3 required positional arguments: 'X_test', 'y_train', and 'y_test'

In [14]:
train.print_score()

AdaBoostClassifier()
confusion matrix
[[2689 2494]
 [1345 4416]]
Accuracy: 0.6492, AUC: 0.6427
Recall: 0.7665, f1_score: 0.6970, precision: 0.6391
GradientBoostingClassifier()
confusion matrix
[[2676 2507]
 [1270 4491]]
Accuracy: 0.6492, AUC: 0.6427
Recall: 0.7665, f1_score: 0.6970, precision: 0.6391
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
confusion matrix
[[2902 2281]
 [1574 4187]]
Accuracy: 0.6492, AUC: 0.6427
Recall: 0.7665, f1_score: 0.6970, precisio

In [15]:
train.models_score_df()

Unnamed: 0,accuracy,AUC,precision,recall,f1
Ada,0.649214,0.642673,0.639074,0.766534,0.697025
GBC,0.654879,0.647928,0.641755,0.779552,0.703974
XGB,0.647752,0.643345,0.647341,0.726784,0.684766
LGBM,0.659448,0.653855,0.651339,0.759764,0.701386


In [None]:
train_x, test_x, train_y, test_y 

In [36]:
a, b, c, d = aaa.score(column="age", voted="voted",col_name="age_voted")

In [37]:
a

Unnamed: 0,Q1_TP_notell_2u,Q1I,Q1E_notell_2u,Q2_TP_ppl_nd_dangun,Q2I,Q2E_ppl_nd_dangun,Q3_TN_do_moral,Q3I,Q3E_do_moral,Q4_VN_ppl_good,...,race_Arab,race_Asian,race_Black,race_Native American,race_Other,race_White,married_Currently married,married_Never married,married_Previously married,age_voted
0,5.0,11.0,3.0,5.0,18.0,4.0,1.0,14.0,4.0,1.0,...,0,0,0,0,0,1,0,1,0,
1,3.0,19.0,8.0,2.0,8.0,6.0,5.0,7.0,5.0,3.0,...,0,0,0,0,0,1,0,1,0,0.374765
2,5.0,7.0,5.0,4.0,18.0,3.0,5.0,11.0,1.0,4.0,...,0,0,0,0,0,1,0,1,0,0.400000
3,2.0,17.0,7.0,1.0,12.0,4.0,4.0,14.0,4.0,4.0,...,0,1,0,0,0,0,0,1,0,0.215686
4,5.0,13.0,6.0,5.0,20.0,5.0,2.0,4.0,6.0,1.0,...,0,0,0,0,0,1,0,1,0,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43769,4.0,19.0,6.0,5.0,3.0,19.0,3.0,9.0,10.0,3.0,...,0,1,0,0,0,0,0,1,0,0.333333
43770,5.0,3.0,11.0,5.0,11.0,7.0,5.0,16.0,7.0,3.0,...,0,0,1,0,0,0,0,1,0,0.324675
43771,4.0,7.0,11.0,3.0,12.0,6.0,2.0,13.0,82.0,2.0,...,0,0,0,0,0,1,0,1,0,0.230769
43772,4.0,17.0,5.0,5.0,19.0,3.0,5.0,13.0,13.0,4.0,...,0,0,0,0,0,1,0,1,0,0.250000


In [16]:
bbb = Add_columns(train_x, test_x, train_y, test_y )

In [18]:
a, b, c, d = bbb.score(column="age", voted="voted",col_name="age_voted")

In [28]:
a.iloc[0,:].age

21.0

In [34]:
a[a["age_voted"].isnull()]["age"]

0        21
53       40
64       18
111      27
154      40
         ..
43686    33
43699    34
43707    24
43713    27
43718    24
Name: age, Length: 2825, dtype: int64

# 연습 하나더

In [59]:
class User2:

    def __init__(self, train_obj, add_obj):
        self.train_obj = train_obj
        self.add_obj = add_obj
        
    def disp(self):
        print(self.train_obj.name, self.add_obj.email)
        
    def added(self):
        a, b, c, d = self.add_obj(column = "score", voted = "voted")
        return a, b, c, d

# 그리드 서치 cv

In [None]:
from sklearn.model_selection import GridSearchCV
gridsearch = GridSearchCV(estimator=XGB, param_grid=params, cv =5)
gridsearch.fit(X_train, y_train)
gridsearch.best_params_