In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import lightgbm

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score

In [2]:
# set options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
vote_df = pd.read_csv("C:\\Users\\kimsj\\OneDrive\\바탕 화면\\대학교\\git\\AI-project\\data\\train.csv", encoding="utf-8")
print("dataset shape: ", vote_df.shape)

dataset shape:  (36425, 78)


In [4]:
# 예측한 값으로 성능을 평가하는 함수
def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred) # 오차 행렬
    accuracy = accuracy_score(y_test, pred) # 정확도
    precision = precision_score(y_test, pred) # 정밀도
    recall = recall_score(y_test, pred) # 재현율
    f1 = f1_score(y_test, pred) # F1
    roc_auc = roc_auc_score(y_test, pred_proba) # ROC-AUC
    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [5]:
# 인덱스 제거
vote_df.drop("index", axis=1, inplace=True)

In [6]:
# tp 데이터 0값 전처리

In [7]:
Q_E = []
#Q_E 시간
for i in range(97, 117):
    a = 'Q' + chr(i) + 'E'
    Q_E.append(a)

In [8]:
# 시간 데이터 전처리 - 정확도 0.85 -> 0.94로 증가함 하지만 StandardScaler로 전처리는 의미가 없을 걸로 보임...
def get_categoty_time(Q_time):
    cat = 0
    if Q_time <= 1000 : cat = 1
    elif Q_time <= 2000 : cat = 2
    elif Q_time <= 3000 : cat = 3
    elif Q_time <= 4000 : cat = 4
    elif Q_time <= 5000 : cat = 5
    elif Q_time <= 6000 : cat = 6
    elif Q_time <= 7000 : cat = 7
    elif Q_time <= 8000 : cat = 8
    elif Q_time <= 9000 : cat = 9
    elif Q_time <=10000 : cat = 10
    elif Q_time <=15000 : cat = 11
    elif Q_time <=20000 : cat = 12
    else : Q_time = 13

    return cat


for Q_time in Q_E:
    vote_df[Q_time] = vote_df[Q_time].apply(lambda x : get_categoty_time(x))


In [8]:
#시간에 대해 이상치 데이터 인덱스 확인 및 제거
def get_outlier(df=None, column=None, weight=1.5):
    fraud = df[df['voted']==1][column]
    quantile_25 = np.percentile(fraud.values, 25)
    quantile_75 = np.percentile(fraud.values, 75)
    
    iqr = quantile_75 - quantile_25
    iqr_weight = iqr * weight
    lowest_val = quantile_25 - iqr_weight
    highest_val = quantile_75 + iqr_weight    
    outlier_index = fraud[(fraud < lowest_val) | (fraud > highest_val)].index    
    return outlier_index


for feature in Q_E:
    outlier_index = get_outlier(df=vote_df, column=feature, weight=1.5)
    print('이상치 데이터 인덱스 :', outlier_index)
    vote_df.drop(outlier_index, axis=0, inplace=True)

이상치 데이터 인덱스 : Int64Index([   21,    31,    77,   110,   124,   134,   214,   243,   253,
              267,
            ...
            36149, 36179, 36205, 36206, 36220, 36221, 36252, 36350, 36363,
            36373],
           dtype='int64', length=1193)
이상치 데이터 인덱스 : Int64Index([   43,    90,   143,   182,   190,   199,   250,   263,   302,
              312,
            ...
            36204, 36254, 36267, 36274, 36306, 36320, 36322, 36336, 36364,
            36387],
           dtype='int64', length=1171)
이상치 데이터 인덱스 : Int64Index([   35,    49,    94,    96,   101,   111,   114,   131,   171,
              270,
            ...
            36237, 36280, 36290, 36295, 36304, 36315, 36325, 36358, 36365,
            36375],
           dtype='int64', length=1068)
이상치 데이터 인덱스 : Int64Index([    9,    27,    33,    66,   258,   266,   274,   394,   400,
              422,
            ...
            35965, 36184, 36224, 36235, 36248, 36303, 36330, 36374, 36392,
            36394],
       

In [9]:
# 라벨 인코딩
encoder = LabelEncoder()
encoder.fit(vote_df['age_group'])
labels = encoder.transform(vote_df['age_group'])
vote_df['age_group'] = labels

In [10]:
def get_categoty_age(age_num):
    num = age_num;
    if age_num == 0: num =7

    return num
vote_df["age_group"] = vote_df["age_group"].apply(lambda x : get_categoty_age(x))
# x_test["age_group"] = x_test["age_group"].apply(lambda x : get_categoty_age(x))

In [11]:
#원핫인코딩 진행
def dummy_data(data, columns) :
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix = column)], axis=1)
        data = data.drop(column, axis=1)
    return data

dummy_columns = ['engnat', 'gender','hand', 'married', 'race', 'religion', 'urban']
vote_df = dummy_data(vote_df, dummy_columns)

In [12]:
vote_df.head(5)

Unnamed: 0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,QfA,QfE,QgA,QgE,QhA,QhE,QiA,QiE,QjA,QjE,QkA,QkE,QlA,QlE,QmA,QmE,QnA,QnE,QoA,QoE,QpA,QpE,QqA,QqE,QrA,QrE,QsA,QsE,QtA,QtE,age_group,education,familysize,tp01,tp02,tp03,tp04,tp05,tp06,tp07,tp08,tp09,tp10,voted,wf_01,wf_02,wf_03,wr_01,wr_02,wr_03,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13,engnat_0,engnat_1,engnat_2,gender_Female,gender_Male,hand_0,hand_1,hand_2,hand_3,married_0,married_1,married_2,married_3,race_Arab,race_Asian,race_Black,race_Indigenous Australian,race_Native American,race_Other,race_White,religion_Agnostic,religion_Atheist,religion_Buddhist,religion_Christian_Catholic,religion_Christian_Mormon,religion_Christian_Other,religion_Christian_Protestant,religion_Hindu,religion_Jewish,religion_Muslim,religion_Other,religion_Sikh,urban_0,urban_1,urban_2,urban_3
2,5.0,609,1.0,749,2.0,624,1.0,1833,3.0,1474,5.0,728,4.0,1107,3.0,1743,5.0,3008,3.0,1649,5.0,870,1.0,1551,1.0,989,5.0,347,1.0,824,1.0,1445,2.0,884,5.0,744,1.0,899,4.0,963,2,2,3,0,1,0,5,3,6,0,2,1,6,1,0,0,0,0,1,0,1,1,0,1,1,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
3,4.0,182,1.0,2969,1.0,1955,4.0,4630,1.0,1321,3.0,2345,4.0,850,1.0,3559,5.0,6761,1.0,8315,5.0,614,1.0,1751,1.0,1919,4.0,96,4.0,1251,5.0,261,4.0,548,4.0,2576,2.0,877,5.0,967,1,3,3,2,2,4,5,0,1,0,2,0,4,0,0,0,0,0,1,0,0,1,0,1,1,0,1,0,1,1,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
5,2.0,747,2.0,1877,5.0,739,1.0,991,3.0,1577,1.0,805,2.0,961,3.0,1417,1.0,13432,5.0,1000,5.0,1315,5.0,1192,5.0,17827,1.0,445,4.0,742,4.0,2901,2.0,1057,2.0,2090,4.0,610,4.0,1146,1,1,1,6,1,2,3,1,2,2,0,4,6,0,0,0,0,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
7,1.0,1719,2.0,5379,5.0,3039,1.0,1260,1.0,2297,3.0,1004,1.0,1291,4.0,1117,3.0,3965,4.0,2363,4.0,2097,5.0,1485,4.0,1510,1.0,2759,5.0,1144,4.0,1738,1.0,1284,2.0,6302,2.0,877,5.0,1894,2,3,2,3,4,2,5,1,2,2,2,2,3,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
8,1.0,467,5.0,736,5.0,614,2.0,571,1.0,185,5.0,272,4.0,413,4.0,986,2.0,1037,5.0,497,5.0,1076,5.0,1551,5.0,616,5.0,399,4.0,549,4.0,663,1.0,364,3.0,508,4.0,1041,4.0,467,1,2,1,5,2,0,1,1,3,4,2,6,2,0,1,1,1,1,1,0,1,1,0,1,1,0,1,0,1,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0


In [13]:
# Standard Scaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(vote_df) 
train_scaled = scaler.transform(vote_df)
train_df_scaler = pd.DataFrame(data=train_scaled, columns=vote_df.columns)

In [14]:
y_labels = vote_df.loc[:,'voted']
X_features = train_df_scaler.drop('voted', axis=1)
print("피처 데이터 shape:{0}".format(X_features.shape))

피처 데이터 shape:(24623, 105)


In [15]:
X_features.sample(10)

Unnamed: 0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,QfA,QfE,QgA,QgE,QhA,QhE,QiA,QiE,QjA,QjE,QkA,QkE,QlA,QlE,QmA,QmE,QnA,QnE,QoA,QoE,QpA,QpE,QqA,QqE,QrA,QrE,QsA,QsE,QtA,QtE,age_group,education,familysize,tp01,tp02,tp03,tp04,tp05,tp06,tp07,tp08,tp09,tp10,wf_01,wf_02,wf_03,wr_01,wr_02,wr_03,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13,engnat_0,engnat_1,engnat_2,gender_Female,gender_Male,hand_0,hand_1,hand_2,hand_3,married_0,married_1,married_2,married_3,race_Arab,race_Asian,race_Black,race_Indigenous Australian,race_Native American,race_Other,race_White,religion_Agnostic,religion_Atheist,religion_Buddhist,religion_Christian_Catholic,religion_Christian_Mormon,religion_Christian_Other,religion_Christian_Protestant,religion_Hindu,religion_Jewish,religion_Muslim,religion_Other,religion_Sikh,urban_0,urban_1,urban_2,urban_3
11561,0.752068,-0.021603,0.622914,-0.011859,0.90903,-0.125775,-0.712129,-0.06045,-0.958197,0.000525,-0.840892,-0.017155,-1.01105,-0.063902,-0.933742,-0.019224,-0.734869,0.001579,0.261055,-0.030837,0.914461,-0.027349,0.570235,-0.03474,1.050272,-0.004845,-1.073567,-0.076672,1.159766,-0.0578,1.243086,-0.03073,-1.235178,-0.022953,0.154722,-0.00109,0.354827,-0.155961,1.120475,-0.023903,-0.095343,0.636189,0.973715,-1.03119,1.866567,-0.493688,-0.45927,-0.935632,0.81023,-1.180693,1.327359,-0.731169,0.407698,3.352323,-0.447747,-0.360143,0.65675,0.269909,2.0259,0.39697,0.361437,-0.674029,0.259455,0.175656,-0.49087,0.247661,-0.77751,0.244785,0.290974,-0.042789,0.605457,-0.602639,1.07313,-1.07313,-0.061905,0.414755,-0.340634,-0.201419,-0.046882,0.558853,-0.442705,-0.278386,-0.094293,-0.456023,-0.224055,-0.035505,-0.112177,-0.336984,0.724845,1.89778,-0.552281,-0.143822,-0.395324,-0.095383,-0.348086,-0.320525,-0.18003,-0.102296,-0.172685,-0.350793,-0.054903,-0.082386,-0.473533,1.196019,-0.814205
19199,-0.091469,-0.03221,-1.294177,-0.055874,0.90903,-0.091568,1.214556,0.001091,-0.958197,-0.061246,-0.087233,-0.016653,0.990645,-0.065037,-0.933742,-0.085024,0.622266,-0.021266,0.261055,-0.042927,0.201634,-0.071731,-3.024925,-0.025774,-1.180585,-0.006787,-0.389092,-0.075724,-0.886885,-0.033302,-1.468031,-0.03177,0.918112,-0.022265,-0.551698,-0.005095,0.354827,0.011865,0.454369,-0.022451,-0.884732,-0.410929,-1.037114,0.465891,-0.285267,0.096069,-0.45927,1.790041,-1.209389,-0.081098,-0.696391,-0.197856,-1.317372,-0.298301,-0.447747,-0.360143,0.65675,0.269909,-0.493608,0.39697,0.361437,-0.674029,0.259455,0.175656,-0.49087,0.247661,-0.77751,0.244785,0.290974,-0.042789,0.605457,-0.602639,-0.931854,0.931854,-0.061905,0.414755,-0.340634,-0.201419,-0.046882,0.558853,-0.442705,-0.278386,-0.094293,-0.456023,-0.224055,-0.035505,-0.112177,-0.336984,0.724845,-0.526931,-0.552281,-0.143822,-0.395324,-0.095383,-0.348086,3.11988,-0.18003,-0.102296,-0.172685,-0.350793,-0.054903,-0.082386,2.111787,-0.836107,-0.814205
4041,-0.091469,-0.02934,1.261944,-0.038312,0.20001,-0.12996,-0.712129,-0.059396,-0.220572,-0.040474,-0.087233,-0.018158,-1.01105,-0.068157,1.79231,-0.139493,-0.734869,-0.009215,1.013833,-0.039812,-0.511193,-0.034278,-0.328555,-0.035634,0.306653,-0.028243,0.979859,-0.073828,-0.886885,-0.06503,1.243086,-0.031928,-0.517415,0.000581,-0.551698,-0.011368,0.354827,-0.060235,1.120475,-0.026987,2.272821,1.683308,0.303439,1.463945,0.252692,-0.493688,-0.948354,1.790041,-1.209389,-0.630895,0.821422,-0.197856,-0.742348,-0.298301,2.233402,-0.360143,0.65675,0.269909,2.0259,0.39697,0.361437,1.483617,0.259455,0.175656,2.037199,0.247661,1.286157,0.244785,0.290974,-0.042789,0.605457,-0.602639,-0.931854,0.931854,-0.061905,0.414755,-0.340634,-0.201419,-0.046882,-1.789379,2.258839,-0.278386,-0.094293,-0.456023,-0.224055,-0.035505,-0.112177,-0.336984,0.724845,-0.526931,-0.552281,-0.143822,-0.395324,-0.095383,-0.348086,-0.320525,-0.18003,-0.102296,-0.172685,2.850682,-0.054903,-0.082386,2.111787,-0.836107,-0.814205
12586,0.752068,-0.021728,-0.016117,-0.010862,0.90903,-0.074283,0.251213,-0.045288,0.517054,-0.054282,2.173746,-0.017305,0.323413,-0.063902,-0.252229,-0.071606,1.300834,-0.021354,-0.491722,-0.033045,0.201634,-0.030024,0.570235,-0.023618,0.306653,-0.030095,1.664334,-0.085204,0.477549,-0.058123,0.565307,-0.015503,0.200349,-0.022229,0.861142,-0.009125,-0.383836,-0.138853,1.120475,-0.026783,0.694045,-0.410929,-0.366838,-0.532163,0.79065,-0.493688,0.029814,0.427204,-0.199579,0.4687,-0.696391,-1.264481,-0.742348,-0.298301,-0.447747,-0.360143,0.65675,0.269909,-0.493608,0.39697,0.361437,-0.674029,0.259455,0.175656,-0.49087,0.247661,-0.77751,0.244785,0.290974,-0.042789,0.605457,-0.602639,-0.931854,0.931854,-0.061905,0.414755,-0.340634,-0.201419,-0.046882,0.558853,-0.442705,-0.278386,-0.094293,-0.456023,-0.224055,-0.035505,-0.112177,2.9675,-1.379606,1.89778,-0.552281,-0.143822,-0.395324,-0.095383,-0.348086,-0.320525,-0.18003,-0.102296,-0.172685,-0.350793,-0.054903,-0.082386,-0.473533,-0.836107,1.228192
2394,-0.935007,-0.017423,-1.294177,0.071184,0.20001,0.000136,0.251213,0.009848,-0.958197,0.027716,-0.840892,0.005164,0.990645,0.266592,-0.933742,-0.02479,0.622266,0.04457,0.261055,0.038165,0.914461,0.054093,-0.328555,-0.000325,0.306653,0.004927,0.979859,0.038413,-0.886885,0.034255,0.565307,0.024188,-0.517415,0.019988,1.567562,0.011581,1.09349,0.010236,1.120475,-0.008071,1.483433,-0.410929,-0.366838,-1.530217,-0.285267,0.096069,1.007983,1.790041,1.82004,1.018498,1.327359,-0.197856,0.407698,-0.298301,-0.447747,-0.360143,0.65675,0.269909,-0.493608,0.39697,0.361437,1.483617,0.259455,0.175656,-0.49087,0.247661,-0.77751,0.244785,0.290974,-0.042789,0.605457,-0.602639,1.07313,-1.07313,-0.061905,0.414755,-0.340634,-0.201419,-0.046882,-1.789379,-0.442705,3.592136,-0.094293,-0.456023,-0.224055,-0.035505,-0.112177,-0.336984,0.724845,-0.526931,-0.552281,-0.143822,-0.395324,-0.095383,-0.348086,3.11988,-0.18003,-0.102296,-0.172685,-0.350793,-0.054903,-0.082386,2.111787,-0.836107,-0.814205
673,-0.091469,0.037297,0.622914,-0.027774,0.90903,0.146062,-0.712129,-0.029801,-0.220572,-0.015948,-0.840892,-0.013965,-1.01105,-0.020971,1.110797,-0.001532,1.300834,0.001667,-1.2445,-0.028742,0.914461,-0.007569,-0.328555,0.009411,-1.924204,-0.00351,-1.073567,-0.043493,-0.886885,-0.021323,0.565307,-0.008189,-0.517415,-0.014372,-1.258118,-0.008202,-1.861161,-0.133964,-0.877844,-0.009364,2.272821,-0.410929,0.973715,0.465891,1.328608,-0.493688,0.518898,-0.254214,-0.704484,-1.180693,1.327359,-0.731169,-1.317372,-0.298301,-0.447747,-0.360143,0.65675,0.269909,-0.493608,-2.51908,0.361437,-0.674029,0.259455,0.175656,-0.49087,0.247661,1.286157,0.244785,0.290974,-0.042789,0.605457,-0.602639,1.07313,-1.07313,-0.061905,0.414755,-0.340634,-0.201419,-0.046882,-1.789379,2.258839,-0.278386,-0.094293,-0.456023,-0.224055,-0.035505,-0.112177,-0.336984,0.724845,-0.526931,-0.552281,-0.143822,-0.395324,-0.095383,-0.348086,3.11988,-0.18003,-0.102296,-0.172685,-0.350793,-0.054903,-0.082386,-0.473533,-0.836107,1.228192
6776,0.752068,-0.022601,0.622914,-0.048415,-1.92705,-0.148156,2.177898,-0.013585,1.992305,-0.064819,1.420087,-0.020736,1.657876,-0.079316,-0.933742,-0.109277,1.300834,-0.020589,-1.997278,-0.041171,0.201634,-0.063749,0.570235,-0.042812,-0.436966,-0.026817,1.664334,-0.102647,-0.204668,-0.128486,-1.468031,-0.038579,0.918112,-0.020382,0.861142,-0.008405,1.09349,-0.209731,-1.54395,-0.022066,-0.095343,0.636189,-0.366838,-1.530217,1.328608,-0.493688,-0.948354,-0.935632,1.82004,-1.180693,-1.202329,-0.731169,0.982722,-0.298301,2.233402,-0.360143,0.65675,0.269909,-0.493608,0.39697,0.361437,-0.674029,0.259455,0.175656,-0.49087,0.247661,1.286157,0.244785,0.290974,-0.042789,0.605457,-0.602639,1.07313,-1.07313,-0.061905,0.414755,-0.340634,-0.201419,-0.046882,0.558853,-0.442705,-0.278386,-0.094293,-0.456023,-0.224055,-0.035505,-0.112177,-0.336984,0.724845,-0.526931,-0.552281,-0.143822,-0.395324,-0.095383,-0.348086,-0.320525,-0.18003,-0.102296,-0.172685,2.850682,-0.054903,-0.082386,-0.473533,1.196019,-0.814205
8434,-0.091469,-0.007003,-1.294177,-0.041998,-1.21803,-0.128505,0.251213,0.015929,1.25468,-0.050951,0.666427,-0.016362,1.657876,-0.070238,-0.933742,-0.10709,0.622266,-0.005831,-1.2445,-0.043975,0.914461,-0.034234,-0.328555,0.006608,-1.180585,-0.013707,0.295384,-0.033823,0.477549,0.416502,-1.468031,-0.026442,0.200349,-0.016653,0.861142,-0.00906,-0.383836,-0.064308,-1.54395,-0.022156,-0.884732,-1.458048,-0.366838,-0.532163,-0.285267,-0.493688,0.518898,0.427204,-0.199579,-0.081098,-0.696391,-0.731169,0.982722,-0.298301,-0.447747,-0.360143,0.65675,0.269909,-0.493608,0.39697,0.361437,-0.674029,0.259455,0.175656,-0.49087,0.247661,-0.77751,0.244785,0.290974,-0.042789,0.605457,-0.602639,-0.931854,0.931854,-0.061905,0.414755,-0.340634,-0.201419,-0.046882,0.558853,-0.442705,-0.278386,-0.094293,2.192871,-0.224055,-0.035505,-0.112177,-0.336984,-1.379606,1.89778,-0.552281,-0.143822,-0.395324,-0.095383,-0.348086,-0.320525,-0.18003,-0.102296,-0.172685,-0.350793,-0.054903,-0.082386,-0.473533,1.196019,-0.814205
20759,0.752068,-0.029901,-1.294177,-0.036881,-0.50901,-0.083017,-0.712129,-0.018531,1.992305,-0.049255,-0.840892,-0.017496,-1.01105,0.128059,-0.933742,0.030573,-1.413437,-0.016554,-1.997278,0.050142,0.914461,-0.030375,0.570235,-0.014467,-1.180585,-0.018684,-1.073567,0.080125,-1.569102,-0.079275,-1.468031,-0.002577,-1.235178,-0.02317,-0.551698,-0.007576,-1.861161,-0.137631,-0.211738,-0.02608,-0.884732,-0.410929,0.303439,-0.033136,-0.285267,-0.493688,-1.437439,1.790041,-0.704484,1.018498,-0.190453,-0.731169,0.982722,-0.298301,-0.447747,-0.360143,-1.52265,0.269909,-0.493608,0.39697,0.361437,-0.674029,0.259455,0.175656,-0.49087,0.247661,-0.77751,0.244785,0.290974,-0.042789,0.605457,-0.602639,1.07313,-1.07313,-0.061905,0.414755,-0.340634,-0.201419,-0.046882,0.558853,-0.442705,-0.278386,-0.094293,-0.456023,-0.224055,-0.035505,-0.112177,-0.336984,0.724845,-0.526931,-0.552281,-0.143822,2.529573,-0.095383,-0.348086,-0.320525,-0.18003,-0.102296,-0.172685,-0.350793,-0.054903,-0.082386,-0.473533,1.196019,-0.814205
21518,0.752068,-0.03221,-1.294177,-0.018364,-1.92705,-0.054814,0.251213,0.133334,0.517054,0.029896,1.420087,-0.011518,-1.01105,-0.051609,-0.933742,-0.002128,1.300834,-0.024325,-0.491722,-0.036301,0.201634,-0.022744,-0.328555,-0.008613,-1.180585,-0.021415,0.979859,-0.055817,-1.569102,-0.05726,-0.112472,-0.027892,0.918112,-0.013322,0.154722,-0.007259,-1.861161,-0.17307,-0.211738,2.379846,-0.095343,0.636189,-1.037114,1.463945,1.866567,1.865342,-0.45927,1.790041,-0.704484,0.4687,-0.190453,0.335457,-0.167325,-0.298301,-0.447747,-0.360143,-1.52265,0.269909,-0.493608,0.39697,0.361437,-0.674029,0.259455,0.175656,-0.49087,0.247661,1.286157,0.244785,0.290974,-0.042789,-1.651646,1.659369,-0.931854,0.931854,-0.061905,0.414755,-0.340634,-0.201419,-0.046882,0.558853,-0.442705,-0.278386,-0.094293,2.192871,-0.224055,-0.035505,-0.112177,-0.336984,-1.379606,-0.526931,-0.552281,-0.143822,-0.395324,-0.095383,-0.348086,-0.320525,-0.18003,-0.102296,-0.172685,2.850682,-0.054903,-0.082386,-0.473533,-0.836107,1.228192


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, random_state=0)
train_cnt = y_train.count()
test_cnt = y_test.count()
print("학습 세트 Shape:{0}, 테스트 세트 Shape:{1}".format(X_train.shape, X_test.shape))

print(" 학습 세트 레이블 값 분포 비율")
print(y_train.value_counts()/train_cnt)
print("\n 테스트 세트 레이블 값 분포 비율")
print(y_test.value_counts()/test_cnt)

학습 세트 Shape:(19698, 105), 테스트 세트 Shape:(4925, 105)
 학습 세트 레이블 값 분포 비율
0    0.810641
1    0.189359
Name: voted, dtype: float64

 테스트 세트 레이블 값 분포 비율
0    0.80203
1    0.19797
Name: voted, dtype: float64


In [17]:
from xgboost import XGBClassifier

xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3, random_state=156)
evals = [(X_test, y_test)]
xgb.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="logloss", eval_set=evals, verbose=True)
w_preds = xgb.predict(X_test)
w_pred_proba = xgb.predict_proba(X_test)[:,1]


get_clf_eval(y_test, w_preds, w_pred_proba)

[0]	validation_0-logloss:0.64693
[1]	validation_0-logloss:0.60893




[2]	validation_0-logloss:0.57747
[3]	validation_0-logloss:0.55088
[4]	validation_0-logloss:0.52801
[5]	validation_0-logloss:0.50888
[6]	validation_0-logloss:0.49139
[7]	validation_0-logloss:0.47665
[8]	validation_0-logloss:0.46351
[9]	validation_0-logloss:0.45225
[10]	validation_0-logloss:0.44195
[11]	validation_0-logloss:0.43271
[12]	validation_0-logloss:0.42495
[13]	validation_0-logloss:0.41742
[14]	validation_0-logloss:0.41063
[15]	validation_0-logloss:0.40478
[16]	validation_0-logloss:0.39924
[17]	validation_0-logloss:0.39416
[18]	validation_0-logloss:0.38943
[19]	validation_0-logloss:0.38520
[20]	validation_0-logloss:0.38110
[21]	validation_0-logloss:0.37762
[22]	validation_0-logloss:0.37412
[23]	validation_0-logloss:0.37072
[24]	validation_0-logloss:0.36762
[25]	validation_0-logloss:0.36449
[26]	validation_0-logloss:0.36166
[27]	validation_0-logloss:0.35904
[28]	validation_0-logloss:0.35627
[29]	validation_0-logloss:0.35367
[30]	validation_0-logloss:0.35119
[31]	validation_0-logl

[240]	validation_0-logloss:0.25578
[241]	validation_0-logloss:0.25572
[242]	validation_0-logloss:0.25571
[243]	validation_0-logloss:0.25565
[244]	validation_0-logloss:0.25559
[245]	validation_0-logloss:0.25555
[246]	validation_0-logloss:0.25556
[247]	validation_0-logloss:0.25552
[248]	validation_0-logloss:0.25551
[249]	validation_0-logloss:0.25544
[250]	validation_0-logloss:0.25527
[251]	validation_0-logloss:0.25519
[252]	validation_0-logloss:0.25519
[253]	validation_0-logloss:0.25513
[254]	validation_0-logloss:0.25506
[255]	validation_0-logloss:0.25499
[256]	validation_0-logloss:0.25495
[257]	validation_0-logloss:0.25495
[258]	validation_0-logloss:0.25492
[259]	validation_0-logloss:0.25494
[260]	validation_0-logloss:0.25488
[261]	validation_0-logloss:0.25482
[262]	validation_0-logloss:0.25482
[263]	validation_0-logloss:0.25484
[264]	validation_0-logloss:0.25474
[265]	validation_0-logloss:0.25469
[266]	validation_0-logloss:0.25462
[267]	validation_0-logloss:0.25460
[268]	validation_0-l



In [18]:
w_preds

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)