In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import lightgbm

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier


In [2]:
# set options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# 예측한 값으로 성능을 평가하는 함수
def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred) # 오차 행렬
    accuracy = accuracy_score(y_test, pred) # 정확도
    precision = precision_score(y_test, pred) # 정밀도
    recall = recall_score(y_test, pred) # 재현율
    f1 = f1_score(y_test, pred) # F1
    roc_auc = roc_auc_score(y_test, pred_proba) # ROC-AUC
    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

### Test2

In [4]:
vote_df = pd.read_csv("C:\\Users\\kimsj\\OneDrive\\바탕 화면\\대학교\\git\\AI-project\\data\\train.csv", encoding="utf-8")
test_df = pd.read_csv("C:\\Users\\kimsj\\OneDrive\\바탕 화면\\대학교\\git\\AI-project\\data\\test.csv", encoding="utf-8")
# 인덱스 제거
vote_df.drop("index", axis=1, inplace=True)
test_df.drop("index", axis=1, inplace=True)

In [5]:
# 0값을 검사할 피처명 리스트 객체 설정
zero_features = ['education', 'engnat', 'hand', 'urban']

# tp 추가
for i in range(1, 11):
    s = 'tp' + chr(48+(i//10)) + chr(48+(i%10))
    zero_features.append(s)
    
# zero_features 리스트 내부에 저장된 개별 피처들에 대해 0값을 평균 값으로 대체
vote_df[zero_features] = vote_df[zero_features].replace(0, vote_df[zero_features].mean())

In [6]:
vote_df.sample(10)

Unnamed: 0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,QfA,QfE,QgA,QgE,QhA,QhE,QiA,QiE,QjA,QjE,QkA,QkE,QlA,QlE,QmA,QmE,QnA,QnE,QoA,QoE,QpA,QpE,QqA,QqE,QrA,QrE,QsA,QsE,QtA,QtE,age_group,education,engnat,familysize,gender,hand,married,race,religion,tp01,tp02,tp03,tp04,tp05,tp06,tp07,tp08,tp09,tp10,urban,voted,wf_01,wf_02,wf_03,wr_01,wr_02,wr_03,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13
31757,4.0,625,1.0,810,1.0,1945,1.0,885,4.0,1091,1.0,619,1.0,380,2.0,1186,5.0,1249,1.0,1476,5.0,1166,5.0,1505,4.0,1268,1.0,7586,4.0,1259,1.0,556,5.0,1077,2.0,1338,1.0,452,1.0,561,60s,2.0,1.0,4,Female,1.0,2,White,Christian_Other,1.0,2.0,1.0,3.025614,1.333342,6.0,2.022704,5.0,1.0,5.0,3.0,0,0,1,0,1,1,0,1,1,0,1,1,1,1,1,1,1
17158,5.0,188,4.0,843,4.0,5129,1.0,403,5.0,114,3.0,497,1.0,309,1.0,909,2.0,1518,5.0,649,3.0,78,5.0,738,4.0,561,5.0,84,2.0,232,4.0,66,2.0,352,4.0,423,4.0,852,3.0,509,10s,2.0,1.0,3,Female,1.0,1,White,Other,2.0,2.6193,1.0,3.025614,4.0,2.0,1.0,6.0,4.0,5.0,3.0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0
34194,1.0,418,5.0,162,5.0,857,1.0,1534,1.0,1232,1.0,691,1.0,563,4.0,4618,1.0,16945,5.0,1888,1.0,1316,5.0,2047,4.0,1429,1.0,735,4.0,1785,5.0,1243,1.0,1602,1.0,932,2.0,2372,5.0,1022,40s,2.0,1.0,4,Female,1.0,3,White,Christian_Catholic,2.0,4.0,2.0,2.0,1.0,4.0,2.0,2.0,1.0,6.0,2.0,1,1,1,0,1,1,0,1,1,0,1,1,0,1,1,1,1
17289,1.0,616,4.0,850,2.0,1463,1.0,1140,2.0,1523,5.0,1098,1.0,574,1.0,1843,4.0,4118,4.0,2017,5.0,896,5.0,1158,4.0,893,5.0,575,2.0,852,4.0,823,3.0,719,4.0,702,4.0,560,4.0,1091,10s,2.0,1.0,2,Male,1.0,1,White,Christian_Catholic,1.0,6.0,2.0,5.0,1.333342,3.0,2.022704,4.0,2.0,5.0,2.0,0,1,0,0,1,1,0,1,1,0,1,1,0,1,0,1,1
17738,1.0,412,1.0,1494,1.0,644,1.0,1299,4.0,2053,5.0,468,1.0,662,4.0,1720,5.0,2782,5.0,1585,5.0,920,5.0,1062,1.0,946,4.0,636,4.0,991,4.0,1843,4.0,510,4.0,1124,1.0,998,3.0,1503,20s,2.0,1.0,3,Female,1.0,1,White,Atheist,2.0,2.0,1.755717,2.0,1.333342,6.0,1.0,6.0,2.0,6.0,3.0,1,0,0,1,1,1,0,1,1,1,1,1,0,1,0,1,1
25002,1.0,476,2.0,1073,5.0,1438,1.0,2294,1.0,1842,4.0,2581,4.0,783,4.0,2393,4.0,2196,3.0,10558,2.0,1917,5.0,1013,4.0,1673,4.0,709,2.0,1231,4.0,949,2.0,763,4.0,1589,4.0,817,4.0,1512,20s,3.0,2.0,1,Female,1.0,1,White,Atheist,1.0,1.0,6.0,1.0,1.0,6.0,1.0,5.0,6.0,6.0,3.0,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0
22555,1.0,302,4.0,822,5.0,1021,1.0,1031,3.0,1286,3.0,739,1.0,491,4.0,1244,2.0,1265,5.0,1173,2.0,2232,1.0,1803,5.0,714,2.0,754,5.0,399,5.0,440,2.0,89,4.0,741,5.0,472,5.0,599,20s,3.0,2.0,3,Male,1.0,1,Asian,Christian_Catholic,1.0,2.0,3.0,2.0,1.0,6.0,1.0,4.0,2.0,6.0,3.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
29130,1.0,385,5.0,674,2.0,2423,1.0,697,1.0,1020,2.0,529,2.0,906,4.0,1353,4.0,1111,4.0,1515,5.0,716,5.0,1052,2.0,678,2.0,432,2.0,687,5.0,764,4.0,620,2.0,672,1.0,1213,2.0,938,40s,3.0,1.0,3,Female,1.0,2,White,Christian_Other,6.0,2.0,1.755717,5.0,1.0,1.0,1.0,6.0,1.0,4.0,1.0,1,0,0,0,0,1,0,0,1,0,1,1,0,1,0,1,1
16551,4.0,1310,1.0,767,3.0,877,2.0,1432,5.0,1144,3.0,507,4.0,1379,2.0,1040,4.0,1331,3.0,1301,5.0,1472,5.0,1014,1.0,930,4.0,964,1.0,670,1.0,1036,5.0,1257,4.0,1107,4.0,433,1.0,2458,30s,3.0,1.0,3,Female,1.0,3,White,Christian_Other,6.0,2.0,1.0,2.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,0,0,1,1,0,1,1,0,1,1,0,1,0,1,1
29728,2.0,421,3.0,2645,5.0,1901,1.0,983,3.0,1213,1.0,506,3.0,992,3.0,1387,2.0,5164,4.0,2777,1.0,5100,5.0,1516,5.0,621,1.0,444,4.0,892,4.0,806,1.0,5495,1.0,3393,3.0,1104,5.0,4124,10s,2.0,1.0,3,Male,1.0,1,White,Atheist,7.0,2.6193,7.0,7.0,1.0,2.0,7.0,7.0,1.0,7.0,1.0,1,0,0,0,1,1,0,0,1,0,1,1,0,1,0,0,1


In [7]:
### F1 = 0.7021

# 레이블 인코딩
encoder = LabelEncoder()
encoder2 = LabelEncoder()
encoder.fit(vote_df['age_group'])
encoder2.fit(test_df['age_group'])
labels = encoder.transform(vote_df['age_group'])
labels2 = encoder2.transform(test_df['age_group'])
vote_df['age_group'] = labels
test_df['age_group'] = labels2

def get_categoty_age(age_num):
    num = age_num;
    if age_num == 0: num =7

    return num
vote_df["age_group"] = vote_df["age_group"].apply(lambda x : get_categoty_age(x))
test_df["age_group"] = test_df["age_group"].apply(lambda x : get_categoty_age(x))

#원핫인코딩 진행
def dummy_data(data, columns) :
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix = column)], axis=1)
        data = data.drop(column, axis=1)
    return data

dummy_columns = ['engnat', 'gender','hand', 'married', 'race', 'religion', 'urban']
vote_df = dummy_data(vote_df, dummy_columns)
test_df = dummy_data(test_df, dummy_columns)

# feature 분리 및 학습세트/평가세트 분리
y_labels = vote_df.loc[:,'voted']
X_features = vote_df.drop('voted', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, random_state=156)


# 데이터 정규화(스케일링)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# oversampling
from imblearn.over_sampling import RandomOverSampler
over_sampler = RandomOverSampler(random_state=156)
X_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train)


# 학습
evals = [(X_test, y_test)]
xgb_model = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3, objective="binary:logistic", random_state=156)
xgb_model.fit(X_train_over, y_train_over, early_stopping_rounds=100, eval_metric="logloss", eval_set=evals, verbose=True)
w_preds = xgb_model.predict(X_test)
w_pred_proba = xgb_model.predict_proba(X_test)[:,1]

get_clf_eval(y_test, w_preds, w_pred_proba)



[0]	validation_0-logloss:0.67451
[1]	validation_0-logloss:0.65922
[2]	validation_0-logloss:0.64650
[3]	validation_0-logloss:0.63601
[4]	validation_0-logloss:0.62740
[5]	validation_0-logloss:0.62014
[6]	validation_0-logloss:0.61389
[7]	validation_0-logloss:0.60861
[8]	validation_0-logloss:0.60415
[9]	validation_0-logloss:0.60013
[10]	validation_0-logloss:0.59676
[11]	validation_0-logloss:0.59393
[12]	validation_0-logloss:0.59126
[13]	validation_0-logloss:0.58900
[14]	validation_0-logloss:0.58706
[15]	validation_0-logloss:0.58541
[16]	validation_0-logloss:0.58400
[17]	validation_0-logloss:0.58281
[18]	validation_0-logloss:0.58163
[19]	validation_0-logloss:0.58077
[20]	validation_0-logloss:0.57987
[21]	validation_0-logloss:0.57879
[22]	validation_0-logloss:0.57812
[23]	validation_0-logloss:0.57758
[24]	validation_0-logloss:0.57703
[25]	validation_0-logloss:0.57633
[26]	validation_0-logloss:0.57594
[27]	validation_0-logloss:0.57548
[28]	validation_0-logloss:0.57508
[29]	validation_0-loglos



In [8]:
### 데이터 정규화(스케일링), oversampling 적용 X

# 레이블 인코딩
encoder = LabelEncoder()
encoder2 = LabelEncoder()
encoder.fit(vote_df['age_group'])
encoder2.fit(test_df['age_group'])
labels = encoder.transform(vote_df['age_group'])
labels2 = encoder.transform(test_df['age_group'])
vote_df['age_group'] = labels
test_df['age_group'] = labels2

def get_categoty_age(age_num):
    num = age_num;
    if age_num == 0: num =7

    return num
vote_df["age_group"] = vote_df["age_group"].apply(lambda x : get_categoty_age(x))
test_df["age_group"] = test_df["age_group"].apply(lambda x : get_categoty_age(x))

#원핫인코딩 진행
def dummy_data(data, columns) :
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix = column)], axis=1)
        data = data.drop(column, axis=1)
    return data

dummy_columns = ['engnat', 'gender','hand', 'married', 'race', 'religion', 'urban']
vote_df = dummy_data(vote_df, dummy_columns)
test_df = dummy_data(test_df, dummy_columns)

KeyError: 'engnat'

In [None]:
# 학습
y_labels = vote_df.loc[:,'voted']
X_features = vote_df.drop('voted', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, random_state=156)

evals = [(X_test, y_test)]

xgb_model = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3, objective="binary:logistic", random_state=156)
xgb_model.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="logloss", eval_set=evals, verbose=True)
w_preds = xgb_model.predict(X_test)
w_pred_proba = xgb_model.predict_proba(X_test)[:,1]

get_clf_eval(y_test, w_preds, w_pred_proba)

In [None]:
from xgboost import plot_importance
fig, ax = plt.subplots(1,1, figsize=(10,8))
plot_importance(xgb_model, ax=ax, max_num_features=20, height=0.4)

In [9]:
vote_df.sample(10)

Unnamed: 0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,QfA,QfE,QgA,QgE,QhA,QhE,QiA,QiE,QjA,QjE,QkA,QkE,QlA,QlE,QmA,QmE,QnA,QnE,QoA,QoE,QpA,QpE,QqA,QqE,QrA,QrE,QsA,QsE,QtA,QtE,age_group,education,familysize,tp01,tp02,tp03,tp04,tp05,tp06,tp07,tp08,tp09,tp10,voted,wf_01,wf_02,wf_03,wr_01,wr_02,wr_03,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13,engnat_1.0,engnat_1.2727522306108443,engnat_2.0,gender_Female,gender_Male,hand_1.0,hand_1.170487302676733,hand_2.0,hand_3.0,married_0,married_1,married_2,married_3,race_Arab,race_Asian,race_Black,race_Indigenous Australian,race_Native American,race_Other,race_White,religion_Agnostic,religion_Atheist,religion_Buddhist,religion_Christian_Catholic,religion_Christian_Mormon,religion_Christian_Other,religion_Christian_Protestant,religion_Hindu,religion_Jewish,religion_Muslim,religion_Other,religion_Sikh,urban_1.0,urban_2.0,urban_2.1795744680851064,urban_3.0
11016,1.0,512,5.0,1304,5.0,735,1.0,914,1.0,1340,5.0,671,1.0,868,5.0,2069,5.0,3597,4.0,1361,5.0,1388,5.0,1507,4.0,1121,1.0,790,5.0,673,5.0,1435,1.0,962,1.0,807,5.0,535,5.0,1142,2,3.0,1,1.0,2.6193,1.755717,5.0,1.333342,5.0,5.0,3.509952,5.0,6.0,1,0,0,0,1,1,0,1,1,0,1,1,0,1,0,1,1,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3577,1.0,672,2.0,1066,4.0,567,2.0,661,2.0,1266,2.0,905,3.0,622,2.0,1510,2.0,1965,2.0,690,5.0,857,5.0,1210,4.0,1206,1.0,506,5.0,878,4.0,769,4.0,321,4.0,683,5.0,762,2.0,648,1,2.0,2,5.0,6.0,5.0,4.0,1.0,1.0,2.022704,1.0,1.0,4.0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
29182,2.0,562,1.0,1358,1.0,753,1.0,1360,4.0,1145,1.0,756,1.0,575,4.0,1735,5.0,3377,2.0,195,4.0,2097,5.0,1491,2.0,1436,2.0,1486,4.0,3755,4.0,1042,5.0,523,5.0,966,3.0,1202,4.0,2516,3,4.0,2,1.0,6.0,4.0,2.0,1.0,6.0,2.022704,1.0,5.0,6.0,1,0,0,0,0,1,0,0,1,0,1,1,0,1,0,1,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
5800,1.0,370,5.0,1364,1.0,915,1.0,769,1.0,864,5.0,1039,3.0,1510,1.0,1532,3.0,2183,2.0,2606,1.0,1303,5.0,1155,5.0,945,1.0,645,1.0,983,1.0,1114,1.0,919,1.0,928,3.0,693,5.0,2621,3,3.0,7,1.0,2.0,1.755717,6.0,1.333342,2.0,5.0,4.0,2.27234,6.0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
23271,1.0,387,2.0,1119,5.0,757,1.0,1010,3.0,1737,2.0,1173,1.0,691,1.0,1171,1.0,527,5.0,1279,5.0,1694,5.0,1064,5.0,573,2.0,561,4.0,843,1.0,871,1.0,636,3.0,1077,5.0,678,5.0,1154,7,1.0,3,3.018174,2.0,1.0,1.0,1.333342,5.0,2.022704,1.0,3.0,6.0,0,0,0,0,1,1,0,1,1,1,1,1,0,1,0,1,1,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
20137,2.0,1774,2.0,673,2.0,737,2.0,728,3.0,623,2.0,383,3.0,628,1.0,812,4.0,897,4.0,828,5.0,704,5.0,840,4.0,521,4.0,553,4.0,2034,3.0,679,3.0,713,5.0,1094,4.0,569,3.0,716,1,4.0,2,2.0,6.0,1.755717,6.0,1.333342,2.0,2.022704,5.0,2.27234,6.0,0,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
30380,3.0,1059,1.0,2525,1.0,1584,1.0,2191,1.0,3171,1.0,495,2.0,1980,1.0,2807,4.0,1971,2.0,3928,5.0,967,5.0,2901,2.0,750,4.0,1420,3.0,2085,4.0,1895,4.0,2548,1.0,950,1.0,967,4.0,1850,1,2.0,4,3.0,2.6193,2.0,2.0,1.333342,1.0,1.0,5.0,4.0,6.0,1,0,0,0,1,1,0,1,1,0,1,1,0,1,1,1,1,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
35120,1.0,553,2.0,1031,5.0,788,2.0,2508,2.0,1500,2.0,768,1.0,932,2.0,1375,2.0,1480,3.0,4855,5.0,1885,2.0,1059,4.0,2225,2.0,450,4.0,973,2.0,1662,2.0,990,3.0,1437,1.0,1590,5.0,829,3,3.0,3,6.0,6.0,1.0,2.0,2.0,1.0,4.0,5.0,3.0,5.0,0,0,0,0,1,1,0,1,1,1,1,1,0,1,1,1,1,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
5463,1.0,1515,2.0,664,5.0,577,5.0,163,3.0,627,5.0,348,5.0,432,1.0,764,5.0,419,5.0,1036,5.0,453,5.0,1562,1.0,1693,5.0,723,5.0,470,5.0,453,5.0,327,4.0,1406,2.0,1935,4.0,1017,1,2.0,2,6.0,2.0,5.0,6.0,1.333342,1.0,2.022704,5.0,2.0,1.0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
32812,2.0,695,5.0,935,5.0,740,1.0,399,1.0,1592,1.0,1329,2.0,1484,1.0,4209,2.0,10017,5.0,695,5.0,1102,5.0,1093,3.0,1759,4.0,1812,2.0,1957,2.0,935,1.0,598,1.0,7635,3.0,792,4.0,1190,1,2.0,2,6.0,2.6193,2.0,3.025614,1.0,1.0,5.0,3.509952,6.0,6.0,0,0,0,0,1,1,0,1,1,1,1,1,1,1,0,1,1,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [None]:
#evals = [(X_test, y_test)]

xgb_model2 = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3, objective="binary:logistic", random_state=156)
xgb_model2.fit(X_features.sample(test_df.shape[0]), y_labels.sample(test_df.shape[0]), eval_metric="logloss")
w_preds2 = xgb_model.predict(test_df)
w_pred_proba2 = xgb_model.predict_proba(test_df)[:,1]

---

In [63]:
import csv
f= open("test.csv", 'w', newline='')

for i in range(test_df.shape[0]):
    wr = csv.writer(f)
    wr.writerow([w_preds2[i]])
f.close()

In [53]:
w_preds2[1]

1

In [77]:
sum = 0
sum1 = 0
for i in range(test_df.shape[0]):
    if w_pred_proba2[i] >= 0.8: sum +=1
    else :sum1 += 1
print(sum , sum1)

39 9068
