In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as plb
%matplotlib inline
import seaborn as sns
import matplotlib.font_manager as fm
font_name = fm.FontProperties(fname = 'C:\\Windows\\Fonts\\malgun.ttf').get_name()
plt.rc('font', family = font_name)
mpl.rcParams['axes.unicode_minus'] = False

In [3]:
# 노을 경로
train = pd.read_csv('C:\\Users\\Hyelin\\bigdata\\FinalProject\\csv정리\\train_new_2.csv',encoding="euc-kr",index_col = [0])
train=train.reset_index(drop=True)
pd.set_option('display.max_columns', 150)

## PyCaret 패키지 설치 (Install PyCaret)

In [None]:
!pip install pycaret

## 분류 작업에 필용한 함수 불러오기 (Import methods for classification task)

In [4]:
from pycaret.classification import *

## 실험 환경 구축 (Setup the environment)
#### PyCaret에서는 모델 학습 전 실험 환경을 구축 해주어야 합니다. setup 함수를 통해 환경을 구축할 수 있습니다.
#### setup 단계에서는 PyCaret이 자동으로 컬럼 형태를 인식합니다.(따로 전처리가 필요없는 이유) 그 후 사용자에게 제대로 인식되었는지 확인을 받게 됩니다. 그 때 enter를 눌러주시면 됩니다.
#### 또한 주어진 데이터의 얼마를 사용하여 train / validation을 구축할지 묻게 되는데, 전체 데이터를 사용하고 싶다면 enter 눌러주시면 됩니다.

In [5]:
# 'voted' 컬럼이 예측 대상이므로 target 인자에 명시
# 'voted' column is the target variable
clf = setup(data = train, target = 'voted')

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,180
1,Target Type,Binary
2,Label Encoded,"1: 0, 2: 1"
3,Original Data,"(45532, 139)"
4,Missing Values,False
5,Numeric Features,41
6,Categorical Features,97
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


## 모델 학습 및 비교 (Train models and compare)
#### 환경 구축을 했으니 PyCaret에서 제공하는 기본 모델에 대해 학습하고 비교해보겠습니다.
#### compared_models 함수를 통해 15개의 기본 모델을 학습하고 성능을 비교할 수 있습니다.
#### AUC 기준으로 성능이 가장 좋은 3개의 모델을 추려내어 저장해보겠습니다. 본 대회 평가지표가 AUC이기 때문에 AUC 기준으로 모델을 선정합니다.

In [6]:
best_3 = compare_models(sort = 'AUC', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Gradient Boosting Classifier,0.6959,0.7675,0.6492,0.7597,0.7001,0.3955,0.4006,73.4068
1,Light Gradient Boosting Machine,0.6948,0.7666,0.6493,0.7579,0.6993,0.3932,0.3981,1.2248
2,CatBoost Classifier,0.6939,0.7664,0.659,0.7508,0.7018,0.3902,0.3937,28.313
3,Linear Discriminant Analysis,0.6912,0.7612,0.6611,0.7454,0.7007,0.3841,0.3871,2.7872
4,Extra Trees Classifier,0.6907,0.7592,0.6339,0.7606,0.6914,0.3865,0.393,9.0179
5,Ada Boost Classifier,0.69,0.7569,0.6536,0.7477,0.6975,0.3825,0.3862,12.4086
6,Extreme Gradient Boosting,0.676,0.7479,0.6683,0.7193,0.6928,0.3511,0.3522,11.3714
7,Random Forest Classifier,0.6654,0.7212,0.6071,0.735,0.6649,0.3369,0.3431,0.8198
8,Decision Tree Classifier,0.6127,0.6095,0.6432,0.6467,0.6449,0.219,0.219,4.1679
9,Naive Bayes,0.4589,0.5522,0.0411,0.5434,0.0706,0.0039,0.0037,0.3537


## 모델 앙상블 (Model Ensemble)
#### 학습된 3개의 모델을 앙상블 시키도록 하겠습니다. 본 대회는 score 최적화를 위해 확률 값을 예측해야 하므로 soft vote ensemble을 진행하겠습니다. 

In [7]:
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7005,0.7751,0.65,0.7668,0.7036,0.4052,0.4108
1,0.7005,0.7707,0.6624,0.7593,0.7075,0.4037,0.4076
2,0.6903,0.7633,0.6426,0.7547,0.6941,0.3846,0.3897
3,0.6955,0.7644,0.6554,0.7553,0.7018,0.3939,0.398
4,0.7013,0.7737,0.6557,0.7645,0.7059,0.406,0.411
Mean,0.6976,0.7694,0.6532,0.7601,0.7026,0.3987,0.4034
SD,0.0042,0.0048,0.0066,0.0048,0.0047,0.0083,0.0083


In [8]:
blended_h = blend_models(estimator_list = best_3, fold = 5, method = 'hard')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6995,0.0,0.6486,0.7659,0.7024,0.403,0.4087
1,0.6985,0.0,0.6612,0.7567,0.7058,0.3995,0.4033
2,0.6925,0.0,0.6423,0.7585,0.6956,0.3892,0.3947
3,0.6941,0.0,0.6528,0.7546,0.7,0.3912,0.3955
4,0.7008,0.0,0.6525,0.7657,0.7046,0.4054,0.4107
Mean,0.6971,0.0,0.6515,0.7603,0.7017,0.3977,0.4026
SD,0.0032,0.0,0.0062,0.0047,0.0036,0.0064,0.0066


## 모델 예측 (Prediction)
#### 구축된 앙상블 모델을 통해 예측을 해보겠습니다.
#### setup 환경에 이미 hold-out set이 존재하므로 해당 데이터에 대해 예측을 하여 모델 성능을 확인하겠습니다.

In [9]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.692,0.7621,0.6407,0.7586,0.6947,0.3884,0.3941


In [10]:
pred_holdout_h = predict_model(blended_h)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.6911,0,0.6408,0.7571,0.6941,0.3866,0.392


## 전체 데이터에 대한 재학습 (Re-training the model on whole data)
#### 현재까지 실험은 주어진 train 데이터를 다시 한 번 train / validation으로 나눠서 실험을 한 것이므로, 전체 train 데이터에 학습되어 있지 않습니다.
#### 최적의 성능을 위해 전체 데이터에 학습을 시켜주도록 하겠습니다.

In [11]:
final_model = finalize_model(blended)

In [12]:
#final_model_h = finalize_model(blended_h)

## 대회용 test set에 대한 예측 (Predicting on test set for the competition)
#### predict_model 함수를 통해 재학습된 모델을 대회용 test set에 대해 예측해보겠습니다.
#### We will now use the re-trained model on the test set for the competition

In [15]:
test = pd.read_csv('C:\\Users\\Hyelin\\bigdata\\FinalProject\\TeamProject\open data\\test_x.csv',encoding="euc-kr",index_col = [0])
test=train.reset_index(drop=True)
pd.set_option('display.max_columns', 150)

In [16]:
predictions = predict_model(final_model, data = test)

In [17]:
#predictions_h = predict_model(final_model_h, data = test)

In [18]:
predictions

Unnamed: 0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,QfA,QfE,QgA,QgE,QhA,QhE,QiA,QiE,QjA,QjE,QkA,QkE,QlA,QlE,QmA,QmE,QnA,QnE,QoA,QoE,QpA,QpE,QqA,QqE,QrA,QrE,QsA,QsE,QtA,QtE,age_group,education,engnat,familysize,gender,hand,married,race,religion,tp01,tp02,tp03,tp04,tp05,tp06,tp07,tp08,tp09,tp10,urban,voted,wf_01,wf_02,wf_03,wr_01,wr_02,wr_03,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13,QaE_new,QbE_new,QcE_new,QdE_new,QeE_new,QfE_new,QgE_new,QhE_new,QiE_new,QjE_new,QkE_new,QlE_new,QmE_new,QnE_new,QoE_new,QpE_new,QqE_new,QrE_new,QsE_new,QtE_new,urban_new,age3,wf_new,wr_new,tp01_new,tp02_new,tp03_new,tp04_new,tp05_new,tp06_new,tp07_new,tp08_new,tp09_new,tp10_new,tp01-tp06,tp02-tp07,tp03-tp08,tp04-tp09,tp05-tp10,tp,race_new,age_10s,age_num,religion_yes,religion_no,urban_group,religion_OX,religion_siho,age1,age2,age4,urban_new1,urban_group1,race_white,QtotalE,Asian,Black,Other,White,Mach_score,mach_per,tp_score,Label,Score
0,3.0,363,4.0,1370,5.0,997,5.0,1024,4.0,1577,1.0,539,4.0,586,4.0,1095,1.0,1142,4.0,1287,2.0,883,4.0,851,2.0,851,1.0,816,2.0,579,2.0,924,4.0,366,4.0,876,2.0,633,1.0,1115,30s,2,1,4,Female,1,3,White,Other,2,4,2,5,2,5,7,2,4,3,1,2,0,0,0,0,1,0,0,1,0,1,1,0,1,0,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,3,0,0,1,1,1,0,1,0,2,1,1,1,1,-5,-2,-3,-1,tp07,3,0,3,0,1,0,1,4,1,1,1,1,0,1,1,0,0,0,1,2,1,3.6,2,0.6396
1,1.0,647,5.0,1313,3.0,3387,1.0,2969,5.0,4320,3.0,2190,5.0,826,1.0,4082,1.0,1867,3.0,1264,1.0,2943,4.0,3927,1.0,4329,1.0,1828,1.0,1214,5.0,2414,1.0,1356,5.0,3039,4.0,4304,1.0,1346,20s,4,2,3,Female,1,1,Asian,Hindu,1,5,0,6,1,4,3,2,0,2,3,2,0,0,0,0,1,0,1,1,0,1,1,0,1,0,1,1,1,1,2,2,2,2,1,2,1,1,2,2,2,2,1,2,2,2,2,1,3,2,0,0,0,0,0,0,0,1,1,1,0,1,-1,-2,-4,0,-3,tp08,2,0,2,0,0,1,1,3,0,0,1,3,1,0,3,1,0,0,0,2,0,2.4,2,0.6253
2,2.0,1623,1.0,1480,1.0,1021,2.0,3374,1.0,1333,5.0,531,2.0,1167,1.0,1016,3.0,2653,2.0,1569,1.0,998,5.0,2547,2.0,918,2.0,2153,2.0,1304,1.0,1131,1.0,937,2.0,1327,1.0,1170,1.0,1409,30s,3,1,3,Male,1,2,White,Other,2,3,1,1,3,2,2,0,1,3,2,1,0,0,1,1,1,0,1,1,0,1,1,1,1,0,1,1,2,1,1,2,1,1,2,0,1,1,1,2,1,2,2,1,1,1,2,1,2,3,1,0,1,1,0,2,1,1,1,2,0,1,-2,1,-5,4,0,tp08,3,0,3,0,1,1,1,4,1,1,1,2,1,1,2,0,0,0,1,1,0,1.8,1,0.3011
3,3.0,504,3.0,2311,4.0,992,3.0,3245,5.0,357,4.0,1519,2.0,159,3.0,2275,1.0,2809,4.0,5614,3.0,3219,4.0,1296,4.0,9046,2.0,1216,4.0,1169,4.0,23868,3.0,581,2.0,8830,4.0,2392,5.0,1312,20s,4,2,0,Female,1,1,Asian,Hindu,2,2,1,5,1,3,1,3,1,3,3,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,1,1,2,1,2,0,2,0,2,2,2,2,1,2,2,1,2,1,2,2,1,3,2,0,0,1,1,0,0,0,1,0,1,0,1,-1,3,-2,0,-2,tp02,2,0,2,0,0,1,1,3,0,0,1,3,1,0,3,1,0,0,0,3,1,2.2,2,0.5274
4,5.0,927,1.0,707,5.0,556,4.0,1062,5.0,1014,4.0,628,5.0,991,1.0,1259,1.0,1153,5.0,1388,1.0,740,5.0,1181,4.0,547,4.0,575,1.0,754,4.0,1140,1.0,323,1.0,1070,1.0,583,2.0,1889,20s,3,1,2,Male,1,2,White,Agnostic,1,5,1,0,0,4,0,0,2,0,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,2,0,0,1,1,1,1,1,0,1,0,1,0,1,1,1,0,1,1,2,1,2,1,1,0,0,0,2,0,1,0,2,1,2,-1,1,-5,4,-6,tp10,3,0,2,0,1,0,0,0,0,0,1,1,0,1,2,0,0,0,1,3,1,1.3,1,0.2781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45527,4.0,1050,5.0,619,4.0,328,5.0,285,5.0,602,5.0,267,5.0,315,1.0,483,5.0,1016,4.0,278,2.0,611,5.0,215,4.0,388,5.0,299,2.0,1915,4.0,439,4.0,823,5.0,309,5.0,586,4.0,2252,10s,2,1,1,Female,3,1,White,Jewish,1,4,6,5,2,0,3,6,6,2,3,2,0,0,1,1,1,0,1,1,0,1,1,1,1,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,1,2,3,1,1,0,0,1,2,0,1,2,1,0,2,1,-5,-1,6,-5,-2,tp03,3,1,1,0,1,1,1,1,0,0,0,3,1,1,1,0,0,0,1,4,2,3.5,2,0.7898
45528,4.0,581,3.0,1353,4.0,1164,5.0,798,3.0,1680,4.0,560,4.0,640,1.0,1415,4.0,4494,5.0,1392,4.0,2478,5.0,1268,3.0,843,4.0,1401,4.0,1524,4.0,719,4.0,754,3.0,1118,4.0,654,4.0,1197,30s,2,1,2,Male,1,1,White,Atheist,2,4,2,3,1,3,3,2,1,1,1,2,0,0,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,2,1,1,1,2,1,2,1,1,2,2,1,1,1,1,1,1,3,1,1,1,1,1,1,0,1,1,1,0,2,-1,-1,-2,2,-4,tp10,3,0,3,0,1,0,0,0,1,1,1,1,0,1,2,0,0,0,1,3,1,2.2,1,0.4897
45529,2.0,593,1.0,857,1.0,1047,2.0,1515,1.0,1690,2.0,1253,1.0,1094,1.0,1283,1.0,2209,1.0,1764,1.0,1062,5.0,1489,1.0,1908,2.0,891,1.0,1298,1.0,1200,1.0,473,1.0,1779,3.0,1355,1.0,667,10s,2,1,1,Female,1,1,White,Christian_Other,4,2,2,2,0,5,0,4,1,0,2,1,0,0,0,1,1,0,1,1,0,1,1,0,1,0,1,1,1,0,1,2,2,2,1,1,1,1,1,1,2,1,1,1,1,2,2,0,2,1,0,0,1,1,1,1,0,0,0,1,0,2,3,4,0,3,-6,tp10,3,1,1,0,1,1,1,1,0,0,0,2,1,1,2,0,0,0,1,1,0,2.0,2,0.7552
45530,5.0,747,3.0,1331,4.0,892,4.0,1281,5.0,1328,5.0,599,5.0,650,1.0,1429,1.0,1748,1.0,770,1.0,1025,5.0,742,4.0,4180,3.0,707,1.0,489,1.0,913,2.0,1301,2.0,1680,4.0,737,1.0,1124,40s,4,1,2,Male,3,1,White,Atheist,1,4,0,2,0,5,3,4,2,2,0,1,0,0,0,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,2,1,0,1,2,2,1,1,2,4,0,0,0,1,0,1,0,0,1,1,1,1,0,-1,-2,2,-4,tp10,3,0,4,0,1,1,0,0,1,1,2,1,0,1,2,0,0,0,1,2,1,2.3,1,0.3502


In [19]:
#predictions_h

In [20]:
submission = pd.read_csv('C:\\Users\\Hyelin\\bigdata\\FinalProject\\csv정리\\sample_submission.csv',encoding="euc-kr",index_col = [0])
submission = submission.reset_index(drop=True)
pd.set_option('display.max_columns', 150)

In [21]:
submission['voted'] = predictions['Score']

In [22]:
#submission['voted'] = predictions_h['Score']

In [23]:
submission.to_csv('submission_prob.csv')