In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import lightgbm

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE


In [2]:
# set options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# 예측한 값으로 성능을 평가하는 함수
def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred) # 오차 행렬
    accuracy = accuracy_score(y_test, pred) # 정확도
    precision = precision_score(y_test, pred) # 정밀도
    recall = recall_score(y_test, pred) # 재현율
    f1 = f1_score(y_test, pred) # F1
    roc_auc = roc_auc_score(y_test, pred_proba) # ROC-AUC
    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

## 데이터 전처리

In [4]:
vote_df = pd.read_csv("C:\\Users\\eunseo\\Desktop\\Git\\AI-project\\data\\train.csv", encoding="utf-8")
test_df = pd.read_csv("C:\\Users\\eunseo\\Desktop\\Git\\AI-project\\data\\test.csv", encoding="utf-8")
print("dataset shape: ", vote_df.shape)
print("dataset shape: ", test_df.shape)

# 인덱스 제거
vote_df.drop("index", axis=1, inplace=True)
test_index = test_df["index"]
test_df.drop("index", axis=1, inplace=True)

dataset shape:  (36425, 78)
dataset shape:  (9107, 77)


In [5]:
# 0값을 검사할 피처명 리스트 객체 설정
# zero_features = ['education', 'engnat', 'hand', 'urban']
# education 비율 맞추기?
zero_features = []

# tp 추가
for i in range(1, 11):
    s = 'tp' + chr(48+(i//10)) + chr(48+(i%10))
    zero_features.append(s)
    
# zero_features 리스트 내부에 저장된 개별 피처들에 대해 0값을 평균 값으로 대체
vote_df[zero_features] = vote_df[zero_features].replace(0, vote_df[zero_features].mean())
test_df[zero_features] = test_df[zero_features].replace(0, test_df[zero_features].mean())

### 인코딩

In [6]:
# 레이블 인코딩
encoder = LabelEncoder()
encoder2 = LabelEncoder()
encoder.fit(vote_df['age_group'])
encoder2.fit(test_df['age_group'])
labels = encoder.transform(vote_df['age_group'])
labels2 = encoder2.transform(test_df['age_group'])
vote_df['age_group'] = labels
test_df['age_group'] = labels2

def get_categoty_age(age_num):
    num = age_num;
    if age_num == 0: num =7

    return num
vote_df["age_group"] = vote_df["age_group"].apply(lambda x : get_categoty_age(x))
test_df["age_group"] = test_df["age_group"].apply(lambda x : get_categoty_age(x))

#원핫인코딩 진행
def dummy_data(data, columns) :
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix = column)], axis=1)
        data = data.drop(column, axis=1)
    return data

dummy_columns = ['engnat', 'gender','hand', 'married', 'race', 'religion', 'urban']
# wr 추가
for i in range(1, 14):
    s = 'wr_' + chr(48+(i//10)) + chr(48+(i%10))
    dummy_columns.append(s)
# wf 추가
for i in range(1, 4):
    s = 'wf_' + chr(48+(i//10)) + chr(48+(i%10))
    dummy_columns.append(s)
# print(dummy_columns)

vote_df = dummy_data(vote_df, dummy_columns)
test_df = dummy_data(test_df, dummy_columns)


### 이상값 처리

In [7]:
# 시간 데이터 전처리

#Q_E 시간
Q_E = []
for i in range(97, 117):
    a = 'Q' + chr(i) + 'E'
    Q_E.append(a)

def get_categoty_time(Q_time):
    cat = 0
    if Q_time <= 1000 : cat = 1
    elif Q_time <= 2000 : cat = 2
    elif Q_time <= 3000 : cat = 3
    elif Q_time <= 4000 : cat = 4
    elif Q_time <= 5000 : cat = 5
    elif Q_time <= 6000 : cat = 6
    elif Q_time <= 7000 : cat = 7
    elif Q_time <= 8000 : cat = 8
    elif Q_time <= 9000 : cat = 9
    elif Q_time <=10000 : cat = 10
    elif Q_time <=15000 : cat = 11
    elif Q_time <=20000 : cat = 12
    else : Q_time = 13

    return cat

for Q_time in Q_E:
    vote_df[Q_time] = vote_df[Q_time].apply(lambda x : get_categoty_time(x))
    test_df[Q_time] = test_df[Q_time].apply(lambda x : get_categoty_time(x))


In [8]:
# 이상치 데이터 제거
def get_outlier(df=None, column=None, weight=1.5):
    fraud = df[df['voted']==1][column]
    quantile_25 = np.percentile(fraud.values, 25)
    quantile_75 = np.percentile(fraud.values, 75)
    
    iqr = quantile_75 - quantile_25
    iqr_weight = iqr * weight
    lowest_val = quantile_25 - iqr_weight
    highest_val = quantile_75 + iqr_weight    
    outlier_index = fraud[(fraud < lowest_val) | (fraud > highest_val)].index    
    return outlier_index

# train data - race_White, religion_Agnostic  religion_Atheist  
outlier_index = get_outlier(df=vote_df, column="race_White", weight=1.5)
vote_df.drop(outlier_index, axis=0, inplace=True)

outlier_index = get_outlier(df=vote_df, column="religion_Agnostic", weight=1.5)
vote_df.drop(outlier_index, axis=0, inplace=True)

outlier_index = get_outlier(df=vote_df, column="religion_Atheist", weight=1.5)
vote_df.drop(outlier_index, axis=0, inplace=True)

## 모델 학습

In [9]:
# feature 분리 및 학습세트/평가세트 분리
y_labels = vote_df.loc[:,'voted']
X_features = vote_df.drop('voted', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, random_state=156)

# 분포 비율 확인
train_cnt = y_train.count()
test_cnt = y_test.count()
print("학습 세트 Shape:{0}, 테스트 세트 Shape:{1}".format(X_train.shape, X_test.shape))

print(" 학습 세트 레이블 값 분포 비율")
print(y_train.value_counts()/train_cnt)
print("\n 테스트 세트 레이블 값 분포 비율")
print(y_test.value_counts()/test_cnt)
print()

# 데이터 정규화(스케일링)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

test_sc = scaler.transform(test_df) # test data scaling

# oversampling
over_sampler = RandomOverSampler(random_state=156)
X_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train)

# # smote oversampling
# smote = SMOTE(random_state=156)
# X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

# 학습
evals = [(X_test, y_test)]
xgb_model = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3, objective="binary:logistic", random_state=156)
xgb_model.fit(X_train_over, y_train_over, early_stopping_rounds=100, eval_metric="logloss", eval_set=evals, verbose=True)
w_preds = xgb_model.predict(X_test)
w_pred_proba = xgb_model.predict_proba(X_test)[:,1]

get_clf_eval(y_test, w_preds, w_pred_proba)

학습 세트 Shape:(23740, 121), 테스트 세트 Shape:(5935, 121)
 학습 세트 레이블 값 분포 비율
0    0.669966
1    0.330034
Name: voted, dtype: float64

 테스트 세트 레이블 값 분포 비율
0    0.676158
1    0.323842
Name: voted, dtype: float64

[0]	validation_0-logloss:0.647259
Will train until validation_0-logloss hasn't improved in 100 rounds.
[1]	validation_0-logloss:0.610282
[2]	validation_0-logloss:0.580379
[3]	validation_0-logloss:0.555757
[4]	validation_0-logloss:0.535254
[5]	validation_0-logloss:0.518077
[6]	validation_0-logloss:0.503653
[7]	validation_0-logloss:0.491128
[8]	validation_0-logloss:0.480549
[9]	validation_0-logloss:0.470568
[10]	validation_0-logloss:0.46237
[11]	validation_0-logloss:0.455412
[12]	validation_0-logloss:0.448696
[13]	validation_0-logloss:0.443407
[14]	validation_0-logloss:0.438326
[15]	validation_0-logloss:0.43429
[16]	validation_0-logloss:0.430607
[17]	validation_0-logloss:0.427163
[18]	validation_0-logloss:0.424251
[19]	validation_0-logloss:0.421567
[20]	validation_0-logloss:0.419257
[21]

[224]	validation_0-logloss:0.387899
[225]	validation_0-logloss:0.387902
[226]	validation_0-logloss:0.38792
[227]	validation_0-logloss:0.387872
[228]	validation_0-logloss:0.387879
[229]	validation_0-logloss:0.387862
[230]	validation_0-logloss:0.387817
[231]	validation_0-logloss:0.387781
[232]	validation_0-logloss:0.387822
[233]	validation_0-logloss:0.387861
[234]	validation_0-logloss:0.387867
[235]	validation_0-logloss:0.387848
[236]	validation_0-logloss:0.387827
[237]	validation_0-logloss:0.387844
[238]	validation_0-logloss:0.387865
[239]	validation_0-logloss:0.387847
[240]	validation_0-logloss:0.387844
[241]	validation_0-logloss:0.387878
[242]	validation_0-logloss:0.387876
[243]	validation_0-logloss:0.387844
[244]	validation_0-logloss:0.387818
[245]	validation_0-logloss:0.387865
[246]	validation_0-logloss:0.387822
[247]	validation_0-logloss:0.387793
[248]	validation_0-logloss:0.387739
[249]	validation_0-logloss:0.387774
[250]	validation_0-logloss:0.387704
[251]	validation_0-logloss:0.

### K Fold

In [10]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# 개별 기반 모델에서 최종 메타 모델이 사용할 학습 및 테스트용 데이터를 생성하기 위한 함수. 
def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds ):
    # 지정된 n_folds값으로 KFold 생성.
    kf = KFold(n_splits=n_folds, shuffle=False, random_state=0)
    #추후에 메타 모델이 사용할 학습 데이터 반환을 위한 넘파이 배열 초기화 
    train_fold_pred = np.zeros((X_train_n.shape[0] ,1 ))
    test_pred = np.zeros((X_test_n.shape[0],n_folds))
    print(model.__class__.__name__ , ' model 시작 ')
    
    for folder_counter , (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        #입력된 학습 데이터에서 기반 모델이 학습/예측할 폴드 데이터 셋 추출 
        print('\t 폴드 세트: ',folder_counter,' 시작 ')
        X_tr = X_train_n[train_index] 
        y_tr = y_train_n[train_index] 
        X_te = X_train_n[valid_index]  
        
        #폴드 세트 내부에서 다시 만들어진 학습 데이터로 기반 모델의 학습 수행.
        model.fit(X_tr , y_tr)       
        #폴드 세트 내부에서 다시 만들어진 검증 데이터로 기반 모델 예측 후 데이터 저장.
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1,1)
        #입력된 원본 테스트 데이터를 폴드 세트내 학습된 기반 모델에서 예측 후 데이터 저장. 
        test_pred[:, folder_counter] = model.predict(X_test_n)
            
    # 폴드 세트 내에서 원본 테스트 데이터를 예측한 데이터를 평균하여 테스트 데이터로 생성 
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)    
    
    #train_fold_pred는 최종 메타 모델이 사용하는 학습 데이터, test_pred_mean은 테스트 데이터
    return train_fold_pred , test_pred_mean

In [11]:
xgb_train, xgb_test = get_stacking_base_datasets(xgb_model, X_train_over, y_train_over, X_test, 7)

XGBClassifier  model 시작 
	 폴드 세트:  0  시작 




	 폴드 세트:  1  시작 
	 폴드 세트:  2  시작 
	 폴드 세트:  3  시작 
	 폴드 세트:  4  시작 
	 폴드 세트:  5  시작 
	 폴드 세트:  6  시작 


In [12]:
xgb_model.fit(xgb_train, y_train_over)
stack_final = xgb_model.predict(xgb_test)
stack_final_proba = xgb_model.predict_proba(xgb_test)[:,1]

In [13]:
# 예측한 값으로 성능을 평가하는 함수
def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred) # 오차 행렬
    accuracy = accuracy_score(y_test, pred) # 정확도
    precision = precision_score(y_test, pred) # 정밀도
    recall = recall_score(y_test, pred) # 재현율
    f1 = f1_score(y_test, pred) # F1
    roc_auc = roc_auc_score(y_test, pred_proba) # ROC-AUC
    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))
    
get_clf_eval(y_test, stack_final, stack_final_proba)

오차 행렬
[[2992 1021]
 [ 150 1772]]
정확도: 0.8027, 정밀도: 0.6344, 재현율: 0.9220, F1: 0.7516, AUC:0.8338


### threshold test

In [16]:

from sklearn.preprocessing import Binarizer

# 테스트를 수행할 모든 임곗값을 리스트 객체로 저장. 
thresholds = [0.4, 0.45, 0.50, 0.55, 0.60]


def get_clf_eval(y_test , pred):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    # F1 스코어 추가
    f1 = f1_score(y_test,pred)
    print('오차 행렬')
    print(confusion)
    # f1 score print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}'.format(accuracy, precision, recall, f1))

def get_eval_by_threshold(y_test , pred_proba_c1, thresholds):
    # thresholds list객체내의 값을 차례로 iteration하면서 Evaluation 수행.
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1) 
        custom_predict = binarizer.transform(pred_proba_c1)
        print('임곗값:',custom_threshold)
        get_clf_eval(y_test , custom_predict)

get_eval_by_threshold(y_test ,w_pred_proba.reshape(-1,1), thresholds )


임곗값: 0.4
오차 행렬
[[2866 1147]
 [  95 1827]]
정확도: 0.7907, 정밀도: 0.6143, 재현율: 0.9506, F1:0.7463
임곗값: 0.45
오차 행렬
[[2953 1060]
 [ 132 1790]]
정확도: 0.7992, 정밀도: 0.6281, 재현율: 0.9313, F1:0.7502
임곗값: 0.5
오차 행렬
[[2996 1017]
 [ 152 1770]]
정확도: 0.8030, 정밀도: 0.6351, 재현율: 0.9209, F1:0.7518
임곗값: 0.55
오차 행렬
[[3026  987]
 [ 179 1743]]
정확도: 0.8035, 정밀도: 0.6385, 재현율: 0.9069, F1:0.7494
임곗값: 0.6
오차 행렬
[[3082  931]
 [ 220 1702]]
정확도: 0.8061, 정밀도: 0.6464, 재현율: 0.8855, F1:0.7473
