### 데이터 전처리

In [None]:
import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/

/content/drive/MyDrive


In [None]:
class0_data = pd.read_csv('./만료및탈퇴회원.csv', index_col=0)
class1_data = pd.read_csv('./정회원.csv', index_col=0)
display(class0_data, class1_data)

In [None]:
## 11월에 탈퇴한 회원의 10월 데이터 - 탈퇴회원 데이터
exit_user = class0_data[class0_data['mm'] == 11]['userid'].unique()
exit_data = class1_data[(class1_data['userid'].isin(exit_user)) & (class1_data['mm'] == 10)]

exit_data['target'] = 0 # WILL EXIT

exit_data

In [None]:
## 11월에 정회원인 회원의 10월 데이터 - 정회원 데이터
regular_user = class1_data[class1_data['mm'] == 11]['userid'].unique()
regular_data = class1_data[(class1_data['userid'].isin(regular_user)) & (class1_data['mm'] == 10)]

regular_data['target'] = 1 # WILL STAY

regular_data

In [None]:
dataset = pd.concat([exit_data, regular_data])

In [None]:
dataset['target'].value_counts()

1    4128
0    1251
Name: target, dtype: int64

In [None]:
dataset = dataset.dropna(subset = ['point_gain_activeday_count', 'point_gain_count', 'point_gain',
             'point_loss_activeday_count', 'point_loss_count', 'point_loss',
             'tablet_activeday_count', 'tablet_moved_menu_count', 'tablet_leave_count', 'tablet_resume_count',
             'tablet_login_count', 'tablet_logout_count', 'study_activeday_count', 'study_count',
             'study_notcompleted_count', 'study_completed_count',
             'study_restart_count', 'total_system_learning_time', 'total_caliper_learning_time',
             'media_activeday_count', 'media_count',
             'video_action_count', 'video_start_count', 'video_restart_count',
             'video_pause_count', 'video_jump_count', 'video_resume_count',
             'video_speed_count', 'video_volume_count', 'video_end_count',
             'test_activeday_count', 'test_count', 'test_average_score',
             'test_item_count', 'test_correct_count', 'wrong_count',
             'wrong_item_count', 'wrong_correct_count'
             ])

In [None]:
X = dataset[['point_gain_activeday_count', 'point_gain_count', 'point_gain',
             'point_loss_activeday_count', 'point_loss_count', 'point_loss',
             'tablet_activeday_count', 'tablet_moved_menu_count', 'tablet_leave_count', 'tablet_resume_count',
             'tablet_login_count', 'tablet_logout_count', 'study_activeday_count', 'study_count',
             'study_notcompleted_count', 'study_completed_count',
             'study_restart_count', 'total_system_learning_time', 'total_caliper_learning_time',
             'media_activeday_count', 'media_count',
             'video_action_count', 'video_start_count', 'video_restart_count',
             'video_pause_count', 'video_jump_count', 'video_resume_count',
             'video_speed_count', 'video_volume_count', 'video_end_count',
             'test_activeday_count', 'test_count', 'test_average_score',
             'test_item_count', 'test_correct_count', 'wrong_count',
             'wrong_item_count', 'wrong_correct_count'
             ]]

In [None]:
Y = dataset['target']

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from collections import Counter
# TODO : OverSampling(SMOTE) 적용
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X, Y)
print(Counter(y_sm))

Counter({0: 2694, 1: 2694})


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2 , random_state= 42)

### 모델 적용

### Decision Tree(결정 트리)

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=156)
dt_clf.fit(X_train, y_train)

In [None]:
import sklearn.metrics as mt
y_pred = dt_clf.predict(X_test)
accuracy = mt.accuracy_score(y_test, y_pred)
precision = mt.precision_score(y_test, y_pred)
recall = mt.recall_score(y_test, y_pred)
auc = mt.roc_auc_score(y_test, y_pred)
matrix = mt.confusion_matrix(y_test, y_pred)

print("Decision Tree accuracy: {:.4f}".format(accuracy))
print('Decision Tree Recall: {0:.4f}'.format(recall))
print('Decision Tree Precision: {0:.4f}'.format(precision))
print('Decision Tree AUC: {0:.4f}'.format(auc))
print('Decision Tree Confusion Matrix:','\n', matrix)

Decision Tree accuracy: 0.7532
Decision Tree Recall: 0.7236
Decision Tree Precision: 0.7555
Decision Tree AUC: 0.7523
Decision Tree Confusion Matrix: 
 [[435 122]
 [144 377]]


### Voting(보팅)

In [None]:
# 개별 모델은 KNN와 DecisionTree 임.
knn_clf = KNeighborsClassifier(n_neighbors=8)
dt_clf = DecisionTreeClassifier(random_state=42)

# 개별 모델을 소프트 보팅 기반의 앙상블 모델로 구현한 분류기
vo_clf = VotingClassifier(estimators=[('KNN',knn_clf),('DT',dt_clf)] , voting='soft' )

# VotingClassifier 학습/예측/평가.
vo_clf.fit(X_train , y_train)
pred = vo_clf.predict(X_test)
print('Voting 분류기 정확도: {0:.4f}'.format(accuracy_score(y_test , pred)))
precision = mt.precision_score(y_test, pred)
recall = mt.recall_score(y_test, pred)
auc = mt.roc_auc_score(y_test, pred)
matrix = mt.confusion_matrix(y_test, pred)
print('Decision Tree Recall: {0:.4f}'.format(recall))
print('Decision Tree Precision: {0:.4f}'.format(precision))
print('Decision Tree AUC: {0:.4f}'.format(auc))
print('Decision Tree Confusion Matrix:','\n', matrix)

Voting 분류기 정확도: 0.7718
Decision Tree Recall: 0.7447
Decision Tree Precision: 0.7745
Decision Tree AUC: 0.7709
Decision Tree Confusion Matrix: 
 [[444 113]
 [133 388]]


In [None]:
classifiers = [knn_clf, dt_clf]
for classifier in classifiers:
    classifier.fit(X_train , y_train)
    pred = classifier.predict(X_test)
    class_name= classifier.__class__.__name__
    print('{0} 정확도: {1:.4f}'.format(class_name, accuracy_score(y_test , pred)))

KNeighborsClassifier 정확도: 0.7004
DecisionTreeClassifier 정확도: 0.7681


#### Bagging(배깅)

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
lr_clf = LogisticRegression(solver='liblinear')

bagging_clf = BaggingClassifier(base_estimator=lr_clf)

# BaggingClassifier 학습/예측.
bagging_clf.fit(X_train , y_train)
pred = bagging_clf.predict(X_test)

In [None]:
accuracy = mt.accuracy_score(y_test, pred)
precision = mt.precision_score(y_test, pred)
recall = mt.recall_score(y_test, pred)
auc = mt.roc_auc_score(y_test, pred)
matrix = mt.confusion_matrix(y_test, pred)
print('Decision Tree Accuracy: {0:.4f}'.format(accuracy))
print('Decision Tree Recall: {0:.4f}'.format(recall))
print('Decision Tree Precision: {0:.4f}'.format(precision))
print('Decision Tree AUC: {0:.4f}'.format(auc))
print('Decision Tree Confusion Matrix:','\n', matrix)

Decision Tree Accuracy: 0.6531
Decision Tree Recall: 0.6161
Decision Tree Precision: 0.6485
Decision Tree AUC: 0.6519
Decision Tree Confusion Matrix: 
 [[383 174]
 [200 321]]


#### 랜덤 포레스트

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=100, random_state=0, max_depth=8)
rf_clf.fit(X_train , y_train)
pred = rf_clf.predict(X_test)

accuracy = mt.accuracy_score(y_test, pred)
precision = mt.precision_score(y_test, pred)
recall = mt.recall_score(y_test, pred)
auc = mt.roc_auc_score(y_test, pred)
matrix = mt.confusion_matrix(y_test, pred)

print('random forest Accuracy: {0:.4f}'.format(accuracy))
print('random forest Recall: {0:.4f}'.format(recall))
print('random forest Precision: {0:.4f}'.format(precision))
print('random forest AUC: {0:.4f}'.format(auc))
print('random forest Confusion Matrix:','\n', matrix)

random forest Accuracy: 0.8006
random forest Recall: 0.7697
random forest Precision: 0.8085
random forest AUC: 0.7996
random forest Confusion Matrix: 
 [[462  95]
 [120 401]]


#### 그래디언트 부스팅

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import time

start_time = time.time()

gb_clf = GradientBoostingClassifier(random_state=0)
gb_clf.fit(X_train , y_train)
gb_pred = gb_clf.predict(X_test)

accuracy = mt.accuracy_score(y_test , gb_pred)
recall = mt.recall_score(y_test, gb_pred)
precision = mt.precision_score(y_test, gb_pred)
auc = mt.roc_auc_score(y_test, gb_pred)
matrix = mt.confusion_matrix(y_test, gb_pred)

print('GBM Accuracy: {0:.4f}'.format(accuracy))
print('GBM Recall: {0:.4f}'.format(recall))
print('GBM Precision: {0:.4f}'.format(precision))
print('GBM AUC: {0:.4f}'.format(auc))
print('GBM Confusion Matrix:','\n', matrix)

print("GBM 수행 시간: {0:.1f} 초 ".format(time.time() - start_time))

GBM Accuracy: 0.8395
GBM Recall: 0.9060
GBM Precision: 0.7919
GBM AUC: 0.8417
GBM Confusion Matrix: 
 [[433 124]
 [ 49 472]]
GBM 수행 시간: 10.3 초 
