In [1]:
# required package import
import numpy as np
import pandas as pd
import time

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
#from lightgbm import LGBMClassifier

from sklearn.ensemble import VotingClassifier
from lime import lime_tabular
from lime import submodular_pick


In [2]:
# Read the pre-split feature (x_*) and label (y_*) CSV files for the
# train / valid / test splits from the shared fill_data directory.
fill_dir = '../data/fill_data'
x_train, x_valid, x_test = (
    pd.read_csv(f'{fill_dir}/x_{split}.csv') for split in ('train', 'valid', 'test')
)
y_train, y_valid, y_test = (
    pd.read_csv(f'{fill_dir}/y_{split}.csv') for split in ('train', 'valid', 'test')
)

In [3]:
# Label encoding: the raw labels -1 / 1 become the class indices 0 / 1.
class2idx = {-1: 0, 1: 1}
# Reverse lookup (class index -> raw label).
idx2class = dict(zip(class2idx.values(), class2idx.keys()))

label_frames = {"y_train": y_train, "y_valid": y_valid, "y_test": y_test}

# Re-encode every label frame in place.
for frame in label_frames.values():
    frame.replace(class2idx, inplace = True)

# Show which values remain in the `label` column after the mapping.
for frame_name, frame in label_frames.items():
    print(f"{frame_name}.unique: ", frame.label.unique())

y_train.unique:  [1. 0.]
y_valid.unique:  [1. 0.]
y_test.unique:  [1. 0.]


In [4]:
# Convert each label table to a NumPy array and flatten it to one dimension.
y_train, y_valid, y_test = (
    np.array(labels).ravel() for labels in (y_train, y_valid, y_test)
)

In [5]:
# check data shape
# Print the shape of every feature / label object, pairing x and y per split.
named_splits = [
    ("x_train", x_train), ("y_train", y_train),
    ("x_valid", x_valid), ("y_valid", y_valid),
    ("x_test", x_test), ("y_test", y_test),
]
for split_name, split_data in named_splits:
    print(f"{split_name}.shape: ", split_data.shape)

x_train.shape:  (99157, 83)
y_train.shape:  (99157,)
x_valid.shape:  (7990, 83)
y_valid.shape:  (7990,)
x_test.shape:  (7912, 83)
y_test.shape:  (7912,)


### adaboost

In [6]:
# final_model
# Fit AdaBoost with the hyper-parameters selected for the final model.
# random_state is pinned so that re-running this cell reproduces the same
# fitted model and scores; 1004 is the seed this notebook already passes to
# permutation_importance.
start_T = time.time()
adaboost = AdaBoostClassifier(n_estimators=1126, learning_rate=1.496, random_state=1004)
adaboost.fit(x_train, y_train)
end_T = time.time()
print("model training time: ", end_T - start_T)

# prediction and evaluation
# Predict on each split; the per-split predictions keep their own names.
ada_train_pred_y = adaboost.predict(x_train)
ada_valid_pred_y = adaboost.predict(x_valid)
ada_test_pred_y = adaboost.predict(x_test)

# Report accuracy and F1 in the order: train data, valid data, test data.
for y_true, y_pred in (
    (y_train, ada_train_pred_y),
    (y_valid, ada_valid_pred_y),
    (y_test, ada_test_pred_y),
):
    print('Accuracy: ', (accuracy_score(y_true, y_pred)))
    print('fl-score: ', (f1_score(y_true, y_pred)))

model training time:  345.27329993247986
Accuracy:  0.8756819992537087
fl-score:  0.9255809179983457
Accuracy:  0.8659574468085106
fl-score:  0.9198173242494572
Accuracy:  0.8814459049544995
fl-score:  0.9288099574984822


### random forest

In [7]:
# final_model
# Fit the random forest with the hyper-parameters selected for the final model.
# The forest draws random row samples (max_samples) and feature subsets
# (max_features), so random_state is pinned to make re-runs reproducible;
# 1004 is the seed this notebook already passes to permutation_importance.
start_T = time.time()
rf = RandomForestClassifier(max_samples=0.9206, n_estimators=75, max_features=0.9854,
                            random_state=1004)
rf.fit(x_train, y_train)
end_T = time.time()
print("model training time: ", end_T - start_T)

# prediction and evaluation
# Predict on each split; the per-split predictions keep their own names.
rf_train_pred_y = rf.predict(x_train)
rf_valid_pred_y = rf.predict(x_valid)
rf_test_pred_y = rf.predict(x_test)

# Report accuracy and F1 in the order: train data, valid data, test data.
for y_true, y_pred in (
    (y_train, rf_train_pred_y),
    (y_valid, rf_valid_pred_y),
    (y_test, rf_test_pred_y),
):
    print('Accuracy: ', (accuracy_score(y_true, y_pred)))
    print('fl-score: ', (f1_score(y_true, y_pred)))

model training time:  252.29882311820984
Accuracy:  0.9996772794658975
fl-score:  0.999800064978882
Accuracy:  0.8846057571964956
fl-score:  0.9304885404101326
Accuracy:  0.8938321536905965
fl-score:  0.9359072180680603


### XGBoost

In [8]:
# final_model
# Hyper-parameters of the final XGBoost classifier, gathered in one place.
xgb_params = dict(
    max_depth=6,
    learning_rate=0.1614,
    n_estimators=30,
    gamma=10.95,
    min_child_weight=2.061,
    subsample=0.601,
    colsample_bytree=0.8121,
)

# Time model construction plus fitting on the training split.
start_T = time.time()
xgb_m = xgb.XGBClassifier(**xgb_params)
xgb_m.fit(x_train, y_train)
end_T = time.time()
print("model training time: ", end_T - start_T)

# prediction and evaluation
# Predict on each split; the per-split predictions keep their own names.
xgb_train_pred_y = xgb_m.predict(x_train)
xgb_valid_pred_y = xgb_m.predict(x_valid)
xgb_test_pred_y = xgb_m.predict(x_test)

# Report accuracy and F1 in the order: train data, valid data, test data.
for y_true, y_pred in (
    (y_train, xgb_train_pred_y),
    (y_valid, xgb_valid_pred_y),
    (y_test, xgb_test_pred_y),
):
    print('Accuracy: ', (accuracy_score(y_true, y_pred)))
    print('fl-score: ', (f1_score(y_true, y_pred)))

model training time:  5.984353065490723
Accuracy:  0.8856661657775043
fl-score:  0.9317282619823313
Accuracy:  0.8821026282853567
fl-score:  0.9296174536760312
Accuracy:  0.893958543983822
fl-score:  0.9363767346629256


### LGBM

In [9]:
# # final_model
# start_T = time.time()
# lgbm = LGBMClassifier(n_estimators = 378, min_data_in_leaf = 36, learning_rate =0.2430823373157657,
#                         max_depth = 2, num_leaves = 49)
# lgbm.fit(x_train, y_train)
# end_T = time.time()
# print("model training time: ", end_T - start_T)
#
# # prediction and evaluation
# # train data
# lgbm_train_pred_y = lgbm.predict(x_train)
# print('Accuracy: ', (accuracy_score(y_train, lgbm_train_pred_y)))
# print('fl-score: ', (f1_score(y_train, lgbm_train_pred_y)))
#
# # valid data
# lgbm_valid_pred_y = lgbm.predict(x_valid)
# print('Accuracy: ', (accuracy_score(y_valid, lgbm_valid_pred_y)))
# print('fl-score: ', (f1_score(y_valid, lgbm_valid_pred_y)))
#
# # test data
# lgbm_test_pred_y = lgbm.predict(x_test)
# print('Accuracy: ', (accuracy_score(y_test, lgbm_test_pred_y)))
# print('fl-score: ', (f1_score(y_test, lgbm_test_pred_y)))

### voting

#### 1. hard voting

In [10]:
# hard_voting_model = VotingClassifier(estimators = [('Adaboost', adaboost), ('RandomForest', rf), ('XGBoost', xgb_m), ('LGBM', lgbm)], voting = 'hard')
#
# start_T = time.time()
# hard_voting_model.fit(x_train, y_train)
# end_T = time.time()
# print("model training time: ", end_T - start_T)
#
# # prediction and evaluation
# # train data
# hard_train_pred_y = hard_voting_model.predict(x_train)
# print('Accuracy: ', (accuracy_score(y_train, hard_train_pred_y)))
# print('fl-score: ', (f1_score(y_train, hard_train_pred_y)))
#
# # valid data
# hard_valid_pred_y = hard_voting_model.predict(x_valid)
# print('Accuracy: ', (accuracy_score(y_valid, hard_valid_pred_y)))
# print('fl-score: ', (f1_score(y_valid, hard_valid_pred_y)))
#
# # test data
# hard_test_pred_y = hard_voting_model.predict(x_test)
# print('Accuracy: ', (accuracy_score(y_test, hard_test_pred_y)))
# print('fl-score: ', (f1_score(y_test, hard_test_pred_y)))

#### 2. soft voting

In [11]:
# Soft-voting ensemble over the three base classifiers defined in earlier cells.
base_estimators = [
    ('Adaboost', adaboost),
    ('RandomForest', rf),
    ('XGBoost', xgb_m),
]
soft_voting_model = VotingClassifier(estimators=base_estimators, voting='soft')

# Time the ensemble fit on the training split.
start_T = time.time()
soft_voting_model.fit(x_train, y_train)
end_T = time.time()
print("model training time: ", end_T - start_T)

# prediction and evaluation
# Predict on each split; the per-split predictions keep their own names.
soft_train_pred_y = soft_voting_model.predict(x_train)
soft_valid_pred_y = soft_voting_model.predict(x_valid)
soft_test_pred_y = soft_voting_model.predict(x_test)

# Report accuracy and F1 in the order: train data, valid data, test data.
for y_true, y_pred in (
    (y_train, soft_train_pred_y),
    (y_valid, soft_valid_pred_y),
    (y_test, soft_test_pred_y),
):
    print('Accuracy: ', (accuracy_score(y_true, y_pred)))
    print('fl-score: ', (f1_score(y_true, y_pred)))

model training time:  596.0949280261993
Accuracy:  0.9453392095363918
fl-score:  0.966935897123057
Accuracy:  0.8852315394242803
fl-score:  0.931233595800525
Accuracy:  0.8956016177957533
fl-score:  0.9372721749696235


### save ensemble model's predict proba

In [12]:
# predict proba save
# Load the full data table that the ensemble scores in the next cell.
full_data = pd.read_csv(filepath_or_buffer='../data/fill_data/full_data.csv')

In [13]:
# Drop the columns that are not passed to the model, then score every row
# with the soft-voting ensemble.
non_feature_cols = ['Unnamed: 0', 'label', 'cust_no']
full_features = full_data.drop(columns=non_feature_cols)
full_data_predict_proba = soft_voting_model.predict_proba(full_features)

In [14]:
# Keep only the second probability column (position 1) and write it to CSV
# without the row index.
# NOTE(review): presumably column 1 is the probability of class 1 (churn) - confirm.
proba_frame = pd.DataFrame(full_data_predict_proba)
churn_proba = proba_frame.iloc[:, 1]
churn_proba.to_csv('../data/tableau_data/churn_proba.csv', index = None)

### permutation importance

In [31]:
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer
import datetime

- adaboost

In [None]:
# Permutation importance of the fitted AdaBoost model on the test split,
# scored with F1 over 30 shuffles per feature; the wall-clock time is printed.
start = time.time()

perm_kwargs = dict(n_repeats=30, scoring=make_scorer(f1_score), random_state=1004)
result = permutation_importance(adaboost, x_test, y_test, **perm_kwargs)

end = time.time()
elapsed = datetime.timedelta(seconds=end - start)
print(elapsed)

In [None]:
# Feature positions ordered by ascending mean importance.
sorted_result = np.argsort(result.importances_mean)
# After the transpose each column is one feature, labelled with its name.
importances = pd.DataFrame(
    data=result.importances[sorted_result].T,
    columns=x_test.columns[sorted_result],
)

In [None]:
# Transpose so that each row is a feature, then summarise across its columns.
ada_by_feature = importances.T
ada_imp_mean = ada_by_feature.mean(axis=1)
ada_imp_std = ada_by_feature.std(axis=1)

In [None]:
# One row per feature with its mean / std importance, most important first.
ada_imp_rows = pd.DataFrame([ada_imp_mean, ada_imp_std], index=['imp_mean', 'imp_std'])
ada_imp_table = ada_imp_rows.T
ada_imp_table.sort_values(by='imp_mean', ascending=False)

- xgboost

In [None]:
# Permutation importance of the fitted XGBoost model on the test split,
# scored with F1 over 30 shuffles per feature; the wall-clock time is printed.
start = time.time()

perm_kwargs = dict(n_repeats=30, scoring=make_scorer(f1_score), random_state=1004)
result = permutation_importance(xgb_m, x_test, y_test, **perm_kwargs)

end = time.time()
elapsed = datetime.timedelta(seconds=end - start)
print(elapsed)

In [None]:
# Feature positions ordered by ascending mean importance.
sorted_result = np.argsort(result.importances_mean)
# After the transpose each column is one feature, labelled with its name.
importances = pd.DataFrame(
    data=result.importances[sorted_result].T,
    columns=x_test.columns[sorted_result],
)

In [None]:
# Transpose so that each row is a feature, then summarise across its columns.
xgb_by_feature = importances.T
xgb_imp_mean = xgb_by_feature.mean(axis=1)
xgb_imp_std = xgb_by_feature.std(axis=1)

In [None]:
# One row per feature with its mean / std importance, most important first.
xgb_imp_rows = pd.DataFrame([xgb_imp_mean, xgb_imp_std], index=['imp_mean', 'imp_std'])
xgb_imp_table = xgb_imp_rows.T
xgb_imp_table.sort_values(by='imp_mean', ascending=False)

- rf

In [None]:
# Permutation importance of the fitted random forest on the test split,
# scored with F1 over 30 shuffles per feature; the wall-clock time is printed.
start = time.time()

# Evaluate `rf` here: this cell previously passed `xgb_m`, which made the
# "rf" importance table a repeat of the XGBoost one.
result = permutation_importance(rf, x_test, y_test, n_repeats = 30,
                           scoring = make_scorer(f1_score), random_state= 1004)

end = time.time()
print(datetime.timedelta(seconds = end-start))

In [None]:
# Feature positions ordered by ascending mean importance.
sorted_result = np.argsort(result.importances_mean)
# After the transpose each column is one feature, labelled with its name.
importances = pd.DataFrame(
    data=result.importances[sorted_result].T,
    columns=x_test.columns[sorted_result],
)

In [None]:
# Transpose so that each row is a feature, then summarise across its columns.
rf_by_feature = importances.T
rf_imp_mean = rf_by_feature.mean(axis=1)
rf_imp_std = rf_by_feature.std(axis=1)

In [None]:
# One row per feature with its mean / std importance, most important first.
rf_imp_rows = pd.DataFrame([rf_imp_mean, rf_imp_std], index=['imp_mean', 'imp_std'])
rf_imp_table = rf_imp_rows.T
rf_imp_table.sort_values(by='imp_mean', ascending=False)

In [None]:
# explainer = lime_tabular.LimeTabularExplainer(
#     training_data=np.array(x_train),
#     feature_names=x_train.columns,
#     class_names=['0', '1'],
#     mode='classification'
# )

In [None]:
# sp_exp = submodular_pick.SubmodularPick(explainer,
#                                         x_train.values,
#                                         predict_fn=soft_voting_model.predict_proba,
#                                         num_features=83,
#                                         num_exps_desired=5)

In [None]:
# # SP-LIME visualization
# [exp.show_in_notebook() for exp in sp_exp.sp_explanations]
# print('SP-LIME Explanations.')

In [None]:
# [exp.as_pyplot_figure(label=exp.available_labels()[0]) for exp in sp_exp.sp_explanations]
# print('SP-LIME Local Explanations')