In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', 30)

In [2]:
# Read data/df_post_EDA.csv, using its first column as the row index,
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer="data/df_post_EDA.csv", index_col=0)
df.head(n=5)

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,chapter_size,chapter_retention_rate,chapter_growth_rate,seat_popularity_rate,total_meetings,wont_renew
0,0,31,2016-04-01,33,0,0,1,0,21,7,33,32,1,37,129784,45,0,32,0.315789,1.421053,0.076923,34,0
1,2,18,2017-05-01,33,2,2,0,0,6,17,11,22,1,26,9285,1,0,34,0.71875,1.09375,0.266667,37,0
2,2,18,2018-05-01,35,1,0,0,2,6,24,19,36,1,20,7263,10,1,21,0.424242,0.636364,0.210526,38,0
3,2,18,2019-05-01,36,0,0,0,1,9,8,23,19,3,28,1860,31,2,22,0.73913,1.217391,0.263158,37,0
4,2,18,2020-05-01,33,1,0,0,3,10,13,19,47,4,30,6668,16,3,26,0.76,1.12,0.277778,37,0


### Drop unused columns & engineer features

In [3]:
# Drop user_ID and relative_renewal_date so they are not used as model features.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the columns are gone is a no-op instead of
# raising KeyError.
df = df.drop(columns=["user_ID", "relative_renewal_date"], errors="ignore")

In [4]:
# Columns reused by several derived features below.
p_col = df["P"]
total_col = df["total_meetings"]
retention_col = df["chapter_retention_rate"]

df = df.assign(
    # Ratios of P to other per-member counts; the "+ 1" presumably guards
    # against zero counts in the denominator -- TODO confirm.
    P_by_A=p_col / (df["A"] + 1),
    P_by_M=p_col / (df["M"] + 1),
    P_by_S=p_col / (df["S"] + 1),
    # Shares of the total number of meetings.
    P_by_total=p_col / total_col,
    A_by_total=df["A"] / total_col,
    M_by_total=df["M"] / total_col,
    # Interaction terms ("tim" = times) between P and chapter/member metrics.
    P_tim_retention=p_col * retention_col,
    P_tim_growth=p_col * df["chapter_growth_rate"],
    P_tim_popularity=p_col * df["seat_popularity_rate"],
    P_tim_V=p_col * df["V"],
    P2_tim_V=p_col ** 2 * df["V"],
    P_tim_TYFCB=p_col * df["TYFCB"],
    year_tim_retention=df["year_of_membership"] * retention_col,
    # Quadratic term.
    P2=p_col ** 2,
)

# Move the target column to the last position.
columns_list = [name for name in df.columns if name != "wont_renew"]
columns_list.append("wont_renew")
df = df[columns_list]

### Final pick: top-10 features per model

In [29]:
def _load_feature_names(csv_path):
    """Return the "feature_name" column of the CSV at ``csv_path`` as a list.

    The first CSV column is read as the index.
    """
    return pd.read_csv(csv_path, index_col=0)["feature_name"].tolist()


# One feature list per model, read from the pick_10_features_* files.
logreg_10_features = _load_feature_names("data/model_scores/pick_10_features_logreg.csv")
rf_10_features = _load_feature_names("data/model_scores/pick_10_features_rf.csv")
catboost_10_features = _load_feature_names("data/model_scores/pick_10_features_catboost.csv")
xgb_10_features = _load_feature_names("data/model_scores/pick_10_features_xgb.csv")

# Model

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier, Pool
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks, NeighbourhoodCleaningRule
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

In [25]:
# Empty results table; each model section below adds one row keyed by model name.
score_columns = ["training_time [s]", "accuracy", "precision", "recall", "f1", "auc"]
df_scores = pd.DataFrame(columns=score_columns)

## 1. Logistic Regression

In [34]:
# Features are every column except the target and chapter_ID.
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

# Stratified 80/20 split with a fixed seed so every model section sees the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

# Under-sample the training portion only (Tomek links); the test set is left untouched.
under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

# Standardise using statistics of the resampled training data, then apply to both sets.
scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
# Keep the original row labels on X_test so it stays index-aligned with y_test;
# without index=..., X_test gets a fresh RangeIndex and pandas-aligned operations
# between X_test and y_test would silently mismatch rows.
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

In [48]:
logreg = LogisticRegression(max_iter=5000)

# Restrict both sets to the features picked for logistic regression.
X_resampled_logreg = X_resampled[logreg_10_features]
X_test_logreg = X_test[logreg_10_features]

# Candidate values for C, searched with 3-fold CV scored by ROC AUC.
logreg_grid = {"C": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]}
logreg_cv = GridSearchCV(
    estimator=logreg,
    param_grid=logreg_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=3,
)

# Time the whole grid search (all candidates plus the final refit).
start = time.time()
logreg_cv.fit(X_resampled_logreg, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(logreg_cv.best_params_)

# Evaluate the refit best estimator on the held-out test set.
y_pred = logreg_cv.predict(X_test_logreg)
y_proba = logreg_cv.predict_proba(X_test_logreg)

# Label-based metrics use y_pred; AUC uses the probability of class 1.
label_metrics = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
]
df_scores.loc["logistic_regression"] = [
    round(end - start, 2),
    *label_metrics,
    roc_auc_score(y_test, y_proba[:, 1]),
]
df_scores

Elapsed time: 0.07 seconds
{'C': 0.001}


Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
logistic_regression,0.07,0.712984,1.0,0.066667,0.125,0.698879


## 4. Random Forest

In [49]:
# Rebuild the modelling data from df so this section does not depend on the
# (already scaled and column-filtered) variables left by the previous section.
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

# Same stratified 80/20 split and seed as the other model sections.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

# Under-sample the training portion only (Tomek links).
under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

# Standardise using statistics of the resampled training data.
scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
# Preserve X_test's row labels so it remains index-aligned with y_test
# (omitting index=... would replace them with a fresh RangeIndex).
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

In [51]:
# Seed the forest: without random_state every run bootstraps differently, so the
# selected hyper-parameters and the reported scores would not be reproducible.
rf = RandomForestClassifier(random_state=42)

# Restrict both sets to the features picked for the random forest.
X_resampled_rf = X_resampled[rf_10_features]
X_test_rf = X_test[rf_10_features]

# Hyper-parameter grid: 3 * 4 * 3 * 5 = 180 candidates.
rf_grid = {
    "n_estimators": [100, 500, 1000],
    "max_depth": list(range(2, 10, 2)),
    "min_samples_split": list(range(2, 7, 2)),
    "min_samples_leaf": list(range(1, 6, 1)),
}

# 3-fold cross-validated search scored by ROC AUC, using all available cores.
rf_cv = GridSearchCV(rf, rf_grid, scoring="roc_auc", n_jobs=-1, cv=3, verbose=2)

# Time the whole grid search (all candidates plus the final refit).
start = time.time()
rf_cv.fit(X_resampled_rf, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(rf_cv.best_params_)

# Evaluate the refit best estimator on the held-out test set.
y_pred = rf_cv.predict(X_test_rf)
y_proba = rf_cv.predict_proba(X_test_rf)

# Label-based metrics use y_pred; AUC uses the probability of class 1.
df_scores.loc["random_forest"] = [round(end - start, 2),
                                  accuracy_score(y_test, y_pred),
                                  precision_score(y_test, y_pred),
                                  recall_score(y_test, y_pred),
                                  f1_score(y_test, y_pred),
                                  roc_auc_score(y_test, y_proba[:, 1])]

Fitting 3 folds for each of 180 candidates, totalling 540 fits
Elapsed time: 112.73 seconds
{'max_depth': 6, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


## 6. XGBoost

In [52]:
# Rebuild the modelling data from df so this section is independent of the
# variables left behind by the previous model section.
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

# Same stratified 80/20 split and seed as the other model sections.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

# Under-sample the training portion only (Tomek links).
under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

# Standardise using statistics of the resampled training data.
scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
# Preserve X_test's row labels so it remains index-aligned with y_test
# (omitting index=... would replace them with a fresh RangeIndex).
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

In [53]:
xg = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric="auc",
    random_state=42,
    use_label_encoder=False,
)

# Restrict both sets to the features picked for XGBoost.
X_resampled_xgb = X_resampled[xgb_10_features]
X_test_xgb = X_test[xgb_10_features]

# Hyper-parameter grid: 3 * 4 * 4 * 9 * 5 = 2160 candidates.
xg_grid = {
    "n_estimators": [100, 500, 1000],
    "colsample_bytree": [0.1, 0.3, 0.5, 0.8],
    "learning_rate": [0.03, 0.1, 0.3, 1],
    "max_depth": list(range(2, 11)),
    "alpha": [0.1, 0.3, 1, 3, 10],
}

# 3-fold cross-validated search scored by ROC AUC, using all available cores.
xg_cv = GridSearchCV(
    estimator=xg,
    param_grid=xg_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=3,
    verbose=2,
)

# Time the whole grid search (all candidates plus the final refit).
start = time.time()
xg_cv.fit(X_resampled_xgb, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(xg_cv.best_params_)

# Evaluate the refit best estimator on the held-out test set.
y_pred = xg_cv.predict(X_test_xgb)
y_proba = xg_cv.predict_proba(X_test_xgb)

# Label-based metrics use y_pred; AUC uses the probability of class 1.
label_metrics = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
]
df_scores.loc["XGBoost"] = [
    round(end - start, 2),
    *label_metrics,
    roc_auc_score(y_test, y_proba[:, 1]),
]

Fitting 3 folds for each of 2160 candidates, totalling 6480 fits
Elapsed time: 538.03 seconds
{'alpha': 10, 'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}


## 7. CatBoost

In [58]:
# Unlike the other model sections, chapter_ID is kept: it is passed to CatBoost
# as a categorical feature. No under-sampling is applied in this section.
X = df.drop(["wont_renew"], axis=1).copy()
y = df["wont_renew"].copy()

# Same stratified 80/20 split and seed as the other model sections.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

# Standardise only the numeric columns. chapter_ID is an identifier, not a
# quantity: scaling it first (as before) turned every category label into a
# stringified z-score tied to this particular scaler fit instead of the real ID.
numeric_columns = [column for column in X.columns if column != "chapter_ID"]

scaler = StandardScaler()
scaler.fit(X_train[numeric_columns])

# index=... keeps the scaled frames aligned with y_train / y_test.
X_train_scaled = pd.DataFrame(scaler.transform(X_train[numeric_columns]),
                              columns=numeric_columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test[numeric_columns]),
                             columns=numeric_columns, index=X_test.index)

# Re-attach the untouched chapter IDs as strings for CatBoost's categorical handling.
X_train_scaled["chapter_ID"] = X_train["chapter_ID"].astype("str")
X_test_scaled["chapter_ID"] = X_test["chapter_ID"].astype("str")

# Restore the original column order.
X_train = X_train_scaled[X.columns]
X_test = X_test_scaled[X.columns]

cb = CatBoostClassifier(early_stopping_rounds=20, eval_metric="AUC", verbose=0, cat_features=["chapter_ID"])

# Restrict both sets to the features picked for CatBoost.
# NOTE(review): cat_features=["chapter_ID"] requires chapter_ID to be present in
# catboost_10_features -- confirm against the feature-selection output.
X_train_cat = X_train[catboost_10_features]
X_test_cat = X_test[catboost_10_features]

# Hyper-parameter grid: 3 * 4 * 5 * 4 = 240 candidates.
cb_grid = {
    "iterations": [100, 500, 1000],
    "learning_rate": [0.03, 0.1, 0.3, 1],
    "l2_leaf_reg": [1, 3, 5, 7, 9],
    "depth": [4, 6, 8, 10],
}

# CatBoost's built-in grid search; the timing covers the whole search.
# NOTE(review): the predictions below rely on grid_search leaving `cb` fitted with
# the best parameters (its documented default, refit=True) -- confirm for the
# installed CatBoost version.
start = time.time()
grid_search_result = cb.grid_search(cb_grid, X=X_train_cat, y=y_train, plot=True)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(grid_search_result)

# Evaluate on the held-out test set.
y_pred = cb.predict(X_test_cat)
y_proba = cb.predict_proba(X_test_cat)
# Label-based metrics use y_pred; AUC uses the probability of class 1.
df_scores.loc["CatBoost"] = [round(end - start, 2),
                             accuracy_score(y_test, y_pred),
                             precision_score(y_test, y_pred),
                             recall_score(y_test, y_pred),
                             f1_score(y_test, y_pred),
                             roc_auc_score(y_test, y_proba[:, 1])]

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6999434176
bestIteration = 2

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	loss: 0.6999434	best: 0.6999434 (0)	total: 393ms	remaining: 1m 34s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7058091286
bestIteration = 9

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
1:	loss: 0.7058091	best: 0.7058091 (1)	total: 1.14s	remaining: 2m 15s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7024707657
bestIteration = 4

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
2:	loss: 0.7024708	best: 0.7058091 (1)	total: 1.63s	remaining: 2m 8s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6873632591
bestIteration = 

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.702640513
bestIteration = 6

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
28:	loss: 0.7026405	best: 0.7073180 (20)	total: 21.9s	remaining: 2m 39s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6965296115
bestIteration = 8

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
29:	loss: 0.6965296	best: 0.7073180 (20)	total: 22.5s	remaining: 2m 37s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6998679743
bestIteration = 8

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
30:	loss: 0.6998680	best: 0.7073180 (20)	total: 23.1s	remaining: 2m 35s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6846850245
bestIterat

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6807808374
bestIteration = 1

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
55:	loss: 0.6807808	best: 0.7073180 (20)	total: 39.5s	remaining: 2m 9s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7023387401
bestIteration = 8

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
56:	loss: 0.7023387	best: 0.7073180 (20)	total: 40.1s	remaining: 2m 8s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6956431535
bestIteration = 6

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
57:	loss: 0.6956432	best: 0.7073180 (20)	total: 40.6s	remaining: 2m 7s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7003394945
bestIteratio

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6576952094
bestIteration = 1

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
83:	loss: 0.6576952	best: 0.7090155 (77)	total: 59.2s	remaining: 1m 49s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6934741607
bestIteration = 13

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
84:	loss: 0.6934742	best: 0.7090155 (77)	total: 1m	remaining: 1m 49s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6965861939
bestIteration = 3

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
85:	loss: 0.6965862	best: 0.7090155 (77)	total: 1m	remaining: 1m 49s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6801772916
bestIteration 

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6858543946
bestIteration = 4

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
110:	loss: 0.6858544	best: 0.7090155 (77)	total: 1m 22s	remaining: 1m 35s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6923425123
bestIteration = 30

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
111:	loss: 0.6923425	best: 0.7090155 (77)	total: 1m 23s	remaining: 1m 35s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.690607318
bestIteration = 14

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
112:	loss: 0.6906073	best: 0.7090155 (77)	total: 1m 24s	remaining: 1m 34s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6901735194
be

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6906261788
bestIteration = 2

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
137:	loss: 0.6906262	best: 0.7112033 (125)	total: 1m 53s	remaining: 1m 23s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7042248208
bestIteration = 4

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
138:	loss: 0.7042248	best: 0.7112033 (125)	total: 1m 54s	remaining: 1m 22s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.684835911
bestIteration = 8

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
139:	loss: 0.6848359	best: 0.7112033 (125)	total: 1m 55s	remaining: 1m 22s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.694907582
be

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6997736703
bestIteration = 41

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
164:	loss: 0.6997737	best: 0.7112033 (125)	total: 2m 48s	remaining: 1m 16s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6858543946
bestIteration = 24

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
165:	loss: 0.6858544	best: 0.7112033 (125)	total: 2m 50s	remaining: 1m 16s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6811769144
bestIteration = 3

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
166:	loss: 0.6811769	best: 0.7112033 (125)	total: 2m 51s	remaining: 1m 14s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.671633345

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.657827235
bestIteration = 1

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
191:	loss: 0.6578272	best: 0.7112033 (125)	total: 3m 33s	remaining: 53.4s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.698038476
bestIteration = 35

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
192:	loss: 0.6980385	best: 0.7112033 (125)	total: 3m 36s	remaining: 52.6s

bestTest = 0.69883063
bestIteration = 83

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
193:	loss: 0.6988306	best: 0.7112033 (125)	total: 3m 40s	remaining: 52.3s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6527348171
bestIteration = 15

Metric AUC is not calculated on train b

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.679253112
bestIteration = 9

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
218:	loss: 0.6792531	best: 0.7112033 (125)	total: 4m 31s	remaining: 26s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6698981516
bestIteration = 71

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
219:	loss: 0.6698982	best: 0.7112033 (125)	total: 4m 35s	remaining: 25.1s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6883062995
bestIteration = 37

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
220:	loss: 0.6883063	best: 0.7112033 (125)	total: 4m 38s	remaining: 23.9s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6672953602
best

## Compare results

In [59]:
# Rank the models by test-set AUC, best first.
df_scores.sort_values(by="auc", ascending=False)

Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
XGBoost,538.03,0.722096,0.565657,0.414815,0.478632,0.70653
logistic_regression,0.07,0.712984,1.0,0.066667,0.125,0.698879
random_forest,112.73,0.719818,0.583333,0.311111,0.405797,0.694298
CatBoost,330.69,0.712984,0.578947,0.244444,0.34375,0.666179


In [60]:
# Persist the comparison table (model names are written as the index column).
df_scores.to_csv(path_or_buf="model_scores_step7.csv")