In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Let pandas display up to 30 columns so wide frames are not truncated in outputs.
pd.set_option('display.max_columns', 30)

In [2]:
# Load the modelling dataset (the file name suggests it is the output of an
# earlier EDA step); the first CSV column is used as the row index.
df = pd.read_csv("data/df_post_EDA.csv", index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,chapter_size,chapter_retention_rate,chapter_growth_rate,seat_popularity_rate,total_meetings,wont_renew
0,0,31,2016-04-01,33,0,0,1,0,21,7,33,32,1,37,129784,45,0,32,0.315789,1.421053,0.076923,34,0
1,2,18,2017-05-01,33,2,2,0,0,6,17,11,22,1,26,9285,1,0,34,0.71875,1.09375,0.266667,37,0
2,2,18,2018-05-01,35,1,0,0,2,6,24,19,36,1,20,7263,10,1,21,0.424242,0.636364,0.210526,38,0
3,2,18,2019-05-01,36,0,0,0,1,9,8,23,19,3,28,1860,31,2,22,0.73913,1.217391,0.263158,37,0
4,2,18,2020-05-01,33,1,0,0,3,10,13,19,47,4,30,6668,16,3,26,0.76,1.12,0.277778,37,0


In [3]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2191 entries, 0 to 2428
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_ID                 2191 non-null   int64  
 1   chapter_ID              2191 non-null   int64  
 2   relative_renewal_date   2191 non-null   object 
 3   P                       2191 non-null   int64  
 4   A                       2191 non-null   int64  
 5   L                       2191 non-null   int64  
 6   M                       2191 non-null   int64  
 7   S                       2191 non-null   int64  
 8   RGI                     2191 non-null   int64  
 9   RGO                     2191 non-null   int64  
 10  RRI                     2191 non-null   int64  
 11  RRO                     2191 non-null   int64  
 12  V                       2191 non-null   int64  
 13  1-2-1                   2191 non-null   int64  
 14  TYFCB                   2191 non-null   

### Drop & split

In [4]:
# Drop the columns that are not used as model inputs in this notebook.
# What remains is chapter_ID, the columns P ... CEU and the target `wont_renew`.
# The result is reassigned rather than dropped with `inplace=True`, so the
# step is an ordinary expression and no object is mutated behind the name.
df = df.drop(
    columns=[
        "user_ID",
        "relative_renewal_date",
        "year_of_membership",
        "chapter_size",
        "chapter_retention_rate",
        "chapter_growth_rate",
        "seat_popularity_rate",
        "total_meetings",
    ]
)

In [5]:
# Features: every remaining column except the target and chapter_ID.
X = df.drop(columns=["wont_renew", "chapter_ID"]).copy()
# Target column.
y = df["wont_renew"].copy()

# Stratified 80/20 train/test split with a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=710,
)

In [6]:
# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)

# transform() returns plain arrays; wrap them back into DataFrames to keep the
# column names. The rebuilt frames get a fresh 0..n-1 index.
X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X.columns,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X.columns,
)

In [7]:
# Sanity check of the scaled training frame: row count, columns and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1752 entries, 0 to 1751
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   P       1752 non-null   float64
 1   A       1752 non-null   float64
 2   L       1752 non-null   float64
 3   M       1752 non-null   float64
 4   S       1752 non-null   float64
 5   RGI     1752 non-null   float64
 6   RGO     1752 non-null   float64
 7   RRI     1752 non-null   float64
 8   RRO     1752 non-null   float64
 9   V       1752 non-null   float64
 10  1-2-1   1752 non-null   float64
 11  TYFCB   1752 non-null   float64
 12  CEU     1752 non-null   float64
dtypes: float64(13)
memory usage: 178.1 KB


# Model

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier, Pool
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [9]:
# Empty results table; each model cell below adds one row, keyed by model name,
# holding the grid-search wall time and the test-set metrics.
df_scores = pd.DataFrame(columns=["training_time [s]", "accuracy", "precision", "recall", "f1", "auc"])

## 1. Logistic Regression

In [18]:
# Logistic regression; only the regularisation parameter C is tuned.
logreg = LogisticRegression(max_iter=2000)
logreg_grid = {"C": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]}

# 3-fold grid search on the training split, selecting by ROC AUC, on all cores.
logreg_cv = GridSearchCV(
    estimator=logreg,
    param_grid=logreg_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=3,
    verbose=5,
)

# Wall-clock time of the whole search.
start = time.time()
logreg_cv.fit(X_train, y_train)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(logreg_cv.best_params_)

# Evaluate the selected model on the held-out test split.
y_pred = logreg_cv.predict(X_test)
y_proba = logreg_cv.predict_proba(X_test)

# Row order must match the df_scores columns.
df_scores.loc["logistic_regression"] = [
    round(end - start, 2),
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_proba[:, 1]),  # probability of the positive class
]
df_scores

Fitting 3 folds for each of 11 candidates, totalling 33 fits
Elapsed time: 0 minutes 0.08 seconds
{'C': 0.01}


Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
logistic_regression,0.08,0.728929,0.710526,0.2,0.312139,0.700414


## 2. K-neighbors classifier

In [23]:
# k-nearest neighbours; tune k (odd values 3..29) and the vote weighting.
knn = KNeighborsClassifier()
knn_grid = {
    "n_neighbors": list(range(3, 30, 2)),
    "weights": ["uniform", "distance"],
}

# 3-fold grid search on the training split, selecting by ROC AUC, on all cores.
knn_cv = GridSearchCV(
    estimator=knn,
    param_grid=knn_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=3,
    verbose=5,
)

# Wall-clock time of the whole search.
start = time.time()
knn_cv.fit(X_train, y_train)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(knn_cv.best_params_)

# Evaluate the selected model on the held-out test split.
y_pred = knn_cv.predict(X_test)
y_proba = knn_cv.predict_proba(X_test)

# Row order must match the df_scores columns.
df_scores.loc["knn"] = [
    round(end - start, 2),
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_proba[:, 1]),  # probability of the positive class
]

Fitting 3 folds for each of 28 candidates, totalling 84 fits
Elapsed time: 0 minutes 0.35 seconds
{'n_neighbors': 29, 'weights': 'distance'}


## 3. Decision Tree

In [25]:
# Decision tree; tune depth and the minimum split / leaf sizes.
# random_state is fixed (same seed as the train/test split) so that the
# selected parameters and the reported scores are reproducible across runs.
dt = DecisionTreeClassifier(random_state=710)
dt_grid = {
    "max_depth": list(range(2, 20, 2)),
    "min_samples_split": list(range(2, 10, 2)),
    "min_samples_leaf": list(range(1, 9, 1)),
}

# 3-fold grid search on the training split, selecting by ROC AUC, on all cores.
dt_cv = GridSearchCV(
    estimator=dt,
    param_grid=dt_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=3,
    verbose=5,
)

# Wall-clock time of the whole search.
start = time.time()
dt_cv.fit(X_train, y_train)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(dt_cv.best_params_)

# Evaluate the selected model on the held-out test split.
y_pred = dt_cv.predict(X_test)
y_proba = dt_cv.predict_proba(X_test)

# Row order must match the df_scores columns.
df_scores.loc["decision_tree"] = [
    round(end - start, 2),
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_proba[:, 1]),  # probability of the positive class
]

Fitting 3 folds for each of 288 candidates, totalling 864 fits
Elapsed time: 0 minutes 1.43 seconds
{'max_depth': 4, 'min_samples_leaf': 6, 'min_samples_split': 4}


## 4. Random Forest

In [26]:
# Random forest; tune the number of trees, depth and minimum split / leaf sizes.
# random_state is fixed (same seed as the train/test split): the forest is
# built from random bootstrap samples and feature subsets, so without a seed
# the selected parameters and the reported scores change from run to run.
rf = RandomForestClassifier(random_state=710)
rf_grid = {
    "n_estimators": [100, 500, 1000],
    "max_depth": list(range(2, 10, 2)),
    "min_samples_split": list(range(2, 7, 2)),
    "min_samples_leaf": list(range(1, 6, 1)),
}

# 3-fold grid search on the training split, selecting by ROC AUC, on all cores.
rf_cv = GridSearchCV(
    estimator=rf,
    param_grid=rf_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=3,
    verbose=5,
)

# Wall-clock time of the whole search.
start = time.time()
rf_cv.fit(X_train, y_train)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(rf_cv.best_params_)

# Evaluate the selected model on the held-out test split.
y_pred = rf_cv.predict(X_test)
y_proba = rf_cv.predict_proba(X_test)

# Row order must match the df_scores columns.
df_scores.loc["random_forest"] = [
    round(end - start, 2),
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_proba[:, 1]),  # probability of the positive class
]

Fitting 3 folds for each of 240 candidates, totalling 720 fits
Elapsed time: 3 minutes 208.49 seconds
{'max_depth': 4, 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 100}


## 5. AdaBoost

In [27]:
# AdaBoost; tune the number of boosting rounds and the learning rate.
# random_state is fixed (same seed as the train/test split) so that the
# selected parameters and the reported scores are reproducible across runs.
ada = AdaBoostClassifier(random_state=710)
ada_grid = {
    "n_estimators": [50, 100, 500, 1000],
    "learning_rate": [0.03, 0.1, 0.3, 1],
}

# 3-fold grid search on the training split, selecting by ROC AUC, on all cores.
ada_cv = GridSearchCV(
    estimator=ada,
    param_grid=ada_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=3,
    verbose=5,
)

# Wall-clock time of the whole search.
start = time.time()
ada_cv.fit(X_train, y_train)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(ada_cv.best_params_)

# Evaluate the selected model on the held-out test split.
y_pred = ada_cv.predict(X_test)
y_proba = ada_cv.predict_proba(X_test)

# Row order must match the df_scores columns.
df_scores.loc["AdaBoost"] = [
    round(end - start, 2),
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_proba[:, 1]),  # probability of the positive class
]

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Elapsed time: 0 minutes 9.55 seconds
{'learning_rate': 0.1, 'n_estimators': 50}


## 6. XGBoost

In [29]:
# Gradient-boosted trees (XGBoost) with a binary logistic objective.
xg = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric="auc",
    use_label_encoder=False,
)

# Tuned: number of trees, per-tree column subsampling, learning rate,
# tree depth and the `alpha` regularisation term.
xg_grid = {
    "n_estimators": [100, 500, 1000],
    "colsample_bytree": [0.1, 0.3, 0.5, 0.8],
    "learning_rate": [0.03, 0.1, 0.3, 1],
    "max_depth": list(range(2, 11, 1)),
    "alpha": [0.1, 0.3, 1, 3, 10],
}

# 3-fold grid search on the training split, selecting by ROC AUC, on all cores.
xg_cv = GridSearchCV(
    estimator=xg,
    param_grid=xg_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=3,
    verbose=5,
)

# Wall-clock time of the whole search.
start = time.time()
xg_cv.fit(X_train, y_train)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(xg_cv.best_params_)

# Evaluate the selected model on the held-out test split.
y_pred = xg_cv.predict(X_test)
y_proba = xg_cv.predict_proba(X_test)

# Row order must match the df_scores columns.
df_scores.loc["XGBoost"] = [
    round(end - start, 2),
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_proba[:, 1]),  # probability of the positive class
]

Fitting 3 folds for each of 2160 candidates, totalling 6480 fits
Elapsed time: 9 minutes 564.69 seconds
{'alpha': 1, 'colsample_bytree': 0.1, 'learning_rate': 0.03, 'max_depth': 7, 'n_estimators': 100}


## 7. CatBoost

In [31]:
# CatBoost is given chapter_ID as a categorical feature, so this cell builds
# its own feature frames. They use *_cb names so that the X_train / X_test /
# scaler objects used by the other model cells are not overwritten.
cat_cols = ["chapter_ID"]
X_cb = df.drop(columns=["wont_renew"]).copy()
y_cb = df["wont_renew"].copy()
num_cols = [col for col in X_cb.columns if col not in cat_cols]

# Same split settings (stratified, 80/20, seed 710) as the other models.
X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(
    X_cb,
    y_cb,
    stratify=y_cb,
    test_size=0.2,
    random_state=710,
)

# Standardise the numeric columns only, using training-split statistics.
# chapter_ID is an identifier: scaling it first would turn the IDs into
# float strings once the column is cast to str.
scaler_cb = StandardScaler().fit(X_train_cb[num_cols])


def _prepare_cb(frame):
    """Return `frame` with numeric columns scaled and categorical columns as str.

    The row index and the column order of `frame` are preserved.
    """
    prepared = pd.DataFrame(
        scaler_cb.transform(frame[num_cols]),
        columns=num_cols,
        index=frame.index,
    )
    for col in cat_cols:
        prepared[col] = frame[col].astype("str")
    return prepared[list(frame.columns)]


X_train_cb = _prepare_cb(X_train_cb)
X_test_cb = _prepare_cb(X_test_cb)

# NOTE(review): early_stopping_rounds presumably has no effect here because no
# eval_set is passed to fit() -- confirm against the CatBoost documentation.
cb = CatBoostClassifier(early_stopping_rounds=20, cat_features=cat_cols)

# Tuned: boosting iterations, learning rate, L2 leaf regularisation, tree depth.
cb_grid = {
    "iterations": [100, 500, 1000],
    "learning_rate": [0.03, 0.1, 0.3, 1],
    "l2_leaf_reg": [1, 3, 5, 7, 9],
    "depth": [4, 6, 8, 10],
}

# 3-fold grid search on the training split, selecting by ROC AUC, on all cores.
cb_cv = GridSearchCV(
    estimator=cb,
    param_grid=cb_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=3,
    verbose=5,
)

# Wall-clock time of the whole search; verbose=0 is forwarded to CatBoost's fit
# to silence its per-iteration log.
start = time.time()
cb_cv.fit(X_train_cb, y_train_cb, verbose=0)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(cb_cv.best_params_)

# Evaluate the selected model on the held-out test split.
y_pred = cb_cv.predict(X_test_cb)
y_proba = cb_cv.predict_proba(X_test_cb)

# Row order must match the df_scores columns.
df_scores.loc["CatBoost"] = [
    round(end - start, 2),
    accuracy_score(y_test_cb, y_pred),
    precision_score(y_test_cb, y_pred),
    recall_score(y_test_cb, y_pred),
    f1_score(y_test_cb, y_pred),
    roc_auc_score(y_test_cb, y_proba[:, 1]),  # probability of the positive class
]

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Elapsed time: 2 minutes 147.09 seconds
{'depth': 10}


# Compare results

In [32]:
# Rank the models by test-set ROC AUC, best first (returns a sorted copy).
df_scores.sort_values("auc", ascending=False)

Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
XGBoost,564.69,0.708428,0.684211,0.096296,0.168831,0.71423
AdaBoost,9.55,0.724374,0.675,0.2,0.308571,0.703289
logistic_regression,0.08,0.728929,0.710526,0.2,0.312139,0.700414
random_forest,208.49,0.715262,0.647059,0.162963,0.260355,0.698514
CatBoost,147.09,0.703872,0.537313,0.266667,0.356436,0.674488
knn,0.35,0.719818,0.666667,0.177778,0.280702,0.657724
decision_tree,1.43,0.71754,0.641026,0.185185,0.287356,0.632736


In [19]:
# Persist the score table to the working directory; the model names (the index)
# are written as the first CSV column.
df_scores.to_csv("model_scores_step3.csv")