In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', 30)

In [2]:
df = pd.read_csv("data/df_post_EDA.csv", index_col=0)
df.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,chapter_size,chapter_retention_rate,chapter_growth_rate,seat_popularity_rate,total_meetings,wont_renew
0,0,31,2016-04-01,33,0,0,1,0,21,7,33,32,1,37,129784,45,0,32,0.315789,1.421053,0.076923,34,0
1,2,18,2017-05-01,33,2,2,0,0,6,17,11,22,1,26,9285,1,0,34,0.71875,1.09375,0.266667,37,0
2,2,18,2018-05-01,35,1,0,0,2,6,24,19,36,1,20,7263,10,1,21,0.424242,0.636364,0.210526,38,0
3,2,18,2019-05-01,36,0,0,0,1,9,8,23,19,3,28,1860,31,2,22,0.73913,1.217391,0.263158,37,0
4,2,18,2020-05-01,33,1,0,0,3,10,13,19,47,4,30,6668,16,3,26,0.76,1.12,0.277778,37,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2191 entries, 0 to 2428
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_ID                 2191 non-null   int64  
 1   chapter_ID              2191 non-null   int64  
 2   relative_renewal_date   2191 non-null   object 
 3   P                       2191 non-null   int64  
 4   A                       2191 non-null   int64  
 5   L                       2191 non-null   int64  
 6   M                       2191 non-null   int64  
 7   S                       2191 non-null   int64  
 8   RGI                     2191 non-null   int64  
 9   RGO                     2191 non-null   int64  
 10  RRI                     2191 non-null   int64  
 11  RRO                     2191 non-null   int64  
 12  V                       2191 non-null   int64  
 13  1-2-1                   2191 non-null   int64  
 14  TYFCB                   2191 non-null   

### Drop & split

In [4]:
df.drop(["user_ID",
         "relative_renewal_date",
         "year_of_membership",
         "chapter_size",
         "chapter_retention_rate",
         "chapter_growth_rate",
         "seat_popularity_rate",
         "total_meetings"], axis=1, inplace=True)

In [5]:
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

In [8]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1752 entries, 0 to 1751
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   P       1752 non-null   float64
 1   A       1752 non-null   float64
 2   L       1752 non-null   float64
 3   M       1752 non-null   float64
 4   S       1752 non-null   float64
 5   RGI     1752 non-null   float64
 6   RGO     1752 non-null   float64
 7   RRI     1752 non-null   float64
 8   RRO     1752 non-null   float64
 9   V       1752 non-null   float64
 10  1-2-1   1752 non-null   float64
 11  TYFCB   1752 non-null   float64
 12  CEU     1752 non-null   float64
dtypes: float64(13)
memory usage: 178.1 KB


# Model

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier, Pool
import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [11]:
df_scores = pd.DataFrame(columns=["training_time [s]", "accuracy", "precision", "recall", "f1", "auc"])

## 1. Logistic Regression

In [12]:
logreg = LogisticRegression(max_iter=2000)

start = time.time()
logreg.fit(X_train, y_train)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = logreg.predict(X_test)
y_proba = logreg.predict_proba(X_test)

df_scores.loc["logistic_regression"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]
df_scores

Elapsed time: 0 minutes 0.01 seconds


Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
logistic_regression,0.01,0.728929,0.666667,0.237037,0.349727,0.695273


## 2. K-neighbors classifier

In [13]:
knn = KNeighborsClassifier()

start = time.time()
knn.fit(X_train, y_train)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = knn.predict(X_test)
y_proba = knn.predict_proba(X_test)

df_scores.loc["knn"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.01 seconds


## 3. Decision Tree

In [14]:
dt = DecisionTreeClassifier()

start = time.time()
dt.fit(X_train, y_train)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = dt.predict(X_test)
y_proba = dt.predict_proba(X_test)

df_scores.loc["decision_tree"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.01 seconds


## 4. Random Forest

In [15]:
rf = RandomForestClassifier()

start = time.time()
rf.fit(X_train, y_train)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)

df_scores.loc["random_forest"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.27 seconds


## 5. AdaBoost

In [16]:
ada = AdaBoostClassifier()

start = time.time()
ada.fit(X_train, y_train)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = ada.predict(X_test)
y_proba = ada.predict_proba(X_test)

df_scores.loc["AdaBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.11 seconds


## 6. XGBoost

In [17]:
xg = xgb.XGBClassifier(objective='binary:logistic', eval_metric="auc", use_label_encoder=False)

start = time.time()
xg.fit(X_train, y_train)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = xg.predict(X_test)
y_proba = xg.predict_proba(X_test)

df_scores.loc["XGBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.11 seconds


## 7. CatBoost

In [18]:
X = df.drop(["wont_renew"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

scaler = StandardScaler()
scaler.fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

X_train["chapter_ID"] = X_train["chapter_ID"].astype("str")
X_test["chapter_ID"] = X_test["chapter_ID"].astype("str")

cb = CatBoostClassifier()

start = time.time()
cb.fit(X_train, y_train, verbose=100, cat_features=["chapter_ID"])
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = cb.predict(X_test)
y_proba = cb.predict_proba(X_test)

df_scores.loc["CatBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Learning rate set to 0.01309
0:	learn: 0.6895886	total: 188ms	remaining: 3m 7s
100:	learn: 0.5476949	total: 2.75s	remaining: 24.5s
200:	learn: 0.5109564	total: 5.24s	remaining: 20.8s
300:	learn: 0.4853639	total: 7.57s	remaining: 17.6s
400:	learn: 0.4640459	total: 10.1s	remaining: 15.1s
500:	learn: 0.4448940	total: 12.5s	remaining: 12.4s
600:	learn: 0.4228027	total: 14.9s	remaining: 9.87s
700:	learn: 0.4010647	total: 17.2s	remaining: 7.35s
800:	learn: 0.3801725	total: 19.8s	remaining: 4.91s
900:	learn: 0.3604510	total: 22.4s	remaining: 2.46s
999:	learn: 0.3424114	total: 24.8s	remaining: 0us
Elapsed time: 0 minutes 24.89 seconds


# Compare results

In [19]:
df_scores.sort_values("auc", ascending=False)

Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
logistic_regression,0.01,0.728929,0.666667,0.237037,0.349727,0.695273
CatBoost,24.89,0.719818,0.625,0.222222,0.327869,0.694664
AdaBoost,0.11,0.726651,0.61194,0.303704,0.405941,0.694652
random_forest,0.27,0.733485,0.673077,0.259259,0.374332,0.671162
XGBoost,0.11,0.703872,0.532468,0.303704,0.386792,0.64732
knn,0.01,0.665148,0.414286,0.214815,0.282927,0.591435
decision_tree,0.01,0.640091,0.41844,0.437037,0.427536,0.58365


In [20]:
df_scores.to_csv("model_scores_step2.csv")