In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', 30)

In [2]:
df = pd.read_csv("data/df_post_EDA.csv", index_col=0)
df.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,chapter_size,chapter_retention_rate,chapter_growth_rate,seat_popularity_rate,total_meetings,wont_renew
0,0,31,2016-04-01,33,0,0,1,0,21,7,33,32,1,37,129784,45,0,32,0.315789,1.421053,0.076923,34,0
1,2,18,2017-05-01,33,2,2,0,0,6,17,11,22,1,26,9285,1,0,34,0.71875,1.09375,0.266667,37,0
2,2,18,2018-05-01,35,1,0,0,2,6,24,19,36,1,20,7263,10,1,21,0.424242,0.636364,0.210526,38,0
3,2,18,2019-05-01,36,0,0,0,1,9,8,23,19,3,28,1860,31,2,22,0.73913,1.217391,0.263158,37,0
4,2,18,2020-05-01,33,1,0,0,3,10,13,19,47,4,30,6668,16,3,26,0.76,1.12,0.277778,37,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2191 entries, 0 to 2428
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_ID                 2191 non-null   int64  
 1   chapter_ID              2191 non-null   int64  
 2   relative_renewal_date   2191 non-null   object 
 3   P                       2191 non-null   int64  
 4   A                       2191 non-null   int64  
 5   L                       2191 non-null   int64  
 6   M                       2191 non-null   int64  
 7   S                       2191 non-null   int64  
 8   RGI                     2191 non-null   int64  
 9   RGO                     2191 non-null   int64  
 10  RRI                     2191 non-null   int64  
 11  RRO                     2191 non-null   int64  
 12  V                       2191 non-null   int64  
 13  1-2-1                   2191 non-null   int64  
 14  TYFCB                   2191 non-null   

### Drop & split

In [4]:
df.drop(["user_ID",
         "relative_renewal_date",
         "year_of_membership",
         "chapter_size",
         "chapter_retention_rate",
         "chapter_growth_rate",
         "seat_popularity_rate",
         "total_meetings"], axis=1, inplace=True)

# Re-sample

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier, Pool
import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks, NeighbourhoodCleaningRule
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

## Strategy 1: downsample majority

In [6]:
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

print("Original dataset shape: %s" %Counter(y_train))

Original dataset shape: Counter({0: 1215, 1: 537})


In [7]:
under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)
print("Downsampled dataset shape: %s" %Counter(y_resampled))

Downsampled dataset shape: Counter({0: 1005, 1: 537})


In [8]:
scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [9]:
X_resampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1542 entries, 0 to 1541
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   P       1542 non-null   float64
 1   A       1542 non-null   float64
 2   L       1542 non-null   float64
 3   M       1542 non-null   float64
 4   S       1542 non-null   float64
 5   RGI     1542 non-null   float64
 6   RGO     1542 non-null   float64
 7   RRI     1542 non-null   float64
 8   RRO     1542 non-null   float64
 9   V       1542 non-null   float64
 10  1-2-1   1542 non-null   float64
 11  TYFCB   1542 non-null   float64
 12  CEU     1542 non-null   float64
dtypes: float64(13)
memory usage: 156.7 KB


# Model

In [10]:
df_scores1 = pd.DataFrame(columns=["training_time [s]", "accuracy", "precision", "recall", "f1", "auc"])

## 1. Logistic Regression

In [11]:
logreg = LogisticRegression(max_iter=2000)

start = time.time()
logreg.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")

y_pred = logreg.predict(X_test)
y_proba = logreg.predict_proba(X_test)

df_scores1.loc["logistic_regression"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]
df_scores1

Elapsed time: 0.01 seconds


Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
logistic_regression,0.01,0.735763,0.637681,0.325926,0.431373,0.701072


## 2. K-neighbors classifier

In [12]:
knn = KNeighborsClassifier()

start = time.time()
knn.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = knn.predict(X_test)
y_proba = knn.predict_proba(X_test)

df_scores1.loc["knn"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.01 seconds


## 3. Decision Tree

In [13]:
dt = DecisionTreeClassifier()

start = time.time()
dt.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = dt.predict(X_test)
y_proba = dt.predict_proba(X_test)

df_scores1.loc["decision_tree"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.01 seconds


## 4. Random Forest

In [14]:
rf = RandomForestClassifier()

start = time.time()
rf.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)

df_scores1.loc["random_forest"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.25 seconds


## 5. AdaBoost

In [15]:
ada = AdaBoostClassifier()

start = time.time()
ada.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = ada.predict(X_test)
y_proba = ada.predict_proba(X_test)

df_scores1.loc["AdaBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.10 seconds


## 6. XGBoost

In [16]:
xg = xgb.XGBClassifier(objective='binary:logistic', eval_metric="auc", use_label_encoder=False)

start = time.time()
xg.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = xg.predict(X_test)
y_proba = xg.predict_proba(X_test)

df_scores1.loc["XGBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.12 seconds


## 7. CatBoost

In [17]:
X = df.drop(["wont_renew"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

scaler = StandardScaler()
scaler.fit(X_resampled)

X_train = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

X_train["chapter_ID"] = X_train["chapter_ID"].astype("str")
X_test["chapter_ID"] = X_test["chapter_ID"].astype("str")

cb = CatBoostClassifier()

start = time.time()
cb.fit(X_resampled, y_resampled, verbose=100, cat_features=["chapter_ID"])
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = cb.predict(X_test)
y_proba = cb.predict_proba(X_test)

df_scores1.loc["CatBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Learning rate set to 0.012388
0:	learn: 0.6905031	total: 186ms	remaining: 3m 5s
100:	learn: 0.5649444	total: 2.79s	remaining: 24.8s
200:	learn: 0.5228478	total: 5.38s	remaining: 21.4s
300:	learn: 0.4950302	total: 7.77s	remaining: 18s
400:	learn: 0.4751276	total: 10.2s	remaining: 15.3s
500:	learn: 0.4550220	total: 12.7s	remaining: 12.6s
600:	learn: 0.4334178	total: 15.1s	remaining: 10s
700:	learn: 0.4089121	total: 17.4s	remaining: 7.43s
800:	learn: 0.3859090	total: 19.8s	remaining: 4.92s
900:	learn: 0.3653110	total: 22.2s	remaining: 2.44s
999:	learn: 0.3467367	total: 24.6s	remaining: 0us
Elapsed time: 0 minutes 24.73 seconds


## Compare results

In [18]:
df_scores1.sort_values("auc", ascending=False)

Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
logistic_regression,0.01,0.735763,0.637681,0.325926,0.431373,0.701072
XGBoost,0.12,0.687927,0.49,0.362963,0.417021,0.678095
AdaBoost,0.1,0.719818,0.566667,0.377778,0.453333,0.674488
random_forest,0.25,0.681093,0.47191,0.311111,0.375,0.672052
CatBoost,24.73,0.307517,0.307517,1.0,0.470383,0.640765
knn,0.01,0.628702,0.365385,0.281481,0.317992,0.590521
decision_tree,0.01,0.61959,0.388889,0.414815,0.401434,0.562671


## Strategy 2: Upsample minority

In [19]:
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

print("Original dataset shape: %s" %Counter(y_train))

Original dataset shape: Counter({0: 1215, 1: 537})


In [20]:
over_sampler = SMOTE()
X_resampled, y_resampled = over_sampler.fit_resample(X_train, y_train)
print("Downsampled dataset shape: %s" %Counter(y_resampled))

Downsampled dataset shape: Counter({0: 1215, 1: 1215})


In [21]:
scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [22]:
X_resampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2430 entries, 0 to 2429
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   P       2430 non-null   float64
 1   A       2430 non-null   float64
 2   L       2430 non-null   float64
 3   M       2430 non-null   float64
 4   S       2430 non-null   float64
 5   RGI     2430 non-null   float64
 6   RGO     2430 non-null   float64
 7   RRI     2430 non-null   float64
 8   RRO     2430 non-null   float64
 9   V       2430 non-null   float64
 10  1-2-1   2430 non-null   float64
 11  TYFCB   2430 non-null   float64
 12  CEU     2430 non-null   float64
dtypes: float64(13)
memory usage: 246.9 KB


## Model

In [23]:
df_scores2 = pd.DataFrame(columns=["training_time [s]", "accuracy", "precision", "recall", "f1", "auc"])

## 1. Logistic Regression

In [24]:
logreg = LogisticRegression(max_iter=2000)

start = time.time()
logreg.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")

y_pred = logreg.predict(X_test)
y_proba = logreg.predict_proba(X_test)

df_scores2.loc["logistic_regression"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]
df_scores2

Elapsed time: 0.01 seconds


Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
logistic_regression,0.01,0.633257,0.419753,0.503704,0.457912,0.656287


## 2. K-neighbors classifier

In [25]:
knn = KNeighborsClassifier()

start = time.time()
knn.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = knn.predict(X_test)
y_proba = knn.predict_proba(X_test)

df_scores2.loc["knn"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.01 seconds


## 3. Decision Tree

In [26]:
dt = DecisionTreeClassifier()

start = time.time()
dt.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = dt.predict(X_test)
y_proba = dt.predict_proba(X_test)

df_scores2.loc["decision_tree"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.01 seconds


## 4. Random Forest

In [27]:
rf = RandomForestClassifier()

start = time.time()
rf.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)

df_scores2.loc["random_forest"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.33 seconds


## 5. AdaBoost

In [28]:
ada = AdaBoostClassifier()

start = time.time()
ada.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = ada.predict(X_test)
y_proba = ada.predict_proba(X_test)

df_scores2.loc["AdaBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.13 seconds


## 6. XGBoost

In [29]:
xg = xgb.XGBClassifier(objective='binary:logistic', eval_metric="auc", use_label_encoder=False)

start = time.time()
xg.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = xg.predict(X_test)
y_proba = xg.predict_proba(X_test)

df_scores2.loc["XGBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.13 seconds


## 7. CatBoost

In [30]:
X = df.drop(["wont_renew"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

over_sampler = SMOTE()
X_resampled, y_resampled = over_sampler.fit_resample(X_train, y_train)

scaler = StandardScaler()
scaler.fit(X_resampled)

X_train = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

X_train["chapter_ID"] = X_train["chapter_ID"].astype("str")
X_test["chapter_ID"] = X_test["chapter_ID"].astype("str")

cb = CatBoostClassifier()

start = time.time()
cb.fit(X_resampled, y_resampled, verbose=100, cat_features=["chapter_ID"])
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = cb.predict(X_test)
y_proba = cb.predict_proba(X_test)

df_scores2.loc["CatBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Learning rate set to 0.015052
0:	learn: 0.6894730	total: 20.8ms	remaining: 20.8s
100:	learn: 0.5538733	total: 2.54s	remaining: 22.6s
200:	learn: 0.5087741	total: 5.08s	remaining: 20.2s
300:	learn: 0.4799763	total: 7.71s	remaining: 17.9s
400:	learn: 0.4555726	total: 10.3s	remaining: 15.3s
500:	learn: 0.4309249	total: 12.8s	remaining: 12.7s
600:	learn: 0.4064944	total: 15.3s	remaining: 10.2s
700:	learn: 0.3802734	total: 17.8s	remaining: 7.6s
800:	learn: 0.3568392	total: 20.7s	remaining: 5.14s
900:	learn: 0.3347268	total: 23.6s	remaining: 2.59s
999:	learn: 0.3143610	total: 26.1s	remaining: 0us
Elapsed time: 0 minutes 26.24 seconds


## Compare results

In [31]:
df_scores2.sort_values("auc", ascending=False)

Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
AdaBoost,0.13,0.658314,0.455621,0.57037,0.506579,0.683991
logistic_regression,0.01,0.633257,0.419753,0.503704,0.457912,0.656287
random_forest,0.33,0.671982,0.465116,0.444444,0.454545,0.655897
XGBoost,0.13,0.633257,0.410959,0.444444,0.427046,0.645541
knn,0.01,0.587699,0.382653,0.555556,0.453172,0.594981
decision_tree,0.01,0.615034,0.4,0.503704,0.445902,0.584089
CatBoost,26.24,0.307517,0.307517,1.0,0.470383,0.498769


## Strategy 3. Balance

In [32]:
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

print("Original dataset shape: %s" %Counter(y_train))

Original dataset shape: Counter({0: 1215, 1: 537})


In [33]:
over_sampler = SMOTE(sampling_strategy=0.7)
X_resampled, y_resampled = over_sampler.fit_resample(X_train, y_train)
print("Downsampled dataset shape: %s" %Counter(y_resampled))

Downsampled dataset shape: Counter({0: 1215, 1: 850})


In [34]:
under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_resampled, y_resampled)
print("Downsampled dataset shape: %s" %Counter(y_resampled))

Downsampled dataset shape: Counter({0: 1042, 1: 850})


In [35]:
scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

## Model

In [36]:
df_scores3 = pd.DataFrame(columns=["training_time [s]", "accuracy", "precision", "recall", "f1", "auc"])

## 1. Logistic Regression

In [37]:
logreg = LogisticRegression(max_iter=2000)

start = time.time()
logreg.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")

y_pred = logreg.predict(X_test)
y_proba = logreg.predict_proba(X_test)

df_scores3.loc["logistic_regression"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]
df_scores3

Elapsed time: 0.00 seconds


Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
logistic_regression,0.0,0.676538,0.47482,0.488889,0.481752,0.674732


## 2. K-nearest neighbors

In [38]:
knn = KNeighborsClassifier()

start = time.time()
knn.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = knn.predict(X_test)
y_proba = knn.predict_proba(X_test)

df_scores3.loc["knn"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.00 seconds


## 3. Decision Tree

In [39]:
dt = DecisionTreeClassifier()

start = time.time()
dt.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = dt.predict(X_test)
y_proba = dt.predict_proba(X_test)

df_scores3.loc["decision_tree"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.01 seconds


## 4. Random Forest

In [40]:
rf = RandomForestClassifier()

start = time.time()
rf.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)

df_scores3.loc["random_forest"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.29 seconds


## 5. AdaBoost

In [41]:
ada = AdaBoostClassifier()

start = time.time()
ada.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = ada.predict(X_test)
y_proba = ada.predict_proba(X_test)

df_scores3.loc["AdaBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.11 seconds


## 6. XGBoost

In [42]:
xg = xgb.XGBClassifier(objective='binary:logistic', eval_metric="auc", use_label_encoder=False)

start = time.time()
xg.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = xg.predict(X_test)
y_proba = xg.predict_proba(X_test)

df_scores3.loc["XGBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0 minutes 0.11 seconds


## 7. CatBoost

In [43]:
X = df.drop(["wont_renew"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

over_sampler = SMOTE(sampling_strategy=0.7)
X_resampled, y_resampled = over_sampler.fit_resample(X_train, y_train)
print("Downsampled dataset shape: %s" %Counter(y_resampled))

under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_resampled, y_resampled)
print("Downsampled dataset shape: %s" %Counter(y_resampled))

scaler = StandardScaler()
scaler.fit(X_resampled)

X_train = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

X_train["chapter_ID"] = X_train["chapter_ID"].astype("str")
X_test["chapter_ID"] = X_test["chapter_ID"].astype("str")

cb = CatBoostClassifier()

start = time.time()
cb.fit(X_resampled, y_resampled, verbose=100, cat_features=["chapter_ID"])
end = time.time()
print(f"Elapsed time: {(end - start) // 60:.0f} minutes {(end - start):.2f} seconds")

y_pred = cb.predict(X_test)
y_proba = cb.predict_proba(X_test)

df_scores3.loc["CatBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Downsampled dataset shape: Counter({0: 1215, 1: 850})
Downsampled dataset shape: Counter({0: 1043, 1: 850})
Learning rate set to 0.013529
0:	learn: 0.6908821	total: 19ms	remaining: 19s
100:	learn: 0.5716746	total: 2.38s	remaining: 21.1s
200:	learn: 0.5305292	total: 4.77s	remaining: 19s
300:	learn: 0.5022435	total: 7.16s	remaining: 16.6s
400:	learn: 0.4780240	total: 9.56s	remaining: 14.3s
500:	learn: 0.4570288	total: 12s	remaining: 11.9s
600:	learn: 0.4312840	total: 14.3s	remaining: 9.52s
700:	learn: 0.4058136	total: 17.1s	remaining: 7.3s
800:	learn: 0.3790067	total: 19.6s	remaining: 4.87s
900:	learn: 0.3547373	total: 22.1s	remaining: 2.43s
999:	learn: 0.3342324	total: 24.6s	remaining: 0us
Elapsed time: 0 minutes 24.75 seconds


# Compare results

In [44]:
df_scores3.sort_values("auc", ascending=False)

Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
random_forest,0.29,0.67426,0.466667,0.414815,0.439216,0.678448
logistic_regression,0.0,0.676538,0.47482,0.488889,0.481752,0.674732
AdaBoost,0.11,0.681093,0.48227,0.503704,0.492754,0.66674
XGBoost,0.11,0.67426,0.469231,0.451852,0.460377,0.648124
CatBoost,24.75,0.307517,0.307517,1.0,0.470383,0.595492
knn,0.0,0.585421,0.342282,0.377778,0.359155,0.574013
decision_tree,0.01,0.585421,0.362573,0.459259,0.405229,0.550353


In [45]:
df_scores1.index = "strat1_" + df_scores1.index
df_scores2.index = "strat2_" + df_scores2.index
df_scores3.index = "strat3_" + df_scores3.index
df_scores = pd.concat([df_scores1, df_scores2, df_scores3], axis=0)

df_scores.sort_values("auc", ascending=False)

Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
strat1_logistic_regression,0.01,0.735763,0.637681,0.325926,0.431373,0.701072
strat2_AdaBoost,0.13,0.658314,0.455621,0.57037,0.506579,0.683991
strat3_random_forest,0.29,0.67426,0.466667,0.414815,0.439216,0.678448
strat1_XGBoost,0.12,0.687927,0.49,0.362963,0.417021,0.678095
strat3_logistic_regression,0.0,0.676538,0.47482,0.488889,0.481752,0.674732
strat1_AdaBoost,0.1,0.719818,0.566667,0.377778,0.453333,0.674488
strat1_random_forest,0.25,0.681093,0.47191,0.311111,0.375,0.672052
strat3_AdaBoost,0.11,0.681093,0.48227,0.503704,0.492754,0.66674
strat2_logistic_regression,0.01,0.633257,0.419753,0.503704,0.457912,0.656287
strat2_random_forest,0.33,0.671982,0.465116,0.444444,0.454545,0.655897


In [46]:
df_scores.to_csv("model_scores_step3.csv")