In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', 30)

In [5]:
df = pd.read_csv("data/df_post_EDA.csv", index_col=0)
df.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,chapter_size,chapter_retention_rate,chapter_growth_rate,seat_popularity_rate,total_meetings,wont_renew
0,0,31,2016-04-01,33,0,0,1,0,21,7,33,32,1,37,129784,45,0,32,0.315789,1.421053,0.076923,34,0
1,2,18,2017-05-01,33,2,2,0,0,6,17,11,22,1,26,9285,1,0,34,0.71875,1.09375,0.266667,37,0
2,2,18,2018-05-01,35,1,0,0,2,6,24,19,36,1,20,7263,10,1,21,0.424242,0.636364,0.210526,38,0
3,2,18,2019-05-01,36,0,0,0,1,9,8,23,19,3,28,1860,31,2,22,0.73913,1.217391,0.263158,37,0
4,2,18,2020-05-01,33,1,0,0,3,10,13,19,47,4,30,6668,16,3,26,0.76,1.12,0.277778,37,0


### Drop & split

In [6]:
df.drop(["user_ID", "relative_renewal_date"], axis=1, inplace=True)

In [7]:
df["P_by_A"] = df["P"] / (df["A"] + 1)
df["P_by_M"] = df["P"] / (df["M"] + 1)
df["P_by_S"] = df["P"] / (df["S"] + 1)
df["P_by_total"] = df["P"] / df["total_meetings"]
df["A_by_total"] = df["A"] / df["total_meetings"]
df["M_by_total"] = df["M"] / df["total_meetings"]

df["P_tim_retention"] = df["P"] * df["chapter_retention_rate"]
df["P_tim_growth"] = df["P"] * df["chapter_growth_rate"]
df["P_tim_popularity"] = df["P"] * df["seat_popularity_rate"]
df["P_tim_V"] = df["P"] * df["V"]
df["P2_tim_V"] = df["P"]**2 * df["V"]
df["P_tim_TYFCB"] = df["P"] * df["TYFCB"]

df["year_tim_retention"] = df["year_of_membership"] * df["chapter_retention_rate"]

df["P2"] = df["P"]**2

columns_list = df.columns.tolist()
columns_list.remove("wont_renew")

columns_list+= ["wont_renew"]
df = df[columns_list]

### Final pick

In [8]:
logreg_10_features = pd.read_csv("data/model_scores/pick_10_features_logreg.csv", index_col=0)
logreg_10_features = logreg_10_features["feature_name"].tolist()

dt_10_features = pd.read_csv("data/model_scores/pick_10_features_dt.csv", index_col=0)
dt_10_features = dt_10_features["feature_name"].tolist()

rf_10_features = pd.read_csv("data/model_scores/pick_10_features_rf.csv", index_col=0)
rf_10_features = rf_10_features["feature_name"].tolist()

ada_10_features = pd.read_csv("data/model_scores/pick_10_features_ada.csv", index_col=0)
ada_10_features = ada_10_features["feature_name"].tolist()

xgb_10_features = pd.read_csv("data/model_scores/pick_10_features_xgb.csv", index_col=0)
xgb_10_features = xgb_10_features["feature_name"].tolist()

catboost_10_features = pd.read_csv("data/model_scores/pick_10_features_catboost.csv", index_col=0)
catboost_10_features = catboost_10_features["feature_name"].tolist()

# Model

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier, Pool
import xgboost as xgb

from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks, NeighbourhoodCleaningRule
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

In [10]:
df_scores = pd.DataFrame(columns=["training_time [s]", "accuracy", "precision", "recall", "f1", "auc"])

## 1. Logistic Regression

In [11]:
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [12]:
logreg = LogisticRegression(max_iter=5000)

X_resampled_logreg = X_resampled[logreg_10_features]
X_test_logreg = X_test[logreg_10_features]

logreg_grid = {}
logreg_grid["C"] = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]

logreg_cv = GridSearchCV(logreg, logreg_grid, scoring="roc_auc", n_jobs=-1, cv=3)

start = time.time()
logreg_cv.fit(X_resampled_logreg, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(logreg_cv.best_params_)

y_pred = logreg_cv.predict(X_test_logreg)
y_proba = logreg_cv.predict_proba(X_test_logreg)

df_scores.loc["logistic_regression"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]
df_scores

Elapsed time: 2.04 seconds
{'C': 0.001}


Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
logistic_regression,2.04,0.708428,1.0,0.051852,0.098592,0.702266


## 2. Naive Bayes

In [13]:
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

pca = PCA(n_components=10)
pca.fit(X)
X_resampled = pd.DataFrame(pca.transform(X_resampled))
X_test = pd.DataFrame(pca.transform(X_test))
# print(pca.explained_variance_ratio_)

In [14]:
naive_bayes = GaussianNB()

start = time.time()
naive_bayes.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")

y_pred = naive_bayes.predict(X_test)
y_proba = naive_bayes.predict_proba(X_test)

df_scores.loc["naive_bayes"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 0.00 seconds


## 3. K-neighbors classifier

In [15]:
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

pca = PCA(n_components=10)
pca.fit(X)
X_resampled = pd.DataFrame(pca.transform(X_resampled))
X_test = pd.DataFrame(pca.transform(X_test))
# print(pca.explained_variance_ratio_)

In [17]:
knn = KNeighborsClassifier()

knn_grid = {}
knn_grid["n_neighbors"] = [x for x in range(3,20,2)]
knn_grid["weights"] = ["uniform", "distance"]
knn_grid["leaf_size"] = [x for x in range(5,55,5)]

knn_cv = GridSearchCV(knn, knn_grid, scoring="roc_auc", n_jobs=-1, cv=3)

start = time.time()
knn_cv.fit(X_resampled, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")

y_pred = knn_cv.predict(X_test)
y_proba = knn_cv.predict_proba(X_test)

df_scores.loc["knn"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Elapsed time: 4.11 seconds


## 4. Decision Tree

In [18]:
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [20]:
dt = DecisionTreeClassifier()

X_resampled_dt = X_resampled[dt_10_features]
X_test_dt = X_test[dt_10_features]

dt_grid = {}
dt_grid["max_depth"] = [x for x in range(2, 51, 2)]
dt_grid["min_samples_split"] = [x for x in range(2, 11, 2)]
dt_grid["min_samples_leaf"] = [x for x in range(1, 8, 1)]

dt_cv = GridSearchCV(dt, dt_grid, scoring="roc_auc", n_jobs=-1, cv=3, verbose=2)

start = time.time()
dt_cv.fit(X_resampled_dt, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(dt_cv.best_params_)

y_pred = dt_cv.predict(X_test_dt)
y_proba = dt_cv.predict_proba(X_test_dt)

df_scores.loc["decision_tree"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Fitting 3 folds for each of 875 candidates, totalling 2625 fits
Elapsed time: 5.56 seconds
{'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}


## 5. Random Forest

In [21]:
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [23]:
rf = RandomForestClassifier()

X_resampled_rf = X_resampled[rf_10_features]
X_test_rf = X_test[rf_10_features]

rf_grid = {}
rf_grid["n_estimators"] = [50, 100, 200, 400]
rf_grid["max_depth"] = [x for x in range(2, 10, 2)]
rf_grid["min_samples_split"] = [x for x in range(2, 7, 2)]
rf_grid["min_samples_leaf"] = [x for x in range(1, 6, 1)]

rf_cv = GridSearchCV(rf, rf_grid, scoring="roc_auc", n_jobs=-1, cv=3, verbose=2)

start = time.time()
rf_cv.fit(X_resampled_rf, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(rf_cv.best_params_)

y_pred = rf_cv.predict(X_test_rf)
y_proba = rf_cv.predict_proba(X_test_rf)

df_scores.loc["random_forest"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Fitting 3 folds for each of 240 candidates, totalling 720 fits
Elapsed time: 58.89 seconds
{'max_depth': 4, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


## 6. AdaBoost

In [24]:
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [25]:
ada = AdaBoostClassifier()

X_resampled_ada = X_resampled[ada_10_features]
X_test_ada = X_test[ada_10_features]

ada_grid = {}
ada_grid["n_estimators"] = [50, 100, 200, 400]
ada_grid["learning_rate"] = [0.03, 0.1, 0.3, 1]

ada_cv = GridSearchCV(ada, ada_grid, scoring="roc_auc", n_jobs=-1, cv=3, verbose=2)

start = time.time()
ada_cv.fit(X_resampled_ada, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(ada_cv.best_params_)

y_pred = ada_cv.predict(X_test_ada)
y_proba = ada_cv.predict_proba(X_test_ada)

df_scores.loc["AdaBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Elapsed time: 5.17 seconds
{'learning_rate': 0.1, 'n_estimators': 200}


## 6. XGBoost

In [26]:
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [27]:
xg = xgb.XGBClassifier(objective='binary:logistic', eval_metric="auc", random_state=42, use_label_encoder=False)

X_resampled_xgb = X_resampled[xgb_10_features]
X_test_xgb = X_test[xgb_10_features]

xg_grid = {}
xg_grid["n_estimators"] = [50, 100, 200, 400]
xg_grid["colsample_bytree"] = [0.1, 0.3, 0.5, 0.8]
xg_grid["learning_rate"] = [0.03, 0.1, 0.3, 1]
xg_grid["max_depth"] = [x for x in range(2, 11, 1)]
xg_grid["alpha"] = [0.1, 0.3, 1, 3, 10]

xg_cv = GridSearchCV(xg, xg_grid, scoring="roc_auc", n_jobs=-1, cv=3, verbose=2)

start = time.time()
xg_cv.fit(X_resampled_xgb, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(xg_cv.best_params_)

y_pred = xg_cv.predict(X_test_xgb)
y_proba = xg_cv.predict_proba(X_test_xgb)

df_scores.loc["XGBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

Fitting 3 folds for each of 2880 candidates, totalling 8640 fits
Elapsed time: 397.57 seconds
{'alpha': 3, 'colsample_bytree': 0.1, 'learning_rate': 0.03, 'max_depth': 2, 'n_estimators': 50}


## 7. CatBoost

In [28]:
X = df.drop(["wont_renew"], axis=1).copy()
y = df["wont_renew"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

cb = CatBoostClassifier(early_stopping_rounds=20, eval_metric="AUC", verbose=0, cat_features=["chapter_ID"])

X_train_cat = X_train[catboost_10_features]
X_test_cat = X_test[catboost_10_features]

cb_grid = {}
cb_grid["iterations"] = [50, 100, 200, 400]
cb_grid["learning_rate"] = [0.03, 0.1, 0.3, 1]
cb_grid['l2_leaf_reg'] = [1, 3, 5, 7, 9]
cb_grid["depth"] = [4, 6, 8, 10]

start = time.time()
grid_search_result = cb.grid_search(cb_grid, X=X_train_cat, y=y_train, plot=True)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(grid_search_result)

y_pred = cb.predict(X_test_cat)
y_proba = cb.predict_proba(X_test_cat)
df_scores.loc["CatBoost"] = [round(end - start, 2),
                                        accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred),
                                        f1_score(y_test, y_pred),
                                        roc_auc_score(y_test, y_proba[:,1])]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


bestTest = 0.6937004904
bestIteration = 46

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	loss: 0.6937005	best: 0.6937005 (0)	total: 1.05s	remaining: 5m 34s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6889098453
bestIteration = 15

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
1:	loss: 0.6889098	best: 0.6937005 (0)	total: 1.77s	remaining: 4m 41s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6961523953
bestIteration = 25

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
2:	loss: 0.6961524	best: 0.6961524 (2)	total: 2.58s	remaining: 4m 32s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.667125613
bestIteration = 2

Metric AUC is not calculated on train by default

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6858543946
bestIteration = 26

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
28:	loss: 0.6858544	best: 0.7042814 (5)	total: 17s	remaining: 2m 51s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6973217654
bestIteration = 46

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
29:	loss: 0.6973218	best: 0.7042814 (5)	total: 18.1s	remaining: 2m 54s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6797812146
bestIteration = 4

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
30:	loss: 0.6797812	best: 0.7042814 (5)	total: 18.4s	remaining: 2m 51s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6721614485
bestIteratio

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6870614862
bestIteration = 3

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
55:	loss: 0.6870615	best: 0.7094304 (46)	total: 32.7s	remaining: 2m 34s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.699019238
bestIteration = 6

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
56:	loss: 0.6990192	best: 0.7094304 (46)	total: 33.2s	remaining: 2m 33s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7010184836
bestIteration = 7

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
57:	loss: 0.7010185	best: 0.7094304 (46)	total: 33.7s	remaining: 2m 32s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7069219163
bestIterat

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.655771407
bestIteration = 3

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
83:	loss: 0.6557714	best: 0.7094304 (46)	total: 49.7s	remaining: 2m 19s

bestTest = 0.6867597133
bestIteration = 44

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
84:	loss: 0.6867597	best: 0.7094304 (46)	total: 50.7s	remaining: 2m 20s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6815918521
bestIteration = 8

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
85:	loss: 0.6815919	best: 0.7094304 (46)	total: 51.3s	remaining: 2m 19s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6804224821
bestIteration = 7

Metric AUC is not calculated on train by def

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6908713693
bestIteration = 63

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
112:	loss: 0.6908714	best: 0.7094304 (46)	total: 1m 18s	remaining: 2m 22s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6927951716
bestIteration = 32

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
113:	loss: 0.6927952	best: 0.7094304 (46)	total: 1m 19s	remaining: 2m 23s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6715956243
bestIteration = 11

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
114:	loss: 0.6715956	best: 0.7094304 (46)	total: 1m 19s	remaining: 2m 22s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6611090155


Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6608449642
bestIteration = 5

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
139:	loss: 0.6608450	best: 0.7094304 (46)	total: 1m 48s	remaining: 2m 18s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6909468125
bestIteration = 55

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
140:	loss: 0.6909468	best: 0.7094304 (46)	total: 1m 49s	remaining: 2m 19s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7032817805
bestIteration = 14

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
141:	loss: 0.7032818	best: 0.7094304 (46)	total: 1m 50s	remaining: 2m 18s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6840437571
b

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6493398717
bestIteration = 20

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
167:	loss: 0.6493399	best: 0.7094304 (46)	total: 2m 19s	remaining: 2m 5s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6936250472
bestIteration = 17

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
168:	loss: 0.6936250	best: 0.7094304 (46)	total: 2m 20s	remaining: 2m 5s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6867597133
bestIteration = 23

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
169:	loss: 0.6867597	best: 0.7094304 (46)	total: 2m 21s	remaining: 2m 5s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7032440588
bes

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.667125613
bestIteration = 2

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
195:	loss: 0.6671256	best: 0.7094304 (46)	total: 2m 55s	remaining: 1m 51s

bestTest = 0.6962655602
bestIteration = 93

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
196:	loss: 0.6962656	best: 0.7094304 (46)	total: 2m 58s	remaining: 1m 51s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7003772161
bestIteration = 23

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
197:	loss: 0.7003772	best: 0.7094304 (46)	total: 3m	remaining: 1m 51s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6969068276
bestIteration = 38

Metric AUC is not calculated on train by

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6704262542
bestIteration = 3

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
222:	loss: 0.6704263	best: 0.7190117 (201)	total: 3m 31s	remaining: 1m 32s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6704073934
bestIteration = 1

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
223:	loss: 0.6704074	best: 0.7190117 (201)	total: 3m 32s	remaining: 1m 31s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7064881177
bestIteration = 65

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
224:	loss: 0.7064881	best: 0.7190117 (201)	total: 3m 35s	remaining: 1m 30s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7012825349

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.71565447
bestIteration = 7

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
249:	loss: 0.7156545	best: 0.7345719 (242)	total: 4m 4s	remaining: 1m 8s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7139192757
bestIteration = 12

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
250:	loss: 0.7139193	best: 0.7345719 (242)	total: 4m 5s	remaining: 1m 7s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6847793286
bestIteration = 0

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
251:	loss: 0.6847793	best: 0.7345719 (242)	total: 4m 6s	remaining: 1m 6s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7117314221
bestIte

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7033949453
bestIteration = 7

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
277:	loss: 0.7033949	best: 0.7345719 (242)	total: 4m 44s	remaining: 43s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6944549227
bestIteration = 10

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
278:	loss: 0.6944549	best: 0.7345719 (242)	total: 4m 45s	remaining: 42s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6952847982
bestIteration = 38

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
279:	loss: 0.6952848	best: 0.7345719 (242)	total: 4m 48s	remaining: 41.3s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7041870992
bestI

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6927197284
bestIteration = 6

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
304:	loss: 0.6927197	best: 0.7345719 (242)	total: 5m 27s	remaining: 16.1s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7044888721
bestIteration = 11

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
305:	loss: 0.7044889	best: 0.7345719 (242)	total: 5m 29s	remaining: 15.1s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6570350811
bestIteration = 19

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
306:	loss: 0.6570351	best: 0.7345719 (242)	total: 5m 31s	remaining: 14s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6862693323
bes

## Compare results

In [29]:
df_scores.sort_values("auc", ascending=False)

Unnamed: 0,training_time [s],accuracy,precision,recall,f1,auc
XGBoost,397.57,0.719818,0.75,0.133333,0.226415,0.707286
logistic_regression,2.04,0.708428,1.0,0.051852,0.098592,0.702266
random_forest,58.89,0.728929,0.617647,0.311111,0.413793,0.690595
AdaBoost,5.17,0.70615,0.534091,0.348148,0.421525,0.680068
knn,4.11,0.701595,0.529412,0.266667,0.35468,0.660916
naive_bayes,0.0,0.43508,0.331343,0.822222,0.47234,0.656871
decision_tree,5.56,0.71754,0.617021,0.214815,0.318681,0.649366
CatBoost,360.07,0.703872,0.53012,0.325926,0.40367,0.635819


In [30]:
df_scores.to_csv("model_scores_step7.csv")