In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Set pandas' display limit to 30 columns for rendered DataFrames.
pd.set_option('display.max_columns', 30)

In [2]:
# Read the post-EDA dataset; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer="data/df_post_EDA.csv", index_col=0)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,chapter_size,chapter_retention_rate,chapter_growth_rate,seat_popularity_rate,total_meetings,wont_renew
0,0,31,2016-04-01,33,0,0,1,0,21,7,33,32,1,37,129784,45,0,32,0.315789,1.421053,0.076923,34,0
1,2,18,2017-05-01,33,2,2,0,0,6,17,11,22,1,26,9285,1,0,34,0.71875,1.09375,0.266667,37,0
2,2,18,2018-05-01,35,1,0,0,2,6,24,19,36,1,20,7263,10,1,21,0.424242,0.636364,0.210526,38,0
3,2,18,2019-05-01,36,0,0,0,1,9,8,23,19,3,28,1860,31,2,22,0.73913,1.217391,0.263158,37,0
4,2,18,2020-05-01,33,1,0,0,3,10,13,19,47,4,30,6668,16,3,26,0.76,1.12,0.277778,37,0


### Drop unused columns & engineer features

In [3]:
# Remove the user identifier and the renewal date from the working frame.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps the
# cell idempotent: re-running it after the columns are gone is a no-op rather
# than a KeyError.
df = df.drop(columns=["user_ID", "relative_renewal_date"], errors="ignore")

In [4]:
# Derived features, all computed from the original columns of `df`:
#   *_by_*   -> ratios (the A/M/S denominators are shifted by 1; the
#               *_by_total ratios divide by total_meetings as-is)
#   *_tim_*  -> products of two columns
#   P2       -> P squared
df = df.assign(
    # Ratios of P to the A / M / S counts.
    P_by_A=df["P"] / (df["A"] + 1),
    P_by_M=df["P"] / (df["M"] + 1),
    P_by_S=df["P"] / (df["S"] + 1),
    # Shares of the total number of meetings.
    P_by_total=df["P"] / df["total_meetings"],
    A_by_total=df["A"] / df["total_meetings"],
    M_by_total=df["M"] / df["total_meetings"],
    # Interactions of P with chapter-level rates and activity counts.
    P_tim_retention=df["P"] * df["chapter_retention_rate"],
    P_tim_growth=df["P"] * df["chapter_growth_rate"],
    P_tim_popularity=df["P"] * df["seat_popularity_rate"],
    P_tim_V=df["P"] * df["V"],
    P2_tim_V=df["P"] ** 2 * df["V"],
    P_tim_TYFCB=df["P"] * df["TYFCB"],
    # Membership year interacted with chapter retention.
    year_tim_retention=df["year_of_membership"] * df["chapter_retention_rate"],
    # Quadratic term.
    P2=df["P"] ** 2,
)

# Move the target column to the last position.
columns_list = list(df.columns)
columns_list.remove("wont_renew")
columns_list.append("wont_renew")
df = df[columns_list]

In [18]:
# Display the column index of the working frame.
df.columns

Index(['chapter_ID', 'P', 'A', 'L', 'M', 'S', 'RGI', 'RGO', 'RRI', 'RRO', 'V',
       '1-2-1', 'TYFCB', 'CEU', 'year_of_membership', 'chapter_size',
       'chapter_retention_rate', 'chapter_growth_rate', 'seat_popularity_rate',
       'total_meetings', 'P_by_A', 'P_by_M', 'P_by_S', 'P_by_total',
       'A_by_total', 'M_by_total', 'P_tim_retention', 'P_tim_growth',
       'P_tim_popularity', 'P_tim_V', 'P2_tim_V', 'P_tim_TYFCB',
       'year_tim_retention', 'P2', 'wont_renew'],
      dtype='object')

### Selected features

In [5]:
# Load the stored feature selection and keep only its "feature_name" column
# as a plain Python list.
xgb_10_features = (
    pd.read_csv("feature_selection/pick_10_features_xgb.csv", index_col=0)
    ["feature_name"]
    .tolist()
)

In [6]:
# Display the list of selected feature names loaded from the CSV.
xgb_10_features

['P2',
 '1-2-1',
 'P_tim_TYFCB',
 'RGO',
 'RGI',
 'P_tim_popularity',
 'P_by_A',
 'P_tim_retention',
 'year_of_membership',
 'chapter_size']

# Model

In [13]:
import xgboost as xgb

from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks, NeighbourhoodCleaningRule

from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import f1_score, precision_score, recall_score

In [9]:
# Predictors are every column except the target and chapter_ID.
X = df.drop(["wont_renew", "chapter_ID"], axis=1).copy()
y = df["wont_renew"].copy()

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=710)

# Under-sample the training portion only; the test set is left untouched.
# NOTE(review): TomekLinks runs on the unscaled features here, so columns with
# large magnitudes presumably dominate its neighbour search - confirm this
# ordering (resample, then scale) is intended.
under_sampler = TomekLinks()
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

# Fit the scaler on the resampled training data and apply it to both sets.
scaler = StandardScaler()
scaler.fit(X_resampled)
# scaler.transform returns a bare array; carry each frame's own index over when
# rebuilding the DataFrame so X_test stays label-aligned with y_test (and
# X_resampled with y_resampled) instead of silently getting a fresh RangeIndex.
X_resampled = pd.DataFrame(scaler.transform(X_resampled), columns=X.columns, index=X_resampled.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# Restrict both sets to the previously selected features.
X_resampled_xgb = X_resampled[xgb_10_features]
X_test_xgb = X_test[xgb_10_features]

In [10]:
# Base XGBoost classifier that the grid search will tune.
xg = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric="auc",
    random_state=42,
    use_label_encoder=False,
)

# Hyper-parameter grid: 2 * 4 * 4 * 9 * 4 = 1152 combinations.
# NOTE(review): "alpha" is forwarded to XGBoost as-is (presumably the L1
# regularisation term) - confirm.
xg_grid = {
    "n_estimators": [50, 100],
    "colsample_bytree": [0.1, 0.3, 0.5, 0.8],
    "learning_rate": [0.03, 0.1, 0.3, 1],
    "max_depth": list(range(2, 11)),
    "alpha": [0.3, 1, 3, 10],
}

# Exhaustive 3-fold search scored by ROC AUC, using all available cores.
xg_cv = GridSearchCV(
    estimator=xg,
    param_grid=xg_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=3,
    verbose=2,
)

# Time the search and report the winning configuration.
start = time.time()
xg_cv.fit(X_resampled_xgb, y_resampled)
end = time.time()
print(f"Elapsed time: {(end - start):.2f} seconds")
print(xg_cv.best_params_)

# Hold-out ROC AUC of the tuned model (positive-class probabilities).
y_proba = xg_cv.predict_proba(X_test_xgb)
print(roc_auc_score(y_test, y_proba[:,1]))

Fitting 3 folds for each of 1152 candidates, totalling 3456 fits
Elapsed time: 64.60 seconds
{'alpha': 3, 'colsample_bytree': 0.1, 'learning_rate': 0.03, 'max_depth': 2, 'n_estimators': 50}
0.707285575048733


In [12]:
# Fresh (unfitted) XGBoost classifier configured with the grid-search winner.
test_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric="auc", random_state=42, use_label_encoder=False, **xg_cv.best_params_)

# Wrap it in an isotonic probability calibrator with 3-fold internal CV.
# The estimator is passed positionally on purpose: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# while the first positional slot works on both old and new releases.
calibrated_clf = CalibratedClassifierCV(test_model, cv=3, method="isotonic")

calibrated_clf.fit(X_resampled_xgb, y_resampled)

# Hold-out ROC AUC of the calibrated model (positive-class probabilities).
y_proba = calibrated_clf.predict_proba(X_test_xgb)

print(roc_auc_score(y_test, y_proba[:,1]))

0.7110867446393763


### Dump the final model

In [10]:
import joblib
from sklearn.utils.validation import check_is_fitted 

In [19]:
# help(calibrated_clf)

In [20]:
# help(xgb.XGBClassifier)

In [16]:
# Guard before persisting: scikit-learn's check_is_fitted raises (with the
# given message) if the calibrated classifier has not been fitted.
check_is_fitted(calibrated_clf, msg="not fitted")

In [14]:
# Persist the fitted, calibrated classifier to disk.
# NOTE(review): the model was trained on StandardScaler-transformed inputs
# restricted to `xgb_10_features`; neither the scaler nor the feature list is
# saved here, so any consumer of this file needs them from elsewhere - confirm.
model_filename = "_xgboost_final_model.sav"
joblib.dump(calibrated_clf, model_filename)

# Round-trip check: reload the model from disk and verify that it reproduces
# the in-memory model's probabilities, instead of comparing AUC values by eye.
# (joblib.load unpickles the file - only load artifacts from trusted sources.)
loaded_model = joblib.load(model_filename)
y_proba = loaded_model.predict_proba(X_test_xgb)
np.testing.assert_allclose(y_proba, calibrated_clf.predict_proba(X_test_xgb))

# Hold-out ROC AUC of the reloaded model, shown as the cell output.
roc_auc_score(y_test, y_proba[:,1])

0.7110867446393763