# Customer Churn - Savings Bank

In [1]:
## TODO: clarify why the loop in the timestamp_data-fct is only returning the last element of the frames-list...the loop should 
# insert both X_train as well as X_val into the empty list "transformed_data_list"
## train a NB-classifier
## refactor performance (redundant in its current version), incl. summary of model performance
## fit best model on complete X_train and save it

## fit DT model on complete X_train and save
## transform to .py-script
## decide: deployment on streamlit, dockerize and then on AWS

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import plot_roc_curve, confusion_matrix, roc_auc_score, f1_score

import pickle

import warnings
warnings.filterwarnings("ignore")

In [3]:
def include_timestamps(df):
    """
    Add calendar columns derived from the DataFrame's index.

    @params:
        - df: a DataFrame whose index exposes `.year` and `.month`
          (e.g. a DatetimeIndex).
    @return:
        - df: the same DataFrame object, now carrying "Year" and "Month" columns.
          The columns are written onto the input itself; no copy is made.
    """
    index = df.index
    # One new column per calendar attribute, written in the order Year, Month.
    for column, attribute in (("Year", "year"), ("Month", "month")):
        df[column] = getattr(index, attribute)
    return df

In [4]:
def performance_metrics(y_val, y_pred):
    """
    Print ROC-AUC, f1-score and confusion matrix for one set of predictions.

    @params:
    - y_val: true labels.
    - y_pred: predicted labels for the same rows.

    @return:
    - None. The two scores (rounded to 2 decimals) and the confusion matrix
      are printed.
    """
    # Each metric is computed exactly once from the same (y_val, y_pred) pair.
    auc = round(roc_auc_score(y_val, y_pred), 2)
    f1 = round(f1_score(y_val, y_pred), 2)
    cm = confusion_matrix(y_val, y_pred)
    print(f"Classifier has an ROC-AUC-score of {auc} and an f1-score of {f1}.\nIts confusion matrix is {cm}")

In [5]:
def model_fit(model, df, labels):
    """
    Fit a model on copies of the given features and labels.

    @params:
    - model: object exposing `fit(X, y)`.
    - df: transformed dataset (features); must expose `.copy()`.
    - labels: target values; must expose `.copy()`.
    @return:
    - Whatever `model.fit` returns (for scikit-learn estimators, the fitted model).
    """
    # Work on copies so the caller's objects are never handed to fit() directly.
    df = df.copy()
    # No trailing comma here: a trailing comma would wrap the labels in a 1-tuple.
    labels = labels.copy()
    return model.fit(df, labels)

In [6]:
def timestamp_data(frames_list, data=None):
    """
    Add "Year" and "Month" columns to every DataFrame in a list.

    @params:
    - frames_list: list of DataFrames (e.g. [X_train, X_val]); each one is
      passed to include_timestamps.
    - data: unused; kept (and optional) so calls that pass it keep working.
    @return:
    - A list with one timestamped DataFrame per entry of frames_list, in the
      same order. An empty frames_list yields an empty list.
    """
    # Build the full list first; returning from inside a loop would stop
    # after the first frame.
    return [include_timestamps(df) for df in frames_list]

### Reading in and inspecting data

In [7]:
# Load the post-EDA table: the first CSV column becomes the index and
# parse_dates=True asks pandas to parse that index as dates.
df = pd.read_csv(
    "../data/Tabla_01_English_Unique_postEDA.csv",
    index_col=0,
    parse_dates=True,
)

In [8]:
# Inspect the dtype of every column of the loaded table.
df.dtypes

Client_ID                                  int64
Resident_Capital                           int64
Client_Age_Years                           int64
Client_Sex                                 int64
Client_Married                             int64
Amount_Last_Disbursement                 float64
n(Months)_Since_Last_Disbursement          int64
n(Months)_Client_Relationship              int64
n(Months)_LO_Active_Employee               int64
Client_Status_Post3Months                  int64
Client_Status_Post6Months                  int64
LO_Active_Employee_Post3Months             int64
LO_Active_Employee_Post6Months             int64
n(Loans)_Outstanding_Maynas                int64
n(Loans)_Outstanding_Other                 int64
n(Additional_Loans)_Post3Months            int64
n(Additional_Loans)_Post6Months            int64
Total_Accumulated_Interest_per_Client      int64
LO_Active_Employee_Prior3Months            int64
LO_Active_Employee_Prior6Months            int64
n(Additional_Loans)_

### Define X, y and split data (using complete dataset, i.e. years 2018 - May 2021 for fitting and evaluating models).

In [9]:
# features selected based on calculation of feature importance in NB "all features".
# NOTE(review): several of these are *_Post3Months / *_Post6Months columns while the
# target is Client_Status_Post3Months; confirm they are known at prediction time
# (possible target leakage).
feature_columns = [
    "n(Months)_Since_Last_Disbursement",
    "n(Months)_Client_Relationship",
    "n(Months)_LO_Active_Employee",
    "n(Loans)_Outstanding_Maynas",
    "n(Loans)_Outstanding_Other",
    "n(Additional_Loans)_Post3Months",
    "n(Additional_Loans)_Post6Months",
    "LO_Active_Employee_Post3Months",
    "LO_Active_Employee_Prior6Months",
    "LO_Active_Employee_Post6Months",
    "Client_Age_Years",
    "Amount_Last_Disbursement",
    "Total_Accumulated_Interest_per_Client",
]
X = df[feature_columns]
y = df["Client_Status_Post3Months"]
X.shape, y.shape

((4304, 13), (4304,))

In [10]:
# 80/20 split, stratified on the label so both parts keep the class ratio;
# the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [11]:
# Feature names present in the training split.
X_train.columns

Index(['n(Months)_Since_Last_Disbursement', 'n(Months)_Client_Relationship',
       'n(Months)_LO_Active_Employee', 'n(Loans)_Outstanding_Maynas',
       'n(Loans)_Outstanding_Other', 'n(Additional_Loans)_Post3Months',
       'n(Additional_Loans)_Post6Months', 'LO_Active_Employee_Post3Months',
       'LO_Active_Employee_Prior6Months', 'LO_Active_Employee_Post6Months',
       'Client_Age_Years', 'Amount_Last_Disbursement',
       'Total_Accumulated_Interest_per_Client'],
      dtype='object')

In [12]:
# Row/column counts of the four parts produced by the split.
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((3443, 13), (3443,), (861, 13), (861,))

### Feature Engineering (only LogReg): timestamps, binning and scaling

In [13]:
# Both feature splits, in the order [train, validation], as input for timestamp_data.
frames = [X_train, X_val]

In [17]:
# Timestamp the frames and keep the first returned element (the X_train entry
# of `frames`); the columns are shown to confirm "Year" and "Month" were added.
X_train_timestamped = timestamp_data(frames_list = frames, data = frames)[0]
X_train_timestamped.columns
# X_val_timestamped = timestamp_data(frames_list = frames, data = frames)[1]



Index(['n(Months)_Since_Last_Disbursement', 'n(Months)_Client_Relationship',
       'n(Months)_LO_Active_Employee', 'n(Loans)_Outstanding_Maynas',
       'n(Loans)_Outstanding_Other', 'n(Additional_Loans)_Post3Months',
       'n(Additional_Loans)_Post6Months', 'LO_Active_Employee_Post3Months',
       'LO_Active_Employee_Prior6Months', 'LO_Active_Employee_Post6Months',
       'Client_Age_Years', 'Amount_Last_Disbursement',
       'Total_Accumulated_Interest_per_Client', 'Year', 'Month'],
      dtype='object')

In [None]:
# include_timestamps writes "Year"/"Month" onto the frame it is given and returns
# that same frame, so X_train_timestamps and X_train refer to one object.
X_train_timestamps = include_timestamps(X_train)

In [None]:
# Check that the timestamp columns are present.
X_train_timestamps.columns

In [None]:
# Columns routed to the KBinsDiscretizer step of the LogReg transformer.
binnables = [
    "n(Months)_Since_Last_Disbursement",
    "n(Months)_Client_Relationship",
    "n(Months)_LO_Active_Employee",
    "n(Loans)_Outstanding_Maynas",
    "n(Loans)_Outstanding_Other",
    "n(Additional_Loans)_Post3Months",
    "n(Additional_Loans)_Post6Months",
    "Year",
    "Month",
]

In [None]:
# Columns routed to the MinMaxScaler step of the LogReg transformer.
scalables = [
    "Client_Age_Years",
    "Amount_Last_Disbursement",
    "Total_Accumulated_Interest_per_Client",
]

In [None]:
# Class weights for LogisticRegression.
# rationale: taking inverse distribution of labels (see EDA on distribution of minority/majority groups).
weights = {
    0: 0.41,
    1: 0.59,
}

In [None]:
# Feature engineering for LogReg: bin the `binnables`, min-max scale the
# `scalables`, and pass every remaining column through unchanged.
transformer_LogReg = ColumnTransformer(
    transformers=[
        ("binner", KBinsDiscretizer(), binnables),
        ("scaler", MinMaxScaler(), scalables),
    ],
    remainder="passthrough",
)

In [None]:
# Chain the column transformer with a class-weighted logistic regression.
pipeline_LogReg = Pipeline(
    steps=[
        ("transformer", transformer_LogReg),
        ("classifier", LogisticRegression(class_weight=weights, random_state=42)),
    ]
)

In [None]:
# Fit the LogReg pipeline on the training split. X_train must already carry the
# "Year"/"Month" columns, because `binnables` names them.
clf_LogReg_fitted = pipeline_LogReg.fit(X_train, y_train)
clf_LogReg_fitted

In [None]:
# Timestamp the validation split the same way as the training split. The LogReg
# evaluation cells below read X_val_timestamps; binning and scaling happen inside
# the pipeline, so no further manual feature engineering is needed here.
X_val_timestamps = include_timestamps(X_val)
# Keep the name this cell originally set out to define.
X_val_fe = X_val_timestamps

## Evaluate LogReg

In [None]:
# ROC curve of the fitted LogReg pipeline on the validation frame; the display
# object is kept so later cells can overlay it on other models' curves.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2
# (RocCurveDisplay.from_estimator replaces it) - confirm the pinned version.
model_LR_disp = plot_roc_curve(clf_LogReg_fitted, X_val_timestamps, y_val)
plt.show()

In [None]:
# Class predictions of the fitted LogReg pipeline for the validation frame.
pred_LogReg = clf_LogReg_fitted.predict(X_val_timestamps)

In [None]:
# Confusion matrix of true vs. predicted validation labels for LogReg.
print(f"Confusion Matrix:{confusion_matrix(y_val, pred_LogReg)}")

In [None]:
# Print ROC-AUC, f1-score and confusion matrix for the LogReg predictions.
performance_metrics(y_val, pred_LogReg)

In [None]:
# Training accuracy of the LogReg pipeline. The fitted pipeline is bound to
# clf_LogReg_fitted; no object named model_LR is defined in this notebook.
train_accuracy = clf_LogReg_fitted.score(X_train, y_train)
train_accuracy

In [None]:
# Validation accuracy of the LogReg pipeline, scored on the same timestamped
# validation frame the other LogReg evaluation cells use (the pipeline needs
# the "Year"/"Month" columns). The fitted pipeline is clf_LogReg_fitted.
val_accuracy = clf_LogReg_fitted.score(X_val_timestamps, y_val)
val_accuracy

### Fit a Random Forest model

### Grid Search for hyperparameter tuning: Random Forest Classifier

In [None]:
# params from manual tuning
model_RF = RandomForestClassifier(
    n_estimators=63,
    max_depth=10,
    min_samples_split=3,
    random_state=42,
)

In [None]:
# NOTE(review): neither copy is assigned, so X_train and y_train stay bound to the
# original objects; the only visible effect is that the y_train copy is echoed as
# cell output. Assign the results if the intent was to protect the originals.
X_train.copy()
y_train.copy()

In [None]:
# Train the Random Forest on the training split.
model_RF.fit(X_train, y_train)

### Make predictions.

In [None]:
# Class predictions of the Random Forest for the validation split.
ypred_RF = model_RF.predict(X_val)


In [None]:
# Predicted class probabilities of the Random Forest for the validation split.
probs_RF = model_RF.predict_proba(X_val) 
probs_RF

### Evaluate Random Forest: ROC curve 

In [None]:
# Draw the Random Forest ROC curve on the current axes, then overlay the LogReg
# curve (model_LR_disp, created in the LogReg evaluation cell) for comparison.
ax = plt.gca()

model_RF_disp = plot_roc_curve(model_RF, X_val, y_val, ax=ax, alpha=0.8)
model_LR_disp.plot(ax=ax, alpha=0.8)

plt.show() 

### Evaluate Random Forest: confusion matrix and AUC

In [None]:
# Confusion matrix, AUC and f1-score of the Random Forest on the validation split.
# NOTE(review): roc_auc_score is given the class predictions (ypred_RF), not the
# probabilities in probs_RF, so this value can differ from the area under the ROC
# curve plotted above - confirm which one should be reported.
print(f"Confusion Matrix: \n{confusion_matrix(y_val, ypred_RF)}")
print(f"Area Under Curve: {roc_auc_score(y_val, ypred_RF).round(2)}")
print(f"f1 - score: {f1_score(y_val, ypred_RF).round(2)}")

In [None]:
# Keep the unrounded Random Forest scores for the summary tables at the end.
auc_RF = roc_auc_score(y_val, ypred_RF)
f1_RF = f1_score(y_val, ypred_RF)

Interpretation:
- re Confusion Matrix: out of 529 predictions regarding the positive class, 100% were predicted correctly. Model seems to be overfitting!
- Out of 332 predictions regarding the negative class, 12% were predicted incorrectly (false negatives).
- f1 - score:  94% (vs. initially 93%)
- re AUC: the likelihood that a randomly selected customer from the minority group is scored higher than a randomly selected customer from the majority group is 94% - as in the first version.

In [None]:
# Accuracy of the Random Forest on the data it was trained on; compared with the
# validation accuracy in the next cell to gauge overfitting.
train_accuracy = model_RF.score(X_train, y_train)
train_accuracy

In [None]:
# Accuracy of the Random Forest on the validation split.
val_accuracy = model_RF.score(X_val, y_val)
val_accuracy

=> reduction in overfitting!

### Fit a Decision Tree model

In [None]:
# param from manual tuning
model_DT = DecisionTreeClassifier(
    max_depth=6,
    random_state=42,
)

In [None]:
# NOTE(review): neither copy is assigned, so X_train and y_train stay bound to the
# original objects; the only visible effect is that the y_train copy is echoed as
# cell output. Assign the results if the intent was to protect the originals.
X_train.copy()
y_train.copy()

In [None]:
# Train the Decision Tree on the training split.
model_DT.fit(X_train, y_train)

### Make predictions.

In [None]:
# Class predictions of the Decision Tree for the validation split.
ypred_DT = model_DT.predict(X_val)

In [None]:
# Predicted class probabilities of the Decision Tree for the validation split.
probs_DT = model_DT.predict_proba(X_val)
probs_DT

### Evaluate Decision Tree: ROC curve 

In [None]:
# Draw the Decision Tree ROC curve on the current axes, then overlay the Random
# Forest and LogReg curves created in the earlier evaluation cells.
ax = plt.gca()

model_DT_disp = plot_roc_curve(model_DT, X_val, y_val, ax=ax, alpha=0.8)
model_RF_disp.plot(ax=ax, alpha=0.8)
model_LR_disp.plot(ax=ax, alpha=0.8)

plt.show()

### Evaluate Decision Tree: confusion matrix and AUC

In [None]:
# Confusion matrix, AUC and f1-score of the Decision Tree on the validation split.
# NOTE(review): roc_auc_score is given the class predictions (ypred_DT), not the
# probabilities in probs_DT, so this value can differ from the area under the ROC
# curve plotted above - confirm which one should be reported.
print(f"Confusion Matrix: \n{confusion_matrix(y_val, ypred_DT)}")
print(f"Area Under Curve: {roc_auc_score(y_val, ypred_DT).round(2)}")
print(f"f1 - score: {f1_score(y_val, ypred_DT).round(2)}")

In [None]:
# Keep the unrounded Decision Tree scores for the summary tables at the end.
auc_DT = roc_auc_score(y_val, ypred_DT)
f1_DT = f1_score(y_val, ypred_DT)

Interpretation:
- re Confusion Matrix: out of 332 predictions regarding the positive class, 90% were predicted correctly. As such, DT shows best precision of all models. Out of 529 predictions regarding the negative class, 6% were predicted incorrectly (false negatives). 
- f1 - score = 89.95% 
- re AUC: the likelihood that a randomly selected customer from the minority group is scored higher than a randomly selected customer from the majority group is 92% in the case of a Decision Tree classifier (vs. 94% of a Random Forest classifier).

In [None]:
# Accuracy of the Decision Tree on the data it was trained on; compared with the
# validation accuracy in the next cell to gauge overfitting.
train_accuracy = model_DT.score(X_train, y_train)
train_accuracy

In [None]:
# Accuracy of the Decision Tree on the validation split.
val_accuracy = model_DT.score(X_val, y_val)
val_accuracy

=> reduction in overfitting!

### Summary model evaluation: AUC

In [None]:
# auc_LR is not assigned in any earlier cell; derive it from the LogReg validation
# predictions the same way auc_DT and auc_RF are derived from theirs.
auc_LR = roc_auc_score(y_val, pred_LogReg)

# One row per model, best AUC first.
auc = pd.DataFrame(
    [["LogReg", auc_LR], ["Tree", auc_DT], ["RF", auc_RF]],
    columns=["Model", "auc"],
)
auc = auc.sort_values(by=["auc"], ascending=False)
auc.set_index(["Model"])

### Summary model evaluation: f1 - score

In [None]:
# f1_LR is not assigned in any earlier cell; derive it from the LogReg validation
# predictions the same way f1_DT and f1_RF are derived from theirs.
f1_LR = f1_score(y_val, pred_LogReg)

# One row per model, best f1-score first.
f1 = pd.DataFrame(
    [["LogReg", f1_LR], ["Tree", f1_DT], ["RF", f1_RF]],
    columns=["Model", "f1"],
)
f1 = f1.sort_values(by=["f1"], ascending=False)
f1.set_index(["Model"])

#### Interpretation
* Random Forest Model shows best performance (based on AUC and f1-score).
* Both RF as well as Tree perform better than LogReg.
* All models tend to overfit less than with the complete dataset.
* LogReg performs worse than the model trained on all features.

## Saving the Random Forest model.

In [None]:
# Persist the fitted Random Forest as a pickle in the artefacts folder.
with open("../artefacts/churn-model.bin", mode="wb") as model_file:
    pickle.dump(model_RF, model_file)