### Predicting Bank's Term Deposit Subscription - Gradient Boost Classification Model

#### Author: Guansu(Frances) Niu

#### Data Resource: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

In [47]:
# Imports:

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

In [55]:
# Read data:

raw_data = pd.read_csv("data/raw data.csv",sep=';')
preprocessed_data = pd.read_csv("data/preprocessed data.csv")

pp_data = preprocessed_data.drop(preprocessed_data.columns[0], axis=1)

X = pp_data.drop(['y'], inplace = False, axis = 1)
y = pp_data['y']

In [56]:
# Calculate baseline of F1 score:

# Method reference: 
# https://stats.stackexchange.com/questions/390200/what-is-the-baseline-of-the-f1-score-for-a-binary-classifier

pp_data['y'].value_counts()
count_0 = 36531
prob_0 = count_0/len(pp_data['y'])
prob_1 = (len(pp_data['y']) - count_0)/len(pp_data['y'])
baseline = (2*prob_1)/(prob_1+1)
print(baseline)

0.202432155099011


In [61]:
# Split data:

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state =467,stratify=y)

In [63]:
# Tune parameters and get best F1 score:

max_depths = range(10,14) 
f1_gbc=[]

for max_depth in max_depths:
    gbc = GradientBoostingClassifier(max_depth=max_depth,n_estimators=100,random_state=123)
    gbc.fit(X_train, y_train)
    y_pred_gbc = gbc.predict(X_test)
    f1_gbc.append(fbeta_score(y_test,y_pred_gbc,1))
    
best_max_depth = max_depths[np.argmax(f1_gbc)]

print(f"gradient boost: max_depth={best_max_depth},f1_score={max(f1_gbc)}")

gradient boost: max_depth=11,f1_score=0.38634812286689424


In [65]:
# Cross validation using parameters tuned from the best f1 score:

gbc = GradientBoostingClassifier(max_depth=11,n_estimators=100,random_state=123)
gbc.fit(X_train, y_train)
y_pred_gbc = gbc.predict(X_test)

kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=1)
CV = (cross_val_score(gbc, X_train, y_train, cv=kf, n_jobs=1, scoring = 'f1').mean())
print(CV)

0.39125514578358356


In [66]:
# Uncertainty due to splitting:

uncertainty_split_gbc = []

for i in range(1, 12, 2): 
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=i,stratify=y)

    # Run the model using the tuned parameters:
    gbc = GradientBoostingClassifier(max_depth=5,n_estimators=500,random_state=123)
    gbc.fit(X_train, y_train)
    y_pred_gbc = gbc.predict(X_test)

    # Cross validation
    CV = (cross_val_score(gbc, X_train, y_train, cv=kf, n_jobs=1, scoring = 'f1').mean())
    uncertainty_split_gbc.append(CV)
print(np.std(uncertainty_split_gbc))

0.006919718421499724


In [67]:
# Uncertainty due to non-deterministic ML methods:

uncertainty_ndeter_gbc = []

for i in range(123, 700, 80): 
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=58,stratify=y)

    # Run the model using the tuned parameters:
    gbc = GradientBoostingClassifier(max_depth=5,n_estimators=500,random_state=i)
    gbc.fit(X_train, y_train)
    y_pred_gbc = gbc.predict(X_test)

    # Cross validation
    CV = (cross_val_score(gbc, X_train, y_train, cv=kf, n_jobs=1, scoring = 'f1').mean())
    uncertainty_ndeter_gbc.append(CV)
print(np.std(uncertainty_ndeter_gbc))

0.0008558466612630979


In [68]:
# Confusion matrix:

confusion_matrix(y_test,y_pred_xgb)

array([[6884,  423],
       [ 881,   46]])

In [69]:
# Accuracy score:

accuracy_score(y_test, y_pred_xgb)

0.8416322564974495

In [46]:
# Baseline of accuracy score:

baseline_accuracy = np.array(pp_data["y"].value_counts()/pp_data.shape[0])[0]
print(baseline_accuracy)

0.8873855272426944


For gradient boost classification:

The F1 score is 0.391 (after cross validation) with tuned parameters max_depth = 11.

The uncertainties due to splitting is 0.00692, and due to non-deterministic ML method is 0.000856.

The accuracy score is 0.842.