### Predicting Bank's Term Deposit Subscription - XGBoost Classification Model

#### Author: Guansu(Frances) Niu

#### Data Resource: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

In [143]:
# Imports:

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score

In [144]:
# Read data:

raw_data = pd.read_csv("data/raw data.csv",sep=';')
preprocessed_data = pd.read_csv("data/preprocessed data.csv")

pp_data = preprocessed_data.drop(preprocessed_data.columns[0], axis=1)

X = pp_data.drop(['y'], inplace = False, axis = 1)
y = pp_data['y']

In [145]:
# Calculate baseline of F1 score:

# Method reference: 
# https://stats.stackexchange.com/questions/390200/what-is-the-baseline-of-the-f1-score-for-a-binary-classifier

pp_data['y'].value_counts()
count_0 = 36531
prob_0 = count_0/len(pp_data['y'])
prob_1 = (len(pp_data['y']) - count_0)/len(pp_data['y'])
baseline = (2*prob_1)/(prob_1+1)
print(baseline)

0.202432155099011


In [158]:
# Split data:

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=56,stratify=y)

In [162]:
# Tune parameters and get best F1 score:

max_depths = range(7,14) 
f1_xgb=[]

for max_depth in max_depths:
    xgb = XGBClassifier(max_depth=max_depth,learning_rate=0.1,random_state=458)
    xgb.fit(X_train, y_train)
    y_pred_xgb = xgb.predict(X_test)
    f1_xgb.append(fbeta_score(y_test, y_pred_xgb,1))
    
best_max_depth = max_depths[np.argmax(f1_xgb)]

print(f"xgboost: max_depth={best_max_depth},f1_score={max(f1_xgb)}")

xgboost: max_depth=11,f1_score=0.3927536231884058


In [163]:
# Cross validation using parameters tuned from the best f1 score:

xgb = XGBClassifier(max_depth=11, learning_rate=0.1,random_state=123)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=1)
CV = (cross_val_score(xgb, X_train, y_train, cv=kf, n_jobs=1, scoring = 'f1').mean())
print(CV)

0.37904190273634375


In [169]:
# Uncertainty due to splitting:

uncertainty_split_xgb = []

for i in range(1, 12, 2): 
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=i,stratify=y)

    # Run the model using the tuned parameters:
    xgb = XGBClassifier(max_depth=11,learning_rate=0.001,random_state=123)
    xgb.fit(X_train, y_train)
    y_pred_xgb = xgb.predict(X_test)

    # Cross validation
    CV = (cross_val_score(xgb, X_train, y_train, cv=kf, n_jobs=1, scoring = 'f1').mean())
    uncertainty_split_xgb.append(CV)
print(np.std(uncertainty_split_xgb))

0.007599212627488509


In [168]:
# Uncertainty due to non-deterministic ML methods:

uncertainty_ndeter_xgb = []

for i in range(123, 500, 80): 
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=58,stratify=y)

    # Run the model using the tuned parameters:
    xgb = XGBClassifier(max_depth=11,learning_rate=0.001,random_state=i)
    xgb.fit(X_train, y_train)
    y_pred_xgb = xgb.predict(X_test)

    # Cross validation
    CV = (cross_val_score(xgb, X_train, y_train, cv=kf, n_jobs=1, scoring = 'f1').mean())
    uncertainty_ndeter_xgb.append(CV)
print(np.std(uncertainty_ndeter_xgb))

0.0


In [170]:
# Confusion matrix:

confusion_matrix(y_test,y_pred_xgb)

array([[7167,  140],
       [ 690,  237]])

In [164]:
# Accuracy score:

accuracy_score(y_test, y_pred_xgb)

0.8982268642215205

In [165]:
# Baseline of accuracy score:

baseline_accuracy = np.array(pp_data["y"].value_counts()/pp_data.shape[0])[0]
print(baseline_accuracy)

0.8873855272426944


For xgboost classification:

The F1 score is 0.379 (after cross validation) with tuned parameters max_depth = 11.

The uncertainties due to splitting is 0.00760, and due to non-deterministic ML method is 0.0.

The accuracy score is 0.898.