### Predicting Bank's Term Deposit Subscription - Random Forest Classification Model

#### Author: Guansu(Frances) Niu

#### Data Resource: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

In [1]:
# Imports:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score

In [3]:
# # Read data:

pp_data = pd.read_csv("data/preprocessed data.csv")
label = 'y'
y = LabelEncoder().fit_transform(df[label])
df.drop(columns=[label],inplace=True)
X = df
ftr_names = X.columns

In [204]:
# Calculate baseline of F1 score:

# Method reference: 
# https://stats.stackexchange.com/questions/390200/what-is-the-baseline-of-the-f1-score-for-a-binary-classifier

pp_data['y'].value_counts()
count_0 = 36531
prob_0 = count_0/len(pp_data['y'])
prob_1 = (len(pp_data['y']) - count_0)/len(pp_data['y'])
baseline = (2*prob_1)/(prob_1+1)
print(baseline)

0.202432155099011


In [209]:
# Split data:

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state =random_state,stratify=y)

In [None]:
# Apply preprocessing:

cat_ftrs = ['job', 'marital', 'default', 'housing','loan', 'contact','poutcome']

ordinal_ftrs = ['education','month','day_of_week']

ordinal_cats = [['basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate', 'professional.course', 
                'university.degree','missing'],['mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep','oct',
                'nov','dec'],['mon', 'tue', 'wed', 'thu', 'fri']]

num_ftrs = ['age','campaign','previous','emp.var.rate','cons.price.idx','cons.conf.idx', 
                 'euribor3m','nr.employed']

# one-hot encoder
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value='missing')),
    ('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'))])

# ordinal encoder
ordinal_transformer = Pipeline(steps=[
    ('imputer2', SimpleImputer(strategy='constant',fill_value='NA')),
    ('ordinal', OrdinalEncoder(categories = ordinal_cats))])

# standard scaler
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# collect the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_ftrs),
        ('cat', categorical_transformer, cat_ftrs),
        ('ord', ordinal_transformer, ordinal_ftrs)])

# fit_transform the training set
X_prep = preprocessor.fit_transform(X_train)

# Collect feature names

feature_names = preprocessor.transformers_[0][-1] + \
                list(preprocessor.named_transformers_['cat'][1].get_feature_names(cat_ftrs)) + \
                preprocessor.transformers_[2][-1]

X_train = pd.DataFrame(data=X_prep,columns=feature_names)

# transform the test
df_test = preprocessor.transform(X_test)
X_test = pd.DataFrame(data=df_test,columns = feature_names)

In [210]:
# Tune parameters and get best F1 score:

max_features = range(10,20) 
max_depths = range(5,10) 
f1_rfc={"max_feature":[],"max_depth":[],"f1_score":[]}

for max_feature in max_features:
    for max_depth in max_depths:
        rfc = RandomForestClassifier(n_estimators=100,
                                     max_depth=max_depth,max_features=max_feature,
                                     random_state=random_state)
        rfc.fit(X_train, y_train)
        y_pred_rfc = rfc.predict(X_test)
        f1_rfc['f1_score'].append(fbeta_score(y_test, y_pred_rfc,1))
        f1_rfc['max_depth'].append(max_depth)
        f1_rfc['max_feature'].append(max_feature)
max_f1_rfc = max(f1_rfc['f1_score'])
best_max_feature = f1_rfc['max_feature'][np.argmax(f1_rfc['f1_score'])]
best_max_depth = f1_rfc['max_depth'][np.argmax(f1_rfc['f1_score'])]
print(f"random forest: max_feature={best_max_feature},max_depth={best_max_depth},\
f1_score={max_f1_rfc}") 

random forest: max_feature=15,max_depth=9,f1_score=0.3745261561789234


In [212]:
# Cross validation using parameters tuned from the best f1 score:

rfc = RandomForestClassifier(n_estimators=100,max_depth=9,max_features=15,random_state=random_state)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=1)
CV = (cross_val_score(rfc, X_train, y_train, cv=kf, n_jobs=1, scoring = 'f1').mean())
print(CV)

0.3666976070940632


In [216]:
# Uncertainty due to splitting:

uncertainty_split_rfc = []

for i in range(1, 12, 2): 
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=i,stratify=y)

    # Run the model using the tuned parameters:
    rfc = RandomForestClassifier(n_estimators=100,max_depth=8,max_features=16,random_state=123)
    rfc.fit(X_train, y_train)
    y_pred_rfc = rfc.predict(X_test)

    # Cross validation
    CV = (cross_val_score(rfc, X_train, y_train, cv=kf, n_jobs=1, scoring = 'f1').mean())
    uncertainty_split_rfc.append(CV)
print(np.std(uncertainty_split_rfc))

0.00499765587448889


In [217]:
# Uncertainty due to non-deterministic ML methods:

uncertainty_ndeter_rfc = []

for i in range(123, 500, 80): 
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1,stratify=y)

    # Run the model using the tuned parameters:
    rfc = RandomForestClassifier(n_estimators=100,max_depth=9,max_features=18,random_state=i)
    rfc.fit(X_train, y_train)
    y_pred_rfc = rfc.predict(X_test)

    # Cross validation
    CV = (cross_val_score(rfc, X_train, y_train, cv=kf, n_jobs=1, scoring = 'f1').mean())
    uncertainty_ndeter_rfc.append(CV)
print(np.std(uncertainty_ndeter_rfc))

0.0018626119520612472


In [213]:
# Confusion matrix:

confusion_matrix(y_test, y_pred_rfc)

array([[7162,  145],
       [ 680,  247]])

In [214]:
# Accuracy score:

accuracy_score(y_test, y_pred_rfc)

0.8998056837503036

In [215]:
# Baseline of accuracy score:

baseline_accuracy = np.array(pp_data["y"].value_counts()/pp_data.shape[0])[0]
print(baseline_accuracy)

0.8873855272426944


For random forest classification:

The F1 score is 0.367 (after cross validation) with tuned parameters max_feature = 16 and max_depth = 8.

The uncertainties due to splitting is 0.00450, and due to non-deterministic ML method is 0.00186.

The accuracy score is 0.90.