# Modeling the Bank Marketing Data

In [23]:
import warnings
warnings.filterwarnings('ignore')

In [90]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv('../data/bank-full.csv', sep=';')

## Let's build pipelines for numeric and categorical features

In [3]:
num_feat = df.drop('y', axis=1).select_dtypes(include=np.number).columns
cat_feat = df.drop('y', axis=1).select_dtypes(include=['object']).columns
X = df.drop('y', axis=1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [64]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()) #('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot', OneHotEncoder())
])

In [65]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_feat),
        ('cat', categorical_transformer, cat_feat)
    ])

## Let's use a simple model to check if the pipeline is working fine

In [66]:
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier',  LogisticRegression(class_weight='balanced', random_state=0))
])

model = pipe.fit(X_train, y_train)
target_names = y_test.unique().astype(str)
y_pred = model.predict(X_test)

In [67]:
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          no       0.97      0.85      0.91      7961
         yes       0.42      0.81      0.55      1082

    accuracy                           0.84      9043
   macro avg       0.70      0.83      0.73      9043
weighted avg       0.90      0.84      0.86      9043



In [68]:
print(round(pd.DataFrame(confusion_matrix(y_test, y_pred)/len(y_test)*1e2)))

      0     1
0  75.0  13.0
1   2.0  10.0


## Let's analyze multiple models

In [69]:
from time import time
from sklearn.dummy import DummyClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [75]:
results = pd.DataFrame(columns=['Name', 'AUC', 'Gini', 'StdDev(%)', 'Time(s)'])

for model in [
    DummyClassifier,
    LinearDiscriminantAnalysis,
    LogisticRegression, 
    RidgeClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier, 
    GradientBoostingClassifier,
    MLPClassifier,
    XGBClassifier,
    LGBMClassifier
]:
    cls = model()
    start_time = time()
    kfold = StratifiedKFold(n_splits=5, random_state=1)
    scores = cross_val_score(cls, preprocessor.fit_transform(X_train), 
                             y_train, scoring='roc_auc', cv=kfold)
    time_mod = time() - start_time
    results = results.append({
        'Name' : model.__name__, 
        'AUC' : round(scores.mean(), 2), 
        'Gini' : round(2*scores.mean()-1, 2), 
        'StdDev(%)' : round(1e2*scores.std(), 2), 
        'Time': round(time_mod, 2)
    }, ignore_index=True)
    del cls
print('Done!')

results = results.sort_values('Gini', ascending=False)

Done!


In [92]:
results = pd.DataFrame(columns=['Name', 'AUC', 'Gini', 'StdDev(%)', 'Time(s)'])

for model in [
    DummyClassifier,
    LinearDiscriminantAnalysis,
    LogisticRegression, 
    RidgeClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier, 
    GradientBoostingClassifier,
    MLPClassifier,
    XGBClassifier,
    LGBMClassifier
]:
    pipe = make_pipeline(preprocessor, model())
    start_time = time()
    kfold = StratifiedKFold(n_splits=5, random_state=1)
    scores = cross_val_score(pipe, X_train, 
                             y_train, scoring='roc_auc', cv=kfold)
    time_mod = time() - start_time
    results = results.append({
        'Name' : model.__name__, 
        'AUC' : round(scores.mean(), 2), 
        'Gini' : round(2*scores.mean()-1, 2), 
        'StdDev(%)' : round(1e2*scores.std(), 2), 
        'Time(s)': round(time_mod, 2)
    }, ignore_index=True)
    del pipe
print('Done!')

results = results.sort_values('Gini', ascending=False)

Done!


In [93]:
results

Unnamed: 0,Name,AUC,Gini,StdDev(%),Time(s),Time
8,XGBClassifier,0.93,0.87,0.16,,41.86
9,LGBMClassifier,0.94,0.87,0.18,,8.9
5,RandomForestClassifier,0.93,0.86,0.33,,27.29
6,GradientBoostingClassifier,0.93,0.85,0.13,,52.19
7,MLPClassifier,0.92,0.83,0.4,,256.76
1,LinearDiscriminantAnalysis,0.91,0.81,0.38,,3.38
2,LogisticRegression,0.9,0.81,0.35,,5.75
3,RidgeClassifier,0.91,0.81,0.37,,2.42
4,DecisionTreeClassifier,0.7,0.41,0.84,,3.8
0,DummyClassifier,0.5,-0.01,0.75,,1.22


In [76]:
results

Unnamed: 0,Name,AUC,Gini,StdDev(%),Time
8,XGBClassifier,0.93,0.87,0.16,39.54
9,LGBMClassifier,0.94,0.87,0.18,6.66
5,RandomForestClassifier,0.93,0.86,0.34,24.01
6,GradientBoostingClassifier,0.93,0.85,0.13,47.04
7,MLPClassifier,0.92,0.84,0.35,248.3
1,LinearDiscriminantAnalysis,0.91,0.81,0.38,2.51
2,LogisticRegression,0.9,0.81,0.33,5.44
3,RidgeClassifier,0.91,0.81,0.37,1.55
4,DecisionTreeClassifier,0.7,0.41,0.94,3.12
0,DummyClassifier,0.5,0.0,0.65,0.51


## Let's use Lightgbm Model in a pipeline

In [78]:
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lgbm',  LGBMClassifier())
])

model = pipe.fit(X_train, y_train)
target_names = y_test.unique().astype(str)
y_pred = model.predict(X_test)

In [79]:
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          no       0.93      0.96      0.95      7961
         yes       0.66      0.50      0.57      1082

    accuracy                           0.91      9043
   macro avg       0.80      0.73      0.76      9043
weighted avg       0.90      0.91      0.90      9043



In [80]:
print(round(pd.DataFrame(confusion_matrix(y_test, y_pred)/len(y_test)*1e2)))

      0    1
0  85.0  3.0
1   6.0  6.0


## Default parameters of LGBMClassifier


### Core parameters
1. num_iterations (100) 
2. learning_rate (0.1) 
3. num_leaves (31)

### Learning Control Parameters
4. max_depth (-1)
5. min_data_in_leaf (20)
6. bagging_fraction (1.0)
7. feature_fraction (1.0)

In [88]:
param_grid = {
    'lgbm__num_iterations': [2, 100, 200],
    'lgbm__learning_rate': [0.01, 0.1, 1],
    'lgbm__num_leaves': [3, 31, 301],
    'lgbm__max_depth': [-1, 10, 100],
}

In [89]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))

Best cross-validation accuracy: 0.91
Test set score: 0.91
Best parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': -1, 'lgbm__num_iterations': 100, 'lgbm__num_leaves': 31}
