In [12]:
import pandas as pd


In [2]:
# Read data into pd dataframe
data = pd.read_csv("Data/bank/bank-full.csv", delimiter=";")
print(data.shape)
print(data.head())

(45211, 17)
   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  


In [3]:
# Copy data to a new dataframe for cleaning
clean_data = data.copy()

In [4]:
# Columns of categorical data
categorical_cols = ['job', 'marital', 'education', 'default', 'housing',
       'loan', 'contact', 'poutcome']

# Columns of real-valued data
real_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [5]:
# Remove columns 'day', 'month', 'pdays'
# which may have low correlation to output class (to be verified)
clean_data = clean_data.drop(['day', 'month', 'pdays', 'duration'], axis=1)

In [6]:
# Remove records with "unknown" values in any column
for col in categorical_cols :
    clean_data = clean_data[clean_data[col]!="unknown"]

print(clean_data.shape)    
display(clean_data.head())

(7842, 13)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,campaign,previous,poutcome,y
24060,33,admin.,married,tertiary,no,882,no,no,telephone,1,3,failure,no
24062,42,admin.,single,secondary,no,-247,yes,yes,telephone,1,1,other,yes
24064,33,services,married,secondary,no,3444,yes,no,telephone,1,4,failure,yes
24072,36,management,married,tertiary,no,2415,yes,no,telephone,1,4,other,no
24077,36,management,married,tertiary,no,0,yes,no,telephone,1,3,failure,yes


In [7]:
from sklearn.preprocessing import LabelEncoder

# Convert categorical data to nominals
lb_make = LabelEncoder()
for col in categorical_cols :
    clean_data[col] = lb_make.fit_transform(clean_data[col])

print(clean_data.head())

       age  job  marital  education  default  balance  housing  loan  contact  \
24060   33    0        1          2        0      882        0     0        1   
24062   42    0        2          1        0     -247        1     1        1   
24064   33    7        1          1        0     3444        1     0        1   
24072   36    4        1          2        0     2415        1     0        1   
24077   36    4        1          2        0        0        1     0        1   

       campaign  previous  poutcome    y  
24060         1         3         0   no  
24062         1         1         1  yes  
24064         1         4         0  yes  
24072         1         4         1   no  
24077         1         3         0  yes  


In [8]:
# Label output class 'y' to {0,1}
y_classes = ['no', 'yes']
lb_make.fit(y_classes)
clean_data['y'] = lb_make.fit_transform(clean_data['y'])
print(clean_data.head())

       age  job  marital  education  default  balance  housing  loan  contact  \
24060   33    0        1          2        0      882        0     0        1   
24062   42    0        2          1        0     -247        1     1        1   
24064   33    7        1          1        0     3444        1     0        1   
24072   36    4        1          2        0     2415        1     0        1   
24077   36    4        1          2        0        0        1     0        1   

       campaign  previous  poutcome  y  
24060         1         3         0  0  
24062         1         1         1  1  
24064         1         4         0  1  
24072         1         4         1  0  
24077         1         3         0  1  


In [10]:
from sklearn.preprocessing import KBinsDiscretizer

# Binning on real-valued attributes
# For simplicity, binning all attributes with 10 bins
enc = KBinsDiscretizer(n_bins=10, encode='ordinal')

# Binning 'age'
col = 'age'
enc.fit(clean_data[col].values.reshape((-1,1)))
clean_data[col] = enc.transform(clean_data[col].values.reshape((-1,1)))

# Binning 'balance'
col = 'balance'
enc.fit(clean_data[col].values.reshape((-1,1)))
clean_data[col] = enc.transform(clean_data[col].values.reshape((-1,1)))

"""
# Binning 'duration'
col = 'duration'
enc.fit(clean_data[col].values.reshape((-1,1)))
clean_data[col] = enc.transform(clean_data[col].values.reshape((-1,1)))
"""

print(clean_data.head())

       age  job  marital  education  default  balance  housing  loan  contact  \
24060  3.0    0        1          2        0      5.0        0     0        1   
24062  6.0    0        2          1        0      0.0        1     1        1   
24064  3.0    7        1          1        0      8.0        1     0        1   
24072  4.0    4        1          2        0      8.0        1     0        1   
24077  4.0    4        1          2        0      1.0        1     0        1   

       campaign  previous  poutcome  y  
24060         1         3         0  0  
24062         1         1         1  1  
24064         1         4         0  1  
24072         1         4         1  0  
24077         1         3         0  1  


In [14]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
train_X, test_X, train_y, test_y =train_test_split(clean_data.iloc[:,:-1], clean_data['y'],
                                                   test_size=0.2, random_state=500)

In [15]:
from sklearn.metrics import confusion_matrix
# Evaluate prediction result with confusion matrix

def eval_cm(actual, pred) :
    tn, fp, fn, tp = confusion_matrix(actual, pred).ravel()
    print("tn: {}, fp: {}, fn: {}, tp: {}".format(tn, fp, fn, tp))
    return tn, fp, fn, tp

In [33]:
from sklearn.tree import DecisionTreeClassifier
# A simple Decision Tree Classifier

dt_clf = DecisionTreeClassifier()
dt_clf = dt_clf.fit(train_X, train_y)

In [34]:
# Prediction
dt_pred = dt_clf.predict(test_X)

# Evaluation with confusion matrix
eval_cm(test_y, dt_pred)

tn: 994, fp: 232, fn: 215, tp: 128


(994, 232, 215, 128)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# A simple Random Forest Classifier

rt_clf = RandomForestClassifier()
rt_clf = rt_clf.fit(train_X, train_y)

In [None]:
# Prediction
rt_pred = rt_clf.predict(test_X)
# Evaluation with confusion matrix
eval_cm(test_y, rt_pred)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Random Forest Classifier with GridSearchCV

param_grid = { 
    'n_estimators': [100,200,300,400,500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [8, 10, 12, 14, 16],
    'criterion' :['gini', 'entropy']
}

In [None]:
grid_rt_clf = GridSearchCV(estimator=RandomForestClassifier(), 
                           param_grid=param_grid, 
                           cv=3,
                           verbose=1, 
                           n_jobs=-1)
grid_rt_clf = grid_rt_clf.fit(train_X, train_y)

In [None]:
# Best parameters from GridSearchCV
print(grid_rt_clf.best_params_)

In [None]:
# Prediction
grid_rt_pred = grid_rt_clf.predict(test_X)

# Evaluation with confusion matrix
eval_cm(test_y, grid_rt_pred)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# A simple Gradient Boosting Classifier

gb_clf = GradientBoostingClassifier()
gb_clf = gb_clf.fit(train_X, train_y)

In [None]:
# Prediction
gb_pred = gb_clf.predict(test_X)

# Evaluation with confusion matrix
eval_cm(test_y, gb_pred)

In [47]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost Classifier

ab_clf = AdaBoostClassifier()
ab_clf.fit(train_X, train_y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [48]:
# Prediction
ab_pred = ab_clf.predict(test_X)

# Evaluation with confusion matrix
eval_cm(test_y, ab_pred)

tn: 1159, fp: 67, fn: 214, tp: 129


(1159, 67, 214, 129)

In [54]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
# Gradient Boosting Classifier with GridSearchCV

param_grid = {"loss": ['deviance'],
              "learning_rate": [1e-3, 5e-3, 1e-2],
              "n_estimators": [10, 50, 100],      
              "subsample":[0.5, 0.8, 1.0],
              "criterion": ["friedman_mse"],
              "min_samples_split": [.1, .2, .3],
              "min_samples_leaf":  [.1, .2, .3],
              "max_depth": [6,9,12],
              "max_features":["log2"],}

gb_clf = GradientBoostingClassifier()
grid_gb_clf = GridSearchCV(estimator=gb_clf, 
                           param_grid=param_grid,
                           cv=3,
                           verbose=1,
                           n_jobs=-1)
grid_gb_clf = grid_gb_clf.fit(train_X, train_y)

Fitting 3 folds for each of 729 candidates, totalling 2187 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 212 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 1112 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done 1970 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2187 out of 2187 | elapsed:  1.2min finished


In [55]:
grid_gb_clf.best_params_

{'criterion': 'friedman_mse',
 'learning_rate': 0.001,
 'loss': 'deviance',
 'max_depth': 6,
 'max_features': 'log2',
 'min_samples_leaf': 0.1,
 'min_samples_split': 0.1,
 'n_estimators': 10,
 'subsample': 0.5}

In [56]:
# Prediction
grid_gb_pred = grid_gb_clf.predict(test_X)

# Evaluation with confusion matrix
eval_cm(test_y, grid_gb_pred)

tn: 1226, fp: 0, fn: 343, tp: 0


(1226, 0, 343, 0)