In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from collections import Counter
import pandas as pd
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.cross_validation import PredefinedSplit
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn import model_selection
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics 
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import operator
import graphviz  
import pydotplus
from IPython.display import Image  
import matplotlib.pyplot as plt




In [2]:
#import data
raw_data = pd.read_csv("bank-additional-full.csv",delimiter = ";")
raw_data = raw_data.sample(frac=1).reset_index(drop=True)

In [3]:
#******************data preprocessing*********************
#mutual information
raw_array_n = raw_data.as_matrix()
x_n = raw_array_n[:,0:-1]
y_n = raw_array_n[:,-1]
print(Counter(raw_data['y']))
scores = []
for i in range(x_n.shape[1]):
    scores.append((metrics.mutual_info_score(x_n[:,i], y_n), raw_data.columns[i]))
           
for score in sorted(scores,reverse=True):
    print(score)
    
#check missing values ('unknown' )
raw_data.loc[:,['job', 'y']].groupby(['job','y']).size() #also check marital, education

Counter({'no': 36548, 'yes': 4640})
(0.09744495076204493, 'duration')
(0.07758409907879295, 'euribor3m')
(0.0681771292828563, 'cons.price.idx')
(0.06817712928285628, 'cons.conf.idx')
(0.06222040677351051, 'nr.employed')
(0.054568132557260715, 'emp.var.rate')
(0.03154246805520029, 'pdays')
(0.030383612679616152, 'poutcome')
(0.026406581004977794, 'month')
(0.019268877762732103, 'previous')
(0.014120642590666126, 'age')
(0.011645721145540237, 'contact')
(0.009858724326144045, 'job')
(0.00577430676511924, 'default')
(0.003334438122576158, 'campaign')
(0.0023896903228810638, 'education')
(0.0014338512748518547, 'marital')
(0.00032199848153421917, 'day_of_week')
(6.90830137016589e-05, 'housing')
(1.3380897777300754e-05, 'loan')


job            y  
admin.         no     9070
               yes    1352
blue-collar    no     8616
               yes     638
entrepreneur   no     1332
               yes     124
housemaid      no      954
               yes     106
management     no     2596
               yes     328
retired        no     1286
               yes     434
self-employed  no     1272
               yes     149
services       no     3646
               yes     323
student        no      600
               yes     275
technician     no     6013
               yes     730
unemployed     no      870
               yes     144
unknown        no      293
               yes      37
dtype: int64

In [4]:
# deal with missing values and remove features
raw_data = raw_data[raw_data['job'] != 'unknown']
raw_data = raw_data[raw_data['marital']!= 'unknown']
del raw_data['default']
del raw_data['duration']
del raw_data['loan']
del raw_data['housing']


In [6]:
#one hot encoding

#transform to labelencoder
for column in raw_data.columns:
    if raw_data[column].dtype == type(object):
        le = LabelEncoder()
        raw_data[column] = le.fit_transform(raw_data[column])
        
#transform to onehotencode
for column in [ 'poutcome', 'contact','education','marital','job','month', 'day_of_week']:
    ohe = pd.get_dummies(raw_data[column], prefix = column)
    raw_data = raw_data.drop(column, 1).join(ohe)

In [7]:
#new dataset
raw_data = raw_data.drop('y',1).join(raw_data['y'])
raw_array = raw_data.as_matrix()
print("new dataset shape:", raw_array.shape)

new dataset shape: (40787, 52)


In [8]:
#unbalanced data
Counter(raw_data['y'])

Counter({0: 36193, 1: 4594})

In [9]:
#************** split dataset******************
raw_array = raw_data.as_matrix()
raw_x = raw_array[:,0:-1]
raw_y = raw_array[:,-1]
x_train_val, x_test, y_train_val, y_test = train_test_split(raw_x, raw_y, test_size=0.1, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(x_train_val,y_train_val, test_size = 2.0/7.0, random_state = 0)
x_train, y_train = SMOTE().fit_sample(x_train, y_train) #resample training data ADASYN SMOTE

In [13]:
#random baseline
y_random = np.random.randint(2, size = y_test.shape[0])
print("F1 score of the random classifier:")
print(metrics.f1_score(y_test, y_random,labels = 1))
print("Recall of the random classifier:")
print(metrics.recall_score(y_test, y_random, labels = 1))

F1 score of the random classifier:
0.1756919374247894
Recall of the random classifier:
0.4910313901345291


In [15]:
#single decision tree
dt = tree.DecisionTreeClassifier()
dt = dt.fit(x_train, y_train)
y_pred_dt_train = dt.predict(x_train)
y_pred_dt_val = dt.predict(x_val)
print("The f1 score of default decision tree on training set:")
print(metrics.f1_score(y_train, y_pred_dt_train, labels =1))
print("The recall of default decision tree on training set:")
print(metrics.recall_score(y_train, y_pred_dt_train, labels=1))
print("The f1 score of default decision tree on validation set:")
print(metrics.f1_score(y_val, y_pred_dt_val, labels = 1))
print("The recall of default decision tree on validation set:")
print(metrics.recall_score(y_val, y_pred_dt_val, labels=1) )

The f1 score of default decision tree on training set:
0.9940456858287323
The recall of default decision tree on training set:
0.9889707466287536
The f1 score of default decision tree on validation set:
0.31979695431472077
The recall of default decision tree on validation set:
0.33187006145741876


In [16]:
#single logistic regression
lgr = LogisticRegression()
lgr = lgr.fit(x_train, y_train)
y_pred_lgr_train = lgr.predict(x_train)
y_pred_lgr_val = lgr.predict(x_val)
print("The f1 score of simple LRG on training set:")
print(metrics.f1_score(y_train, y_pred_lgr_train, labels = 1))
print("The recall of simple LRG on training set:")
print(metrics.recall_score(y_train, y_pred_lgr_train, labels=1))
print("The f1 score of simple LRG on validation set:")
print(metrics.f1_score(y_val, y_pred_lgr_val, labels = 1))
print("The recall of simple LRG on validation set:")
print(metrics.recall_score(y_val, y_pred_lgr_val, labels=1) )


The f1 score of simple LRG on training set:
0.7126271003100887
The recall of simple LRG on training set:
0.638619620007755
The f1 score of simple LRG on validation set:
0.42671501964339675
The recall of simple LRG on validation set:
0.6198419666374012


In [17]:
# svm
svm = SVC()
svm = svm.fit(x_train, y_train)
y_pred_svm_train = svm.predict(x_train)
y_pred_svm_val = svm.predict(x_val)
print("The f1 score of simple SVM on training set:")
print(metrics.f1_score(y_train, y_pred_svm_train,labels = 1))
print("The recall of simple SVM on training set:")
print(metrics.recall_score(y_train, y_pred_svm_train, labels=1))
print("The f1 score of simple SVM on validation set:")
print(metrics.f1_score(y_val, y_pred_svm_val,labels = 1))
print("The recall of simple SVM on validation set:")
print(metrics.recall_score(y_val, y_pred_svm_val, labels=1) )

The f1 score of simple SVM on training set:
0.7447292220068442
The recall of simple SVM on training set:
0.6703718064710698
The f1 score of simple SVM on validation set:
0.448742746615087
The recall of simple SVM on validation set:
0.611062335381914


In [18]:
# *****************Grid search*****************

from sklearn.metrics import make_scorer, f1_score, recall_score

kfold = model_selection.KFold(n_splits=5, random_state=7)

x_new = np.concatenate((x_train,x_val ),axis=0)
y_new = np.concatenate((y_train,y_val ),axis=0)
test_fold = np.zeros(x_new.shape[0])   
test_fold[:x_train.shape[0]] = -1 #set the index of training set 
ps = PredefinedSplit(test_fold=test_fold)


def grid(param_grid, ps, classifier, xdata = x_new, ydata = y_new):
    #define a function to do grid search on different models
    score = make_scorer(recall_score, pos_label=1) # define scorer, set positive lable = 1, or python will give out average recall
    print("Tuning hyper-parameters" )
    clf = GridSearchCV(classifier, param_grid, cv= ps,
                       scoring= score)
    clf.fit(xdata, ydata)
    print("Best parameters set found:")
    print(clf.best_params_)
    print("Best recall:")
    print(clf.best_score_)


In [19]:
# tune logistic regression parameters
param_grid = {'C': [ 0.1, 1, 10, 100],'tol': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]}
classifier = LogisticRegression()
print("On logistic regression model:")
grid(param_grid, ps, classifier)

On logistic regression model:
Tuning hyper-parameters
Best parameters set found:
{'C': 0.1, 'tol': 0.01}
Best recall:
0.7216856892010536


In [25]:
# f1 on validation data
tuned_lgr = LogisticRegression(C = 0.1, tol = 0.01)
tuned_lgr = tuned_lgr.fit(x_train, y_train)
y_tuned_lgr_val = tuned_lgr.predict(x_val)
print(metrics.f1_score(y_val, y_tuned_lgr_val, labels = 1))

0.36076366030283086


In [20]:
# tune decision tree parameters
param_grid = {'min_samples_split': range(10,200,20),'max_depth': range(1,40,2)} 
classifier = tree.DecisionTreeClassifier()
print("On decision tree model:")
grid(param_grid, ps, classifier)

On decision tree model:
Tuning hyper-parameters
Best parameters set found:
{'max_depth': 3, 'min_samples_split': 10}
Best recall:
0.568920105355575


In [24]:
# f1 score on validation data
tuned_dt = tree.DecisionTreeClassifier(max_depth= 3, min_samples_split= 10)
tuned_dt = tuned_dt.fit(x_train, y_train)
y_tuned_dt_val = tuned_dt.predict(x_val)
print(metrics.f1_score(y_val, y_tuned_dt_val, labels = 1))

0.4421699078812692


In [30]:
#plot the tree
dot_simpletree = tree.export_graphviz(tuned_dt,out_file=None,feature_names=raw_data.columns[0:-1],class_names=np.array(['0', '1']) ,filled=True, rounded=True,special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_simpletree)  
img = Image(graph.create_png())  
graph.write_png("tuned_decisiontree.png") 

True

In [53]:
#tune svm parameters
param_grid = [
  {'C': [1, 10, 100, 100], 'kernel': ['linear']},
  {'C': [1, 10, 100, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
 ]
classifier = SVC()  
print("On svm model:")
grid(param_grid, ps, classifier)

On svm model:
Tuning hyper-parameters
Best parameters set found:
{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
Best recall:
0.7067603160667252


In [61]:
#f1 score on validation data
svm_tuned = SVC(C= 1, gamma = 0.0001, kernel = 'rbf')
svm_tuned = svm_tuned.fit(x_train, y_train)
y_tuned_svm_val = svm_tuned.predict(x_val)
print("The f1 score of SVM on validation set:")
print(metrics.f1_score(y_val, y_tuned_svm_val,labels = 1))

The f1 score of SVM on validation set:
0.37251272559000465


In [57]:
#*****************Gradient Boost*****************

seed = 7
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01,max_depth=1, random_state=seed).fit(x_train, y_train)
print("F1 score of gradient decision stump on validation set:")
print(metrics.f1_score(y_val, clf.predict(x_val), labels=1) )
print("The recall of gradient decision stump on validation set:")
print(metrics.recall_score(y_val, clf.predict(x_val), labels=1) )

F1 score of gradient decision stump on validation set:
0.35681768913938866
The recall of gradient decision stump on validation set:
0.7225636523266022


In [34]:
#***************model on test data***************
# decision tree
clf_dt = tree.DecisionTreeClassifier(max_depth = 3, min_samples_split = 10)
clf_dt = clf_dt.fit(x_train,y_train)
print("Recall of final decision tree on test set:")
print(metrics.recall_score(y_test, clf_dt.predict(x_test), labels = 1))
print("F1 score of final  decision tree on test set:")
print(metrics.f1_score(y_test, clf_dt.predict(x_test), labels = 1))

Recall of final decision tree on test set:
0.5717488789237668
F1 score of final  decision tree on test set:
0.45333333333333337


In [35]:
# Logistic regression
clf_lg = LogisticRegression(C = 0.1, tol = 0.01)
clf_lg = clf_lg.fit(x_train, y_train)
print("Recall of final LGR on test set:")
print(metrics.recall_score(y_test, clf_lg.predict(x_test), labels = 1))
print("F1 score of final  LGR on test set:")
print(metrics.f1_score(y_test, clf_lg.predict(x_test), labels = 1))

Recall of final LGR on test set:
0.7017937219730942
F1 score of final  LGR on test set:
0.3641652123327516


In [59]:
#SVM
clf_svm = SVC(C= 1, gamma = 0.0001, kernel = 'rbf')
clf_svm = svm_tuned.fit(x_train, y_train)

print("The f1 score of simple SVM on training set:")
print(metrics.f1_score(y_test, clf_svm.predict(x_test),labels = 1))
print("The recall of simple SVM on training set:")
print(metrics.recall_score(y_test, clf_svm.predict(x_test), labels=1))

The f1 score of simple SVM on training set:
0.3741620962827544
The recall of simple SVM on training set:
0.6883408071748879


In [60]:
#Gradient boosting
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01,max_depth=1, random_state=seed).fit(x_train, y_train)
print("F1 score of gradient decision stump on validation set:")
print(metrics.f1_score(y_test, clf.predict(x_test), labels=1) )
print("The recall of gradient decision stump on validation set:")
print(metrics.recall_score(y_test, clf.predict(x_test), labels=1) )

F1 score of gradient decision stump on validation set:
0.3627960716348931
The recall of gradient decision stump on validation set:
0.7040358744394619


In [64]:
#rank coefficient
attribute_list = clf_lg.coef_.tolist()
coe = attribute_list[0]
coe=map(abs, coe)
names = raw_data.columns
coe, names = zip(*sorted(zip(coe, names), reverse = True))

In [65]:
for i, j in zip(coe, names):
    print(i,j)

0.200306343371889 euribor3m
0.19149877511898472 cons.price.idx
0.18031514271927962 emp.var.rate
0.11842276852759719 month_6
0.08177252073368553 contact_0
0.07965714657408533 contact_1
0.04534372446904961 education_6
0.04487848363124248 month_3
0.04483701869990658 campaign
0.03662543535996889 poutcome_0
0.03459845657649612 poutcome_1
0.032370555241437414 previous
0.029644037004199417 marital_2
0.02847054337201246 job_0
0.0273306470232855 job_1
0.02500676375531232 month_5
0.022736101101847473 day_of_week_1
0.020508215056443214 job_5
0.020260203262017093 cons.conf.idx
0.019994050834914782 education_2
0.018391779665150907 day_of_week_4
0.017873403259241947 job_8
0.016860618145055472 month_0
0.01672246954747583 month_4
0.01597850401410519 job_7
0.015617429031767884 marital_1
0.014965429514085834 month_8
0.014822782005787865 month_7
0.01382781064540861 day_of_week_0
0.01285896077448802 day_of_week_2
0.011911233812830473 marital_0
0.010427122957867376 education_0
0.008464977318560647 educatio

In [None]:
# *************** fail to improve by bagging**************
seed = 7
kfold = model_selection.KFold(n_splits=5, random_state=seed)

# #predict on bagging decision tree
# score = make_scorer(recall_score, pos_label=1)
# dt_bagging = BaggingClassifier(
#     base_estimator=tree.DecisionTreeClassifier(), 
#     n_estimators=30, random_state=seed)
# dt_bagging = dt_bagging.fit(x_train, y_train)
# kfold_result_dt = model_selection.cross_val_score(
#     dt_bagging, x_train_val, y_train_val, cv=ps, scoring=score)
# print("The f1 score of bagging tuned decision tree on cross validation set:")
# print("the average f1-score is: ", kfold_result_dt.mean())
# # print("The f1 score of bagging tuned decision tree on validation set is:")
# # print(metrics.f1_score(y_val, lgr.predict(x_val),average = 'micro'))


# #predict on bagging svm
# svm_bagging = BaggingClassifier(
#     base_estimator=BaggingClassifier(base_estimator=SVC(), 
#                                      n_estimators=30, random_state=seed))
# # svm_bagging = svm_bagging.fit(x_train,y_train)
# # print("The f1 score of bagging tuned svm on training set:")
# # print(metrics.f1_score(y_train, svm_bagging.predict(x_train), average = 'micro'))

# kfold_result_svm = model_selection.cross_val_score(
#     svm_bagging, x_train_val, y_train_val, cv=kfold, scoring="f1_micro")
# print("The f1 score of bagging tuned svm on cross validation set:")
# print("the average f1-score is: ", kfold_result_svm.mean())

# #predict on bagging logistic regression
# lgr_bagging = BaggingClassifier(
#     base_estimator=BaggingClassifier(base_estimator=LogisticRegression(), 
#                                      n_estimators=30, random_state=seed))
# # lgr_bagging = lgr_bagging.fit(x_train,y_train)
# # print("The f1 score of bagging tuned svm on training set:")
# # print(metrics.f1_score(y_train, lgr_bagging.predict(x_train), average = 'micro'))

# kfold_result_lgr = model_selection.cross_val_score(
#     lgr_bagging, x_train_val, y_train_val, cv=kfold, scoring="f1_micro")
# print("The f1 score of bagging tuned lrg on cross validation set:")
# print("the average f1-score is: ", kfold_result_lgr.mean())