In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix,recall_score,accuracy_score
from sklearn import tree
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
%matplotlib inline

In [2]:
df=pd.read_csv("./bank-additional/bank-additional/bank-additional-full.csv",sep=";")

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null object
dtypes: float64(5), int64(5), object(11)
memory usa

y class imbalance 

In [7]:
y_count_df=pd.DataFrame(pd.concat([df["y"].value_counts(),df["y"].value_counts()/np.sum(df["y"].value_counts())*100],axis=1))
y_count_df.columns=["count","%"]
y_count_df

Unnamed: 0,count,%
no,36548,88.734583
yes,4640,11.265417


Data Preprocessing

In [8]:
X=df.drop("y",axis=1)
Y=df["y"]
#transform non-numerical labels to numerical labels
le = preprocessing.LabelEncoder()
for i in X.columns[X.dtypes=="object"]:
    X.loc[:,i]=le.fit_transform(X.loc[:,i])
Y=le.fit_transform(Y)

In [9]:
#train test split 
X_train, X_test, y_train, y_test=train_test_split(X,Y,test_size=0.25,random_state=42)

Training

In [10]:
#K fold with oversampling on training data
class UpsampleStratifiedKFold:
    def __init__(self, n_splits=3):
        self.n_splits = n_splits
    def split(self, X, y, groups=None):
        for rx, tx in StratifiedKFold(n_splits=self.n_splits).split(X,y):
            nix = np.where(y[rx]==0)[0]
            pix = np.where(y[rx]==1)[0]
            pixu = np.random.choice(pix, size=nix.shape[0], replace=True)
            ix = np.append(nix, pixu)
            rxm = rx[ix]
            yield rxm, tx
    def get_n_splits(self, X, y, groups=None):
        return self.n_splits

In [39]:
# oversampling + choose parameters by recall
clf = tree.DecisionTreeClassifier(random_state=0)
unsample_kf=UpsampleStratifiedKFold(n_splits=5)
gscv=GridSearchCV(estimator=clf, param_grid={"max_depth":[1,3,5,7,9]},cv=unsample_kf,scoring=["accuracy","f1","precision","recall"],refit=False)
gscv.fit(X_train, y_train)
recall_by_par=pd.DataFrame(gscv.cv_results_).loc[:,["params","mean_test_recall"]]
print(recall_by_par)
best_par=recall_by_par.loc[recall_by_par["mean_test_recall"]==np.max(recall_by_par["mean_test_recall"]),"params"]
print("Best parameters:",best_par.values)

GridSearchCV(cv=<__main__.UpsampleStratifiedKFold object at 0x000002462C4F7FD0>,
             error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=0,
                                              splitter='best'),
             iid='warn', n_jobs=None, param_grid={'max_depth': [1, 3, 5, 7, 9]},
             pre_disp

In [48]:
# Without oversampling + choose parameters by accuracy
clf = tree.DecisionTreeClassifier(random_state=0)
gscv_ori=GridSearchCV(estimator=clf, param_grid={"max_depth":[1,3,5,7,9]},cv=5)
gscv_ori.fit(X_train, y_train)
print("Best parameters:",gscv_ori.best_params_)
clf = tree.DecisionTreeClassifier(random_state=0,max_depth=5)
clf_fit=clf.fit(X_train,y_train)
y_pred=clf_fit.predict(X_test)
print("Original Testing Accuracy:", accuracy_score(y_test,y_pred))
print("Original Testing recall: ",recall_score(y_test,y_pred))
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))

from sklearn.dummy import DummyClassifier
dummy_clf =DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train,y_train)
y_pred_dummy=dummy_clf.predict(X_test)
print("Dummy Testing Accuracy:", accuracy_score(y_test,y_pred_dummy))
print("Dummy Testing recall: ",recall_score(y_test,y_pred_dummy))
print("Confusion matrix:\n",confusion_matrix(y_test,y_pred_dummy))

Best parameters: {'max_depth': 5}
Original Testing Accuracy: 0.9152180246673789
Original Testing recall:  0.5177797051170858
confusion matrix:
 [[8827  317]
 [ 556  597]]
Dummy Testing Accuracy: 0.8880256385354958
Dummy Testing recall:  0.0
Confusion matrix:
 [[9144    0]
 [1153    0]]


Testing result 

In [72]:
#over sample training data 
ind_0=np.where(y_train==0)[0]
ind_1=np.where(y_train==1)[0]
sample_cnt=len(ind_0)-len(ind_1)
sample_to_add=np.random.choice(ind_1,size=sample_cnt)
ind_ros=np.concatenate([ind_0,sample_to_add])
X_train_ros=X_train.iloc[ind_ros,]
y_train_ros=y_train[ind_ros]

#test 
clf_ros_new = tree.DecisionTreeClassifier(random_state=0,max_depth=5)
clf_ros_new.fit(X_train_ros,y_train_ros)
ros_new_y_pred=clf_ros_new.predict(X_test)
print("confusion matrix:\n",confusion_matrix(y_test,ros_new_y_pred))
print("Our new model's Testing recall: ",recall_score(y_test,ros_new_y_pred))
print("Our new model's Testing accuracy: ",accuracy_score(y_test,ros_new_y_pred))


confusion matrix:
 [[7674 1470]
 [  97 1056]]
Our new model's Testing recall:  0.9158716392020815
Our new model's Testing accuracy:  0.8478197533262115
