In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('bank.csv', delimiter=";",header='infer')
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [3]:
data.shape

(4521, 17)

In [4]:
data.corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.08382,-0.017853,-0.002367,-0.005148,-0.008894,-0.003511
balance,0.08382,1.0,-0.008677,-0.01595,-0.009976,0.009437,0.026196
day,-0.017853,-0.008677,1.0,-0.024629,0.160706,-0.094352,-0.059114
duration,-0.002367,-0.01595,-0.024629,1.0,-0.068382,0.01038,0.01808
campaign,-0.005148,-0.009976,0.160706,-0.068382,1.0,-0.093137,-0.067833
pdays,-0.008894,0.009437,-0.094352,0.01038,-0.093137,1.0,0.577562
previous,-0.003511,0.026196,-0.059114,0.01808,-0.067833,0.577562,1.0


In [5]:
data['y'].replace(('no','yes'), (0,1), inplace=True)

In [6]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,0


In [7]:
data.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [8]:
data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579,0.11524
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562,0.319347
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0,1.0


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   int64 
dtypes: int64(8), object(9)
memory usage: 600.6+ KB


In [10]:
data['job'].value_counts()

management       969
blue-collar      946
technician       768
admin.           478
services         417
retired          230
self-employed    183
entrepreneur     168
unemployed       128
housemaid        112
student           84
unknown           38
Name: job, dtype: int64

In [11]:
data['marital'].value_counts()

married     2797
single      1196
divorced     528
Name: marital, dtype: int64

In [12]:
data['default'].value_counts()

no     4445
yes      76
Name: default, dtype: int64

In [13]:
data['housing'].value_counts()

yes    2559
no     1962
Name: housing, dtype: int64

In [14]:
data['loan'].value_counts()

no     3830
yes     691
Name: loan, dtype: int64

In [15]:
data['contact'].value_counts()

cellular     2896
unknown      1324
telephone     301
Name: contact, dtype: int64

In [16]:
data['poutcome'].value_counts()

unknown    3705
failure     490
other       197
success     129
Name: poutcome, dtype: int64

In [17]:
# dummies = ['job','marital','education','contact','month','poutcome']

In [18]:
job = pd.get_dummies(data['job'], drop_first = True)

In [19]:
data = pd.concat([data, job], axis = 1)

In [20]:
marital = pd.get_dummies(data['marital'], drop_first = True)

In [21]:
marital

Unnamed: 0,married,single
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
4516,1,0
4517,1,0
4518,1,0
4519,1,0


In [22]:
data = pd.concat([data, marital], axis = 1)

In [23]:
education = pd.get_dummies(data['education'], drop_first = True)

In [24]:
data = pd.concat([data, education], axis = 1)

In [25]:
contact = pd.get_dummies(data['contact'], drop_first = True)

In [26]:
data = pd.concat([data, contact], axis = 1)

In [27]:
month = pd.get_dummies(data['month'], drop_first = True)

In [28]:
data = pd.concat([data, month], axis = 1)

In [29]:
poutcome = pd.get_dummies(data['poutcome'], drop_first = True)

In [30]:
data = pd.concat([data, poutcome], axis = 1)

In [31]:
data.drop(['job','marital','education','contact','month','poutcome'], axis=1, inplace=True)

In [32]:
data.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,jul,jun,mar,may,nov,oct,sep,other,success,unknown
0,30,no,1787,no,no,19,79,1,-1,0,...,0,0,0,0,0,1,0,0,0,1
1,33,no,4789,yes,yes,11,220,1,339,4,...,0,0,0,1,0,0,0,0,0,0
2,35,no,1350,yes,no,16,185,1,330,1,...,0,0,0,0,0,0,0,0,0,0
3,30,no,1476,yes,yes,3,199,4,-1,0,...,0,1,0,0,0,0,0,0,0,1
4,59,no,0,yes,no,5,226,1,-1,0,...,0,0,0,1,0,0,0,0,0,1


In [33]:
data.columns

Index(['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration',
       'campaign', 'pdays', 'previous', 'y', 'blue-collar', 'entrepreneur',
       'housemaid', 'management', 'retired', 'self-employed', 'services',
       'student', 'technician', 'unemployed', 'unknown', 'married', 'single',
       'secondary', 'tertiary', 'unknown', 'telephone', 'unknown', 'aug',
       'dec', 'feb', 'jan', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep',
       'other', 'success', 'unknown'],
      dtype='object')

In [34]:
X = data.iloc[:,:-1]
y = data.y

In [35]:
enc = LabelEncoder()
X.loan = enc.fit_transform(X.loan)
X.default = enc.fit_transform(X.default)
X.housing = enc.fit_transform(X.housing)

In [36]:
X.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,jan,jul,jun,mar,may,nov,oct,sep,other,success
0,30,0,1787,0,0,19,79,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
1,33,0,4789,1,1,11,220,1,339,4,...,0,0,0,0,1,0,0,0,0,0
2,35,0,1350,1,0,16,185,1,330,1,...,0,0,0,0,0,0,0,0,0,0
3,30,0,1476,1,1,3,199,4,-1,0,...,0,0,1,0,0,0,0,0,0,0
4,59,0,0,1,0,5,226,1,-1,0,...,0,0,0,0,1,0,0,0,0,0


In [37]:
data.shape

(4521, 43)

In [38]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=10)

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

In [40]:
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('Linear Discriminant Analysis', LinearDiscriminantAnalysis()))
models.append(('K Nearest Neighbours', KNeighborsClassifier()))
models.append(('Decision Tree Classifier', DecisionTreeClassifier()))
models.append(('Naive Bayes', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

Logistic Regression: 0.957322 (0.036078)
Linear Discriminant Analysis: 0.901573 (0.017386)
K Nearest Neighbours: 0.873036 (0.015147)
Decision Tree Classifier: 1.000000 (0.000000)
Naive Bayes: 0.999779 (0.000664)
SVM: 0.884762 (0.014195)


In [41]:
model_gaussiannb = GaussianNB()
model_gaussiannb.fit(X_train, y_train)
y_predict = model_gaussiannb.predict(X_test)
accuracy_score(y_test, y_predict)

1.0

In [42]:
pd.crosstab(y_test, y_predict)

col_0,0,1
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1001,0
1,0,130


In [43]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
accuracy_score(y_test, y_predict)

1.0

In [44]:
pd.crosstab(y_test, y_predict)

col_0,0,1
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1001,0
1,0,130


In [45]:
model = KNeighborsClassifier(n_neighbors=25, metric='euclidean')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
accuracy_score(y_test, y_predict)

0.883289124668435

In [46]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
accuracy_score(y_test, y_predict)

0.9849690539345711