In [25]:
import pandas as pd

In [26]:
bank_data = pd.read_csv('bank-full.csv', sep=";")

In [27]:
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age          45211 non-null int64
job          45211 non-null object
marital      45211 non-null object
education    45211 non-null object
default      45211 non-null object
balance      45211 non-null int64
housing      45211 non-null object
loan         45211 non-null object
contact      45211 non-null object
day          45211 non-null int64
month        45211 non-null object
duration     45211 non-null int64
campaign     45211 non-null int64
pdays        45211 non-null int64
previous     45211 non-null int64
poutcome     45211 non-null object
y            45211 non-null object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


### Categorical data into numerical data

In [28]:
from sklearn.preprocessing import LabelEncoder

In [29]:
bank_data.job.unique()

array(['management', 'technician', 'entrepreneur', 'blue-collar',
       'unknown', 'retired', 'admin.', 'services', 'self-employed',
       'unemployed', 'housemaid', 'student'], dtype=object)

In [30]:
bank_data.marital.unique()

array(['married', 'single', 'divorced'], dtype=object)

In [31]:
bank_data.education.unique()

array(['tertiary', 'secondary', 'unknown', 'primary'], dtype=object)

In [32]:
bank_data.default.unique()

array(['no', 'yes'], dtype=object)

In [33]:
bank_data.housing.unique()

array(['yes', 'no'], dtype=object)

In [34]:
bank_data.loan.unique()

array(['no', 'yes'], dtype=object)

In [35]:
bank_data.contact.unique()

array(['unknown', 'cellular', 'telephone'], dtype=object)

In [36]:
cat_cols = ['job', 'marital', 'education', 'default', 'loan', 'housing', 'contact', 'poutcome', 'month', 'y']

In [37]:
for col in cat_cols:
    le = LabelEncoder()
    bank_data[col] = le.fit_transform(bank_data[col])

In [38]:
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,0


In [39]:
bank_data.poutcome.unique()

array([3, 0, 1, 2], dtype=int64)

In [40]:
### Find imp features

In [42]:
bank_data.corr()['y']

age          0.025155
job          0.040438
marital      0.045588
education    0.066241
default     -0.022419
balance      0.052838
housing     -0.139173
loan        -0.068185
contact     -0.148395
day         -0.028348
month       -0.024471
duration     0.394521
campaign    -0.073172
pdays        0.103621
previous     0.093236
poutcome    -0.077840
y            1.000000
Name: y, dtype: float64

In [43]:
import numpy as np

In [45]:
s = np.abs(bank_data.corr()['y'])

In [46]:
s.sort_values(ascending=False)

y            1.000000
duration     0.394521
contact      0.148395
housing      0.139173
pdays        0.103621
previous     0.093236
poutcome     0.077840
campaign     0.073172
loan         0.068185
education    0.066241
balance      0.052838
marital      0.045588
job          0.040438
day          0.028348
age          0.025155
month        0.024471
default      0.022419
Name: y, dtype: float64

In [47]:
imp_cols = ['duration','contact','previous','housing','pdays']

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.naive_bayes import GaussianNB as GB

In [49]:
Classification_names = ['RF','GB','knn','LR']
Classification_models = [RandomForestClassifier(n_estimators=100),
                         GB(),
                         knn(n_neighbors=7),
                         LogisticRegression()]

In [50]:
from sklearn.model_selection import train_test_split

In [58]:
def classification_model(model, data, feature, target):
    trainX, testX, trainY, testY = train_test_split(data[feature], data[target])
    model.fit(trainX, trainY)
    return model.score(testX,testY)

In [60]:
for idx,model in enumerate(Classification_models):
    acc = classification_model(model,bank_data, imp_cols, 'y')
    print (Classification_names[idx], acc)
    

RF 0.879058656994
GB 0.862425904627
knn 0.885251703088
LR 0.890206139963


In [64]:
all_cols = bank_data.columns.tolist()

In [65]:
all_cols.remove('y')

In [66]:
for idx,model in enumerate(Classification_models):
    acc = classification_model(model,bank_data, all_cols, 'y')
    print (Classification_names[idx], acc)

RF 0.90232681589
GB 0.851366893745
knn 0.884986286827
LR 0.891267805008
