In [1]:
import numpy as np
import pandas as pd

##  Machine Learning Process
- Clean/Prepare your data (EDA Process)
    - Missing Value Treatment
    - Outlier removal
    - Standardization
    - Dummy variable Conversion
- Split your data in training and testing
- Separate input & output columns (train_x, train_y, test_x, test_y)
- Build model using train data
- predict output values for test data
- Evalueate your model
    - Accuracy ,sensitivity, specificity
- Fine tune your model(s) for better performance
    - Hyper parameter tuning
    - cross validation

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.tree import DecisionTreeClassifier

In [4]:
bank = pd.read_csv('bank-full.csv', sep=';')
bank.shape

(45211, 17)

In [5]:
bank

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


## Get Dummies


In [6]:
# here we get dummy by dropping our target variable column y
bank_dummy = pd.get_dummies(bank.drop('y', axis=1))
bank_dummy['y'] = bank['y']
bank_dummy.shape

(45211, 52)

In [7]:
train, test = train_test_split(bank_dummy,
                               test_size = 0.3,
                               random_state=100)
print(train.shape)
print(test.shape)

(31647, 52)
(13564, 52)


In [8]:
# Segregating input and output
train_y = train['y']
test_y = test['y']

train_x = train.drop('y', axis=1)
test_x = test.drop('y', axis=1)

train_x.shape

(31647, 51)

In [9]:
print(train_y.shape)
print(test_y.shape)
print(train_x.shape)
print(test_x.shape)

(31647,)
(13564,)
(31647, 51)
(13564, 51)


In [10]:
from sklearn.tree import DecisionTreeClassifier

In [11]:
# Creating/Fitting a model
model = DecisionTreeClassifier()
model.fit(train_x, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [12]:
# predicting on test data
test_pred = model.predict(test_x)
print(len(test_pred))
test_pred

13564


array(['no', 'no', 'no', ..., 'no', 'no', 'no'], dtype=object)

In [13]:
df_pred = pd.DataFrame({'actual': test_y,
                         'predicted': test_pred})
df_pred.head()

Unnamed: 0,actual,predicted
14789,no,no
8968,no,no
34685,no,no
2369,no,no
36561,no,yes


In [14]:
df_pred['Status'] = df_pred['actual'] == df_pred['predicted']
df_pred.head()

Unnamed: 0,actual,predicted,Status
14789,no,no,True
8968,no,no,True
34685,no,no,True
2369,no,no,True
36561,no,yes,False


In [15]:
df_pred['Status'].value_counts() / df_pred.shape[0] * 100

True     87.66588
False    12.33412
Name: Status, dtype: float64

In [16]:
print(train_y[train_y == 'yes'].shape[0])
print(train_y[train_y == 'no'].shape[0])

3710
27937


In [17]:
1 - np.square(3710/31674) - np.square(27937/31674)

0.2083267541648135

## Day 2 decision tree

In [18]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [19]:
bank = pd.read_csv('bank-full.csv', sep=';')
bank.shape

(45211, 17)

In [20]:
# here we get dummy by dropping our target variable column y
bank_dummy = pd.get_dummies(bank.drop('y', axis=1))
bank_dummy['y'] = bank['y']
bank_dummy.shape

(45211, 52)

In [21]:
train, test = train_test_split(bank_dummy,
                               test_size = 0.3,
                               random_state=100)
print(train.shape)
print(test.shape)

(31647, 52)
(13564, 52)


In [22]:
# Segregating input and output
train_y = train['y']
test_y = test['y']

train_x = train.drop('y', axis=1)
test_x = test.drop('y', axis=1)

train_x.shape

(31647, 51)

In [23]:
model = DecisionTreeClassifier(random_state=100, max_depth=3)
model.fit(train_x, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [24]:
from sklearn import tree
with open('modelbank.dot', 'w') as f:
    f = tree.export_graphviz(model,
                             out_file=f,
                             feature_names=train_x.columns) # this is for giving names to nodes in the tree

# Everything once again

In [25]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [26]:
bank = pd.read_csv('bank-full.csv', sep=';')
bank.shape

(45211, 17)

In [27]:
# here we get dummy by dropping our target variable column y
bank_dummy = pd.get_dummies(bank.drop('y', axis=1))
bank_dummy['y'] = bank['y']
bank_dummy.shape

(45211, 52)

In [28]:
train, test = train_test_split(bank_dummy,
                               test_size = 0.3,
                               random_state=100)
print(train.shape)
print(test.shape)

(31647, 52)
(13564, 52)


In [29]:
# Segregating input and output
train_y = train['y']
test_y = test['y']

train_x = train.drop('y', axis=1)
test_x = test.drop('y', axis=1)

train_x.shape

(31647, 51)

In [54]:
# Creating/Fitting a model
model_dt = DecisionTreeClassifier()
model_dt.fit(train_x, train_y)

# predicting on test data
test_pred_dt = model_dt.predict(test_x)
print(len(test_pred_dt))
test_pred_dt

df_pred_dt = pd.DataFrame({'actual': test_y,
                         'predicted': test_pred_dt})
df_pred_dt.head()

df_pred_dt['pred_status'] = df_pred_dt['actual'] == df_pred_dt['predicted']
df_pred_dt.head()

## true positive
tp_dt = df_pred_dt[(df_pred_dt['predicted']=='yes') & (df_pred_dt['actual']=='yes')].shape[0]

###true negative
tn_dt = df_pred_dt[(df_pred_dt['predicted']=='no') & (df_pred_dt['actual']=='no')].shape[0]

##false positive
fp_dt = df_pred_dt[(df_pred_dt['predicted']=='yes') & (df_pred_dt['actual']=='no')].shape[0]

##false negative
fn_dt = df_pred_dt[(df_pred_dt['predicted']=='no') & (df_pred_dt['actual']=='yes')].shape[0]

tn_dt, fp_dt, fn_dt, tp_dt

### we can do that using sklearn as that function of the confusion matrix is present
from sklearn.metrics import confusion_matrix

confusion_matrix(df_pred_dt['actual'],df_pred_dt['predicted'])

acc_dt = (tp_dt+tn_dt)/(tn_dt + fp_dt+fn_dt+tp_dt)
acc_dt

sensitivity_dt = tp_dt / (tp_dt + fn_dt)
sensitivity_dt

specificity_dt = tn_dt/(tn_dt + fp_dt)
specificity_dt

13564


0.9267417605340008

In [58]:
(df_pred_dt['pred_status'].value_counts() / df_pred_dt.shape[0] * 100) == acc_dt

True     False
False    False
Name: pred_status, dtype: bool

## Random forest

In [53]:

from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(random_state=100)
model_rf.fit(train_x, train_y)

test_pred_rf = model_rf.predict(test_x)
df_pred_rf = pd.DataFrame({'actual': test_y,
                        'predicted': test_pred_rf})

df_pred_rf['pred_status'] = df_pred_rf['actual'] == df_pred_rf['predicted']

## true positive
tp_rf = df_pred_rf[(df_pred_rf['predicted']=='yes') & (df_pred_rf['actual']=='yes')].shape[0]

###true negative
tn_rf = df_pred_rf[(df_pred_rf['predicted']=='no') & (df_pred_rf['actual']=='no')].shape[0]

##false positive
fp_rf = df_pred_rf[(df_pred_rf['predicted']=='yes') & (df_pred_rf['actual']=='no')].shape[0]

##false negative
fn_rf = df_pred_rf[(df_pred_rf['predicted']=='no') & (df_pred_rf['actual']=='yes')].shape[0]

tn_rf, fp_rf, fn_rf, tp_rf

### we can do that using sklearn as that function of the confusion matrix is present
from sklearn.metrics import confusion_matrix

confusion_matrix(df_pred_rf['actual'],df_pred_rf['predicted'])

acc_rf = (tp_rf+tn_rf)/(tn_rf + fp_rf+fn_rf+tp_rf)
acc_rf

sensitivity_rf = tp_rf / (tp_rf + fn_rf)
sensitivity_rf

specificity_rf = tn_rf/(tn_rf + fp_rf)
specificity_rf

## Adaptive Boost

In [59]:
from sklearn.ensemble import AdaBoostClassifier

model_ab = AdaBoostClassifier(random_state=100)
model_ab.fit(train_x, train_y)

test_pred_ab = model_ab.predict(test_x)

df_pred_ab = pd.DataFrame({'actual': test_y,
                        'predicted': test_pred_ab})

df_pred_ab['pred_status'] = df_pred_ab['actual'] == df_pred_ab['predicted']

## true positive
tp_ab = df_pred_ab[(df_pred_ab['predicted']=='yes') & (df_pred_ab['actual']=='yes')].shape[0]

###true negative
tn_ab = df_pred_ab[(df_pred_ab['predicted']=='no') & (df_pred_ab['actual']=='no')].shape[0]

##false positive
fp_ab = df_pred_ab[(df_pred_ab['predicted']=='yes') & (df_pred_ab['actual']=='no')].shape[0]

##false negative
fn_ab = df_pred_ab[(df_pred_ab['predicted']=='no') & (df_pred_ab['actual']=='yes')].shape[0]

### we can do that using sklearn as that function of the confusion matrix is present
from sklearn.metrics import confusion_matrix

confusion_matrix(df_pred_ab['actual'],df_pred_ab['predicted'])

acc_ab = (tp_ab+tn_ab)/(tp_ab+tn_ab+fp_ab+fn_ab)
acc_ab

sensitivity_ab = tp_ab / (tp_ab + fn_ab)
sensitivity_ab

specificity_ab = tn_ab/(tn_ab + fp_ab)
specificity_ab



0.966624947851481

In [49]:
main_df = {'Decision Tree': pd.Series([tp_dt,tn_dt,fp_dt,fn_dt,acc_dt,sensitivity_dt,specificity_dt], 
                                      index=['TP','TN','FP','FN','Accuracy','Sensitivity','Specificity']),
          'Random Forest': pd.Series([tp_rf,tn_rf,fp_rf,fn_rf,acc_rf,sensitivity_rf,specificity_rf], 
                                     index=['TP','TN','FP','FN','Accuracy','Sensitivity','Specificity']),
          'Adaptive Boosting': pd.Series([tp_ab,tn_ab,fp_ab,fn_ab,acc_ab,sensitivity_ab,specificity_ab], 
                                     index=['TP','TN','FP','FN','Accuracy','Sensitivity','Specificity'])}

In [50]:
info = pd.DataFrame(main_df)

In [51]:
info

Unnamed: 0,Adaptive Boosting,Decision Tree,Random Forest
TP,585.0,782.0,561.0
TN,11585.0,11118.0,11643.0
FP,400.0,867.0,342.0
FN,994.0,797.0,1018.0
Accuracy,0.897228,0.877322,0.899735
Sensitivity,0.370488,0.49525,0.355288
Specificity,0.966625,0.92766,0.971464
