# Calculating the Accuracy of the Model

Using the same dataset, expand the model by including all other features into the regression. 

Moreover, calculate the accuracy of the model and create a confusion matrix.

## Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default styling to every matplotlib figure drawn later in the notebook.
sns.set()

## Load the data

Load the ‘Bank-data.csv’ dataset (stored in the ‘datasets’ folder).

In [2]:
# Read the bank marketing data; the path is relative to the notebook's working directory.
raw_data = pd.read_csv('datasets/Bank-data.csv')
# Bare last expression so the notebook renders the frame as a table.
raw_data

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.120,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no
...,...,...,...,...,...,...,...,...
513,513,1.334,0.0,1.0,0.0,0.0,204.0,no
514,514,0.861,0.0,0.0,2.0,1.0,806.0,yes
515,515,0.879,0.0,0.0,0.0,0.0,290.0,no
516,516,0.877,0.0,0.0,5.0,1.0,473.0,yes


In [3]:
# Derive the working frame in one chain so that `raw_data` is never modified:
#   1. discard the leftover 'Unnamed: 0' column that came in with the CSV,
#   2. append 'Subscription', the numeric (1 = 'yes', 0 = 'no') version of 'y'.
data = (
    raw_data
    .drop(columns=['Unnamed: 0'])
    .assign(Subscription=lambda frame: frame['y'].map({'yes': 1, 'no': 0}))
)

In [4]:
# Display the prepared frame to confirm the dropped column and the new 0/1 'Subscription' column.
data

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y,Subscription
0,1.334,0.0,1.0,0.0,0.0,117.0,no,0
1,0.767,0.0,0.0,2.0,1.0,274.0,yes,1
2,4.858,0.0,1.0,0.0,0.0,167.0,no,0
3,4.120,0.0,0.0,0.0,0.0,686.0,yes,1
4,4.856,0.0,1.0,0.0,0.0,157.0,no,0
...,...,...,...,...,...,...,...,...
513,1.334,0.0,1.0,0.0,0.0,204.0,no,0
514,0.861,0.0,0.0,2.0,1.0,806.0,yes,1
515,0.879,0.0,0.0,0.0,0.0,290.0,no,0
516,0.877,0.0,0.0,5.0,1.0,473.0,yes,1


### Declare the dependent and independent variables

Use 'duration' as the independent variable.

In [5]:
# Dependent variable: the 0/1 'Subscription' column.
y = data['Subscription']
# Single predictor for the simple model.
x1 = data['duration']

### Simple Logistic Regression

Run the regression and graph the scatter plot.

In [6]:
# statsmodels does not add an intercept by itself, so prepend a constant column.
x = sm.add_constant(x1)
# Logit takes the dependent variable first, then the design matrix.
reg_log = sm.Logit(y, x)
# fit() runs the maximum-likelihood optimisation and prints its convergence report.
results_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.546118
         Iterations 7


In [7]:
# Render the fit report for the one-variable model (coefficients, p-values, pseudo R-squared).
results_log.summary()

0,1,2,3
Dep. Variable:,Subscription,No. Observations:,518.0
Model:,Logit,Df Residuals:,516.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 22 Sep 2020",Pseudo R-squ.:,0.2121
Time:,15:31:37,Log-Likelihood:,-282.89
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,5.387e-35

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.7001,0.192,-8.863,0.000,-2.076,-1.324
duration,0.0051,0.001,9.159,0.000,0.004,0.006


## Expand the model

We may be omitting many causal factors in our simple logistic model, so we switch to a multivariate logistic regression instead. Add the ‘interest_rate’, ‘march’, ‘credit’ and ‘previous’ variables to the model (keeping ‘duration’) and run the regression again.

### Declare the independent variable(s)

In [8]:
# Same 0/1 target as in the simple model.
y = data['Subscription']
# Five predictors; note that 'may' is left out of this specification.
# `x1` is rebound here from the single 'duration' column to a multi-column frame.
x1 = data[['interest_rate', 'credit', 'march', 'previous', 'duration']]

In [9]:
# Rebuild the design matrix with an intercept column for the expanded predictor set.
x = sm.add_constant(x1)
# `reg_log` and `results_log` are rebound: from here on they refer to the multivariate model.
reg_log = sm.Logit(y, x)
results_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.336664
         Iterations 7


In [10]:
# Render the fit report for the multivariate model.
results_log.summary()

0,1,2,3
Dep. Variable:,Subscription,No. Observations:,518.0
Model:,Logit,Df Residuals:,512.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 22 Sep 2020",Pseudo R-squ.:,0.5143
Time:,15:31:37,Log-Likelihood:,-174.39
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,1.211e-77

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0211,0.311,-0.068,0.946,-0.631,0.589
interest_rate,-0.8001,0.089,-8.943,0.000,-0.975,-0.625
credit,2.3585,1.088,2.169,0.030,0.227,4.490
march,-1.8322,0.330,-5.556,0.000,-2.478,-1.186
previous,1.5363,0.501,3.067,0.002,0.554,2.518
duration,0.0070,0.001,9.381,0.000,0.006,0.008


### Confusion Matrix

Create the confusion matrix of the model and estimate its accuracy. 

<i> For convenience we have already provided you with a function that finds the confusion matrix and the model accuracy.</i>

In [11]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Compute the 2x2 confusion matrix of a binary model and its accuracy.

    Parameters
    ----------
    data : data frame or array
        Inputs formatted in the same way as the data the model was fitted on
        (without the actual values), e.g. const, var1, var2, etc.
        Column order is very important.
    actual_values : data frame column or array
        The observed outcomes; for a logistic regression this is a single
        column of 0s and 1s.
    model : fitted model object
        Any object whose ``predict(data)`` returns probabilities in [0, 1],
        e.g. the ``results_log`` produced by ``sm.Logit(...).fit()``.
    threshold : float, optional
        Cut-off applied to the predicted probabilities: values in
        ``[0, threshold)`` count as class 0 and values in ``[threshold, 1]``
        count as class 1. Must lie strictly between 0 and 1. Defaults to 0.5.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        Counts with actual classes on the rows and predicted classes on the
        columns (``cm[0, 0]`` = actual 0 predicted 0, ``cm[1, 1]`` = actual 1
        predicted 1).
    accuracy : float
        Share of observations on the main diagonal of ``cm``.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    # The bin edges must be strictly increasing, which rules out 0 and 1 themselves.
    if not 0 < threshold < 1:
        raise ValueError('threshold must be strictly between 0 and 1')

    # Predict the probabilities using the fitted model.
    pred_values = model.predict(data)

    # The actual values are already 0/1, so they are always split at 0.5;
    # only the predicted probabilities are split at the requested threshold.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])

    # A 2D histogram over those two-bin axes is exactly the confusion matrix.
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, pred_bins])[0]

    # Accuracy = correctly classified observations / all observations.
    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    return cm, accuracy

In [16]:
# NOTE: despite its name, `cm` holds the (matrix, accuracy) tuple returned by confusion_matrix.
# The model is scored on the same x and y it was fitted on, so this is in-sample accuracy.
cm = confusion_matrix(x, y, results_log)
cm

(array([[218.,  41.],
        [ 30., 229.]]),
 0.862934362934363)

In [17]:
# Wrap statsmodels' own prediction table in a labelled frame:
# rows are the actual classes, columns the predicted ones.
cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,218.0,41.0
Actual 1,30.0,229.0


In [19]:
# cm[1] is the accuracy element of the (matrix, accuracy) tuple; show it to four decimals.
print('Accuracy: %.4f' % cm[1])

Accuracy: 0.8629
