In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
# Apply seaborn's default theme to any matplotlib figures drawn later in the notebook.
sns.set()

# Load Data

In [9]:
# Training sample: the preview below shows SAT, Admitted (Yes/No) and Gender columns.
raw_data = pd.read_csv(filepath_or_buffer="Binary predictors.csv")
raw_data.head(n=5)

Unnamed: 0,SAT,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male


# Map Variables

In [10]:
# Encode the two categorical columns as 0/1 on a fresh frame so that
# `raw_data` keeps the original Yes/No and Female/Male labels.
data = raw_data.assign(
    Admitted=raw_data["Admitted"].map({"Yes": 1, "No": 0}),
    Gender=raw_data["Gender"].map({"Female": 1, "Male": 0}),
)
data.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0


# Declare Variables

In [11]:
# Target: the 0/1 admission flag. Predictors: encoded gender and SAT score.
y = data.loc[:, "Admitted"]
x1 = data.loc[:, ["Gender", "SAT"]]

# Regression

In [12]:
# Prepend an explicit constant column so the model estimates an intercept.
x = sm.add_constant(x1)

# Fit the logistic regression by maximum likelihood and show the summary table.
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 17 Oct 2018",Pseudo R-squ.:,0.8249
Time:,20:11:14,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
,,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.3489,16.454,-4.154,0.000,-100.598,-36.100
Gender,1.9449,0.846,2.299,0.022,0.287,3.603
SAT,0.0406,0.010,4.129,0.000,0.021,0.060


log(odds) = -0.64 + 2.08 * Gender # Only Gender included in x1
log(odds) = -68.35 + 1.95 * Gender + 0.0406 * SAT # Gender and SAT included in x1

In [13]:
# 1.9449 is the Gender coefficient copied from the regression summary above;
# exponentiating a logit coefficient gives the odds ratio for a one-unit change
# in that predictor (here Female = 1 versus Male = 0), other predictors held fixed.
np.exp(1.9449)

6.992932526814459

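The odds of admission are about 7 times higher for a female student than for a male student with the same SAT score (exp(1.9449) ≈ 6.99). This is a ratio of odds, not of probabilities.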

In [14]:
# Print floats with two decimals so the fitted probabilities are easy to scan.
# This is a global NumPy setting, so it also affects arrays printed by later cells.
np.set_printoptions(formatter={"float": "{0:0.2f}".format})

# With no arguments, predict() returns the fitted values for the training rows.
results_log.predict()

array([0.00, 1.00, 1.00, 0.23, 0.02, 0.99, 1.00, 1.00, 1.00, 0.01, 1.00,
       1.00, 0.76, 0.00, 0.60, 1.00, 0.11, 0.12, 0.51, 1.00, 1.00, 1.00,
       0.00, 0.01, 0.97, 1.00, 0.48, 0.99, 1.00, 0.99, 0.00, 0.83, 0.25,
       1.00, 1.00, 1.00, 0.31, 1.00, 0.23, 0.00, 0.02, 0.45, 1.00, 0.00,
       0.99, 0.00, 0.99, 0.00, 0.00, 0.01, 0.00, 1.00, 0.92, 0.02, 1.00,
       0.00, 0.37, 0.98, 0.12, 1.00, 0.00, 0.78, 1.00, 1.00, 0.98, 0.00,
       0.00, 0.00, 1.00, 0.00, 0.78, 0.12, 0.00, 0.99, 1.00, 1.00, 0.00,
       0.30, 1.00, 1.00, 0.00, 1.00, 1.00, 0.85, 1.00, 1.00, 0.00, 1.00,
       1.00, 0.89, 0.83, 0.00, 0.98, 0.97, 0.00, 1.00, 1.00, 0.03, 0.99,
       0.96, 1.00, 0.00, 1.00, 0.01, 0.01, 1.00, 1.00, 1.00, 0.00, 0.00,
       0.02, 0.33, 0.00, 1.00, 0.09, 0.00, 0.97, 0.00, 0.75, 1.00, 1.00,
       0.01, 0.01, 0.00, 1.00, 0.00, 0.99, 0.57, 0.54, 0.87, 0.83, 0.00,
       1.00, 0.00, 0.00, 0.00, 1.00, 0.04, 0.00, 0.01, 1.00, 0.99, 0.52,
       1.00, 1.00, 0.05, 0.00, 0.00, 0.00, 0.68, 1.

In [16]:
# Confusion matrix on the training data: rows are actual classes, columns are
# predicted classes, using the default 0.5 probability cut-off.
cm = results_log.pred_table(threshold=0.5)
cm

array([[69.00, 5.00],
       [4.00, 90.00]])

actual 0: predicted 0 = 69. actual 1: predicted 1 = 90.
actual 0: predicted 1 = 5. actual 1: predicted 0 = 4.

# Model Accuracy

In [20]:
# Accuracy = correctly classified rows (the matrix diagonal) over all rows.
accuracy = np.trace(cm) / cm.sum()
accuracy

0.9464285714285714

# Testing the model and assessing its accuracy

In [27]:
# Held-out sample; the preview below shows the same columns as the training file.
test = pd.read_csv(filepath_or_buffer="Test dataset.csv")
test.head(n=5)

Unnamed: 0,SAT,Admitted,Gender
0,1323,No,Male
1,1725,Yes,Female
2,1762,Yes,Female
3,1777,Yes,Male
4,1665,No,Male


In [28]:
# Encode the test labels with the same Yes/No and Female/Male -> 1/0 scheme
# that was applied to the training data.
# These assignments overwrite columns of `test` in place.
# NOTE(review): Series.map with a dict yields NaN for values that are not keys,
# so re-running this cell without reloading `test` would presumably turn the
# already-encoded 0/1 values into NaN. Reload the CSV before re-running.
test["Admitted"] = test["Admitted"].map({"Yes":1, "No":0})
test["Gender"] = test["Gender"].map({"Female": 1, "Male":0})
test.head()

Unnamed: 0,SAT,Admitted,Gender
0,1323,0,0
1,1725,1,1
2,1762,1,1
3,1777,1,0
4,1665,0,0


In [30]:
# Check the column order of the training design matrix; the test inputs built
# below have to be arranged the same way before they are passed to the model.
x.head(n=5)

Unnamed: 0,const,Gender,SAT
0,1.0,0,1363
1,1.0,1,1792
2,1.0,1,1954
3,1.0,0,1653
4,1.0,0,1593


In [31]:
# Set the true labels aside, then build the model inputs from the remaining
# columns with a constant column added for the intercept.
test_actual = test["Admitted"]
test_data = sm.add_constant(test.drop(columns="Admitted"))
test_data.head()

Unnamed: 0,const,SAT,Gender
0,1.0,1323,0
1,1.0,1725,1
2,1.0,1762,1
3,1.0,1777,0
4,1.0,1665,0


In [33]:
# The model was fitted on `x`, so the test inputs must present the same columns
# in the same order. Take that order from the training design matrix instead of
# repeating the column names by hand; a column missing from the test data then
# raises a KeyError here instead of silently misaligning the coefficients.
test_data = test_data[list(x.columns)]
test_data.head()

Unnamed: 0,const,Gender,SAT
0,1.0,0,1323
1,1.0,1,1725
2,1.0,1,1762
3,1.0,0,1777
4,1.0,0,1665


In [36]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Return the 2x2 confusion matrix and accuracy of a binary classifier.

    Parameters
    ----------
    data : array-like
        Inputs handed unchanged to ``model.predict``.
        NOTE(review): for the statsmodels result used in this notebook the
        columns presumably have to be in the same order as the training
        design matrix -- confirm before calling with new data.
    actual_values : array-like
        Observed classes, coded as 0 and 1.
    model : object
        Anything exposing ``predict(data)`` that returns values in [0, 1].
    threshold : float, default 0.5
        Cut-off applied to the predicted values. A prediction equal to or
        above the threshold is counted as class 1. Must lie strictly
        between 0 and 1.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of rows whose actual class is ``i`` and
        whose predicted class is ``j``.
    accuracy : float
        Share of rows on the diagonal of ``cm``.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    pred_values = model.predict(data)

    # Actual classes are exactly 0 or 1, so any edge between them separates
    # the two; predictions are split at the requested threshold. NumPy bins
    # are half-open except the last, so a value equal to the middle edge
    # falls into the upper bin.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, pred_bins])[0]

    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    return cm, accuracy

In [37]:
# Evaluate the fitted model on the held-out data.
# Note: this rebinds `cm` (previously the training confusion matrix array)
# to the (matrix, accuracy) tuple returned by confusion_matrix.
cm = confusion_matrix(
    data=test_data,
    actual_values=test_actual,
    model=results_log,
)
cm

(array([[5.00, 1.00],
        [1.00, 12.00]]), 0.8947368421052632)