## Import the relevant libraries

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

# Apply a fix to the statsmodels library: expose `stats.chisqprob` as a thin
# wrapper around the chi-squared survival function `stats.chi2.sf`.
# NOTE(review): presumably needed because some statsmodels releases still call
# `stats.chisqprob`, which newer SciPy no longer ships — confirm.
from scipy import stats


def _chisqprob(chisq, df):
    """Return ``stats.chi2.sf(chisq, df)``."""
    return stats.chi2.sf(chisq, df)


stats.chisqprob = _chisqprob

## Load the data

In [None]:
# Keep the file contents untouched in `raw_data`; `data` is a separate frame
# in which the two text columns are replaced by 0/1 codes.
raw_data = pd.read_csv('3.0binaryPredictors.csv')
data = raw_data.assign(
    Admitted=raw_data['Admitted'].map({'Yes': 1, 'No': 0}),
    Gender=raw_data['Gender'].map({'Female': 1, 'Male': 0}),
)
data

## Declare the dependent and the independent variables

In [None]:
# Predictors: the SAT and Gender columns. Target: the encoded Admitted column.
x1 = data.loc[:, ['SAT', 'Gender']]
y = data.loc[:, 'Admitted']

## Regression

In [None]:
# Add a constant column to the predictors, then fit a logistic regression of
# y on the resulting design matrix.
x = sm.add_constant(x1)
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()

# Show the regression summary table.
results_log.summary()

In [None]:
# Exponentiate the value 1.94.
# NOTE(review): 1.94 looks hand-copied from the regression summary above
# (presumably the Gender coefficient, making this an odds ratio) — confirm, and
# consider reading the value from `results_log.params` instead of hard-coding it.
np.exp(1.94)

## Accuracy

In [None]:
# Print floats with two decimals so the in-sample predictions are easy to scan.
# To restore NumPy's default formatting later: np.set_printoptions(formatter=None)
np.set_printoptions(formatter={'float': "{0:0.2f}".format})
results_log.predict()

In [None]:
# The observed outcomes as a NumPy array, for comparison with the predictions.
data['Admitted'].to_numpy()

In [None]:
# Prediction table of the fitted model on the training data, with the
# (default) 0.5 cut-off spelled out.
results_log.pred_table(threshold=0.5)

In [None]:
# Wrap the prediction table in a DataFrame with readable row/column labels.
cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

In [None]:
# Training accuracy: the diagonal of the table (rows and columns agree)
# divided by the total number of observations.
cm = cm_df.to_numpy()
accuracy_train = np.trace(cm) / cm.sum()
accuracy_train

# Underfitting and Overfitting

* underfitting: the model has not captured the underlying logic of the data
* overfitting: the model has fit the particular training set so closely that it has "missed the point" and does not generalize to new data



## Testing the model and assessing its accuracy

In [1]:
# Testing is done on a dataset the model has never seen before.
# This cell relies on `pd` having been imported by the first cell of the
# notebook, so run the notebook from the top.

test = pd.read_csv('4.0testDataset.csv')
test

NameError: name 'pd' is not defined

In [None]:
# Encode the text columns of the test set with the same 0/1 codes used for the
# training data.
# The identity entries (1: 1, 0: 0) make this cell safe to re-run: with only
# the string keys, a second run would map the already-encoded values to NaN,
# because Series.map turns every value missing from the dict into NaN.
test['Admitted'] = test['Admitted'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})
test['Gender'] = test['Gender'].map({'Female': 1, 'Male': 0, 1: 1, 0: 0})
test

In [None]:
# Display the training design matrix (`x = sm.add_constant(x1)`) so its column
# layout can be compared with the test data prepared in the next cell.
x

In [None]:
# Split the test set into the observed outcomes and the predictors.
test_actual = test['Admitted']
test_data = test.drop(['Admitted'], axis=1)

# has_constant='add' always creates the intercept column. With the default
# ('skip'), add_constant adds nothing when a test column happens to be constant,
# which would leave the matrix one column short of what the model expects.
test_data = sm.add_constant(test_data, has_constant='add')

# Align with the training design matrix `x`: same columns, same order.
# model.predict matches columns by position, so a different order or an extra
# column would otherwise give wrong predictions or a shape error.
test_data = test_data[x.columns]
test_data

In [None]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix and the accuracy of a binary model.

    Parameters
    ----------
    data : array-like
        Predictors, passed unchanged to ``model.predict``.
    actual_values : array-like
        Observed outcomes, expected to be encoded as 0 or 1.
    model : object
        Any object exposing ``predict(data)`` whose result lies in [0, 1].
    threshold : float, optional
        Cut-off strictly between 0 and 1. Predictions at or above it are
        counted as class 1, the rest as class 0. The default of 0.5 matches
        the previously hard-coded value.

    Returns
    -------
    cm : numpy.ndarray
        2x2 float array of counts; rows are the actual class (0, 1) and
        columns the predicted class (0, 1).
    accuracy : float
        Share of observations on the diagonal of ``cm``.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.

    Notes
    -----
    Observations whose actual or predicted value falls outside [0, 1] are
    ignored by ``numpy.histogram2d`` and therefore not counted.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    pred_values = model.predict(data)

    # Bin edges: actual outcomes are split at 0.5 (0 vs 1), predictions at the
    # requested threshold. The last bin is closed on the right, so 1 is included.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, pred_bins])[0]

    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    return cm, accuracy

In [None]:
# Evaluate the fitted model on the held-out test set.
# `cm` holds the pair returned by confusion_matrix: (matrix, accuracy).
cm = confusion_matrix(
    data=test_data,
    actual_values=test_actual,
    model=results_log,
)
cm

In [None]:
# Label the test-set confusion matrix (first element of `cm`) for display.
cm_df = pd.DataFrame(
    cm[0],
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

In [None]:
# Derive the misclassification rate from the computed test confusion matrix
# instead of hard-coding the counts, so it stays correct if the data or the
# model change. `cm` is the (matrix, accuracy) pair returned by confusion_matrix;
# the off-diagonal cells of the matrix are the misclassified observations.
misclassified = cm[0][0, 1] + cm[0][1, 0]
print ('Missclassification rate: '+str(misclassified / cm[0].sum()))