## Logistic regression

### Probability of passing or failing an exam as a function of hours of study
### data source: Wikipedia
### https://en.wikipedia.org/wiki/Logistic_regression


In [None]:
import sklearn
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import metrics
from matplotlib import pyplot as plt

In [None]:
# Load the exam data set from the CSV file (relative path, so it is looked up
# in the current working directory). Later cells use its Hours and Pass columns.
data_exams = pd.read_csv(filepath_or_buffer="PassFail.csv")
# Show the first rows as a quick sanity check of what was loaded.
data_exams.head()

In [None]:
# Summary statistics of the loaded data, shown as the cell output.
data_exams.describe()

In [None]:
# Fit a GLM with a binomial family: Pass is the response, Hours the predictor.
formula = "Pass ~ Hours"
model = smf.glm(
    formula,
    data_exams,
    family=sm.families.Binomial(),
)
result = model.fit()
# Show the fitted coefficients and fit statistics as the cell output.
result.summary()

In [None]:
# In-sample predicted probabilities, one per row of the training data.
# A DataFrame (not a bare Series) is passed so the formula's 'Hours' term is
# looked up by column name, the same way the out-of-sample cells build their
# input, instead of relying on how predict() interprets a 1-D Series.
probabilities = result.predict(data_exams[['Hours']])

In [None]:
# Plot the fitted probabilities against hours of study.
plt.scatter(data_exams['Hours'], probabilities, marker='+', color='red')
# Label the axes so the figure can be read on its own.
plt.xlabel('Hours')
plt.ylabel('Predicted probability')
# End with show(), as the ROC cell does, so the cell renders the figure
# rather than echoing the repr of the object returned by scatter().
plt.show()

In [None]:
# What is the predicted probability of passing after 2 hours of study?
Hours = [2]
# The column is named 'Hours' so it matches the term in the model formula.
x = pd.DataFrame({'Hours': Hours})
pass_pred = result.predict(x)
pass_pred

In [None]:
# Two students: one studies 2.5 hours, the other 3.5 hours.
Hours = [2.5, 3.5]
# One row per student, in a column named 'Hours' as the formula expects.
x = pd.DataFrame({'Hours': Hours})
pass_pred = result.predict(x)
pass_pred

#### What if we want to split the students into two groups: "likely to pass" and "likely to fail"?
#### We need to decide what probability is high enough to count as a likely pass. Any student whose predicted probability of passing is above that threshold is assigned to the "likely to pass" group; all other students are assigned to the "likely to fail" group.


In [None]:
# In-sample class predictions using the customary threshold of 0.5:
# 1 where the fitted probability is strictly above 0.5, otherwise 0.
classes = np.where(probabilities > 0.5, 1, 0)
classes

In [None]:
# Out-of-sample class predictions for two new students (2.5 and 3.5 hours).
Hours = [2.5, 3.5]
x = pd.DataFrame({'Hours': Hours})
pass_pred = result.predict(x)
# Apply the same 0.5 threshold: 1 if the probability is above it, else 0.
cls = np.where(pass_pred > 0.5, 1, 0)
cls


### Using LogisticRegression() from sklearn

In [None]:
# Fit scikit-learn's logistic regression with Hours as the only feature
# (a 2-D frame, hence the double brackets) and Pass as the target.
# NOTE(review): LogisticRegression() is regularized by default in
# scikit-learn, so its coefficients need not match the statsmodels GLM fit
# from the earlier cell -- confirm against the installed version.
logreg = LogisticRegression()
logreg.fit(X=data_exams[['Hours']], y=data_exams['Pass'])

In [None]:
# In-sample class predictions from the scikit-learn model.
y_pred = logreg.predict(X=data_exams[['Hours']])
y_pred

In [None]:
# Confusion matrix: observed Pass labels versus the in-sample predictions.
cnf_matrix = metrics.confusion_matrix(y_true=data_exams['Pass'], y_pred=y_pred)
cnf_matrix

In [None]:
# ROC curve and AUC for the scikit-learn model, evaluated on the same data it
# was fitted on (in-sample).
# Column 1 of predict_proba is used as the score passed to the ROC functions
# (presumably the probability of Pass == 1 -- confirm the label encoding).
y_pred_proba = logreg.predict_proba(data_exams[['Hours']])[:, 1]
fpr, tpr, _ = metrics.roc_curve(data_exams.Pass, y_pred_proba)
auc = metrics.roc_auc_score(data_exams.Pass, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
# Label the axes so the curve can be read without consulting the code.
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
# "lower right" is the named form of the numeric legend location code 4.
plt.legend(loc="lower right")
plt.show()