# Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

## Import data

In [None]:
# Load the college-admissions dataset; the path is relative to the notebook's working directory.
col_admit = pd.read_csv("../assets/dataset/collegeadmissions.csv")

# Preview the first 10 rows of the raw frame.
col_admit.head(10)

## Parse data

In [None]:
# Check parsing: list each column's dtype and non-null count for the raw frame.
col_admit.info()

In [None]:
# Build the modelling frame without mutating the raw `col_admit` frame, so this
# cell produces the same result however many times it is re-run.

# create dummy variables: one indicator column per distinct `rank` value (rank_<value>)
rank_dummy = pd.get_dummies(col_admit['rank'], prefix='rank')

# re-parse `admit` as a boolean (True where admit == 1), fuse the dummies back
# on, then drop the original `rank` column that the dummies replace
col_admit_clean = (
    pd.concat([col_admit.assign(admit=col_admit['admit'] == 1), rank_dummy], axis=1)
    .drop(columns=['rank'])
)

In [None]:
# Check parsing again on the cleaned frame: `admit` was converted to a boolean
# and `rank` was replaced by the `rank_*` dummy columns.
col_admit_clean.info()

In [None]:
# Preview the first 10 rows of the cleaned frame as a final sanity check.
col_admit_clean.head(10)

## Model

### Training

In [None]:
# Specify the classifier. C is the inverse regularisation strength, so a very
# large C effectively switches regularisation off, which lets the coefficients
# be compared with statsmodels and R output.
lr = LogisticRegression(C=1e9)

# feature set: everything except the target; `rank_1` is dropped as well so it
# acts as the reference category for the remaining rank dummies
X = col_admit_clean.drop(columns=['admit', 'rank_1'])

# target
y = col_admit_clean['admit']

# Create training / testing datasets. test_size must be > 0: the testing cells
# call lr.predict(X_test), and an empty (or rejected) split leaves them nothing
# to score. 30% of the rows are held out; random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# row counts of the training and testing partitions, one per line
print(X_train.shape[0], X_test.shape[0], sep="\n")

In [None]:
# Fit the classifier on the training partition only.
lr.fit(X_train,y_train)

### Interpretation

In [None]:
# Fitted coefficients (log-odds scale), one per column of X in column order.
# `rank_1` was dropped from X, so the rank coefficients are relative to rank 1.
lr.coef_

### Testing

In [None]:
# hard class predictions for the held-out rows
predicted = lr.predict(X_test)

# class-membership probabilities: column 0 is "not admitted", column 1 is "admitted"
predictied_prob = lr.predict_proba(X_test)

# one table with the actual outcome, the predicted outcome and both estimated
# probabilities (admission first, then rejection)
validate = y_test.to_frame(name='actual').assign(
    predicted=predicted,
    predictied_prob_admit=predictied_prob[:, 1],
    predictied_prob_not_admit=predictied_prob[:, 0],
)

validate.head(10)

Note: when `predictied_prob_admit` > 0.5, we predict that the student will be admitted.

In [None]:
# overall accuracy: share of rows where the prediction matches the actual
# outcome. Be careful: accuracy alone can mislead when the classes are imbalanced.
(validate['actual'] == validate['predicted']).mean()

## Confusion matrix

In [None]:
# Confusion matrix: rows are the actual classes, columns the predicted classes.
confusion_matrix(validate['actual'], validate['predicted'])

In [None]:
# the same confusion matrix as a labelled table, with row/column totals ("All")
pd.crosstab(
    index=validate['actual'],
    columns=validate['predicted'],
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

What can we say about this? Is this a good model?

## ROC Curve (what happens when we vary the classification threshold?)

In [None]:
# Compute the ROC points once: false/true positive rates at every threshold on
# the estimated probability of admission (column 1 of predict_proba).
fpr, tpr, thresholds = roc_curve(y_test, predictied_prob[:, 1])

# area under the curve: 0.5 is chance level, 1.0 is a perfect ranking
auc = roc_auc_score(y_test, predictied_prob[:, 1])

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='logistic regression (AUC = {:.3f})'.format(auc))
# diagonal reference line: the performance of random guessing
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
ax.set(xlabel='False positive rate', ylabel='True positive rate', title='ROC curve')
ax.legend(loc='lower right');

Is this a good model? What about error? We have only evaluated a single train/test split here... perhaps we can use cross-validation to get a better idea of the model's performance.