In [1]:
# This notebook uses Scikit-Learn's LogisticRegression class to perform the task of Admission Prediction.

In [2]:
# Import Statements
import numpy as np
import pandas as pd
import sklearn
from pathlib import Path
import joblib

In [3]:
# Load the data into a pandas dataframe
root = Path('archive')
filename = 'Admission_Predict_Ver1.1.csv'

admission_data = pd.read_csv(root / filename)
admission_data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [4]:
# Select the predictors and target values
X = admission_data[['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA', 'Research']]
Y = admission_data['Chance of Admit ']
X.shape, Y.shape

((500, 7), (500,))

In [5]:
# Convert the target values to 0's and 1's
def binarizeValues(list_var):
    '''This function takes a list of probabality values and binarizes them so that the output only contains 0s and 1s.'''
    output_list = []
    for item in list_var:
        if item < 0.5:
            output_list.append(0)
        else:
            output_list.append(1)
    return output_list

Y = pd.Series(binarizeValues(Y.tolist()))

In [6]:
# Create a train and test set
shuffled_indices = np.random.permutation(len(X))
X_shuffled = X.loc[shuffled_indices]
Y_shuffled = Y.loc[shuffled_indices]

X_train = X_shuffled[:400]
Y_train = Y_shuffled[:400]
X_test = X_shuffled[400:]
Y_test = Y_shuffled[400:]
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((400, 7), (100, 7), (400,), (100,))

In [7]:
# Cross Validation to estimate the models performance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression()
cross_val_score(log_reg, X_train, Y_train, scoring = 'accuracy', cv = 3)



array([0.92537313, 0.90225564, 0.90977444])

In [8]:
# Train the Logistic Regression model on the train data
log_reg.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
# Generate predictions for the test set
Y_test_predictions = log_reg.predict(X_test)

In [10]:
# Evaluate the test set
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(Y_test, Y_test_predictions)
print('Test Set Accuracy: ', accuracy)

Test Set Accuracy:  0.93


In [11]:
# The cross val accuracy and the test set accuracy are misleading! The labels/classes were skewed.
def numberOfZeros(list1):
    count = 0
    for item in list1:
        if item == 0:
            count += 1
        else:
            continue
    return count

def numberOfOnes(list1):
    count = 0
    for item in list1:
        if item == 1:
            count += 1
        else:
            continue
    return count

print('Rejects: {}'.format(numberOfZeros(Y.tolist())) + '\n' + 'Admits: {}'.format(numberOfOnes(Y.tolist())))

Rejects: 37
Admits: 463


In [12]:
# A better performance metric is the Precision, Recall and F1-score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

cm = confusion_matrix(Y_test, Y_test_predictions)
print('Confusion Matrix: \n {}'.format(cm))

precision = precision_score(Y_test, Y_test_predictions)
recall = recall_score(Y_test, Y_test_predictions)
f1 = f1_score(Y_test, Y_test_predictions)
print('Precision: {}'.format(precision) + '\n' + 'Recall: {}'.format(recall) + '\n' + 'F1 Score: {}'.format(f1))


Confusion Matrix: 
 [[ 1  5]
 [ 2 92]]
Precision: 0.9484536082474226
Recall: 0.9787234042553191
F1 Score: 0.963350785340314
