In [1]:
import enum
import pandas as pd
import numpy as np
import sklearn
import pprint as pprn
from typing import Callable
from collections import OrderedDict
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics

%matplotlib inline


In [2]:
def prepare_dataset(dataset: pd.DataFrame, dropna_rows_naive: bool, columns_to_drop: list):
    """Clean ``dataset`` in place; nothing is returned.

    Args:
        dataset: Frame to modify. It is mutated directly.
        dropna_rows_naive: When truthy, every row containing at least one
            missing value is removed.
        columns_to_drop: Column labels to remove. Only a non-empty ``list``
            triggers the removal; any other value (``None``, a tuple, an
            empty list) is ignored.
    """
    if dropna_rows_naive:
        dataset.dropna(axis="index", inplace=True)

    has_columns_to_drop = isinstance(columns_to_drop, list) and len(columns_to_drop) > 0
    if has_columns_to_drop:
        dataset.drop(columns=columns_to_drop, inplace=True)


def transform_dataset_column(dataset: pd.DataFrame, column_name: str, transform_function: Callable):
    """Overwrite one column of ``dataset`` with its element-wise transform.

    Args:
        dataset: Frame to modify in place.
        column_name: Label of the column to rewrite.
        transform_function: Callable applied to each value of the column via
            ``Series.apply``; its results replace the original values.
    """
    current_values = dataset[column_name]
    transformed_values = current_values.apply(transform_function)
    dataset[column_name] = transformed_values


def prepare_train_and_test_data(dataset_train: pd.DataFrame, dataset_test: pd.DataFrame, target_variable_name: str):
    """Split each frame into a feature frame and a target series.

    Args:
        dataset_train: Training frame that contains the target column.
        dataset_test: Test frame that contains the target column.
        target_variable_name: Label of the target column.

    Returns:
        The tuple ``(X_train, Y_train, X_test, Y_test)``. Each ``X_*`` is a new
        frame without the target column; each ``Y_*`` is the target column
        taken from the corresponding input frame. The inputs keep all their
        columns.
    """
    def _features_and_target(frame):
        # drop() without inplace returns a new frame, so ``frame`` keeps its target column.
        return frame.drop(columns=target_variable_name), frame[target_variable_name]

    X_train, Y_train = _features_and_target(dataset_train)
    X_test, Y_test = _features_and_target(dataset_test)

    return X_train, Y_train, X_test, Y_test


class LogRegressionModel(enum.Enum):
    """Class-weighting strategy selected when training the logistic regression.

    All members carry plain integer values. Stray trailing commas previously
    turned the first two values into one-element tuples, which made lookups by
    value such as ``LogRegressionModel(1)`` fail.
    """
    # No class re-weighting (class_weight=None).
    EQUAL_WEIGHTS = 1
    # Weights chosen by scikit-learn (class_weight='balanced').
    AUTO_BALANCED_WEIGHTS = 2
    # Weights supplied by the caller for classes 0 and 1.
    MANUAL_BALANCED_WEIGHTS = 3


def train_logistic_regression(X_train: pd.DataFrame, Y_train,
                              model_type: LogRegressionModel = LogRegressionModel.EQUAL_WEIGHTS,
                              class_weights: list = None) -> linear_model.LogisticRegression:
    """Fit a ``liblinear`` logistic regression with the requested class weighting.

    Args:
        X_train: Training features.
        Y_train: Training targets.
        model_type: Weighting strategy. ``EQUAL_WEIGHTS`` uses
            ``class_weight=None``, ``AUTO_BALANCED_WEIGHTS`` uses
            ``class_weight='balanced'`` and ``MANUAL_BALANCED_WEIGHTS`` uses
            the weights given in ``class_weights``.
        class_weights: Required for ``MANUAL_BALANCED_WEIGHTS`` only; element 0
            is the weight of class ``0`` and element 1 the weight of class ``1``.

    Returns:
        The fitted ``LogisticRegression`` estimator.

    Raises:
        ValueError: If ``model_type`` is not a supported strategy, or if
            ``MANUAL_BALANCED_WEIGHTS`` is requested without two weights.
    """
    if model_type == LogRegressionModel.EQUAL_WEIGHTS:
        class_weight = None
    elif model_type == LogRegressionModel.AUTO_BALANCED_WEIGHTS:
        class_weight = 'balanced'
    elif model_type == LogRegressionModel.MANUAL_BALANCED_WEIGHTS:
        # Fail early with a clear message instead of a TypeError/IndexError on indexing.
        if class_weights is None or len(class_weights) < 2:
            raise ValueError(
                "class_weights must provide weights for classes 0 and 1 "
                "when model_type is MANUAL_BALANCED_WEIGHTS")
        class_weight = {0: class_weights[0], 1: class_weights[1]}
    else:
        # Without this branch an unknown model_type fell through and the
        # fit below failed with UnboundLocalError.
        raise ValueError("Unsupported model_type: {!r}".format(model_type))

    model = linear_model.LogisticRegression(
        solver='liblinear', class_weight=class_weight)
    model.fit(X_train, Y_train)

    return model


def predict_with_logistic_regression(model: linear_model.LogisticRegression, X_test: pd.DataFrame):
    """Return ``model.predict(X_test)``.

    Args:
        model: Estimator whose ``predict`` method is called.
        X_test: Features passed unchanged to ``predict``.

    Returns:
        Whatever ``model.predict`` returns for ``X_test``.
    """
    predictions = model.predict(X_test)
    return predictions


def calculate_model_scores(X_test: pd.DataFrame, Y_test, Y_predicted,
                           model: sklearn.base.ClassifierMixin = None, print_scores: bool = False) -> dict:
    """Compute binary-classification scores from true and predicted labels.

    Args:
        X_test: Test features; only used for ``model.score`` when ``model`` is given.
        Y_test: True labels.
        Y_predicted: Predicted labels.
        model: Optional estimator. When it is not ``None`` its
            ``score(X_test, Y_test)`` is stored under ``"accuracy_model"``.
        print_scores: When truthy, the scores are pretty-printed.

    Returns:
        An ``OrderedDict`` with the keys ``accuracy_model`` (only with a model),
        ``accuracy``, ``sensitivity``, ``specificity``, ``precision``,
        ``error_typeI`` and ``error_typeII``. A ratio whose denominator is zero
        is not guarded here; with numpy integer counts it is expected to come
        out as NaN with a RuntimeWarning.

    Raises:
        ValueError: If the labels do not describe a two-class problem.
    """
    confusion_matrix = metrics.confusion_matrix(Y_test, Y_predicted)

    if confusion_matrix.shape == (1, 1):
        # Only one class occurs in both inputs, so the matrix collapsed to 1x1
        # and the [1, 1] lookup below would raise IndexError. Rebuild it with
        # the 0/1 encoding this file uses for its class weights.
        confusion_matrix = metrics.confusion_matrix(
            Y_test, Y_predicted, labels=[0, 1])
    elif confusion_matrix.shape != (2, 2):
        # More than two classes: the top-left 2x2 corner would silently give
        # meaningless binary metrics.
        raise ValueError(
            "calculate_model_scores expects binary labels, got a confusion "
            "matrix of shape {}".format(confusion_matrix.shape))

    # Rows are true labels, columns are predicted labels, in sorted label order.
    TN = confusion_matrix[0, 0]  # True Negative
    TP = confusion_matrix[1, 1]  # True Positive
    FN = confusion_matrix[1, 0]  # False Negative
    FP = confusion_matrix[0, 1]  # False Positive

    scores = OrderedDict()

    # Identity check: some estimators define __len__, so truthiness is not a
    # reliable "was a model passed" test.
    if model is not None:
        scores["accuracy_model"] = model.score(X_test, Y_test)
    scores["accuracy"] = (TP + TN) / (TP + TN + FP + FN)

    scores["sensitivity"] = TP / (TP + FN)
    scores["specificity"] = TN / (TN + FP)

    scores["precision"] = TP / (TP + FP)

    # Type I error is the false-positive rate; type II the false-negative rate.
    scores["error_typeI"] = FP / (FP + TN)
    scores["error_typeII"] = FN / (FN + TP)

    if print_scores:
        print("[Model Scores]")
        pprn.pprint(scores)

    return scores


In [3]:
# Read the two CSV splits from the working directory.
df_train = pd.read_csv('Admission_train.csv')
df_test = pd.read_csv('Admission_test.csv')

# Both splits get the same in-place cleaning: rows with missing values and the
# columns listed here are removed.
unused_columns = ["Unnamed: 0", "Serial No."]
prepare_dataset(df_train, dropna_rows_naive=True, columns_to_drop=unused_columns)
prepare_dataset(df_test, dropna_rows_naive=True, columns_to_drop=unused_columns)

target_variable_name = "Chance of Admit"


def binarize_target(value):
    """Return 0 for values below 0.5 and 1 otherwise."""
    return 0 if value < 0.5 else 1


# Turn the target column into a 0/1 label in both splits.
transform_dataset_column(df_train, target_variable_name, binarize_target)
transform_dataset_column(df_test, target_variable_name, binarize_target)

X_train, Y_train, X_test, Y_test = prepare_train_and_test_data(
    dataset_train=df_train,
    dataset_test=df_test,
    target_variable_name=target_variable_name)


In [4]:
# Baseline run: logistic regression without class re-weighting.
model = train_logistic_regression(
    X_train, Y_train, model_type=LogRegressionModel.EQUAL_WEIGHTS)
Y_predicted = predict_with_logistic_regression(model=model, X_test=X_test)
model_scores = calculate_model_scores(
    X_test=X_test,
    Y_test=Y_test,
    Y_predicted=Y_predicted,
    model=model,
    print_scores=True)


[Model Scores]
OrderedDict([('accuracy_model', 0.91),
             ('accuracy', 0.91),
             ('sensitivity', 0.967032967032967),
             ('specificity', 0.3333333333333333),
             ('precision', 0.9361702127659575),
             ('error_typeI', 0.6666666666666666),
             ('error_typeII', 0.03296703296703297)])


In [5]:
# Second run: same pipeline, with class weights balanced by scikit-learn.
model = train_logistic_regression(
    X_train, Y_train, model_type=LogRegressionModel.AUTO_BALANCED_WEIGHTS)
Y_predicted = predict_with_logistic_regression(model=model, X_test=X_test)
model_scores = calculate_model_scores(
    X_test=X_test,
    Y_test=Y_test,
    Y_predicted=Y_predicted,
    model=model,
    print_scores=True)


[Model Scores]
OrderedDict([('accuracy_model', 0.78),
             ('accuracy', 0.78),
             ('sensitivity', 0.7802197802197802),
             ('specificity', 0.7777777777777778),
             ('precision', 0.9726027397260274),
             ('error_typeI', 0.2222222222222222),
             ('error_typeII', 0.21978021978021978)])
