A quick survey of some simple classifiers (LDA, QDA, logistic regression) and logistic regression with ridge shrinkage. Models are compared on original data, bootstrapped data, and oversampled data, and undersampled data.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import os
import pathlib

from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA)
from sklearn.metrics import f1_score, make_scorer
import sklearn.linear_model as skl
import sklearn.model_selection as skm

from ISLP import confusion_table

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
data = pd.read_csv(os.path.join(pathlib.Path.home(), "stat5610", "stat-5610-project", "data", "train.csv"))
x_data = np.array(data[data.columns.drop("Y")].values)
y_data = data["Y"].values

Quantify imbalance

In [3]:
pos_count = sum(y_data == 1)
neg_count = sum(y_data == 0)
print(f"Positive: {100*pos_count/len(y_data)}%")
print(f"Negative: {100*neg_count/len(y_data)}%")

Positive: 2.502%
Negative: 97.498%


Train/test split

In [4]:
idx = list(range(len(y_data)))
train_idx, test_idx = skm.train_test_split(idx)
x_train, y_train = x_data[train_idx, :], y_data[train_idx]
x_test, y_test = x_data[test_idx, :], y_data[test_idx]

Bootstrapped Class 1 observations for test set

In [None]:
class_1_idx = np.where(y_train == 1)[0]
class_0_idx = np.where(y_train == 0)[0]

rng = np.random.default_rng(25)
n = len(class_0_idx) # Bootstrap to get 50/50 split
class_1_idx_bs = rng.choice(class_1_idx, n, replace=True)

x_train_bs = np.concatenate([x_train[class_1_idx_bs], x_train[class_0_idx]])
y_train_bs = np.concatenate([y_train[class_1_idx_bs], y_train[class_0_idx]])

Oversampled data

In [4]:
smote = SMOTE(random_state=42)
x_train_sm, y_train_sm = smote.fit_resample(x_train, y_train)

Undersampled data

In [5]:
rus = RandomUnderSampler(random_state=0)
x_train_rus, y_train_rus = rus.fit_resample(x_train, y_train)

LDA classifier with bootstrapped data

In [16]:
lda = LDA(store_covariance=True)
results_lda = lda.fit(x_train_bs, y_train_bs)
lda_pred = lda.predict(x_test)
f1 = f1_score(y_test, lda_pred)
print(f"F1 Score: {f1}")
confusion_table(lda_pred, y_test)

F1 Score: 0.1761723700887199


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19244,72
1,5128,556


LDA classifier with oversampled data

In [34]:
lda = LDA(store_covariance=True)
results_lda = lda.fit(x_train_sm, y_train_sm)
lda_pred = lda.predict(x_test)
f1 = f1_score(y_test, lda_pred)
print(f"F1 Score: {f1}")
confusion_table(lda_pred, y_test)

F1 Score: 0.1736757624398074


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19311,86
1,5062,541


LDA classifier with undersampled data

In [35]:
lda = LDA(store_covariance=True)
results_lda = lda.fit(x_train_rus, y_train_rus)
lda_pred = lda.predict(x_test)
f1 = f1_score(y_test, lda_pred)
print(f"F1 Score: {f1}")
confusion_table(lda_pred, y_test)

F1 Score: 0.17205671499123784


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19263,87
1,5110,540


Logistic classifier with original data

In [53]:
model = sm.GLM(y_train, x_train, family=sm.families.Binomial())
results = model.fit()

preds = results.predict(x_test)

n = len(preds)
labels = np.zeros(n)
labels[preds > 0.5] = 1
f1 = f1_score(y_test, labels)
print(f"F1 Score: {f1}")

conf_table_full = confusion_table(labels, y_test)
conf_table_full

F1 Score: 0.04945904173106646


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,24369,611
1,4,16


Logistic classifier with bootstrapped Class 1 data

In [24]:
model = sm.GLM(y_train_bs, x_train_bs, family=sm.families.Binomial())
results = model.fit()

preds = results.predict(x_test)

n = len(preds)
labels = np.zeros(n)
labels[preds > 0.8] = 1
f1 = f1_score(y_test, labels)
print(f"F1 Score: {f1}")

conf_table_full = confusion_table(labels, y_test)
conf_table_full

F1 Score: 0.20397208803005903


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,23327,438
1,1045,190


Logistic classifier with oversampled data

In [38]:
model = sm.GLM(y_train_sm, x_train_sm, family=sm.families.Binomial())
results = model.fit()

preds = results.predict(x_test)

n = len(preds)
labels = np.zeros(n)
labels[preds > 0.5] = 1
f1 = f1_score(y_test, labels)
print(f"F1 Score: {f1}")

conf_table_full = confusion_table(labels, y_test)
conf_table_full

F1 Score: 0.15636057287278854


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19529,163
1,4844,464


Logistic classifier with undersampled data

In [39]:
model = sm.GLM(y_train_rus, x_train_rus, family=sm.families.Binomial())
results = model.fit()

preds = results.predict(x_test)

n = len(preds)
labels = np.zeros(n)
labels[preds > 0.5] = 1
f1 = f1_score(y_test, labels)
print(f"F1 Score: {f1}")

conf_table_full = confusion_table(labels, y_test)
conf_table_full

F1 Score: 0.15523097826086957


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19569,170
1,4804,457


QDA classifier with bootstrapped data

In [25]:
qda = QDA(store_covariance=True)
qda.fit(x_train_bs, y_train_bs)
qda_pred = qda.predict(x_test)
f1 = f1_score(y_test, qda_pred)
print(f"F1 Score: {f1}")
confusion_table(qda_pred, y_test)

F1 Score: 0.24121679520137104


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,20895,65
1,3477,563


QDA classifier with oversampled data

In [40]:
qda = QDA(store_covariance=True)
qda.fit(x_train_sm, y_train_sm)
qda_pred = qda.predict(x_test)
f1 = f1_score(y_test, qda_pred)
print(f"F1 Score: {f1}")
confusion_table(qda_pred, y_test)

F1 Score: 0.262873957822462


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,21458,91
1,2915,536


QDA classifier with undersampled data

In [41]:
qda = QDA(store_covariance=True)
qda.fit(x_train_rus, y_train_rus)
qda_pred = qda.predict(x_test)
f1 = f1_score(y_test, qda_pred)
print(f"F1 Score: {f1}")
confusion_table(qda_pred, y_test)

F1 Score: 0.23711340206185566


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,20896,75
1,3477,552


Regularized (ridge) logistic classifier with original data and CV to find l2 penalty weight

In [7]:
model = skl.LogisticRegression(penalty="l2")
parameters = {"C":list(np.arange(0.01,1.0, 0.05))}
scorer = make_scorer(f1_score)
clf = skm.GridSearchCV(model, parameters, scoring=scorer, return_train_score=True)
result = clf.fit(x_train, y_train)
preds = result.predict(x_test)
f1 = f1_score(y_test, preds)
print(f"F1 Score: {f1}")

F1 Score: 0.21438263229308005


Regularized (ridge) logistic classifier with bootstrapped data and CV to find l2 penalty weight

In [26]:
model = skl.LogisticRegression(penalty="l2")
parameters = {"C":list(np.arange(0.01,10, 0.1))}
scorer = make_scorer(f1_score)
clf = skm.GridSearchCV(model, parameters, scoring=scorer, return_train_score=True)
result = clf.fit(x_train_bs, y_train_bs)
preds = result.predict(x_test)
f1 = f1_score(y_test, preds)
print(f"F1 Score: {f1}")

F1 Score: 0.1864672116889894


Regularized (ridge) logistic classifier with oversampled data  and CV to find l2 penalty weight

In [8]:
model = skl.LogisticRegression(penalty="l2")
parameters = {"C":list(np.arange(0.01,1.0, 0.05))}
scorer = make_scorer(f1_score)
clf = skm.GridSearchCV(model, parameters, scoring=scorer, return_train_score=True)
result = clf.fit(x_train_sm, y_train_sm)
preds = result.predict(x_test)
f1 = f1_score(y_test, preds)
print(f"F1 Score: {f1}")

F1 Score: 0.1901732060798869


Regularized (ridge) logistic classifier with undersampled data and CV to find l2 penalty weight

In [9]:
model = skl.LogisticRegression(penalty="l2")
parameters = {"C":list(np.arange(0.01,1.0, 0.05))}
scorer = make_scorer(f1_score)
clf = skm.GridSearchCV(model, parameters, scoring=scorer, return_train_score=True)
result = clf.fit(x_train_rus, y_train_rus)
preds = result.predict(x_test)
f1 = f1_score(y_test, preds)
print(f"F1 Score: {f1}")

F1 Score: 0.1833810888252149
