In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier

from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier
from imblearn.combine import SMOTEENN

from sklearn.metrics import accuracy_score, balanced_accuracy_score

from predict import load_train, load_test
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

In [2]:
X, y = load_train()
test_x, test_y = load_test()

classes = ["Not Occupied", "Occupied"]

In [3]:
smote_enn = SMOTEENN(random_state=0)

X_resamp, y_resamp = smote_enn.fit_resample(X, y)

print(np.unique(y, return_counts=True))
print(np.unique(y_resamp, return_counts=True))

(array([0., 1.]), array([14117,  3778]))
(array([0., 1.]), array([13946, 13963]))


In [6]:
# Testing Logistic Regression
log_clf = LogisticRegression(solver='lbfgs', random_state=0)
log_clf.fit(X, y)

log_pred = log_clf.predict(test_x)
log_bl_score = balanced_accuracy_score(test_y, log_pred, adjusted=True)
log_acc_score = accuracy_score(test_y, log_pred)

log_clf_se = LogisticRegression(solver='lbfgs', random_state=0)
log_clf_se.fit(X_resamp, y_resamp)

log_se_pred = log_clf_se.predict(test_x)
log_se_bl_score = balanced_accuracy_score(test_y, log_se_pred, adjusted=True)
log_se_acc_score = accuracy_score(test_y, log_se_pred)

# Testing regular Bagging 
bag = BaggingClassifier(n_estimators=500, n_jobs=-1, random_state=0)
bag.fit(X, y)

bag_pred = bag.predict(test_x)
bag_bl_score = balanced_accuracy_score(test_y, bag_pred, adjusted=True)
bag_acc_score = accuracy_score(test_y, bag_pred)

# Testing Balacned Bagging
bb_clf = BalancedBaggingClassifier(n_estimators=500, n_jobs=-1, random_state=0)
bb_clf.fit(X, y)

bb_pred = bb_clf.predict(test_x)
bb_bl_score = balanced_accuracy_score(test_y, bb_pred)
bb_acc_score = accuracy_score(test_y, bb_pred)

# Bagging with SMOTENN
bag_clf_se = BalancedBaggingClassifier(n_estimators=500, n_jobs=-1, random_state=0)
bag_clf_se.fit(X_resamp, y_resamp)

bag_se_pred = bag_clf_se.predict(test_x)
bag_se_bl_score = balanced_accuracy_score(test_y, bag_se_pred)
bag_se_acc_score = accuracy_score(test_y, bag_se_pred)

# Testing regular Random Forest
rf_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
rf_clf.fit(X, y)

rf_pred = rf_clf.predict(test_x)
rf_bl_score = balanced_accuracy_score(test_y, rf_pred, adjusted=True)
rf_acc_score = accuracy_score(test_y, rf_pred)

# Testing Balanced Random Forest
brf_clf = BalancedRandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
brf_clf.fit(X, y)

brf_pred = brf_clf.predict(test_x)
brf_bl_score = balanced_accuracy_score(test_y, brf_pred, adjusted=True)
brf_acc_score = accuracy_score(test_y, brf_pred)

# RandomForest with SMOTEEN

rf_clf_se = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
rf_clf_se.fit(X_resamp, y_resamp)

rf_se_pred = rf_clf_se.predict(test_x)
rf_se_bl_score = balanced_accuracy_score(test_y, rf_se_pred)
rf_se_acc_score = accuracy_score(test_y, rf_se_pred)

# Testing Gradient Boost Tree
gbt = GradientBoostingClassifier()
gbt.fit(X, y)

gbt_pred = gbt.predict(test_x)
gbt_bl_score = balanced_accuracy_score(test_y, gbt_pred)
gbt_acc_score = accuracy_score(test_y, gbt_pred)

# GBT with SMOTEEN

gbt_se = GradientBoostingClassifier()
gbt_se.fit(X_resamp, y_resamp)

gbt_se_pred = gbt_se.predict(test_x)
gbt_se_bl_score = balanced_accuracy_score(test_y, gbt_se_pred)
gbt_se_acc_score = accuracy_score(test_y, gbt_se_pred)

scores = pd.DataFrame([[log_acc_score, log_bl_score],
                       [log_se_acc_score, log_se_bl_score],
                       [bag_acc_score, bag_bl_score],
                       [bb_acc_score, bb_bl_score],
                       [bag_se_acc_score, bag_se_bl_score],
                       [rf_acc_score, rf_bl_score],
                       [brf_acc_score, brf_bl_score],
                       [rf_se_acc_score, rf_se_bl_score],
                       [gbt_acc_score, gbt_bl_score],
                       [gbt_se_acc_score, gbt_se_bl_score]
                       ], 
                       columns=['Test Accuracy', 'Balanced Accu. Score'],
                       index=['Logistic', 'Logistic SMOTEENN','Bagging', 'BalancedBag', 'Bagging SMOTEENN', 
                              'RandomForest', 'BalancedRF', 'RF SMOTEENN', 
                              'GBT', 'GBT SMOTEENN'])
scores

Unnamed: 0,Test Accuracy,Balanced Accu. Score
Logistic,0.978612,0.965018
Logistic SMOTEENN,0.978236,0.964427
Bagging,0.966229,0.931067
BalancedBag,0.978236,0.981994
Bagging SMOTEENN,0.977861,0.981918
RandomForest,0.965854,0.930038
BalancedRF,0.978612,0.965456
RF SMOTEENN,0.977861,0.981918
GBT,0.970732,0.971706
GBT SMOTEENN,0.978236,0.982213
