In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from collections import Counter

# local imports
from prepare import *
from evaluate import *

### Read in initial datasets if needed

In [2]:
#raw_train, raw_train_labels, raw_test, specs, sample = read_raw_csvs()
raw_train_labels = pd.read_csv('data/train_labels.csv')

### Load compiled train/test datasets

In [3]:
train, test = load_and_prep(raw_train_labels)

### Start throwing model mud at the wall

In [4]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
                                ExtraTreesClassifier, BaggingClassifier, \
                                GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')  #Ridge classifier throws some warnings about ill-conditioned matrix

### Baseline accuracy of 50%

In [5]:
train.accuracy_group.value_counts(normalize=True)

3    0.500000
0    0.239062
1    0.136292
2    0.124647
Name: accuracy_group, dtype: float64

#### KNN and SVC will require scaling, others shouldn't

In [6]:
rf = RandomForestClassifier()
#lr = LogisticRegression()
#sgd = SGDClassifier()
rc = RidgeClassifier()
#nb = GaussianNB()
ac = AdaBoostClassifier()
et = ExtraTreesClassifier()
bc = BaggingClassifier()
gbc = GradientBoostingClassifier()
#vc = VotingClassifier()  
#sc = StackingClassifier()

In [7]:
knn = KNeighborsClassifier()
svc = SVC()

In [9]:
quick_eval(train, rf, cv=True)
#quick_eval(train, lr, cv=True)
#quick_eval(train, sgd, cv=True)
quick_eval(train, rc, cv=True)
#quick_eval(train, nb, cv=True)
quick_eval(train, ac, cv=True)
quick_eval(train, et, cv=True)
quick_eval(train, bc, cv=True)
quick_eval(train, gbc, cv=True)

#quick_eval(train, knn, scale=True, cv=True)
quick_eval(train, svc, scale=True, cv=True)

The CV score of RandomForestClassifier is 0.5334652345958169
The CV score of RidgeClassifier is 0.5076879592990391
The CV score of AdaBoostClassifier is 0.5541548897682307
The CV score of ExtraTreesClassifier is 0.5105709440361786
The CV score of BaggingClassifier is 0.4841153193894856
The CV score of GradientBoostingClassifier is 0.5570378745053702
The CV score of SVC is 0.5087054833239119


('SVC', 0.5087054833239119)

In [12]:
vc = VotingClassifier(estimators=[
    ('Adaboost',ac), ('rf', rf), ('gbc', gbc)],
                     voting='soft')
quick_eval(train, vc)#, cv=True)

The accuracy of VotingClassifier is 0.562464669304692


('VotingClassifier', 0.562464669304692)

In [15]:
vc = VotingClassifier(estimators=[
    ('Adaboost',ac),
    ('rf', rf), 
    ('gbc', gbc),
    ('et', et),
    ('svc', svc),
    ('rc', rc)],
                     voting='hard')
quick_eval(train, vc)#, cv=True)

The accuracy of VotingClassifier is 0.5644431882419446


('VotingClassifier', 0.5644431882419446)

In [16]:
estimators = [
    ('Adaboost',ac),
    ('rf', rf), 
    ('gbc', gbc),
    ('et', et),
    ('svc', svc),
    ('rc', rc),
]
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())
quick_eval(train, stacking_clf)

The accuracy of StackingClassifier is 0.5816845675522895


('StackingClassifier', 0.5816845675522895)