In [1]:

# see https://ipython.readthedocs.io/en/stable/interactive/magics.html
%pylab inline

# sets backend to render higher res images
%config InlineBackend.figure_formats = ['retina']

#######################
#       imports       #
#######################
import pandas as pd
import numpy as np
import seaborn as sns

import itertools
import pickle
from mlxtend.plotting import plot_decision_regions
from mlxtend.classifier import StackingClassifier # <-- note: this is not from sklearn!


from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, 
                              AdaBoostClassifier, BaggingRegressor)

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.datasets.california_housing import fetch_california_housing

# change margin size of jupyter notebook
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

sns.set_style("whitegrid")

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Read in pickle data

with open("data/small_images_X_values.pkl", "rb") as pfile:
    X_data = pickle.load(pfile)
with open("data/small_images_y_values.pkl", "rb") as pfile:
    y_data = pickle.load(pfile)


In [3]:
X_data[:5]

array([[162, 108,  61, ..., 157,  90,  19],
       [ 26,  44,  94, ..., 143, 152, 125],
       [ 58,  62,  73, ..., 150, 134, 121],
       [103, 105, 100, ..., 140,  98,  58],
       [ 20,   7,   1, ...,   2,   4,   1]], dtype=uint8)

In [4]:
y_data[:5]

array([0, 0, 0, 0, 0])

In [15]:
# Set up models for ensemble. Logistic Regression didn't converge so I used an ada model instead

rf_model = RandomForestClassifier(n_estimators=100)
lr_model = LogisticRegression()
nb_model = GaussianNB()
ada_model = AdaBoostClassifier()

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.20, random_state=123)
X_train, X_cross, y_train, y_cross = train_test_split(X_train, y_train, test_size=0.25, random_state=123)

In [16]:
# Create classifier

model_names = ["ada_model", "nb_model", "rf_model"]
model_vars = [eval(n) for n in model_names]
print(model_vars)

[AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None), GaussianNB(priors=None, var_smoothing=1e-09), RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)]


In [17]:
model_list = list(zip(model_names, model_vars))

In [18]:
voting_classifer = VotingClassifier(estimators=model_list,
                                    voting='hard',
                                    n_jobs=-1)
voting_classifer.fit(X_train, y_train)

VotingClassifier(estimators=[('ada_model', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)), ('nb_model', GaussianNB(priors=None, var_smoothing=1e-09)), ('rf_model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))],
         flatten_transform=None, n_jobs=-1, voting='hard', weights=None)

In [19]:
# Hard classfier wasn't much better than just random forest and took much longer so went back to random forest
y_pred = voting_classifer.predict(X_cross)
accuracy_score(y_cross, y_pred)

0.6357608695652174

In [20]:
# Soft was even worse!

voting_classifer = VotingClassifier(estimators=model_list,
                                    voting='soft',
                                    n_jobs=-1)
voting_classifer.fit(X_train, y_train)

VotingClassifier(estimators=[('ada_model', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)), ('nb_model', GaussianNB(priors=None, var_smoothing=1e-09)), ('rf_model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))],
         flatten_transform=None, n_jobs=-1, voting='soft', weights=None)

In [21]:
y_pred = voting_classifer.predict(X_cross)
accuracy_score(y_cross, y_pred)

0.6023913043478261