# Ensembles

Warning! Some of the code cells in the Notebook will take a very long time to run. To speed things up a little, I've used only a small number of hyperparameter values in the grid searches and I've used cv=5, instead of cv=10. But it is still slow. You might consider running it on Google Colab for more speed.

In [None]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.dummy import DummyClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# Seeded generator that is handed to the train/test split and to the
# randomised estimators further down.
rng = np.random.RandomState(seed=2)

## Read in wine dataset, take a cheeky look, and split it

In [None]:
import os

# Outside Colab the data is looked up relative to the working directory;
# on Colab, Google Drive is mounted first and the data is read from there.
if 'google.colab' not in str(get_ipython()):
    base_dir = "."
else:
    from google.colab import drive
    drive.mount('/content/drive')
    # You may need to change this, depending on where your notebooks are on Google Drive
    base_dir = "./drive/My Drive/Colab Notebooks/"
dataset_dir = os.path.join(base_dir, "datasets")

In [None]:
# Load wines.csv from the datasets directory into a DataFrame.
df = pd.read_csv(os.path.join(dataset_dir, "wines.csv"))

In [None]:
# Preview the first few rows.
df.head()

colour = 0 are red wines, colour = 1 are white wines. It is essential that train_test_split shuffles because red wines come first and then white wines.

In [None]:
# (number of rows, number of columns) of the full dataset.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for every column, whatever its dtype.
df.describe(include="all")

In [None]:
# Number of rows for each quality label.
df["quality"].value_counts()

We could use regression: we're predicting quality and it is numeric. Or we could use multiclass classification: there is a finite set of quality labels. Maybe we should use ordinal classification (if we knew how!), since the labels are ordered. 

We will use multiclass classification. 

Stratification is essential, since the dataset is very unbalanced. 

In [None]:
# Hold out 20% of the rows as the test set, stratifying on the quality label
# so both parts keep the same label proportions.
train, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["quality"],
    random_state=rng,
)

In [None]:
# Predictor columns; the target column is "quality".
features = [
    "colour",
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

# Inputs first, then targets, for each side of the split.
X_train, X_test = train[features], test[features]
y_train, y_test = train["quality"], test["quality"]

## Majority-class classifier - we want to beat this!

In [None]:
# Baseline classifier that makes its predictions without looking at the features.
dummy = DummyClassifier()
dummy.fit(X_train, y_train)

# Mean accuracy over 5 folds: the figure every later model has to beat.
cross_val_score(dummy, X_train, y_train, scoring="accuracy", cv=5).mean()

## Decision Tree, kNN, Multinomial Logistic Regression

A handy function that we will make much use of - it does model selection using grid search.

In [None]:
def grid_search(preprocessor, predictor, param_grid, cv, metric, X_train, y_train):
    """Tune a two-step pipeline with an exhaustive, cross-validated grid search.

    The pipeline steps are named "preprocessor" and "predictor", so keys in
    ``param_grid`` must carry those prefixes (e.g. "predictor__max_depth").

    Parameters
    ----------
    preprocessor
        First pipeline step; callers in this notebook pass a scaler or None.
    predictor
        Final pipeline step, the estimator being tuned.
    param_grid
        Hyperparameter grid, forwarded unchanged to GridSearchCV.
    cv
        Cross-validation setting, forwarded unchanged to GridSearchCV.
    metric
        Forwarded to GridSearchCV as its ``scoring`` argument.
    X_train, y_train
        Training inputs and targets the search is fitted on.

    Returns
    -------
    The GridSearchCV object after ``fit`` has been called on it.
    """
    steps = [("preprocessor", preprocessor), ("predictor", predictor)]

    # n_jobs=-1 asks for the candidate fits to be spread over all available cores.
    search = GridSearchCV(
        estimator=Pipeline(steps=steps),
        param_grid=param_grid,
        scoring=metric,
        cv=cv,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search

In [None]:
# Single decision tree with no preprocessing; only the depth limit (1-10) is tuned.
decision_tree_gs = grid_search(
    preprocessor=None,
    predictor=DecisionTreeClassifier(random_state=rng),
    param_grid={"predictor__max_depth": range(1, 11)},
    cv=5,
    metric="accuracy",
    X_train=X_train,
    y_train=y_train,
)

# Best setting found and its mean cross-validated accuracy.
(decision_tree_gs.best_params_, decision_tree_gs.best_score_)

For kNN and Logistic Regression, we'll use standardization to scale the features. We could use grid search to choose between various scaling methods - but we won't bother this time.

In [None]:
# kNN on standardised features; the number of neighbours (1-10) is tuned.
knn_gs = grid_search(
    preprocessor=StandardScaler(),
    predictor=KNeighborsClassifier(),
    param_grid={"predictor__n_neighbors": range(1, 11)},
    cv=5,
    metric="accuracy",
    X_train=X_train,
    y_train=y_train,
)

# Best setting found and its mean cross-validated accuracy.
(knn_gs.best_params_, knn_gs.best_score_)

In [None]:
# Unpenalised logistic regression on standardised features. The grid is empty,
# so the search only cross-validates this one configuration.
logistic_gs = grid_search(
    preprocessor=StandardScaler(),
    predictor=LogisticRegression(penalty=None, max_iter=600, random_state=rng),
    param_grid={},
    cv=5,
    metric="accuracy",
    X_train=X_train,
    y_train=y_train,
)

# Best (empty) parameter set and its mean cross-validated accuracy.
(logistic_gs.best_params_, logistic_gs.best_score_)

## Voting Classifier

In [None]:
# Voting ensemble of a tree, kNN and logistic regression on standardised
# features. Only the tree depth and the number of neighbours are tuned.
voting_gs = grid_search(
    preprocessor=StandardScaler(),
    predictor=VotingClassifier(
        estimators=[
            ("tree", DecisionTreeClassifier(random_state=rng)),
            ("knn", KNeighborsClassifier()),
            ("logistic", LogisticRegression(penalty=None, max_iter=600, random_state=rng)),
        ],
        n_jobs=-1,
    ),
    param_grid={
        "predictor__tree__max_depth": range(1, 4),
        "predictor__knn__n_neighbors": range(10, 13),
    },
    cv=5,
    metric="accuracy",
    X_train=X_train,
    y_train=y_train,
)

# Best setting found and its mean cross-validated accuracy.
(voting_gs.best_params_, voting_gs.best_score_)

This simple ensemble is worse than kNN on its own. Why?

## Bagging (a) with kNN, (b) with Decision Trees = Random Forest

(I also tried bagging with Logistic Regression but its accuracy wasn't great.)

In [None]:
# Bagged kNN on standardised features; tunes the ensemble size and the number
# of neighbours used by each member.
bagging_knn_gs = grid_search(
    preprocessor=StandardScaler(),
    predictor=BaggingClassifier(estimator=KNeighborsClassifier(), n_jobs=-1),
    param_grid={
        "predictor__n_estimators": [100, 150, 200],
        "predictor__estimator__n_neighbors": range(1, 4),
    },
    cv=5,
    metric="accuracy",
    X_train=X_train,
    y_train=y_train,
)

# Best setting found and its mean cross-validated accuracy.
(bagging_knn_gs.best_params_, bagging_knn_gs.best_score_)

In [None]:
# Random forest with no preprocessing; tunes the number of trees and their depth limit.
random_forest_gs = grid_search(
    preprocessor=None,
    predictor=RandomForestClassifier(n_jobs=-1, random_state=rng),
    param_grid={
        "predictor__n_estimators": [250, 300, 350],
        "predictor__max_depth": range(10, 13),
    },
    cv=5,
    metric="accuracy",
    X_train=X_train,
    y_train=y_train,
)

# Best setting found and its mean cross-validated accuracy.
(random_forest_gs.best_params_, random_forest_gs.best_score_)

## AdaBoost with Decision Trees

The models within an AdaBoost ensemble must allow weighted examples. This excludes kNN but does include Decision Trees (the default) and Logistic Regression. (Again I tried it with Logistic Regression but it didn't perform well.)

In [None]:
# AdaBoost over decision trees with no preprocessing; tunes the number of
# boosting rounds and the depth limit of each tree.
ada_boost_gs = grid_search(
    preprocessor=None,
    predictor=AdaBoostClassifier(
        estimator=DecisionTreeClassifier(random_state=rng),
        random_state=rng,
    ),
    param_grid={
        "predictor__n_estimators": [200, 250, 300],
        "predictor__estimator__max_depth": range(10, 13),
    },
    cv=5,
    metric="accuracy",
    X_train=X_train,
    y_train=y_train,
)

# Best setting found and its mean cross-validated accuracy.
(ada_boost_gs.best_params_, ada_boost_gs.best_score_)

## Gradient Boosting

In scikit-learn's implementation, this ensemble is made up of trees. (There seems to be some debate about whether scaling is needed or not. Since it does no harm, I've included it.) This one will take an especially long time to run.

In [None]:
# Gradient boosting on standardised features; tunes the number of boosting
# stages and the depth limit of each tree.
gradient_boost_gs = grid_search(
    preprocessor=StandardScaler(),
    predictor=GradientBoostingClassifier(random_state=rng),
    param_grid={
        "predictor__n_estimators": [100, 150, 200],
        "predictor__max_depth": range(10, 13),
    },
    cv=5,
    metric="accuracy",
    X_train=X_train,
    y_train=y_train,
)

# Best setting found and its mean cross-validated accuracy.
(gradient_boost_gs.best_params_, gradient_boost_gs.best_score_)

## How do they all perform on the test set?

In [None]:
# Accuracy of each tuned model on the held-out test set, in the order the
# models were introduced above. accuracy_score expects the true labels first
# and the predictions second; accuracy happens to be symmetric, but keeping
# the documented order stays correct if the metric is ever swapped for an
# asymmetric one.
[
    accuracy_score(y_test, model.predict(X_test))
    for model in [
        decision_tree_gs.best_estimator_,
        knn_gs.best_estimator_,
        logistic_gs.best_estimator_,
        voting_gs.best_estimator_,
        bagging_knn_gs.best_estimator_,
        random_forest_gs.best_estimator_,
        ada_boost_gs.best_estimator_,
        gradient_boost_gs.best_estimator_,
    ]
]