# COMP47590: Advanced Machine Learning
# Assignment 1: Building Stacked Ensembles

Name(s): 

Student Number(s):

## Import Packages Etc

In [1]:
from IPython.display import display, HTML, Image

from TAS_Python_Utilities import data_viz
from TAS_Python_Utilities import data_viz_target
from TAS_Python_Utilities import visualize_tree

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
from random import randint
import math

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from scipy.spatial import distance
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn import metrics
from sklearn import tree
from sklearn import svm
from sklearn import ensemble
from sklearn import linear_model
from sklearn import neighbors
from sklearn.utils import resample
from sklearn.metrics import cohen_kappa_score
from collections import Counter

from sklearn.ensemble import RandomForestClassifier

%matplotlib inline
#%qtconsole

Loading Iris Data

In [2]:
from sklearn.datasets import load_iris
iris = load_iris()

Loading MNIST Data

In [3]:
df = pd.read_csv("./MNIST_Data/mnist_train_small.csv")
df = df.groupby('value').apply(lambda x: x.sample(20)) #Sample only 20 observations from each class
target = df.value
data = df.drop(columns = ["value"])
data = data.values
target = target.values
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.3)

## Define StackedEnsembleClassifier

Utility function to create classifer objects based on a name

In [4]:
def create_classifier(classifier_type, tree_min_samples_split = 20):

    if classifier_type == "svm":
        c = svm.SVC(probability=True, gamma='auto')

    elif classifier_type == "logreg":
        c = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=1000)

    elif classifier_type == "knn":
        c = neighbors.KNeighborsClassifier()

    elif classifier_type == "tree":
        c = tree.DecisionTreeClassifier(min_samples_split = tree_min_samples_split)

    elif classifier_type == "randomforest":
        c = ensemble.RandomForestClassifier()
        
    else:
        c = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=1000)
    
    return c

StackedEnsembleClassifier class.

In [5]:
# Create a new classifier which is based on the sckit-learn BaseEstimator and ClassifierMixin classes
class StackedEnsembleClassifier(BaseEstimator, ClassifierMixin):
    
    """An ensemble classifier that uses heterogeneous models at the base layer and a aggregatnio model at the aggregation layer. A k-fold cross validation is used to gnerate training data for the stack layer model.

    Parameters
    ----------
    base_estimators: list 
        A list of the classifiers in the ase layer of the ensemble. Supported types are
        - "svm" Support Vector Machine implemented by sklearn.svm.SVC
        - "logreg" Logistic Regression implemented by sklearn.linear_models.LogisticRegression
        - "knn" k Nearest Neighbour implemented by sklearn.neighbors.KNeighborsClassifier
        - "tree" Decision Tree implemented by sklearn.tree.DecisionTreeClassifier
        - "randomforest" RandomForest implemented by sklearn.tree.RandomForestClassifier    
    classifier_duplicates: int, optional (default = 1)
        How many instances of each classifier type listed in base_estimators is included in the ensemble
    stack_layer_classifier: string, optional (default = "logreg')
        The classifier type used at the stack layer. The same classifier types as are supported at the base layer are supported        
    training_folds: int, optional (default = 4)
        How many folds will be used to generate the training set for the stacked layer
        
    Attributes
    ----------
    classes_ : array of shape = [n_classes] 
        The classes labels (single output problem).


    Notes
    -----
    The default values for most base learners are used.

    See also
    --------
    
    ----------
    .. [1]  van der Laan, M., Polley, E. & Hubbard, A. (2007). 
            Super Learner. Statistical Applications in Genetics 
            and Molecular Biology, 6(1) 
            doi:10.2202/1544-6115.1309
    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> clf = StackedEnsembleClassifier()
    >>> iris = load_iris()
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)

    """
    # Constructor for the classifier object
    def __init__(self, base_estimator_types = ["svm", "logreg", "tree"], base_estimator_duplicates = 8, stack_layer_classifier_type = "logreg"):
        """Setup a SuperLearner classifier .
        Parameters
        ----------
        base_estimator_types: The types of classifiers to include at the base layer
        base_estimator_duplicates: The number of duplicates of each type of classiifer to include
        stack_layer_classifier_type: The type of classifier to include at the stack layer 
        
        Returns
        -------
        Nothing
        """     

        # Initialise class variabels
        self.base_estimator_types = base_estimator_types
        self.base_estimator_type_list = list()
        self.base_estimator_duplicates = base_estimator_duplicates
        self.stack_layer_classifier_type = stack_layer_classifier_type

    # The fit function to train a classifier
    def fit(self, X, y):
        """Build a SuperLearner classifier from the training set (X, y).
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. 
        y : array-like, shape = [n_samples] 
            The target values (class labels) as integers or strings.
        Returns
        -------
        self : object
        """    
        
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        
        ########################
        # LEVEL 0
        ########################
        
        # Set up the base classifeirs in the ensemble
        self.classifiers_ = list()
        
        for i in range(0, self.base_estimator_duplicates):
            for t in self.base_estimator_types:

                self.base_estimator_type_list.append(t)      
                c = create_classifier(t, tree_min_samples_split=math.ceil(len(X)*0.05))
                self.classifiers_.append(c)
        
        # Store the number of classifers in the ensemble
        self.n_estimators_ = len(self.classifiers_)

        # Use all training data to train base classifiers
        X_train = X
        y_train = y
        
        # Set up empty arrays to hold stack layer training data
        self.X_stack_train = None #(dtype = float)
        self.y_stack_train = y_train
          
        # Train each base calssifier and generate the stack layer training dataset
        for classifier in self.classifiers_:

            # Extract a bootstrap sample
            X_train_samp, y_train_samp = resample(X_train, y_train, replace=True)    
            
            # Train a base classifier
            classifier.fit(X_train_samp, y_train_samp)
            
            # Make predictions for all instances in the training set
            y_pred = classifier.predict_proba(X_train)

            # Append the predictions ot the stack layer traing set (a bit of hacking here!)
            try:
                self.X_stack_train = np.c_[self.X_stack_train, y_pred]
            except ValueError:
                self.X_stack_train = y_pred
                  
        ########################
        # LEVEL 1
        ########################
        
        # Create the stack layer classifier
        self.stack_layer_classifier_ = create_classifier(self.stack_layer_classifier_type, tree_min_samples_split=math.ceil(len(X)*0.05))

        # Train the stack layer using the newly created dataset
        self.stack_layer_classifier_.fit(self.X_stack_train, self.y_stack_train)
            
        # Return the classifier
        return self

    # The predict function to make a set of predictions for a set of query instances
    def predict(self, X):
        """Predict class labels of the input samples X.
        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, ].
            The predicted class labels of the input samples. 
        """
        
        # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
        check_is_fitted(self, ['stack_layer_classifier_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)
   
        X_stack_queries = None
              
        # Make a prediction with each base classifier and assemble the stack layer query
        for classifier in self.classifiers_:
            
            y_pred = classifier.predict_proba(X)
            
            try:
                X_stack_queries = np.c_[X_stack_queries, y_pred]
            except ValueError:
                X_stack_queries = y_pred
        
        # Return the prediction made by the stack layer classifier
        return self.stack_layer_classifier_.predict(X_stack_queries)
    
    # The predict function to make a set of predictions for a set of query instances
    def predict_proba(self, X):
        """Predict class probabilities of the input samples X.
        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, n_labels].
            The predicted class label probabilities of the input samples. 
        """
        # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
        check_is_fitted(self, ['stack_layer_classifier_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)
        
        X_stack_queries = None
        
        # Make a prediction with each base classifier
        for classifier in self.classifiers_:
            
            y_pred = classifier.predict_proba(X)
                
            try:
                X_stack_queries = np.c_[X_stack_queries, y_pred]
            except ValueError:
                X_stack_queries = y_pred

        # Return the prediction made by the stack layer classifier        
        return self.stack_layer_classifier_.predict_proba(X_stack_queries)

## Test the StackedEnsembleClassifier

Perform a simple test using the StackedEnsembleClassifier on the Iris dataset

In [6]:
clf = StackedEnsembleClassifier()
clf.fit(iris.data, iris.target)
y_pred = clf.predict(iris.data)
print(metrics.classification_report(iris.target, y_pred))
print("Confusion Matrix")
display(pd.crosstab(np.array(iris.target), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       1.00      0.98      0.99        50
          2       0.98      1.00      0.99        50

avg / total       0.99      0.99      0.99       150

Confusion Matrix


Predicted,0,1,2,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,50,0,0,50
1,0,49,1,50
2,0,0,50,50
All,50,49,51,150


Perform a simple test using the StackedEnsembleClassifier on the MNIST dataset

In [7]:
clf = StackedEnsembleClassifier()
clf.fit(data, target)
y_pred = clf.predict(data)
print(metrics.classification_report(target, y_pred))
print("Confusion Matrix")
display(pd.crosstab(np.array(target), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        20
          1       1.00      1.00      1.00        20
          2       1.00      1.00      1.00        20
          3       1.00      1.00      1.00        20
          4       1.00      1.00      1.00        20
          5       1.00      1.00      1.00        20
          6       1.00      1.00      1.00        20
          7       1.00      1.00      1.00        20
          8       1.00      1.00      1.00        20
          9       1.00      1.00      1.00        20

avg / total       1.00      1.00      1.00       200

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,20,0,0,0,0,0,0,0,0,0,20
1,0,20,0,0,0,0,0,0,0,0,20
2,0,0,20,0,0,0,0,0,0,0,20
3,0,0,0,20,0,0,0,0,0,0,20
4,0,0,0,0,20,0,0,0,0,0,20
5,0,0,0,0,0,20,0,0,0,0,20
6,0,0,0,0,0,0,20,0,0,0,20
7,0,0,0,0,0,0,0,20,0,0,20
8,0,0,0,0,0,0,0,0,20,0,20
9,0,0,0,0,0,0,0,0,0,20,20


Perform a cross validation experiment on Iris data

In [8]:
scores = cross_val_score(clf, iris.data, iris.target, cv=10)
print(scores)
print(np.mean(scores), " +/- ", np.std(scores))

[1.         0.93333333 1.         0.93333333 0.93333333 0.93333333
 0.86666667 0.93333333 1.         1.        ]
0.9533333333333334  +/-  0.04268749491621898


Perform a cross validation experiment on MNIST data

In [9]:
scores = cross_val_score(clf, data, target, cv=10)
print(scores)
print(np.mean(scores), " +/- ", np.std(scores))

[0.9  0.8  0.85 0.85 0.7  0.9  0.85 0.9  0.95 0.7 ]
0.8400000000000001  +/-  0.08


## Task 1: Design the StackedEnsembleHoldOut Class

Modify the StackedEnsembleClassifier implementation provided to create StackedEnsembleClassifierHoldOut so that it uses a hold-out test set to generate the stack training dataset.

In [10]:
# Create a new classifier which is based on the sckit-learn BaseEstimator and ClassifierMixin classes
class StackedEnsembleHoldOut(BaseEstimator, ClassifierMixin):
    
    """An ensemble classifier that uses heterogeneous models at the base layer and a aggregatnio model at the aggregation layer. A k-fold cross validation is used to gnerate training data for the stack layer model.

    Parameters
    ----------
    base_estimators: list 
        A list of the classifiers in the ase layer of the ensemble. Supported types are
        - "svm" Support Vector Machine implemented by sklearn.svm.SVC
        - "logreg" Logistic Regression implemented by sklearn.linear_models.LogisticRegression
        - "knn" k Nearest Neighbour implemented by sklearn.neighbors.KNeighborsClassifier
        - "tree" Decision Tree implemented by sklearn.tree.DecisionTreeClassifier
        - "randomforest" RandomForest implemented by sklearn.tree.RandomForestClassifier    
    classifier_duplicates: int, optional (default = 1)
        How many instances of each classifier type listed in base_estimators is included in the ensemble
    stack_layer_classifier: string, optional (default = "logreg')
        The classifier type used at the stack layer. The same classifier types as are supported at the base layer are supported        
    training_folds: int, optional (default = 4)
        How many folds will be used to generate the training set for the stacked layer
        
    Attributes
    ----------
    classes_ : array of shape = [n_classes] 
        The classes labels (single output problem).


    Notes
    -----
    The default values for most base learners are used.

    See also
    --------
    
    ----------
    .. [1]  van der Laan, M., Polley, E. & Hubbard, A. (2007). 
            Super Learner. Statistical Applications in Genetics 
            and Molecular Biology, 6(1) 
            doi:10.2202/1544-6115.1309
    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> clf = StackedEnsembleClassifier()
    >>> iris = load_iris()
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)

    """
    # Constructor for the classifier object
    def __init__(self, base_estimator_types = ["svm", "logreg", "tree"], base_estimator_duplicates = 8, stack_layer_classifier_type = "logreg"):
        """Setup a SuperLearner classifier .
        Parameters
        ----------
        base_estimator_types: The types of classifiers to include at the base layer
        base_estimator_duplicates: The number of duplicates of each type of classiifer to include
        stack_layer_classifier_type: The type of classifier to include at the stack layer 
        
        Returns
        -------
        Nothing
        """     

        # Initialise class variabels
        self.base_estimator_types = base_estimator_types
        self.base_estimator_type_list = list()
        self.base_estimator_duplicates = base_estimator_duplicates
        self.stack_layer_classifier_type = stack_layer_classifier_type

    # The fit function to train a classifier
    def fit(self, X, y):
        """Build a SuperLearner classifier from the training set (X, y).
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. 
        y : array-like, shape = [n_samples] 
            The target values (class labels) as integers or strings.
        Returns
        -------
        self : object
        """    
        
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        
        ########################
        # LEVEL 0
        ########################
        
        # Set up the base classifeirs in the ensemble
        self.classifiers_ = list()
        
        for i in range(0, self.base_estimator_duplicates):
            for t in self.base_estimator_types:

                self.base_estimator_type_list.append(t)      
                c = create_classifier(t, tree_min_samples_split=math.ceil(len(X)*0.05))
                self.classifiers_.append(c)
        
        # Store the number of classifers in the ensemble
        self.n_estimators_ = len(self.classifiers_)

        # Use all training data to train base classifiers
        X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size = 0.3)
        
        # Set up empty arrays to hold stack layer training data
        self.X_stack_train = None #(dtype = float)
        self.y_stack_train = y_holdout
          
        # Train each base calssifier and generate the stack layer training dataset
        for classifier in self.classifiers_:

            # Extract a bootstrap sample
            X_train_samp, y_train_samp = resample(X_train, y_train, replace=True) 
            
            # Train a base classifier
            classifier.fit(X_train_samp, y_train_samp)
            
            # Make predictions for all instances in the training set
            y_pred_holdout = classifier.predict_proba(X_holdout)

            # Append the predictions to the stack layer traing set (a bit of hacking here!)
            try:
                self.X_stack_train = np.c_[self.X_stack_train, y_pred_holdout]
            except ValueError:
                self.X_stack_train = y_pred_holdout
    
        ########################
        # LEVEL 1
        ########################
        
        # Create the stack layer classifier
        self.stack_layer_classifier_ = create_classifier(self.stack_layer_classifier_type, tree_min_samples_split=math.ceil(len(X)*0.05))

        # Train the stack layer using the newly created dataset
        self.stack_layer_classifier_.fit(self.X_stack_train, self.y_stack_train)
            
        # Return the classifier
        return self

    # The predict function to make a set of predictions for a set of query instances
    def predict(self, X):
        """Predict class labels of the input samples X.
        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, ].
            The predicted class labels of the input samples. 
        """
        
        # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
        check_is_fitted(self, ['stack_layer_classifier_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)
   
        X_stack_queries = None
              
        # Make a prediction with each base classifier and assemble the stack layer query
        for classifier in self.classifiers_:
            
            y_pred = classifier.predict_proba(X)
            
            try:
                X_stack_queries = np.c_[X_stack_queries, y_pred]
            except ValueError:
                X_stack_queries = y_pred
        
        # Return the prediction made by the stack layer classifier
        return self.stack_layer_classifier_.predict(X_stack_queries)
    
    # The predict function to make a set of predictions for a set of query instances
    def predict_proba(self, X):
        """Predict class probabilities of the input samples X.
        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, n_labels].
            The predicted class label probabilities of the input samples. 
        """
        # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
        check_is_fitted(self, ['stack_layer_classifier_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)
        
        X_stack_queries = None
        
        # Make a prediction with each base classifier
        for classifier in self.classifiers_:
            
            y_pred = classifier.predict_proba(X)
                
            try:
                X_stack_queries = np.c_[X_stack_queries, y_pred]
            except ValueError:
                X_stack_queries = y_pred

        # Return the prediction made by the stack layer classifier        
        return self.stack_layer_classifier_.predict_proba(X_stack_queries)

## Task 2: Design the StackedEnsembleKFold Class

In [11]:
# Create a new classifier which is based on the sckit-learn BaseEstimator and ClassifierMixin classes
class StackedEnsembleKFold(BaseEstimator, ClassifierMixin):
    
    """An ensemble classifier that uses heterogeneous models at the base layer and a aggregatnio model at the aggregation layer. A k-fold cross validation is used to gnerate training data for the stack layer model.

    Parameters
    ----------
    base_estimators: list 
        A list of the classifiers in the ase layer of the ensemble. Supported types are
        - "svm" Support Vector Machine implemented by sklearn.svm.SVC
        - "logreg" Logistic Regression implemented by sklearn.linear_models.LogisticRegression
        - "knn" k Nearest Neighbour implemented by sklearn.neighbors.KNeighborsClassifier
        - "tree" Decision Tree implemented by sklearn.tree.DecisionTreeClassifier
        - "randomforest" RandomForest implemented by sklearn.tree.RandomForestClassifier    
    classifier_duplicates: int, optional (default = 1)
        How many instances of each classifier type listed in base_estimators is included in the ensemble
    stack_layer_classifier: string, optional (default = "logreg')
        The classifier type used at the stack layer. The same classifier types as are supported at the base layer are supported        
    training_folds: int, optional (default = 4)
        How many folds will be used to generate the training set for the stacked layer
        
    Attributes
    ----------
    classes_ : array of shape = [n_classes] 
        The classes labels (single output problem).


    Notes
    -----
    The default values for most base learners are used.

    See also
    --------
    
    ----------
    .. [1]  van der Laan, M., Polley, E. & Hubbard, A. (2007). 
            Super Learner. Statistical Applications in Genetics 
            and Molecular Biology, 6(1) 
            doi:10.2202/1544-6115.1309
    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> clf = StackedEnsembleClassifier()
    >>> iris = load_iris()
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)

    """
    # Constructor for the classifier object
    def __init__(self, base_estimator_types = ["svm", "logreg", "tree"], base_estimator_duplicates = 8, stack_layer_classifier_type = "logreg"):
        """Setup a SuperLearner classifier .
        Parameters
        ----------
        base_estimator_types: The types of classifiers to include at the base layer
        base_estimator_duplicates: The number of duplicates of each type of classiifer to include
        stack_layer_classifier_type: The type of classifier to include at the stack layer 
        
        Returns
        -------
        Nothing
        """     

        # Initialise class variabels
        self.base_estimator_types = base_estimator_types
        self.base_estimator_type_list = list()
        self.base_estimator_duplicates = base_estimator_duplicates
        self.stack_layer_classifier_type = stack_layer_classifier_type

    # The fit function to train a classifier
    def fit(self, X, y):
        """Build a SuperLearner classifier from the training set (X, y).
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. 
        y : array-like, shape = [n_samples] 
            The target values (class labels) as integers or strings.
        Returns
        -------
        self : object
        """    
        
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        
        ########################
        # LEVEL 0
        ########################
        
        # Set up the base classifeirs in the ensemble
        self.classifiers_ = list()
        
        for i in range(0, self.base_estimator_duplicates):
            for t in self.base_estimator_types:

                self.base_estimator_type_list.append(t)      
                c = create_classifier(t, tree_min_samples_split=math.ceil(len(X)*0.05))
                self.classifiers_.append(c)
        
        # Store the number of classifers in the ensemble
        self.n_estimators_ = len(self.classifiers_)

        kf = StratifiedKFold(n_splits=5)
        
        # Set up empty arrays to hold stack layer training data
        self.X_stack_train = None #(dtype = float)
        self.y_stack_train = None
        first_pass = True
          
        # Train each base calssifier and generate the stack layer training dataset
        for train_index, test_index in kf.split(X, y):
            
           
            X_train, X_holdout = X[train_index], X[test_index]
            y_train, y_holdout = y[train_index], y[test_index]

            X_stack_train_K = None
                        
            for classifier in self.classifiers_:

                # Extract a bootstrap sample
                X_train_samp, y_train_samp = resample(X_train, y_train, replace=True) 

                # Train a base classifier
                classifier.fit(X_train_samp, y_train_samp)

                # Make predictions for all instances in the training set
                y_pred_holdout = classifier.predict_proba(X_holdout)

                # Append the predictions to the stack layer traing set (a bit of hacking here!)
                try:
                    X_stack_train_K = np.c_[X_stack_train_K, y_pred_holdout]
                except ValueError:
                    X_stack_train_K = y_pred_holdout
            
            # Append the predictions to the stack layer traing set (a bit of hacking here!)
            if first_pass:
                self.X_stack_train = X_stack_train_K
                self.y_stack_train = y_holdout
            else:
                self.X_stack_train = np.concatenate((self.X_stack_train, X_stack_train_K))
                self.y_stack_train = np.concatenate((self.y_stack_train, y_holdout))
            
            first_pass = False
        ########################
        # LEVEL 1
        ########################

        # Create the stack layer classifier
        self.stack_layer_classifier_ = create_classifier(self.stack_layer_classifier_type, tree_min_samples_split=math.ceil(len(X)*0.05))

        # Train the stack layer using the newly created dataset
        self.stack_layer_classifier_.fit(self.X_stack_train, self.y_stack_train)
            
        # Return the classifier
        return self

    # The predict function to make a set of predictions for a set of query instances
    def predict(self, X):
        """Predict class labels of the input samples X.
        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, ].
            The predicted class labels of the input samples. 
        """
        
        # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
        check_is_fitted(self, ['stack_layer_classifier_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)
   
        X_stack_queries = None
              
        # Make a prediction with each base classifier and assemble the stack layer query
        for classifier in self.classifiers_:
            
            y_pred = classifier.predict_proba(X)
            
            try:
                X_stack_queries = np.c_[X_stack_queries, y_pred]
            except ValueError:
                X_stack_queries = y_pred
        
        # Return the prediction made by the stack layer classifier
        return self.stack_layer_classifier_.predict(X_stack_queries)
    
    # The predict function to make a set of predictions for a set of query instances
    def predict_proba(self, X):
        """Predict class probabilities of the input samples X.
        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, n_labels].
            The predicted class label probabilities of the input samples. 
        """
        # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
        check_is_fitted(self, ['stack_layer_classifier_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)
        
        X_stack_queries = None
        
        # Make a prediction with each base classifier
        for classifier in self.classifiers_:
            
            y_pred = classifier.predict_proba(X)
                
            try:
                X_stack_queries = np.c_[X_stack_queries, y_pred]
            except ValueError:
                X_stack_queries = y_pred

        # Return the prediction made by the stack layer classifier        
        return self.stack_layer_classifier_.predict_proba(X_stack_queries)

## Task 3: Compare the Performance of Different Stack Layer Approaches

In [12]:
# Comparing the 3 strategies:
# StackedEnsembleClassifier
print("** StackedEnsembleClassifier** ")
clf1 = StackedEnsembleHoldOut()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))
print("Cross Validation test")
scores = cross_val_score(clf1, data, target, cv=5)
print(np.mean(scores), " +/- ", np.std(scores))
print("\n")

# StackedEnsembleClassifierHoldOut
print("** StackedEnsembleClassifierHoldOut **")
clf2 = StackedEnsembleHoldOut()
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))
print("Cross Validation test")
scores = cross_val_score(clf2, data, target, cv=5)
print(np.mean(scores), " +/- ", np.std(scores))
print("\n")

# StackedEnsembleClassifierkFold
print("** StackedEnsembleClassifierkFold **")
clf3 = StackedEnsembleKFold()
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))
print("Cross Validation test")
scores = cross_val_score(clf3, data, target, cv=5)
print(np.mean(scores), " +/- ", np.std(scores))
print("\n")


** StackedEnsembleClassifier** 
Accuracy: 0.7333333333333333
             precision    recall  f1-score   support

          0       1.00      0.86      0.92         7
          1       0.80      1.00      0.89         4
          2       0.67      0.80      0.73         5
          3       0.75      0.50      0.60         6
          4       0.56      1.00      0.71         5
          5       0.50      1.00      0.67         3
          6       1.00      0.88      0.93         8
          7       0.71      0.83      0.77         6
          8       0.71      0.83      0.77         6
          9       0.67      0.20      0.31        10

avg / total       0.76      0.73      0.71        60

Cross Validation test
0.7899999999999999  +/-  0.025495097567963917


** StackedEnsembleClassifierHoldOut **
Accuracy: 0.8333333333333334
             precision    recall  f1-score   support

          0       1.00      1.00      1.00         7
          1       1.00      1.00      1.00         4
  

## Task 4: Comparing the Performance of Different Stack Layer Approaches with  More Standard Approaches

Random forest and decision tree perform very poorly on just 150 data points, but once given the original dataset split by 70/30 train and test, the performance significantly improves

In [13]:
def get_best_estimator_params(estimator, param_grid, X,Y):
    grid = GridSearchCV(estimator, param_grid, cv=2, return_train_score=False, n_jobs = -1)
    grid.fit(X, Y)
    #df = pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
    grid_mean_scores = grid.cv_results_['mean_test_score']
    return grid.best_params_

print("** Decision Tree** ")

##Cross Validation + Grid Search
dec_tree = tree.DecisionTreeClassifier()
# Grid Search
# Set up the parameter grid to seaerch
param_grid ={'criterion': ['gini', "entropy"], \
             'max_depth': list(range(3, 40, 3)), \
             'min_samples_split': [50] }

best_params = get_best_estimator_params(dec_tree, param_grid, X_train, y_train)
print(best_params)

dec_tree = tree.DecisionTreeClassifier(max_depth = best_params['max_depth'], criterion=best_params['criterion'], min_samples_split=best_params['min_samples_split'])
dec_tree.fit(X_train, y_train)

y_pred = dec_tree.predict(X_test)
accuracy = \
metrics.accuracy_score(y_test, y_pred)
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))

print("** Random Forest** ")

## Cross Validation + Grid Search
rf = RandomForestClassifier()
param_grid ={'criterion': ['gini', "entropy"], \
             'max_depth': list(range(3, 40, 3)), \
             'min_samples_split': [50] }

best_params = get_best_estimator_params(rf, param_grid, X_train,y_train)
print(best_params)

rf = RandomForestClassifier(max_depth = best_params['max_depth'], criterion=best_params['criterion'], min_samples_split=best_params['min_samples_split'])
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
accuracy = \
metrics.accuracy_score(y_test, y_pred)
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))

** Decision Tree** 
{'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 50}
Accuracy: 0.26666666666666666
             precision    recall  f1-score   support

          0       0.83      0.71      0.77         7
          1       0.12      1.00      0.22         4
          2       0.00      0.00      0.00         5
          3       0.00      0.00      0.00         6
          4       0.25      0.80      0.38         5
          5       0.00      0.00      0.00         3
          6       0.00      0.00      0.00         8
          7       0.50      0.50      0.50         6
          8       0.00      0.00      0.00         6
          9       0.00      0.00      0.00        10

avg / total       0.18      0.27      0.19        60

** Random Forest** 


  'precision', 'predicted', average, warn_for)


{'criterion': 'entropy', 'max_depth': 3, 'min_samples_split': 50}
Accuracy: 0.5333333333333333
             precision    recall  f1-score   support

          0       0.83      0.71      0.77         7
          1       0.57      1.00      0.73         4
          2       0.60      0.60      0.60         5
          3       0.50      0.33      0.40         6
          4       0.80      0.80      0.80         5
          5       0.22      0.67      0.33         3
          6       1.00      0.62      0.77         8
          7       0.30      0.50      0.37         6
          8       0.44      0.67      0.53         6
          9       0.00      0.00      0.00        10

avg / total       0.52      0.53      0.50        60



  'precision', 'predicted', average, warn_for)


## Task 5: Implement the StackedEnsembleOneVsOne Class

In [14]:
# Write your code here
# Create a new classifier which is based on the sckit-learn BaseEstimator and ClassifierMixin classes
class StackedEnsembleOneVsOne(BaseEstimator, ClassifierMixin):
    
    """An ensemble classifier that uses heterogeneous models at the base layer and a aggregatnio model at the aggregation layer. A k-fold cross validation is used to gnerate training data for the stack layer model.

    Parameters
    ----------
    base_estimators: list 
        A list of the classifiers in the ase layer of the ensemble. Supported types are
        - "svm" Support Vector Machine implemented by sklearn.svm.SVC
        - "logreg" Logistic Regression implemented by sklearn.linear_models.LogisticRegression
        - "knn" k Nearest Neighbour implemented by sklearn.neighbors.KNeighborsClassifier
        - "tree" Decision Tree implemented by sklearn.tree.DecisionTreeClassifier
        - "randomforest" RandomForest implemented by sklearn.tree.RandomForestClassifier    
    classifier_duplicates: int, optional (default = 1)
        How many instances of each classifier type listed in base_estimators is included in the ensemble
    stack_layer_classifier: string, optional (default = "logreg')
        The classifier type used at the stack layer. The same classifier types as are supported at the base layer are supported        
    training_folds: int, optional (default = 4)
        How many folds will be used to generate the training set for the stacked layer
        
    Attributes
    ----------
    classes_ : array of shape = [n_classes] 
        The classes labels (single output problem).


    Notes
    -----
    The default values for most base learners are used.

    See also
    --------
    
    ----------
    .. [1]  van der Laan, M., Polley, E. & Hubbard, A. (2007). 
            Super Learner. Statistical Applications in Genetics 
            and Molecular Biology, 6(1) 
            doi:10.2202/1544-6115.1309
    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> clf = StackedEnsembleClassifier()
    >>> iris = load_iris()
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)

    """
    # Constructor for the classifier object
    def __init__(self, base_estimator_types = ["tree"], base_estimator_duplicates = 1, stack_layer_classifier_type = "logreg"):
        """Setup a SuperLearner classifier .
        Parameters
        ----------
        base_estimator_types: The types of classifiers to include at the base layer
        base_estimator_duplicates: The number of duplicates of each type of classiifer to include
        stack_layer_classifier_type: The type of classifier to include at the stack layer 
        
        Returns
        -------
        Nothing
        """     

        # Initialise class variabels
        self.base_estimator_types = base_estimator_types
        self.base_estimator_type_list = list()
        self.base_estimator_duplicates = base_estimator_duplicates
        self.stack_layer_classifier_type = stack_layer_classifier_type

    # The fit function to train a classifier
    def fit(self, X, y):
        """Build a SuperLearner classifier from the training set (X, y).
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. 
        y : array-like, shape = [n_samples] 
            The target values (class labels) as integers or strings.
        Returns
        -------
        self : object
        """    
        
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        
        ########################
        # LEVEL 0
        ########################
        
        # Set up the base classifeirs in the ensemble
        self.classifiers_ = list()
        
        for i in range(0, self.base_estimator_duplicates):
            for t in self.base_estimator_types:

                self.base_estimator_type_list.append(t)      
                c = create_classifier(t, tree_min_samples_split=math.ceil(len(X)*0.05))
                self.classifiers_.append(c)
        
        # Store the number of classifers in the ensemble
        self.n_estimators_ = len(self.classifiers_)
     
        # Set up empty arrays to hold stack layer training data
        self.X_stack_train = None #(dtype = float)
        self.y_stack_train = None
        first_pass = True
        
        data = np.c_[y, X]
        # Train each base calssifier and generate the stack layer training dataset
        for i in range(len(self.classes_)-1):
            for j in range(i+1,len(self.classes_)):
                subset = data[np.where((data[:,0] == self.classes_[i]) | (data[:,0] == self.classes_[j]))]
               
                y_train = subset[:,0]
                X_train = subset[:,1:]
                X_1v1 = None
                
                for classifier in self.classifiers_:
                    
                    # Extract a bootstrap sample
                    X_train_samp, y_train_samp = resample(X_train, y_train, replace=True)    

                    # Train a base classifier
                    classifier.fit(X_train_samp, y_train_samp)

                    # Make predictions for all instances in the training set
                    y_pred = classifier.predict_proba(X_train)

                    # Append the predictions ot the stack layer traing set (a bit of hacking here!)
                    try:
                        X_1v1 = np.c_[X_1v1, y_pred]
                        
                    except ValueError:
                        X_1v1 = y_pred

                if first_pass:
                    self.X_stack_train = X_1v1
                    self.y_stack_train = y_train
                else:
                    self.X_stack_train = np.concatenate((self.X_stack_train, X_1v1))
                    self.y_stack_train = np.append(self.y_stack_train, y_train)
                   
                first_pass = False
        
        ########################
        # LEVEL 1
        ########################
        self.X_stack_train, self.y_stack_train = resample(self.X_stack_train, self.y_stack_train, replace=True)  
        # Create the stack layer classifier
        self.stack_layer_classifier_ = create_classifier(self.stack_layer_classifier_type, tree_min_samples_split=math.ceil(len(X)*0.05))

        # Train the stack layer using the newly created dataset
        self.stack_layer_classifier_.fit(self.X_stack_train, self.y_stack_train)
        
        print(self.stack_layer_classifier_.classes_)
            
        # Return the classifier
        return self

    # The predict function to make a set of predictions for a set of query instances
    def predict(self, X):
        """Predict class labels of the input samples X.
        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, ].
            The predicted class labels of the input samples. 
        """
        
        # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
        check_is_fitted(self, ['stack_layer_classifier_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)
   
        X_stack_queries = None
              
        # Make a prediction with each base classifier and assemble the stack layer query
        for classifier in self.classifiers_:
            
            y_pred = classifier.predict_proba(X)
            
            try:
                X_stack_queries = np.c_[X_stack_queries, y_pred]
            except ValueError:
                X_stack_queries = y_pred
        
        # Return the prediction made by the stack layer classifier
        return self.stack_layer_classifier_.predict(X_stack_queries)
    
    # The predict function to make a set of predictions for a set of query instances
    def predict_proba(self, X):
        """Predict class probabilities of the input samples X.
        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, n_labels].
            The predicted class label probabilities of the input samples. 
        """
        # Check is fit had been called by confirming that the teamplates_ dictiponary has been set up
        check_is_fitted(self, ['stack_layer_classifier_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)
        
        X_stack_queries = None
        
        # Make a prediction with each base classifier
        for classifier in self.classifiers_:
            
            y_pred = classifier.predict_proba(X) 
            try:
                X_stack_queries = np.c_[X_stack_queries, y_pred]
            except ValueError:
                X_stack_queries = y_pred

        # Return the prediction made by the stack layer classifier        
        return self.stack_layer_classifier_.predict_proba(X_stack_queries)

## Task 6 Evaluate the Performance of the StackedEnsembleCalassifierOneVsOne Algorithm

In [15]:
# StackedEnsembleClassifierOneVsOne

print("** StackedEnsembleClassifierOneVsOne **")
clf4 = StackedEnsembleOneVsOne()
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))
print("Cross Validation test")
scores = cross_val_score(clf4, data, target, cv=5)
print(np.mean(scores), " +/- ", np.std(scores))
print("\n")

** StackedEnsembleClassifierOneVsOne **
[0 1 2 3 4 5 6 7 8 9]
Accuracy: 0.08333333333333333
             precision    recall  f1-score   support

          0       0.11      0.43      0.18         7
          1       0.00      0.00      0.00         4
          2       0.00      0.00      0.00         5
          3       0.00      0.00      0.00         6
          4       0.00      0.00      0.00         5
          5       0.00      0.00      0.00         3
          6       0.00      0.00      0.00         8
          7       0.00      0.00      0.00         6
          8       0.06      0.33      0.10         6
          9       0.00      0.00      0.00        10

avg / total       0.02      0.08      0.03        60

Cross Validation test
[0 1 2 3 4 5 6 7 8 9]


  'precision', 'predicted', average, warn_for)


[0 1 2 3 4 5 6 7 8 9]
[0 1 2 3 4 5 6 7 8 9]
[0 1 2 3 4 5 6 7 8 9]
[0 1 2 3 4 5 6 7 8 9]
0.11000000000000001  +/-  0.058309518948453




## Task 7 Reflect on the Performance of the Different Models Evaluated

According to the evaluation and performance tests conducted on the 3 classifiers (Q3), I found that the StackedEnsembleClassifierkFold had the highest accuracy (0.986667%) compared to StackedEnsembleClassifier (0.98%) and StackedEnsembleClassifierHoldOut (0.966667%). I believe this is because kFold CV allows us to use all our samples rather than a subset of samples both for training and testing and while evaluating our learning algorithm on unseen data. It also had the highest cross validation score. One reason why the StackedEnsembleClassifierHoldOut classifier had a lower accuracy the the kFold classifier is the fact that when we get to the stack layer, it was only being trained using a small subset of the data, i.e. the holdout set.

When evaluating the performance of a single decision tree and an ensemble of decision trees using bagging (Q4), I found that the accuracy for the single decision tree is 100%. I think this is as a result of the model being too closely fit to the data (overfitting). This model would not so perform well on unseen data. The accuracy of the ensemble using bagging is only 0.1% which may be because... ???
The 1vs1 classifier resulted in an accuracy of A which tells me that... ???

The MNIST data set in its original form is quite large. I used the mnist_train_small.csv which was given to us in a previous lab as it is much smaller. Further to that I subsampled the dataset to use only N data points. This allowed me to work more efficiently without having to wait very long times to load, train and evaluate the datasets and classifiers. I noticed that classifier ABC took longer than the other 4 classifiers implemented which indicated to me that it is of higher complexity and requires more computing power.

The StackedEnsembleCalassifierOneVsOne performs significantly worse than all other classifiers. This is as expected, and is explained below:

During training time, each base classifier is trained on a unique pair of the available. For example, the first base classifier (I tested SVMs) will be trained to distinguish between 0 and 1 labels. This is pretty simple to do during training time and ensures that the base classifiers become "specialists" for these 2 classes. However, during testing time, this 0v1 trained classifier can be fed data from all possible labels. This would mean that it will be forced to predict all its test observations as either a 1 or 0, even if these observation are neither in truth. This explains the extremely poor performance of this classifier as the preprocessing during training, in which data points are filtered based on the 1v1 labels, is not carried during testing (since the test labels are unknown). 

As discussed in the discussion boards on CSMoodle, there was a suggestion to include all possible labels during base classifier training. However, this would mean that either the classifier is no longer a specialist between two labels, or that we would randomly assign the two competing labels to all other possible labels per classifier. These points did not make sense to me, and hence I did not pursue them.