In [1]:
# Default packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC as SupportVectorClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.utils import resample

## Ensembling

Here we will demonstrate how combining multiple classifiers can produce bette performance on unseen data. 

In [2]:
# Import data, seperate and binarize labels
wine_data = pd.read_csv('data/wine_dataset.csv')
wine_data.head(5)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


It looks like all of our variables are continuous. We make the target binary 

In [3]:
y = (wine_data['quality']/10).round()
wine_data = wine_data.drop("quality", axis=1)


Let's do some quick tests with default hyperparameters

In [4]:
trainX, testX, trainY, testY = train_test_split(wine_data, y, stratify=y, test_size=0.20)

In [5]:
SVC = SupportVectorClassifier()
SVC.fit(trainX, trainY)
svm_acc = accuracy_score(testY, SVC.predict(testX))
svm_matt_coef = matthews_corrcoef(testY, SVC.predict(testX))

print("SVM Accuracy: {:0.3f}".format(svm_acc))
print("SVM Matthews' coefficient: {:0.3f}".format(svm_matt_coef))

RFC = RandomForestClassifier()
RFC.fit(trainX, trainY)
rfc_acc = accuracy_score(testY, RFC.predict(testX))
rfc_matt_coef = matthews_corrcoef(testY, RFC.predict(testX))

print("RFC Accuracy: {:0.3f}".format(svm_acc))
print("RFC Matthews' coefficient: {:0.3f}".format(svm_matt_coef))

QDA = QuadraticDiscriminantAnalysis()
QDA.fit(trainX, trainY)
qda_acc = accuracy_score(testY, QDA.predict(testX))
qda_matt_coef = matthews_corrcoef(testY, QDA.predict(testX))

print("QDA Accuracy: {:0.3f}".format(qda_acc))
print("QDA Matthews' coefficient: {:0.3f}".format(qda_matt_coef))

LDA = LinearDiscriminantAnalysis()
LDA.fit(trainX, trainY)
lda_acc = accuracy_score(testY, LDA.predict(testX))
lda_matt_coef = matthews_corrcoef(testY, LDA.predict(testX))

print("LDA Accuracy: {:0.3f}".format(lda_acc))
print("LDA Matthews' coefficient: {:0.3f}".format(lda_matt_coef))

SVM Accuracy: 0.637
SVM Matthews' coefficient: 0.282
RFC Accuracy: 0.637
RFC Matthews' coefficient: 0.282
QDA Accuracy: 0.747
QDA Matthews' coefficient: 0.497
LDA Accuracy: 0.769
LDA Matthews' coefficient: 0.535


LDA performs the best, but the performance is not great.

What happens if we let our classifiers vote?

In [6]:
predictions = np.vstack(( SVC.predict(testX),  RFC.predict(testX),  QDA.predict(testX),  LDA.predict(testX)))

ens_predictions = np.round(np.mean(predictions, axis=0))
ens_acc = accuracy_score(testY, ens_predictions)
ens_matt_coef = matthews_corrcoef(testY, ens_predictions)

print("Ensembled Accuracy: {:0.3f}".format(ens_acc))
print("Ensembled Matthews' coefficient: {:0.3f}".format(ens_matt_coef))

Ensembled Accuracy: 0.794
Ensembled Matthews' coefficient: 0.585


We are already doing slightly better! We expect that combining multiple classifiers will give us better results because their errors are likely tob e uncorrelated. 

We should be able to do even better if we combine classifiers with different hyperparameters and slightly different training sets. We will train many models with different hyperparameters on individual random samples of our training data. 

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC as SupportVectorClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.utils import resample

class EnsembledClassifier:
    """
    Voating ensembled of classifiers 
    """
    def __init__(self, acc_threshold=0.65, bootstrap_size = 1000, model_types=['svc', 'qda', 'rfc'], num_classifiers=25, Cs=[0.01, 0.1, 1, 10, 100], 
                 gammas=[0.01, 0.1, 1, 10, 100], kernels=['linear', 'rbf', 'sigmoid'], max_depths=[3, 4, 5, 6], 
                 max_trees=[25, 50, 100, 200], reg_params=[0, 0.001, 0.01, 0.1], linear=[True, False]):
        """
        Constructor
        :param acc_threshold: (float) Minimum accuracy to keep a classifier
        :param bootstrap size: (int) Number of training samples to select per classifier
        :param num_classifiers: (int) Number of classifiers to train
        :param model_types: (list) Types of models to train. Accepted values include 'qda', 'svc', and 'rfc'
        :param gammas: (list) Values of gamma to use in SVMs. List of floats
        :param kernels: (list) Kernals to use in SVMs. Accepted values include 'lienar', 'rbf', and 'sigmoid'
        :param max_trees: (list) Values of max_trees to use in rfcs. 
        :param linear: (list) True is LDA, False is QDA. 
        """
    
        self.acc_threshold, self.bootstrap_size, self.num_classifiers, self.Cs, self.gammas, self.kernels, self.max_depths, self.max_trees, self.reg_params, self.linear = acc_threshold, bootstrap_size, num_classifiers, Cs, gammas, kernels, max_depths, max_trees, reg_params, linear
        
        self.models = None
        self.model_types = model_types
        
    def fit(self, X, y, verbose=True):
        """
        Fit the classifiers
        :param X: (Array-like) Training data
        :param y: (Vector-like) Targets
        :verbose: (bool) Print results to console
        """
        self.models = []

        for i in range(self.num_classifiers):
            model_type = random.choice(self.model_types) # Randomly select a model
            if model_type == 'svc':
                params = {'C': random.choice(self.Cs), 'gamma': random.choice(self.gammas), 
                          'kernel': random.choice(self.kernels)}
                model = SupportVectorClassifier(**params)
            elif model_type == 'rfc':
                params = {'max_depth': random.choice(self.max_depths), 'n_estimators': random.choice(self.max_trees)}
                model = RandomForestClassifier(**params)
            elif model_type == 'qda':
                params = {'reg_param': random.choice(self.reg_params)}
                if random.choice(self.linear):
                    model = LinearDiscriminantAnalysis()
                    params['linear'] = True
                else:
                    model = QuadraticDiscriminantAnalysis(**params)
                    params['linear'] = False
            
            # Sample some data and fit the model
            boots_x, boots_y = resample(X, y, n_samples=self.bootstrap_size)
            boots_train_x, boots_test_x, boots_train_y, boots_test_y = train_test_split(boots_x, boots_y, stratify=boots_y, test_size=0.2)
            model.fit(boots_train_x, boots_train_y)
            acc = accuracy_score(boots_test_y, model.predict(boots_test_x))
            
            if verbose:
                print('Model {} {} accuracy: {:0.3f}'.format(model_type, params, acc))

            # Save the model if it's good
            if acc > self.acc_threshold:
                self.models.append( {'type': model_type, 'params': params, 'classifier': model, 'est_acc': acc})
                
    def predict_proba(self, X):
        """
        Make predictions
        :param X: (Array-like_ Feature vectors
        """
        predictions = []
        for model in self.models:
            try:
                predictions.append(model['classifier'].predict_proba(testX)[:,1])
            except:
                predictions.append(model['classifier'].predict(testX))


        predict_array = np.array(predictions)
        ens_predictions = np.mean(predict_array, axis=0)
        return ens_predictions
    
    def predict(self, X):
        """
        Make binary predictions
        :param X: (Array-like_ Feature vectors
        """
        return np.round(self.predict_proba(X))
    
    def get_classifiers(self):
        """
        Get all trained classifiers
        :returns: (list) List of trained classifiers
        """
        classifiers = []
        for model in self.models:
            classifiers.append(model['classifier'])
        return classifiers

In [8]:
EC = EnsembledClassifier(bootstrap_size=800, acc_threshold=0.7)
EC.fit(trainX, trainY)

Model rfc {'max_depth': 6, 'n_estimators': 25} accuracy: 0.838
Model qda {'reg_param': 0.001, 'linear': False} accuracy: 0.762
Model svc {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'} accuracy: 0.512
Model qda {'reg_param': 0.1, 'linear': True} accuracy: 0.713
Model qda {'reg_param': 0.001, 'linear': True} accuracy: 0.725
Model svc {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'} accuracy: 0.531
Model qda {'reg_param': 0.1, 'linear': False} accuracy: 0.725
Model qda {'reg_param': 0.01, 'linear': False} accuracy: 0.756
Model svc {'C': 100, 'gamma': 0.1, 'kernel': 'sigmoid'} accuracy: 0.544
Model qda {'reg_param': 0.01, 'linear': False} accuracy: 0.725
Model rfc {'max_depth': 6, 'n_estimators': 100} accuracy: 0.756
Model rfc {'max_depth': 5, 'n_estimators': 50} accuracy: 0.781
Model svc {'C': 0.01, 'gamma': 100, 'kernel': 'linear'} accuracy: 0.706
Model rfc {'max_depth': 4, 'n_estimators': 100} accuracy: 0.769
Model qda {'reg_param': 0.1, 'linear': False} accuracy: 0.662
Model svc {'C': 0.01, 'gamma': 

In [9]:
ens_predictions = EC.predict(testX)
ens_acc = accuracy_score(testY, ens_predictions)
ens_matt_coef = matthews_corrcoef(testY, ens_predictions)

print("Ensembled Accuracy: {:0.3f}".format(ens_acc))
print("Ensembled Matthews' coefficient: {:0.3f}".format(ens_matt_coef))

Ensembled Accuracy: 0.775
Ensembled Matthews' coefficient: 0.547


Better! We can also fit a classifier on the outputs from our original classifiers

In [10]:
from sklearn.linear_model import LogisticRegression

class L2_Classifier:
    """
    Heirarchical classifier
    """
    def __init__(self, L1_classifiers, Classifier=LogisticRegression, **kwargs):
        """
        Constructor
        :param L1_classifiers: (list) A list of (trained) classifiers
        """
        self.L1_classifiers = L1_classifiers
        self.L2_Classifier = Classifier(**kwargs)
        
    def get_predictions(self, X):
        """
        Get a list of predictions from our models
        :param X: Features vectors
        """
        predictions = []
        for model in self.L1_classifiers:
            try: 
                output = model.predict_proba(X)[:,1]
            except:
                output = model.predict(X)
                
            predictions.append(output)
        return predictions
    
    def fit(self, X, y):
        """
        Fit (only our level two classifier)
        :param X: (Array-like) Training data
        :param y: (Vector-like) Targets
        """
        L1_predictions = self.get_predictions(X)
        X1 = np.array(L1_predictions).T
        self.L2_Classifier.fit(X1, y)
    
    def predict_proba(self, X):
        """
        Make binary predictions
        :param X: (Array-like_ Feature vectors
        """
        L1_predictions = self.get_predictions(X)
        X1 = np.array(L1_predictions).T
        L2_predictions = self.L2_Classifier.predict_proba(X1)[:,1]
        return L2_predictions
    
    def predict(self, X):
        L2_predictions = self.predict_proba(X)
        return np.round(L2_predictions)
        

In [11]:
L2 = L2_Classifier(EC.get_classifiers())
L2.fit(trainX, trainY)

In [12]:
l2_predictions = L2.predict(testX)
l2_acc = accuracy_score(testY, l2_predictions)
l2_matt_coef = matthews_corrcoef(testY, l2_predictions)

print("Two-level Accuracy: {:0.3f}".format(l2_acc))
print("Two-level Matthews coefficient: {:0.3f}".format(l2_matt_coef))

Two-level Accuracy: 0.781
Two-level Matthews coefficient: 0.560
