# Part III: Ensembles and Final Result

## AdaBoost

Train an AdaBoost classifier using Decision Tree stubs as weak learners. Compare its performance to results obtained in Part II using 10 fold CV.

In [5]:
import proj2_lib.util as utils
import proj2_lib.preprocess as preprocess


file_config = utils.file_config
train_X, train_y = preprocess.load_train_data(config=file_config)

In [7]:
# AdaBoost code goes here
from sklearn.tree import DecisionTreeClassifier


# Train a decision tree stub
tree_classifier = DecisionTreeClassifier(max_depth=2)
tree_classifier.fit(train_X, train_y)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
import numpy as np

# Create and fit an AdaBoosted decision tree
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2))

scores = cross_val_score(bdt, train_X, train_y, 
                         scoring='accuracy', cv=10)
np.mean(scores)

0.7972184705094304

In [27]:
auc_scores = cross_val_score(bdt, train_X, train_y,
                            scoring='roc_auc', cv=10)
np.mean(auc_scores)

0.72824282013900843

## Stacking

Choose a set of 5 or so classifiers. Write a function that trains an ensemble using stacking

In [45]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

def build_stack_ensemble(X, y):
    # create train/validation sets
    # using StratifiedShuffleSplit

    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)


    for train_index, test_index in sss.split(X, y):
#          print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    
    
    # train classifiers in ensemble using train set
    tc_1 = DecisionTreeClassifier(max_depth=5)
    tc_1.fit(X, y)
    
    tc_2 = DecisionTreeClassifier(max_depth=30)
    tc_2.fit(X, y)
    
    linear_svm = LinearSVC(dual=False)
    linear_svm.fit(X, y)
    
    rf_1 = RandomForestClassifier(n_estimators=5)
    rf_1.fit(X, y)
 
    rf_2 = RandomForestClassifier(n_estimators=15)
    rf_2.fit(X, y)
    
    # create new feature matrix for validation
    # set by getting predictions from the ensemble
    # classifiers

    tc_1_predict = tc_1.predict(X_test)
    tc_2_predict = tc_2.predict(X_test)
    linear_svm_predict = linear_svm.predict(X_test)
    rf_1_predict = rf_1.predict(X_test)
    rf_2_predict = rf_2.predict(X_test)
    
    
    # train logistic regression classifier on
    # new feature matrix
    X_ = np.array([tc_1_predict,
                   tc_2_predict,
                   linear_svm_predict,
                   rf_1_predict,
                   rf_2_predict])
    
    _X = np.transpose(X_)

    logreg = LogisticRegression()
    logreg.fit(_X, y_test)    
    
    # return all trained classifiers
    return tc_1, tc_2, linear_svm, rf_1, rf_2, logreg



Use 10-fold cross validation to measure performance of your stacked classifier. See Part II solution to see how to roll your own sklearn classifier along with http://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator

In [46]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances

class EnsembleClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self):
        self.logref = ''
        self.f1 = ''
        self.f2 = ''
        self.f3 = ''
        self.f4 = ''
        self.f5 = ''

    
    def fit(self, X, y):
        self.f1, self.f2, self.f3, self.f4, self.f5, self.logref = build_stack_ensemble(X, y)
        return self
    
    def get_feature(self, X):
        pred_1 = self.f1.predict(X)
        pred_2 = self.f2.predict(X)
        pred_3 = self.f3.predict(X)
        pred_4 = self.f4.predict(X)
        pred_5 = self.f5.predict(X)
        
        
        X_ = np.array([pred_1,
                       pred_2,
                       pred_3,
                       pred_4,
                       pred_5])
    
        _X = np.transpose(X_)
        return _X;
    
    def predict(self, X):
        _X = self.get_feature(X)
        return self.logref.predict(_X)
    
    def predict_proba(self, X):
        _X = self.get_feature(X)
        return self.logref.predict_proba(_X)   
        

In [42]:
ec = EnsembleClassifier()

In [47]:
scores = cross_val_score(ec, train_X, train_y, scoring='accuracy', cv=10)
np.mean(scores)

0.76848633292655388

In [48]:
auc_scores = cross_val_score(ec, train_X, train_y, scoring='roc_auc', cv=10)
np.mean(auc_scores)

0.611904995843644

## Final Result

Choose a single model based on all previous project steps. Train this model on the complete training dataset and measure it's performance on the held out test set.

Compare to the 10-fold CV estimate you got previously.

In [59]:
# final result goes here
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

test_X, test_y = preprocess.load_test_data(config=file_config)

rf_classifier = RandomForestClassifier(n_estimators=30, max_features=30)
rf_classifier.fit(train_X, train_y)
text_y_predict = rf_classifier.predict(test_X)

accuracy = accuracy_score(test_y, text_y_predict)
auc = roc_auc_score(test_y, text_y_predict)

print('accuracy = {}, roc_auc = {}'.format(accuracy, auc))

accuracy = 0.7813, roc_auc = 0.567455064515
