# Part III: Ensembles and Final Result

In [1]:
import proj2_lib.util as utils
file_config = utils.file_config

In [2]:
RUN_MAKE_TRAIN_TEST_FILES = False
if RUN_MAKE_TRAIN_TEST_FILES:
    utils.make_train_test_sets(config=file_config)

In [3]:
import proj2_lib.preprocess as preprocess

RUN_FIT_PREPROCESSING = False
if RUN_FIT_PREPROCESSING:
    preprocess.fit_save_pipelines(config=file_config)

In [4]:
train_X, train_y = preprocess.load_train_data(config=file_config)
print(train_X.shape, train_y.shape)

(90526, 101) (90526,)


In [5]:
test_X, test_y = preprocess.load_test_data(config=file_config)
print(test_X.shape, test_y.shape)

(20000, 101) (20000,)


## AdaBoost

Train an AdaBoost classifier using Decision Tree stubs as weak learners. Compare its performance to results obtained in Part II using 10 fold CV.

In [6]:
# AdaBoost code goes here
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

adaboost_clf = AdaBoostClassifier()
#adaboost_clf.fit(train_X,train_y)
#base_estimator : object, optional (default=DecisionTreeClassifier)

ada_accuracy = cross_val_score(adaboost_clf, train_X, train_y, 
                                 scoring='accuracy', cv=10)
print('AdaBoost Accuracy mean: ', np.mean(ada_accuracy))

ada_auc = cross_val_score(adaboost_clf, train_X, train_y, 
                          scoring="roc_auc", cv=10)
print('AdaBoost AUC mean: ', np.mean(ada_auc))

AdaBoost Accuracy mean:  0.797494628163
AdaBoost AUC mean:  0.727120212485


## Compare the performance:
  - best performance in PartII is RandomForest Classifier, with AUC mean of 0.6932 .  
  - AdaBoost Classifier, with AUC mean is 0.7271,   
      which is better. 

## Stacking

Choose a set of 5 or so classifiers. Write a function that trains an ensemble using stacking

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression


def build_stack_ensemble(X, y):
    # create train/validation sets
    # using StratifiedShuffleSplit
    split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=1234)
    
    for train_index, valid_index in split.split(X,y):
        x_train = X[train_index] 
        y_train = y[train_index]
        x_valid = X[valid_index]
        y_valid = y[valid_index]
    
    # train classifiers in ensemble using train set
    tree_classifier = DecisionTreeClassifier(max_depth=5)
    tree_1 = tree_classifier.fit(x_train,y_train)
    
    tree_classifier = DecisionTreeClassifier(max_depth=30)
    tree_2 = tree_classifier.fit(x_train,y_train)
    
    rf_classifier = RandomForestClassifier(n_estimators=10)
    rf_1 = rf_classifier.fit(x_train,y_train)
    
    rf_classifier = RandomForestClassifier(n_estimators=30)
    rf_2 = rf_classifier.fit(x_train,y_train)
    
    linear_svm = LinearSVC(dual=False)
    lsvm_1 = linear_svm.fit(x_train,y_train)
    
    #rbf_svm = SVC(kernel='rbf', gamma='auto')
    #svm_clf = rbf_svm.fit(x_train,y_train)
    
    
    # create new feature matrix for validation
    # set by getting predictions from the ensemble
    # classifiers
    predicts = []
    tr_predict_1 = tree_1.predict(x_valid)
    predicts.append(tr_predict_1)
    
    tr_predict_2 = tree_2.predict(x_valid)
    predicts.append(tr_predict_2)
    
    rf_predict_1 = rf_1.predict(x_valid)
    predicts.append(rf_predict_1)
    
    rf_predict_2 = rf_2.predict(x_valid)
    predicts.append(rf_predict_2)

    lsvm_predict_1 = lsvm_1.predict(x_valid)
    predicts.append(lsvm_predict_1)
    

    pre_mat = np.array(predicts)
    
    predicts_mat = np.transpose(pre_mat)
    
    # train logistic regression classifier on
    # new feature matrix
    lr = LogisticRegression()
    lr_clf = lr.fit(predicts_mat,y_valid)
    
    #pred_lr = lr.predict_proba(x_valid)[:, 1]
    
    # return all trained classifiers
    return tree_1,tree_2,rf_1,rf_2,lsvm_1,lr_clf

Use 10-fold cross validation to measure performance of your stacked classifier. See Part II solution to see how to roll your own sklearn classifier along with http://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator

In [8]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

class StackClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        
        return None
        
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        
        self.tree_1_, self.rf_1_ ,\
        self.lsvm_1_, self.tree_2_,\
        self.rf_2_ , self.lr_clf_ = build_stack_ensemble(X, y)
        
        return self
    
    def decision_function(self, X):
        check_is_fitted(self, ['tree_1_', 'rf_1_', 'lsvm_1_', 
                               'tree_2_', 'rf_2_','lr_clf_'])
        X = check_array(X)
        

        predicts = []
        predicts.append(self.tree_1_.predict(X))
        predicts.append(self.tree_2_.predict(X))
        predicts.append(self.rf_1_.predict(X))
        predicts.append(self.rf_2_.predict(X))
        predicts.append(self.lsvm_1_.predict(X))
        
        pre_mat = np.array(predicts)
        
        predicts_mat = np.transpose(pre_mat)
        
        return self.lr_clf_.predict(predicts_mat)
    
    def predict(self, X):
        
        f = self.decision_function(X)
        return f

In [9]:
stack_clf = StackClassifier()

stack_accuracy = cross_val_score(stack_clf, train_X, train_y, 
                                 scoring='accuracy', cv=10)
print('Stacked Classifier\'s Accuracy mean: ',np.mean(stack_accuracy))

Stacked Classifier's Accuracy mean:  0.798190559361


In [10]:
stack_auc = cross_val_score(stack_clf, train_X, train_y,
                          scoring='roc_auc', cv=10)

print('Stacked Classifier\'s AUC mean: ',np.mean(stack_auc))

Stacked Classifier's AUC mean:  0.501799617978


## Final Result

Choose a single model based on all previous project steps. Train this model on the complete training dataset and measure it's performance on the held out test set.

Compare to the 10-fold CV estimate you got previously.

In [11]:
# final result goes here

Adaboost Classifier got the highest AUC score, 0.7271  
I choose Adaboost Classifier to train it with complete trainning dataset.

In [12]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

adaboost_clf = AdaBoostClassifier()
adaboost_clf.fit(train_X,train_y)

test_predict=adaboost_clf.predict(test_X)

auc_score=roc_auc_score(test_y,test_predict)
print('Adaboost Clf\'s AUC on Test Set: ',auc_score)

Adaboost Clf's AUC on Test Set:  0.504526329102


AUC score is much lower than we got before. Not a good performance.   
Try out other model below:

#### Stacked Classifier

In [13]:
stack_clf = StackClassifier()
stack_clf.fit(train_X,train_y)

test_predict=stack_clf.predict(test_X)

auc_score=roc_auc_score(test_y,test_predict)
print('Stack Clf\'s AUC on Test Set: ',auc_score)

Stack Clf's AUC on Test Set:  0.501789705313


#### RandomForest Classifier

In [14]:
rf_clf = RandomForestClassifier(n_estimators=30)
rf_clf.fit(train_X,train_y)

test_predict=rf_clf.predict(test_X)

auc_score=roc_auc_score(test_y,test_predict)
print('RandomForest Clf\'s AUC on Test Set: ',auc_score)

RandomForest Clf's AUC on Test Set:  0.565270859604


#### DecisionTree Classifier

In [15]:
tree_clf = DecisionTreeClassifier(max_depth=20)
tree_clf.fit(train_X,train_y)

test_predict=tree_clf.predict(test_X)

auc_score=roc_auc_score(test_y,test_predict)
print('DecisionTree Clf\'s AUC on Test Set: ',auc_score)

DecisionTree Clf's AUC on Test Set:  0.532471774982


#### Linear SVM Classifier

In [16]:
lsvm_clf = LinearSVC(dual=False)
lsvm_clf.fit(train_X,train_y)

test_predict=lsvm_clf.predict(test_X)

auc_score=roc_auc_score(test_y,test_predict)
print('DecisionTree Clf\'s AUC on Test Set: ',auc_score)

DecisionTree Clf's AUC on Test Set:  0.501732667919


#### Non-Linear SVM Classifier

In [17]:
from sklearn.svm import SVC

rbf_svm = SVC(kernel='rbf', gamma='auto')
rbf_svm.fit(train_X,train_y)

test_predict=rbf_svm.predict(test_X)

auc_score=roc_auc_score(test_y,test_predict)
print('Non-linear SVM \'s AUC on Test Set: ',auc_score)

Non-linear SVM 's AUC on Test Set:  0.5
