# Part III: Ensembles and Final Result



In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
%reload_ext autoreload

In [6]:
import proj2_lib.util as utils

In [7]:
utils.file_config

{'feature_pipeline_file': 'feature_pipeline.pkl',
 'labels_pipeline_file': 'labels_pipeline.pkl',
 'objstore_path': 'objects',
 'processed_data_path': 'processed_data',
 'raw_data_csv': 'KaggleV2-May-2016.csv',
 'raw_data_path': 'data',
 'test_csv': 'test_set.csv',
 'train_csv': 'train_set.csv'}

In [8]:
file_config = utils.file_config
#config['raw_data_path'] = "some_other_directory"

In [9]:
# ONLY NEED TO RUN THIS STEP ONCE (switch this to True to run it)
RUN_MAKE_TRAIN_TEST_FILES = True
if RUN_MAKE_TRAIN_TEST_FILES:
    utils.make_train_test_sets(config=file_config)

In [10]:
import proj2_lib.preprocess as preprocess

# ONLY NEED TO RUN THIS STEP ONCE
RUN_FIT_PREPROCESSING = True
if RUN_FIT_PREPROCESSING:
    preprocess.fit_save_pipelines(config=file_config)

In [157]:
train_X, train_y = preprocess.load_train_data(config=file_config)

In [158]:
print(train_X.shape)
print(train_y.shape)

(90514, 105)
(90514,)


In [159]:
test_X, test_y = preprocess.load_test_data(config=file_config)

In [160]:
print(test_X.shape)
print(test_y.shape)

(20000, 105)
(20000,)


## AdaBoost

### Train an AdaBoost classifier using Decision Tree stubs as weak learners. Compare its performance to results obtained in Part II using 10 fold CV.
### Here I train an AdaBoost Classifier on Decision Tree stubs (with max_depth=1). n_estimators=200 and algorithm="SAMME.R".

In [152]:
# AdaBoost code goes here
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(DecisionTreeClassifier (max_depth=1),n_estimators=200,algorithm="SAMME.R", learning_rate=1)
ada_clf.fit(train_X,train_y)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1, n_estimators=200, random_state=None)

### Here we compute the AUC accuracy with using 10 fold cross validation. The mean of AUC accuracy is 73% which is higher than almost every model we have developed so far except the random forest. The score is the same as random forest.

In [153]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score

roc_scores = cross_val_score(ada_clf, train_X, train_y, scoring="roc_auc", cv=10)
    
print("clf", ada_clf, "Scores:", "AUC","Mean: %0.2f (+/- %0.2f)" % (roc_scores.mean(), roc_scores.std() * 2))

clf AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1, n_estimators=200, random_state=None) Scores: AUC Mean: 0.73 (+/- 0.01)


## Stacking

Choose a set of 5 or so classifiers. Write a function that trains an ensemble using stacking

### Here  a function is written for training 5 models and then ensemble them with stacking and using LogisticRegression as the blending model.

In [149]:
def build_stack_ensemble(X, y):
    # create train/validation sets using StratifiedShuffleSplit

    import pandas as pd
    import numpy as np

    from sklearn.model_selection import StratifiedShuffleSplit

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2 , random_state=1234)
    new_set=pd.DataFrame(X.copy())
    new_set['label']=y.copy().ravel()

    for train_index, test_index in split.split(new_set, new_set["label"]):
        new_train_set = new_set.iloc[train_index]
        new_test_set = new_set.iloc[test_index]

    new_train_y = np.array(pd.DataFrame(new_train_set['label'].copy(), columns=["label"])).ravel()
    new_train_X = np.array(new_train_set.drop('label', axis=1))

    new_test_y = np.array(pd.DataFrame(new_test_set['label'].copy(), columns=["label"])).ravel()
    new_test_X = np.array(new_test_set.drop('label', axis=1))


    # train classifiers in ensemble using train set

    from sklearn.tree import DecisionTreeClassifier
    clf1 = DecisionTreeClassifier(max_depth=8)
    clf1_fit=clf1.fit(new_train_X, new_train_y)

    from sklearn.ensemble import RandomForestClassifier
    clf2 = RandomForestClassifier(max_features= 18, n_estimators= 100)
    clf2_fit=clf2.fit(new_train_X, new_train_y)

    clf3 = RandomForestClassifier(max_features= 25, n_estimators= 50)
    clf3_fit=clf3.fit(new_train_X, new_train_y)


    from sklearn.svm import LinearSVC
    clf4 = LinearSVC(dual=False, C= 0.125) # we use dual when number of features are much less than number of examples. in LinearSVC there is this option but if we use SVM with kernel='linear' there is no dual option. In this case the linearSvc is better than SVM with linaer Kernel in terms of running time.
    clf4_fit=clf4.fit(new_train_X, new_train_y)

    clf5 = LinearSVC(dual=False, C= 0.01) # we use dual when number of features are much less than number of examples. in LinearSVC there is this option but if we use SVM with kernel='linear' there is no dual option. In this case the linearSvc is better than SVM with linaer Kernel in terms of running time.
    clf5_fit=clf5.fit(new_train_X, new_train_y)    


    # create new feature matrix for validation set by getting predictions from the ensemble classifiers

    new_feature_matrix=np.full((new_test_X.shape[0],5), 0.0)
    i=0
    for clf in (clf1_fit,clf2_fit,clf3_fit,clf4_fit,clf5_fit):
        new_feature_matrix[:,i]=clf.predict(new_test_X)
        i+=1
    
    # train logistic regression classifier on new feature matrix
    
    from sklearn.linear_model import LogisticRegression
    meta_clf=LogisticRegression()
    blend_clf=meta_clf.fit(new_feature_matrix, new_test_y)
    
    # return all trained classifiers
    return (clf1_fit,clf2_fit,clf3_fit,clf4_fit,clf5_fit,blend_clf)

Use 10-fold cross validation to measure performance of your stacked classifier. See Part II solution to see how to roll your own sklearn classifier along with http://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator

### Here Stack_Ensemble_Classifier class is written.

In [150]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

class Stack_Ensemble_Classifier (BaseEstimator, ClassifierMixin):
    def __init__(self):
        
        return None 
        
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        
        self.clf1_, self.clf2_ , self.clf3_, self.clf4_,self.clf5_,self.blend_clf_= build_stack_ensemble(X, y)
        
        return self
    
    def decision_function(self, X):
        check_is_fitted(self, ['clf1_', 'clf2_', 'clf3_', 'clf4_', 'clf5_','blend_clf_'])
        X = check_array(X)
        
        new_feature_matrix=np.full((X.shape[0],5), 0.0)
        i=0
        for clf in (self.clf1_, self.clf2_ , self.clf3_, self.clf4_,self.clf5_):
            new_feature_matrix[:,i]=clf.predict(X)
            i+=1
        
        return self.blend_clf_.predict(new_feature_matrix)
    
    def predict(self, X):
        
        f = self.decision_function(X)
        return f


### 10-fold cross validation is used to measure performance of the stacked classifier. The results show that the AUC for this classifier is about 50% which is weird. 

In [154]:


from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

stk_ensemble_clf = Stack_Ensemble_Classifier()

roc_scores = cross_val_score(stk_ensemble_clf, train_X, train_y, scoring="roc_auc", cv=10)
    
print("clf", stk_ensemble_clf, "Scores:", "AUC","stk_ensemble_Mean: %0.2f (+/- %0.2f)" % (roc_scores.mean(), roc_scores.std() * 2))
    


clf Stack_Ensemble_Classifier() Scores: AUC stk_ensemble_Mean: 0.51 (+/- 0.00)


## Final Result

Choose a single model based on all previous project steps. Train this model on the complete training dataset and measure it's performance on the held out test set.

Compare to the 10-fold CV estimate you got previously.

We use AdaBoost model and random forest mdel as the best models. Train them on the whole train set first and then we use them on the test set to see how they are performing. It shows the AUC score is 0.51 and 0.56 respectively and Accuracy is 0.80 and 0.78 respectively.

In [163]:
# final result goes here
ada_clf_fit=ada_clf.fit(train_X,train_y)
y_pred=ada_clf_fit.predict(test_X)
AUC_score=roc_auc_score(test_y,y_pred)
print(AUC_score)

0.505566542108


In [162]:
Accuracy_score=accuracy_score(test_y,y_pred)
print(Accuracy_score)

0.79665


In [167]:
from sklearn.ensemble import RandomForestClassifier
RF_clf = RandomForestClassifier(max_features= 18, n_estimators= 100)
RF_clf_fit=RF_clf.fit(train_X,train_y)
y_pred=RF_clf_fit.predict(test_X)
AUC_score=roc_auc_score(test_y,y_pred)
print(AUC_score)

0.563674536832


In [168]:
Accuracy_score=accuracy_score(test_y,y_pred)
print(Accuracy_score)

0.7797


In [169]:
sum(y_pred==1)

1994

In [170]:
sum(test_y==1)

4038