## O

In [3]:
# Loading Libraries
import os
from time import time
import numpy as np
import pandas as pd
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC #SupportVectorClassifier
from sklearn.cross_validation import KFold;
from sklearn.metrics import accuracy_score

In [4]:
# Load the train and test CSV files from the ../input directory.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/preproc2_train.csv', '../input/preproc2_test.csv')
)

In [5]:
# Preparing data: convert the DataFrames into plain NumPy arrays.
# Training features: every column except the identifier and the label.
X = data_train.drop(['PassengerId', 'Survived'], axis=1).values
# Training labels: the 'Survived' column.
y = data_train['Survived'].values
# Test features: every column except the identifier.
X_test = data_test.drop(['PassengerId'], axis=1).values

In [6]:
SEED = 0    # for reproducibility
NFOLDS = 9  # number of folds used for out-of-fold prediction
# Row counts of the training and test feature matrices.
ntrain, ntest = X.shape[0], X_test.shape[0]
# Fold generator over the training rows, shared by the out-of-fold helper.
# NOTE(review): random_state presumably only matters when shuffling is enabled -- confirm
# against the KFold version in use.
kf = KFold(ntrain, n_folds=NFOLDS, random_state=SEED)

In [7]:
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    """Small wrapper exposing a uniform train/predict API around a classifier class.

    Parameters
    ----------
    clf : callable
        Classifier class (not an instance); it is called as ``clf(**params)``.
    seed : int, default 0
        Value stored under ``'random_state'`` in the keyword arguments passed
        to ``clf`` (it overrides any ``'random_state'`` already in ``params``).
    params : dict or None, default None
        Keyword arguments for ``clf``. ``None`` is treated as an empty dict.
        The dict is copied, so the caller's object is never modified.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: avoids a TypeError when params is None and keeps the
        # caller's dict free of the injected 'random_state' key.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier on ``(x_train, y_train)``; returns None."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped classifier's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped classifier and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Refit the wrapped classifier on ``(x, y)`` and print its ``feature_importances_``."""
        print(self.clf.fit(x, y).feature_importances_)

In [8]:
def get_oof(clf, x_train, y_train, x_test, folds=None):
    """Compute out-of-fold predictions for one first-level model.

    For each fold the model is trained on the fold's training rows, then used to
    predict (a) the fold's held-out rows and (b) the whole test set. The test-set
    predictions are averaged over the folds.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.
    x_train, y_train : array-like supporting integer-array indexing and ``.shape``
        Training features and labels.
    x_test : array-like with ``.shape``
        Test features.
    folds : optional
        Either an object with a ``split(X)`` method or an iterable of
        ``(train_index, test_index)`` pairs. Defaults to the notebook-level ``kf``.

    Returns
    -------
    (oof_train, oof_test) : tuple of ndarray
        Column vectors of shape ``(n_train, 1)`` and ``(n_test, 1)``.

    Raises
    ------
    ValueError
        If the fold source yields no splits.
    """
    if folds is None:
        folds = kf  # notebook-level fold generator
    # Splitter objects expose .split(X); older KFold objects and plain lists are
    # iterated directly. Materialise once so the fold count is known up front.
    splits = list(folds.split(x_train) if hasattr(folds, 'split') else folds)
    if not splits:
        raise ValueError("get_oof needs at least one (train, test) split")

    # Sizes come from the arguments themselves, not from notebook globals.
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]

    oof_train = np.zeros((n_train,))
    # One row per actual fold, so no uninitialised row can leak into the mean.
    oof_test_skf = np.empty((len(splits), n_test))

    for i, (train_index, test_index) in enumerate(splits):
        clf.train(x_train[train_index], y_train[train_index])
        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [9]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV


In [10]:
def hypertuning_rscv(est, p_distr, nbr_iter, X, y, cv=9, random_state=None):
    """Run a randomized hyper-parameter search and report the best candidate.

    Parameters
    ----------
    est : estimator
        Estimator instance handed to ``RandomizedSearchCV``.
    p_distr : dict
        Search space, passed as ``param_distributions``.
    nbr_iter : int
        Number of sampled candidates (``n_iter``).
    X, y : array-like
        Data the search is fitted on.
    cv : default 9
        Cross-validation setting forwarded to ``RandomizedSearchCV``.
    random_state : default None
        Seed forwarded to ``RandomizedSearchCV``; pass a fixed value to make the
        sampled candidates repeatable.

    Returns
    -------
    (ht_params, ht_score) : tuple
        ``best_params_`` and ``best_score_`` of the fitted search.
    """
    rdmsearch = RandomizedSearchCV(est, param_distributions=p_distr,
                                   n_jobs=-1, n_iter=nbr_iter, cv=cv,
                                   random_state=random_state)
    # Time only the search itself and print the elapsed whole seconds.
    start = time()
    rdmsearch.fit(X, y)
    print('hyper-tuning time : %d seconds' % (time()-start))
    return rdmsearch.best_params_, rdmsearch.best_score_
    

# 1st-level model 1 : RandomForest Classifier
We search for the best hyperparameters of the random forest classifier.

Our estimator is the RandomForestClassifier, and we use RandomizedSearchCV as the search strategy. We also use K-fold cross-validation, with the basic accuracy_score as the metric.
NB: remember to set the parameter n_jobs=-1 for parallelization.

In [11]:
# Randomized search over the Random Forest hyper-parameters (40 sampled candidates).
est = RandomForestClassifier(n_jobs=-1, n_estimators=500)
rf_p_dist = dict(
    max_depth=[3, 5, 10, None],
    max_features=randint(1, 6),
    criterion=['gini', 'entropy'],
    bootstrap=[True, False],
    min_samples_leaf=randint(1, 10),
)
rf_parameters, rf_ht_score = hypertuning_rscv(est, rf_p_dist, 40, X, y)
# Re-attach the fixed estimator settings that were not part of the search space.
rf_parameters.update(n_jobs=-1, n_estimators=500)
print(rf_parameters)
print('Hyper-tuned model score :')
print(rf_ht_score)

**Building the Random Forest Classifier**
We build the random forest that serves as one of the first-level models, and compute its out-of-fold predictions on the training set.

In [12]:
# Wrap the tuned Random Forest, then compute its out-of-fold predictions.
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_parameters)
rf_oof_train, rf_oof_test = get_oof(rf, X, y, X_test)
# Accuracy (in percent) of the out-of-fold predictions against the training labels.
print(100 * accuracy_score(rf_oof_train.ravel(), y))

##  1st-level model 2 : ExtraTreesClassifier

In [13]:
# Base estimator and search space for the ExtraTrees randomized search.
est = ExtraTreesClassifier(n_jobs=-1, n_estimators=500)
et_p_dist = dict(
    criterion=['gini', 'entropy'],
    max_features=randint(1, 6),
    max_depth=[3, 10, None],
    bootstrap=[True, False],
    min_samples_leaf=randint(1, 10),
)

In [14]:
# Randomized search over the ExtraTrees search space (30 sampled candidates).
et_parameters, et_ht_score = hypertuning_rscv(est, et_p_dist, 30, X, y)
# Re-attach the fixed estimator settings that were not part of the search space.
et_parameters.update(n_jobs=-1, n_estimators=500)
print(et_parameters)
print('Hyper-tuned model score :')
print(et_ht_score)

In [15]:
# Wrap the tuned ExtraTrees model, then compute its out-of-fold predictions.
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_parameters)
et_oof_train, et_oof_test = get_oof(et, X, y, X_test)
# Accuracy (in percent) of the out-of-fold predictions against the training labels.
print(100 * accuracy_score(et_oof_train.ravel(), y))

## 1st-level model 3 : AdaBoost Classifier

In [None]:
# Base estimator and (discrete) search space for the AdaBoost randomized search.
est = AdaBoostClassifier()
ada_p_dist = dict(
    learning_rate=[0.25, 0.5, 0.75, 1.],
    n_estimators=[100, 250, 500, 650],
)

In [None]:
# Randomized search over the AdaBoost search space (10 sampled candidates).
ada_parameters, ada_ht_score = hypertuning_rscv(est, ada_p_dist, 10, X, y)
print(ada_parameters)
print('Hyper-tuned model score :')
# Best cross-validated score, shown in percent.
print(100 * ada_ht_score)

In [None]:
# Wrap the tuned AdaBoost model, then compute its out-of-fold predictions.
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_parameters)
ada_oof_train, ada_oof_test = get_oof(ada, X, y, X_test)
# Accuracy (in percent) of the out-of-fold predictions against the training labels.
print(100 * accuracy_score(ada_oof_train.ravel(), y))

## 1st-level model 4 : Gradient Boosting Classifier

In [None]:
# Base estimator and search space for the Gradient Boosting randomized search.
est = GradientBoostingClassifier()
gb_p_dist = dict(
    n_estimators=[100, 250, 500, 750],
    max_depth=[3, 5, 10, None],
    min_samples_leaf=randint(1, 10),
)

In [None]:
# Randomized search over the Gradient Boosting search space (40 sampled candidates).
gb_parameters, gb_ht_score = hypertuning_rscv(est, gb_p_dist, 40, X, y)
print(gb_parameters)
print('Hyper-tuned model score :')
# Best cross-validated score, shown in percent.
print(100 * gb_ht_score)

In [None]:
# Wrap the tuned Gradient Boosting model, then compute its out-of-fold predictions.
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_parameters)
gb_oof_train, gb_oof_test = get_oof(gb, X, y, X_test)
# Accuracy (in percent) of the out-of-fold predictions against the training labels.
print(100 * accuracy_score(gb_oof_train.ravel(), y))

## 1st-level model 5 : Support Vector Machine

In [None]:
est = SVC()
from scipy.stats import norm, truncnorm
# Search space for the SVC.
# C must be strictly positive, but an unbounded normal(0.5, 0.15) can occasionally
# sample C <= 0. Use the same location/scale truncated below at 1e-3 instead;
# truncnorm takes its bounds in standardised units, i.e. (bound - loc) / scale.
# TODO: the choice of distribution for C still absolutely needs to be revisited.
svc_p_dist = {'kernel': ['linear', 'poly', 'rbf'],
              'C': truncnorm(a=(1e-3 - 0.5) / 0.15, b=float('inf'),
                             loc=0.5, scale=0.15)}

In [None]:
# Randomized search over the SVC search space (200 sampled candidates).
svc_parameters, svc_ht_score = hypertuning_rscv(est, svc_p_dist, 200, X, y)
print(svc_parameters)
print('Hyper-tuned model score :')
# Best cross-validated score, shown in percent.
print(100 * svc_ht_score)

In [None]:
# Wrap the tuned SVC, then compute its out-of-fold predictions.
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_parameters)
svc_oof_train, svc_oof_test = get_oof(svc, X, y, X_test)
# Accuracy (in percent) of the out-of-fold predictions against the training labels.
print(100 * accuracy_score(svc_oof_train.ravel(), y))

## 2nd-Level Model : 

In [None]:
# Side-by-side table of each first-level model's out-of-fold training predictions.
Flevel_pred_train = pd.DataFrame({
    label: oof.ravel()
    for label, oof in [('RF', rf_oof_train),
                       ('ET', et_oof_train),
                       ('AB', ada_oof_train),
                       ('GB', gb_oof_train),
                       ('SVC', svc_oof_train)]
})
Flevel_pred_train.head()

**Concatenation**

In [None]:
# Stack the first-level prediction columns side by side: one column per model,
# in the same model order for the training and the test matrices.
x_train = np.hstack([et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train])
x_test = np.hstack([et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test])

In [None]:
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(X, y)
predictions = gbm.predict(X_test)


In [None]:
print(accuracy_score(gbm.predict(X),y)*100)

In [None]:
# Pair each test passenger id with its predicted label and write the table to CSV
# without the index column.
PassengerId = data_test['PassengerId']
StackingSubmission = pd.DataFrame(dict(PassengerId=PassengerId, Survived=predictions))
StackingSubmission.to_csv("StackingSubmission.csv", index=False)