In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import (RandomForestClassifier, 
AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, 
VotingClassifier)
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.cross_validation import KFold
from collections import Counter
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


import warnings
warnings.filterwarnings('ignore')

title_list=['Mrs', 
            'Mr', 
            'Master', 
            'Miss', 
            'Major', 
            'Rev',
            'Dr', 
            'Ms', 
            'Mlle',
            'Col', 
            'Capt', 
            'Mme', 
            'Countess',
            'Don', 
            'Jonkheer']



In [6]:
# Sanitization helper functions
def extract_title(name):
    for title in title_list:
        if (title in name):
            return title

def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        # outlier step
        outlier_step = 1.5 * IQR
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    return multiple_outliers
        
# Sanitization functions
def add_family_size(input_df):
    df = input_df.copy()
    return (df.assign(family_size=lambda df: df.SibSp + df.Parch + 1))

def add_age_class(input_df):
    df = input_df.copy()
    return (df.assign(age_class=lambda df: df.Age * df.Pclass))

def add_gender_class(input_df):
    df = input_df.copy()
    return (df.assign(gender_class=lambda df: df.Sex + df.Pclass.apply(str)))

def sanitize_name_to_title(input_df):
    df = input_df.copy()
    df = df.assign(Title=lambda df: df.Name.map(lambda name: extract_title(name)))
    return df.drop('Name', axis=1)

def one_hot(input_df):
    df = input_df.copy()
    # titles
    titles_ohe = pd.get_dummies(df.Title, prefix="Title")
    df = pd.concat([df, titles_ohe], 1)
    # sex
    sex_ohe = pd.get_dummies(df.Sex, prefix="Sex")
    df = pd.concat([df, sex_ohe], 1)
    # section
    section_ohe = pd.get_dummies(df.Section, prefix="Section")
    df = pd.concat([df, section_ohe], 1)
    # pclass
    pclass_ohe = pd.get_dummies(df.Pclass, prefix="Pclass")
    df = pd.concat([df, pclass_ohe], 1)
    # embarked
    embarked_ohe = pd.get_dummies(df.Embarked, prefix="Embarked")
    df = pd.concat([df, embarked_ohe], 1)
    # gender class
    genderClass_ohe = pd.get_dummies(df.gender_class, prefix="GenClass")
    df = pd.concat([df, genderClass_ohe], 1)
    return df.reset_index(drop=True)
    
# Prepare data for ml
def prep_data_for_ML(input_df):
    df = input_df.copy()
    # initial sanitization 
    df = (df.pipe(add_family_size) # add a column for sib+parch
          .pipe(add_age_class)   # add a column for age*pclass
          .pipe(sanitize_name_to_title) # get rid of names and extract title
          .pipe(add_gender_class)) # add a column for sex+str(pclass)
    # separating cabin section from cabin number     
    df["Section"] = df.Cabin.str[:1]
    df["CabinNumber"] = df.Cabin.str.extract("(\d+)")
    df = one_hot(df)
    df = df.fillna(0)
    return df.drop(['Title', 'Sex', 'Section', 'Pclass', 'Cabin', 'Embarked', 'Ticket', 'gender_class'], axis=1)

# Check your accuracy
def mean_accuracy(x_all, y_all):
    kf = KFold(891, n_folds=10)
    outcomes = []
    fold = 0
    for train_index, test_index in kf:
        fold += 1
        X_train, X_test = x_all.values[train_index], x_all.values[test_index]
        y_train, y_test = y_all.values[train_index], y_all.values[test_index]
        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))     
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome)) 
    
# Grid Search
def run_ml(classifier, parameters, x_train, y_train):
    # Type of scoring used to compare parameter combinations
    acc_scorer = make_scorer(accuracy_score)
    # Run the grid search
    # Grid search finds the best combo of parameters
    # It randomly splits the train and validation set therefore results of each would be different
    grid_obj = GridSearchCV(classifier, parameters, scoring=acc_scorer)
    grid_obj = grid_obj.fit(x_train, y_train)
    classifier = grid_obj.best_estimator_
    # Fit the best algorithm to the data.
    print(grid_obj.best_estimator_)
    print(classifier.score(x_train, y_train))
    return classifier

def get_data_for_ML(train, test):
    train_csv = train.copy()
    test_csv = test.copy()
    
    train = prep_data_for_ML(train_csv)
    y_train = train.Survived
    X_train = train.drop("Survived", axis=1)
    
    test = prep_data_for_ML(test_csv)
    # test data is missing the following features:
    test['Title_Capt'] = 0
    test['Title_Countess'] = 0
    test['Title_Jonkheer'] = 0
    test['Title_Major'] = 0
    test['Title_Mlle'] = 0
    test['Title_Mme'] = 0
    test['Section_T'] = 0
    
    return X_train, y_train, test

In [37]:
# Functions for "Titanic top 4" kernel

# Helper functions 
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        # outlier step
        outlier_step = 1.5 * IQR
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    return multiple_outliers

# Sanitization
def drop_outliers(input_df):
    df = input_df.copy()
    outliers = detect_outliers(train,2,["Age","SibSp","Parch","Fare"])
    return df.drop(outliers, axis = 0).reset_index(drop=True)

def fill_in_age(input_df):
    df = input_df.copy()
    index_NaN_age = list(df["Age"][df["Age"].isnull()].index)
    
    for i in index_NaN_age :
        age_med = df["Age"].median()
        age_pred = df["Age"][(
            (df['SibSp'] == df.iloc[i]["SibSp"]) & 
            (df['Parch'] == df.iloc[i]["Parch"]) & 
            (df['Pclass'] == df.iloc[i]["Pclass"])
        )].median()
        
        if not np.isnan(age_pred) :
            df['Age'].iloc[i] = age_pred
        else :
            df['Age'].iloc[i] = age_med
            
    return df

def sanitize_title(input_df):
    df = sanitize_name_to_title(input_df)    
    titles_to_replace = ['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_map = {"Master" : 0,
                 "Miss" : 1,
                 "Ms" : 1,
                 "Mme" : 1,
                 "Mlle" : 1,
                 "Mrs" : 1,
                 "Mr" : 2,
                 "Rare" : 3}    
    df["Title"] = df["Title"].replace(titles_to_replace, 'Rare')
    df["Title"] = df["Title"].map(title_map)
    df["Title"] = df["Title"].astype(int)
    return df

def add_family_buckets(input_df):
    df = input_df.copy()
    df['Single'] = df['family_size'].map(lambda s: 1 if s == 1 else 0)
    df['SmallF'] = df['family_size'].map(lambda s: 1 if  s == 2  else 0)
    df['MedF'] = df['family_size'].map(lambda s: 1 if 3 <= s <= 4 else 0)
    df['LargeF'] = df['family_size'].map(lambda s: 1 if s >= 5 else 0)
    return df

def sanitize_ticket(input_df):
    df = input_df.copy()    
    Ticket = []
    for i in list(df.Ticket):
        if not i.isdigit() :
            Ticket.append(i.replace(".","").replace("/","").strip().split(' ')[0]) #Take prefix
        else:
            Ticket.append("X")    
    df["Ticket"] = Ticket
    return df

def one_hot_2(input_df):
    df = input_df.copy()    
    df = pd.get_dummies(df, columns = ["Title"])
    df = pd.get_dummies(df, columns = ["Embarked"], prefix="Em")
    df = pd.get_dummies(df, columns = ["Cabin"],prefix="Cabin")
    df = pd.get_dummies(df, columns = ["Ticket"], prefix="T")
    df = pd.get_dummies(df, columns = ["Pclass"],prefix="Pc")
    return df

def prep_and_get_data_top_4(train, test):
    train_csv = train.copy()
    test_csv = test.copy()
    
    # Load and check data
    train_csv = drop_outliers(train)
    dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)
    dataset = dataset.fillna(np.nan)
    
    # Feature analysis
    dataset["Fare"] = (dataset["Fare"].fillna(dataset["Fare"].median())
                   .pipe(lambda fare: fare.map(lambda i: np.log(i) if i > 0 else 0)))
    dataset["Embarked"] = dataset["Embarked"].fillna("S")
    
    # Fill in missing values
    dataset["Sex"] = dataset["Sex"].map({"male": 0, "female":1})
    dataset = fill_in_age(dataset)
    
#     dataset = add_age_class(dataset)
    
    # Feature engineering
    dataset = sanitize_title(dataset) # this drops "name" column
    dataset = add_family_size(dataset)
    dataset = add_family_buckets(dataset)
    
    # replacing blank cabin values with X
    dataset["Cabin"] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in dataset['Cabin'] ])
    
    # Ticket prefixed seem to indicate placement on the ship, get those prefixes
    dataset = sanitize_ticket(dataset)
    
    # Categorical values for pclass
    dataset["Pclass"] = dataset["Pclass"].astype("category")
    
    dataset = one_hot_2(dataset)
    
    train_len = len(train)
    train2 = dataset[:train_len]
    test = dataset[train_len:]
    test.drop(labels=["Survived"],axis = 1,inplace=True)
    train2["Survived"] = train2["Survived"].astype(int)
    X_train = train2.drop(labels = ["Survived"],axis = 1)
    y_train = train2["Survived"]
    
    return X_train, y_train, test

In [43]:
train = pd.read_csv("titanic_train.csv")
test = pd.read_csv("titanic_test.csv")

# original
X_train, y_train, test = get_data_for_ML(train, test)

# top 4 kernel
# X_train, y_train, test = prep_and_get_data_top_4(train, test)

In [39]:
X_train_c = X_train.copy()
test_c = test.copy()

test_pass_ids = test.PassengerId

X_train_c = X_train_c.drop(["PassengerId"], axis=1)
test_c = test_c.drop(["PassengerId"], axis=1)

In [40]:
X_train = X_train_c
test = test_c

In [44]:
# Choose the type of classifier. 
rfc = RandomForestClassifier()

# Choose some parameter combinations to try
params_rfc = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8],
              'oob_score': [True],
              'random_state' : [2]
             }

# Highest accuracy with this combo
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
#             max_depth=10, max_features='auto', max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, n_estimators=9, n_jobs=1,
#             oob_score=True, random_state=None, verbose=0, warm_start=False)

rfc = run_ml(rfc, params_rfc, X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=6, n_jobs=1,
            oob_score=True, random_state=2, verbose=0, warm_start=False)
0.910213243547


In [45]:
pred = rfc.predict(test)

output = test_pass_ids.to_frame("PassengerId")
output = output.assign(Survived=pred)
output.to_csv('output.csv', index=False)

In [163]:
# Extra Trees!
etc = ExtraTreesClassifier()
# Choose some parameter combinations to try
params_etc = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8],
              'oob_score': [True],
              'bootstrap': [True],
              'random_state' : [2]
              
             }
etc = run_ml(etc, params_etc, X_train, y_train)

ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
           max_depth=5, max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=4, n_jobs=1,
           oob_score=True, random_state=2, verbose=0, warm_start=False)
0.842873176207


In [166]:
gbc = GradientBoostingClassifier()
params_gbc = {
    # tree params     
    "min_samples_split" : [2, 3, 5],
    "min_samples_leaf" : [1, 5, 8],
    "max_depth" : [2, 3, 5, 10],
    "max_features" : ['auto', 'sqrt', 'log2'],
    # model building params
    "learning_rate" : [0.05, 0.75, 0.1],
    "n_estimators" : [4, 6, 9],
    "subsample" : [0.6, 0.8, 0.9],
    # misc params
    "loss" : ["deviance", "exponential"],
    "random_state" : [2]
}
gbc = run_ml(gbc, params_gbc, X_train, y_train)

print(gbc.feature_importances_)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=8, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=6, presort='auto',
              random_state=2, subsample=0.8, verbose=0, warm_start=False)
0.863075196409
[  7.41199060e-02   1.03809707e-01   6.80845474e-03   9.54475875e-02
   1.50647143e-01   2.42701092e-03   7.27508193e-02   7.05076685e-06
   1.43198233e-03   2.14749865e-03   0.00000000e+00   8.96832430e-03
   0.00000000e+00   2.79085350e-01   3.36667004e-02   1.70105050e-03
   5.56285027e-03   8.06274101e-03   6.71101191e-04   0.00000000e+00
   2.71928644e-03   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   2.77043026e-02   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000

In [160]:
# Predict
pred_rfc = rfc.predict(test)
pred_etc = etc.predict(test)
pred_gbc = gbc.predict(test)

In [161]:
# Output
output = test["PassengerId"].to_frame("PassengerId")

output_rfc = output.assign(Survived=pred_rfc)
output_rfc.to_csv('output_rfc.csv', index=False)

output_etc = output.assign(Survived=pred_etc)
output_etc.to_csv('output_etc.csv', index=False)

output_gbc = output.assign(Survived=pred_gbc)
output_gbc.to_csv('output_gbc.csv', index=False)

In [8]:
# Top 4 kernel - kfold
kfold = StratifiedKFold(n_splits=10)

In [16]:
# Top 4 kernel - adaboost
dtc = DecisionTreeClassifier()
ada_dtc = AdaBoostClassifier(dtc, random_state=7)
ada_params = {
    "base_estimator__criterion" : ["gini", "entropy"],
    "base_estimator__splitter" : ["best", "random"],
    "algorithm" : ["SAMME","SAMME.R"],
    "n_estimators" : [1, 2],
    "learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]
}
gs_ada_dtc = GridSearchCV(ada_dtc, param_grid=ada_params, cv=kfold, scoring="accuracy", n_jobs=2, verbose=1)
gs_ada_dtc.fit(X_train, y_train)
ada_best = gs_ada_dtc.best_estimator_
print(ada_best)
print(gs_ada_dtc.best_score_)

Fitting 10 folds for each of 112 candidates, totalling 1120 fits
AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random'),
          learning_rate=0.0001, n_estimators=1, random_state=7)
0.769921436588


[Parallel(n_jobs=2)]: Done 1120 out of 1120 | elapsed:    5.2s finished


In [17]:
# Top 4 kernel - extra trees
etc = ExtraTreesClassifier()
etc_params = {
    "max_depth": [None],
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators" :[100,300],
    "criterion": ["gini"]
}
gs_etc = GridSearchCV(etc, param_grid=etc_params, cv=kfold, scoring="accuracy", n_jobs=2, verbose = 1)
gs_etc.fit(X_train, y_train)
etc_best = gs_etc.best_estimator_
print(etc_best)
print(gs_etc.best_score_)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    9.4s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   33.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  1.3min
[Parallel(n_jobs=2)]: Done 540 out of 540 | elapsed:  1.6min finished


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features=3, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
0.830527497194


In [20]:
# Top 4 kernel - Random trees
rfc = RandomForestClassifier()
rfc_params = {
    "max_depth": [None],
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators" :[100,300],
    "criterion": ["gini"]
}
gs_rfc = GridSearchCV(rfc, param_grid=rfc_params, cv=kfold, scoring="accuracy", n_jobs=2, verbose=1)
gs_rfc.fit(X_train, y_train)
rfc_best = gs_rfc.best_estimator_
print(rfc_best)
print(gs_rfc.best_score_)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   11.6s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   34.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  1.3min
[Parallel(n_jobs=2)]: Done 540 out of 540 | elapsed:  1.5min finished


RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.837261503928


In [21]:
# Top 4 kernel - gradient boosting
gbc = GradientBoostingClassifier()
gbc_params = {
    'loss' : ["deviance"],
    'n_estimators' : [100,200,300],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [4, 8],
    'min_samples_leaf': [100,150],
    'max_features': [0.3, 0.1] 
}
gs_gbc = GridSearchCV(gbc, param_grid=gbc_params, cv=kfold, scoring="accuracy", n_jobs=2, verbose=1)
gs_gbc.fit(X_train, y_train)
gbc_best = gs_gbc.best_estimator_
print(gbc_best)
print(gs_gbc.best_score_)

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=2)]: Done  88 tasks      | elapsed:    5.7s
[Parallel(n_jobs=2)]: Done 388 tasks      | elapsed:   24.1s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=0.1, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=100, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
0.818181818182


[Parallel(n_jobs=2)]: Done 720 out of 720 | elapsed:   45.0s finished


In [28]:
# Top 4 kernel - scv classifier
svc = SVC(probability=True)
svc_params = {
    'kernel': ['rbf'], 
    'gamma': [ 0.001, 0.01, 0.1, 1],
    'C': [1, 10, 50, 100,200,300, 1000]
}
gs_svc = GridSearchCV(svc, param_grid=svc_params, cv=kfold, scoring="accuracy", n_jobs= 2, verbose = 1)
gs_svc.fit(X_train, y_train)
print(svc_best)
print(gs_svc.best_score_)

Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    8.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   40.8s
[Parallel(n_jobs=2)]: Done 280 out of 280 | elapsed:  1.1min finished


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.616161616162


In [29]:
# Top 4 kernel - voting classifier
vc = VotingClassifier(estimators=[("rfc", rfc_best), ("etc", etc_best), ("svc", svc_best), ("ada", ada_best), ("gbc", gbc_best)], voting="soft", n_jobs=2)
vc = vc.fit(X_train, y_train)

pred_vc = vc.predict(test)

output = test["PassengerId"].to_frame("PassengerId")
output_vc = output.assign(Survived=pred_vc)
output_vc.to_csv('output_vc.csv', index=False)