# Titanic Machine Learning Solution using
# Pipelines to get incrementally better

1. Collecting Data
2. Data Exploration
3. Feature Engineering
4. Model building
5. Testing

In [238]:
# standard
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

#Scikit Learn Base
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler

# Scikit Learn tools
from sklearn.model_selection import KFold, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import learning_curve

from xgboost import XGBClassifier

# Machine Learning
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

In [19]:
df_train = pd.read_csv('input/train.csv')
df_test = pd.read_csv('input/test.csv')
combine = [df_train, df_test]
df_train.name = 'Train'
df_test.name = 'Test'

kfolds = KFold(n_splits=5, shuffle=True, random_state=14)
target = df_train.Survived

In [5]:
def get_survived(estimator, kfolds, data, target):
    scores = []
    train = data.copy()
    for i, (train_index, test_index) in enumerate(kfolds.split(target)):
        training = train.iloc[train_index,:]
        valid = train.iloc[test_index,:]
        tr_label = target.iloc[train_index]
        val_label = target.iloc[test_index]
        estimator.fit(training, tr_label)
        pred = estimator.predict(valid)
        score = accuracy_score(y_pred=pred, y_true=val_label)
        scores.append(score)
    return round(np.mean(scores),3)

def get_coef(clsf, features):
    imp = clsf.steps[1][1].coef_.tolist()
    results = pd.DataFrame({'Features':features,'Score':imp})
    results = results.sort_values(by='Score', ascending=False)
    return results

## 1. Baseline
As baseline we will predict all women survived.

In [20]:
scores = []

for i, (train_index, test_index) in enumerate(kfolds.split(target)):
    df_train['prediction'] = 0 #Initialize all predictions as Not Survived (0)
    print("Fold {} in progress".format(i))
    result = df_train['Survived'].iloc[test_index] #Get actual answers 
    df_train['prediction'].loc[df_train.Sex == 'female'] = 1 #Predict all using method
    pred = df_train['prediction'].iloc[test_index] #Get predictions for test index
    score = accuracy_score(y_pred=pred, y_true=result) #Calculate score
    scores.append(score)
    print("Score : {}".format(round(score,3)))
    print("-"*40)
    
print("Baseline: {} +- {}".format(round(np.mean(scores),3), round(np.std(scores),3)))
df_train.drop('prediction', axis=1)   
    

Fold 0 in progress
Score : 0.788
----------------------------------------
Fold 1 in progress
Score : 0.787
----------------------------------------
Fold 2 in progress
Score : 0.781
----------------------------------------
Fold 3 in progress
Score : 0.798
----------------------------------------
Fold 4 in progress
Score : 0.781
----------------------------------------
Baseline: 0.787 +- 0.006


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## 2. Basic Imputing and Dropping advanced features


In [7]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """
    Selects columns from the dataframe on which other processing is to be done
    attribute_names = list of column names to be selected
    """
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[(self.attribute_names)].values

In [8]:
class GeneralImputer(BaseEstimator, TransformerMixin):
    """
    General Imputer to impute missing values by applying certain functions on a group of columns
    col_impute - column into which values are imputed
    col_group - column groups which are taken into account
    impute_method - choose from 'median' or 'average'
    """
    def __init__(self, col_impute, col_group, impute_method='median'):
        self.col_impute = col_impute
        self.col_group = col_group
        self.impute_method = impute_method
        return None
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        df = X.copy()
        
        grouped = df.groupby(self.col_group)
        
        def imputer_median(series):
            return series.fillna(series.median())

        def imputer_average(series):
            return series.fillna(series.mean())
        
        if self.impute_method == 'median':
            df[(self.col_impute)] = grouped[self.col_impute].transform(imputer_median)
            return df
        elif self.impute_method == 'average':
            df[(self.col_impute)] = grouped[self.col_impute].transform(imputer_average)
            return df
        else:
            return np.nan
            

In [9]:
class EmbarkedImputer(BaseEstimator, TransformerMixin):
    """
    Imputes values into Embarked variable
    """
    def __init__(self): # no *args or **kargs
        return None
    def fit(self, X):
        return self  # nothing else to do
    def transform(self, X):
        # deep copy the df
        df = X.copy()
        
        # Clean up fares.
        value_to_input = df['Embarked'].mode()
        value_to_input = value_to_input[0]
        
        df.loc[(df['Embarked'].isnull()),['Embarked']] = value_to_input

        return(df)

In [21]:
train_2 = df_train.drop(['Name','Ticket','Cabin'], axis=1)
train_2.info()
train_2.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       889 non-null object
prediction     891 non-null int64
dtypes: float64(2), int64(6), object(2)
memory usage: 69.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,prediction
0,1,0,3,male,22.0,1,0,7.25,S,0
1,2,1,1,female,38.0,1,0,71.2833,C,1
2,3,1,3,female,26.0,0,0,7.925,S,1
3,4,1,1,female,35.0,1,0,53.1,S,1
4,5,0,3,male,35.0,0,0,8.05,S,0
5,6,0,3,male,,0,0,8.4583,Q,0
6,7,0,1,male,54.0,0,0,51.8625,S,0
7,8,0,3,male,2.0,3,1,21.075,S,0
8,9,1,3,female,27.0,0,2,11.1333,S,1
9,10,1,2,female,14.0,1,0,30.0708,C,1


In [22]:
from sklearn_pandas import DataFrameMapper

CAT_ATTRIBS = ['Sex','Embarked']
NUM_ATTRIBS = ['Pclass','Age','SibSp','Parch','Fare']

# map transformers on variables
my_mapper = DataFrameMapper([
    ('Sex', LabelBinarizer()),
    ('Embarked', LabelBinarizer()),
    ], input_df=True)

categorical_pipeline = Pipeline([
    ('Embarked_imputer', EmbarkedImputer()),
    ('label_binarizer_df', my_mapper),
])

numerical_pipeline = Pipeline([
    ('Fare_imputer', GeneralImputer(col_impute='Fare',col_group=['Sex','Pclass'], impute_method='median')),
    ('Age_imputer', GeneralImputer(col_impute='Age',col_group=['Sex','Pclass'], impute_method='median')),
    ('Selector', DataFrameSelector(NUM_ATTRIBS)),
    ('Scaler', StandardScaler())
])

full_pipeline = FeatureUnion(transformer_list=[
    ('Num_pipeline', numerical_pipeline),
    ('Cat_pipeline', categorical_pipeline),
])

In [23]:
train_prepared = full_pipeline.fit_transform(train_2)

In [16]:
# Random Forest Classifier - Grid Search
param_grid = [
    {
        'bootstrap':[False, True],
        'n_estimators':[80,90,100],
        'max_features':[0.6,0.7,0.8],
        'min_samples_leaf':[10,12,14],
        'min_samples_split':[3,5,7]
    },
]

rfc = RandomForestClassifier()
# grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='neg_mean_squared_error', refit=True)
grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy', refit=True)


%time grid_search.fit(train_prepared,train_2['Survived'])

#let's see the best estimator
best_rfc = grid_search.best_estimator_
print(best_rfc)
print("_"*40)
#with its score
# print(np.sqrt(-grid_search.best_score_))
print(grid_search.best_score_)
print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])

Wall time: 7min 13s
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
________________________________________
0.8316498316498316
                                                params  mean_test_score  \
0    {'bootstrap': False, 'max_features': 0.6, 'min...         0.819304   
1    {'bootstrap': False, 'max_features': 0.6, 'min...         0.823793   
2    {'bootstrap': False, 'max_features': 0.6, 'min...         0.826038   
3    {'bootstrap': False, 'max_features': 0.6, 'min...         0.820426   
4    {'bootstrap': False, 'max_features': 0.6, 'min...         0.821549   
5    {'bootstrap': False, 'max_features': 0.6,

In [17]:
best_rfc

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [43]:
# Gradient Boosting Classifier - Grid Search
# param_grid = [
#     {
#         'loss':['deviance', 'exponential'],
#         'learning_rate':[0.1, 0.2, 0.3],
#         'n_estimators':[80,90,100],
#         'max_features':[0.6,0.7,0.8],
#         'min_samples_leaf':[10,12,14],
#         'min_samples_split':[3,5,7]
#     },
# ]
param_grid = [
    {
        'loss':['exponential'],
        'learning_rate':[0.1],
        'n_estimators':[10,30,40],
        'max_features':[0.1,0.2,0.3],
        'min_samples_leaf':[7,8,9],
        'min_samples_split':[13,15,17,19]
    },
]

gbc = GradientBoostingClassifier()
grid_search = GridSearchCV(gbc, param_grid, cv=5, scoring='accuracy', refit=True)


%time grid_search.fit(train_prepared,train_2['Survived'])

#let's see the best estimator
best_gbc = grid_search.best_estimator_
print(best_gbc)
print("_"*40)
#with its score
# print(np.sqrt(-grid_search.best_score_))
print(grid_search.best_score_)
print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])

Wall time: 24.9 s
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=0.3, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=8, min_samples_split=17,
              min_weight_fraction_leaf=0.0, n_estimators=40,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
________________________________________
0.7384960718294051
                                                params  mean_test_score  \
0    {'learning_rate': 0.1, 'loss': 'exponential', ...         0.704826   
1    {'learning_rate': 0.1, 'loss': 'exponential', ...         0.728395   
2    {'learning_rate': 0.1, 'loss': 'exponential', ...         0.734007   
3    {'learning_rate': 0.1, 'loss': 'exponential', ...         0.704826   


In [44]:
best_gbc

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=0.3, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=8, min_samples_split=17,
              min_weight_fraction_leaf=0.0, n_estimators=40,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [50]:
scores_rfc = []
scores_gbc = []

for i, (train_index, test_index) in enumerate(kfolds.split(target)):
    print("Fold {} in progress".format(i))
#     print(test_index)
    result = train_2['Survived'].iloc[test_index] #Get actual answers 
    
    #Get prepared train and test
    train_2_prepared = full_pipeline.fit_transform(train_2.iloc[train_index])
#     import pdb; pdb.set_trace()
    test_2_prepared = full_pipeline.fit_transform(train_2.iloc[test_index])
    
    #Train estimator for train_index
    best_rfc.fit(train_2_prepared, train_2['Survived'].iloc[train_index])
    best_gbc.fit(train_2_prepared, train_2['Survived'].iloc[train_index])
    
    #Get predictions for test_index
    pred_rfc = best_rfc.predict(test_2_prepared)
    pred_gbc = best_gbc.predict(test_2_prepared)
  
    #Calculate score
    score_rfc = accuracy_score(y_pred=pred_rfc, y_true=result) #Calculate score
    score_gbc = accuracy_score(y_pred=pred_gbc, y_true=result) #Calculate score
    scores_rfc.append(score_rfc)
    scores_gbc.append(score_gbc)
    print("Score : {} : {}".format(round(score_rfc,3),round(score_gbc,3)))
    print("-"*40)
    
print("Baseline RFC: {} +- {}".format(round(np.mean(scores_rfc),3), round(np.std(scores_rfc),3)))
print("Baseline GBC: {} +- {}".format(round(np.mean(scores_gbc),3), round(np.std(scores_gbc),3)))


Fold 0 in progress
Score : 0.844 : 0.827
----------------------------------------
Fold 1 in progress
Score : 0.803 : 0.775
----------------------------------------
Fold 2 in progress
Score : 0.826 : 0.826
----------------------------------------
Fold 3 in progress
Score : 0.803 : 0.837
----------------------------------------
Fold 4 in progress
Score : 0.831 : 0.815
----------------------------------------
Baseline RFC: 0.822 +- 0.016
Baseline GBC: 0.816 +- 0.022


In [26]:
train_prepared = full_pipeline.fit_transform(train_2)

In [27]:
best_rfc.fit(train_prepared, train_2['Survived'])

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
test_2 = df_test.drop(['Name','Ticket','Cabin'], axis=1)
test_2.info()
test_2.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           417 non-null float64
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 26.2+ KB


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S
5,897,3,male,14.0,0,0,9.225,S
6,898,3,female,30.0,0,0,7.6292,Q
7,899,2,male,26.0,1,1,29.0,S
8,900,3,female,18.0,0,0,7.2292,C
9,901,3,male,21.0,2,0,24.15,S


In [29]:
test_prepared = full_pipeline.fit_transform(test_2)

In [30]:
test_survived = best_rfc.predict(test_prepared)

In [31]:
submission_2 = pd.DataFrame({
    'PassengerId':test_2.PassengerId,
    'Survived':test_survived
})
submission_2.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [27]:
submission_2.to_csv('submission_2.csv',index=False)

## 3. Advanced Features (Family Size)

In [32]:
class FamilySize_feature(BaseEstimator, TransformerMixin):
    """
    Imputes values into Embarked variable
    """
    def __init__(self): # no *args or **kargs
        return None
    def fit(self, X):
        return self  # nothing else to do
    def transform(self, X):
        # deep copy the df
        df = X.copy()
        df['FamilySize'] = df['Parch'] + df['SibSp']
        return(df)

In [52]:
from sklearn_pandas import DataFrameMapper

CAT_ATTRIBS = ['Sex','Embarked']
NUM_ATTRIBS = ['Pclass','Age','FamilySize','Fare']

# map transformers on variables
my_mapper = DataFrameMapper([
    ('Sex', LabelBinarizer()),
    ('Embarked', LabelBinarizer()),
    ], input_df=True)

categorical_pipeline = Pipeline([
    ('Embarked_imputer', EmbarkedImputer()),
    ('label_binarizer_df', my_mapper),
])

numerical_pipeline = Pipeline([
    ('Fare_imputer', GeneralImputer(col_impute='Fare',col_group=['Sex','Pclass'], impute_method='median')),
    ('Age_imputer', GeneralImputer(col_impute='Age',col_group=['Sex','Pclass'], impute_method='median')),
    ('FamilySize_feature_creation', FamilySize_feature()),
    ('Selector', DataFrameSelector(NUM_ATTRIBS)),
    ('Scaler', StandardScaler())
])

full_pipeline = FeatureUnion(transformer_list=[
    ('Num_pipeline', numerical_pipeline),
    ('Cat_pipeline', categorical_pipeline),
])

In [53]:
train_prepared = full_pipeline.fit_transform(train_2)
print(train_prepared)

[[ 0.82737724 -0.53489116  0.05915988 ...  0.          0.
   1.        ]
 [-1.56610693  0.66839176  0.05915988 ...  1.          0.
   0.        ]
 [ 0.82737724 -0.23407043 -0.56097483 ...  0.          0.
   1.        ]
 ...
 [ 0.82737724 -0.57249375  1.29942929 ...  0.          0.
   1.        ]
 [-1.56610693 -0.23407043 -0.56097483 ...  1.          0.
   0.        ]
 [ 0.82737724  0.21716066 -0.56097483 ...  0.          1.
   0.        ]]


In [54]:
# Random Forest Classifier - Grid Search
param_grid = [
    {
        'bootstrap':[False, True],
        'n_estimators':[80,90,100],
        'max_features':[0.6,0.7,0.8],
        'min_samples_leaf':[10,12,14],
        'min_samples_split':[3,5,7]
    },
]

rfc = RandomForestClassifier()
# grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='neg_mean_squared_error', refit=True)
grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy', refit=True)


%time grid_search.fit(train_prepared,train_2['Survived'])

#let's see the best estimator
best_rfc = grid_search.best_estimator_
print(best_rfc)
print("_"*40)
#with its score
# print(np.sqrt(-grid_search.best_score_))
print(grid_search.best_score_)
print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])

Wall time: 6min 2s
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.8, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
________________________________________
0.8294051627384961
                                                params  mean_test_score  \
0    {'bootstrap': False, 'max_features': 0.6, 'min...         0.821549   
1    {'bootstrap': False, 'max_features': 0.6, 'min...         0.821549   
2    {'bootstrap': False, 'max_features': 0.6, 'min...         0.823793   
3    {'bootstrap': False, 'max_features': 0.6, 'min...         0.820426   
4    {'bootstrap': False, 'max_features': 0.6, 'min...         0.820426   
5    {'bootstrap': False, 'max_features': 0.6, 

In [55]:
best_rfc

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.8, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [56]:
# Gradient Boosting Classifier - Grid Search
# param_grid = [
#     {
#         'loss':['deviance', 'exponential'],
#         'learning_rate':[0.1, 0.2, 0.3],
#         'n_estimators':[80,90,100],
#         'max_features':[0.6,0.7,0.8],
#         'min_samples_leaf':[10,12,14],
#         'min_samples_split':[3,5,7]
#     },
# ]
param_grid = [
    {
        'loss':['exponential'],
        'learning_rate':[0.1],
        'n_estimators':[10,30,40],
        'max_features':[0.1,0.2,0.3],
        'min_samples_leaf':[7,8,9],
        'min_samples_split':[13,15,17,19]
    },
]

gbc = GradientBoostingClassifier()
grid_search = GridSearchCV(gbc, param_grid, cv=5, scoring='accuracy', refit=True)


%time grid_search.fit(train_prepared,train_2['Survived'])

#let's see the best estimator
best_gbc = grid_search.best_estimator_
print(best_gbc)
print("_"*40)
#with its score
# print(np.sqrt(-grid_search.best_score_))
print(grid_search.best_score_)
print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])

Wall time: 37.5 s
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=0.1, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=9, min_samples_split=19,
              min_weight_fraction_leaf=0.0, n_estimators=30,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
________________________________________
0.8249158249158249
                                                params  mean_test_score  \
0    {'learning_rate': 0.1, 'loss': 'exponential', ...         0.768799   
1    {'learning_rate': 0.1, 'loss': 'exponential', ...         0.809203   
2    {'learning_rate': 0.1, 'loss': 'exponential', ...         0.815937   
3    {'learning_rate': 0.1, 'loss': 'exponential', ...         0.786756   


In [57]:
best_gbc

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=0.1, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=9, min_samples_split=19,
              min_weight_fraction_leaf=0.0, n_estimators=30,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [168]:
estimator = [
    ('RandomForestClassifier',best_rfc),
    ('GradientBoostedClassifier',best_gbc),
]

scores = [[],[]]

for i, (train_index, test_index) in enumerate(kfolds.split(target)):
    print("Fold {} in progress".format(i))
    for k in enumerate(estimator):
        result = train_2['Survived'].iloc[test_index] #Get actual answers 

        #Get prepared train and test
        train_2_prepared = full_pipeline.fit_transform(train_2.iloc[train_index])
    #     import pdb; pdb.set_trace()
        test_2_prepared = full_pipeline.fit_transform(train_2.iloc[test_index])

        #Train estimator for train_index
        estimator[k[0]][1].fit(train_2_prepared, train_2['Survived'].iloc[train_index])

        #Get predictions for test_index
        pred = estimator[k[0]][1].predict(test_2_prepared)

        #Calculate score
        score = accuracy_score(y_pred=pred, y_true=result) #Calculate score
        scores[k[0]].append(score)
        print("{} Score : {}".format(estimator[k[0]][0],round(score,3)))
    print("-"*40)
    

for k in enumerate(estimator):
    print("Baseline {}:\t {} +- {}".format(estimator[k[0]][0],round(np.mean(scores[k[0]]),3), round(np.std(scores[k[0]]),3)))


Fold 0 in progress
RandomForestClassifier Score : 0.838
GradientBoostedClassifier Score : 0.793
----------------------------------------
Fold 1 in progress
RandomForestClassifier Score : 0.792
GradientBoostedClassifier Score : 0.758
----------------------------------------
Fold 2 in progress
RandomForestClassifier Score : 0.809
GradientBoostedClassifier Score : 0.815
----------------------------------------
Fold 3 in progress
RandomForestClassifier Score : 0.831
GradientBoostedClassifier Score : 0.848
----------------------------------------
Fold 4 in progress
RandomForestClassifier Score : 0.815
GradientBoostedClassifier Score : 0.775
----------------------------------------
Baseline RandomForestClassifier:	 0.817 +- 0.016
Baseline GradientBoostedClassifier:	 0.798 +- 0.031


In [169]:
train_prepared = full_pipeline.fit_transform(train_2)

In [170]:
best_rfc.fit(train_prepared, train_2['Survived'])

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.8, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [171]:
test_2 = df_test.drop(['Name','Ticket','Cabin'], axis=1)
test_2.info()
test_2.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           417 non-null float64
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 26.2+ KB


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S
5,897,3,male,14.0,0,0,9.225,S
6,898,3,female,30.0,0,0,7.6292,Q
7,899,2,male,26.0,1,1,29.0,S
8,900,3,female,18.0,0,0,7.2292,C
9,901,3,male,21.0,2,0,24.15,S


In [172]:
test_prepared = full_pipeline.fit_transform(test_2)

In [173]:
test_survived = best_rfc.predict(test_prepared)

In [174]:
submission_3 = pd.DataFrame({
    'PassengerId':test_2.PassengerId,
    'Survived':test_survived
})
submission_3.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [175]:
submission_3.to_csv('submission_3.csv',index=False)

## 4. Advanced Features (Adjusted Fare Price)

In [254]:
class FareAdjusted_feature(BaseEstimator, TransformerMixin):
    """
    Imputes values into Embarked variable
    """
    def __init__(self): # no *args or **kargs
        return None
    def fit(self, X):
        return self  # nothing else to do
    def transform(self, X):
        # deep copy the df
        df = X.copy()
        
        df['TicketFrequency'] = train_4.groupby(['Ticket'])['Ticket'].transform('count')
#         for i in np.arange(df.shape[0]):
#             df['TicketFrequency'].iloc[i] = train_4.Ticket.loc[train_4.Ticket == train_4.Ticket.iloc[i]].count()
        
        df['FareAdj'] = df['Fare']/df['TicketFrequency']    
        return(df)

In [180]:
train_3 = df_train.drop(['Name','Cabin','prediction'], axis=1)
train_3.info()
train_3.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(3)
memory usage: 69.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,female,35.0,1,0,113803,53.1,S
4,5,0,3,male,35.0,0,0,373450,8.05,S
5,6,0,3,male,,0,0,330877,8.4583,Q
6,7,0,1,male,54.0,0,0,17463,51.8625,S
7,8,0,3,male,2.0,3,1,349909,21.075,S
8,9,1,3,female,27.0,0,2,347742,11.1333,S
9,10,1,2,female,14.0,1,0,237736,30.0708,C


In [255]:
train_4 = FareAdjusted_feature.transform(train_3, train_3)
train_4.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,TicketFrequency,FareAdj
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,S,1,7.25
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C,1,71.2833
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S,1,7.925
3,4,1,1,female,35.0,1,0,113803,53.1,S,2,26.55
4,5,0,3,male,35.0,0,0,373450,8.05,S,1,8.05
5,6,0,3,male,,0,0,330877,8.4583,Q,1,8.4583
6,7,0,1,male,54.0,0,0,17463,51.8625,S,1,51.8625
7,8,0,3,male,2.0,3,1,349909,21.075,S,4,5.26875
8,9,1,3,female,27.0,0,2,347742,11.1333,S,3,3.7111
9,10,1,2,female,14.0,1,0,237736,30.0708,C,2,15.0354


In [226]:
from sklearn_pandas import DataFrameMapper

CAT_ATTRIBS = ['Sex','Embarked']
NUM_ATTRIBS = ['Pclass','Age','FamilySize','FareAdj']

# map transformers on variables
my_mapper = DataFrameMapper([
    ('Sex', LabelBinarizer()),
    ('Embarked', LabelBinarizer()),
    ], input_df=True)

categorical_pipeline = Pipeline([
    ('Embarked_imputer', EmbarkedImputer()),
    ('label_binarizer_df', my_mapper),
])

numerical_pipeline = Pipeline([
    ('Fare_imputer', GeneralImputer(col_impute='Fare',col_group=['Sex','Pclass'], impute_method='median')),
    ('Age_imputer', GeneralImputer(col_impute='Age',col_group=['Sex','Pclass'], impute_method='median')),
    ('FamilySize_feature_creation', FamilySize_feature()),
    ('Fare_Adjustments',FareAdjusted_feature()),
    ('Selector', DataFrameSelector(NUM_ATTRIBS)),
    ('Scaler', StandardScaler())
])

full_pipeline = FeatureUnion(transformer_list=[
    ('Num_pipeline', numerical_pipeline),
    ('Cat_pipeline', categorical_pipeline),
])

In [227]:
train_prepared = full_pipeline.fit_transform(train_3)
print(train_prepared)

[[ 0.82737724 -0.53489116  0.05915988 ...  0.          0.
   1.        ]
 [-1.56610693  0.66839176  0.05915988 ...  1.          0.
   0.        ]
 [ 0.82737724 -0.23407043 -0.56097483 ...  0.          0.
   1.        ]
 ...
 [ 0.82737724 -0.57249375  1.29942929 ...  0.          0.
   1.        ]
 [-1.56610693 -0.23407043 -0.56097483 ...  1.          0.
   0.        ]
 [ 0.82737724  0.21716066 -0.56097483 ...  0.          1.
   0.        ]]


In [228]:
# Random Forest Classifier - Grid Search
param_grid = [
    {
        'bootstrap':[False, True],
        'n_estimators':[80,90,100],
        'max_features':[0.6,0.7,0.8],
        'min_samples_leaf':[10,12,14],
        'min_samples_split':[3,5,7]
    },
]

rfc = RandomForestClassifier()
# grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='neg_mean_squared_error', refit=True)
grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy', refit=True)


%time grid_search.fit(train_prepared,train_3['Survived'])

#let's see the best estimator
best_rfc = grid_search.best_estimator_
print(best_rfc)
print("_"*40)
#with its score
# print(np.sqrt(-grid_search.best_score_))
print(grid_search.best_score_)
print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])

Wall time: 4min 51s
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.6, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=12, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
________________________________________
0.8260381593714927
                                                params  mean_test_score  \
0    {'bootstrap': False, 'max_features': 0.6, 'min...         0.818182   
1    {'bootstrap': False, 'max_features': 0.6, 'min...         0.822671   
2    {'bootstrap': False, 'max_features': 0.6, 'min...         0.820426   
3    {'bootstrap': False, 'max_features': 0.6, 'min...         0.815937   
4    {'bootstrap': False, 'max_features': 0.6, 'min...         0.823793   
5    {'bootstrap': False, 'max_features': 0.6,

In [229]:
best_rfc

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.6, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=12, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [230]:
# Gradient Boosting Classifier - Grid Search
# param_grid = [
#     {
#         'loss':['deviance', 'exponential'],
#         'learning_rate':[0.1, 0.2, 0.3],
#         'n_estimators':[80,90,100],
#         'max_features':[0.6,0.7,0.8],
#         'min_samples_leaf':[10,12,14],
#         'min_samples_split':[3,5,7]
#     },
# ]
param_grid = [
    {
        'loss':['exponential'],
        'learning_rate':[0.1],
        'n_estimators':[10,30,40],
        'max_features':[0.1,0.2,0.3],
        'min_samples_leaf':[7,8,9],
        'min_samples_split':[13,15,17,19]
    },
]

gbc = GradientBoostingClassifier()
grid_search = GridSearchCV(gbc, param_grid, cv=5, scoring='accuracy', refit=True)


%time grid_search.fit(train_prepared,train_3['Survived'])

#let's see the best estimator
best_gbc = grid_search.best_estimator_
print(best_gbc)
print("_"*40)
#with its score
# print(np.sqrt(-grid_search.best_score_))
print(grid_search.best_score_)
print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])

Wall time: 30.7 s
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=0.2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=8, min_samples_split=13,
              min_weight_fraction_leaf=0.0, n_estimators=30,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
________________________________________
0.8249158249158249
                                                params  mean_test_score  \
0    {'learning_rate': 0.1, 'loss': 'exponential', ...         0.755331   
1    {'learning_rate': 0.1, 'loss': 'exponential', ...         0.809203   
2    {'learning_rate': 0.1, 'loss': 'exponential', ...         0.811448   
3    {'learning_rate': 0.1, 'loss': 'exponential', ...         0.780022   


In [231]:
best_gbc

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=0.2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=8, min_samples_split=13,
              min_weight_fraction_leaf=0.0, n_estimators=30,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [232]:
estimator = [
    ('RandomForestClassifier',best_rfc),
    ('GradientBoostedClassifier',best_gbc),
]

scores = [[],[]]

for i, (train_index, test_index) in enumerate(kfolds.split(target)):
    print("Fold {} in progress".format(i))
    for k in enumerate(estimator):
        result = train_3['Survived'].iloc[test_index] #Get actual answers 

        #Get prepared train and test
        train_3_prepared = full_pipeline.fit_transform(train_3.iloc[train_index])
    #     import pdb; pdb.set_trace()
        test_3_prepared = full_pipeline.fit_transform(train_3.iloc[test_index])

        #Train estimator for train_index
        estimator[k[0]][1].fit(train_3_prepared, train_3['Survived'].iloc[train_index])

        #Get predictions for test_index
        pred = estimator[k[0]][1].predict(test_3_prepared)

        #Calculate score
        score = accuracy_score(y_pred=pred, y_true=result) #Calculate score
        scores[k[0]].append(score)
        print("{} Score : {}".format(estimator[k[0]][0],round(score,3)))
    print("-"*40)
    

for k in enumerate(estimator):
    print("Baseline {}:\t {} +- {}".format(estimator[k[0]][0],round(np.mean(scores[k[0]]),3), round(np.std(scores[k[0]]),3)))


Fold 0 in progress
RandomForestClassifier Score : 0.827
GradientBoostedClassifier Score : 0.838
----------------------------------------
Fold 1 in progress
RandomForestClassifier Score : 0.781
GradientBoostedClassifier Score : 0.803
----------------------------------------
Fold 2 in progress
RandomForestClassifier Score : 0.798
GradientBoostedClassifier Score : 0.815
----------------------------------------
Fold 3 in progress
RandomForestClassifier Score : 0.803
GradientBoostedClassifier Score : 0.871
----------------------------------------
Fold 4 in progress
RandomForestClassifier Score : 0.792
GradientBoostedClassifier Score : 0.787
----------------------------------------
Baseline RandomForestClassifier:	 0.8 +- 0.015
Baseline GradientBoostedClassifier:	 0.823 +- 0.029


In [234]:
train_prepared = full_pipeline.fit_transform(train_3)

In [235]:
test_survived = best_gbc.predict(test_prepared)

In [236]:
submission_4 = pd.DataFrame({
    'PassengerId':test_2.PassengerId,
    'Survived':test_survived
})
submission_4.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [237]:
submission_4.to_csv('submission_4.csv',index=False)

## 5. Advanced Features (Woman Child Groups)

In [257]:
train_5 = df_train
train_5.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,prediction,Title,SexGroup,WCG,WCG_frequency
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,Mr,man,NoGroup,537
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs,woman,Cumings,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Miss,woman,Heikkinen,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,Mrs,woman,Futrelle,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,Mr,man,NoGroup,537


In [258]:
train_5['Title'] = train_5['Name'].str.extract(' ([A-Za-z]+)\.',expand=False)
train_5.Title.value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Major         2
Col           2
Mlle          2
Lady          1
Jonkheer      1
Countess      1
Don           1
Sir           1
Ms            1
Capt          1
Mme           1
Name: Title, dtype: int64

In [259]:
mapping = {'Mr':'man','Miss':'woman','Mrs':'woman','Master':'boy',
           'Dr':'man','Rev':'man','Major':'man','Col':'man','Mlle':'woman',
           'Lady':'woman','Johkheer':'man','Countess':'woman','Don':'man',
           'Sir':'man','Ms':'woman','Capt':'man','Mme':'woman'}

train_5['SexGroup'] = train_5['Title'].map(mapping)
train_5.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,prediction,Title,SexGroup,WCG,WCG_frequency
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,Mr,man,NoGroup,537
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs,woman,Cumings,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Miss,woman,Heikkinen,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,Mrs,woman,Futrelle,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,Mr,man,NoGroup,537


In [350]:
train_5['WCG'] = train_5['Name'].str.extract('([A-Za-z]+)\,',expand=False)
train_5['WCG'].loc[train_5['SexGroup'] == 'man'] = 'NoGroup'
train_5['WCG_frequency'] = train_5.groupby(['WCG'])['WCG'].transform('count')
train_5['WCG'].loc[train_5['WCG_frequency'] <= 1] = 'NoGroup'
# train_5['WCG_survived'] = train_5.groupby(['Survived','WCG'])['Survived'].transform('mean')
# train_5['WCG_survived'].value_counts()
gr = train_5.groupby(['WCG','Survived'])['Survived'].count()
# gr1 = train_5.groupby(['WCG','Survived'])['Survived'].count()/train_5['WCG_frequency']
gr

WCG         Survived
Allison     0             2
            1             1
Andersson   0             6
            1             1
Asplund     0             1
            1             3
Baclini     1             4
Barbara     0             2
Becker      1             2
Boulos      0             2
Bourke      0             2
Brown       1             3
Caldwell    1             2
Carter      0             1
            1             3
Collyer     1             2
Coutts      1             2
Doling      1             2
Ford        0             3
Fortune     1             2
Goldsmith   1             2
Goodwin     0             5
Graham      1             2
Hamalainen  1             2
Harper      1             2
Hart        1             2
Hays        1             2
Herman      1             2
Hippach     1             2
Impe        0             2
Johnson     1             3
Jussila     0             2
Kelly       1             3
Laroche     1             2
Lefebre     0             4

In [222]:
class WCG_feature(BaseEstimator, TransformerMixin):
    """
    Imputes values into Embarked variable
    """
    def __init__(self): # no *args or **kargs
        return None
    def fit(self, X):
        return self  # nothing else to do
    def transform(self, X):
        # deep copy the df
        df = X.copy()
        
        for i in np.arange(df.shape[0]):
            df['TicketFrequency'].iloc[i] = train_4.Ticket.loc[train_4.Ticket == train_4.Ticket.iloc[i]].count()
        
        df['FareAdj'] = df['Fare']/df['TicketFrequency']    
        return(df)

In [240]:
# XGBoost Classifier - Grid Search
# param_grid = [
#     {
#         'loss':['deviance', 'exponential'],
#         'learning_rate':[0.1, 0.2, 0.3],
#         'n_estimators':[80,90,100],
#         'max_features':[0.6,0.7,0.8],
#         'min_samples_leaf':[10,12,14],
#         'min_samples_split':[3,5,7]
#     },
# ]
param_grid = [
    {
        'booster':['gbtree','dart'],
        'gamma':[0,0.1,0.25,0.5,0.75,1],
        'alpha':[0,0.1,0.25,0.5,0.75,1],
    },
]

xgbc = XGBClassifier()
grid_search = GridSearchCV(xgbc, param_grid, cv=5, scoring='accuracy', refit=True)


%time grid_search.fit(train_prepared,train_3['Survived'])

#let's see the best estimator
best_xgbc = grid_search.best_estimator_
print(best_xgbc)
print("_"*40)
#with its score
# print(np.sqrt(-grid_search.best_score_))
print(grid_search.best_score_)
print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])

Wall time: 9.23 s
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.5, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
________________________________________
0.8305274971941639
                                  params  mean_test_score  std_test_score
0      {'booster': 'gbtree', 'gamma': 0}         0.824916        0.020106
1    {'booster': 'gbtree', 'gamma': 0.1}         0.823793        0.019700
2   {'booster': 'gbtree', 'gamma': 0.25}         0.828283        0.020915
3    {'booster': 'gbtree', 'gamma': 0.5}         0.830527        0.018990
4   {'booster': 'gbtree', 'gamma': 0.75}         0.829405        0.016152
5      {'booster': 'gbtree', 'gamma': 1}         0.827160        0.015090
6        {'boo