In [37]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer
import xgboost as xgb
import time
import pickle

def train_model(X_train, X_test, y_train, y_test):
    rfc = RandomForestClassifier(random_state=0, verbose=25)
    xgboost = xgb.XGBClassifier(random_state=0, verbosity=2)
    rfc_param = {
        'N_estimators': [120, 300],
        'Max_depth': [5, 8, 15],
        'Min_samples_split': [1, 2, 5, 10],
        'Min_samples_leaf': [1, 2, 5, 10]
#         'Max_features': []
    }
    
    xgb_param = {
        'Eta': [0.01, 0.015, 0.025, 0.05, 0.1],
        'Gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
        'Max_depth': [3, 12, 7, 9, 12],
        'Min_child_weight': [1, 7, 3, 5, 7],
#         'Subsample'
#         'Colsample_bytree'
        'Lambda': [0.01, 1.0, 0.1, 1.0],
        'alpha': [0, 0.1, 0.5, 1.0]
    }
    
    gs_rfc = GridSearchCV(rfc, param_grid=rfc_param, cv=10, scoring=make_scorer(accuracy_score), n_jobs=-1)
    gs_xgb = GridSearchCV(xgboost, param_grid=xgb_param, cv=10, scoring=make_scorer(accuracy_score), n_jobs=-1)
    
    models = {
#         'gs_rfc': gs_rfc,
        'gs_xgb': gs_xgb
    }
    model_names = {'gs_rfc': 'Random Forest', 'gs_xgb': 'XGBoost'}
    
    best_score = 0    
    for key, clf in models.items():
        start_time = time.clock()
        clf.fit(X_train, y_train)
        print("Training Score:{}".format(accuracy_score(y_train, clf.predict(X_train))))
        test_score = accuracy_score(y_test, clf.predict(X_test))
        print("Test Data Score:{}".format(test_score))
        print("and best params {0}".format(clf.best_params_))
        if test_score > best_score:
            best_model = clf
            best_model_name = model_names[key]
            best_score = test_score
        print("Time taken: {}\n".format(time.clock() - start_time))
    
    print("The best model is:{0} with score of {1}".format(best_model, best_score))
    file_name = "Best Model" + best_model_name + ".p"
    with open(file_name, 'wb') as f:
        pickle.dump(best_model, f)

In [38]:
if __name__ == '__main__':
    X_train = pd.read_csv('X_train.csv', index_col='PassengerId')
    X_test = pd.read_csv('X_test.csv', index_col='PassengerId')
    y_train = pd.read_csv('y_train.csv', index_col=0, header=None)
    y_test = pd.read_csv('y_test.csv', index_col=0, header=None)
    
    train_model(X_train, X_test, y_train.values.ravel(), y_test.values.ravel())

Training Score:0.8802395209580839
Test Data Score:0.8475336322869955
and best params {'Eta': 0.01, 'Gamma': 0.05, 'Lambda': 0.01, 'Max_depth': 3, 'Min_child_weight': 1, 'alpha': 0}
Time taken: 4643.962741428057

The best model is:GridSearchCV(cv=10, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, verbosity=2),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'Eta': [0.01, 0.015, 0.025, 0.05, 0.1], 'Gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0], 'Max_depth': [3, 12, 7, 9, 12], 'Min_child_weight': [1, 7, 3, 5, 7], 'Lambda': [0.01, 1.0, 0.1, 1.0], 'alpha': [0, 0.1, 0.5, 1.0]},
       pre_dispatch='2*n_jo

  if diff:
  if diff:


In [14]:
y_train.values.ravel()

array([0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,