In [14]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer
import xgboost as xgb
import time
import pickle

def train_model(X_train, y_train):
    rfc = RandomForestClassifier(class_weight='balanced', random_state=0, verbose=25)
    
    ratio_neg_pos_class = ((y_train==0).sum())/((y_train==1).sum())
#     xgboost = xgb.XGBClassifier(Eta = 0.01, Gamma = 0.05, Lambda = 0.01, Max_depth = 3,
#                                 Min_child_weight = 1, alpha = 0, random_state=0,
#                                 scale_pos_weight=ratio_neg_pos_class, verbosity=2)
    xgboost = xgb.XGBClassifier(scale_pos_weight=ratio_neg_pos_class,random_state=0, verbosity=2)
    rfc_param = {
        'n_estimators': [120, 300],
        'max_depth': [5, 8, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5, 10]
#         'Max_features': []
    }
    
    xgb_param = {
        'Eta': [0.01, 0.015, 0.025, 0.05, 0.1],
        'Gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
        'Max_depth': [3, 12, 7, 9, 12],
        'Min_child_weight': [1, 7, 3, 5, 7],
#         'Subsample'
#         'Colsample_bytree'
        'Lambda': [0.01, 1.0, 0.1, 1.0],
        'alpha': [0, 0.1, 0.5, 1.0]
    }
    
    gs_rfc = GridSearchCV(rfc, param_grid=rfc_param, cv=10,
                          scoring=make_scorer(accuracy_score), n_jobs=-1)
    gs_xgb = GridSearchCV(xgboost, param_grid=xgb_param, cv=10,
                          scoring=make_scorer(accuracy_score), n_jobs=-1)
    
    models = {
        'gs_rfc': gs_rfc,
        'gs_xgb': xgboost
    }
    model_names = {'gs_rfc': 'Random Forest', 'gs_xgb': 'XGBoost'}
    
    best_score = 0    
    for key, clf in models.items():
        start_time = time.clock()
        clf.fit(X_train, y_train)
        train_score = accuracy_score(y_train, clf.predict(X_train))
        print("Training Score:{}".format(train_score))
        print("and best params {0}".format(clf.best_params_))
        if train_score > best_score:
            best_model = clf
            best_model_name = model_names[key]
            best_score = train_score
        print("Time taken: {}\n".format(time.clock() - start_time))
    
    print("The best model is:{0} with score of {1}".format(best_model, best_score))
    file_name = "Best Model" + best_model_name + ".p"
    with open(file_name, 'wb') as f:
        pickle.dump(best_model, f)

In [12]:
if __name__ == '__main__':
    X_train = pd.read_csv('X_train.csv', index_col='PassengerId')
#     X_test = pd.read_csv('X_test.csv', index_col='PassengerId')
    y_train = pd.read_csv('y_train.csv', index_col=0, header=None)
#     y_test = pd.read_csv('y_test.csv', index_col=0, header=None)
    
    train_model(X_train, y_train.values.ravel())

Training Score:0.8496071829405163
Time taken: 0.5347067070542835

The best model is:XGBClassifier(Eta=0.01, Gamma=0.05, Lambda=0.01, Max_depth=3,
       Min_child_weight=1, alpha=0, base_score=0.5, booster='gbtree',
       colsample_bylevel=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1.605263157894737, seed=None,
       silent=True, subsample=1, verbosity=2) with score of 0.8496071829405163


  if diff:
