In [9]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import seaborn as sns

import xgboost
import xgboost as xgb
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import log_loss

from scipy.stats import randint, uniform
from sklearn.grid_search import RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials 

import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
# Load the training table, the test table and the sample submission from
# CSV files given by relative path (read in that order).
train, test, sample_submission = map(
    pd.read_csv,
    ('train_win_scores.csv', 'test2.csv', 'sample_submission.csv')
)

In [13]:
# Peek at the first row to check the column layout.
train.head(1)

Unnamed: 0.1,Unnamed: 0,year,day,team1,team2,score1,score2,target,team1_win_score,team2_win_score
0,0,2998,19,317,131,336,278,True,0.393665,0.255605


# Double the size of the training set

In [19]:
# Build a mirrored copy of every row: the two win-score columns trade places
# and the label is inverted.
# NOTE(review): team1/team2 and score1/score2 are left un-swapped, so the
# mirrored rows are only self-consistent for the win-score columns and target.
train_add = train.assign(
    team1_win_score=train['team2_win_score'],
    team2_win_score=train['team1_win_score'],
    # Inverted label: True exactly where the integer-cast label is not 1.
    target=train['target'].astype(int) != 1
)
train_add.head(1)

Unnamed: 0.1,Unnamed: 0,year,day,team1,team2,score1,score2,target,team1_win_score,team2_win_score
0,0,2998,19,317,131,336,278,False,0.255605,0.393665


In [20]:
# Stack the original rows on top of the mirrored rows. ignore_index=True gives
# the result a fresh 0..n-1 index instead of repeating every label twice.
# NOTE(review): this rebinds `train`, so re-running the cell doubles the frame again.
train = pd.concat([train, train_add], ignore_index=True)

In [21]:
# Show the first three rows of the enlarged frame.
train.head(3)

Unnamed: 0.1,Unnamed: 0,year,day,team1,team2,score1,score2,target,team1_win_score,team2_win_score
0,0,2998,19,317,131,336,278,True,0.393665,0.255605
1,1,2998,28,61,29,301,259,True,0.02589,0.107937
2,2,2998,28,110,141,359,267,True,0.070465,-0.050536


In [22]:
# (rows, columns) of `train` after the mirrored rows were appended.
train.shape

(203218, 10)

# Features

In [28]:
# The two win-score columns are the only model inputs.
features = ['team1_win_score', 'team2_win_score']
# Take an explicit copy so that later column assignments on X write to an
# independent frame rather than to a slice of `train`.
X = train[features].copy()
# Label cast to integers for the classifier.
y = train['target'].astype(int)

In [30]:
# Preview the first 20 feature rows (slice syntax works for a DataFrame and,
# once X is converted further down, for an ndarray as well).
X[:20]

Unnamed: 0,team1_win_score,team2_win_score
0,0.393665,0.255605
1,0.02589,0.107937
2,0.070465,-0.050536
3,0.165891,0.575758
4,0.344461,-0.004695
5,0.085537,0.301349
6,-0.074074,0.255172
7,0.279522,0.062201
8,0.363636,0.096573
9,0.02589,0.070465


In [None]:
# NOTE(review): this assigns the column to itself, so the values are unchanged.
# It looks like a placeholder for a transform that was never written — confirm or remove.
X['team1_win_score'] = X['team1_win_score']

In [24]:
# Convert features and labels to plain NumPy arrays.
# np.asarray accepts pandas objects and ndarrays alike, so re-running this cell
# is harmless (a second `.values` call would raise AttributeError on an ndarray).
X = np.asarray(X)
y = np.asarray(y)

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE(review): rows are shuffled individually, so a match and its mirrored copy
# can land on opposite sides of the split — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0
)

In [26]:
# Shapes of the training and hold-out feature arrays produced by the split.
X_train.shape, X_test.shape

((162574, 2), (40644, 2))

# hyperopt

In [None]:
def objective(space):
    """Hyperopt objective: fit one XGBoost classifier and score it on the hold-out split.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. The keys read here are
        'min_child_weight', 'subsample', 'colsample_bytree' and 'gamma';
        any other keys are ignored.

    Returns
    -------
    dict
        ``{'loss': <hold-out log loss>, 'status': STATUS_OK}``.
        hyperopt's ``fmin`` minimises 'loss', so the log loss is returned
        unchanged (lower is better).

    Notes
    -----
    Reads X_train, y_train, X_test and y_test from the notebook's global scope.
    """
    clf = xgb.XGBClassifier(objective = 'binary:logistic',
                            n_estimators = 1300,
                            max_depth = 10,
                            silent = 1,
                            nthread = 4,
                            scale_pos_weight = 1,
                            learning_rate = 0.1,
                            min_child_weight = space['min_child_weight'],
                            subsample = space['subsample'],
                            colsample_bytree = space['colsample_bytree'],
                            colsample_bylevel = 1.0,
                            gamma = space['gamma'],
                            seed = 27
                            )

    # Both partitions are passed to fit() for evaluation with the logloss metric.
    eval_set = [(X_train, y_train), (X_test, y_test)]

    clf.fit(X_train, y_train, eval_set=eval_set, eval_metric="logloss")

    # Probability of the positive class on the hold-out rows.
    pred = clf.predict_proba(X_test)[:, 1]
    loss = log_loss(y_test, pred)
    # Single-argument form prints identically under Python 2 and Python 3.
    print("log~loss: %s" % loss)

    # Return the log loss itself: fmin minimises this value, and returning
    # 1 - loss would make the search favour the worst-scoring configurations.
    return {'loss': loss, 'status': STATUS_OK}


# Search space sampled by hyperopt; every entry is a quantised uniform range.
# NOTE(review): 'eta' and 'alpha' are sampled here, but objective() only reads
# 'gamma', 'min_child_weight', 'subsample' and 'colsample_bytree' — confirm
# whether they should be wired into the classifier or dropped.
space = {
    'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
#     'n_estimators': hp.choice('n_estimators', range(1000, 1200)),
#     'learning_rate': hp.quniform('learning_rate', 0.09, 0.11, 0.005),
#     'max_depth': hp.choice('max_depth', range(9, 11)),
    'gamma': hp.quniform('gamma', 0.5, 1, 0.01),
    'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1),
    'subsample' : hp.quniform('subsample', 0.8, 1, 0.025),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.8, 1, 0.025),
    'alpha': hp.quniform('alpha', 0.01, 0.1, 0.01)
    }

# Trials keeps the history of evaluated points.
trials = Trials()
# NOTE(review): max_evals=1 evaluates a single sampled configuration, so no
# actual search takes place — raise it for a real tuning run.
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=1,
            trials=trials)

# Single-argument call form prints identically under Python 2 and Python 3.
print(best)

[0]	validation_0-logloss:0.688219	validation_1-logloss:0.688392
[1]	validation_0-logloss:0.683069	validation_1-logloss:0.683388
[2]	validation_0-logloss:0.678843	validation_1-logloss:0.679335
[3]	validation_0-logloss:0.675413	validation_1-logloss:0.676039
[4]	validation_0-logloss:0.670799	validation_1-logloss:0.671574
[5]	validation_0-logloss:0.667042	validation_1-logloss:0.667951
[6]	validation_0-logloss:0.663977	validation_1-logloss:0.665017
[7]	validation_0-logloss:0.660576	validation_1-logloss:0.661795
[8]	validation_0-logloss:0.657804	validation_1-logloss:0.659176
[9]	validation_0-logloss:0.654939	validation_1-logloss:0.656431
[10]	validation_0-logloss:0.652507	validation_1-logloss:0.654145
[11]	validation_0-logloss:0.650521	validation_1-logloss:0.65228
[12]	validation_0-logloss:0.64889	validation_1-logloss:0.650768
[13]	validation_0-logloss:0.646088	validation_1-logloss:0.648094
[14]	validation_0-logloss:0.64381	validation_1-logloss:0.645928
[15]	validation_0-logloss:0.641942	val