In [1]:
# Copyright 2018 Esref Ozdemir
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Model Optimization
In this document we optimize model parameters and obtain a single, best classifier.

In [1]:
%matplotlib inline

from collections import Counter
from pprint import pprint
import pickle
import datetime
import traceback

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.linalg as la

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import f1_score, make_scorer, confusion_matrix
from sklearn.utils import shuffle

from utils import plot_confusion_matrix, plot_hbar_nameval
from utils import merge_home_away

## Get the Training Set

In [2]:
# Fixed seed for the row shuffle below, so that re-running the notebook
# produces the same sample ordering.
SEED = 42

# Load the training table and discard rows with missing values.
train = pd.read_csv('../data/train/all_train.csv').dropna().reset_index(drop=True)

# Split label and features by column name rather than by position, so the
# split stays correct even if 'eventId' is not the first column of the CSV.
y_train = train['eventId'].values
X_train = train.drop('eventId', axis=1).values

# Shuffle features and labels together (row alignment is preserved).
X_train, y_train = shuffle(X_train, y_train, random_state=SEED)

In [3]:
# List every column of the training table, one name per line.
print(*train.columns, sep='\n')

eventId
awayAvgX
awayAvgY
awayConvexCenterX
awayConvexCenterY
awayConvexClosestDistance
awayConvexFarDistance
awayConvexMaxSpeed
awayConvexMaxX
awayConvexMaxY
awayConvexMinX
awayConvexMinY
awayDenseClusterDensity
awayInnerDistance
awaySparseClusterDensity
homeAvgX
homeAvgY
homeConvexCenterX
homeConvexCenterY
homeConvexClosestDistance
homeConvexFarDistance
homeConvexMaxSpeed
homeConvexMaxX
homeConvexMaxY
homeConvexMinX
homeConvexMinY
homeDenseClusterDensity
homeInnerDistance
homeSparseClusterDensity
maxClusterImpurity
playerConvexCenterX
playerConvexCenterY
playerConvexClosestDistance
playerConvexFarDistance
playerConvexMaxSpeed
playerConvexMaxX
playerConvexMaxY
playerConvexMinX
playerConvexMinY
playerDenseClusterDensity
playerSparseClusterDensity
playerVerticalLinearity
refSpeed
refX
refY


In [4]:
# Class balance check: number of training rows for each eventId label.
Counter(y_train)

Counter({0: 2230, 60: 2884, 62: 2345, 80: 3229, 93: 1331})

## Classifier Description
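The baseline classifier is a random forest with 128 trees that is trained on all CPU cores (`n_jobs=-1`); every other parameter keeps its scikit-learn default. The full parameter list is printed below.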

In [5]:
# Baseline forest: 128 trees, n_jobs=-1; every other hyperparameter keeps
# its library default.
clf = RandomForestClassifier(n_estimators=128, n_jobs=-1)

# Tabulate the estimator's current hyperparameters as "name : value" rows.
params = clf.get_params()
max_key_len = max(map(len, params))
max_val_len = max(len(str(value)) for value in params.values())

# Both header cells are left-aligned and padded to the widest name / value.
header = ' : '.join([
    'Parameters'.ljust(max_key_len),
    'Current values'.ljust(max_val_len),
])
rows = [
    name.ljust(max_key_len) + ' : ' + str(value)
    for name, value in params.items()
]

print(header)
print('-' * len(header))
print('\n'.join(rows))

Parameters               : Current values
-----------------------------------------
bootstrap                : True
class_weight             : None
criterion                : gini
max_depth                : None
max_features             : auto
max_leaf_nodes           : None
min_impurity_split       : 1e-07
min_samples_leaf         : 1
min_samples_split        : 2
min_weight_fraction_leaf : 0.0
n_estimators             : 128
n_jobs                   : -1
oob_score                : False
random_state             : None
verbose                  : 0
warm_start               : False


## Parameters
Search space. Every possible combination will be tested by GridSearchCV.

In [6]:
# Hyperparameter search space handed to GridSearchCV: a list of grids, each
# mapping a RandomForestClassifier parameter name to its candidate values.
param_grid = [
    dict(
        criterion=['gini', 'entropy'],
        max_features=['sqrt', 'log2'],
        class_weight=[None, 'balanced', 'balanced_subsample'],
        min_samples_split=[10, 25, 35, 50],
    ),
]

In [7]:
# Count the candidates GridSearchCV will evaluate. Each dict in param_grid is
# an independent grid, so the total is the sum over grids of the product of
# that grid's option counts (not just the size of the first grid).
prod = 0
for grid in param_grid:
    grid_size = 1
    for values in grid.values():
        grid_size *= len(values)
    prod += grid_size

print('Total number of combinations: {}'.format(prod))

Total number of combinations: 48


## Scorer
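We implement our own scorer function that gives higher scores to classifiers that correctly classify important events (goal, corner, free kick, penalty). The score is a weighted sum of the per-class F1 scores, where the weights are normalized to sum to 1 and the important events are weighted five times as heavily as the remaining class.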

In [8]:
# Class labels present in the training data, as plain Python ints in
# ascending order (np.unique already returns its result sorted).
event_labels = np.unique(y_train).tolist()

# Relative importance of each class, matched to event_labels by position:
# the smallest label gets 0.5 and each of the other four labels gets 2.5.
event_weights = np.array([0.5, 2.5, 2.5, 2.5, 2.5])

# Fail early if the data does not contain exactly one label per weight;
# otherwise the mismatch would only surface inside the scorer during the
# grid search.
if len(event_weights) != len(event_labels):
    raise ValueError(
        'Expected {} event labels to match event_weights, found {}: {}'.format(
            len(event_weights), len(event_labels), event_labels
        )
    )

# L1-normalize so that the weights sum to 1.
event_weights = event_weights/la.norm(event_weights, ord=1)
print(event_labels)
print(event_weights)



def weighted_event_scorer(clf, X_test, y_test):
    """Score a fitted classifier by a weighted sum of per-class F1 scores.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_test : feature rows to predict on.
    y_test : true labels for ``X_test``.

    Returns
    -------
    The sum over classes of ``event_weights[i] * F1_i``, where ``F1_i`` is the
    F1 score of ``event_labels[i]``. Both ``event_labels`` and
    ``event_weights`` are read from the enclosing module scope.
    """
    per_class_f1 = f1_score(
        y_test,
        clf.predict(X_test),
        labels=event_labels,
        average=None,  # one F1 value per label instead of a single average
    )
    weighted = event_weights * np.asarray(per_class_f1)
    return weighted.sum()

[0, 60, 62, 80, 93]
[ 0.04761905  0.23809524  0.23809524  0.23809524  0.23809524]


## Searcher
Grid search object that does a 10-fold cross validation for each parameter configuration. Average score on a 10-fold cross validation is set as the score of that particular parameter configuration. The configuration with the highest 10-fold CV score wins.

In [9]:
# Shuffled 10-fold split used to evaluate every candidate configuration.
cross_validator = KFold(n_splits=10, shuffle=True)

# Custom scoring callable (weighted per-class F1).
scorer = weighted_event_scorer

# Exhaustive search over param_grid, run in parallel (n_jobs=-1) with
# progress messages enabled.
grid_search_cv = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=scorer,
    cv=cross_validator,
    n_jobs=-1,
    verbose=1,
)

## Search Parameter Space

In [10]:
# Pieces of the report; the final text is assembled once the run is over.
fmt = 'Training has ended in {} minutes and {} seconds'
status = 'FAILED! '      # replaced by 'SUCCESS! ' only if everything below works
train_result_msg = ''    # CV results on success, stack trace on failure

# we will measure time elapsed
t_beg = datetime.datetime.now()
try:
    # train and save the grid search cross validator
    grid_search_cv = grid_search_cv.fit(X_train, y_train)
    with open('grid_search_cv.pkl', 'wb') as f:
        pickle.dump(grid_search_cv, f)

    # build up success message with training results
    result_dic = grid_search_cv.cv_results_
    for name, content in sorted(result_dic.items()):
        train_result_msg += '{}\n{}\n'.format(name, '-'*len(name))
        train_result_msg += str(content)
        train_result_msg += '\n\n\n'

    # success
    status = 'SUCCESS! '
except Exception:
    # Catch Exception rather than everything, so that an interrupt
    # (KeyboardInterrupt / SystemExit) still stops the notebook instead of
    # being reported as an ordinary training failure.
    # send stack trace
    train_result_msg = traceback.format_exc()
finally:
    # time elapsed; total_seconds() also accounts for whole days, which the
    # .seconds attribute alone would drop on runs longer than 24 hours
    elapsed = datetime.datetime.now() - t_beg
    minutes, seconds = divmod(int(elapsed.total_seconds()), 60)

    # final message; the timing template is formatted on its own so that
    # braces inside a stack trace or result dump are never treated as
    # format fields
    msg = status + fmt.format(minutes, seconds) + '\n\n' + train_result_msg

# write the message to a file to be sent via email
with open('grid_search_results.txt', 'w') as f:
    f.write(msg + '\n')

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 42.9min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 47.6min finished


## Resulting GridSearchCV
In this section, we reload the fitted GridSearchCV object from disk, display its configuration, and extract the best model it picked.

In [11]:
# Restore the fitted search object saved by the training cell.
# NOTE(review): unpickling executes code from the file; only load a
# grid_search_cv.pkl that this notebook produced itself.
with open('grid_search_cv.pkl', 'rb') as pkl_file:
    grid_search_cv = pickle.loads(pkl_file.read())

# Display the restored object's configuration.
grid_search_cv

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=128, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'criterion': ['gini', 'entropy'], 'max_features': ['sqrt', 'log2'], 'class_weight': [None, 'balanced', 'balanced_subsample'], 'min_samples_split': [10, 25, 35, 50]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function weighted_event_scorer at 0x7f027baaeb70>,
       verbose=1)

In [12]:
# Rebind `clf` (until now the unfitted baseline forest) to the best-scoring
# estimator found by the grid search.
clf = grid_search_cv.best_estimator_

## Create the Final Model
In this section, we retrain the model with the optimal parameters using the whole training set and create our final model. Then, we pickle our model.

In [20]:
# Fit the selected model on the complete training set.
# NOTE(review): the GridSearchCV repr shows refit=True, so best_estimator_
# should already be fitted on all of X_train; with random_state=None this
# second fit presumably just produces a different random forest — confirm
# whether it is needed.
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
# Persist the final fitted forest, streaming the pickle straight into the
# file the same way the grid-search object is saved earlier.
with open('../data/random_forest.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)