In [12]:
import pandas as pd
import numpy as np
import time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [2]:
# logloss function used for scoring the predictions
# source: https://www.kaggle.com/wiki/LogarithmicLoss
import scipy as sp
def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll

In [3]:
df_train = pd.read_pickle('../input/processed_train_data.pickle')
df_test = pd.read_pickle('../input/processed_test_data.pickle')

## Get baseline performance

In [4]:
df_train.dropna(subset=['MA200'], inplace=True)

X = np.array(df_train.drop(['game_date','shot_id','shot_made_flag'], axis=1))
y = np.array(df_train['shot_made_flag'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [7]:
num_fold = 10
num_instances = len(X)
seed = 7
scoring = 'neg_log_loss'

kfold = KFold(n_splits=num_fold, random_state=seed)
cv_results = cross_val_score(ada, X, y, 
                             cv=kfold,
                             scoring=scoring)
score = cv_results.mean()
print(score)

KeyboardInterrupt: 

## Features Selection

In [8]:
from sklearn.feature_selection import RFE
    
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2)

n_features = [20]
num_fold = 10
seed = 7
scoring = 'neg_log_loss'
best_cv = 0
best_mask = []

for n in n_features:
    rfe = RFE(ada,n)
    X_new = rfe.fit_transform(X,y)

    kfold = KFold(n_splits = num_fold,
                  random_state = seed)
    
    cv_results = cross_val_score(ada, X_new, y,
                                 cv = kfold,
                                 scoring = scoring)
    score = cv_results.mean()
    
    print('n=' + str(n) + ' : ' + str(score))
    
    if score < best_cv:
        X_best = X_new
        best_mask = rfe.get_support(False)
        best_cv = score
            
  

n=20 : -0.684988724475


In [9]:
features_masks = pd.DataFrame({
        'feature': df_train.drop(['game_date','shot_id','shot_made_flag'], axis=1).columns,
        'best': best_mask
    })
features_masks.head()

Unnamed: 0,best,feature
0,False,loc_x
1,True,loc_y
2,False,playoffs
3,True,shot_distance
4,True,seconds_to_end


In [10]:
best_features = features_masks[features_masks['best'] == True]['feature'].values
best_features

array(['loc_y', 'shot_distance', 'seconds_to_end', 'MA10', 'MA20',
       'action_type_Driving Dunk Shot',
       'action_type_Driving Finger Roll Layup Shot',
       'action_type_Driving Finger Roll Shot',
       'action_type_Driving Layup Shot',
       'action_type_Driving Reverse Layup Shot', 'action_type_Layup Shot',
       'action_type_Pullup Jump shot', 'action_type_Running Hook Shot',
       'action_type_Running Jump Shot', 'action_type_Slam Dunk Shot',
       'combined_shot_type_Bank Shot', 'combined_shot_type_Dunk',
       'combined_shot_type_Tip Shot', 'period_1', 'season_2005-06'], dtype=object)

In [5]:
# skip the features selection which takes way too long
best_features = ['loc_y', 'shot_distance', 'seconds_to_end', 'MA10', 'MA20',
                 'action_type_Driving Dunk Shot',
                 'action_type_Driving Finger Roll Layup Shot',
                 'action_type_Driving Finger Roll Shot',
                 'action_type_Driving Layup Shot',
                 'action_type_Driving Reverse Layup Shot', 'action_type_Layup Shot',
                 'action_type_Pullup Jump shot', 'action_type_Running Hook Shot',
                 'action_type_Running Jump Shot', 'action_type_Slam Dunk Shot',
                 'combined_shot_type_Bank Shot', 'combined_shot_type_Dunk',
                 'combined_shot_type_Tip Shot', 'period_1', 'season_2005-06']

In [6]:
X_best = np.array(df_train[best_features])
X_best.shape

(25538, 20)

## Parameters Optimization
### base estimator
One of the best models was Logistic Regression. We'll use this model as base estimator

In [23]:
from sklearn.model_selection import GridSearchCV

def get_best_params(model, X, y, param_grid, verbose=0):
    ''' get the parameters that return the best score
    parameters
    ----------
        model object
            a supervised learning estimator
        X    array
            training set
        y    array
            target values
        param_grid    dictionary
            the list of parameters and their possible values
    output
    ------
        best_params    dictionary
            the best value for each parameter in initial list
    '''
    
    scoring = 'neg_log_loss'
    cv = 10
    
    grid = GridSearchCV(model, 
                        param_grid,
                        scoring = scoring,
                        n_jobs = 3,
                        cv = cv,
                        verbose = verbose)
    
    grid.fit(X,y)
    
    print(grid.best_score_)
    print(grid.best_estimator_)

In [9]:
from sklearn.linear_model import LogisticRegression

In [8]:
param_grid = {'solver': ['liblinear','newton-cg','lbfgs'],
              'penalty': ['l2'],
              'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'fit_intercept': [True,False],
              'intercept_scaling':[0.001, 0.01, 0.1, 1, 10, 100, 1000]
              }

lr = LogisticRegression()

get_best_params(lr,X_best,y,param_grid)



0.695395097502
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=0.1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [10]:
lr = LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=0.1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
ada = AdaBoostClassifier(base_estimator = lr)

param_grid = {
    'n_estimators' : [10,50,100,200],
    'learning_rate' : [0.001, 0.01, 0.1, 1]
}

startime = time.time()

get_best_params(ada, X_best, y, param_grid, 10)

print('Ran in ', time.time()-strattime, 'seconds.')

Fitting 10 folds for each of 16 candidates, totalling 160 fits
[CV] learning_rate=0.001, n_estimators=10 ............................
[CV] learning_rate=0.001, n_estimators=10 ............................
[CV] learning_rate=0.001, n_estimators=10 ............................
[CV]  learning_rate=0.001, n_estimators=10, score=-0.646478, total=   1.6s
[CV] learning_rate=0.001, n_estimators=10 ............................
[CV]  learning_rate=0.001, n_estimators=10, score=-0.657826, total=   1.9s
[CV] learning_rate=0.001, n_estimators=10 ............................
[CV]  learning_rate=0.001, n_estimators=10, score=-0.655021, total=   1.8s
[CV] learning_rate=0.001, n_estimators=10 ............................


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    2.0s


[CV]  learning_rate=0.001, n_estimators=10, score=-0.658020, total=   1.8s
[CV] learning_rate=0.001, n_estimators=10 ............................
[CV]  learning_rate=0.001, n_estimators=10, score=-0.652616, total=   2.5s
[CV] learning_rate=0.001, n_estimators=10 ............................
[CV]  learning_rate=0.001, n_estimators=10, score=-0.655251, total=   2.5s
[CV] learning_rate=0.001, n_estimators=10 ............................
[CV]  learning_rate=0.001, n_estimators=10, score=-0.653411, total=   1.8s
[CV] learning_rate=0.001, n_estimators=10 ............................


[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    5.9s


[CV]  learning_rate=0.001, n_estimators=10, score=-0.655618, total=   1.6s
[CV] learning_rate=0.001, n_estimators=50 ............................
[CV]  learning_rate=0.001, n_estimators=10, score=-0.659404, total=   2.1s
[CV] learning_rate=0.001, n_estimators=50 ............................
[CV]  learning_rate=0.001, n_estimators=10, score=-0.651226, total=   1.8s
[CV] learning_rate=0.001, n_estimators=50 ............................
[CV]  learning_rate=0.001, n_estimators=50, score=-0.646701, total=   7.6s
[CV] learning_rate=0.001, n_estimators=50 ............................
[CV]  learning_rate=0.001, n_estimators=50, score=-0.657875, total=   8.3s
[CV] learning_rate=0.001, n_estimators=50 ............................


[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   15.3s


[CV]  learning_rate=0.001, n_estimators=50, score=-0.655060, total=   8.9s
[CV] learning_rate=0.001, n_estimators=50 ............................
[CV]  learning_rate=0.001, n_estimators=50, score=-0.652669, total=   7.8s
[CV] learning_rate=0.001, n_estimators=50 ............................
[CV]  learning_rate=0.001, n_estimators=50, score=-0.657920, total=   8.9s
[CV] learning_rate=0.001, n_estimators=50 ............................
[CV]  learning_rate=0.001, n_estimators=50, score=-0.655211, total=   8.4s
[CV] learning_rate=0.001, n_estimators=50 ............................
[CV]  learning_rate=0.001, n_estimators=50, score=-0.653484, total=   9.1s
[CV] learning_rate=0.001, n_estimators=50 ............................
[CV]  learning_rate=0.001, n_estimators=50, score=-0.659376, total=  10.6s
[CV] learning_rate=0.001, n_estimators=100 ...........................
[CV]  learning_rate=0.001, n_estimators=50, score=-0.655682, total=  10.7s
[CV] learning_rate=0.001, n_estimators=100 ......

[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   37.8s


[CV]  learning_rate=0.001, n_estimators=50, score=-0.651220, total=  10.3s
[CV] learning_rate=0.001, n_estimators=100 ...........................
[CV]  learning_rate=0.001, n_estimators=100, score=-0.646993, total=  17.0s
[CV] learning_rate=0.001, n_estimators=100 ...........................
[CV]  learning_rate=0.001, n_estimators=100, score=-0.657955, total=  16.9s
[CV] learning_rate=0.001, n_estimators=100 ...........................
[CV]  learning_rate=0.001, n_estimators=100, score=-0.655131, total=  17.9s
[CV] learning_rate=0.001, n_estimators=100 ...........................
[CV]  learning_rate=0.001, n_estimators=100, score=-0.652757, total=  16.6s
[CV] learning_rate=0.001, n_estimators=100 ...........................
[CV]  learning_rate=0.001, n_estimators=100, score=-0.657830, total=  16.8s
[CV] learning_rate=0.001, n_estimators=100 ...........................
[CV]  learning_rate=0.001, n_estimators=100, score=-0.655189, total=  16.8s
[CV] learning_rate=0.001, n_estimators=100 

[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  1.3min


[CV]  learning_rate=0.001, n_estimators=100, score=-0.653599, total=  16.4s
[CV] learning_rate=0.001, n_estimators=100 ...........................
[CV]  learning_rate=0.001, n_estimators=100, score=-0.659365, total=  18.3s
[CV] learning_rate=0.001, n_estimators=200 ...........................
[CV]  learning_rate=0.001, n_estimators=100, score=-0.655784, total=  20.2s
[CV] learning_rate=0.001, n_estimators=200 ...........................
[CV]  learning_rate=0.001, n_estimators=100, score=-0.651240, total=  18.1s
[CV] learning_rate=0.001, n_estimators=200 ...........................
[CV]  learning_rate=0.001, n_estimators=200, score=-0.647611, total=  35.9s
[CV] learning_rate=0.001, n_estimators=200 ...........................
[CV]  learning_rate=0.001, n_estimators=200, score=-0.658173, total=  35.9s
[CV] learning_rate=0.001, n_estimators=200 ...........................
[CV]  learning_rate=0.001, n_estimators=200, score=-0.655335, total=  36.2s
[CV] learning_rate=0.001, n_estimators=200

[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  2.9min


[CV]  learning_rate=0.001, n_estimators=200, score=-0.655233, total=  33.4s
[CV] learning_rate=0.001, n_estimators=200 ...........................
[CV]  learning_rate=0.001, n_estimators=200, score=-0.653891, total=  35.2s
[CV] learning_rate=0.001, n_estimators=200 ...........................
[CV]  learning_rate=0.001, n_estimators=200, score=-0.659420, total=  35.4s
[CV] learning_rate=0.01, n_estimators=10 .............................
[CV]  learning_rate=0.01, n_estimators=10, score=-0.646941, total=   2.6s
[CV] learning_rate=0.01, n_estimators=10 .............................
[CV]  learning_rate=0.01, n_estimators=10, score=-0.657939, total=   1.6s
[CV] learning_rate=0.01, n_estimators=10 .............................
[CV]  learning_rate=0.01, n_estimators=10, score=-0.655117, total=   2.0s
[CV] learning_rate=0.01, n_estimators=10 .............................
[CV]  learning_rate=0.001, n_estimators=200, score=-0.656051, total=  36.9s
[CV] learning_rate=0.01, n_estimators=10 .......

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  3.7min


[CV]  learning_rate=0.01, n_estimators=10, score=-0.655190, total=   2.1s
[CV] learning_rate=0.01, n_estimators=10 .............................
[CV]  learning_rate=0.01, n_estimators=10, score=-0.653577, total=   2.3s
[CV] learning_rate=0.01, n_estimators=10 .............................
[CV]  learning_rate=0.01, n_estimators=10, score=-0.659365, total=   2.3s
[CV] learning_rate=0.01, n_estimators=10 .............................
[CV]  learning_rate=0.01, n_estimators=10, score=-0.655764, total=   2.3s
[CV] learning_rate=0.01, n_estimators=50 .............................
[CV]  learning_rate=0.01, n_estimators=10, score=-0.651234, total=   2.0s
[CV] learning_rate=0.01, n_estimators=50 .............................
[CV]  learning_rate=0.01, n_estimators=50, score=-0.649556, total=   9.3s
[CV] learning_rate=0.01, n_estimators=50 .............................
[CV]  learning_rate=0.01, n_estimators=50, score=-0.659101, total=   9.7s
[CV] learning_rate=0.01, n_estimators=50 ...............

[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:  4.2min


[CV]  learning_rate=0.01, n_estimators=50, score=-0.655821, total=   9.6s
[CV] learning_rate=0.01, n_estimators=50 .............................
[CV]  learning_rate=0.01, n_estimators=50, score=-0.655051, total=  10.6s
[CV] learning_rate=0.01, n_estimators=50 .............................
[CV]  learning_rate=0.01, n_estimators=50, score=-0.660000, total=   9.0s
[CV] learning_rate=0.01, n_estimators=100 ............................
[CV]  learning_rate=0.01, n_estimators=50, score=-0.657156, total=  10.1s
[CV] learning_rate=0.01, n_estimators=100 ............................
[CV]  learning_rate=0.01, n_estimators=50, score=-0.652208, total=  11.3s
[CV] learning_rate=0.01, n_estimators=100 ............................
[CV]  learning_rate=0.01, n_estimators=100, score=-0.653127, total=  24.5s
[CV] learning_rate=0.01, n_estimators=100 ............................
[CV]  learning_rate=0.01, n_estimators=100, score=-0.661255, total=  25.7s
[CV] learning_rate=0.01, n_estimators=100 ............

[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  5.4min


[CV]  learning_rate=0.01, n_estimators=100, score=-0.657681, total=  17.3s
[CV] learning_rate=0.01, n_estimators=100 ............................
[CV]  learning_rate=0.01, n_estimators=100, score=-0.661836, total=  21.5s
[CV] learning_rate=0.01, n_estimators=200 ............................
[CV]  learning_rate=0.01, n_estimators=100, score=-0.659741, total=  22.4s
[CV] learning_rate=0.01, n_estimators=200 ............................
[CV]  learning_rate=0.01, n_estimators=100, score=-0.654594, total=  19.3s
[CV] learning_rate=0.01, n_estimators=200 ............................
[CV]  learning_rate=0.01, n_estimators=200, score=-0.659548, total=  39.3s
[CV] learning_rate=0.01, n_estimators=200 ............................
[CV]  learning_rate=0.01, n_estimators=200, score=-0.665777, total=  41.3s
[CV] learning_rate=0.01, n_estimators=200 ............................
[CV]  learning_rate=0.01, n_estimators=200, score=-0.663080, total=  39.6s
[CV] learning_rate=0.01, n_estimators=200 .......

[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:  7.9min


[CV]  learning_rate=0.1, n_estimators=10, score=-0.660943, total=   2.0s
[CV] learning_rate=0.1, n_estimators=10 ..............................
[CV]  learning_rate=0.1, n_estimators=10, score=-0.658080, total=   2.3s
[CV] learning_rate=0.1, n_estimators=10 ..............................
[CV]  learning_rate=0.1, n_estimators=10, score=-0.656161, total=   1.9s
[CV] learning_rate=0.1, n_estimators=10 ..............................
[CV]  learning_rate=0.1, n_estimators=10, score=-0.659346, total=   1.7s
[CV] learning_rate=0.1, n_estimators=10 ..............................
[CV]  learning_rate=0.01, n_estimators=200, score=-0.665079, total=  41.0s
[CV] learning_rate=0.1, n_estimators=10 ..............................
[CV]  learning_rate=0.1, n_estimators=10, score=-0.657495, total=   2.2s
[CV] learning_rate=0.1, n_estimators=10 ..............................
[CV]  learning_rate=0.1, n_estimators=10, score=-0.657308, total=   2.0s
[CV] learning_rate=0.1, n_estimators=10 .....................

[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  8.4min


[CV]  learning_rate=0.1, n_estimators=50, score=-0.671998, total=   9.7s
[CV] learning_rate=0.1, n_estimators=50 ..............................
[CV]  learning_rate=0.01, n_estimators=200, score=-0.660074, total=  39.8s
[CV] learning_rate=0.1, n_estimators=50 ..............................
[CV]  learning_rate=0.1, n_estimators=50, score=-0.673447, total=  10.1s
[CV] learning_rate=0.1, n_estimators=50 ..............................
[CV]  learning_rate=0.1, n_estimators=50, score=-0.673088, total=   7.9s
[CV] learning_rate=0.1, n_estimators=50 ..............................
[CV]  learning_rate=0.1, n_estimators=50, score=-0.672478, total=   9.8s
[CV] learning_rate=0.1, n_estimators=50 ..............................
[CV]  learning_rate=0.1, n_estimators=50, score=-0.675043, total=   8.9s
[CV] learning_rate=0.1, n_estimators=100 .............................
[CV]  learning_rate=0.1, n_estimators=50, score=-0.674917, total=   8.9s
[CV] learning_rate=0.1, n_estimators=100 ....................

[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:  9.5min


[CV]  learning_rate=0.1, n_estimators=100, score=-0.681331, total=  13.3s
[CV] learning_rate=0.1, n_estimators=200 .............................
[CV]  learning_rate=0.1, n_estimators=100, score=-0.681190, total=  15.4s
[CV] learning_rate=0.1, n_estimators=200 .............................
[CV]  learning_rate=0.1, n_estimators=100, score=-0.678609, total=  15.1s
[CV] learning_rate=0.1, n_estimators=200 .............................
[CV]  learning_rate=0.1, n_estimators=200, score=-0.683582, total=  25.6s
[CV] learning_rate=0.1, n_estimators=200 .............................
[CV]  learning_rate=0.1, n_estimators=200, score=-0.685070, total=  25.8s
[CV] learning_rate=0.1, n_estimators=200 .............................
[CV]  learning_rate=0.1, n_estimators=200, score=-0.684100, total=  26.4s
[CV] learning_rate=0.1, n_estimators=200 .............................
[CV]  learning_rate=0.1, n_estimators=200, score=-0.684457, total=  26.7s
[CV] learning_rate=0.1, n_estimators=200 ...............

[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed: 11.1min


[CV]  learning_rate=1, n_estimators=10, score=-0.679927, total=   1.4s
[CV] learning_rate=1, n_estimators=10 ................................
[CV]  learning_rate=0.1, n_estimators=200, score=-0.685980, total=  26.8s
[CV] learning_rate=1, n_estimators=10 ................................
[CV]  learning_rate=1, n_estimators=10, score=-0.679208, total=   1.4s
[CV] learning_rate=1, n_estimators=10 ................................
[CV]  learning_rate=1, n_estimators=10, score=-0.679645, total=   1.4s
[CV] learning_rate=1, n_estimators=10 ................................
[CV]  learning_rate=1, n_estimators=10, score=-0.681009, total=   1.3s
[CV] learning_rate=1, n_estimators=10 ................................
[CV]  learning_rate=1, n_estimators=10, score=-0.681171, total=   1.4s
[CV] learning_rate=1, n_estimators=50 ................................
[CV]  learning_rate=1, n_estimators=10, score=-0.678385, total=   1.4s
[CV] learning_rate=1, n_estimators=50 ................................
[CV

[Parallel(n_jobs=3)]: Done 139 tasks      | elapsed: 11.6min


[CV]  learning_rate=1, n_estimators=50, score=-0.689115, total=   6.1s
[CV] learning_rate=1, n_estimators=100 ...............................
[CV]  learning_rate=1, n_estimators=100, score=-0.690575, total=  11.4s
[CV] learning_rate=1, n_estimators=100 ...............................
[CV]  learning_rate=1, n_estimators=100, score=-0.690970, total=  11.4s
[CV] learning_rate=1, n_estimators=100 ...............................
[CV]  learning_rate=1, n_estimators=100, score=-0.690777, total=  10.3s
[CV] learning_rate=1, n_estimators=100 ...............................
[CV]  learning_rate=1, n_estimators=100, score=-0.690921, total=  10.3s
[CV] learning_rate=1, n_estimators=100 ...............................
[CV]  learning_rate=1, n_estimators=100, score=-0.691060, total=  10.2s
[CV]  learning_rate=1, n_estimators=100, score=-0.691157, total=  11.4s
[CV] learning_rate=1, n_estimators=100 ...............................
[CV] learning_rate=1, n_estimators=100 ...............................


[Parallel(n_jobs=3)]: Done 160 out of 160 | elapsed: 13.4min finished


-0.654486984878
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=0.1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          learning_rate=0.001, n_estimators=10, random_state=None)


NameError: name 'strattime' is not defined