# Explore and Model StatFox Matchup Data
`mlb_bet_notebooks/model_statfox_matchups.ipynb`
- Explore features
- Convert historical moneylines to break-even probabilities
- Model pre-computed features with RF and maybe PCA
- Compare model predictions to historical moneylines
    - Use break-even probabilities as alternative model and compare ROC
- Try VIF filter
- Try k-fold CV
- Try grid search model complexity
- Try to get player salary
    - Combine with addition, subtraction from statfox blobs
- Try fix Opening Line feature 
    - Try openline probability as feature

Jonathan Sims 2020-02-24

In [186]:
import math
import boto3
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
import random
from sklearn.model_selection import train_test_split, GridSearchCV, LeaveOneOut

### Import model data

In [172]:
s3 = boto3.client('s3')

In [173]:
df_feat_fill = pd.read_csv('s3://scrapes-rawhtml-dev/statfox/20200313.statfox_features.tsv.gz', sep='\t', index_col=0)
df_targ = pd.read_csv('s3://scrapes-rawhtml-dev/statfox/20200313.statfox_target.tsv.gz', sep='\t', index_col=0, header=None, squeeze=True)
df_lateline_prob = pd.read_csv('s3://scrapes-rawhtml-dev/statfox/20200313.statfox_lateline_prob.tsv.gz', sep='\t', index_col=0, header=None, squeeze=True)

In [174]:
df_targ.shape

(17573,)

In [175]:
df_feat_fill.shape

(17573, 1127)

In [176]:
df_targ?

[0;31mType:[0m        Series
[0;31mString form:[0m
0
           0        1
           1        1
           2        0
           3        0
           4        1
           5        1
           6        1
           7        0
           8       <...>     0
           17568    1
           17569    1
           17570    1
           17571    0
           17572    1
           Name: 1, Length: 17573, dtype: int64
[0;31mLength:[0m      17573
[0;31mFile:[0m        ~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/series.py
[0;31mDocstring:[0m  
One-dimensional ndarray with axis labels (including time series).

Labels need not be unique but must be a hashable type. The object
supports both integer- and label-based indexing and provides a host of
methods for performing operations involving the index. Statistical
methods from ndarray have been overridden to automatically exclude
missing data (currently represented as NaN).

Operations between Series (+, -, /, *, **

In [177]:
### Make sure matchidx column exists

[col for col in df_feat_fill.columns if 'match' in col]

['matchidx']

In [178]:
### Partial header names to looks for unknown/erroneous columns

f = lambda x: x[2:13]
colstrip = pd.Series(df_feat_fill.columns).map(f)
[col for col in colstrip.drop_duplicates() if '_h_' not in col and '_v_' not in col]

['Bullpen_BB_',
 'Bullpen_BSV',
 'Bullpen_ERA',
 'Bullpen_ER_',
 'Bullpen_HR_',
 'Bullpen_H_A',
 'Bullpen_H_H',
 'Bullpen_IP_',
 'Bullpen_L_A',
 'Bullpen_L_H',
 'Bullpen_R_A',
 'Bullpen_R_H',
 'Bullpen_SO_',
 'Bullpen_SV_',
 'Bullpen_WHI',
 'Bullpen_W_A',
 'Bullpen_W_H',
 'HitField_Te',
 'Overall_Opp',
 'Overall_Tea',
 'Bullpen_H_R',
 'Bullpen_L_R',
 'Bullpen_R_R',
 'Bullpen_W_R',
 'tchidx',
 'Bullpen_Pct',
 '_Opening_Li',
 'nth',
 'ar',
 '_Latest_Tot',
 '_Opening_To']

### Calculate close moneyline ROC AUC

In [179]:
df_targ_dropna = df_targ[df_lateline_prob.isna() == False]
df_lateline_prob_dropna = df_lateline_prob[df_lateline_prob.isna() == False]

In [180]:
roc_auc_score(df_targ_dropna, df_lateline_prob_dropna)

0.5965750121668296

In [181]:
df_targ.shape

(17573,)

In [182]:
df_feat_fill.shape

(17573, 1127)

In [202]:
for cnt in range(0,10):

    df_feat_fill_train, df_feat_fill_test, df_targ_train, df_targ_test = train_test_split(df_feat_fill, df_targ, test_size=0.2, random_state=cnt)

#     clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=cnt, 
#                                  max_depth=4, 
#                                  min_samples_split=8)
    complexity_par = {'class_weight': 'balanced', 
                      'criterion': 'entropy', 
                      'max_depth': 2, 
                      'min_samples_split': 9, 
                      'oob_score': False}
    
    clf = RandomForestClassifier(n_estimators=50, n_jobs=-1, **complexity_par)
    
    df_fit = clf.fit(df_feat_fill_train, df_targ_train)
    df_pred = df_fit.predict(df_feat_fill_test)
    
    print(roc_auc_score(df_targ_test, df_pred))
    

0.5550098254183768
0.5693366606318605
0.5622337317085262
0.56053618588455
0.5488892694622499
0.5556903219342338
0.5647986792476228
0.5612488607538981
0.5405110317975341
0.5524151045501084


### RF with AUC and no PCA

In [82]:
df_feat_fill.shape

(17573, 1123)

## Tune Parameters

### Leave one out CV to find model complexity

In [192]:
%%time 


### Define len for subset for development

_dev_len = 15000


### Define feature and target data

X = df_feat_fill[:_dev_len].to_numpy()
y = df_targ[:_dev_len].to_numpy()


### Save number of splits for leave-one-out CV

loo = LeaveOneOut()
splits = loo.split(X)


### Grid of hyperparams to search

n_estimators = [10, 50, 100, 200]
max_depth_par = range(1,5)
min_samples_split_par = [5,7,9,11,14,17,20,30]
min_samples_leaf_par = range(1,11)
criterion_par = ['gini', 'entropy']
class_weight_par = [None, 'balanced', 'balanced_subsample']
oob_score_par = [True, False]

parameters = {'n_estimators':n_estimators_par, 
              'max_depth':max_depth_par, 
              'min_samples_split':min_samples_split_par, 
              'min_samples_leaf':min_samples_leaf_par, 
              'criterion':criterion_par, 
              'class_weight':class_weight_par, 
              'oob_score':oob_score_par}

rfc = RandomForestClassifier(n_estimators=5, n_jobs=1)
clf = GridSearchCV(rfc, parameters, n_jobs=-1, cv=3)
clf.fit(X, y)

CPU times: user 1.26 s, sys: 304 ms, total: 1.57 s
Wall time: 1min 19s


In [193]:
clf.best_params_

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 2,
 'min_samples_split': 9,
 'oob_score': False}

In [194]:
clf.score(X, y)

0.5562666666666667

In [195]:
X_val = df_feat_fill[_dev_len:].to_numpy()
y_val = df_targ[_dev_len:].to_numpy()

clf_pred = clf.predict(X_val)
roc_auc_score(y_val, clf_pred)

0.5828358742850351

In [None]:
df_feat_fill.shape

In [None]:
X.shape

In [None]:
for cnt in range(1,100,10):
#     X_train, X_test, Y_train, Y_test = train_test_split(df_feat_fill, df_targ, test_size=0.2, random_state=cnt)

    ### Standarize data

    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(df_feat_fill)
    df_feat_fill_st = pd.DataFrame(np_scaled)

    df_feat_fill_train, df_feat_fill_test, df_targ_train, df_targ_test = train_test_split(df_feat_fill_st, df_targ, test_size=0.2)

    clf = AdaBoostClassifier()
    df_fit = clf.fit(df_feat_fill_train, df_targ_train)
    df_pred = df_fit.predict(df_feat_fill_test)
    
    print(roc_auc_score(df_targ_test, df_pred))
    

## Test out PCA

### Standardize Features

In [75]:
def RunPCA(X,n):
    """Takes an input data set X and returns n principal components
    """
    # Create a scaler object
    sc = StandardScaler()
    
    # Fit the scaler to the features and transform
    X_std = sc.fit_transform(X)

    # Create a pca object with the 2 components as a parameter
    pca = decomposition.PCA(n_components=n)

    # Fit the PCA and transform the data
    X_std_pca = pca.fit_transform(X_std)
    
    return X_std_pca

In [76]:
df_feat_fill_train = df_feat_fill.iloc[:nsplit]
df_targ_train = df_targ.iloc[:nsplit]
df_feat_fill_test = df_feat_fill.iloc[nsplit:]
df_targ_test = df_targ.iloc[nsplit:]

### Practice with PCA

transform df with fit on train

### Tune n_components param

In [94]:
scores = dict()

for cnt in range(1,100,5):

    ### Create a pca object with the 2 components as a parameter
    
    pca = decomposition.PCA(n_components=50)

    df_feat_fill_train, df_feat_fill_test, df_targ_train, df_targ_test = train_test_split(df_feat_fill, df_targ, test_size=0.1, random_state=1)

    X1 = df_feat_fill_train
    X2 = df_feat_fill_test

    ### Create a scaler object
    
    sc = StandardScaler()

    ### Fit the scaler to the features and transform
    
    X1_std = sc.fit_transform(X1)
    X2_std = sc.fit(X1).transform(X2)

    ### Fit the PCA and transform the data
    
    X1_std_pca = pca.fit_transform(X1_std)
    X2_std_pca = pca.fit(X1_std).transform(X2_std)

    std_pca_train = X1_std_pca
    std_pca_df = X2_std_pca

    clf = AdaBoostClassifier(n_estimators=100, random_state=1)
    df_fit = clf.fit(std_pca_train, df_targ_train)
    df_pred = df_fit.predict(std_pca_df)
    score = roc_auc_score(df_targ_test, df_pred)
    
    ### Append score to dict
    
    scores[cnt] = score
    
    ### Print for OCD
    print(cnt,' ',score)

1   0.5406012163200662
6   0.5254810861839616
11   0.5257931631733549
16   0.5242600849629604
21   0.5194033868155274
26   0.5261052401627482
31   0.5394582343464133
36   0.5252879885467745
41   0.5288456662258579
46   0.530548436299235
51   0.512311437231565
56   0.5231171029893075
61   0.5362496927992135
66   0.5289373388414922
71   0.5312876686678604
76   0.5225846216261553
81   0.5338584028679876
86   0.5248198730626845
91   0.5148587656574877
96   0.531898169278361


In [None]:
RandomForestClassifier?

In [None]:
pd.DataFrame([clf.feature_importances_, df_feat_fill_train.columns]).transpose().sort_values

In [None]:
pd.DataFrame([clf.feature_importances_, df_feat_fill_train.columns]).transpose.sort

In [None]:
df_feat_fill_desc = df_feat_fill_train.describe().loc[['mean', 'std']]

In [None]:
df_feat_fill_desc

In [None]:
for x in df_feat_fill_desc.columns:
    print(df_feat_fill_desc[x])

In [None]:
scores = dict()
for cnt in range(4):
    scores[cnt] = cnt*4

In [None]:
scores