# Explore and Model StatFox Matchup Data
`mlb_bet_notebooks/model_statfox_matchups.ipynb`
- Explore features
- Convert historical moneylines to break-even probabilities
- Model pre-computed features with RF and maybe PCA
- Compare model predictions to historical moneylines
    - Use break-even probabilities as alternative model and compare ROC
- Try VIF filter
- Try k-fold CV
- Try grid search model complexity
- Try to get player salary
    - Combine with addition, subtraction from statfox blobs
Jonathan Sims 2020-02-24

In [43]:
import math
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing

In [2]:
file_in1 = '20200310.skr_statfox.2010-2018.0.7400.tsv.gz'
file_in2 = '20200310.skr_statfox.2010-2018.7400.10000.tsv.gz'
file_in3 = '20200310.skr_statfox.2010-2018.10000.12000.tsv.gz'
file_in4 = '20200310.skr_statfox.2010-2018.12000.16000.tsv.gz'
file_in5 = '20200310.skr_statfox.2010-2018.16000.20600.tsv.gz'
file_in6 = '20200310.skr_statfox.2010-2018.20600.tsv.gz'

### Import features, moneylines, target

In [3]:
### Import chunks and append

df1 = pd.read_csv(file_in1, sep='\t', low_memory=False)
df2 = pd.read_csv(file_in2, sep='\t', low_memory=False)
df3 = pd.read_csv(file_in3, sep='\t', low_memory=False)
df4 = pd.read_csv(file_in4, sep='\t', low_memory=False)
df5 = pd.read_csv(file_in5, sep='\t', low_memory=False)
df6 = pd.read_csv(file_in6, sep='\t', low_memory=False)

df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=False, sort=True)

In [4]:
### Clear useless dataframes

df1 = []
df2 = []
df3 = []
df4 = []
df5 = []
df6 = []

In [5]:
### Give proper name to the match index

df['matchidx'] = df['Unnamed: 0']
df = df.drop(['Unnamed: 0'], axis=1)

In [6]:
df.shape

(17632, 590)

In [7]:
df.tail()

Unnamed: 0,H_Bullpen_BB_AllGames,H_Bullpen_BB_HomeGames,H_Bullpen_BSV_AllGames,H_Bullpen_BSV_HomeGames,H_Bullpen_ERA_AllGames,H_Bullpen_ERA_HomeGames,H_Bullpen_ER_AllGames,H_Bullpen_ER_HomeGames,H_Bullpen_HR_AllGames,H_Bullpen_HR_HomeGames,...,V__Latest_Line,V__Latest_Total,V__Opening_Line,V__Opening_Total,sc_h,sc_v,tm_h,tm_v,win_h,matchidx
1262,221,105,27,10,4.51,4.22,303,142,89,39,...,120,"Ov 9,-105",135,"Ov 9,-110",5,4,MINNESOTA,CHIWHITESOX,1,20180930926925
1263,174,94,21,12,3.79,3.58,233,114,67,42,...,155,"Ov 8.5,-110",155,"Ov 8.5,-120",3,1,SEATTLE,TEXAS,1,20180930922921
1264,285,139,18,7,3.81,3.25,347,156,88,44,...,150,"Ov 8,-115",150,"Ov 8,-115",9,4,TAMPABAY,TORONTO,1,20180930924923
1265,273,137,18,10,3.34,3.76,217,121,51,29,...,115,"Ov 8,-115",115,"Ov 8,-115",1,3,CHICAGOCUBS,MILWAUKEE,0,20181001952951
1266,198,85,26,11,3.7,3.77,238,117,73,39,...,150,"Ov 7,-105",155,"Ov 7,-105",5,2,LADODGERS,COLORADO,1,20181001954953


### Clean up OU Totals, OSB, and DP features
- This should really be in skr_statfox_matchups.ipynb 

In [8]:
### Split by a comma then drop Ov and Un text

ou_cols = [col for col in df.columns if '_Total' in col]

if ou_cols != []:
    for x in ou_cols:
        xO = x.replace('_Total', '_Tot')
        xU = x.replace('_Total', '_TotLn')
        
        df[[xO,xU]] = df[x].str.split(",", expand=True)    
        df[xO] = df[xO].str.replace("Ov ","")
        df[xO] = df[xO].str.replace("Un ","")
        df = df.drop(x, axis=1)
    
new_ou_cols = [col for col in df.columns if '_Tot' in col]    
df[new_ou_cols].head()

Unnamed: 0,H__Latest_Tot,H__Latest_TotLn,H__Opening_Tot,H__Opening_TotLn,V__Latest_Tot,V__Latest_TotLn,V__Opening_Tot,V__Opening_TotLn
0,8.5,-105,8.5,-105,8.5,-115,8.5,-115
1,9.0,-115,9.0,-115,9.0,-105,9.0,-105
2,7.0,-110,7.0,-110,7.0,-110,7.0,-110
3,7.0,105,7.0,105,7.0,-125,7.0,-125
4,9.0,-115,9.5,-120,9.0,-105,9.5,100


In [9]:
### Remove parantheses from numeric values

osb_cols = [col for col in df.columns if '_OSB_' in col]
dp_cols = [col for col in df.columns if '_DP_' in col]
paren_cols = osb_cols + dp_cols

if paren_cols != []:
    for x in paren_cols:
        df[x] = df[x].str.replace("(","")
        df[x] = df[x].str.replace(")","")
        
df[paren_cols].head()        

Unnamed: 0,H_HitField_TeamFielding_OSB_AllGames,H_HitField_TeamFielding_OSB_HomeGames,H_HitField_TeamFielding_OSB_LeftyStarters,H_HitField_TeamFielding_OSB_RightyStarters,V_HitField_TeamFielding_OSB_AllGames,V_HitField_TeamFielding_OSB_LeftyStarters,V_HitField_TeamFielding_OSB_RightyStarters,V_HitField_TeamFielding_OSB_RoadGames,H_HitField_TeamFielding_DP_AllGames,H_HitField_TeamFielding_DP_HomeGames,H_HitField_TeamFielding_DP_LeftyStarters,H_HitField_TeamFielding_DP_RightyStarters,V_HitField_TeamFielding_DP_AllGames,V_HitField_TeamFielding_DP_LeftyStarters,V_HitField_TeamFielding_DP_RightyStarters,V_HitField_TeamFielding_DP_RoadGames
0,129,64,,90,104,,70.0,37,182,94,,114,138,,77.0,67
1,124,66,,88,111,31.0,,58,157,77,,103,170,32.0,,82
2,108,47,,78,86,,61.0,45,160,98,,96,164,,112.0,88
3,86,40,,59,65,,46.0,26,155,78,,104,150,,98.0,68
4,97,50,,62,87,,58.0,48,168,97,,109,168,,112.0,83


In [10]:
### Remove percent signs

perc = [col for col in df.columns if '_Pct_' in col]

if perc != []:
    for x in perc:
        df[x] = df[x].str.replace("%","")
        df[x] = df[x].str.replace(" ","")

df[perc].head()

Unnamed: 0,H_Bullpen_Pct_AllGames,H_Bullpen_Pct_HomeGames,V_Bullpen_Pct_AllGames,V_Bullpen_Pct_RoadGames
0,76.1,73.5,73.8,79.3
1,66.7,68.0,58.1,53.6
2,60.7,57.7,65.6,58.1
3,76.0,77.3,63.6,60.0
4,78.9,82.8,62.5,68.4


### Dedupe on matchup index (YYYYMMDDHHHVVV)

In [11]:
df = df.drop_duplicates(['matchidx'])

In [12]:
df.shape

(17573, 594)

### Separate features and targets

In [13]:
lines = [col for col in df.columns if 'Latest_Line' in col]
teams = ['tm_h', 'tm_v']
scores = ['sc_h','sc_v']
feats = ['win_h']
# drop = pd.concat([lines, totals, scores, feats])
drop = lines+scores+feats
drop

['H__Latest_Line', 'V__Latest_Line', 'sc_h', 'sc_v', 'win_h']

In [14]:
df_feat = df.drop(drop, axis=1).reset_index(drop=True)
df_targ = pd.to_numeric(df['win_h'], errors='coerce').reset_index(drop=True)
df_openline = pd.to_numeric(df['H__Opening_Line'], errors='coerce').reset_index(drop=True)
df_lateline = pd.to_numeric(df['H__Latest_Line'], errors='coerce').reset_index(drop=True)

### Add year and month variable

In [15]:
def get_month(x):
    """Take match index YYYYMMDDHHHVVV and return the month of game
    """
    flr = math.floor(x/100000000)
    flrmod = flr%100
    return str(flrmod)

def get_year(x):
    """Take match index YYYYMMDDHHHVVV and return the year of game
    """
    flr = math.floor(x/10000000000)
    return str(flr)

In [16]:
df_feat['month'] = df_feat['matchidx'].apply(get_month)
df_feat['year'] = df_feat['matchidx'].apply(get_year)

In [17]:
df_feat.shape

(17573, 591)

In [18]:
df_feat[['matchidx','year','month']].head()

Unnamed: 0,matchidx,year,month
0,20100405924923,2010,4
1,20100405918917,2010,4
2,20100405922921,2010,4
3,20100405926925,2010,4
4,20100405920919,2010,4


### Final Clean
- Try to convert object to numeric
- If except: convert object to binary dummies

In [19]:
def preprocess_tonumeric(x):
    """Clean features to get everything numeric.
    1) Strip percent sign, space, and commas
    2) Convert objects to numeric if possible
    3) Else, convert objects to dummies
    """
#     output = pd.DataFrame()
    
    for col, col_data in x.iteritems():
        
        if col_data.dtype == object:
        
            try:
                col_data = pd.to_numeric(col_data)
                x = x.drop(col, axis=1)
                
            except:
                col_data = pd.get_dummies(col_data, prefix=col)
                x = x.drop(col, axis=1)
            
        x = pd.concat([x, col_data], axis=1)
    
    ### Keep first of all columns then drop duplicates

    Cols = list(x.columns)
    for i,item in enumerate(x.columns):
        if item in x.columns[:i]: Cols[i] = "toDROP"
    x.columns = Cols
    x = x.drop("toDROP",1)
    
    return x

### Impute missing values and create dummy for missing

In [20]:
# def preprocess_nan(x, fill_strategy='mean'):
def preprocess_nan(z):
    """Process NaNs in a dataframe to prepare for RF or other model.
    1) Creates dummy columns for each existing column with > 1 NaN
    2) Fill NaN in existing column with desired strategy (see scikit SimpleImputer)
    """
    for col in z.columns:
        if z[col].isnull().any().any():
            nancol = col+'_NaN'
            z[nancol] = [1 if s == True else 0 for s in z[col].isnull()]
        
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    x_out = pd.DataFrame(imp.fit_transform(z), columns=z.columns)
    
    return x_out

In [21]:
df_feat_num = preprocess_tonumeric(df_feat)

In [22]:
df_feat_num[:2]

Unnamed: 0,H_Bullpen_BB_AllGames,H_Bullpen_BB_HomeGames,H_Bullpen_BSV_AllGames,H_Bullpen_BSV_HomeGames,H_Bullpen_ERA_AllGames,H_Bullpen_ERA_HomeGames,H_Bullpen_ER_AllGames,H_Bullpen_ER_HomeGames,H_Bullpen_HR_AllGames,H_Bullpen_HR_HomeGames,...,H__Latest_Tot,H__Latest_TotLn,H__Opening_Tot,H__Opening_TotLn,V__Latest_Tot,V__Latest_TotLn,V__Opening_Tot,V__Opening_TotLn,month,year
0,228,117,17,9,4.42,4.69,253,138,56,30,...,8.5,-105.0,8.5,-105.0,8.5,-115.0,8.5,-115.0,4,2010
1,190,107,18,8,4.06,4.41,212,126,49,25,...,9.0,-115.0,9.0,-115.0,9.0,-105.0,9.0,-105.0,4,2010


In [23]:
df_feat_num.shape

(17573, 897)

In [24]:
df.shape

(17573, 594)

In [25]:
df_feat_fill = preprocess_nan(df_feat_num)
df_feat_fill[:2]

Unnamed: 0,H_Bullpen_BB_AllGames,H_Bullpen_BB_HomeGames,H_Bullpen_BSV_AllGames,H_Bullpen_BSV_HomeGames,H_Bullpen_ERA_AllGames,H_Bullpen_ERA_HomeGames,H_Bullpen_ER_AllGames,H_Bullpen_ER_HomeGames,H_Bullpen_HR_AllGames,H_Bullpen_HR_HomeGames,...,V_HitField_TeamFielding_OSB_RightyStarters_NaN,V_HitField_TeamFielding_OSB_RoadGames_NaN,H__Latest_Tot_NaN,H__Latest_TotLn_NaN,H__Opening_Tot_NaN,H__Opening_TotLn_NaN,V__Latest_Tot_NaN,V__Latest_TotLn_NaN,V__Opening_Tot_NaN,V__Opening_TotLn_NaN
0,228.0,117.0,17.0,9.0,4.42,4.69,253.0,138.0,56.0,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,190.0,107.0,18.0,8.0,4.06,4.41,212.0,126.0,49.0,25.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Convert open and close moneylines to probability

In [26]:
def american_to_probability(x):
    """Turns American +/- odds into probability 0 to 1 exclusive
    NOTE: Returns 0 if missing since 0 and 1 are impossible from lines
    """
    if x < -99:
        num = abs(x)
        pr = num/(100+num)
        return pr

    elif x >= 100:
        num = x
        pr = 100/(100+num)
        return pr
    
    elif x == None:
        pr = 0

#     else:
#         print('Error: No sign found in betting line string')

### Calculate open and close moneyline ROC AUC

In [27]:
df_lateline.shape

(17573,)

In [28]:
df_lateline.head()

0   -130.0
1   -160.0
2   -110.0
3    120.0
4   -140.0
Name: H__Latest_Line, dtype: float64

In [29]:
df_lateline_prob = df_lateline.apply(american_to_probability)

df_lateline_prob = df_lateline_prob[lambda x: (x > 0) & (x < 1)]

In [30]:
df_targ_keeps = df_targ[df_lateline_prob.index]

roc_auc_score(df_targ_keeps, df_lateline_prob)

0.5965750121668296

In [31]:
df_targ.shape

(17573,)

In [32]:
df_targ_keeps.shape

(17552,)

In [33]:
df_lateline_prob.shape

(17552,)

In [34]:
df_openline_prob = df_openline.apply(american_to_probability)

keeps = df_openline_prob[lambda x: (0 <= x) & (x <= 1)]

df_openline_prob = df_openline_prob[keeps.index]

df_targ_keeps = df_targ[keeps.index]

roc_auc_score(df_targ_keeps, df_openline_prob)

0.5912541455140431

### RF with AUC and no PCA

In [35]:
df_feat_fill.shape

(17573, 1363)

## Tune Parameters

### Leave one out CV to find model complexity

In [None]:
### Define len for subset for development
_dev_len = 2000

### Define feature and target data

X = df_feat_fill[:_dev_len].to_numpy()
y = df_targ[:_dev_len].to_numpy()


### Save number of splits for leave-one-out CV

loo = LeaveOneOut()
splits = loo.split(X)

### Grid of hyperparams to search

parameters = {'max_depth': [1,20,50,100], 'min_samples_split': [2,5,10,20,30,50]}
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf = GridSearchCV(rfc, parameters, n_jobs=-1, cv=splits)
clf.fit(X, y)



### Create splits

In [57]:
clf.best_params_

{'max_depth': 1, 'min_samples_split': 30}

In [58]:
clf.score(X, y)

0.5415

In [40]:
X_train.shape

(17572, 1363)

In [96]:
for cnt in range(1,100,10):
#     X_train, X_test, Y_train, Y_test = train_test_split(df_feat_fill, df_targ, test_size=0.2, random_state=cnt)

    ### Standarize data

    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(df_feat_fill)
    df_feat_fill_st = pd.DataFrame(np_scaled)

    df_feat_fill_train, df_feat_fill_test, df_targ_train, df_targ_test = train_test_split(df_feat_fill_st, df_targ, test_size=0.2)

    clf = AdaBoostClassifier()
    df_fit = clf.fit(df_feat_fill_train, df_targ_train)
    df_pred = df_fit.predict(df_feat_fill_test)
    
    print(roc_auc_score(df_targ_test, df_pred))
    

0.531850700968348
0.5253301793810368
0.5339395051700977
0.5286288962759551
0.541519634963862
0.5108677676766884
0.5377828569796435
0.5361884654914358
0.5375349355181782
0.5227284219288163


In [84]:
for cnt in range(0,10):

    df_feat_fill_train, df_feat_fill_test, df_targ_train, df_targ_test = train_test_split(df_feat_fill, df_targ, test_size=0.2, random_state=cnt)

    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=cnt)
    df_fit = clf.fit(df_feat_fill_train, df_targ_train)
    df_pred = df_fit.predict(df_feat_fill_test)
    
    print(roc_auc_score(df_targ_test, df_pred))
    

0.512822626163314
0.5273800786155829
0.5164933069315196
0.5252343658877168
0.5261184779240267
0.5283997236213925
0.5191845548465334
0.5174868588530961
0.5170416937053861
0.5225325038238827


## Test out PCA

### Standardize Features

In [75]:
def RunPCA(X,n):
    """Takes an input data set X and returns n principal components
    """
    # Create a scaler object
    sc = StandardScaler()
    
    # Fit the scaler to the features and transform
    X_std = sc.fit_transform(X)

    # Create a pca object with the 2 components as a parameter
    pca = decomposition.PCA(n_components=n)

    # Fit the PCA and transform the data
    X_std_pca = pca.fit_transform(X_std)
    
    return X_std_pca

In [76]:
df_feat_fill_train = df_feat_fill.iloc[:nsplit]
df_targ_train = df_targ.iloc[:nsplit]
df_feat_fill_test = df_feat_fill.iloc[nsplit:]
df_targ_test = df_targ.iloc[nsplit:]

### Practice with PCA

transform df with fit on train

### Tune n_components param

In [94]:
scores = dict()

for cnt in range(1,100,5):

    ### Create a pca object with the 2 components as a parameter
    
    pca = decomposition.PCA(n_components=50)

    df_feat_fill_train, df_feat_fill_test, df_targ_train, df_targ_test = train_test_split(df_feat_fill, df_targ, test_size=0.1, random_state=1)

    X1 = df_feat_fill_train
    X2 = df_feat_fill_test

    ### Create a scaler object
    
    sc = StandardScaler()

    ### Fit the scaler to the features and transform
    
    X1_std = sc.fit_transform(X1)
    X2_std = sc.fit(X1).transform(X2)

    ### Fit the PCA and transform the data
    
    X1_std_pca = pca.fit_transform(X1_std)
    X2_std_pca = pca.fit(X1_std).transform(X2_std)

    std_pca_train = X1_std_pca
    std_pca_df = X2_std_pca

    clf = AdaBoostClassifier(n_estimators=100, random_state=1)
    df_fit = clf.fit(std_pca_train, df_targ_train)
    df_pred = df_fit.predict(std_pca_df)
    score = roc_auc_score(df_targ_test, df_pred)
    
    ### Append score to dict
    
    scores[cnt] = score
    
    ### Print for OCD
    print(cnt,' ',score)

1   0.5406012163200662
6   0.5254810861839616
11   0.5257931631733549
16   0.5242600849629604
21   0.5194033868155274
26   0.5261052401627482
31   0.5394582343464133
36   0.5252879885467745
41   0.5288456662258579
46   0.530548436299235
51   0.512311437231565
56   0.5231171029893075
61   0.5362496927992135
66   0.5289373388414922
71   0.5312876686678604
76   0.5225846216261553
81   0.5338584028679876
86   0.5248198730626845
91   0.5148587656574877
96   0.531898169278361


In [89]:
RandomForestClassifier?

[0;31mInit signature:[0m [0mRandomForestClassifier[0m[0;34m([0m[0mn_estimators[0m[0;34m=[0m[0;34m'warn'[0m[0;34m,[0m [0mcriterion[0m[0;34m=[0m[0;34m'gini'[0m[0;34m,[0m [0mmax_depth[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mmin_samples_split[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m [0mmin_samples_leaf[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0mmin_weight_fraction_leaf[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m [0mmax_features[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m [0mmax_leaf_nodes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mmin_impurity_decrease[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m [0mmin_impurity_split[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mbootstrap[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0moob_score[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m [0mn_jobs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mwarm_start[0m[

In [None]:
pd.DataFrame([clf.feature_importances_, df_feat_fill_train.columns]).transpose().sort_values

In [None]:
pd.DataFrame([clf.feature_importances_, df_feat_fill_train.columns]).transpose.sort

In [None]:
df_feat_fill_desc = df_feat_fill_train.describe().loc[['mean', 'std']]

In [None]:
df_feat_fill_desc

In [None]:
for x in df_feat_fill_desc.columns:
    print(df_feat_fill_desc[x])

In [None]:
scores = dict()
for cnt in range(4):
    scores[cnt] = cnt*4

In [None]:
scores