# Explore and Model StatFox Matchup Data
`mlb_bet_notebooks/model_statfox_matchups.ipynb`
- Explore features
- Convert historical moneylines to break-even probabilities
- Model pre-computed features with RF and maybe PCA
- Compare model predictions to historical moneylines
    - Use break-even probabilities as alternative model and compare ROC
Jonathan Sims 2020-02-24

In [19]:
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler

In [6]:
file_in = '20200222.skr_statfox.2018.tsv.gz'

#### Import features, moneylines, target

In [7]:
df = pd.read_csv(file_in, sep='\t')

In [9]:
#### Separate features and targets

lines = [col for col in df.columns if '_Line' in col]
totals = [col for col in df.columns if '_Total' in col]
percs = [col for col in df.columns if 'Pct' in col]
teams = ['tm_h', 'tm_v']
scores = ['sc_h','sc_v']
feats = ['win_h']
# drop = pd.concat([lines, totals, scores, feats])
drop = lines+totals+scores+feats+percs+teams
drop

['H__Latest_Line',
 'H__Opening_Line',
 'V__Latest_Line',
 'V__Opening_Line',
 'H__Latest_Total',
 'H__Opening_Total',
 'V__Latest_Total',
 'V__Opening_Total',
 'sc_h',
 'sc_v',
 'win_h',
 'H_Bullpen_Pct_AllGames',
 'H_Bullpen_Pct_HomeGames',
 'V_Bullpen_Pct_AllGames',
 'V_Bullpen_Pct_RoadGames',
 'tm_h',
 'tm_v']

In [10]:
df_feat = df.drop(drop, axis=1)
df_targ = df['win_h']
df_openline = df['H__Opening_Line']
df_lateline = df['H__Latest_Line']

In [11]:
def american_to_probability(x):
    """
    Turns American +/- odds into probability 0 to 1 inclusive
    """
    if x.find('-') > -1 and x.find('+') > -1:
        print('Error: + and - signs found in betting line string') 

    elif x.find('-') > -1:
        num = int(x.replace(' ','').replace('-',''))
        if num < 100:
            print('Error: Betting line outside bounds [100,+inf]')
        else:
            pr = num/(100+num)
            return pr

    elif x.find('+') > -1:
        num = int(x.replace(' ','').replace('+',''))
        if num < 100 or num > 999:
            print('Error: Betting line outside bounds [100,999]')
        else:
            pr = 100/(100+num)
            return pr

#     else:
#         print('Error: No sign found in betting line string')

#### Calculate open and close moneyline ROC AUC

In [12]:
df_lateline_prob = df_lateline.apply(american_to_probability)

keeps = df_lateline_prob[lambda x: (0 <= x) & (x <= 1)]

df_lateline_prob = df_lateline_prob[keeps.index]

df_targ_keeps = df_targ[keeps.index]

roc_auc_score(df_targ_keeps, df_lateline_prob)

0.6200886381840088

In [13]:
df_openline_prob = df_openline.apply(american_to_probability)

keeps = df_openline_prob[lambda x: (0 <= x) & (x <= 1)]

df_openline_prob = df_openline_prob[keeps.index]

df_targ_keeps = df_targ[keeps.index]

roc_auc_score(df_targ_keeps, df_openline_prob)

0.616253699511644

#### Test RF with AUC and no preprocessing

In [23]:
with pd.option_context('mode.use_inf_as_null', True):
    df_feat_clean = df_feat.fillna(value=0)

In [52]:
n_split = 2000

In [56]:
df_feat_train = df_feat_clean.iloc[:nsplit]
df_targ_train = df_targ.iloc[:nsplit]
df_feat_test = df_feat_clean.iloc[nsplit:]
df_targ_test = df_targ.iloc[nsplit:]

In [57]:
clf = RandomForestClassifier(n_estimators=1000)
df_fit = clf.fit(df_feat_train, df_targ_train)
df_pred = df_fit.predict(df_feat_test)
roc_auc_score(df_targ_test, df_pred)

0.5366005097928149

#### Examine overall features

In [17]:
df_feat[df_feat.columns[10:30]].describe()

Unnamed: 0,H_Bullpen_HR_HomeGames,H_Bullpen_H_AllGames,H_Bullpen_H_HomeGames,H_Bullpen_IP_AllGames,H_Bullpen_IP_HomeGames,H_Bullpen_L_AllGames,H_Bullpen_L_HomeGames,H_Bullpen_R_AllGames,H_Bullpen_R_HomeGames,H_Bullpen_SO_AllGames,H_Bullpen_SO_HomeGames,H_Bullpen_SV_AllGames,H_Bullpen_SV_HomeGames,H_Bullpen_WHIP_AllGames,H_Bullpen_WHIP_HomeGames,H_Bullpen_W_AllGames,H_Bullpen_W_HomeGames,H_HitField_TeamBatting_2B_AllGames,H_HitField_TeamBatting_2B_HomeGames,H_HitField_TeamBatting_2B_LeftyStarters
count,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2430.0,2415.0,744.0
mean,17.655967,273.954733,138.768724,295.671317,152.12823,14.071605,6.412346,143.340741,71.263374,294.634979,149.693827,21.585597,10.211523,1.319481,1.282777,15.108642,8.495062,132.513169,65.813251,38.63172
std,11.335818,158.823509,80.419178,168.19769,87.629249,8.568219,4.357663,86.203069,43.047828,167.428139,87.290277,12.938555,6.747864,0.134223,0.177452,9.606702,5.824009,74.786869,37.993496,22.949564
min,0.0,6.0,0.0,10.0,0.0,0.0,0.0,2.0,0.0,9.0,0.0,0.0,0.0,0.844,0.0,0.0,0.0,4.0,0.0,0.0
25%,8.0,140.0,71.0,152.7,77.325,7.0,3.0,70.0,35.0,157.25,76.0,11.0,4.25,1.224,1.21325,7.0,4.0,69.0,34.0,20.0
50%,16.0,266.0,136.0,290.15,149.3,14.0,6.0,136.0,69.0,290.5,146.0,21.0,10.0,1.324,1.284,14.0,7.0,132.0,65.0,38.0
75%,26.0,404.0,206.0,433.0,223.7,21.0,10.0,212.75,106.0,428.0,218.0,32.0,15.0,1.409,1.372,21.0,12.0,196.0,96.0,55.0
max,47.0,713.0,342.0,818.7,432.0,36.0,19.0,377.0,192.0,752.0,409.0,59.0,31.0,2.0,2.151,54.0,34.0,318.0,186.0,103.0


## Test out PCA

#### Standardize Features

In [44]:
def RunPCA(X,n):
    """Takes an input data set X and returns n principal components
    """
    # Create a scaler object
    sc = StandardScaler()
    
    # Fit the scaler to the features and transform
    X_std = sc.fit_transform(X)

    # Create a pca object with the 2 components as a parameter
    pca = decomposition.PCA(n_components=n)

    # Fit the PCA and transform the data
    X_std_pca = pca.fit_transform(X_std)
    
    return X_std_pca

IndentationError: unexpected indent (<ipython-input-44-490a622fbf81>, line 5)

In [64]:
# Set number of games to partition train/test split at
nsplit = 2200

In [65]:
df_feat_train = df_feat_clean.iloc[:nsplit]
df_targ_train = df_targ.iloc[:nsplit]
df_feat_test = df_feat_clean.iloc[nsplit:]
df_targ_test = df_targ.iloc[nsplit:]

#### Practice with PCA

transform test with fit on train

In [83]:
X1 = df_feat_train
X2 = df_feat_test

# Create a scaler object
sc = StandardScaler()

# Fit the scaler to the features and transform
X1_std = sc.fit_transform(X1)
X2_std = sc.fit(X1).transform(X2)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)


#### Tune n_components param

In [84]:
scores = dict()

for cnt in range(1,102,5):

    # Create a pca object with the 2 components as a parameter
    pca = decomposition.PCA(n_components=cnt)

    # Fit the PCA and transform the data
    X1_std_pca = pca.fit_transform(X1_std)
    X2_std_pca = pca.fit(X1_std).transform(X2_std)

    std_pca_train = X1_std_pca
    std_pca_test = X2_std_pca

    clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=1)
    df_fit = clf.fit(std_pca_train, df_targ_train)
    df_pred = df_fit.predict(std_pca_test)
    score = roc_auc_score(df_targ_test, df_pred)
    
    # Append score to dict
    scores[cnt] = score
    
    # Print for OCD
    print(cnt,' ',score)

1   0.44689592209373097
6   0.4583840535605599
11   0.4844796104686549
16   0.5079884357881924
21   0.5053256238587949
26   0.5416159464394401
31   0.5477023737066342
36   0.5840687766281193
41   0.551049908703591
46   0.585362142422398
51   0.546941570298235
56   0.5442787583688374
61   0.5577449786975046
66   0.5947200243457091
71   0.546941570298235
76   0.5651247717589775
81   0.5752434570906878
86   0.5737979306147292
91   0.5785149117468047
96   0.5435940353012781
101   0.5489196591600731


In [77]:
for cnt in range(0,20,10):
    print(cnt)

0
10


In [70]:
scores = dict()
for cnt in range(4):
    scores[cnt] = cnt*4

In [71]:
scores

{0: 0, 1: 4, 2: 8, 3: 12}