# Explore and Model StatFox Matchup Data
`mlb_bet_notebooks/model_statfox_matchups.ipynb`
- Explore features
- Convert historical moneylines to break-even probabilities
- Model pre-computed features with RF and maybe PCA
- Compare model predictions to historical moneylines
    - Use break-even probabilities as alternative model and compare ROC
- Try VIF filter
- Try k-fold CV
- Try grid search model complexity
- Try to get player salary
    - Combine with addition, subtraction from statfox blobs
- Try fix Opening Line feature 
    - Try openline probability as feature

Jonathan Sims 2020-02-24

In [43]:
import math
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing

### Calculate open and close moneyline ROC AUC

In [30]:
df_targ_keeps = df_targ[df_lateline_prob.index]

roc_auc_score(df_targ_keeps, df_lateline_prob)

0.5965750121668296

In [31]:
df_targ.shape

(17573,)

In [32]:
df_targ_keeps.shape

(17552,)

In [33]:
df_lateline_prob.shape

(17552,)

In [34]:
df_openline_prob = df_openline.apply(american_to_probability)

keeps = df_openline_prob[lambda x: (0 <= x) & (x <= 1)]

df_openline_prob = df_openline_prob[keeps.index]

df_targ_keeps = df_targ[keeps.index]

roc_auc_score(df_targ_keeps, df_openline_prob)

0.5912541455140431

### RF with AUC and no PCA

In [35]:
df_feat_fill.shape

(17573, 1363)

## Tune Parameters

### Leave one out CV to find model complexity

In [99]:
%%time 


### Define len for subset for development

_dev_len = 15000


### Define feature and target data

X = df_feat_fill[:_dev_len].to_numpy()
y = df_targ[:_dev_len].to_numpy()


### Save number of splits for leave-one-out CV

loo = LeaveOneOut()
splits = loo.split(X)


### Grid of hyperparams to search

max_depth_par = range(1,5)
min_samples_split_par = range(2,10)

parameters = {'max_depth': max_depth_par, 'min_samples_split': min_samples_split_par}
rfc = RandomForestClassifier(n_estimators=10, n_jobs=1)
clf = GridSearchCV(rfc, parameters, n_jobs=-1, cv=10)
clf.fit(X, y)

CPU times: user 1.37 s, sys: 278 ms, total: 1.65 s
Wall time: 1min 24s


In [100]:
clf.best_params_

{'max_depth': 4, 'min_samples_split': 4}

In [101]:
clf.score(X, y)

0.5694

In [103]:
X_val = df_feat_fill[_dev_len:].to_numpy()
y_val = df_targ[_dev_len:].to_numpy()

clf_pred = clf.predict(X_val)
roc_auc_score(y_val, clf_pred)

0.5365021565653075

In [None]:
df_feat_fill.shape

In [None]:
X.shape

In [72]:
for cnt in range(1,100,10):
#     X_train, X_test, Y_train, Y_test = train_test_split(df_feat_fill, df_targ, test_size=0.2, random_state=cnt)

    ### Standarize data

    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(df_feat_fill)
    df_feat_fill_st = pd.DataFrame(np_scaled)

    df_feat_fill_train, df_feat_fill_test, df_targ_train, df_targ_test = train_test_split(df_feat_fill_st, df_targ, test_size=0.2)

    clf = AdaBoostClassifier()
    df_fit = clf.fit(df_feat_fill_train, df_targ_train)
    df_pred = df_fit.predict(df_feat_fill_test)
    
    print(roc_auc_score(df_targ_test, df_pred))
    

ValueError: not enough values to unpack (expected 5, got 0)

In [84]:
for cnt in range(0,10):

    df_feat_fill_train, df_feat_fill_test, df_targ_train, df_targ_test = train_test_split(df_feat_fill, df_targ, test_size=0.2, random_state=cnt)

    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=cnt)
    df_fit = clf.fit(df_feat_fill_train, df_targ_train)
    df_pred = df_fit.predict(df_feat_fill_test)
    
    print(roc_auc_score(df_targ_test, df_pred))
    

0.512822626163314
0.5273800786155829
0.5164933069315196
0.5252343658877168
0.5261184779240267
0.5283997236213925
0.5191845548465334
0.5174868588530961
0.5170416937053861
0.5225325038238827


## Test out PCA

### Standardize Features

In [75]:
def RunPCA(X,n):
    """Takes an input data set X and returns n principal components
    """
    # Create a scaler object
    sc = StandardScaler()
    
    # Fit the scaler to the features and transform
    X_std = sc.fit_transform(X)

    # Create a pca object with the 2 components as a parameter
    pca = decomposition.PCA(n_components=n)

    # Fit the PCA and transform the data
    X_std_pca = pca.fit_transform(X_std)
    
    return X_std_pca

In [76]:
df_feat_fill_train = df_feat_fill.iloc[:nsplit]
df_targ_train = df_targ.iloc[:nsplit]
df_feat_fill_test = df_feat_fill.iloc[nsplit:]
df_targ_test = df_targ.iloc[nsplit:]

### Practice with PCA

transform df with fit on train

### Tune n_components param

In [94]:
scores = dict()

for cnt in range(1,100,5):

    ### Create a pca object with the 2 components as a parameter
    
    pca = decomposition.PCA(n_components=50)

    df_feat_fill_train, df_feat_fill_test, df_targ_train, df_targ_test = train_test_split(df_feat_fill, df_targ, test_size=0.1, random_state=1)

    X1 = df_feat_fill_train
    X2 = df_feat_fill_test

    ### Create a scaler object
    
    sc = StandardScaler()

    ### Fit the scaler to the features and transform
    
    X1_std = sc.fit_transform(X1)
    X2_std = sc.fit(X1).transform(X2)

    ### Fit the PCA and transform the data
    
    X1_std_pca = pca.fit_transform(X1_std)
    X2_std_pca = pca.fit(X1_std).transform(X2_std)

    std_pca_train = X1_std_pca
    std_pca_df = X2_std_pca

    clf = AdaBoostClassifier(n_estimators=100, random_state=1)
    df_fit = clf.fit(std_pca_train, df_targ_train)
    df_pred = df_fit.predict(std_pca_df)
    score = roc_auc_score(df_targ_test, df_pred)
    
    ### Append score to dict
    
    scores[cnt] = score
    
    ### Print for OCD
    print(cnt,' ',score)

1   0.5406012163200662
6   0.5254810861839616
11   0.5257931631733549
16   0.5242600849629604
21   0.5194033868155274
26   0.5261052401627482
31   0.5394582343464133
36   0.5252879885467745
41   0.5288456662258579
46   0.530548436299235
51   0.512311437231565
56   0.5231171029893075
61   0.5362496927992135
66   0.5289373388414922
71   0.5312876686678604
76   0.5225846216261553
81   0.5338584028679876
86   0.5248198730626845
91   0.5148587656574877
96   0.531898169278361


In [89]:
RandomForestClassifier?

[0;31mInit signature:[0m [0mRandomForestClassifier[0m[0;34m([0m[0mn_estimators[0m[0;34m=[0m[0;34m'warn'[0m[0;34m,[0m [0mcriterion[0m[0;34m=[0m[0;34m'gini'[0m[0;34m,[0m [0mmax_depth[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mmin_samples_split[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m [0mmin_samples_leaf[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0mmin_weight_fraction_leaf[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m [0mmax_features[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m [0mmax_leaf_nodes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mmin_impurity_decrease[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m [0mmin_impurity_split[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mbootstrap[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0moob_score[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m [0mn_jobs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mwarm_start[0m[

In [None]:
pd.DataFrame([clf.feature_importances_, df_feat_fill_train.columns]).transpose().sort_values

In [None]:
pd.DataFrame([clf.feature_importances_, df_feat_fill_train.columns]).transpose.sort

In [None]:
df_feat_fill_desc = df_feat_fill_train.describe().loc[['mean', 'std']]

In [None]:
df_feat_fill_desc

In [None]:
for x in df_feat_fill_desc.columns:
    print(df_feat_fill_desc[x])

In [None]:
scores = dict()
for cnt in range(4):
    scores[cnt] = cnt*4

In [None]:
scores