# Scrape StatFox for Current and Historical Matchup Stats
`Notebooks/skr_statfox.ipynb`
###### `BY: Jonathan Sims` 
###### `CREATED: 2019-06-15`
- GOAL: Scrape daily to build up historical matchup tables
- USE: Treat each matchup page as one obs in RF model to asses feature importance
    - Useful before spending time building up game/pitch/player-level count data 
    - Compare odds here with odds from VegasInsider to look for odds movement features

In [1]:
import sys
import os.path
from bs4 import BeautifulSoup
from urllib.request import urlopen
import boto3
import numpy as np
import pandas as pd 
import pickle
import codecs

In [3]:
def FromPickleS3(bucketname, keyname):
    """Get dataframe from s3 and unpickle
    """
    s3 = boto3.client('s3')
    pkldf = s3.get_object(Bucket=bucketname,Key=keyname)['Body'].read()
    df = pickle.loads(pkldf)
    return df

In [4]:
def Flatten2DLabel(obj, row_label, col_label):
    """Flattens a dataframe to single row and concatenates row and 
    column lables within the dataframe into one column name.                                 
    """
    # Return all row indexes after last row_label
    st_row = row_label[-1]+1
    # Return all columns after last col_label
    st_col = col_label[-1]+1


    # Get row labels and concatenate
    for x in row_label:
        if x == row_label[0]:
            cols = []
            cols = obj.iloc[x][st_col:]
        else:
            cols = cols+'_'+obj.iloc[x][st_col:]

    # Get column labels and concatenate
    for x  in col_label:
        if x == col_label[0]:
            rows = []
            rows = obj[st_row:][x]
        else:
            rows = rows+'_'+obj[st_row:][x]  

    # Strip non alphanumeric
    colstrip = cols.str.replace('[^\w]','')
    rowstrip = rows.str.replace('[^\w]','')


    obj_in = pd.DataFrame([])
    for x in rowstrip.index:
        for y in colstrip.index:
            obj_tmp = pd.DataFrame([obj.iloc[x][y]], columns=[str(colstrip[y])+'_'+str(rowstrip[x])])
            obj_in = pd.concat([obj_in,obj_tmp], axis=1)
            
    return obj_in

In [101]:
def CleanStatfoxScrape(obj):
    """Takes flattened sub-tables from statfox matchup
    page and cleans and shortens feature names
    """
    # Team names to V or H
    obj.columns = obj.columns.str.replace(tm_v,'V_')
    obj.columns = obj.columns.str.replace(tm_h,'H_')

    # Shorten sub-table names
    obj.columns = obj.columns.str.replace('CurrentSeasonPerformance', 'Overall')
    obj.columns = obj.columns.str.replace('TeamHittingandFieldingStatistics','HitField')
    obj.columns = obj.columns.str.replace('BullpenPitchingStatistics','Bullpen')
    
    return obj

In [6]:
def DuplicatedVarnames(df):
    """Return a dict of all variable names that 
    are duplicated in a given dataframe."""
    repeat_dict = {}
    var_list = list(df) # list of varnames as strings
    for varname in var_list:
        # make a list of all instances of that varname
        test_list = [v for v in var_list if v == varname] 
        # if more than one instance, report duplications in repeat_dict
        if len(test_list) > 1: 
            repeat_dict[varname] = len(test_list)
    return repeat_dict

#### Get list of games

In [14]:
glhead = pd.read_csv('GLHEADER.CSV',header=None)
gms = pd.read_csv('GL2018.CSV',header=0,names=list(glhead[0]))
gms = gms[['date','team_h','team_v','score_h','score_v']]

teams = pd.read_csv('TEAM_NAMES.CSV',header=0,index_col=['name1'],usecols=['name1','name3'])
teams = teams['name3'].to_dict()

gms['team_h'] = gms['team_h'].map(lambda x: teams[x.upper()])
gms['team_v'] = gms['team_v'].map(lambda x: teams[x.upper()])

#### Get dataframe from s3 and unpickle
#### Format and append each matchup together from s3

In [None]:
df_all = pd.DataFrame()

for x in gms.index:
# for x in range(10):
    # Parse date, team names, and score from games list
    dt = str(gms.loc[x,'date'])
    tm_h = str(gms.loc[x,'team_h']).replace(' ','')                                                              
    tm_v = str(gms.loc[x,'team_v']).replace(' ','')
    sc_h = gms.loc[x,'score_h']
    sc_v = gms.loc[x,'score_v']

    # Adjust URL if second game of double header
    if (x > 1) and (str(gms.loc[x-1,'date']) == dt) and (str(gms.loc[x-1,'team_h']).replace(' ','') == tm_h):
        tm_h_url = tm_h+'2'
    else:
        tm_h_url = tm_h
        
    # Create dataframe from s3 pickle
    df = FromPickleS3(bucketname='scrapes-rawhtml-dev', keyname='statfox_DEV/'+dt+tm_h_url+'.pkl')

    # Show progress
    if x%100 == 0:
        print('On match: ', dt, tm_h_url)

#### Parse and concatenate all features together, by matchup

    # 6:  Overall - board and line
    # 11: Away - Current Season Performance
    # 12: Away - Team Hitting and Fielding
    # 13: Away - Bullpen Pitching 
    # 14: Home - Current Season Performance
    # 15: Home - Team Hitting and Fielding
    # 16: Home - Bullpen Pitching 
    df6  = df[6] 
    df11 = df[11]
    df12 = df[12]
    df13 = df[13]
    df14 = df[14]
    df15 = df[15]
    df16 = df[16]

    df6 = df6.transpose() # Make board and team labels first
    df6_wd = Flatten2DLabel(df6, [1], [0,1])

    df11_wd = Flatten2DLabel(df11, [0,1,2], [0])
    df14_wd = Flatten2DLabel(df14, [0,1,2], [0])

    df12_wd = Flatten2DLabel(df12, [0,1,2], [0])
    df15_wd = Flatten2DLabel(df15, [0,1,2], [0])

    df13_wd = Flatten2DLabel(df13, [0,1], [0])
    df16_wd = Flatten2DLabel(df16, [0,1], [0])

#### Concatenate all sub-tables and clean labels

    df_wd_all = pd.concat([df6_wd, df11_wd, df12_wd, df13_wd, df14_wd, df15_wd, df16_wd], axis=1)
    df_wd_all = CleanStatfoxScrape(df_wd_all)

#### Add matchup index value date-board_h-board_v

    board_h = df6[3].iloc[0]
    board_v = df6[2].iloc[0]
    df_wd_all.index = [int(dt)*1000000+int(board_h)*1000+int(board_v)]
    
#### Add team names and final scores

    df_wd_all['tm_h'] = tm_h
    df_wd_all['tm_v'] = tm_v
    df_wd_all['sc_h'] = sc_h
    df_wd_all['sc_v'] = sc_v
    
#### Drop columns duplicates (usually nan)

    for x in DuplicatedVarnames(df_wd_all):
        df_wd_all = df_wd_all.drop(list(df_wd_all.filter(regex=x)), axis=1)
        
#### Split features on hyphen if OU (over-under) or WL (win-loss)        
    
    ou_cols = [col for col in df_wd_all.columns if 'OU' in col]
    
    if ou_cols != []:
        for x in ou_cols:
            xO = x.replace('OU', 'O')
            xU = x.replace('OU', 'U')
            df_wd_all[[xO,xU]] = df_wd_all[x].str.split("-",expand=True)    
            df_wd_all = df_wd_all.drop(x, axis=1)

    wl_cols = [col for col in df_wd_all.columns if 'WL' in col]
    
    if wl_cols != []:
        for x in wl_cols:
            xW = x.replace('WL', 'W')
            xL = x.replace('WL', 'L')
            df_wd_all[[xW,xL]] = df_wd_all[x].str.split("-",expand=True)    
            df_wd_all = df_wd_all.drop(x, axis=1)    

#### Create target variable (home win)

    df_wd_all['win_h'] = np.where(df_wd_all['sc_h'] > df_wd_all['sc_v'], 1, 0)
    
#### Combine feature rows

    df_all = pd.concat([df_all, df_wd_all], ignore_index=False, sort=True)

On match:  20180329 ATLANTA


In [None]:
x

#### Save dataframe to tsv

In [None]:
df_all.to_csv('20200222.skr_statfox.2018.tsv.gz', sep='\t')

#### Subset target variable and features

In [112]:
for x in gms.index:
    print(x.type)

AttributeError: 'int' object has no attribute 'type'

## `20200222`

#### MySQL Connector needs Python float, not numpy float64

#### Convert American betting lines to probability value satisfying E[x]=0

#### Set chrome options

#### Get all column names from `skr_statfox_matchups_cols.py` run

#### get list of games

#### translate 3 letter team name to full

#### Insert db connection string. Eff security or other good practices for now

## Main scrape code