# Scrape StatFox for Current and Historical Matchup Stats
`Notebooks/skr_statfox.ipynb`
###### `BY: Jonathan Sims` 
###### `CREATED: 2019-06-15`
- GOAL: Scrape daily to build up historical matchup tables
- USE: Treat each matchup page as one obs in RF model to asses feature importance
    - Useful before spending time building up game/pitch/player-level count data 
    - Compare odds here with odds from VegasInsider to look for odds movement features

In [1]:
import sys
import os.path
from bs4 import BeautifulSoup
from urllib.request import urlopen
import boto3
import numpy as np
import pandas as pd 
import pickle
import codecs

In [2]:
def ToPickleS3(obj, bucketname, keyname):
    """Pickle dataframe and put to s3 bucket in site name folder
    i.e. 'statfox/'
    """
    s3 = boto3.client('s3')
    serializedListObject = pickle.dumps(obj)
    s3.put_object(Bucket=bucketname,Key=keyname,Body=serializedListObject)

In [3]:
def FromPickleS3(bucketname, keyname):
    """Get dataframe from s3 and unpickle
    """
    s3 = boto3.client('s3')
    pkldf = s3.get_object(Bucket=bucketname,Key=keyname)['Body'].read()
    df = pickle.loads(pkldf)
    return df

In [4]:
def Flatten2DLabel(obj, row_label, col_label):
    """Flattens a dataframe to single row and concatenates row and 
    column lables within the dataframe into one column name.                                 
    """
    # Return all row indexes after last row_label
    st_row = row_label[-1]+1
    # Return all columns after last col_label
    st_col = col_label[-1]+1


    # Get row labels and concatenate
    for x in row_label:
        if x == row_label[0]:
            cols = []
            cols = obj.iloc[x][st_col:]
        else:
            cols = cols+'_'+obj.iloc[x][st_col:]

    # Get column labels and concatenate
    for x  in col_label:
        if x == col_label[0]:
            rows = []
            rows = obj[st_row:][x]
        else:
            rows = rows+'_'+obj[st_row:][x]  

    # Strip non alphanumeric
    colstrip = cols.str.replace('[^\w]','')
    rowstrip = rows.str.replace('[^\w]','')


    obj_in = pd.DataFrame([])
    for x in rowstrip.index:
        for y in colstrip.index:
            obj_tmp = pd.DataFrame([obj.iloc[x][y]], columns=[str(colstrip[y])+'_'+str(rowstrip[x])])
            obj_in = pd.concat([obj_in,obj_tmp], axis=1)
            
    return obj_in

In [5]:
def CleanStatfoxScrape(obj):
    """Takes flattened sub-tables from statfox matchup
    page and cleans and shortens feature names
    """
    obj_in = obj
    
    # Team names to V or H
    obj_in.columns = obj_in.columns.str.replace(tm_v,'V_')
    obj_in.columns = obj_in.columns.str.replace(tm_h,'H_')

    # Shorten sub-table names
    obj_in.columns = obj_in.columns.str.replace('CurrentSeasonPerformance', 'Overall')
    obj_in.columns = obj_in.columns.str.replace('TeamHittingandFieldingStatistics','HitField')
    obj_in.columns = obj_in.columns.str.replace('BullpenPitchingStatistics','Bullpen')
    
    return obj_in

#### Get list of games

In [6]:
glhead = pd.read_csv('GLHEADER.CSV',header=None)
gms = pd.read_csv('GL2018.CSV',header=0,names=list(glhead[0]))
gms = gms[['date','team_h','team_v','score_h','score_v']]

teams = pd.read_csv('TEAM_NAMES.CSV',header=0,index_col=['name1'],usecols=['name1','name3'])
teams = teams['name3'].to_dict()

gms['team_h'] = gms['team_h'].map(lambda x: teams[x.upper()])
gms['team_v'] = gms['team_v'].map(lambda x: teams[x.upper()])

#### Visit each page in games list, scrape, and send to s3 as pickle

In [None]:
# x is game number in games list
for x in range(len(gms)):
# for x in range(40,46):
    
    
    # Parse date, team names, and score from games list
    dt = str(gms.loc[x,'date'])
    tm_h = str(gms.loc[x,'team_h']).replace(' ','')                                                              
    tm_v = str(gms.loc[x,'team_v']).replace(' ','')
    sc_h = gms.loc[x,'score_h']
    sc_v = gms.loc[x,'score_v']


    # Adjust URL if second game of double header
    if (x > 1) and (str(gms.loc[x-1,'date']) == dt) and (str(gms.loc[x-1,'team_h']).replace(' ','') == tm_h):
        tm_h = tm_h+'2'

    
    # Set game and s3 bucket, URL, and file name
    game = dt+tm_h
    keyname = folder+game+'.pkl'
    url = 'http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g='+dt+tm_h+'&r=at'
    
    
    # Parse HTML
    html = urlopen(url)
    bs = BeautifulSoup(html.read(), 'html.parser')


    # Get all tables from page
    nameList = bs.findAll('td', {'class':['matchupBorder']})

     
    # Save each table to a dataframe and pickle
    namestr = str(nameList)
    df = pd.read_html(namestr)
    ToPickleS3(df, bucketname='scrapes-rawhtml-dev', keyname='statfox_DEV/'+dt+tm_h+'.pkl')
    
    # Checkpoint
    if x%100 == 0:
        print(url)

#### Get dataframe from s3 and unpickle

#### Format and append each matchup together from s3

In [20]:
df_all = pd.DataFrame([])

for x in range(45,47):
    # Parse date, team names, and score from games list
    dt = str(gms.loc[x,'date'])
    tm_h = str(gms.loc[x,'team_h']).replace(' ','')                                                              
    tm_v = str(gms.loc[x,'team_v']).replace(' ','')
    sc_h = gms.loc[x,'score_h']
    sc_v = gms.loc[x,'score_v']

    # Adjust URL if second game of double header
    if (x > 1) and (str(gms.loc[x-1,'date']) == dt) and (str(gms.loc[x-1,'team_h']).replace(' ','') == tm_h):
        tm_h = tm_h+'2'
    
    # Create dataframe from s3 pickle
    df = FromPickleS3(bucketname='scrapes-rawhtml-dev', keyname='statfox_DEV/'+dt+tm_h+'.pkl')

#### Parse and concatenate all features together, by matchup

    # 6:  Overall - board and line
    # 11: Away - Current Season Performance
    # 12: Away - Team Hitting and Fielding
    # 13: Away - Bullpen Pitching 
    # 14: Home - Current Season Performance
    # 15: Home - Team Hitting and Fielding
    # 16: Home - Bullpen Pitching 
    df6  = df[6] 
    df11 = df[11]
    df12 = df[12]
    df13 = df[13]
    df14 = df[14]
    df15 = df[15]
    df16 = df[16]

    df6 = df6.transpose() # Make board and team labels first
    df6_wd = Flatten2DLabel(df6, [0,1], [0,1])

    df11_wd = Flatten2DLabel(df11, [0,1,2], [0])
    df14_wd = Flatten2DLabel(df14, [0,1,2], [0])

    df12_wd = Flatten2DLabel(df12, [0,1,2], [0])
    df15_wd = Flatten2DLabel(df15, [0,1,2], [0])

    df13_wd = Flatten2DLabel(df13, [0,1], [0])
    df16_wd = Flatten2DLabel(df16, [0,1], [0])

#### Concatenate all sub-tables and clean labels

    df_wd_all = pd.concat([df6_wd, df11_wd, df12_wd, df13_wd, df14_wd, df15_wd, df16_wd], axis=1)
    df_wd_all = CleanStatfoxScrape(df_wd_all)

#### Combine feature rows

    df_all = df_all.append(df_wd_all, sort=True)

AssertionError: Number of manager items must equal union of block items
# manager items: 338, # tot_items: 354

In [16]:
df_all = df_all.append(df_wd_all, axis=1)

TypeError: append() got an unexpected keyword argument 'axis'

In [17]:
df_wd_all.append?

[0;31mSignature:[0m [0mdf_wd_all[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0mother[0m[0;34m,[0m [0mignore_index[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m [0mverify_integrity[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m [0msort[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Append rows of `other` to the end of caller, returning a new object.

Columns in `other` that are not in the caller are added as new columns.

Parameters
----------
other : DataFrame or Series/dict-like object, or list of these
    The data to append.
ignore_index : boolean, default False
    If True, do not use the index labels.
verify_integrity : boolean, default False
    If True, raise ValueError on creating index with duplicates.
sort : boolean, default None
    Sort columns if the columns of `self` and `other` are not aligned.
    The default sorting is deprecated and will change to not-sorting
    in a future version of pandas. Explicitly pass ``sort=True`` to

    .

## `20200221`

#### MySQL Connector needs Python float, not numpy float64

#### Convert American betting lines to probability value satisfying E[x]=0

#### Set chrome options

#### Get all column names from `skr_statfox_matchups_cols.py` run

#### get list of games

#### translate 3 letter team name to full

#### Insert db connection string. Eff security or other good practices for now

## Main scrape code