# Scrape StatFox for Current and Historical Matchup Stats
`Notebooks/skr_statfox.ipynb`
###### `BY: Jonathan Sims` 
###### `CREATED: 2019-06-15`
- GOAL: Scrape daily to build up historical matchup tables
- USE: Treat each matchup page as one observation in a random-forest (RF) model to assess feature importance
    - Useful before spending time building up game/pitch/player-level count data 
    - Compare odds here with odds from VegasInsider to look for odds movement features

In [17]:
import sys
import os.path
from bs4 import BeautifulSoup
from urllib.request import urlopen
import boto3
import numpy as np
import pandas as pd 
import pickle
import codecs

In [26]:
# Run configuration for one season.
# file_o: output table written with DataFrame.to_csv at the end of the run
#         (to_csv infers gzip compression from the '.gz' suffix).
file_o = '20200223.skr_statfox.2012.tsv.gz'
# file_l: log file that the matchup loop appends to when a game raises.
file_l = '20200223.skr_statfox.2012.log'
# glyear: game-log CSV that supplies the list of games to process.
glyear = 'GL2012.CSV'

In [19]:
def FromPickleS3(bucketname, keyname):
    """Download one object from S3 and return it unpickled.

    Args:
        bucketname: Name of the S3 bucket that holds the object.
        keyname: Key of the pickled object inside that bucket.

    Returns:
        The object stored in the pickle. The matchup loop indexes the
        result by table number, so it is presumably a list of dataframes
        -- confirm against the scraper that wrote the pickles.

    NOTE(review): pickle.loads can execute arbitrary code contained in
    the payload, so this must only ever be pointed at trusted buckets.
    """
    client = boto3.client('s3')
    response = client.get_object(Bucket=bucketname, Key=keyname)
    payload = response['Body'].read()
    return pickle.loads(payload)

In [20]:
def Flatten2DLabel(obj, row_label, col_label):
    """Flatten a labelled table into a single-row dataframe.

    Every data cell becomes one column of the result, named
    '<column labels>_<row labels>', where multi-level labels are joined
    with '_' and all non-word characters are stripped from the labels.

    Args:
        obj: Dataframe whose header cells are stored as ordinary data.
            It is assumed to carry default integer row and column labels
            (0..n-1), because label values are reused as positions below.
        row_label: Positions of the rows that hold the column headers,
            in the order they should be joined.
        col_label: Labels of the columns that hold the row headers,
            in the order they should be joined.

    Returns:
        A dataframe with one row and one column per data cell (row-major
        order). Column names may repeat if the source labels repeat.
        An empty dataframe is returned when there are no data cells.
    """
    # Data cells start right after the last header row / last label column.
    st_row = row_label[-1] + 1
    st_col = col_label[-1] + 1

    # Column headers: join the header rows cell-wise with '_'.
    cols = obj.iloc[row_label[0]].iloc[st_col:]
    for r in row_label[1:]:
        cols = cols + '_' + obj.iloc[r].iloc[st_col:]

    # Row headers: join the label columns cell-wise with '_'.
    rows = obj.iloc[st_row:][col_label[0]]
    for c in col_label[1:]:
        rows = rows + '_' + obj.iloc[st_row:][c]

    # Strip non alphanumeric. regex=True is spelled out because pandas 2.0
    # changed the default to a literal match, which would strip nothing.
    colstrip = cols.str.replace(r'\W', '', regex=True)
    rowstrip = rows.str.replace(r'\W', '', regex=True)

    # Collect names and values first and build the frame once; growing a
    # dataframe with pd.concat per cell is quadratic in the number of cells.
    names = []
    values = []
    for x in rowstrip.index:
        for y in colstrip.index:
            names.append(str(colstrip[y]) + '_' + str(rowstrip[x]))
            values.append(obj.iloc[x][y])

    if not names:
        return pd.DataFrame([])
    return pd.DataFrame([values], columns=names)

In [21]:
def CleanStatfoxScrape(obj, team_v=None, team_h=None):
    """Clean and shorten the feature names of a flattened statfox matchup.

    The column names of `obj` are rewritten in place: the visiting and
    home team names become 'V_' and 'H_', and the long sub-table names
    are replaced by short ones.

    Args:
        obj: Dataframe of flattened sub-tables from one matchup page.
        team_v: Visiting team name as it appears in the column names.
            Defaults to the module-level `tm_v`.
        team_h: Home team name as it appears in the column names.
            Defaults to the module-level `tm_h`.

    Returns:
        The same dataframe object, with renamed columns.
    """
    # Fall back to the globals so existing one-argument calls keep working.
    if team_v is None:
        team_v = tm_v
    if team_h is None:
        team_h = tm_h

    # Applied in this order; every pattern is matched literally.
    replacements = (
        # Team names to V or H
        (team_v, 'V_'),
        (team_h, 'H_'),
        # Shorten sub-table names
        ('CurrentSeasonPerformance', 'Overall'),
        ('TeamHittingandFieldingStatistics', 'HitField'),
        ('BullpenPitchingStatistics', 'Bullpen'),
    )
    for old, new in replacements:
        # regex=False: older pandas would otherwise read the pattern as a regex.
        obj.columns = obj.columns.str.replace(old, new, regex=False)

    return obj

In [22]:
def DuplicatedVarnames(df):
    """Return a dict of all variable names that are duplicated in a dataframe.

    Args:
        df: Dataframe whose column names are checked.

    Returns:
        A dict mapping each column name that occurs more than once to its
        number of occurrences, in order of first appearance. The dict is
        empty when every column name is unique.
    """
    from collections import Counter

    # list(df) is the list of column names; one counting pass replaces
    # re-scanning the whole list for every column.
    counts = Counter(list(df))
    return {varname: n for varname, n in counts.items() if n > 1}

#### Get list of games

In [23]:
# Field names for the game log: the first column of GLHEADER.CSV, one name per row.
glhead = pd.read_csv('GLHEADER.CSV', header=None)

# NOTE(review): header=0 together with names= tells pandas that the first
# line of the file is a header to be replaced, so that line is discarded.
# Confirm the game log really starts with a header line; otherwise the
# first game of the season is silently dropped here.
gms = pd.read_csv(glyear, header=0, names=glhead[0].tolist())
gms = gms[['date', 'team_h', 'team_v', 'score_h', 'score_v']]

# Lookup from the 'name1' code to the 'name3' name in TEAM_NAMES.CSV.
teams = pd.read_csv('TEAM_NAMES.CSV', header=0, usecols=['name1', 'name3'], index_col=['name1'])
teams = teams['name3'].to_dict()

# Translate both team columns; an unknown code raises KeyError.
for side in ('team_h', 'team_v'):
    gms[side] = gms[side].map(lambda code: teams[code.upper()])

In [24]:
gms

Unnamed: 0,date,team_h,team_v,score_h,score_v
0,20120329,OAKLAND,SEATTLE,4,1
1,20120404,MIAMI,ST LOUIS,1,4
2,20120405,CLEVELAND,TORONTO,4,7
3,20120405,DETROIT,BOSTON,3,2
4,20120405,CHICAGO CUBS,WASHINGTON,1,2
5,20120405,CINCINNATI,MIAMI,4,0
6,20120405,NY METS,ATLANTA,1,0
7,20120405,PITTSBURGH,PHILADELPHIA,0,1
8,20120405,SAN DIEGO,LA DODGERS,3,5
9,20120406,LA ANGELS,KANSAS CITY,5,0


#### Get dataframe from s3 and unpickle
#### Format and append each matchup together from s3

In [27]:
from datetime import datetime


def _SplitHyphenCols(frame, tag, first, second):
    """Split every column whose name contains `tag` on '-' into two columns.

    For each matching column, the two halves are stored under the column
    name with `tag` replaced by `first` and by `second`, and the original
    column is dropped. Returns the resulting dataframe.
    """
    for col in [c for c in frame.columns if tag in c]:
        new_names = [col.replace(tag, first), col.replace(tag, second)]
        frame[new_names] = frame[col].str.split("-", expand=True)
        frame = frame.drop(col, axis=1)
    return frame


matchup_rows = []    # one single-row dataframe per successfully parsed matchup
seen_games = set()   # (date, home team) pairs already visited, to spot doubleheaders

# To debug on a subset, iterate over gms.index[:10] instead.
for gm in gms.index:
    keyname = ''  # set below; lets the except-branch report which object failed
    try:

        #### Parse date, team names, and score from games list

        dt = str(gms.loc[gm, 'date'])
        # tm_h / tm_v stay module-level names: CleanStatfoxScrape reads them.
        tm_h = str(gms.loc[gm, 'team_h']).replace(' ', '')
        tm_v = str(gms.loc[gm, 'team_v']).replace(' ', '')
        sc_h = gms.loc[gm, 'score_h']
        sc_v = gms.loc[gm, 'score_v']

        #### Adjust URL if second game of double header

        # A repeated (date, home team) pair marks a later game of a
        # doubleheader. Tracking seen pairs also covers the game at index 1
        # and does not require the two games to sit on adjacent index labels.
        if (dt, tm_h) in seen_games:
            tm_h_url = tm_h + '2'
        else:
            tm_h_url = tm_h
        seen_games.add((dt, tm_h))

        #### Create dataframe from s3 pickle

        keyname = 'statfox_DEV/' + dt + tm_h_url + '.pkl'
        df = FromPickleS3(bucketname='scrapes-rawhtml-dev', keyname=keyname)

        #### Show progress

        if gm % 100 == 0:
            print('On match: ', dt, tm_h_url)

        #### Parse and concatenate all features together, by matchup

        # 6:  Overall - board and line
        # 11: Away - Current Season Performance
        # 12: Away - Team Hitting and Fielding
        # 13: Away - Bullpen Pitching
        # 14: Home - Current Season Performance
        # 15: Home - Team Hitting and Fielding
        # 16: Home - Bullpen Pitching
        df6 = df[6].transpose()  # Make board and team labels first
        df6_wd = Flatten2DLabel(df6, [1], [0, 1])

        df11_wd = Flatten2DLabel(df[11], [0, 1, 2], [0])
        df12_wd = Flatten2DLabel(df[12], [0, 1, 2], [0])
        df13_wd = Flatten2DLabel(df[13], [0, 1], [0])
        df14_wd = Flatten2DLabel(df[14], [0, 1, 2], [0])
        df15_wd = Flatten2DLabel(df[15], [0, 1, 2], [0])
        df16_wd = Flatten2DLabel(df[16], [0, 1], [0])

        #### Concatenate all sub-tables and clean labels

        df_wd_all = pd.concat(
            [df6_wd, df11_wd, df12_wd, df13_wd, df14_wd, df15_wd, df16_wd],
            axis=1,
        )
        df_wd_all = CleanStatfoxScrape(df_wd_all)

        #### Add matchup index value date-board_h-board_v

        board_h = df6[3].iloc[0]
        board_v = df6[2].iloc[0]
        df_wd_all.index = [int(dt)*1000000 + int(board_h)*1000 + int(board_v)]

        #### Add team names and final scores

        df_wd_all['tm_h'] = tm_h
        df_wd_all['tm_v'] = tm_v
        df_wd_all['sc_h'] = sc_h
        df_wd_all['sc_v'] = sc_v

        #### Drop columns duplicates (usually nan)

        # The duplicated name is used as a regex, so every column whose name
        # contains it is dropped, not only the exact duplicates.
        for dup_name in DuplicatedVarnames(df_wd_all):
            df_wd_all = df_wd_all.drop(list(df_wd_all.filter(regex=dup_name)), axis=1)

        #### Split features on hyphen if OU (over-under) or WL (win-loss)

        df_wd_all = _SplitHyphenCols(df_wd_all, 'OU', 'O', 'U')
        df_wd_all = _SplitHyphenCols(df_wd_all, 'WL', 'W', 'L')

        #### Create target variable (home win)

        df_wd_all['win_h'] = np.where(df_wd_all['sc_h'] > df_wd_all['sc_v'], 1, 0)

        #### Combine feature rows

        # The rows are concatenated once after the loop. Duplicate column
        # names would make that final concat fail for the whole run, so
        # reject them here, where the failure is logged for this game only.
        if not df_wd_all.columns.is_unique:
            raise ValueError('duplicate feature names remain after de-duplication')
        matchup_rows.append(df_wd_all)

    #### Print exception to log and go to next iteration (URL)

    # Deliberately broad: one bad matchup must not stop the whole run.
    except Exception as exc:
        excstamp = str(datetime.now()) + ' - ' + repr(exc) + ' - ' + keyname
        with open(file_l, 'a') as log:
            print(excstamp, file=log)
        continue

# Build the final table in one pass instead of re-copying it per game.
if matchup_rows:
    df_all = pd.concat(matchup_rows, ignore_index=False, sort=True)
else:
    df_all = pd.DataFrame()

On match:  20120329 OAKLAND
On match:  20120413 TORONTO
On match:  20120420 MILWAUKEE
On match:  20120428 NYYANKEES
On match:  20120505 NYMETS
On match:  20120513 BOSTON
On match:  20120519 PHILADELPHIA
On match:  20120527 BOSTON
On match:  20120603 MILWAUKEE
On match:  20120611 TORONTO
On match:  20120619 BOSTON
On match:  20120626 ATLANTA
On match:  20120703 ARIZONA
On match:  20120714 NYYANKEES
On match:  20120721 OAKLAND
On match:  20120728 ATLANTA
On match:  20120804 PHILADELPHIA
On match:  20120811 SANFRANCISCO
On match:  20120819 DETROIT
On match:  20120826 ARIZONA
On match:  20120902 HOUSTON
On match:  20120910 LAANGELS
On match:  20120918 LAANGELS
On match:  20120925 CHIWHITESOX
On match:  20121002 KANSASCITY


In [31]:
df_all

Unnamed: 0,H_Bullpen_BB_AllGames,H_Bullpen_BB_HomeGames,H_Bullpen_BSV_AllGames,H_Bullpen_BSV_HomeGames,H_Bullpen_ERA_AllGames,H_Bullpen_ERA_HomeGames,H_Bullpen_ER_AllGames,H_Bullpen_ER_HomeGames,H_Bullpen_HR_AllGames,H_Bullpen_HR_HomeGames,...,V_Overall_Team_Runs_vsRighthandedStarters,V__Latest_Line,V__Latest_Total,V__Opening_Line,V__Opening_Total,sc_h,sc_v,tm_h,tm_v,win_h
20120329996995,189,90,18,7,3.78,3.51,191,90,40,18,...,3.6,+115,"Ov 8.5,+100",-105,"Ov 8.5,+100",4,1,OAKLAND,SEATTLE,1
20120404998997,208,106,19,9,3.44,2.90,197,86,41,18,...,4.9,+160,"Ov 7,+125",+170,"Ov 7,+125",1,4,MIAMI,STLOUIS,0
20120405914913,176,79,16,5,3.73,3.06,204,90,48,23,...,4.4,-120,"Ov 7,-115",-120,"Ov 7,-115",4,7,CLEVELAND,TORONTO,0
20120405912911,237,111,10,6,4.26,4.39,231,118,46,17,...,5.3,+130,"Ov 7,+105",+125,"Ov 7,+105",3,2,DETROIT,BOSTON,1
20120405906905,227,122,23,12,3.51,3.56,196,101,46,24,...,4.0,-140,"Ov 6.5,+110",-170,"Ov 6.5,+110",1,2,CHICAGOCUBS,WASHINGTON,0
20120405908907,228,105,22,7,3.61,3.18,201,90,47,23,...,3.7,+120,"Ov 7.5,-105",+140,"Ov 7.5,-105",4,0,CINCINNATI,MIAMI,1
20120405902901,199,111,23,12,4.33,4.55,228,118,43,17,...,,-110,"Ov 7,+105",-105,"Ov 7,+105",1,0,NYMETS,ATLANTA,1
20120405904903,220,105,22,10,3.73,3.42,218,108,47,19,...,,-150,"Ov 7,+120",-190,"Ov 6.5,-115",0,1,PITTSBURGH,PHILADELPHIA,0
20120405910909,210,109,18,7,3.05,2.71,164,75,32,8,...,4.1,-150,"Ov 6,+100",-145,"Ov 6,+100",3,5,SANDIEGO,LADODGERS,0
20120406968967,0,0,0,0,3.60,0.00,2,0,0,0,...,2.0,+140,"Ov 6.5,-110",+125,"Ov 7,+110",3,7,OAKLAND,SEATTLE,0


#### Save dataframe to tsv

In [32]:
# Write the combined matchup table to file_o as tab-separated text (row index included).
df_all.to_csv(file_o, sep='\t')

In [33]:
# Read the saved file back as a sanity check; no index_col is given, so the
# saved row index comes back as an ordinary column.
test = pd.read_csv(file_o, sep='\t')

In [34]:
test.shape

(2422, 462)

#### Subset target variable and features

## `20200222`

#### MySQL Connector needs Python float, not numpy float64

#### Convert American betting lines to probability value satisfying E[x]=0

#### Set chrome options

#### Get all column names from `skr_statfox_matchups_cols.py` run

#### get list of games

#### translate 3 letter team name to full

#### Insert db connection string. Security and other good practices are deliberately skipped for now

## Main scrape code