# Scrape StatFox for Current and Historical Matchup Stats
###### BY: Jonathan Sims
###### CREATED: 2019-06-15
- USE: Scrape each StatFox matchup page and save the parsed tables to S3 as a pickled list of DataFrames (~20 KB each)

#### `Modules`

In [1]:
import codecs
import io
import os.path
import pickle
import sys
from urllib.request import urlopen

import boto3
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

#### `Parameters`

In [2]:
glyear = 'GL2012.CSV' # File name of the game-log CSV (one season) whose games will be scraped

#### `Functions`

In [3]:
def ToPickleS3(obj, bucketname, keyname):
    """Pickle ``obj`` and upload the resulting bytes to an S3 object.

    Args:
        obj: Object to serialize with ``pickle.dumps``.
        bucketname: Name of the destination S3 bucket.
        keyname: Key of the object inside the bucket, including any
            folder-style prefix (i.e. 'statfox/').
    """
    client = boto3.client('s3')
    client.put_object(
        Bucket=bucketname,
        Key=keyname,
        Body=pickle.dumps(obj),
    )

#### Get list of games

In [4]:
# Column names for the game logs: the first column of the header file.
glhead = pd.read_csv('GLHEADER.CSV', header=None)

# NOTE(review): header=0 makes read_csv consume the first line of the game-log
# file as a header row and replace it with `names`. If the file has no header
# line, the first game is silently dropped -- confirm against the CSV.
gms = pd.read_csv(glyear, header=0, names=list(glhead[0]))

# Keep only the columns needed to build the matchup URLs. The explicit copy
# makes the team-name assignments below write to an independent frame rather
# than to a slice of the full log (avoids pandas' SettingWithCopyWarning).
gms = gms[['date', 'team_h', 'team_v', 'score_h', 'score_v']].copy()

# Lookup table from the 'name1' column to the 'name3' column of the team file.
teams = pd.read_csv('TEAM_NAMES.CSV', header=0, index_col=['name1'], usecols=['name1', 'name3'])
teams = teams['name3'].to_dict()

# Translate team codes through the lookup table; a code missing from the table
# raises KeyError so that an unmapped team is noticed before scraping starts.
gms['team_h'] = gms['team_h'].map(lambda x: teams[x.upper()])
gms['team_v'] = gms['team_v'].map(lambda x: teams[x.upper()])

#### Visit each page in games list, scrape, and send to s3 as pickle

In [5]:
# (date, home team) of the previous row; seeing the same pair again on the
# next row marks the second game of a doubleheader.
prev_game = None

for x in range(len(gms)):

    # Parse date, team names, and score from games list
    # (tm_v, sc_h and sc_v are not used further inside this loop).
    dt = str(gms.loc[x, 'date'])
    tm_h = str(gms.loc[x, 'team_h']).replace(' ', '')
    tm_v = str(gms.loc[x, 'team_v']).replace(' ', '')
    sc_h = gms.loc[x, 'score_h']
    sc_v = gms.loc[x, 'score_v']

    # Adjust URL if second game of double header. Comparing against the
    # remembered previous row also covers row 1 vs. row 0, which the former
    # `x > 1` guard skipped.
    game = (dt, tm_h)
    if game == prev_game:
        tm_h = tm_h + '2'
    prev_game = game

    # URL
    url = 'http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=' + dt + tm_h + '&r=at'

    # Parse HTML; the context manager closes the HTTP response once it is read
    with urlopen(url) as html:
        bs = BeautifulSoup(html.read(), 'html.parser')

    # Get all tables from page
    nameList = bs.find_all('td', {'class': ['matchupBorder']})

    # Save each table to a dataframe and pickle.
    # read_html is given a file-like object because passing literal HTML
    # as a plain string is deprecated in recent pandas releases.
    namestr = str(nameList)
    df = pd.read_html(io.StringIO(namestr))
    ToPickleS3(df, bucketname='scrapes-rawhtml-dev', keyname='statfox_DEV/' + dt + tm_h + '.pkl')

    # Checkpoint
    if x % 100 == 0:
        print(url)

http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20120329OAKLAND&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20120413TORONTO&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20120420MILWAUKEE&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20120428NYYANKEES&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20120505NYMETS&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20120513BOSTON&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20120519PHILADELPHIA&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20120527BOSTON&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20120603MILWAUKEE&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20120611TORONTO&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20120619BOSTON&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20120626ATLANTA&r=at
http://foxsheets.statfoxsports.com/foxsheets.