# Scrape StatFox for Current and Historical Matchup Stats
###### BY: Jonathan Sims
###### CREATED: 2019-06-15
- USE: Scrape each statfox matchup page and save to S3 as pickled dataframe (~20kb each)

#### `Modules`

In [30]:
import sys
import os.path
from bs4 import BeautifulSoup
from urllib.request import urlopen
import boto3
import numpy as np
import pandas as pd 
import pickle
import codecs
from datetime import datetime

#### `Parameters`

In [21]:
file_i = 'gamelogs_2010_2018.csv.gz' 
file_l = '20200224.skr_rawhtml.2014.log'

#### `Functions`

In [8]:
def ToPickleS3(obj, bucketname, keyname):
    """Pickle dataframe and put to s3 bucket in site name folder
    i.e. 'statfox/'
    """
    s3 = boto3.client('s3')
    serializedListObject = pickle.dumps(obj)
    s3.put_object(Bucket=bucketname,Key=keyname,Body=serializedListObject)

#### Get list of games

In [28]:
glhead = pd.read_csv('GLHEADER.CSV',header=None)

gms = pd.read_csv(file_i,
                  header=0,
                  names=list(glhead[0]),
                  usecols=['date','team_h','team_v','score_h','score_v'])
# gms = gms[['date','team_h','team_v','score_h','score_v']]

teams = pd.read_csv('TEAM_NAMES.CSV',header=0,index_col=['name1'],usecols=['name1','name3'])
teams = teams['name3'].to_dict()

gms['team_h'] = gms['team_h'].map(lambda x: teams[x.upper()])
gms['team_v'] = gms['team_v'].map(lambda x: teams[x.upper()])

#### Only scrape 2014 to 2017

In [None]:
gms_year = gms['date']/10000
gms[(2014 <= gms_year) & (gms_year < 2018)]

#### Visit each page in games list, scrape, and send to s3 as pickle

In [10]:
for x in range(len(gms)):    
# for x in range(3):

    try:
        
        #### Parse date, team names, and score from games list
        
        dt = str(gms.loc[x,'date'])
        tm_h = str(gms.loc[x,'team_h']).replace(' ','')                                                              
        tm_v = str(gms.loc[x,'team_v']).replace(' ','')
        sc_h = gms.loc[x,'score_h']
        sc_v = gms.loc[x,'score_v']

        #### Adjust URL if second game of double header
        
        if (x > 1) and (str(gms.loc[x-1,'date']) == dt) and (str(gms.loc[x-1,'team_h']).replace(' ','') == tm_h):
            tm_h = tm_h+'2'

        #### URL to scrape
        
        url = 'http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g='+dt+tm_h+'&r=at'    

        #### Parse HTML
        
        html = urlopen(url)
        bs = BeautifulSoup(html.read(), 'html.parser')

        #### Get all tables from page
        
        nameList = bs.findAll('td', {'class':['matchupBorder']})

        #### Save each table to a dataframe and pickle
        
        namestr = str(nameList)
        df = pd.read_html(namestr)
        ToPickleS3(df, bucketname='scrapes-rawhtml-dev', keyname='statfox_DEV/'+dt+tm_h+'.pkl')

    #### Write exception to log and continue
    
    except Exception as exc:
        excstamp = datetime.now()+' - '+exc+' - 'url
        print(excstamp, file=open(file_l, 'a'))
        continue
        
    #### Print checkpoint for my OCD
    
    if x%100 == 0:
        print(datetime.now()+' - '+url)

http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20130401CHIWHITESOX&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20130408STLOUIS&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20130416MILWAUKEE&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20130424TAMPABAY&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20130501ARIZONA&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20130509TAMPABAY&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20130516STLOUIS&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20130524TORONTO&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20130531NYYANKEES&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20130607LADODGERS&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20130615HOUSTON&r=at
http://foxsheets.statfoxsports.com/foxsheets.aspx?s=mlb&g=20130622DETROIT&r=at
http://foxsheets.statfoxsports.com/foxsh