# Clean StatFox Matchup Scrapes
#### `Jonathan Sims 2020-03-13`
- USE: Get semi-parsed html for each statfox matchup page and clean into modeling input
    - Treat each matchup page as one obs in RF model to assess feature importance
    - Matchup data is pre-computed as of day of match, but needs lots of cleaning 
    - Includes opening and close money and total lines

In [4]:
### Set datetime stamp in YYYYMMDD for all file outputs

from datetime import date as dt
# Today's date as YYYYMMDD followed by a dot, used as a filename prefix
YMD = f"{dt.today():%Y%m%d}."

In [5]:
### Import all modules at top

import sys
import math
import boto3
import numpy as np
import pandas as pd 
import pickle
import codecs
import os.path
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.request import urlopen
from sklearn.impute import SimpleImputer

In [42]:
def FromPickleS3(bucketname, keyname):
    """Download one pickled object from S3 and return it unpickled.

    bucketname -- name of the S3 bucket to read from
    keyname    -- key of the pickled object inside that bucket

    NOTE: the downloaded bytes go straight into pickle.loads, which can run
    arbitrary code; only point this at buckets whose contents are trusted.
    """
    response = boto3.client('s3').get_object(Bucket=bucketname, Key=keyname)
    payload = response['Body'].read()
    return pickle.loads(payload)

In [43]:
def Flatten2DLabel(obj, row_label, col_label):
    """Flatten a labelled 2-D table into a single-row dataframe.

    obj       -- dataframe whose first rows/columns hold the labels
    row_label -- positions of the rows that hold the column-wise labels
    col_label -- labels of the columns that hold the row-wise labels

    Every data cell (rows after the last row_label, columns after the last
    col_label) becomes one output column named
    '<column-wise label>_<row-wise label>'. Multiple label rows/columns are
    joined with '_' and every non-word character is stripped from the labels.
    Duplicate output names are kept as separate columns.

    Returns a one-row dataframe, or an empty dataframe if there are no cells.
    """
    # Data starts immediately after the last label row / label column
    st_row = row_label[-1] + 1
    st_col = col_label[-1] + 1

    # Column-wise labels: element-wise join of the label rows
    cols = obj.iloc[row_label[0]][st_col:]
    for x in row_label[1:]:
        cols = cols + '_' + obj.iloc[x][st_col:]

    # Row-wise labels: element-wise join of the label columns
    rows = obj[st_row:][col_label[0]]
    for x in col_label[1:]:
        rows = rows + '_' + obj[st_row:][x]

    # Strip non alphanumeric. regex=True is explicit because the pandas
    # default changed to literal matching, which would leave labels untouched.
    colstrip = cols.str.replace(r'[^\w]', '', regex=True)
    rowstrip = rows.str.replace(r'[^\w]', '', regex=True)

    # Gather names and values first and build the frame once; concatenating
    # one cell at a time re-copies the growing frame on every step.
    names = []
    values = []
    for x in rowstrip.index:
        data_row = obj.iloc[x]
        for y in colstrip.index:
            names.append(str(colstrip[y]) + '_' + str(rowstrip[x]))
            values.append(data_row[y])

    if not names:
        return pd.DataFrame([])

    return pd.DataFrame([values], columns=names)

In [44]:
def CleanStatfoxScrape(obj, visitor=None, home=None):
    """Clean and shorten the feature names of a flattened statfox matchup.

    obj     -- dataframe of flattened sub-tables; its columns are renamed
               in place and the same object is returned
    visitor -- visiting team name as it appears in the column names;
               defaults to the module-level tm_v
    home    -- home team name as it appears in the column names;
               defaults to the module-level tm_h

    Team names become 'V_' / 'H_' and the long sub-table titles are
    shortened. All replacements are literal (regex=False) so the result does
    not depend on the pandas version's default or on characters in a name.
    """
    # Fall back to the notebook globals only when no name is passed in
    visitor = tm_v if visitor is None else visitor
    home = tm_h if home is None else home

    replacements = [
        # Team names to V or H
        (visitor, 'V_'),
        (home, 'H_'),
        # Shorten sub-table names
        ('CurrentSeasonPerformance', 'Overall'),
        ('TeamHittingandFieldingStatistics', 'HitField'),
        ('BullpenPitchingStatistics', 'Bullpen'),
    ]
    for old, new in replacements:
        obj.columns = obj.columns.str.replace(old, new, regex=False)

    return obj

In [45]:
def DuplicatedVarnames(df):
    """Return a dict of the variable names duplicated in a dataframe.

    df -- dataframe (or any iterable of names) to inspect

    Returns {name: number_of_occurrences} for every name that occurs more
    than once, ordered by first occurrence; empty dict if none repeat.
    """
    from collections import Counter

    # Single counting pass instead of rescanning the whole list per name
    counts = Counter(list(df))
    return {varname: n for varname, n in counts.items() if n > 1}

### Get list of games

In [48]:
# Column names for the matchup list are read from a separate header file
glhead = pd.read_csv('GLHEADER.CSV', header=None)

# Load the matchup list under those names and keep only the columns used
gms = pd.read_csv(
    '20200313.matchup_list.2010-2018.csv.gz',
    header=0,
    names=glhead[0],
    low_memory=False,
)[['date', 'team_h', 'team_v', 'score_h', 'score_v']]

# Lookup table name1 -> name3 built from the team names file
teams = pd.read_csv(
    'TEAM_NAMES.CSV',
    header=0,
    index_col=['name1'],
    usecols=['name1', 'name3'],
)['name3'].to_dict()

# Translate both team columns through the lookup (keys are upper-cased);
# a team missing from the lookup raises KeyError
gms['team_h'] = gms['team_h'].map(lambda name: teams[name.upper()])
gms['team_v'] = gms['team_v'].map(lambda name: teams[name.upper()])

### Get dataframe from s3 and unpickle
### Format and append each matchup together from s3

In [50]:
for x in gms.index[:2]:
    print(gms.iloc[x])

date        20100405
team_h     LA ANGELS
team_v     MINNESOTA
score_h            6
score_v            3
Name: 0, dtype: object
date            20100405
team_h     CHI WHITE SOX
team_v         CLEVELAND
score_h                6
score_v                0
Name: 1, dtype: object


In [10]:
# Collect one single-row frame per matchup and concatenate once after the
# loop; re-concatenating df_all on every iteration copies all earlier rows
# again, which makes the run progressively slower.
matchup_frames = []

# NOTE(review): file_l (path of the error log) is not defined in the visible
# cells - make sure it is set before running this cell.

for x in gms.index[20600:]:

    # Identifier written to the error log; replaced by date+team once parsed
    matchup_id = 'row '+str(x)

    try:

        ### Parse date, team names, and score from games list

        # game_dt rather than dt, so the `date as dt` import alias survives.
        # tm_h / tm_v stay module-level names: CleanStatfoxScrape reads them.
        game_dt = str(gms.loc[x,'date'])
        tm_h = str(gms.loc[x,'team_h']).replace(' ','')
        tm_v = str(gms.loc[x,'team_v']).replace(' ','')
        sc_h = gms.loc[x,'score_h']
        sc_v = gms.loc[x,'score_v']

        ### Adjust URL if second game of double header
        ### (x > 0: every row that has a previous row must be checked)

        if (x > 0) and (str(gms.loc[x-1,'date']) == game_dt) and (str(gms.loc[x-1,'team_h']).replace(' ','') == tm_h):
            tm_h_url = tm_h+'2'
        else:
            tm_h_url = tm_h

        matchup_id = game_dt+tm_h_url
        keyname = 'statfox_DEV/'+matchup_id+'.pkl'

        ### Create dataframe from s3 pickle

        df = FromPickleS3(bucketname='scrapes-rawhtml-dev', keyname=keyname)

        ### Show progress every 100 games

        if x%100 == 0:
            timestamp = str(datetime.now())
            print(timestamp, ' - ', keyname, ' - ',x)

        ### Parse and concatenate all features together, by matchup

        # 6:  Overall - board and line
        # 11: Away - Current Season Performance
        # 12: Away - Team Hitting and Fielding
        # 13: Away - Bullpen Pitching
        # 14: Home - Current Season Performance
        # 15: Home - Team Hitting and Fielding
        # 16: Home - Bullpen Pitching
        df6  = df[6]
        df11 = df[11]
        df12 = df[12]
        df13 = df[13]
        df14 = df[14]
        df15 = df[15]
        df16 = df[16]

        df6 = df6.transpose() # Make board and team labels first
        df6_wd = Flatten2DLabel(df6, [1], [0,1])

        df11_wd = Flatten2DLabel(df11, [0,1,2], [0])
        df14_wd = Flatten2DLabel(df14, [0,1,2], [0])

        df12_wd = Flatten2DLabel(df12, [0,1,2], [0])
        df15_wd = Flatten2DLabel(df15, [0,1,2], [0])

        df13_wd = Flatten2DLabel(df13, [0,1], [0])
        df16_wd = Flatten2DLabel(df16, [0,1], [0])

        ### Concatenate all sub-tables and clean labels

        df_wd_all = pd.concat([df6_wd, df11_wd, df12_wd, df13_wd, df14_wd, df15_wd, df16_wd], axis=1)
        df_wd_all = CleanStatfoxScrape(df_wd_all)

        ### Add matchup index value date-board_h-board_v

        board_h = df6[3].iloc[0]
        board_v = df6[2].iloc[0]
        df_wd_all.index = [int(game_dt)*1000000+int(board_h)*1000+int(board_v)]

        ### Add team names and final scores

        df_wd_all['tm_h'] = tm_h
        df_wd_all['tm_v'] = tm_v
        df_wd_all['sc_h'] = sc_h
        df_wd_all['sc_v'] = sc_v

        ### Drop columns duplicates (usually nan)
        ### filter(regex=...) removes every column whose name matches the
        ### duplicated name as a pattern, not only exact matches

        for dup in DuplicatedVarnames(df_wd_all):
            df_wd_all = df_wd_all.drop(list(df_wd_all.filter(regex=dup)), axis=1)

        ### Split features on hyphen if OU (over-under) or WL (win-loss)

        ou_cols = [col for col in df_wd_all.columns if 'OU' in col]

        for ou in ou_cols:
            xO = ou.replace('OU', 'O')
            xU = ou.replace('OU', 'U')
            df_wd_all[[xO,xU]] = df_wd_all[ou].str.split("-",expand=True)
            df_wd_all = df_wd_all.drop(ou, axis=1)

        wl_cols = [col for col in df_wd_all.columns if 'WL' in col]

        for wl in wl_cols:
            xW = wl.replace('WL', 'W')
            xL = wl.replace('WL', 'L')
            df_wd_all[[xW,xL]] = df_wd_all[wl].str.split("-",expand=True)
            df_wd_all = df_wd_all.drop(wl, axis=1)

        ### Create target variable (home win)

        df_wd_all['win_h'] = np.where(df_wd_all['sc_h'] > df_wd_all['sc_v'], 1, 0)

        ### Keep the feature row for the final concatenation

        matchup_frames.append(df_wd_all)

    ### Print exception to log and go to next iteration (URL)
    ### One handler only: Exception already covers IndexError

    except Exception as exc:
        timestamp = str(datetime.now())
        excstamp = timestamp+' - '+str(exc)+' - '+matchup_id
        # Context manager so the log handle is closed after every write
        with open(file_l, 'a') as log:
            print(excstamp, file=log)
        continue

### Combine feature rows

if matchup_frames:
    df_all = pd.concat(matchup_frames, ignore_index=False, sort=True)
else:
    df_all = pd.DataFrame()

2020-03-12 03:13:28.735535  -  statfox_DEV/20180625KANSASCITY.pkl  -  20600
2020-03-12 03:14:24.561005  -  statfox_DEV/20180703COLORADO.pkl  -  20700
2020-03-12 03:15:22.653832  -  statfox_DEV/20180710LAANGELS.pkl  -  20800
2020-03-12 03:16:23.348677  -  statfox_DEV/20180721LAANGELS.pkl  -  20900
2020-03-12 03:17:26.065971  -  statfox_DEV/20180728BOSTON.pkl  -  21000
2020-03-12 03:18:31.321667  -  statfox_DEV/20180805LADODGERS.pkl  -  21100
2020-03-12 03:19:39.274568  -  statfox_DEV/20180812LAANGELS.pkl  -  21200
2020-03-12 03:20:52.291550  -  statfox_DEV/20180819OAKLAND.pkl  -  21300
2020-03-12 03:22:06.514873  -  statfox_DEV/20180827SANFRANCISCO.pkl  -  21400
2020-03-12 03:23:23.910832  -  statfox_DEV/20180903OAKLAND.pkl  -  21500
2020-03-12 03:24:43.069145  -  statfox_DEV/20180911BOSTON.pkl  -  21600
2020-03-12 03:26:06.577939  -  statfox_DEV/20180918OAKLAND.pkl  -  21700
2020-03-12 03:27:34.141391  -  statfox_DEV/20180926SANFRANCISCO.pkl  -  21800


### Save cleaned to gzip tsv

In [11]:
df_all.shape

(1267, 581)

In [12]:
df_all.to_csv(YMD+'statfox_clean_scrapes.2010-2018.csv.gz', sep='\t')

In [6]:
file_in1 = '20200310.skr_statfox.2010-2018.0.7400.tsv.gz'
file_in2 = '20200310.skr_statfox.2010-2018.7400.10000.tsv.gz'
file_in3 = '20200310.skr_statfox.2010-2018.10000.12000.tsv.gz'
file_in4 = '20200310.skr_statfox.2010-2018.12000.16000.tsv.gz'
file_in5 = '20200310.skr_statfox.2010-2018.16000.20600.tsv.gz'
file_in6 = '20200310.skr_statfox.2010-2018.20600.tsv.gz'

### Get statfox_clean_scrapes from s3 since not local anymore

In [None]:
s3 = boto3.client('s3')
s3.list_objects(Bucket='scrapes-rawhtml-dev')

In [None]:
# download_file takes (Bucket, Key, Filename) - unlike upload_file, whose
# first positional argument is the local filename. Passing the local path
# positionally binds it to Bucket and clashes with Bucket=, so every argument
# is given by keyword.
s3.download_file(Bucket='scrapes-rawhtml-dev',
                 Key='statfox/'+YMD+'statfox_features.tsv.gz',
                 Filename=YMD+'statfox_features.tsv.gz')

### Import features, moneylines, target

In [7]:
### Import chunks and append

# Read every tab-separated chunk, in order, into df1..df6
df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(chunk_path, sep='\t', low_memory=False)
    for chunk_path in (file_in1, file_in2, file_in3, file_in4, file_in5, file_in6)
)

# Stack the chunks; original row labels are kept and columns are sorted
df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=False, sort=True)

FileNotFoundError: [Errno 2] No such file or directory: '20200310.skr_statfox.2010-2018.0.7400.tsv.gz'

In [None]:
### Clear useless dataframes

# Rebind each chunk name to its own empty list so the chunk frames can be freed
df1, df2, df3, df4, df5, df6 = [], [], [], [], [], []

In [None]:
### Give proper name to the match index

# Copy the unnamed first column to a new last column 'matchidx', then drop
# the unnamed original
df = df.assign(matchidx=df['Unnamed: 0']).drop(columns='Unnamed: 0')

In [None]:
df.shape

In [None]:
df.tail()

### Dedupe on matchup index (YYYYMMDDHHHVVV)

In [None]:
df = df.drop_duplicates(['matchidx'])

In [None]:
df.shape

### Add year and month variable

In [None]:
def get_month(x):
    """Return the month of a game as a string (no zero padding).

    x is a match index laid out as YYYYMMDDHHHVVV: dropping the last eight
    digits leaves YYYYMM, and that value modulo 100 is the month.
    """
    year_month = math.floor(x / 10**8)
    return str(year_month % 100)

def get_year(x):
    """Return the year of a game as a string.

    x is a match index laid out as YYYYMMDDHHHVVV: dropping the last ten
    digits leaves the four-digit year.
    """
    return str(math.floor(x / 10**10))

In [None]:
df['month'] = df['matchidx'].apply(get_month)
df['year'] = df['matchidx'].apply(get_year)

In [None]:
df.shape

In [None]:
df[['matchidx','year','month']].head()

### Clean up OU Totals, OSB, and DP features
- This should really be in skr_statfox_matchups.ipynb 

In [None]:
### Split each '_Total' column on the comma into a '_Tot' and a '_TotLn'
### column, then drop the "Ov " / "Un " text from the '_Tot' part

ou_cols = [name for name in df.columns if '_Total' in name]

for total_col in ou_cols:
    tot_name = total_col.replace('_Total', '_Tot')
    line_name = total_col.replace('_Total', '_TotLn')

    df[[tot_name, line_name]] = df[total_col].str.split(",", expand=True)
    df[tot_name] = df[tot_name].str.replace("Ov ","").str.replace("Un ","")
    df = df.drop(columns=total_col)

# Preview every column that now carries the '_Tot' marker
new_ou_cols = [name for name in df.columns if '_Tot' in name]
df[new_ou_cols].head()

In [None]:
### Remove parentheses from the OSB and DP columns

osb_cols = [name for name in df.columns if '_OSB_' in name]
dp_cols = [name for name in df.columns if '_DP_' in name]
paren_cols = osb_cols + dp_cols

for paren_col in paren_cols:
    without_open = df[paren_col].str.replace("(","")
    df[paren_col] = without_open
    df[paren_col] = df[paren_col].str.replace(")","")

df[paren_cols].head()

In [None]:
### Strip percent signs and spaces from every '_Pct_' column

perc = [name for name in df.columns if '_Pct_' in name]

for pct_col in perc:
    df[pct_col] = df[pct_col].str.replace("%","").str.replace(" ","")

df[perc].head()

### Separate features, targets, and misc

In [None]:
# Column groups to separate from the features
lines = [name for name in df.columns if 'Latest_Line' in name]
teams = ['tm_h', 'tm_v']   # defined here but not part of the drop list
scores = ['sc_h','sc_v']
feats = ['win_h']

# Everything removed from the feature frame: latest lines, scores, target
drop = [*lines, *scores, *feats]
drop

In [None]:
# Features: everything except the drop list, renumbered from zero
df_feat = df.drop(columns=drop).reset_index(drop=True)

# Target and home moneylines: coerced to numeric (unparseable -> NaN) and
# renumbered from zero
df_targ, df_openline, df_lateline = (
    pd.to_numeric(df[name], errors='coerce').reset_index(drop=True)
    for name in ('win_h', 'H__Opening_Line', 'H__Latest_Line')
)

### Final Clean
- Try to convert object to numeric
- If except: convert object to binary dummies

In [None]:
def preprocess_tonumeric(x, coercecols):
    """Clean features to get everything numeric.

    x          -- dataframe of features
    coercecols -- object columns that must become numeric; values that
                  cannot be parsed turn into NaN

    Non-object columns are kept as they are. Every other object column is
    converted with pd.to_numeric when all of its values parse, otherwise it
    is replaced by 0/1 dummy columns prefixed with the column name.

    Returns a new dataframe: the non-object columns in their original order
    followed by the converted columns in their original order. If a column
    name occurs more than once only the first is kept.

    Raises ValueError if a name in coercecols is not a column of x.
    """
    missing = [col for col in coercecols if col not in x.columns]
    if missing:
        raise ValueError('coercecols not found in dataframe: '
                         + ', '.join(str(col) for col in missing))

    object_mask = (x.dtypes == object).values

    # Start from the untouched columns and append converted pieces, then
    # concatenate once instead of once per column
    pieces = [x.loc[:, ~object_mask]]

    for (col, col_data), is_object in zip(x.items(), object_mask):
        if not is_object:
            continue

        if col in coercecols:
            pieces.append(pd.to_numeric(col_data, errors='coerce'))
            continue

        try:
            pieces.append(pd.to_numeric(col_data))
        except (ValueError, TypeError):
            # Not numeric: one 0/1 column per distinct value. dtype is fixed
            # so the dummies are integers whatever the pandas default is.
            pieces.append(pd.get_dummies(col_data, prefix=col, dtype='uint8'))

    out = pd.concat(pieces, axis=1)

    ### Keep first of all columns then drop duplicates

    return out.loc[:, ~out.columns.duplicated()]

In [None]:
def preprocess_nan(z, fill_strategy='mean'):
    """Process NaNs in a dataframe to prepare for RF or other model.

    1) Adds a 0/1 indicator column '<col>_NaN' for each existing column
       that holds at least one NaN
    2) Fills NaN in every column with the chosen SimpleImputer strategy

    z             -- dataframe of numeric features; it is not modified
    fill_strategy -- value passed to SimpleImputer(strategy=...),
                     'mean' by default

    Returns a new dataframe built from the imputer output, so the row index
    is renumbered from zero.

    Raises ValueError if the imputer returns fewer columns than it was given
    (NOTE(review): presumably because a column is entirely NaN - confirm
    against the installed scikit-learn version).
    """
    indicators = {
        col + '_NaN': z[col].isnull().astype(int)
        for col in z.columns
        if z[col].isnull().values.any()
    }
    # assign() returns a new frame, leaving the caller's dataframe untouched
    z = z.assign(**indicators)

    imp = SimpleImputer(missing_values=np.nan, strategy=fill_strategy)
    filled = imp.fit_transform(z)

    # Fail with a readable message instead of a shape mismatch when the
    # imputer output no longer lines up with the column names
    if filled.shape[1] != z.shape[1]:
        empty = [str(col) for col in z.columns if z[col].isnull().values.all()]
        raise ValueError('imputer returned %d of %d columns; all-NaN columns: %s'
                         % (filled.shape[1], z.shape[1], ', '.join(empty)))

    x_out = pd.DataFrame(filled, columns=z.columns)

    return x_out

In [None]:
df_feat_num = preprocess_tonumeric(df_feat, coercecols=['H__Opening_Line', 'V__Opening_Line'])

In [None]:
df_feat_num[:2]

### Convert open and close moneylines to probability

In [None]:
def american_to_probability(x):
    """Turn American +/- odds into an implied probability, 0 to 1 exclusive.

    x -- moneyline as a number: below -99 for a favourite, 100 or more for
         an underdog

    Returns abs(x) / (100 + abs(x)) for a favourite and 100 / (100 + x) for
    an underdog.

    Returns NaN when x is None, NaN, or between -99 and 100, which is not a
    valid line. NaN (rather than a sentinel such as 0) is what
    Series.apply already produced for these inputs, so missing lines stay
    visible to later NaN handling and cannot be mistaken for a probability.
    """
    if x is None or math.isnan(x):
        return float('nan')

    if x < -99:
        num = abs(x)
        return num / (100 + num)

    if x >= 100:
        return 100 / (100 + x)

    return float('nan')

### Convert moneylines to probabilities

In [None]:
### Implied probabilities for the latest and the opening home moneylines

df_lateline_prob, df_openline_prob = (
    odds.apply(american_to_probability) for odds in (df_lateline, df_openline)
)

In [None]:
### Add a '<column>Pr' implied-probability column for each opening line

lines = ['H__Opening_Line', 'V__Opening_Line', 'H__Opening_TotLn', 'V__Opening_TotLn']

for odds_col in lines:
    df_feat_num[odds_col + 'Pr'] = df_feat_num[odds_col].apply(american_to_probability)

In [None]:
### Check for any Opening lines (should have probabilities created *Pr*)

df_feat_num[[col for col in df_feat_num.columns if 'Opening' in col]].dtypes

### Create sample weighting based on payouts

In [None]:
def probability_to_weight(x):
    """Return the payout weight of a correct prediction for one matchup.

    x -- row (or mapping) with 'win_h', 'H__Opening_LinePr' and
         'V__Opening_LinePr'

    The weight is the reciprocal of the opening-line probability of the side
    that won: 1 / home probability when win_h is 1, 1 / visitor probability
    when win_h is 0, NaN for any other win_h. It stands in for the payout of
    a $1.00 bet, the loss always being the $1.00 stake.
    """
    outcome = x['win_h']
    home_prob = x['H__Opening_LinePr']
    visitor_prob = x['V__Opening_LinePr']

    if outcome == 0:
        return 1 / visitor_prob
    if outcome == 1:
        return 1 / home_prob
    return np.nan

In [None]:
# win_h is in the drop list used to build the feature frame, so it has to
# come from df_targ (a Series named 'win_h').
# NOTE(review): assumes df_targ and df_feat_num still share the same
# zero-based row index - confirm before trusting the alignment.
test = pd.concat([df_targ, df_feat_num[['H__Opening_LinePr', 'V__Opening_LinePr']]], axis=1)

# axis=1 puts the weight next to its inputs; the default axis=0 would stack
# the weights underneath the frame as extra rows
pd.concat([test, test.apply(probability_to_weight, axis=1).rename('weight')], axis=1)

### Impute missing values and create dummy for missing

In [190]:
df_feat_fill = preprocess_nan(df_feat_num)
df_feat_fill[:2]

Unnamed: 0,H_Bullpen_BB_AllGames,H_Bullpen_BB_HomeGames,H_Bullpen_BSV_AllGames,H_Bullpen_BSV_HomeGames,H_Bullpen_ERA_AllGames,H_Bullpen_ERA_HomeGames,H_Bullpen_ER_AllGames,H_Bullpen_ER_HomeGames,H_Bullpen_HR_AllGames,H_Bullpen_HR_HomeGames,...,H__Opening_Tot_NaN,H__Opening_TotLn_NaN,V__Latest_Tot_NaN,V__Latest_TotLn_NaN,V__Opening_Tot_NaN,V__Opening_TotLn_NaN,H__Opening_LinePr_NaN,V__Opening_LinePr_NaN,H__Opening_TotLnPr_NaN,V__Opening_TotLnPr_NaN
0,228.0,117.0,17.0,9.0,4.42,4.69,253.0,138.0,56.0,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,190.0,107.0,18.0,8.0,4.06,4.41,212.0,126.0,49.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [191]:
### Check for any Closing lines (should be none)

df_feat_fill[[col for col in df_feat_fill.columns if 'Closing' in col]].dtypes

Series([], dtype: object)

In [192]:
### Check for any object cols (should be none)
### items() replaces iteritems(), which newer pandas no longer provides

for col, col_data in df_feat_fill.items():
    if col_data.dtype == object:
        print(col)

In [193]:
### Print the name of every column that still holds a NaN (should be none)

for name in df_feat_fill.columns:
    still_missing = df_feat_fill[name].isna().any()
    if still_missing:
        print(name)

### Export to tsv.gz then s3

In [194]:
df_feat_fill.to_csv(YMD+'statfox_features.tsv.gz', sep='\t')
df_targ.to_csv(YMD+'statfox_target.tsv.gz', sep='\t', header=False)
df_lateline_prob.to_csv(YMD+'statfox_lateline_prob.tsv.gz', sep='\t', header=False)

In [195]:
s3 = boto3.client('s3')

# Upload each dated export to the same bucket under the 'statfox/' prefix
for export_name in ('statfox_features.tsv.gz',
                    'statfox_target.tsv.gz',
                    'statfox_lateline_prob.tsv.gz'):
    s3.upload_file(YMD+export_name,
                   Bucket='scrapes-rawhtml-dev',
                   Key='statfox/'+YMD+export_name)

In [196]:
df_feat_fill.shape

(17573, 1127)

In [197]:
df_targ.shape

(17573,)

In [198]:
df_targ?

[0;31mType:[0m        Series
[0;31mString form:[0m
0        1
           1        1
           2        0
           3        0
           4        1
           5        1
           6        1
           7        0
           8         <...> 0
           17568    1
           17569    1
           17570    1
           17571    0
           17572    1
           Name: win_h, Length: 17573, dtype: int64
[0;31mLength:[0m      17573
[0;31mFile:[0m        ~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/series.py
[0;31mDocstring:[0m  
One-dimensional ndarray with axis labels (including time series).

Labels need not be unique but must be a hashable type. The object
supports both integer- and label-based indexing and provides a host of
methods for performing operations involving the index. Statistical
methods from ndarray have been overridden to automatically exclude
missing data (currently represented as NaN).

Operations between Series (+, -, /, *, **) align val