In [None]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from scipy import stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, plot_roc_curve, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import export_graphviz

import pickle

import requests
from bs4 import BeautifulSoup as bs
from bs4 import Comment

import regex
import datetime

from tqdm import tqdm

# Data Uploading & Cleaning

## Upload & Clean Odds Data

The 2020 season is dropped because of its unusual, shortened schedule; the 2021 season is dropped because it was not yet complete.

In [None]:
# Seasons to load, newest first, as strings so they can be spliced into file names.
years=[str(season) for season in range(2019,2014,-1)]
# Per-season frames keyed by the season string; populated by the loading cell.
data=dict()

In [None]:
# Read one Excel workbook per season into `data`, keyed by the season string.
for season in years:
    workbook='../Raw Data/'+season+'.xlsx'
    data[season]=pd.read_excel(workbook)

In [None]:
# Clean each season's frame: drop unused columns, derive year/month/date from the
# integer `Date` column (month*100 + day), and normalise the odds column names.
for yr,df in data.items():
    df_clean=df.drop(['Rot','1st','2nd','3rd','4th','5th','6th','7th','8th','9th','Open','Close','RunLine','Run Line','Unnamed: 18'],axis=1,errors='ignore')
    df_clean['year']=int(yr)
    # Integer division gives the month directly; no float rounding is involved.
    df_clean['month']=df_clean.Date//100
    # Assemble the timestamp column in one vectorised call instead of a row-wise apply.
    df_clean['date']=pd.to_datetime(pd.DataFrame({'year':df_clean.year,
                                                  'month':df_clean.month,
                                                  'day':df_clean.Date%100}))
    df_clean=df_clean.drop(['Date'],axis=1)
    data[yr]=df_clean.rename({'Open OU':'OpenOU','Close OU':'CloseOU','Unnamed: 20':'Open_Odds','Unnamed: 22':'Close_Odds'},axis=1)

In [None]:
# Stack the per-season frames into one frame with a fresh 0..n-1 index.
all_data=pd.concat(data.values(),ignore_index=True)

In [None]:
# Remove every "-X" fragment (hyphen followed by one capital letter) from pitcher names.
# presumably a handedness marker such as "-L"/"-R" — TODO confirm against the raw data.
patt=r'\-[A-Z]{1}'
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave the suffix in place.
all_data.Pitcher=all_data.Pitcher.str.replace(patt,'',regex=True)

Checking team labels in the odds data, note a couple of issues to fix.

In [None]:
# Count rows per team label to spot inconsistent codes.
all_data.Team.value_counts()

First, there is some inconsistent labeling (e.g., LOS vs. LA).

In [None]:
# Preview the label counts after merging alternate spellings (does not modify all_data).
all_data.Team.replace({'CUB':'CHC','LOS':'LAD','SFG':'SFO','BRS':'BOS'}).value_counts()

In [None]:
# Collapse alternate spellings of the same club onto a single code.
team_aliases={'CUB':'CHC','LOS':'LAD','SFG':'SFO','BRS':'BOS'}
all_data['Team']=all_data['Team'].replace(team_aliases)

Second, need to adjust labels to be consistent with official labels.  

In [None]:
# Translate the odds-file codes into the codes used by the schedule data merged later on.
official_codes={'KAN':'KCR','TAM':'TBR','SDG':'SDP','WAS':'WSN','SFO':'SFG','CWS':'CHW'}
all_data['Team']=all_data['Team'].replace(official_codes)

In [None]:
# Display the frame after relabelling teams.
all_data

Next, looking at the Home/Away labels, there are four rows (two games) that have neither.  Upon further inspection, these come from games played outside of the US.  Links:

1) https://www.baseball-reference.com/boxes/OAK/OAK201903200.shtml

2) https://www.baseball-reference.com/boxes/OAK/OAK201903210.shtml

In [None]:
# Count rows per visitor/home flag.
all_data.VH.value_counts()

In [None]:
# Show the rows whose flag is 'N' (neither 'V' nor 'H').
all_data[all_data.VH=='N']

In [None]:
# Manually flag rows 0-3 as alternating visitor/home pairs.
# These row labels are hard-coded and depend on the load order of the seasons.
for row_label,flag in zip(range(4),'VHVH'):
    all_data.at[row_label,'VH']=flag

In [None]:
# Check the first rows after the manual flag fix.
all_data.head()

In [None]:
# Re-count the flags to confirm the fix.
all_data.VH.value_counts()

Next, need to clean up a single observation with missing run data.  Link:

https://www.baseball-reference.com/boxes/PIT/PIT201606071.shtml

Convert column into integers.

In [None]:
# Tabulate the final-score values to spot non-numeric entries.
all_data.Final.value_counts()

In [None]:
# Show the rows whose final score is the placeholder string 'NL'.
all_data[all_data.Final=='NL']

In [None]:
# Overwrite the final score on the two hard-coded row labels.
# NOTE(review): the schedule data scraped later in this notebook lists both 2016-06-07 PIT
# games as away 1, home 3; if these two labels are the visitor/home rows of one game,
# one of them should presumably be 1 rather than 3 — verify against the box score.
for row_label in (16276,16277):
    all_data.at[row_label,'Final']=3

In [None]:
# Re-tabulate the final scores after the manual fix.
all_data.Final.value_counts()

In [None]:
# With the placeholder gone, the final scores can be stored as integers.
all_data['Final']=all_data['Final'].astype(int)

Next looking @ the target variable, there are some outrageous values that cannot be explained.  Since there is no other resource to check for the correct value, replace the line @ close with the line @ open.

Finally, convert the column into float.

In [None]:
# Tabulate the closing over/under lines to spot implausible values.
all_data.CloseOU.value_counts()

In [None]:
# Show the rows whose closing line exceeds 20.
all_data[all_data.CloseOU > 20]

In [None]:
def _closing_total(game_row):
    """Return the row's closing line, falling back to the opening line when it exceeds 20."""
    if game_row.CloseOU>20:
        return game_row.OpenOU
    return game_row.CloseOU

all_data.CloseOU=all_data.apply(_closing_total,axis=1)

In [None]:
# Re-tabulate the closing lines after replacing the outliers.
all_data.CloseOU.value_counts()

In [None]:
# Store the closing line as a float column.
all_data['CloseOU']=all_data['CloseOU'].astype(float)

The original data is a bit confusing.  For each game, there are two rows in the dataset: one containing data for the Road team and one for the Home Team.  However, the over-under variable in both rows refers to the total runs scored.  

As a result, need to consolidate each row-pair into a single row with both Home and Away data.  Because the paired rows are adjacent, it is easy to combine.  This was done for each row--even though it adds significant execution time--and removed duplicates to be safe.

In [None]:
# Give every row both sides of its game.  A game is stored as two adjacent rows:
# the visitor row ('V') immediately followed by the home row ('H').  A visitor row
# takes the home values from the next row; a home row takes the visitor values
# from the previous row.  Rows with any other flag keep the blank defaults ('' / 0).
#
# The pairing is positional, so it relies on the default 0..n-1 index.
assert all_data.index.equals(pd.RangeIndex(len(all_data))),'expected a default 0..n-1 index'

is_visitor=all_data.VH.eq('V')
is_home=all_data.VH.eq('H')
row_after=all_data[['Pitcher','Team','Final']].shift(-1)   # partner of a visitor row
row_before=all_data[['Pitcher','Team','Final']].shift(1)   # partner of a home row

for source,label,blank in (('Pitcher','Pitcher',''),('Team','Team',''),('Final','Score',0)):
    own=all_data[source]
    all_data['Home_'+label]=own.where(is_home,row_after[source].where(is_visitor,blank))
    all_data['Away_'+label]=own.where(is_visitor,row_before[source].where(is_home,blank))

# shift() introduces NaN at the frame edges and upcasts the scores to float; cast back.
# This raises if a visitor row has no following row or a home row has no preceding row.
all_data=all_data.astype({'Home_Score':'int64','Away_Score':'int64'})
all_data

In [None]:
# Both rows of a game now carry the same pitchers and date, so keep only the first of each pair.
game_identity=['Away_Pitcher','Home_Pitcher','date']
all_data=all_data.drop_duplicates(subset=game_identity,ignore_index=True)
all_data

At this point, it makes sense to start dropping some unneeded columns.

In [None]:
# The per-row source columns are redundant now that each row holds both sides of the game.
row_level_columns=['VH','Team','Pitcher','Final','OpenOU','Open_Odds']
all_data=all_data.drop(columns=row_level_columns)
all_data

Finally, create some additional features:  Total Runs; OVER Dummy Variable; and a key value.

To build the key value, start with date + Home Team (later data sources will use a similar key).  However, because of double-headers, this does not create unique key values.  Next, tried date combined with the runs scored by the home and away team.  This works well with one exception where both games in a double header ended in the same score.  These observations' key-values were manually adjusted.

In [None]:
# Total runs scored in the game.
all_data['Total_Runs']=all_data['Home_Score']+all_data['Away_Score']
# OVER is True only when the total strictly exceeds the closing line.
# A total equal to the line (a push) is therefore labelled False.
all_data['OVER']=all_data['Total_Runs'].gt(all_data['CloseOU'])
all_data

In [None]:
# First-pass key: "<YYYY-MM-DD>-" followed by the home team code.
date_prefix=all_data.date.apply(lambda d:str(d).replace(' 00:00:00','-'))
all_data['KEY']=date_prefix+all_data.Home_Team

In [None]:
# List every row whose key is shared with at least one other row.
all_data[all_data.KEY.duplicated(keep=False)]

In [None]:
# Extend the key with the home score followed by the away score, then list what still collides.
score_suffix=all_data.Home_Score.astype(str)+all_data.Away_Score.astype(str)
all_data.KEY=all_data.KEY+score_suffix
all_data[all_data.KEY.duplicated(keep=False)]

In [None]:
# NOTE(review): `x = x = 'A'` is a chained assignment, so each KEY is overwritten with the bare
# string 'A' / 'B' rather than having a suffix appended.  The schedule-side keys (games_meta)
# are overwritten with the same bare 'A' / 'B' further down, so the merge on KEY still pairs
# these rows; changing only one side would break that match.
# TODO confirm that 'A' and 'B' refer to the same doubleheader game on both sides.
all_data.at[8147,'KEY']=all_data.at[8147,'KEY']='A'
all_data.at[8162,'KEY']=all_data.at[8162,'KEY']='B'

In [None]:
# Display the finished odds frame.
all_data

In [None]:
# Persist the cleaned odds frame; it is read back as `odds_data` in the next section.
all_data.to_pickle('../Base DFs/Odds_Data.pkl')

In [None]:
# Release this section's working variables but keep the imported modules.
# A blanket `%reset -f` also removes pd, requests, bs, regex and tqdm, so the
# cells below would fail with NameError on a fresh Restart & Run All.
for _stale in ('years','data','yr','df','df_clean','all_data','patt','i','row'):
    globals().pop(_stale,None)
del _stale

## Upload information on each game in the dataset.

In [2]:
# Map the first comma-separated field of each line in Teams.txt to the second.
# Only the keys are used below, as team codes in schedule URLs.
TEAMS={}
with open('../Teams.txt') as f:
    for r in f:
        # Strip whitespace so values do not keep the trailing newline.
        t=[field.strip() for field in r.split(',')]
        # Skip blank or malformed lines instead of failing with IndexError.
        if len(t)<2:
            continue
        TEAMS[t[0]]=t[1]

YRs=['2019','2018','2017','2016','2015']
# The pickle is produced by the odds-cleaning section above; only load pickles you trust.
odds_data=pd.read_pickle('../Base DFs/Odds_Data.pkl')

On the baseball-reference website, there is a page with the entire year schedule for all teams.  

In [3]:
# Schedule entry text: "<away name> (<away runs>) @ <home name> (<home runs>)".
s_patt=r'([A-Za-z .\']+) \(([0-9]{1,2})\)\s{0,2}\@ ([A-Za-z .\']+) \(([0-9]{1,2})\)'
# Box-score link: three capital letters, an 8-digit date (captured), one more digit, ".shtml".
d_patt=r'[A-Z]{3}([0-9]{8})[0-9]{1}\.shtml'

s_rgx=regex.compile(s_patt)
d_rgx=regex.compile(d_patt)

In [4]:
# Scrape each season's league schedule page into a list with one dict per game:
# team links, scores, box-score link and game date.
game_data=[]
for yr in YRs:
    link='https://www.baseball-reference.com/leagues/MLB/'+yr+'-schedule.shtml'
    r=requests.get(link)
    # Fail here on an HTTP error instead of later with a confusing "NoneType" error.
    r.raise_for_status()
    soup=bs(r.content)
    table=soup.find('span',attrs={'data-label':'MLB Schedule'}).parent.find_next_sibling('div')
    games=table.find_all('p',attrs={'class':'game'})
    for g in tqdm(games):
        s=' '.join(g.stripped_strings)
        score_match=s_rgx.search(s)
        if score_match is None:
            raise ValueError('Could not parse teams/scores from schedule entry: '+repr(s))
        info=score_match.groups()
        links=g.find_all('a')
        # The last anchor of an entry is the box-score link; the first two are the team pages.
        l=links[-1]['href']
        date_match=d_rgx.search(l)
        if date_match is None:
            raise ValueError('Could not parse a date from box-score link: '+repr(l))
        d={'Away_Team':links[0]['href'],
           'Home_Team':links[1]['href'],
           'Away_Score':info[1],
           'Home_Score':info[3],
           'link':l,
           'date':pd.to_datetime(date_match.groups()[0],format='%Y%m%d')
          }
        game_data.append(d)

100%|██████████| 2429/2429 [00:00<00:00, 6110.53it/s]
100%|██████████| 2431/2431 [00:00<00:00, 6637.94it/s]
100%|██████████| 2430/2430 [00:00<00:00, 6692.91it/s]
100%|██████████| 2428/2428 [00:00<00:00, 6596.37it/s]
100%|██████████| 2429/2429 [00:00<00:00, 6531.42it/s]


In [5]:
# Number of games scraped across all seasons.
len(game_data)

12147

In [6]:
# Turn the list of per-game dicts into a frame (one row per game).
games_meta=pd.DataFrame(game_data)
games_meta

Unnamed: 0,Away_Team,Home_Team,Away_Score,Home_Score,link,date
0,/teams/SEA/2019.shtml,/teams/OAK/2019.shtml,9,7,/boxes/OAK/OAK201903200.shtml,2019-03-20
1,/teams/SEA/2019.shtml,/teams/OAK/2019.shtml,5,4,/boxes/OAK/OAK201903210.shtml,2019-03-21
2,/teams/PIT/2019.shtml,/teams/CIN/2019.shtml,3,5,/boxes/CIN/CIN201903280.shtml,2019-03-28
3,/teams/CHW/2019.shtml,/teams/KCR/2019.shtml,3,5,/boxes/KCA/KCA201903280.shtml,2019-03-28
4,/teams/ARI/2019.shtml,/teams/LAD/2019.shtml,5,12,/boxes/LAN/LAN201903280.shtml,2019-03-28
...,...,...,...,...,...,...
12142,/teams/CIN/2015.shtml,/teams/PIT/2015.shtml,0,4,/boxes/PIT/PIT201510040.shtml,2015-10-04
12143,/teams/OAK/2015.shtml,/teams/SEA/2015.shtml,2,3,/boxes/SEA/SEA201510040.shtml,2015-10-04
12144,/teams/COL/2015.shtml,/teams/SFG/2015.shtml,7,3,/boxes/SFN/SFN201510040.shtml,2015-10-04
12145,/teams/TOR/2015.shtml,/teams/TBR/2015.shtml,3,12,/boxes/TBA/TBA201510040.shtml,2015-10-04


In [7]:
# Reduce each team link ("/teams/XXX/YYYY.shtml") to its three-letter code.
patt=r'\/teams\/([A-Z]{3})\/[0-9]{4}\.shtml'
for side in ('Away_Team','Home_Team'):
    games_meta[side]=games_meta[side].str.extract(patt,expand=False)
games_meta

Unnamed: 0,Away_Team,Home_Team,Away_Score,Home_Score,link,date
0,SEA,OAK,9,7,/boxes/OAK/OAK201903200.shtml,2019-03-20
1,SEA,OAK,5,4,/boxes/OAK/OAK201903210.shtml,2019-03-21
2,PIT,CIN,3,5,/boxes/CIN/CIN201903280.shtml,2019-03-28
3,CHW,KCR,3,5,/boxes/KCA/KCA201903280.shtml,2019-03-28
4,ARI,LAD,5,12,/boxes/LAN/LAN201903280.shtml,2019-03-28
...,...,...,...,...,...,...
12142,CIN,PIT,0,4,/boxes/PIT/PIT201510040.shtml,2015-10-04
12143,OAK,SEA,2,3,/boxes/SEA/SEA201510040.shtml,2015-10-04
12144,COL,SFG,7,3,/boxes/SFN/SFN201510040.shtml,2015-10-04
12145,TOR,TBR,3,12,/boxes/TBA/TBA201510040.shtml,2015-10-04


In [8]:
# First-pass key, built the same way as for the odds data: "<YYYY-MM-DD>-" + home team code.
date_prefix=games_meta.date.apply(lambda d:str(d).replace(' 00:00:00','-'))
games_meta['KEY']=date_prefix+games_meta.Home_Team
# Count the rows whose key is not unique.
int(games_meta.KEY.duplicated(keep=False).sum())

278

In [9]:
# Extend the key with the home score followed by the away score, then list what still collides.
score_suffix=games_meta.Home_Score.astype(str)+games_meta.Away_Score.astype(str)
games_meta.KEY=games_meta.KEY+score_suffix
games_meta[games_meta.KEY.duplicated(keep=False)]

Unnamed: 0,Away_Team,Home_Team,Away_Score,Home_Score,link,date,KEY
8158,NYM,PIT,1,3,/boxes/PIT/PIT201606071.shtml,2016-06-07,2016-06-07-PIT31
8159,NYM,PIT,1,3,/boxes/PIT/PIT201606072.shtml,2016-06-07,2016-06-07-PIT31


In [10]:
# NOTE(review): `x = x = 'A'` is a chained assignment, so each KEY is overwritten with the bare
# string 'A' / 'B' rather than having a suffix appended.  The odds-side keys were overwritten
# with the same bare 'A' / 'B' earlier, so the merge on KEY still pairs these rows; changing
# only one side would break that match.
# TODO confirm that 'A' and 'B' refer to the same doubleheader game on both sides.
games_meta.at[8158,'KEY']=games_meta.at[8158,'KEY']='A'
games_meta.at[8159,'KEY']=games_meta.at[8159,'KEY']='B'

In [11]:
# Display the schedule frame with its final keys.
games_meta

Unnamed: 0,Away_Team,Home_Team,Away_Score,Home_Score,link,date,KEY
0,SEA,OAK,9,7,/boxes/OAK/OAK201903200.shtml,2019-03-20,2019-03-20-OAK79
1,SEA,OAK,5,4,/boxes/OAK/OAK201903210.shtml,2019-03-21,2019-03-21-OAK45
2,PIT,CIN,3,5,/boxes/CIN/CIN201903280.shtml,2019-03-28,2019-03-28-CIN53
3,CHW,KCR,3,5,/boxes/KCA/KCA201903280.shtml,2019-03-28,2019-03-28-KCR53
4,ARI,LAD,5,12,/boxes/LAN/LAN201903280.shtml,2019-03-28,2019-03-28-LAD125
...,...,...,...,...,...,...,...
12142,CIN,PIT,0,4,/boxes/PIT/PIT201510040.shtml,2015-10-04,2015-10-04-PIT40
12143,OAK,SEA,2,3,/boxes/SEA/SEA201510040.shtml,2015-10-04,2015-10-04-SEA32
12144,COL,SFG,7,3,/boxes/SFN/SFN201510040.shtml,2015-10-04,2015-10-04-SFG37
12145,TOR,TBR,3,12,/boxes/TBA/TBA201510040.shtml,2015-10-04,2015-10-04-TBR123


In [12]:
# Persist the scraped schedule frame.
games_meta.to_pickle('../Base DFs/Games_Meta.pkl')

In [13]:
# Diagnostic outer merge: keeps rows from both sides even when the key has no partner.
X=pd.merge(games_meta,odds_data,on='KEY',how='outer')
# The triple-quoted string below is disabled diagnostic code for listing unmatched rows.
# As a bare string expression it has no effect when the cell runs.
'''
errors=pd.concat([X[X.date_x.isna()],X[X.date_y.isna()]],axis=0)
errors['date']=errors.apply(lambda r:(r.date_x,r.date_y)[r.date_x is pd.NaT],axis=1)

errors['home']=errors.apply(lambda r:(r.Home_Team_x,r.Home_Team_y)[r.date_x is pd.NaT],axis=1)
errors=errors.sort_values(['date','home'])
errors=errors.reset_index()
errors
'''
# Display only the rows present on both sides (result is shown, not stored).
X.dropna(subset=['Away_Team_x','Away_Team_y'])

Unnamed: 0,Away_Team_x,Home_Team_x,Away_Score_x,Home_Score_x,link,date_x,KEY,CloseOU,Close_Odds,year,month,date_y,Home_Pitcher,Away_Pitcher,Home_Team_y,Away_Team_y,Home_Score_y,Away_Score_y,Total_Runs,OVER
0,SEA,OAK,9,7,/boxes/OAK/OAK201903200.shtml,2019-03-20,2019-03-20-OAK79,8.5,-120.0,2019.0,3.0,2019-03-20,MFIERS,MGONZALES,OAK,SEA,7.0,9.0,16.0,True
1,SEA,OAK,5,4,/boxes/OAK/OAK201903210.shtml,2019-03-21,2019-03-21-OAK45,9.0,-130.0,2019.0,3.0,2019-03-21,MESTRADA,YKIKUCHI,OAK,SEA,4.0,5.0,9.0,False
2,PIT,CIN,3,5,/boxes/CIN/CIN201903280.shtml,2019-03-28,2019-03-28-CIN53,8.5,-105.0,2019.0,3.0,2019-03-28,LCASTILLO,JTAILLON,CIN,PIT,5.0,3.0,8.0,False
3,CHW,KCR,3,5,/boxes/KCA/KCA201903280.shtml,2019-03-28,2019-03-28-KCR53,8.5,-105.0,2019.0,3.0,2019-03-28,BKELLER,CRODON,KCR,CHW,5.0,3.0,8.0,False
4,ARI,LAD,5,12,/boxes/LAN/LAN201903280.shtml,2019-03-28,2019-03-28-LAD125,7.0,-120.0,2019.0,3.0,2019-03-28,HRYU,ZGREINKE,LAD,ARI,12.0,5.0,17.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12142,CIN,PIT,0,4,/boxes/PIT/PIT201510040.shtml,2015-10-04,2015-10-04-PIT40,7.5,-105.0,2015.0,10.0,2015-10-04,JHAPP,JSMITH,PIT,CIN,4.0,0.0,4.0,False
12143,OAK,SEA,2,3,/boxes/SEA/SEA201510040.shtml,2015-10-04,2015-10-04-SEA32,7.5,-115.0,2015.0,10.0,2015-10-04,VNUNO,CBASSITT,SEA,OAK,3.0,2.0,5.0,False
12144,COL,SFG,7,3,/boxes/SFN/SFN201510040.shtml,2015-10-04,2015-10-04-SFG37,7.5,-115.0,2015.0,10.0,2015-10-04,MCAIN,CBERGMAN,SFG,COL,3.0,7.0,10.0,True
12145,TOR,TBR,3,12,/boxes/TBA/TBA201510040.shtml,2015-10-04,2015-10-04-TBR123,8.5,-120.0,2015.0,10.0,2015-10-04,MMOORE,MBUEHRLE,TBR,TOR,12.0,3.0,15.0,True


In [14]:
# Keep only games present in both sources, prefer the schedule-side team/score/date
# columns, and settle the numeric dtypes in one pass.
duplicated_from_odds=['Away_Score_y','Home_Score_y','Away_Team_y','Home_Team_y','Total_Runs','date_y']
schedule_names={'Away_Score_x':'Away_Score','Home_Score_x':'Home_Score','Away_Team_x':'Away_Team','Home_Team_x':'Home_Team','date_x':'date'}
game_data=(
    pd.merge(games_meta,odds_data,on='KEY',how='inner')
    .drop(duplicated_from_odds,axis=1)
    .rename(schedule_names,axis=1)
    .astype({'Away_Score':'int64','Home_Score':'int64','Close_Odds':'int64','month':'int64','year':'int64'})
)
game_data['CloseOU']=game_data['CloseOU'].astype(float)
# Recompute the total from the schedule-side scores.
game_data['Total_Runs']=game_data.Away_Score+game_data.Home_Score
game_data

Unnamed: 0,Away_Team,Home_Team,Away_Score,Home_Score,link,date,KEY,CloseOU,Close_Odds,year,month,Home_Pitcher,Away_Pitcher,OVER,Total_Runs
0,SEA,OAK,9,7,/boxes/OAK/OAK201903200.shtml,2019-03-20,2019-03-20-OAK79,8.5,-120,2019,3,MFIERS,MGONZALES,True,16
1,SEA,OAK,5,4,/boxes/OAK/OAK201903210.shtml,2019-03-21,2019-03-21-OAK45,9.0,-130,2019,3,MESTRADA,YKIKUCHI,False,9
2,PIT,CIN,3,5,/boxes/CIN/CIN201903280.shtml,2019-03-28,2019-03-28-CIN53,8.5,-105,2019,3,LCASTILLO,JTAILLON,False,8
3,CHW,KCR,3,5,/boxes/KCA/KCA201903280.shtml,2019-03-28,2019-03-28-KCR53,8.5,-105,2019,3,BKELLER,CRODON,False,8
4,ARI,LAD,5,12,/boxes/LAN/LAN201903280.shtml,2019-03-28,2019-03-28-LAD125,7.0,-120,2019,3,HRYU,ZGREINKE,True,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12111,CIN,PIT,0,4,/boxes/PIT/PIT201510040.shtml,2015-10-04,2015-10-04-PIT40,7.5,-105,2015,10,JHAPP,JSMITH,False,4
12112,OAK,SEA,2,3,/boxes/SEA/SEA201510040.shtml,2015-10-04,2015-10-04-SEA32,7.5,-115,2015,10,VNUNO,CBASSITT,False,5
12113,COL,SFG,7,3,/boxes/SFN/SFN201510040.shtml,2015-10-04,2015-10-04-SFG37,7.5,-115,2015,10,MCAIN,CBERGMAN,True,10
12114,TOR,TBR,3,12,/boxes/TBA/TBA201510040.shtml,2015-10-04,2015-10-04-TBR123,8.5,-120,2015,10,MMOORE,MBUEHRLE,True,15


In [15]:
# URL template for a team's season schedule page ({t} = team code, {y} = season).
base='https://www.baseball-reference.com/teams/{t}/{y}-schedule-scores.shtml'
# `data-stat` names of the table cells to collect; the first one holds the box-score link.
# NOTE: this rebinds the name `stats`, which the import cell also uses for scipy.stats.
stats=['boxscore','day_or_night','attendance','cli']
# Collected cell values keyed by "<team>_<season>_<game number>".
game_stats={}

In [16]:
# For every team and season, read the schedule table and record day/night, attendance,
# cLI and the box-score link for each game row.
# NOTE(review): this issues one request per team-season with no delay — confirm the site
# tolerates that request rate.
for yr in YRs:
    for team in TEAMS.keys():
        link=base.format_map({'t':team,'y':yr})
        r=requests.get(link)
        # Fail here on an HTTP error instead of later with a confusing "NoneType" error.
        r.raise_for_status()
        soup=bs(r.content)
        table=soup.find('table',attrs={'id':'team_schedule'}).tbody
        games=table.find_all('tr')
        i=1
        for g in games:
            # Key each cell by its own data-stat attribute so values cannot be mislabelled
            # if the column order of the page ever differs from the order in `stats`.
            cells={td['data-stat']:td for td in g.find_all('td',attrs={'data-stat':stats})}
            if not cells:
                # Rows without any of the requested cells are skipped.
                continue
            d={name:cells[name].text for name in stats[1:] if name in cells}
            d['link']=cells[stats[0]].a['href']
            game_stats[team+'_'+yr+'_'+str(i)]=d
            i+=1

In [17]:
# One row per team-game, indexed by the "<team>_<season>_<game number>" key.
game_logs=pd.DataFrame.from_dict(game_stats,orient='index')
# The numeric casts below are disabled, so attendance and cli stay as scraped text.
#game_logs.attendance=game_logs.attendance.astype(int)
#game_logs.cli=game_logs.cli.astype(float)
game_logs.to_pickle('../Base DFs/Games_Info_I.pkl')
game_logs

Unnamed: 0,day_or_night,attendance,cli,link
ARI_2019_1,D,53086,1.08,/boxes/LAN/LAN201903280.shtml
ARI_2019_2,N,42266,.99,/boxes/LAN/LAN201903290.shtml
ARI_2019_3,N,50626,1.06,/boxes/LAN/LAN201903300.shtml
ARI_2019_4,D,43815,1.04,/boxes/LAN/LAN201903310.shtml
ARI_2019_5,N,18683,.98,/boxes/SDN/SDN201904010.shtml
...,...,...,...,...
WSN_2015_158,N,13860,.00,/boxes/ATL/ATL201509300.shtml
WSN_2015_159,N,37790,.00,/boxes/ATL/ATL201510010.shtml
WSN_2015_160,D,39465,.00,/boxes/NYN/NYN201510031.shtml
WSN_2015_161,N,41480,.00,/boxes/NYN/NYN201510032.shtml


In [18]:
# The same box-score link can appear under more than one team key, so keep one log row
# per link before attaching it to the game table.
unique_logs=game_logs.drop_duplicates(['link'])
game_data=game_data.merge(unique_logs,left_on=['link'],right_on=['link'])
game_data

Unnamed: 0,Away_Team,Home_Team,Away_Score,Home_Score,link,date,KEY,CloseOU,Close_Odds,year,month,Home_Pitcher,Away_Pitcher,OVER,Total_Runs,day_or_night,attendance,cli
0,SEA,OAK,9,7,/boxes/OAK/OAK201903200.shtml,2019-03-20,2019-03-20-OAK79,8.5,-120,2019,3,MFIERS,MGONZALES,True,16,N,45787,1.10
1,SEA,OAK,5,4,/boxes/OAK/OAK201903210.shtml,2019-03-21,2019-03-21-OAK45,9.0,-130,2019,3,MESTRADA,YKIKUCHI,False,9,N,46451,1.01
2,PIT,CIN,3,5,/boxes/CIN/CIN201903280.shtml,2019-03-28,2019-03-28-CIN53,8.5,-105,2019,3,LCASTILLO,JTAILLON,False,8,D,44049,1.06
3,CHW,KCR,3,5,/boxes/KCA/KCA201903280.shtml,2019-03-28,2019-03-28-KCR53,8.5,-105,2019,3,BKELLER,CRODON,False,8,N,31675,1.08
4,ARI,LAD,5,12,/boxes/LAN/LAN201903280.shtml,2019-03-28,2019-03-28-LAD125,7.0,-120,2019,3,HRYU,ZGREINKE,True,17,D,53086,1.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12111,CIN,PIT,0,4,/boxes/PIT/PIT201510040.shtml,2015-10-04,2015-10-04-PIT40,7.5,-105,2015,10,JHAPP,JSMITH,False,4,D,35362,.00
12112,OAK,SEA,2,3,/boxes/SEA/SEA201510040.shtml,2015-10-04,2015-10-04-SEA32,7.5,-115,2015,10,VNUNO,CBASSITT,False,5,D,22402,.00
12113,COL,SFG,7,3,/boxes/SFN/SFN201510040.shtml,2015-10-04,2015-10-04-SFG37,7.5,-115,2015,10,MCAIN,CBERGMAN,True,10,D,41399,.00
12114,TOR,TBR,3,12,/boxes/TBA/TBA201510040.shtml,2015-10-04,2015-10-04-TBR123,8.5,-120,2015,10,MMOORE,MBUEHRLE,True,15,D,15815,.00


In [19]:
# Visit every box-score page and collect the first pitcher listed for each side
# (name, player link) plus the free-text "Other Info" section.
base='https://www.baseball-reference.com'
more_data={}
# Box-score links that could not be scraped, mapped to the exception raised.
failed_games={}

for game in tqdm(game_data.link):
    try:
        D={}
        link=base+game
        r=requests.get(link)
        r.raise_for_status()
        soup=bs(r.content)
        X=soup.find('span',attrs={'data-label':'Pitching Lines and Info'}).parent.parent
        # The tables sit inside an HTML comment, so parse the first comment's text again.
        lineups=bs(X.find_all(text=lambda text:isinstance(text,Comment))[0]).find_all('table')
        # First table = away side, second table = home side; take the first body row of each.
        pitcher=lineups[0].tbody.tr.th.a
        D['Away_Pitch']=(pitcher.text,pitcher['href'])
        pitcher=lineups[1].tbody.tr.th.a
        D['Home_Pitch']=(pitcher.text,pitcher['href'])
        X=soup.find('span',attrs={'data-label':'Other Info'}).parent.parent
        other=bs(X.find_all(text=lambda text:isinstance(text,Comment))[0])
        D['other']=other.text.strip()
        more_data[game]=D
    except Exception as exc:
        # Best effort: keep going, but record why the page failed.  Catching Exception
        # rather than using a bare except keeps Ctrl-C / kernel interrupt working
        # during this long loop.
        failed_games[game]=exc
        print(game,repr(exc))

100%|██████████| 12116/12116 [43:12<00:00,  4.67it/s] 


In [20]:
# One row per box-score link, with the pitcher tuples and the "other" text as columns.
game_detail=pd.DataFrame.from_dict(more_data,orient='index')
game_detail

Unnamed: 0,Away_Pitch,Home_Pitch,other
/boxes/OAK/OAK201903200.shtml,"(Marco Gonzales, /players/g/gonzama02.shtml)","(Mike Fiers, /players/f/fiersmi01.shtml)","Umpires: HP - Jeff Nelson, 1B - Tripp Gibson, ..."
/boxes/OAK/OAK201903210.shtml,"(Yusei Kikuchi, /players/k/kikucyu01.shtml)","(Marco Estrada, /players/e/estrama01.shtml)","Umpires: HP - Bill Welke, 1B - Lance Barksdale..."
/boxes/CIN/CIN201903280.shtml,"(Jameson Taillon, /players/t/taillja01.shtml)","(Luis Castillo, /players/c/castilu02.shtml)","Umpires: HP - Joe West, 1B - Eric Cooper, 2B -..."
/boxes/KCA/KCA201903280.shtml,"(Carlos Rodon, /players/r/rodonca01.shtml)","(Brad Keller, /players/k/kellebr01.shtml)","Umpires: HP - Jerry Meals, 1B - Ron Kulpa, 2B ..."
/boxes/LAN/LAN201903280.shtml,"(Zack Greinke, /players/g/greinza01.shtml)","(Hyun Jin Ryu, /players/r/ryuhy01.shtml)","Umpires: HP - Brian Gorman, 1B - Scott Barry, ..."
...,...,...,...
/boxes/PIT/PIT201510040.shtml,"(Josh A. Smith, /players/s/smithjo07.shtml)","(J.A. Happ, /players/h/happja01.shtml)","Umpires: HP - Tim Welke, 1B - Todd Tichenor, 2..."
/boxes/SEA/SEA201510040.shtml,"(Chris Bassitt, /players/b/bassich01.shtml)","(Vidal Nuno III, /players/n/nunovi01.shtml)","Umpires: HP - Mike Estabrook, 1B - Ed Hickox, ..."
/boxes/SFN/SFN201510040.shtml,"(Christian Bergman, /players/b/bergmch01.shtml)","(Matt Cain, /players/c/cainma01.shtml)","Umpires: HP - Carlos Torres, 1B - Andy Fletche..."
/boxes/TBA/TBA201510040.shtml,"(Mark Buehrle, /players/b/buehrma01.shtml)","(Matt Moore, /players/m/moorema02.shtml)","Umpires: HP - Alfonso Marquez, 1B - Tom Hallio..."


In [21]:
# Persist the scraped box-score details.
game_detail.to_pickle('../Base DFs/Games_Info_II.pkl')

In [22]:
# Attach the box-score details by matching each game's link to the detail frame's index.
# A left join keeps games whose box score could not be scraped.
game_data=game_data.merge(game_detail,how='left',left_on=['link'],right_index=True)
game_data

Unnamed: 0,Away_Team,Home_Team,Away_Score,Home_Score,link,date,KEY,CloseOU,Close_Odds,year,...,Home_Pitcher,Away_Pitcher,OVER,Total_Runs,day_or_night,attendance,cli,Away_Pitch,Home_Pitch,other
0,SEA,OAK,9,7,/boxes/OAK/OAK201903200.shtml,2019-03-20,2019-03-20-OAK79,8.5,-120,2019,...,MFIERS,MGONZALES,True,16,N,45787,1.10,"(Marco Gonzales, /players/g/gonzama02.shtml)","(Mike Fiers, /players/f/fiersmi01.shtml)","Umpires: HP - Jeff Nelson, 1B - Tripp Gibson, ..."
1,SEA,OAK,5,4,/boxes/OAK/OAK201903210.shtml,2019-03-21,2019-03-21-OAK45,9.0,-130,2019,...,MESTRADA,YKIKUCHI,False,9,N,46451,1.01,"(Yusei Kikuchi, /players/k/kikucyu01.shtml)","(Marco Estrada, /players/e/estrama01.shtml)","Umpires: HP - Bill Welke, 1B - Lance Barksdale..."
2,PIT,CIN,3,5,/boxes/CIN/CIN201903280.shtml,2019-03-28,2019-03-28-CIN53,8.5,-105,2019,...,LCASTILLO,JTAILLON,False,8,D,44049,1.06,"(Jameson Taillon, /players/t/taillja01.shtml)","(Luis Castillo, /players/c/castilu02.shtml)","Umpires: HP - Joe West, 1B - Eric Cooper, 2B -..."
3,CHW,KCR,3,5,/boxes/KCA/KCA201903280.shtml,2019-03-28,2019-03-28-KCR53,8.5,-105,2019,...,BKELLER,CRODON,False,8,N,31675,1.08,"(Carlos Rodon, /players/r/rodonca01.shtml)","(Brad Keller, /players/k/kellebr01.shtml)","Umpires: HP - Jerry Meals, 1B - Ron Kulpa, 2B ..."
4,ARI,LAD,5,12,/boxes/LAN/LAN201903280.shtml,2019-03-28,2019-03-28-LAD125,7.0,-120,2019,...,HRYU,ZGREINKE,True,17,D,53086,1.08,"(Zack Greinke, /players/g/greinza01.shtml)","(Hyun Jin Ryu, /players/r/ryuhy01.shtml)","Umpires: HP - Brian Gorman, 1B - Scott Barry, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12111,CIN,PIT,0,4,/boxes/PIT/PIT201510040.shtml,2015-10-04,2015-10-04-PIT40,7.5,-105,2015,...,JHAPP,JSMITH,False,4,D,35362,.00,"(Josh A. Smith, /players/s/smithjo07.shtml)","(J.A. Happ, /players/h/happja01.shtml)","Umpires: HP - Tim Welke, 1B - Todd Tichenor, 2..."
12112,OAK,SEA,2,3,/boxes/SEA/SEA201510040.shtml,2015-10-04,2015-10-04-SEA32,7.5,-115,2015,...,VNUNO,CBASSITT,False,5,D,22402,.00,"(Chris Bassitt, /players/b/bassich01.shtml)","(Vidal Nuno III, /players/n/nunovi01.shtml)","Umpires: HP - Mike Estabrook, 1B - Ed Hickox, ..."
12113,COL,SFG,7,3,/boxes/SFN/SFN201510040.shtml,2015-10-04,2015-10-04-SFG37,7.5,-115,2015,...,MCAIN,CBERGMAN,True,10,D,41399,.00,"(Christian Bergman, /players/b/bergmch01.shtml)","(Matt Cain, /players/c/cainma01.shtml)","Umpires: HP - Carlos Torres, 1B - Andy Fletche..."
12114,TOR,TBR,3,12,/boxes/TBA/TBA201510040.shtml,2015-10-04,2015-10-04-TBR123,8.5,-120,2015,...,MMOORE,MBUEHRLE,True,15,D,15815,.00,"(Mark Buehrle, /players/b/buehrma01.shtml)","(Matt Moore, /players/m/moorema02.shtml)","Umpires: HP - Alfonso Marquez, 1B - Tom Hallio..."


In [23]:
# List the columns available after the merges.
game_data.columns

Index(['Away_Team', 'Home_Team', 'Away_Score', 'Home_Score', 'link', 'date',
       'KEY', 'CloseOU', 'Close_Odds', 'year', 'month', 'Home_Pitcher',
       'Away_Pitcher', 'OVER', 'Total_Runs', 'day_or_night', 'attendance',
       'cli', 'Away_Pitch', 'Home_Pitch', 'other'],
      dtype='object')

In [30]:
# Split each (name, player link) tuple into separate name and ID columns.
for side in ('Away','Home'):
    pitcher_pair=game_data[side+'_Pitch']
    game_data[side+'_Pitcher_Name']=pitcher_pair.apply(lambda t:t[0])
    game_data[side+'_Pitcher_ID']=pitcher_pair.apply(lambda t:t[1])

# NOTE(review): this replaces the odds-file pitcher codes with the full (name, link) tuples,
# duplicating the new *_Name / *_ID columns — confirm this is intended.
game_data.Home_Pitcher=game_data.Home_Pitch
game_data.Away_Pitcher=game_data.Away_Pitch
game_data.drop(['Away_Pitch','Home_Pitch'],inplace=True,axis=1,errors='ignore')

In [31]:
# Print the final column list, then display the frame.
print(game_data.columns)
game_data

Index(['Away_Team', 'Home_Team', 'Away_Score', 'Home_Score', 'link', 'date',
       'KEY', 'CloseOU', 'Close_Odds', 'year', 'month', 'Home_Pitcher',
       'Away_Pitcher', 'OVER', 'Total_Runs', 'day_or_night', 'attendance',
       'cli', 'other', 'Away_Pitcher_Name', 'Away_Pitcher_ID',
       'Home_Pitcher_Name', 'Home_Pitcher_ID'],
      dtype='object')


Unnamed: 0,Away_Team,Home_Team,Away_Score,Home_Score,link,date,KEY,CloseOU,Close_Odds,year,...,OVER,Total_Runs,day_or_night,attendance,cli,other,Away_Pitcher_Name,Away_Pitcher_ID,Home_Pitcher_Name,Home_Pitcher_ID
0,SEA,OAK,9,7,/boxes/OAK/OAK201903200.shtml,2019-03-20,2019-03-20-OAK79,8.5,-120,2019,...,True,16,N,45787,1.10,"Umpires: HP - Jeff Nelson, 1B - Tripp Gibson, ...",Marco Gonzales,/players/g/gonzama02.shtml,Mike Fiers,/players/f/fiersmi01.shtml
1,SEA,OAK,5,4,/boxes/OAK/OAK201903210.shtml,2019-03-21,2019-03-21-OAK45,9.0,-130,2019,...,False,9,N,46451,1.01,"Umpires: HP - Bill Welke, 1B - Lance Barksdale...",Yusei Kikuchi,/players/k/kikucyu01.shtml,Marco Estrada,/players/e/estrama01.shtml
2,PIT,CIN,3,5,/boxes/CIN/CIN201903280.shtml,2019-03-28,2019-03-28-CIN53,8.5,-105,2019,...,False,8,D,44049,1.06,"Umpires: HP - Joe West, 1B - Eric Cooper, 2B -...",Jameson Taillon,/players/t/taillja01.shtml,Luis Castillo,/players/c/castilu02.shtml
3,CHW,KCR,3,5,/boxes/KCA/KCA201903280.shtml,2019-03-28,2019-03-28-KCR53,8.5,-105,2019,...,False,8,N,31675,1.08,"Umpires: HP - Jerry Meals, 1B - Ron Kulpa, 2B ...",Carlos Rodon,/players/r/rodonca01.shtml,Brad Keller,/players/k/kellebr01.shtml
4,ARI,LAD,5,12,/boxes/LAN/LAN201903280.shtml,2019-03-28,2019-03-28-LAD125,7.0,-120,2019,...,True,17,D,53086,1.08,"Umpires: HP - Brian Gorman, 1B - Scott Barry, ...",Zack Greinke,/players/g/greinza01.shtml,Hyun Jin Ryu,/players/r/ryuhy01.shtml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12111,CIN,PIT,0,4,/boxes/PIT/PIT201510040.shtml,2015-10-04,2015-10-04-PIT40,7.5,-105,2015,...,False,4,D,35362,.00,"Umpires: HP - Tim Welke, 1B - Todd Tichenor, 2...",Josh A. Smith,/players/s/smithjo07.shtml,J.A. Happ,/players/h/happja01.shtml
12112,OAK,SEA,2,3,/boxes/SEA/SEA201510040.shtml,2015-10-04,2015-10-04-SEA32,7.5,-115,2015,...,False,5,D,22402,.00,"Umpires: HP - Mike Estabrook, 1B - Ed Hickox, ...",Chris Bassitt,/players/b/bassich01.shtml,Vidal Nuno III,/players/n/nunovi01.shtml
12113,COL,SFG,7,3,/boxes/SFN/SFN201510040.shtml,2015-10-04,2015-10-04-SFG37,7.5,-115,2015,...,True,10,D,41399,.00,"Umpires: HP - Carlos Torres, 1B - Andy Fletche...",Christian Bergman,/players/b/bergmch01.shtml,Matt Cain,/players/c/cainma01.shtml
12114,TOR,TBR,3,12,/boxes/TBA/TBA201510040.shtml,2015-10-04,2015-10-04-TBR123,8.5,-120,2015,...,True,15,D,15815,.00,"Umpires: HP - Alfonso Marquez, 1B - Tom Hallio...",Mark Buehrle,/players/b/buehrma01.shtml,Matt Moore,/players/m/moorema02.shtml


In [32]:
# Persist the fully merged game table.
game_data.to_pickle('../Merged DFs/Games_AllData.pkl')