In [2]:
from bs4 import BeautifulSoup
from datetime import datetime
import urllib2
import pandas as pd
from settings import *
#from get_pts import *
from time import sleep
#from get_schedule_results import append_opposing_pitcher
#from get_opposing_pitcher_info import append_pitcher_data
#from feature_engineering import *
from os import listdir
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [3]:
# Scrape a player's career batting totals from a Baseball-Reference page.
def scrape_career_data(url):
    """Return the career-total rows of the page's standard batting table.

    Downloads `url`, locates the <tfoot> of the table whose id is
    "batting_standard", and reads the first two rows carrying the
    'stat_total' class. Only the direct <td> children of each row are kept.

    Returns a DataFrame with the fixed column list below (Year ... Awards).
    """
    column_names = ['Year','G','PA','AB','R','H','2B','3B','HR','RBI','SB','CS','BB','SO','BA','OBP','SLG','OPS','OPS+','TB','GDP','HBP','SH','SF','IBB','Pos','Awards']

    page_html = urllib2.urlopen(url).read()
    parsed = BeautifulSoup(page_html,"html.parser")
    footer = parsed.find("table",{"id":"batting_standard"}).find('tfoot')
    total_rows = footer.findAll('tr',{'class':'stat_total'})

    # One list of cell strings per total row; non-<td> children are skipped.
    records = [
        [cell.string for cell in total_row if cell.name == 'td']
        for total_row in total_rows[:2]
    ]
    return pd.DataFrame(records,columns=column_names)

In [4]:
def get_hitter_data(url):
    """Scrape one hitter's game-by-game table into a DataFrame.

    Downloads `url`, reads the first table with class "dbd", and builds one
    frame row per table row (the last table row is skipped). Column names
    come from the first table row. A constant 'bats' column is appended,
    taken from the first character of the second whitespace-separated token
    of the page's first <h5> element. Columns whose values all convert to
    int are converted; the rest are left as strings.

    Raises whatever urllib2 raises on a failed download; an AttributeError
    is expected if the table or the <h5> element is missing, because the
    lookups are not guarded.
    """
    # Close the HTTP response explicitly instead of leaving it to the
    # garbage collector.
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(html,"html.parser")
    tableStats = soup.find("table",{"class":"dbd"})
    rows = tableStats.findAll('tr')

    #column names are in first row; they have extra space fields so remove those
    col_names = [x.string for x in rows[0] if x.string != ' ']

    #iterate thru each row getting the data from each td tag
    #then append each row to a list containing all data rows
    data_rows = []
    for row in rows[:-1]:
        current = []
        for col in row:
            if col.name == 'td':
                current.append(str(col.string))
        data_rows.append(current)

    #create dataframe and drop empty rows
    df = pd.DataFrame(data_rows,columns=col_names)
    df1 = df.dropna().reset_index(drop=True)

    #Append batting style
    bats = str(soup.find('h5').text).split()[1][0]
    df1['bats'] = bats

    #change dtypes to ints where possible
    for column in df1:
        try:
            df1[column] = df1[column].astype(int)
        except (ValueError, TypeError, OverflowError):
            # Not an integer column (e.g. 'Game Date', 'Team', 'bats'):
            # keep it as text. Only conversion errors are ignored, so a
            # kernel interrupt or an unrelated bug still surfaces.
            pass

    return df1

In [None]:
def scrape_game_log(url, year=2016):
    """Scrape the "batting_gamelogs" table at `url` into a DataFrame.

    Column names come from the <th> cells of the first header row; each
    body row contributes the strings of its direct <td> children. The
    'Date' column is rewritten as 'YYYY-MM-DD' text using `year`; values
    that do not parse as '<Mon> <day>' become None.

    Parameters
    ----------
    url : str
        Page to download.
    year : int, optional
        Season the log belongs to. Defaults to 2016, the value that was
        previously hard-coded.
    """
    soup = BeautifulSoup(urllib2.urlopen(url).read(),"html.parser")
    table = soup.find("table",{"id":"batting_gamelogs"})
    rows = table.find('tbody').findAll('tr')
    tableHead = table.find('thead').find('tr')
    col_names = [x.string for x in tableHead if x.name == 'th']

    # NOTE(review): a commented-out scratch cell later in this notebook
    # collects both 'td' and 'th' cells per row; confirm which matches the
    # page layout so the row width agrees with col_names.
    data = [[col.string for col in row if col.name == 'td'] for row in rows]
    df = pd.DataFrame(data,columns=col_names)

    def convert_date(d):
        # Parse together with the year: a bare '%b %d' is interpreted
        # against year 1900, which rejects 'Feb 29' even in a leap season.
        try:
            parsed = datetime.strptime('{} {}'.format(d, year), '%b %d %Y')
            return parsed.strftime('%Y-%m-%d')
        except (TypeError, ValueError):
            # Missing or non-date cell.
            return None

    df['Date'] = df['Date'].apply(convert_date)

    return df

### 1. Get Hitter Data

#### Right now we only have 279 unique players; we need more

In [None]:
# Scrape the page of every player listed in players_urls0, one frame each.
all_hitter_data = []
for player, url in tqdm(players_urls0.iteritems()):
    hitter_data = get_hitter_data(url)
    # Tag the rows so they stay identifiable after concatenation.
    hitter_data['hitter_name'] = player
    all_hitter_data.append(hitter_data)
    # Pause for a random fraction of a second between requests.
    sleep(np.random.rand())

# Stack the per-player frames into a single DataFrame.
all_hitter_data1 = pd.concat(all_hitter_data)

In [None]:
#hitter_data = get_hitter_data(url)

In [None]:
def append_hitter_score_to_df(df):
    """Add a 'FANTASY_PTS' column to `df` in place and return `df`.

    Points per event: single 3, double 5, triple 8, home run 10, RBI 2,
    run 2, walk 2, hit-by-pitch 2, stolen base 5. Singles are derived as
    Hits minus doubles, triples and home runs.

    Requires the columns Hits, 2B, 3B, HR, RBI, Runs, BB, HBP and SB.
    """
    #print 'Calculating draftkings points...'
    points = (df['Hits'] - (df['2B'] + df['3B'] + df['HR'])) * 3

    # (column, points per event), accumulated in a fixed order.
    weights = (
        ('2B', 5),
        ('3B', 8),
        ('HR', 10),
        ('RBI', 2),
        ('Runs', 2),
        ('BB', 2),
        ('HBP', 2),
        ('SB', 5),
    )
    for stat, value in weights:
        points = points + df[stat] * value

    df['FANTASY_PTS'] = points
    return df

# Score the scraped frame. The function adds FANTASY_PTS to its argument and
# returns that same object, so all_hitter_data1 and all_hitter_data2 refer to
# one DataFrame.
all_hitter_data2 = append_hitter_score_to_df(all_hitter_data1)

In [None]:
# (rows, columns) of the scored hitter frame.
all_hitter_data2.shape

In [None]:
# Write the scored hitter rows to CSV, leaving out the DataFrame index.
all_hitter_data2.to_csv('_data/hitter_data_2016/hitter_data_2016.csv',index=False)

### Get Game Logs

In [None]:
#url = players_game_logs0['Mookie_Betts']

In [None]:
#soup = BeautifulSoup(urllib2.urlopen(url).read(),"html.parser") 
#tableStats = soup.find("table",{"id":"batting_gamelogs"}).find('tbody')
#rows = tableStats.findAll('tr')
#tableHead = soup.find("table",{"id":"batting_gamelogs"}).find('thead').find('tr')
#col_names = [x.string for x in tableHead if x.name == 'th']

In [None]:
# data = []
# for row in rows:
#     current = [col.string for col in row if col.name in ('td','th')]
#     data.append(current)
# df = pd.DataFrame(data,columns=col_names)
# def convert_date(d):
#     try:
#         return datetime.strptime(d,'%b %d').strftime('2016-%m-%d')
#     except:
#         return None
# df['Date'] = df['Date'].apply(lambda x: convert_date(x))

In [None]:
# df.to_csv('_data/game_logs/game_logs.csv',encoding='utf-8',index=False)

### 2. Append Opposing Pitcher

In [None]:
# (rows, columns) of the scored hitter frame before the schedule merge.
all_hitter_data2.shape

In [None]:
# Load the combined 2016 schedule file; the merge below reads its
# 'Date', 'Opp', 'opp_pitcher', 'Gm#' and 'Team' columns.
schedules = pd.read_csv('_data/schedules_2016/_ALL_SCHEDULES_2016.csv')

In [None]:
def append_opposing_pitcher(hitter_df,schedule_df):
    """Merge schedule details (Date, Opp, opp_pitcher, Gm#) onto hitter rows.

    Both arguments are modified in place: hitter_df['Game Date'] and
    schedule_df['Date'] are converted to datetimes using the %m/%d/%Y
    format, with unparseable values becoming NaT.

    Rows are matched on Team plus date with a left merge; afterwards every
    row containing any missing value is dropped, which removes hitter rows
    that found no schedule entry. Returns the merged frame.
    """
    date_format = "%m/%d/%Y"
    schedule_fields = ['Date','Opp','opp_pitcher','Gm#','Team']

    # Remove literal backslash-r-backslash-n text and surrounding
    # whitespace before parsing the hitter dates.
    tidy_dates = hitter_df['Game Date'].apply(lambda raw: raw.replace('\\r\\n','').strip())
    hitter_df['Game Date'] = pd.to_datetime(tidy_dates,format=date_format,errors='coerce')
    schedule_df['Date'] = pd.to_datetime(schedule_df['Date'],format=date_format,errors='coerce')

    merged = hitter_df.merge(
        schedule_df[schedule_fields],
        how='left',
        left_on=['Team','Game Date'],
        right_on=['Team','Date'],
    )
    return merged.dropna()

In [None]:
# A copy of the hitter frame is passed because append_opposing_pitcher
# rewrites 'Game Date' on the frame it receives. `schedules` is passed
# directly, so its 'Date' column is converted to datetimes in place.
all_hitter_data3 = append_opposing_pitcher(all_hitter_data2.copy(),schedules)

In [None]:
# Preview the first rows of the merged hitter/schedule frame.
all_hitter_data3.head()

In [None]:
# Write the merged hitter/opposing-pitcher rows to CSV, leaving out the index.
all_hitter_data3.to_csv('_data/hitter_data_2016/hitter_data_2016_w_pitcher.csv',index=False)

### Repeat steps 1 and 2 for 2017

In [12]:
# Derive each player's 2017 URL by swapping 'F2016' for 'F2017' in the 2016 URL.
players_urls_2017 = {
    player: url.replace('F2016','F2017')
    for player, url in players_urls0.iteritems()
}

In [14]:
# Scrape every 2017 hitter page; a player whose page fails to scrape is
# reported and skipped.
all_hitter_data_2017 = []
for player,url in tqdm(players_urls_2017.iteritems()):
    try:
        hitter_data = get_hitter_data(url)
    except Exception:
        # `except Exception` rather than a bare `except`: interrupting the
        # kernel (KeyboardInterrupt) now stops the loop instead of being
        # logged as a problem with the current player.
        print('Problem with {} for 2017'.format(player))
        continue

    hitter_data['hitter_name'] = player
    all_hitter_data_2017.append(hitter_data)
    # Pause for a random fraction of a second between requests.
    sleep(np.random.rand())

# NOTE: this rebinds all_hitter_data1, the name the 2016 section used for its
# own concatenated frame; from here on it holds the 2017 rows.
all_hitter_data1 = pd.concat(all_hitter_data_2017)

27it [00:30,  1.04it/s]

Problem with Dae-Ho_Lee for 2017


29it [00:32,  1.10it/s]

Problem with Coco_Crisp for 2017


43it [00:47,  1.36it/s]

Problem with Mark_Teixeira for 2017


66it [01:10,  1.02it/s]

Problem with Jimmy_Rollins for 2017


69it [01:12,  1.40it/s]

Problem with Byung-Ho_Park for 2017


99it [01:43,  1.24it/s]

Problem with Ryan_Howard for 2017


105it [01:48,  1.40it/s]

Problem with Alexei_Ramirez for 2017


134it [02:23,  1.05it/s]

Problem with James_Loney for 2017


150it [02:39,  1.42it/s]

Problem with Angel_Pagan for 2017


152it [02:40,  1.63it/s]

Problem with Michael_Bourn for 2017


153it [02:41,  1.93it/s]

Problem with Juan_Uribe for 2017


199it [03:31,  1.21it/s]

Problem with Ortiz for 2017


206it [03:38,  1.19it/s]

Problem with Brett_Lawrie for 2017


208it [03:40,  1.43it/s]

Problem with Alex_Rodriguez for 2017


236it [04:13,  1.03it/s]

Problem with Dioner_Navarro for 2017


242it [04:19,  1.14it/s]

Problem with Prince_Fielder for 2017


252it [04:30,  1.08it/s]

Problem with David_Ross for 2017


260it [04:37,  1.18it/s]

Problem with Gregorio_Petit for 2017


268it [04:45,  1.23it/s]

Problem with Jung-ho Kang for 2017


279it [04:59,  1.29s/it]


In [15]:
def append_hitter_score_to_df(df):
    """Add a 'FANTASY_PTS' column to `df` in place and return `df`.

    NOTE: this repeats the scoring function defined earlier in the notebook.

    Points per event: single 3, double 5, triple 8, home run 10, RBI 2,
    run 2, walk 2, hit-by-pitch 2, stolen base 5. Singles are derived as
    Hits minus doubles, triples and home runs.

    Requires the columns Hits, 2B, 3B, HR, RBI, Runs, BB, HBP and SB.
    """
    #print 'Calculating draftkings points...'
    points = (df['Hits'] - (df['2B'] + df['3B'] + df['HR'])) * 3

    # (column, points per event), accumulated in a fixed order.
    weights = (
        ('2B', 5),
        ('3B', 8),
        ('HR', 10),
        ('RBI', 2),
        ('Runs', 2),
        ('BB', 2),
        ('HBP', 2),
        ('SB', 5),
    )
    for stat, value in weights:
        points = points + df[stat] * value

    df['FANTASY_PTS'] = points
    return df

# all_hitter_data1 was rebound to the 2017 rows by the 2017 scraping cell, so
# this replaces the 2016 all_hitter_data2 with the scored 2017 frame. The
# function mutates and returns its argument, so both names refer to one object.
all_hitter_data2 = append_hitter_score_to_df(all_hitter_data1)

In [17]:
# Write the scored 2017 hitter rows to CSV, leaving out the DataFrame index.
all_hitter_data2.to_csv('_data/hitter_data/hitter_data_2017.csv',index=False)

In [36]:
# Load the combined 2017 schedule file; this rebinds `schedules`, which the
# 2016 section used for the 2016 file.
schedules = pd.read_csv('_data/schedules/_ALL_SCHEDULES_2017.csv')

In [37]:
# Preview the scored 2017 hitter rows; 'Game Date' still carries a trailing
# newline at this point (see the output below).
all_hitter_data2.head()

Unnamed: 0,Game Date,Team,Opponent,GS,AB,Runs,Hits,2B,3B,HR,...,K,SB,CS,SH,SF,GDP,CI,bats,hitter_name,FANTASY_PTS
0,04/02/2017\n,SLN,Vs. CHN,1,2,0,0,0,0,0,...,1,0,0,0,0,0,0,R,Jedd_Gyorko,0
1,04/06/2017\n,SLN,Vs. CHN,1,4,0,0,0,0,0,...,2,0,0,0,0,0,0,R,Jedd_Gyorko,0
2,04/08/2017\n,SLN,Vs. CIN,1,3,1,2,1,0,0,...,1,0,0,0,0,0,0,R,Jedd_Gyorko,18
3,04/09/2017\n,SLN,Vs. CIN,1,3,0,0,0,0,0,...,0,0,0,0,0,0,0,R,Jedd_Gyorko,0
4,04/10/2017\n,SLN,At WSH,0,2,1,1,0,0,1,...,0,0,0,0,0,0,0,R,Jedd_Gyorko,14


In [38]:
# Preview the 2017 schedule rows that will be merged onto the hitter data.
schedules.head()

Unnamed: 0.1,Unnamed: 0,Opp,opp_pitcher,Gm#,Team,Date
0,0,SFG,Melancon,1,ARI,04/02/2017
1,2,SFG,Moore,3,ARI,04/05/2017
2,3,SFG,Samardzija,4,ARI,04/06/2017
3,4,CLE,Tomlin,5,ARI,04/07/2017
4,5,CLE,Bauer,6,ARI,04/08/2017


In [40]:
#pd.to_datetime(schedules['Date'],format="%m/%d/%Y",errors='coerce')

In [41]:
def append_opposing_pitcher(hitter_df,schedule_df):
    """Merge schedule details (Date, Opp, opp_pitcher, Gm#) onto hitter rows.

    NOTE: this repeats the function defined earlier in the notebook.

    Both arguments are modified in place: hitter_df['Game Date'] and
    schedule_df['Date'] are converted to datetimes using the %m/%d/%Y
    format, with unparseable values becoming NaT.

    Rows are matched on Team plus date with a left merge; afterwards every
    row containing any missing value is dropped, which removes hitter rows
    that found no schedule entry. Returns the merged frame.
    """
    date_format = "%m/%d/%Y"
    schedule_fields = ['Date','Opp','opp_pitcher','Gm#','Team']

    # Remove literal backslash-r-backslash-n text and surrounding
    # whitespace before parsing the hitter dates.
    tidy_dates = hitter_df['Game Date'].apply(lambda raw: raw.replace('\\r\\n','').strip())
    hitter_df['Game Date'] = pd.to_datetime(tidy_dates,format=date_format,errors='coerce')
    schedule_df['Date'] = pd.to_datetime(schedule_df['Date'],format=date_format,errors='coerce')

    merged = hitter_df.merge(
        schedule_df[schedule_fields],
        how='left',
        left_on=['Team','Game Date'],
        right_on=['Team','Date'],
    )
    return merged.dropna()

In [42]:
# A copy of the hitter frame is passed because append_opposing_pitcher
# rewrites 'Game Date' on the frame it receives. `schedules` is passed
# directly, so its 'Date' column is converted to datetimes in place.
all_hitter_data3 = append_opposing_pitcher(all_hitter_data2.copy(),schedules)

In [43]:
# Show only the first ten merged rows rather than rendering the whole frame.
all_hitter_data3.head(10)

Unnamed: 0,Game Date,Team,Opponent,GS,AB,Runs,Hits,2B,3B,HR,...,SF,GDP,CI,bats,hitter_name,FANTASY_PTS,Date,Opp,opp_pitcher,Gm#
0,2017-04-02,SLN,Vs. CHN,1,2,0,0,0,0,0,...,0,0,0,R,Jedd_Gyorko,0,2017-04-02,CHC,Montgomery,1.0
1,2017-04-06,SLN,Vs. CHN,1,4,0,0,0,0,0,...,0,0,0,R,Jedd_Gyorko,0,2017-04-06,CHC,Lackey,3.0
2,2017-04-08,SLN,Vs. CIN,1,3,1,2,1,0,0,...,0,0,0,R,Jedd_Gyorko,18,2017-04-08,CIN,Arroyo,5.0
3,2017-04-09,SLN,Vs. CIN,1,3,0,0,0,0,0,...,0,0,0,R,Jedd_Gyorko,0,2017-04-09,CIN,Feldman,6.0
4,2017-04-10,SLN,At WSH,0,2,1,1,0,0,1,...,0,0,0,R,Jedd_Gyorko,14,2017-04-10,WSN,Roark,7.0
5,2017-04-11,SLN,At WSH,1,4,0,1,0,0,0,...,0,0,0,R,Jedd_Gyorko,3,2017-04-11,WSN,Gonzalez,8.0
6,2017-04-12,SLN,At WSH,1,4,0,1,0,0,0,...,0,0,0,R,Jedd_Gyorko,8,2017-04-12,WSN,Scherzer,9.0
7,2017-04-15,SLN,At NYA,1,3,1,1,0,0,1,...,0,0,0,R,Jedd_Gyorko,14,2017-04-15,NYY,Sabathia,11.0
8,2017-04-16,SLN,At NYA,1,3,1,1,0,0,0,...,0,0,0,R,Jedd_Gyorko,7,2017-04-16,NYY,Pineda,12.0
9,2017-04-18,SLN,Vs. PIT,0,1,0,0,0,0,0,...,0,0,0,R,Jedd_Gyorko,0,2017-04-18,PIT,Kuhl,14.0


In [44]:
# Write the merged 2017 hitter/opposing-pitcher rows to CSV, leaving out the index.
all_hitter_data3.to_csv('_data/hitter_data/hitter_data_2017_w_pitcher.csv',index=False)

### 3. Append Ballpark