# Write a new (better) scraper

In [1]:
import pandas as pd
import numpy as np
from lxml import etree
from selenium import webdriver
import time
import json
import re

In [2]:
def read_table( html_tree, 
                tablename ):
    
    # Make list to house dictionaries for each row
    rows = []
    tablepath = '//table[@id="{0}"]/tbody/tr'.format(tablename)
    
    # Split table into rows
    for row in html_tree.xpath(tablepath):
        
        # Make a dictionary to store each cell in the row
        rd = {}
        rowclass = row.xpath('./@class')
        try:
            rd["rowclass"] = rowclass[0]
        except:
            pass
        try:
            cells = [e for e in row.xpath('./td|./th')]
            for i, cell in enumerate(cells):
                
                # Depending on cell contents, add cell to row dict
                try:
                    txt = cell.xpath('./text()')
                    a_text = cell.xpath('./a/text()')
                    a_href = cell.xpath('./a/@href')
                    stat = cell.xpath('./@data-stat')
                
                    # Logic map for cell contents
                    if (len(txt) >= 1) and (len(a_text) >= 1):
                        # Have both links and standard text. Save both
                        rd[stat[0]+"_text"] = "brk, ".join(txt)
                        rd[stat[0]+"_a"] = ", ".join(a_text)
                        rd[stat[0]+"_href"] = ", ".join(a_href)
                    elif len(a_text) >= 1:
                        # Have just text from a link
                        rd[stat[0]+"_a"] = a_text[0]
                        rd[stat[0]+"_href"] = a_href[0]
                    else:
                        try:
                            # Maybe we just have text
                            rd[stat[0]] = txt[0]
                        except:
                            # If all fails, then we probably have no text
                            rd[stat[0]] = ""
                                        
                except:
                    print("Couldn't parse cell")
            
            
            # Add row dictionary to list of rows
            rows.append(rd)

        except:
            pass
        
    return rows

In [3]:
# One function to take an element tree and parse all of the tables on it
def get_tables(url):
    
    options = webdriver.ChromeOptions()
    options.add_argument('headless')

    driver = webdriver.Chrome(chrome_options=options)
    try:
        driver.get(url)
        page_html = driver.page_source
        tree = etree.HTML(page_html)
        tablenames = tree.xpath('//table/@id')
        
    except:
        print("webdriver failed to get url",url)
        tablenames = [""]
    driver.quit()
    
    tables = {}
    for tab in tablenames:
        try:
            tables[tab] = read_table(tree, tab)
        except:
            print("Failed to read table",tab)
            tables[tab] = ""
            
    return tables

In [9]:
page_dict = get_tables("https://www.pro-football-reference.com/boxscores/201511150gnb.htm")

Read scoring
Read game_info
Read officials
Read expected_points
Read team_stats
Read player_offense
Read player_defense
Read returns
Read kicking
Read home_starters
Read vis_starters
Read home_snap_counts
Read vis_snap_counts
Read targets_directions
Read rush_directions
Read pass_tackles
Read rush_tackles
Read home_drives
Read vis_drives
Read pbp_clone
Read pbp


In [10]:
pd.DataFrame(page_dict['home_drives'])

Unnamed: 0,drive_num,end_event,net_yds,play_count_tip,quarter,rowclass,start_at,time_start,time_total
0,1,Field Goal,57,,1,bold,GNB 17,15:00,4:54
1,2,Punt,2,,1,,GNB 39,8:20,1:05
2,3,Punt,25,,1,,GNB 16,6:24,2:29
3,4,Punt,2,,1,,GNB 21,1:45,1:57
4,5,Punt,46,,2,,GNB 7,13:27,5:11
5,6,Punt,7,,2,,GNB 5,1:30,0:45
6,7,End of Half,-1,,2,,GNB 20,0:12,0:12
7,8,Punt,2,,3,,GNB 20,13:33,0:45
8,9,Punt,41,,3,,GNB 16,11:20,3:10
9,10,Punt,1,,3,,GNB 8,1:23,1:34


In [13]:
with open("test.json","w") as f:
    json.dump(page_dict, f)

In [14]:
with open("test.json",'r') as f:
    read_dict = json.load(f)

In [16]:
pd.DataFrame(read_dict['vis_drives'])

Unnamed: 0,drive_num,end_event,net_yds,play_count_tip,quarter,rowclass,start_at,time_start,time_total
0,1,Punt,-1,,1,,DET 24,10:06,1:46
1,2,Punt,2,,1,,DET 26,7:15,0:51
2,3,Punt,16,,1,,DET 15,3:55,2:10
3,4,Punt,8,,2,,DET 46,14:48,1:21
4,5,Punt,55,,2,,DET 3,8:16,6:46
5,6,Field Goal,22,,2,bold,DET 47,0:45,0:33
6,7,Touchdown,1,,3,bold,GNB 1,15:00,1:27
7,8,Punt,3,,3,,GNB 47,12:48,1:28
8,9,Interception,70,,3,,DET 7,8:10,6:47
9,10,Field Goal,22,,4,bold,DET 45,14:49,1:39


# Work on getting additional pages

In [17]:
page_dict = get_tables("https://www.pro-football-reference.com/teams/gnb/2017.htm")

In [18]:
page_dict.keys()

dict_keys(['team_stats_clone', 'team_stats', 'games_clone', 'games', 'team_conversions', 'passing_clone', 'passing', 'rushing_and_receiving', 'returns', 'kicking_clone', 'kicking', 'defense', 'scoring_clone', 'scoring', 'team_td_log', 'opp_td_log_clone', 'opp_td_log'])

In [25]:
pd.DataFrame(page_dict['games']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 25 columns):
boxscore_word       17 non-null object
exp_pts_def         17 non-null object
exp_pts_off         17 non-null object
exp_pts_st          17 non-null object
first_down_def      17 non-null object
first_down_off      17 non-null object
game_date           17 non-null object
game_day_of_week    17 non-null object
game_location       17 non-null object
game_outcome        17 non-null object
game_time           17 non-null object
opp                 17 non-null object
overtime            17 non-null object
pass_yds_def        17 non-null object
pass_yds_off        17 non-null object
pts_def             17 non-null object
pts_off             17 non-null object
rush_yds_def        17 non-null object
rush_yds_off        17 non-null object
team_record         17 non-null object
to_def              17 non-null object
to_off              17 non-null object
week_num            17 non-null object


In [32]:
season_2017 = get_tables("https://www.pro-football-reference.com/years/2017/index.htm")

In [28]:
season_2017.keys()

dict_keys(['AFC', 'NFC', 'playoff_results', 'afc_playoff_standings', 'nfc_playoff_standings', 'team_stats_clone', 'team_stats', 'passing_clone', 'passing', 'rushing', 'returns', 'kicking_clone', 'kicking', 'team_scoring_clone', 'team_scoring', 'team_conversions', 'drives'])

In [37]:
results_2017 = pd.concat( 
          [pd.DataFrame(season_2017['AFC']),
           pd.DataFrame(season_2017['NFC'])] )

In [41]:
results_2017.team_href.unique()

array([nan, '/teams/nwe/2017.htm', '/teams/buf/2017.htm',
       '/teams/mia/2017.htm', '/teams/nyj/2017.htm', '/teams/pit/2017.htm',
       '/teams/rav/2017.htm', '/teams/cin/2017.htm', '/teams/cle/2017.htm',
       '/teams/jax/2017.htm', '/teams/oti/2017.htm', '/teams/htx/2017.htm',
       '/teams/clt/2017.htm', '/teams/kan/2017.htm', '/teams/sdg/2017.htm',
       '/teams/rai/2017.htm', '/teams/den/2017.htm', '/teams/phi/2017.htm',
       '/teams/dal/2017.htm', '/teams/was/2017.htm', '/teams/nyg/2017.htm',
       '/teams/min/2017.htm', '/teams/det/2017.htm', '/teams/gnb/2017.htm',
       '/teams/chi/2017.htm', '/teams/nor/2017.htm', '/teams/car/2017.htm',
       '/teams/atl/2017.htm', '/teams/tam/2017.htm', '/teams/ram/2017.htm',
       '/teams/sea/2017.htm', '/teams/crd/2017.htm', '/teams/sfo/2017.htm'], dtype=object)

In [65]:
teamyear = {}
for x in results_2017.team_href.unique()[:]:
    
    url = "http://www.pro-football-reference.com"+str(x)
    team_and_season = "".join( url.replace('.','/').split('/')[6:8] )
    teamyear[team_and_season] = {"":""}
    tries = 1
    while (tries < 3) and ( teamyear[team_and_season] == {"":""} ):
        print("reading",url)
        teamyear_page = get_tables(url)
        teamyear[team_and_season] = teamyear_page
        tries += 1
        time.sleep(0.5)

#teamyear

reading http://www.pro-football-reference.comnan
reading http://www.pro-football-reference.com/teams/nwe/2017.htm
webdriver failed to get url http://www.pro-football-reference.com/teams/nwe/2017.htm
Failed to read table 
reading http://www.pro-football-reference.com/teams/nwe/2017.htm
reading http://www.pro-football-reference.com/teams/buf/2017.htm
reading http://www.pro-football-reference.com/teams/mia/2017.htm
reading http://www.pro-football-reference.com/teams/nyj/2017.htm
reading http://www.pro-football-reference.com/teams/pit/2017.htm
reading http://www.pro-football-reference.com/teams/rav/2017.htm
reading http://www.pro-football-reference.com/teams/cin/2017.htm
reading http://www.pro-football-reference.com/teams/cle/2017.htm
reading http://www.pro-football-reference.com/teams/jax/2017.htm
reading http://www.pro-football-reference.com/teams/oti/2017.htm
reading http://www.pro-football-reference.com/teams/htx/2017.htm
reading http://www.pro-football-reference.com/teams/clt/2017.htm

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [66]:
teamyear['pit2017']

{'defense': [{'age': '28',
   'def_int': '1',
   'def_int_long': '13',
   'def_int_td': '0',
   'def_int_yds': '13',
   'fumbles': '',
   'fumbles_forced': '',
   'fumbles_rec': '',
   'fumbles_rec_td': '',
   'fumbles_rec_yds': '',
   'g': '16',
   'gs': '16',
   'pass_defended': '1',
   'player_a': 'Vince Williams',
   'player_href': '/players/W/WillVi01.htm',
   'pos': 'RILB',
   'sacks': '8.0',
   'safety_md': '',
   'tackles_assists': '20',
   'tackles_solo': '68',
   'uniform_number': '98'},
  {'age': '24',
   'def_int': '3',
   'def_int_long': '41',
   'def_int_td': '0',
   'def_int_yds': '76',
   'fumbles': '0',
   'fumbles_forced': '1',
   'fumbles_rec': '0',
   'fumbles_rec_td': '0',
   'fumbles_rec_yds': '0',
   'g': '16',
   'gs': '16',
   'pass_defended': '8',
   'player_a': 'Sean Davis',
   'player_href': '/players/D/DaviSe00.htm',
   'pos': 'SS',
   'sacks': '1.0',
   'safety_md': '',
   'tackles_assists': '21',
   'tackles_solo': '69',
   'uniform_number': '28'},
  {'ag

In [67]:
teamyear.pop("")

{}

In [71]:
pd.DataFrame(teamyear['nwe2017']['games']).boxscore_word_href.values

array(['/boxscores/201709070nwe.htm', '/boxscores/201709170nor.htm',
       '/boxscores/201709240nwe.htm', '/boxscores/201710010nwe.htm',
       '/boxscores/201710050tam.htm', '/boxscores/201710150nyj.htm',
       '/boxscores/201710220nwe.htm', '/boxscores/201710290nwe.htm', nan,
       '/boxscores/201711120den.htm', '/boxscores/201711190rai.htm',
       '/boxscores/201711260nwe.htm', '/boxscores/201712030buf.htm',
       '/boxscores/201712110mia.htm', '/boxscores/201712170pit.htm',
       '/boxscores/201712240nwe.htm', '/boxscores/201712310nwe.htm', nan,
       '/boxscores/201801130nwe.htm', '/boxscores/201801210nwe.htm',
       '/boxscores/201802040nwe.htm'], dtype=object)

# Start grabbing data for past 25 years

In [73]:
# Get main page for each season
league_season_pages = {}
for i in range(25):
    year = str(2018 - i)
    url = "https://www.pro-football-reference.com/years/"+year+"/index.htm"

    league_season_pages[year] = {"":""}
    tries = 1
    while (tries < 3) and ( league_season_pages[year] == {"":""} ):
        print("reading",url)
        leagueyear_page = get_tables(url)
        league_season_pages[year] = leagueyear_page
        tries += 1
        time.sleep(0.5)
    

with open("../pfr_pages/league_season_pages.json", "w") as f:
    json.dump(league_season_pages, f)

reading https://www.pro-football-reference.com/years/2018/index.htm
reading https://www.pro-football-reference.com/years/2017/index.htm
reading https://www.pro-football-reference.com/years/2016/index.htm
reading https://www.pro-football-reference.com/years/2015/index.htm
reading https://www.pro-football-reference.com/years/2014/index.htm
reading https://www.pro-football-reference.com/years/2013/index.htm
reading https://www.pro-football-reference.com/years/2012/index.htm
reading https://www.pro-football-reference.com/years/2011/index.htm
reading https://www.pro-football-reference.com/years/2010/index.htm
reading https://www.pro-football-reference.com/years/2009/index.htm
reading https://www.pro-football-reference.com/years/2008/index.htm
reading https://www.pro-football-reference.com/years/2007/index.htm
reading https://www.pro-football-reference.com/years/2006/index.htm
reading https://www.pro-football-reference.com/years/2005/index.htm
reading https://www.pro-football-reference.com/y

In [86]:
# Get page for each team in each season

for i in range(24):
    year = str(2017 - i)
    team_season_pages = {}
    
    team_results = pd.concat( [
           pd.DataFrame(league_season_pages[year]['AFC']),
           pd.DataFrame(league_season_pages[year]['NFC']) ] )
    teamids = team_results.team_href.unique()

    for x in [s for s in teamids[:] if 'nan' not in str(s)]:
        url = "http://www.pro-football-reference.com"+str(x)
        team_and_season = "".join( url.replace('.','/').split('/')[6:8] )
        team_season_pages[team_and_season] = {"":""}
        tries = 1
        while (tries < 3) and ( team_season_pages[team_and_season] == {"":""} ):
            print("reading",url)
            team_season_pages[team_and_season] = get_tables(url)
            tries += 1
            time.sleep(0.25)
            
    with open("../pfr_pages/team_season_pages_"+year+".json", "w") as f:
        json.dump(team_season_pages, f)

reading http://www.pro-football-reference.com/teams/nwe/2017.htm
reading http://www.pro-football-reference.com/teams/buf/2017.htm
reading http://www.pro-football-reference.com/teams/mia/2017.htm
webdriver failed to get url http://www.pro-football-reference.com/teams/mia/2017.htm
Failed to read table 
reading http://www.pro-football-reference.com/teams/mia/2017.htm
reading http://www.pro-football-reference.com/teams/nyj/2017.htm
reading http://www.pro-football-reference.com/teams/pit/2017.htm
reading http://www.pro-football-reference.com/teams/rav/2017.htm
reading http://www.pro-football-reference.com/teams/cin/2017.htm
reading http://www.pro-football-reference.com/teams/cle/2017.htm
reading http://www.pro-football-reference.com/teams/jax/2017.htm
reading http://www.pro-football-reference.com/teams/oti/2017.htm
reading http://www.pro-football-reference.com/teams/htx/2017.htm
reading http://www.pro-football-reference.com/teams/clt/2017.htm
reading http://www.pro-football-reference.com/te

reading http://www.pro-football-reference.com/teams/chi/2014.htm
reading http://www.pro-football-reference.com/teams/car/2014.htm
reading http://www.pro-football-reference.com/teams/nor/2014.htm
reading http://www.pro-football-reference.com/teams/atl/2014.htm
reading http://www.pro-football-reference.com/teams/tam/2014.htm
reading http://www.pro-football-reference.com/teams/sea/2014.htm
reading http://www.pro-football-reference.com/teams/crd/2014.htm
reading http://www.pro-football-reference.com/teams/sfo/2014.htm
reading http://www.pro-football-reference.com/teams/ram/2014.htm
reading http://www.pro-football-reference.com/teams/nwe/2013.htm
reading http://www.pro-football-reference.com/teams/nyj/2013.htm
reading http://www.pro-football-reference.com/teams/mia/2013.htm
reading http://www.pro-football-reference.com/teams/buf/2013.htm
reading http://www.pro-football-reference.com/teams/cin/2013.htm
reading http://www.pro-football-reference.com/teams/pit/2013.htm
reading http://www.pro-fo

reading http://www.pro-football-reference.com/teams/det/2010.htm
reading http://www.pro-football-reference.com/teams/min/2010.htm
reading http://www.pro-football-reference.com/teams/atl/2010.htm
reading http://www.pro-football-reference.com/teams/nor/2010.htm
reading http://www.pro-football-reference.com/teams/tam/2010.htm
reading http://www.pro-football-reference.com/teams/car/2010.htm
reading http://www.pro-football-reference.com/teams/sea/2010.htm
reading http://www.pro-football-reference.com/teams/ram/2010.htm
reading http://www.pro-football-reference.com/teams/sfo/2010.htm
reading http://www.pro-football-reference.com/teams/crd/2010.htm
reading http://www.pro-football-reference.com/teams/nwe/2009.htm
reading http://www.pro-football-reference.com/teams/nyj/2009.htm
reading http://www.pro-football-reference.com/teams/mia/2009.htm
reading http://www.pro-football-reference.com/teams/buf/2009.htm
reading http://www.pro-football-reference.com/teams/cin/2009.htm
reading http://www.pro-fo

reading http://www.pro-football-reference.com/teams/gnb/2006.htm
reading http://www.pro-football-reference.com/teams/min/2006.htm
reading http://www.pro-football-reference.com/teams/det/2006.htm
reading http://www.pro-football-reference.com/teams/nor/2006.htm
reading http://www.pro-football-reference.com/teams/car/2006.htm
reading http://www.pro-football-reference.com/teams/atl/2006.htm
reading http://www.pro-football-reference.com/teams/tam/2006.htm
reading http://www.pro-football-reference.com/teams/sea/2006.htm
reading http://www.pro-football-reference.com/teams/ram/2006.htm
reading http://www.pro-football-reference.com/teams/sfo/2006.htm
reading http://www.pro-football-reference.com/teams/crd/2006.htm
reading http://www.pro-football-reference.com/teams/nwe/2005.htm
reading http://www.pro-football-reference.com/teams/mia/2005.htm
reading http://www.pro-football-reference.com/teams/buf/2005.htm
reading http://www.pro-football-reference.com/teams/nyj/2005.htm
reading http://www.pro-fo

reading http://www.pro-football-reference.com/teams/gnb/2002.htm
reading http://www.pro-football-reference.com/teams/min/2002.htm
reading http://www.pro-football-reference.com/teams/chi/2002.htm
reading http://www.pro-football-reference.com/teams/det/2002.htm
reading http://www.pro-football-reference.com/teams/tam/2002.htm
reading http://www.pro-football-reference.com/teams/atl/2002.htm
reading http://www.pro-football-reference.com/teams/nor/2002.htm
reading http://www.pro-football-reference.com/teams/car/2002.htm
reading http://www.pro-football-reference.com/teams/sfo/2002.htm
reading http://www.pro-football-reference.com/teams/ram/2002.htm
reading http://www.pro-football-reference.com/teams/sea/2002.htm
reading http://www.pro-football-reference.com/teams/crd/2002.htm
reading http://www.pro-football-reference.com/teams/nwe/2001.htm
reading http://www.pro-football-reference.com/teams/mia/2001.htm
reading http://www.pro-football-reference.com/teams/nyj/2001.htm
reading http://www.pro-fo

KeyboardInterrupt: 

In [88]:
for i in range(7):
    year = str(2001 - i)
    team_season_pages = {}
    
    team_results = pd.concat( [
           pd.DataFrame(league_season_pages[year]['AFC']),
           pd.DataFrame(league_season_pages[year]['NFC']) ] )
    teamids = team_results.team_href.unique()

    for x in [s for s in teamids[:] if 'nan' not in str(s)]:
        url = "http://www.pro-football-reference.com"+str(x)
        team_and_season = "".join( url.replace('.','/').split('/')[6:8] )
        team_season_pages[team_and_season] = {"":""}
        tries = 1
        while (tries < 4) and ( team_season_pages[team_and_season] == {"":""} ):
            print("reading",url)
            team_season_pages[team_and_season] = get_tables(url)
            tries += 1
            time.sleep(0.25)
            
    with open("../pfr_pages/team_season_pages_"+year+".json", "w") as f:
        json.dump(team_season_pages, f)

reading http://www.pro-football-reference.com/teams/nwe/2001.htm
reading http://www.pro-football-reference.com/teams/mia/2001.htm
reading http://www.pro-football-reference.com/teams/nyj/2001.htm
reading http://www.pro-football-reference.com/teams/clt/2001.htm
reading http://www.pro-football-reference.com/teams/buf/2001.htm
webdriver failed to get url http://www.pro-football-reference.com/teams/buf/2001.htm
Failed to read table 
reading http://www.pro-football-reference.com/teams/buf/2001.htm
reading http://www.pro-football-reference.com/teams/pit/2001.htm
reading http://www.pro-football-reference.com/teams/rav/2001.htm
webdriver failed to get url http://www.pro-football-reference.com/teams/rav/2001.htm
Failed to read table 
reading http://www.pro-football-reference.com/teams/rav/2001.htm
webdriver failed to get url http://www.pro-football-reference.com/teams/rav/2001.htm
Failed to read table 
reading http://www.pro-football-reference.com/teams/cle/2001.htm
webdriver failed to get url h

webdriver failed to get url http://www.pro-football-reference.com/teams/clt/1998.htm
Failed to read table 
reading http://www.pro-football-reference.com/teams/clt/1998.htm
reading http://www.pro-football-reference.com/teams/jax/1998.htm
webdriver failed to get url http://www.pro-football-reference.com/teams/jax/1998.htm
Failed to read table 
reading http://www.pro-football-reference.com/teams/jax/1998.htm
reading http://www.pro-football-reference.com/teams/oti/1998.htm
reading http://www.pro-football-reference.com/teams/pit/1998.htm
reading http://www.pro-football-reference.com/teams/rav/1998.htm
reading http://www.pro-football-reference.com/teams/cin/1998.htm
webdriver failed to get url http://www.pro-football-reference.com/teams/cin/1998.htm
Failed to read table 
reading http://www.pro-football-reference.com/teams/cin/1998.htm
reading http://www.pro-football-reference.com/teams/den/1998.htm
webdriver failed to get url http://www.pro-football-reference.com/teams/den/1998.htm
Failed to

reading http://www.pro-football-reference.com/teams/tam/1996.htm
reading http://www.pro-football-reference.com/teams/det/1996.htm
reading http://www.pro-football-reference.com/teams/car/1996.htm
reading http://www.pro-football-reference.com/teams/sfo/1996.htm
reading http://www.pro-football-reference.com/teams/ram/1996.htm
reading http://www.pro-football-reference.com/teams/atl/1996.htm
reading http://www.pro-football-reference.com/teams/nor/1996.htm
reading http://www.pro-football-reference.com/teams/buf/1995.htm
reading http://www.pro-football-reference.com/teams/clt/1995.htm
reading http://www.pro-football-reference.com/teams/mia/1995.htm
reading http://www.pro-football-reference.com/teams/nwe/1995.htm
webdriver failed to get url http://www.pro-football-reference.com/teams/nwe/1995.htm
Failed to read table 
reading http://www.pro-football-reference.com/teams/nwe/1995.htm
reading http://www.pro-football-reference.com/teams/nyj/1995.htm
reading http://www.pro-football-reference.com/te

# Start looking at individual games

In [4]:
yr = 2000
year_file = "../pfr_pages/team_season_pages_"+str(yr)+".json"
with open(year_file,'r') as f:
    year_dict = json.load(f)

all_game_dfs = []
for k in year_dict:
    all_game_dfs.append( pd.DataFrame(year_dict[k]['games']) )
    
game_urls = pd.concat(all_game_dfs).boxscore_word_href.unique()

In [41]:
# Write the game_pages.json file for the first time
game_pages = {}

for gid in game_urls[:2]:
    url = "http://www.pro-football-reference.com"+str(gid)
    gameid = url.replace(".","/").split("/")[6]
    
    game_pages[gameid] = {"":""}   
    tries = 1
    while (tries <= 3) and ( game_pages[gameid] == {"":""} ):
        print("reading",url)
        game_pages[gameid] = get_tables(url)
        tries += 1
        time.sleep(0.25)

with open("../pfr_pages/game_pages.json", "a+") as f:
    json.dump(game_pages, f)

reading http://www.pro-football-reference.com/boxscores/201709070nwe.htm
reading http://www.pro-football-reference.com/boxscores/201709170nor.htm


In [5]:
game_pages = {}

with open("../pfr_pages/game_pages.json", "r") as f:
    existing_games = json.load(f)

for gid in game_urls[:]:
    url = "http://www.pro-football-reference.com"+str(gid)
    try:
        gameid = url.replace(".","/").split("/")[6]
    except:
        print("weird gid =",gid)
        gameid = 'nan'
    
    # Check existing games for this gameid
    # If it's not there, grab it from online
    if (gameid in existing_games) or (str(gid).lower() == 'nan'):
        print("Already have",gameid)
        
    else:
        game_pages[gameid] = {"":""}   
        tries = 1
        while (tries <= 3) and ( game_pages[gameid] == {"":""} ):
            print("reading",url)
            game_pages[gameid] = get_tables(url)
            tries += 1
            time.sleep(0.1)

with open("../pfr_pages/game_pages.json", "w") as f:
    # Combine dictionaries and write to the same file as above
    all_games = {**existing_games, **game_pages}
    json.dump(all_games, f)
    

Already have 200009030mia
Already have 200009100min
Already have 200009170mia
Already have 200009240mia
Already have 200010010cin
Already have 200010080mia
weird gid = nan
Already have nan
Already have 200010230nyj
Already have 200010290mia
Already have 200011050det
Already have 200011120sdg
Already have 200011190mia
Already have 200011260clt
Already have 200012030buf
Already have 200012100mia
Already have 200012170mia
Already have 200012240nwe
Already have 200012300mia
Already have 200101060rai
Already have 200009030kan
Already have 200009100clt
Already have 200009250clt
Already have 200010010buf
Already have 200010080nwe
Already have 200010150sea
Already have 200010220clt
Already have 200010290clt
Already have 200011050chi
Already have 200011120clt
Already have 200011190gnb
Already have 200012030nyj
Already have 200012110clt
Already have 200012240clt
Already have 200009030gnb
Already have 200009110nyj
Already have 200009170nyj
Already have 200009240tam
Already have 200010080nyj
Alrea

reading http://www.pro-football-reference.com/boxscores/200010010det.htm
reading http://www.pro-football-reference.com/boxscores/200010090min.htm
reading http://www.pro-football-reference.com/boxscores/200010150chi.htm
reading http://www.pro-football-reference.com/boxscores/200010290tam.htm
reading http://www.pro-football-reference.com/boxscores/200011060gnb.htm
reading http://www.pro-football-reference.com/boxscores/200011190min.htm
reading http://www.pro-football-reference.com/boxscores/200011300min.htm
reading http://www.pro-football-reference.com/boxscores/200012100ram.htm
reading http://www.pro-football-reference.com/boxscores/200012170min.htm
reading http://www.pro-football-reference.com/boxscores/200101060min.htm
reading http://www.pro-football-reference.com/boxscores/200009100tam.htm
reading http://www.pro-football-reference.com/boxscores/200009170det.htm
reading http://www.pro-football-reference.com/boxscores/200010190tam.htm
reading http://www.pro-football-reference.com/boxsc

In [4]:
# Now go back and get the games that got missed earlier
game_pages = {}
with open("../pfr_pages/game_pages.json", "r") as f:
    existing_games = json.load(f)

for gid in existing_games.keys():
    if (existing_games[gid] == {"":""}) or (existing_games[gid] == {}):
        tries = 1
        game_pages[gid] = {"":""}
        while (tries <= 5) and (game_pages[gid] == {"":""}):
            url = "http://www.pro-football-reference.com/boxscores/"+str(gid)+".htm"
            print("reading",url)
            game_pages[gid] = get_tables(url)
            tries += 1
            
with open("../pfr_pages/game_pages.json", "w") as f:
    # Combine dictionaries and write to the same file as above
    all_games = {**existing_games, **game_pages}
    json.dump(all_games, f)

reading http://www.pro-football-reference.com/boxscores/201212020rav.htm
reading http://www.pro-football-reference.com/boxscores/201109180buf.htm
reading http://www.pro-football-reference.com/boxscores/201012120det.htm
webdriver failed to get url http://www.pro-football-reference.com/boxscores/201012120det.htm
Failed to read table 
reading http://www.pro-football-reference.com/boxscores/201012120det.htm
reading http://www.pro-football-reference.com/boxscores/200910040nwe.htm
reading http://www.pro-football-reference.com/boxscores/200911260den.htm


In [None]:
all_games['201012120']

# Scrape Draft pages

In [5]:
# Get main page for each season
draft_pages = {}
for i in range(25):
    year = str(2018 - i)
    url = "https://www.pro-football-reference.com/years/"+year+"/draft.htm"

    draft_pages[year] = {"":""}
    tries = 1
    while (tries < 3) and ( draft_pages[year] == {"":""} ):
        print("reading",url)
        year_page = get_tables(url)
        draft_pages[year] = year_page
        tries += 1
        time.sleep(0.5)
    

with open("../pfr_pages/draft_pages.json", "w") as f:
    json.dump(draft_pages, f)

reading https://www.pro-football-reference.com/years/2018/draft.htm
reading https://www.pro-football-reference.com/years/2017/draft.htm
reading https://www.pro-football-reference.com/years/2016/draft.htm
reading https://www.pro-football-reference.com/years/2015/draft.htm
webdriver failed to get url https://www.pro-football-reference.com/years/2015/draft.htm
Failed to read table 
reading https://www.pro-football-reference.com/years/2015/draft.htm
reading https://www.pro-football-reference.com/years/2014/draft.htm
reading https://www.pro-football-reference.com/years/2013/draft.htm
reading https://www.pro-football-reference.com/years/2012/draft.htm
reading https://www.pro-football-reference.com/years/2011/draft.htm
reading https://www.pro-football-reference.com/years/2010/draft.htm
reading https://www.pro-football-reference.com/years/2009/draft.htm
reading https://www.pro-football-reference.com/years/2008/draft.htm
reading https://www.pro-football-reference.com/years/2007/draft.htm
readin

In [9]:
pd.DataFrame(draft_pages['2017']['drafts'])

Unnamed: 0,age,all_pros_first_team,career_av,college_id,college_id_a,college_id_href,college_link,college_link_a,college_link_href,def_int,...,rush_att,rush_td,rush_yds,sacks,tackles_solo,team,team_a,team_href,year_max,years_as_primary_starter
0,21,0,4,,Texas A&M,/schools/texasam/,,College Stats,http://www.sports-reference.com/cfb/players/my...,,...,0,0,0,7.0,19,,CLE,/teams/cle/2017_draft.htm,2017,1
1,23,0,7,,North Carolina,/schools/nocarolina/,,College Stats,http://www.sports-reference.com/cfb/players/mi...,,...,41,2,248,,,,CHI,/teams/chi/2017_draft.htm,2017,1
2,21,0,6,,Stanford,/schools/stanford/,,College Stats,http://www.sports-reference.com/cfb/players/so...,,...,0,0,0,3.0,34,,SFO,/teams/sfo/2017_draft.htm,2017,1
3,22,0,8,,LSU,/schools/lsu/,,College Stats,http://www.sports-reference.com/cfb/players/le...,,...,268,9,1040,,,,JAX,/teams/jax/2017_draft.htm,2017,1
4,22,0,3,,West. Michigan,/schools/westmichigan/,,College Stats,http://www.sports-reference.com/cfb/players/co...,,...,0,0,0,,,,TEN,/teams/oti/2017_draft.htm,2017,1
5,21,0,7,,LSU,/schools/lsu/,,College Stats,http://www.sports-reference.com/cfb/players/ja...,,...,0,0,0,2.0,63,,NYJ,/teams/nyj/2017_draft.htm,2017,1
6,22,0,1,,Clemson,/schools/clemson/,,College Stats,http://www.sports-reference.com/cfb/players/mi...,,...,0,0,0,,,,LAC,/teams/sdg/2017_draft.htm,2017,0
7,21,0,9,,Stanford,/schools/stanford/,,College Stats,http://www.sports-reference.com/cfb/players/ch...,,...,117,2,435,,,,CAR,/teams/car/2017_draft.htm,2017,1
8,22,0,0,,Washington,/schools/washington/,,College Stats,http://www.sports-reference.com/cfb/players/jo...,,...,1,0,12,,,,CIN,/teams/cin/2017_draft.htm,2017,0
9,22,0,1,,Texas Tech,/schools/texastech/,,College Stats,http://www.sports-reference.com/cfb/players/pa...,,...,7,0,10,,,,KAN,/teams/kan/2017_draft.htm,2017,0


# Scrape Combine Results

In [10]:
# Get main page for each season
combine_pages = {}
for i in range(25):
    year = str(2018 - i)
    url = "https://www.pro-football-reference.com/draft/"+year+"-combine.htm"

    combine_pages[year] = {"":""}
    tries = 1
    while (tries < 3) and ( combine_pages[year] == {"":""} ):
        print("reading",url)
        year_page = get_tables(url)
        combine_pages[year] = year_page
        tries += 1
        time.sleep(0.5)
    

with open("../pfr_pages/combine_pages.json", "w") as f:
    json.dump(combine_pages, f)

reading https://www.pro-football-reference.com/draft/2018-combine.htm
reading https://www.pro-football-reference.com/draft/2017-combine.htm
reading https://www.pro-football-reference.com/draft/2016-combine.htm
reading https://www.pro-football-reference.com/draft/2015-combine.htm
reading https://www.pro-football-reference.com/draft/2014-combine.htm
reading https://www.pro-football-reference.com/draft/2013-combine.htm
reading https://www.pro-football-reference.com/draft/2012-combine.htm
reading https://www.pro-football-reference.com/draft/2011-combine.htm
reading https://www.pro-football-reference.com/draft/2010-combine.htm
reading https://www.pro-football-reference.com/draft/2009-combine.htm
reading https://www.pro-football-reference.com/draft/2008-combine.htm
reading https://www.pro-football-reference.com/draft/2007-combine.htm
reading https://www.pro-football-reference.com/draft/2006-combine.htm
reading https://www.pro-football-reference.com/draft/2005-combine.htm
reading https://www.

In [13]:
pd.DataFrame(combine_pages['2017']['combine'])

Unnamed: 0,bench_reps,broad_jump,college,college_a,college_href,cone,draft_info,draft_info_a,draft_info_href,draft_info_text,...,player_a,player_href,pos,rowclass,school_name,school_name_a,school_name_href,shuttle,vertical,weight
0,8,125,,College Stats,https://www.sports-reference.com/cfb/players/r...,6.98,,2017,/years/2017/draft.htm,Minnesota Vikings / 5th / 170th pick /,...,Rodney Adams,/players/A/AdamRo01.htm,WR,,,South Florida,/schools/soflorida/,4.28,29.5,189
1,22,108,,College Stats,https://www.sports-reference.com/cfb/players/m...,7.62,,2017,/years/2017/draft.htm,Green Bay Packers / 3rd / 93rd pick /,...,Montravius Adams,/players/A/AdamMo00.htm,DT,,,Auburn,/schools/auburn/,4.89,29.0,304
2,18,120,,College Stats,https://www.sports-reference.com/cfb/players/j...,6.96,,2017,/years/2017/draft.htm,New York Jets / 1st / 6th pick /,...,Jamal Adams,/players/A/AdamJa00.htm,SS,,,LSU,/schools/lsu/,4.13,31.5,214
3,8,123,,College Stats,https://www.sports-reference.com/cfb/players/q...,6.73,,,,,...,Quincy Adeboyejo,/players/A/AdebQu00.htm,WR,,,Mississippi,/schools/mississippi/,4.14,34.5,197
4,21,108,,College Stats,https://www.sports-reference.com/cfb/players/j...,7.49,,2017,/years/2017/draft.htm,Washington Redskins / 1st / 17th pick /,...,Jonathan Allen,/players/A/AlleJo01.htm,DT,,,Alabama,/schools/alabama/,4.50,30.0,286
5,15,117,,College Stats,https://www.sports-reference.com/cfb/players/b...,6.64,,2017,/years/2017/draft.htm,Pittsburgh Steelers / 5th / 173rd pick /,...,Brian Allen,/players/A/AlleBr01.htm,CB,,,Utah,/schools/utah/,4.34,34.5,215
6,,,,College Stats,https://www.sports-reference.com/cfb/players/r...,,,2017,/years/2017/draft.htm,Washington Redskins / 2nd / 49th pick /,...,Ryan Anderson,/players/A/AndeRy00.htm,OLB,,,Alabama,/schools/alabama/,,,253
7,16,116,,College Stats,https://www.sports-reference.com/cfb/players/a...,6.88,,2017,/years/2017/draft.htm,New Orleans Saints / 3rd / 76th pick /,...,Alex Anzalone,/players/A/AnzaAl00.htm,OLB,,,Florida,/schools/florida/,4.25,30.5,241
8,35,102,,College Stats,https://www.sports-reference.com/cfb/players/i...,7.83,,2017,/years/2017/draft.htm,Miami Dolphins / 5th / 164th pick /,...,Isaac Asiata,/players/A/AsiaIs00.htm,OG,,,Utah,/schools/utah/,4.93,25.5,323
9,24,107,,,,8.13,,,,,...,,,OG,,,Charleston Southern,/schools//,4.90,27.5,301
