In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time

In [2]:
def make_soup(url, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``lxml`` parser.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page as data
    res.raise_for_status()
    return BeautifulSoup(res.text, 'lxml')

In [10]:
# Function to get gameIds/home/away for a particular year/week
results = {}  # module-level dict kept for other cells; get_gameId does not write to it
def get_gameId(year,week):
    """Scrape ESPN's regular-season schedule page for one week.

    Parameters
    ----------
    year : int
        Season year inserted into the schedule URL.
    week : int
        Week number inserted into the schedule URL.

    Returns
    -------
    tuple
        ``(gameids, hm, aw)`` where ``gameids`` is a list of gameId strings
        in page order, and ``hm`` / ``aw`` are dicts mapping each gameId to
        the home / away team abbreviation taken from the team link URL.

    Raises
    ------
    ValueError
        If the page has no ``main-container`` section, or if fewer home/away
        teams than games were found (the page layout is not what this
        scraper expects).
    """
    # Make a soup object for the appropriate page
    url = "http://www.espn.com/nfl/schedule/_/week/{0}/year/{1}/seasontype/2".format(week,year)
    soup = make_soup(url)
    sched_page = soup.find("section",{"id":"main-container"})
    if sched_page is None:
        raise ValueError("No 'main-container' section found at {0}".format(url))

    # Collect gameIds from every game link on the page
    marker = "gameId="
    gameids = []
    for link in sched_page.find_all("a"):
        # Anchors without an href yield None; treat them as an empty string
        href = link.get('href') or ""
        if marker in href and "watchespn" not in href:
            # Keep only the id itself, dropping any further query parameters
            gameids.append(href.split(marker)[1].split("&")[0])

    # Collect team abbreviations; even positions are away, odd are home
    # NOTE(review): parity uses the position among ALL team-name links, so a
    # link that fails the href filter shifts later pairings -- confirm that
    # every team-name link on the page is a real team slot.
    hometeams = []
    awayteams = []
    teamnames = sched_page.find_all("a", {"class":"team-name"})
    for i, link in enumerate(teamnames):
        href = link.get('href') or ""
        if "team/_/name" in href:
            # e.g. ".../team/_/name/atl/..." -> "atl"
            abbr = href.split("name")[1].split("/")[1]
            if i%2:
                hometeams.append(abbr)
            else:
                awayteams.append(abbr)

    # Extra team links beyond the number of games are ignored, but a
    # shortfall means games and teams cannot be paired up.
    if len(hometeams) < len(gameids) or len(awayteams) < len(gameids):
        raise ValueError(
            "Found {0} games but only {1} home / {2} away teams at {3}".format(
                len(gameids), len(hometeams), len(awayteams), url))

    # Pair each gameId with the team at the same position
    hm = dict(zip(gameids, hometeams))
    aw = dict(zip(gameids, awayteams))

    return (gameids, hm, aw)

In [11]:
# Accumulators filled while walking every week of the requested season(s)
gameids = []                    # one list of gameIds per week scraped
gameresults = {}                # never populated in this cell
gameyear, gameweek = {}, {}     # gameId -> season / week
hometeam, awayteam = {}, {}     # gameId -> home / away abbreviation

for year in range(2018, 2019):
    for week in range(1, 18):
        print("Looking up gameIds for {0} week {1}".format(year, week))
        ids, home, away = get_gameId(year, week)
        gameids.append(ids)

        # Record season, week and both teams under each gameId
        for i in ids:
            gameyear[i], gameweek[i] = year, week
            hometeam[i] = home[i]
            awayteam[i] = away[i]

        # Pause between requests so we don't get blocked
        time.sleep(0.5)

# Collapse the per-week lists into a single flat list of gameIds
ids = [game for weekly in gameids for game in weekly]
print(gameresults)

Looking up gameIds for 2018 week 1
Looking up gameIds for 2018 week 2
Looking up gameIds for 2018 week 3
Looking up gameIds for 2018 week 4
Looking up gameIds for 2018 week 5
Looking up gameIds for 2018 week 6
Looking up gameIds for 2018 week 7
Looking up gameIds for 2018 week 8
Looking up gameIds for 2018 week 9
Looking up gameIds for 2018 week 10
Looking up gameIds for 2018 week 11
Looking up gameIds for 2018 week 12
Looking up gameIds for 2018 week 13
Looking up gameIds for 2018 week 14
Looking up gameIds for 2018 week 15
Looking up gameIds for 2018 week 16
Looking up gameIds for 2018 week 17
{}


In [12]:
# Make dataframe for game-specific information using our dictionaries for year and week
# One record per game, pulling season, week and teams from the lookup dicts
data = [
    dict(gameId=i,
         season=gameyear[i],
         week=gameweek[i],
         home=hometeam[i],
         away=awayteam[i])
    for i in ids
]

# Build the frame and use gameId as the unique row identifier
gamedata_df = pd.DataFrame(data).set_index('gameId')
gamedata_df.head(10)

Unnamed: 0_level_0,away,home,season,week
gameId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
401030710,atl,phi,2018,1
401030718,pit,cle,2018,1
401030717,cin,ind,2018,1
401030716,ten,mia,2018,1
401030715,sf,min,2018,1
401030714,hou,ne,2018,1
401030713,tb,no,2018,1
401030712,jax,nyg,2018,1
401030711,buf,bal,2018,1
401030720,kc,lac,2018,1


In [13]:
# Write the schedule (gameId index included) to disk for later use
gamedata_df.to_csv(path_or_buf="../data/2018_schedule.csv")

In [14]:
# Load the previously scraped 2009-2017 game data and summarise its columns
pastgames = pd.read_csv(filepath_or_buffer="../data/espn_gamedata2009-2017.csv")
pastgames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2306 entries, 0 to 2305
Data columns (total 10 columns):
gameId        2306 non-null int64
result        2306 non-null object
season        2306 non-null int64
week          2306 non-null int64
home          2297 non-null object
away          2297 non-null object
winner        2306 non-null object
home_score    2306 non-null object
away_score    2306 non-null object
OT            2306 non-null object
dtypes: int64(3), object(7)
memory usage: 180.2+ KB


In [15]:
# Peek at a few random rows; a fixed seed keeps the displayed sample
# identical across Restart & Run All
pastgames.sample(5, random_state=0)

Unnamed: 0,gameId,result,season,week,home,away,winner,home_score,away_score,OT
830,321001006,"CHI 34, DAL 18",2012,4,CHI,DAL,CHI,34,18,0
1834,400874620,"SEA 37, SF 18",2016,3,SF,SEA,SEA,18,37,0
1327,400554288,"CHI 27, NYJ 19",2014,3,CHI,NYJ,CHI,27,19,0
1622,400791637,"HOU 31, JAX 20",2015,6,HOU,JAX,HOU,31,20,0
966,321209023,"SD 34, PIT 24",2012,14,SD,PIT,SD,34,24,0


In [17]:
# Use gameId as the row identifier. The guard makes the cell safe to re-run:
# once gameId is the index it is no longer a column, so calling set_index
# again would raise KeyError.
if pastgames.index.name != 'gameId':
    pastgames = pastgames.set_index('gameId')
pastgames.head(3)

Unnamed: 0_level_0,result,season,week,home,away,winner,home_score,away_score,OT
gameId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
290910023,"PIT 13, TEN 10 (OT)",2009,1,TEN,PIT,PIT,10,13,1
290913001,"ATL 19, MIA 7",2009,1,MIA,ATL,ATL,7,19,0
290913004,"DEN 12, CIN 7",2009,1,DEN,CIN,DEN,12,7,0


In [None]:
pastgames.