In [5]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import json

In [12]:
# site used for scraping data (root URL; request paths are appended to it)
baseUrl = 'https://www.nfl.com'
years = range(2015,2021) # seasons 2015 through 2020; range() excludes its end value, 2021

In [46]:
# function for getting stats off nfl.com/stats/
def getStats(url, payload):
    """Scrape a (possibly paginated) stats table into a DataFrame.

    Parameters
    ----------
    url : str
        Base URL; every non-empty value in ``payload`` is appended to it,
        in the payload's iteration order, each followed by a '/'.
    payload : dict
        Path segments of the stats page to request.

    Returns
    -------
    pandas.DataFrame
        One row per table row that contains ``td`` cells, with columns named
        after the ``th`` cells of the first page, sorted by 'Team' with a
        fresh index. An empty DataFrame when the first page has no table.
    """
    # local import so the block does not depend on other cells for this name
    from urllib.parse import urljoin

    # build the request URL from the non-empty payload values
    for key in payload:
        if payload[key]:
            url += payload[key]
            url += '/'

    columnHeader = None
    tableRows = []

    # follow the "next page" link until there is none; every page (including
    # the first) is requested exactly once
    while url:
        time.sleep(0.5)  # pause between requests to the site
        # timeout = (connect, read) seconds, so a stalled connection cannot hang the cell
        response = requests.get(url, timeout=(30, 27))
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')
        if table is None:
            if columnHeader is None:
                # the first page has no table at all: nothing to scrape
                return pd.DataFrame()
            # a later page lost its table: keep the rows gathered so far
            break
        print("url:", response.url)

        # column names come from the header cells of the first page
        if columnHeader is None:
            columnHeader = [col.text.strip('\n') for col in table.find_all('th')]

        for row in table.find_all('tr'):
            cols = row.find_all('td')
            if cols:
                cols = [ele.text.strip().rstrip('\n') for ele in cols]
                # NOTE(review): dropping empty cells shifts the remaining values
                # left if a cell in the middle of a row is blank -- confirm the
                # site never renders blank cells
                tableRows.append([ele for ele in cols if ele])

        nextLink = soup.find("a", {"class": "nfl-o-table-pagination__next"})
        href = nextLink.get('href') if nextLink else None
        # resolve the link against the page it came from
        url = urljoin(response.url, href) if href else None

    resultsDf = pd.DataFrame(tableRows, columns=columnHeader)
    # keep only the text up to the first line break of each team cell
    firstLine = re.compile(r"^(\n*).*")
    resultsDf['Team'] = resultsDf['Team'].apply(lambda x: firstLine.search(x).group())
    return resultsDf.sort_values(by=['Team']).reset_index(drop=True)

In [48]:
# function for getting team stats, calls getStats()
def getTeamStats(url,side,categ,season,cols):
    """Collect one season's team stats for the PRE, REG and POST season types.

    Parameters
    ----------
    url : str
        Base URL handed to getStats().
    side : str
        Value of the "category" path segment (the callers in this notebook
        pass "offense" or "defense").
    categ : str
        Value of the "statisticCategory" path segment.
    season : str
        Season year; used as a path segment and stored in the 'year' column.
    cols : list of str
        Names of the stat columns to keep.

    Returns
    -------
    pandas.DataFrame
        Columns 'Team', 'weekType', every name in ``cols``, a '<name>(o)'
        column per stat, and 'year'. For each stat, '<name>(o)' holds the
        rank as a string: the largest value gets '1', the smallest gets the
        number of rows. When a season type has no table, the REG team list is
        reused with every stat and every '<name>(o)' set to 0.
    """
    allCols = ['Team', 'weekType'] + cols
    seasonFrames = []

    for st in ["PRE", "REG", "POST"]:
        teamPayload = {"stats":"/stats","subject":"team-stats",
               "category":side,
               "statisticCategory":categ,
               "season":season,
               "seasonType":st,
               "list":"all"}
        # use the caller's url instead of reaching for the global baseUrl
        tempDf = getStats(url, teamPayload)
        isEmpty = tempDf.empty
        if isEmpty:
            # no table for this season type: borrow the REG team list and zero the stats
            teamPayload['seasonType'] = "REG"
            tempDf = getStats(url, teamPayload)
            for i in cols:
                tempDf[i] = 0
        tempDf['weekType'] = st
        # .copy() makes the selection an independent frame, so the column
        # assignments below do not trigger SettingWithCopyWarning
        tempDf = tempDf[allCols].copy()

        # change raw stats to ordering
        for i in cols:
            colName = i + '(o)'
            tempDf[colName] = pd.to_numeric(tempDf[i])
            if isEmpty:
                continue  # placeholder rows keep the numeric 0
            # ascending sort, then count down: the last (largest) row gets rank 1
            tempDf = tempDf.sort_values(by=colName).reset_index(drop=True)
            tempDf[colName] = [str(rank) for rank in range(len(tempDf.index), 0, -1)]

        seasonFrames.append(tempDf)

    # concatenate once instead of growing a frame inside the loop
    df = pd.concat(seasonFrames)
    df['year'] = season
    return df

In [50]:
# specific stats desired
poStats = ['Pass Yds','Yds/Att','Cmp %','TD','INT']
roStats = ['Rush Yds','YPC','TD','Rush FUM']
pdStats = ['Yds','Yds/Att','Cmp %','TD','INT']
rdStats = ['Rush Yds','YPC','TD','Rush FUM']

# function calls to get team stats
# DataFrame.append was removed in pandas 2.0, so the per-season frames are
# collected in lists and concatenated once after the loop
poFrames, roFrames, pdFrames, rdFrames = [], [], [], []
for i in years:
    poFrames.append(getTeamStats(baseUrl,"offense","passing",str(i),poStats))
    roFrames.append(getTeamStats(baseUrl,"offense","rushing",str(i),roStats))
    pdFrames.append(getTeamStats(baseUrl,"defense","passing",str(i),pdStats))
    rdFrames.append(getTeamStats(baseUrl,"defense","rushing",str(i),rdStats))

teamPODf = pd.concat(poFrames)
teamRODf = pd.concat(roFrames)
teamPDDf = pd.concat(pdFrames)
teamRDDf = pd.concat(rdFrames)
print('done')

url: https://www.nfl.com/stats/team-stats/offense/passing/2015/pre/all
url: https://www.nfl.com/stats/team-stats/offense/passing/2015/reg/all
url: https://www.nfl.com/stats/team-stats/offense/passing/2015/post/all
url: https://www.nfl.com/stats/team-stats/offense/rushing/2015/pre/all
url: https://www.nfl.com/stats/team-stats/offense/rushing/2015/reg/all
url: https://www.nfl.com/stats/team-stats/offense/rushing/2015/post/all
url: https://www.nfl.com/stats/team-stats/defense/passing/2015/pre/all
url: https://www.nfl.com/stats/team-stats/defense/passing/2015/reg/all
url: https://www.nfl.com/stats/team-stats/defense/passing/2015/post/all
url: https://www.nfl.com/stats/team-stats/defense/rushing/2015/pre/all
url: https://www.nfl.com/stats/team-stats/defense/rushing/2015/reg/all
url: https://www.nfl.com/stats/team-stats/defense/rushing/2015/post/all
url: https://www.nfl.com/stats/team-stats/offense/passing/2016/pre/all
url: https://www.nfl.com/stats/team-stats/offense/passing/2016/reg/all
ur

In [51]:
# write each team-stat frame to its own file (teamStat0 .. teamStat3) as a
# JSON array of row records, ready to be copied to the server
dfs = [teamPODf, teamRODf, teamPDDf, teamRDDf]
for i, frame in enumerate(dfs):
    with open("teamStat"+str(i), "w") as outfile:
        records = frame.to_json(orient='records')
        outfile.write(records)

In [None]:
# # function for getting NFL schedule from nfl.com
# def getMatchups(url,payload):
#     for key in payload:
#         if payload[key]:
#             url += payload[key]
#             url += '/'

#     soup = BeautifulSoup(response.text,'html.parser')
#     matchups = []
#     columnHeader = ['matchupId', 'team1', 'team2', 'team1Score','team2Score','weekType','weekNum']
#     season = soup.find_all('select', {"class":"d3-o-dropdown"})[1].find_all("option")
#     seasonWeeks = []
#     for week in season:
#         seasonWeeks.append(week.get("value"))
#     for week in seasonWeeks:
#         time.sleep(2)
#         url = baseUrl + week
#         response = requests.get(url)
#         print(url)
#         soup = BeautifulSoup(response.text,'html.parser')
#         weekName = week.rsplit('/', 2)[-2]
#         weekType = re.findall("[a-zA-Z]+", weekName)[0]
#         weekNum = re.findall(r'\d+', weekName)[0]
#         games = soup.find_all('a', {"class": "nfl-c-matchup-strip__game"})
#         index = 0
#         for game in games:
#             gameSoup = game.find_all('span', {"class":"nfl-c-matchup-strip__team-abbreviation"})
#             matchup = []
#             matchup.append(str(index))
#             for team in gameSoup:
#                 matchup.append(re.sub(r'[^A-Za-z]', '', team.get_text()))
#             scoreSoup = game.find_all('div', {'class':"nfl-c-matchup-strip__team-score"})
#             if scoreSoup:
#                 for team in scoreSoup:
#                     matchup.append(team.get_text())
#             else:
#                 matchup.append("")
#                 matchup.append("")
#             matchup.append(weekType)
#             matchup.append(weekNum)
#             matchups.append(matchup)
#             index += 1
#     matchupsDf = pd.DataFrame(matchups,columns=columnHeader)
#     return matchupsDf


# function for getting NFL schedule from espn.com
def getMatchups(baseUrl,payload):
    """Scrape every week of one season's schedule into a DataFrame.

    Parameters
    ----------
    baseUrl : str
        Site root; the non-empty ``payload`` values are appended to it (each
        followed by '/') to form the season page URL, and each week link
        found on that page is appended to it as well.
    payload : dict
        Path segments of the season's schedule page.

    Returns
    -------
    pandas.DataFrame
        One row per game with columns 'matchupId' (position of the game
        within its week, as a string), 'team1', 'team2', 'team1Score',
        'team2Score', 'weekType' ('PRE', 'REG' or 'POST'), 'weekNum' and
        'year'. Games without a result (live, scheduled, canceled, TBD or
        postponed) have '-' for both scores. For games with a result, team2
        is the team listed in the row's second cell.

    Raises
    ------
    KeyError
        If a post-season week label is not one of the four names in
        ``postType``.
    """
    # (connect, read) timeout in seconds, applied to every request
    requestTimeout = (30, 27)
    columnHeader = ['matchupId', 'team1', 'team2', 'team1Score','team2Score','weekType','weekNum','year']
    postType = {'Wild Card':'1','Divisional Round':'2','Conference Championship':'3','Super Bowl':'4'}
    dropdownButton = {"class":"button-filter med dropdown-toggle"}
    notPlayedStatuses = ('Canceled', 'TBD', 'Postponed')

    # get url to get array of weeks
    url = baseUrl
    for key in payload:
        if payload[key]:
            url += payload[key]
            url += '/'
    response = requests.get(url, timeout=requestTimeout)
    soup = BeautifulSoup(response.text,'html.parser')
    season = soup.find_all('ul', {"class":"dropdown-menu med"})[1].find_all('li')

    # seasonWeeks is the array of week links taken from the second dropdown
    seasonWeeks = [week.find('a').get('href') for week in season]

    matchups = []
    # iterate through weeks
    for week in seasonWeeks:
        time.sleep(0.5)

        # load the matchups for the week
        response = requests.get(baseUrl + week, timeout=requestTimeout)
        soup = BeautifulSoup(response.text,'html.parser')

        # figure out the week type: links containing 'seasontype' are
        # pre-season when they end in '1' and post-season otherwise
        if 'seasontype' in week:
            weekType = 'PRE' if week.rsplit('/',2)[-1] == '1' else 'POST'
        else:
            weekType = 'REG'

        # the second and third filter buttons show the year and the week label
        buttons = soup.find_all('button', dropdownButton)
        year = buttons[1].text
        weekNum = buttons[2].text
        if weekNum == 'Pro Bowl': continue
        if weekType == 'POST':
            weekNum = postType[weekNum]
        else:
            # first number in the label, or '0' when the label has none
            numbers = re.findall(r'\d+',weekNum)
            weekNum = numbers[0] if numbers else '0'

        # get list of all matchups for the week
        games = []
        for wrap in soup.find_all('div',{'class':'responsive-table-wrap'}):
            bodies = wrap.find_all('tbody')
            if bodies:
                games.extend(bodies[0].find_all('tr'))
        index = 0

        # iterate through every game
        for game in games:
            cells = game.find_all('td')
            # not a game row
            if len(cells) < 5:
                continue
            matchup = [str(index)]

            # check if game hasn't happened
            live = bool(game.find_all('td',{'class':'live'}))
            timed = bool(game.find_all('td',{'data-behavior':'date_time'}))
            status = cells[2].text

            if live or timed or status in notPlayedStatuses:
                team1 = cells[0].find_all('abbr')[0].text
                team2 = cells[1].find_all('abbr')[0].text
                matchup.extend([team1,team2])
                matchup.extend(['-','-'])
            else:
                home = cells[1].find_all('abbr')[0].text
                # result tokens are read as team, score, team, score
                text = re.findall(r'\w+',cells[2].find_all('a')[0].text)
                if home == text[0]:
                    # second-cell team is named first in the result: swap so it becomes team2
                    text = [text[2],text[0],text[3],text[1]]
                else:
                    text = [text[0],text[2],text[1],text[3]]
                matchup.extend(text)
            matchup.extend([weekType,weekNum,year])
            index += 1
            matchups.append(matchup)
    matchupsDf = pd.DataFrame(matchups,columns=columnHeader)
    return matchupsDf

In [54]:
# function call for getting 2015-2020 seasons' schedules
espnUrl = 'https://www.espn.com'
schedulePayload = {"sport":"/nfl",
                   "type":"schedule",
                  "underscore":"_",
                  "year":"year",
                  "yearNum":"2015"}
# DataFrame.append was removed in pandas 2.0, so the per-season frames are
# collected in a list and concatenated once after the loop
scheduleFrames = []
for i in years:
    schedulePayload['yearNum']=str(i)
    scheduleFrames.append(getMatchups(espnUrl,schedulePayload))
    print(str(i))
df2 = pd.concat(scheduleFrames)
df2

2015
2016
2017
2018
2019
2020


Unnamed: 0,matchupId,team1,team2,team1Score,team2Score,weekType,weekNum,year
0,0,PIT,MIN,3,14,PRE,0,2015
1,0,NYJ,DET,3,23,PRE,1,2015
2,1,GB,NE,22,11,PRE,1,2015
3,2,NO,BAL,27,30,PRE,1,2015
4,3,MIA,CHI,10,27,PRE,1,2015
...,...,...,...,...,...,...,...,...
329,2,TBD,TBD,-,-,POST,2,2020
330,3,TBD,TBD,-,-,POST,2,2020
331,0,TBD,TBD,-,-,POST,3,2020
332,1,TBD,TBD,-,-,POST,3,2020


In [55]:
# serialize the schedule frame as a JSON array of row records and save it
matchupsJson = df2.to_json(orient='records')

with open("matchups.json", mode="w") as outfile:
    outfile.write(matchupsJson)

In [1]:
# maps each season-type code to the display name used in the week labels
weekTypeD = dict(
    [
        ("PRE", "Pre-Season"),
        ("REG", "Regular Season"),
        ("POST", "Post-Season"),
    ]
)

# function to get the different weeks of a season's schedule
def getWeeks(url,payload):
    """List the weeks offered by the week dropdown of a schedule page.

    Parameters
    ----------
    url : str
        Base URL; every non-empty value in ``payload`` is appended to it,
        each followed by a '/'.
    payload : dict
        Path segments of the schedule page to request.

    Returns
    -------
    pandas.DataFrame
        Columns 'weekType' (letters of the week name), 'weekNum' (first
        number in the week name), 'weekTypeF' (display name from
        ``weekTypeD``) and 'weekF' ('<display name> Week <number>'). Only
        weeks whose type is a key of ``weekTypeD`` are included.
    """
    for key in payload:
        if payload[key]:
            url += payload[key]
            url += '/'
    # timeout = (connect, read) seconds, so a stalled connection cannot hang the cell
    response = requests.get(url, timeout=(30, 27))
    soup = BeautifulSoup(response.text,'html.parser')
    columnHeader = ['weekType','weekNum','weekTypeF','weekF']
    matchups = []
    # the second dropdown on the page holds one option per week
    season = soup.find_all('select', {"class":"d3-o-dropdown"})[1].find_all("option")

    for option in season:
        week = option.get("value")
        if not week:
            continue  # option without a link value
        parts = week.rsplit('/', 2)
        if len(parts) < 2:
            continue  # value is not a path, so there is no week name to read
        # the week name is the second-to-last of the '/'-separated pieces
        weekName = parts[-2]
        letters = re.findall("[a-zA-Z]+", weekName)
        digits = re.findall(r'\d+', weekName)
        if not letters or not digits:
            continue  # not of the '<type><number>' form; cannot be labelled
        weekType = letters[0]
        weekNum = digits[0]
        if weekType in weekTypeD:
            weekTypeF = weekTypeD[weekType]
            weekF = weekTypeF + " Week " + weekNum
            matchups.append([weekType, weekNum, weekTypeF, weekF])
    weeksDf = pd.DataFrame(matchups,columns=columnHeader)
    return weeksDf

In [14]:
# function call to get weeks and send to json file
schedulePayload = {"type":"/schedules",
                  "season":"2015",
                  "seasonType":"REG"}

# DataFrame.append was removed in pandas 2.0, so the per-season frames are
# collected in a list and concatenated once after the loop
weekFrames = []
for i in years:
    schedulePayload['season']=str(i)
    tempDf = getWeeks(baseUrl,schedulePayload)
    tempDf['year'] = str(i)
    weekFrames.append(tempDf)
df3 = pd.concat(weekFrames)

weeksJson = df3.to_json(orient='records')

with open("weeks.json","w") as outfile:
    outfile.write(weeksJson)