In [4]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import json

In [207]:
# root address of the site being scraped; request URLs are built by appending to it
baseUrl = "https://www.nfl.com"

In [245]:
# function for getting stats off nfl.com/stats/
def getStats(url, payload):
    """Scrape a (possibly paginated) stats table into a DataFrame sorted by team.

    Parameters
    ----------
    url : str
        Address the request URL is built on. Every truthy value of ``payload``
        is appended to it, in iteration order, each followed by ``'/'``.
    payload : dict
        URL path segments (values must be strings). Falsy values are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` of the first ``<table>`` on each page that holds
        ``<td>`` cells. Columns are named after the ``<th>`` cells of the first
        page. Cell text is whitespace-stripped and kept as strings; empty cells
        are kept as ``''`` so values stay under their own header. The ``Team``
        column is reduced to its first line and the frame is sorted by it.

    Raises
    ------
    requests.HTTPError
        If any page answers with an error status.
    ValueError
        If a page contains no ``<table>`` element.
    KeyError
        If the scraped table has no ``Team`` column.
    """
    # local import: only urllib.request is imported at file level
    from urllib.parse import urljoin

    for key in payload:
        if payload[key]:
            url += payload[key]
            url += '/'

    tableRows = []
    columnHeader = None

    # walk the pages by following the "next" pagination link until there is none
    while url:
        time.sleep(2)  # pause before every request to stay polite to the server
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        print("url:", response.url)
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')
        if table is None:
            raise ValueError("no <table> element found at " + response.url)

        # column names are read once, from the first page (no extra request needed)
        if columnHeader is None:
            columnHeader = [th.text.strip('\n') for th in table.find_all('th')]

        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if cells:
                # keep empty cells: dropping them would shift the remaining
                # values to the left, under the wrong column names
                tableRows.append([cell.text.strip() for cell in cells])

        nextLink = soup.find("a", {"class": "nfl-o-table-pagination__next"})
        href = nextLink.get('href') if nextLink else None
        # resolve the link against the page just fetched (handles both
        # site-relative and absolute hrefs)
        url = urljoin(response.url, href) if href else None

    resultsDf = pd.DataFrame(tableRows, columns=columnHeader)
    # the team cell can span several lines; keep only the first one
    resultsDf['Team'] = resultsDf['Team'].str.split('\n').str[0]
    resultsDf = resultsDf.sort_values(by=['Team']).reset_index(drop=True)
    return resultsDf

In [274]:
# function for getting team stats, calls getStats()
def getTeamStats(url,side,categ,season,cols):
    """Collect one category of team stats for every season type and rank the teams.

    For each of the season types ``PRE``, ``REG`` and ``POST`` the matching
    table is fetched through ``getStats`` and reduced to ``Team``,
    ``weekType`` and the requested stat columns. For every requested stat an
    extra column named ``<stat>(o)`` is added holding the team's rank for that
    stat as a string: ``'1'`` is the numerically largest value and the highest
    number is the smallest value.

    Parameters
    ----------
    url : str
        Base address handed to ``getStats``.
    side : str
        URL segment for the side of the ball (the notebook passes
        ``"offense"`` or ``"defense"``).
    categ : str
        URL segment for the statistic category (the notebook passes
        ``"passing"`` or ``"rushing"``).
    season : str
        URL segment for the season, e.g. ``"2019"``.
    cols : list of str
        Names of the stat columns to keep and rank.

    Returns
    -------
    pandas.DataFrame
        The three per-season-type frames concatenated in PRE, REG, POST
        order. Each part is ordered ascending by the last stat in ``cols`` and
        keeps its own 0-based index.
    """
    allCols = ['Team','weekType']+cols
    seasonTypes = ["PRE","REG","POST"]
    frames = []
    for st in seasonTypes:
        teamPayload = {"stats":"/stats","subject":"team-stats",
               "category":side,
               "statisticCategory":categ,
               "season":season,
               "seasonType":st,
               "list":"all"}
        # honour the caller-supplied url instead of the module-level baseUrl
        tempDf = getStats(url,teamPayload)
        tempDf['weekType']=st
        # copy so the rank columns are added to an independent frame,
        # not to a slice of the scraped one
        tempDf = tempDf[allCols].copy()

        # change raw stats to ordering
        for stat in cols:
            rankCol = stat + '(o)'
            # scraped values are strings: order by their numeric value, because
            # sorting the strings themselves would put "1000" before "999"
            numeric = pd.to_numeric(tempDf[stat])
            order = numeric.sort_values(kind='mergesort').index
            tempDf = tempDf.loc[order].reset_index(drop=True)
            # rows are now ascending, so the first row gets the highest rank number
            tempDf[rankCol] = [str(rank) for rank in range(len(tempDf), 0, -1)]

        frames.append(tempDf)
    # a single concat at the end instead of growing the frame inside the loop
    return pd.concat(frames)

In [275]:
# stat columns to keep for each table
# (po/ro = passing/rushing offense, pd/rd = passing/rushing defense)
poStats = ['Pass Yds', 'Yds/Att', 'Cmp %', 'TD', 'INT']
roStats = ['Rush Yds', 'YPC', 'TD', 'Rush FUM']
pdStats = ['Yds', 'Yds/Att', 'Cmp %', 'TD', 'INT']
rdStats = ['Rush Yds', 'YPC', 'TD', 'Rush FUM']

# scrape the four 2019 team tables, in the same order as before:
# offense passing, offense rushing, defense passing, defense rushing
teamPODf, teamRODf, teamPDDf, teamRDDf = [
    getTeamStats(baseUrl, side, categ, "2019", wanted)
    for side, categ, wanted in (
        ("offense", "passing", poStats),
        ("offense", "rushing", roStats),
        ("defense", "passing", pdStats),
        ("defense", "rushing", rdStats),
    )
]
print('done')

url: https://www.nfl.com/stats/team-stats/offense/passing/2019/pre/all
url: https://www.nfl.com/stats/team-stats/offense/passing/2019/reg/all
url: https://www.nfl.com/stats/team-stats/offense/passing/2019/post/all
url: https://www.nfl.com/stats/team-stats/offense/rushing/2019/pre/all
url: https://www.nfl.com/stats/team-stats/offense/rushing/2019/reg/all
url: https://www.nfl.com/stats/team-stats/offense/rushing/2019/post/all
url: https://www.nfl.com/stats/team-stats/defense/passing/2019/pre/all
url: https://www.nfl.com/stats/team-stats/defense/passing/2019/reg/all
url: https://www.nfl.com/stats/team-stats/defense/passing/2019/post/all
url: https://www.nfl.com/stats/team-stats/defense/rushing/2019/pre/all
url: https://www.nfl.com/stats/team-stats/defense/rushing/2019/reg/all
url: https://www.nfl.com/stats/team-stats/defense/rushing/2019/post/all
done


In [285]:
# dump each team-stats frame as a JSON array of records for copying to the server;
# files are named teamStat0 .. teamStat3, following the order of the list below
dfs = [teamPODf, teamRODf, teamPDDf, teamRDDf]
for position, frame in enumerate(dfs):
    fileName = "teamStat" + str(position)
    with open(fileName, "w") as outfile:
        outfile.write(frame.to_json(orient='records'))

In [1]:
# function for getting NFL schedule
def getMatchups(url,payload):
    """Scrape every week of a season's schedule into one DataFrame of games.

    The page built from ``url`` and ``payload`` is fetched first to read the
    week links from its second ``d3-o-dropdown`` ``<select>``. Each linked
    week page is then fetched (two seconds apart) and its games collected.

    Parameters
    ----------
    url : str
        Address the first request URL is built on. Every truthy value of
        ``payload`` is appended to it, each followed by ``'/'``.
    payload : dict
        URL path segments (values must be strings). Falsy values are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``matchupId``, ``team1``, ``team2``, ``team1Score``,
        ``team2Score``, ``weekType``, ``weekNum``, all strings.
        ``matchupId`` counts games from ``'0'`` within each week. Team and
        score fields that are absent on the page are ``''``.

    Raises
    ------
    requests.HTTPError
        If any page answers with an error status.
    ValueError
        If the first page has fewer than two ``d3-o-dropdown`` selects.
    """
    for key in payload:
        if payload[key]:
            url += payload[key]
            url += '/'
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text,'html.parser')

    columnHeader = ['matchupId', 'team1', 'team2', 'team1Score','team2Score','weekType','weekNum']
    dropdowns = soup.find_all('select', {"class":"d3-o-dropdown"})
    if len(dropdowns) < 2:
        raise ValueError("week dropdown not found at " + response.url)
    # options without a value cannot be turned into a week URL
    seasonWeeks = [option.get("value") for option in dropdowns[1].find_all("option")]
    seasonWeeks = [week for week in seasonWeeks if week]

    matchups = []
    for week in seasonWeeks:
        # last path segment, e.g. "REG12"; works with or without a trailing slash
        weekName = week.rstrip('/').rsplit('/', 1)[-1]
        typeMatch = re.search("[a-zA-Z]+", weekName)
        numMatch = re.search(r'\d+', weekName)
        if not (typeMatch and numMatch):
            # not a "<letters><digits>" week entry: skip it rather than crash
            continue
        weekType = typeMatch.group()
        weekNum = numMatch.group()

        time.sleep(2)  # pause between requests to stay polite to the server
        url = baseUrl + week
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        print(url)
        soup = BeautifulSoup(response.text,'html.parser')

        games = soup.find_all('a', {"class": "nfl-c-matchup-strip__game"})
        for index, game in enumerate(games):
            teamSoup = game.find_all('span', {"class":"nfl-c-matchup-strip__team-abbreviation"})
            teams = [re.sub(r'[^A-Za-z]', '', team.get_text()) for team in teamSoup]
            scoreSoup = game.find_all('div', {'class':"nfl-c-matchup-strip__team-score"})
            scores = [score.get_text().strip() for score in scoreSoup]
            # pad/trim to exactly two teams and two scores so every row has the
            # seven fields the DataFrame expects (games without scores get "")
            teams = (teams + ["", ""])[:2]
            scores = (scores + ["", ""])[:2]
            matchups.append([str(index)] + teams + scores + [weekType, weekNum])
    matchupsDf = pd.DataFrame(matchups,columns=columnHeader)
    return matchupsDf

In [7]:
# scrape the 2019 schedule; the payload values become the URL path
# segments, appended in this order
schedulePayload = dict(
    type="/schedules",
    season="2019",
    seasonType="REG",
)
df2 = getMatchups(baseUrl, schedulePayload)
print('done')

https://www.nfl.com/schedules/2019/PRE0/
https://www.nfl.com/schedules/2019/PRE1/
https://www.nfl.com/schedules/2019/PRE2/
https://www.nfl.com/schedules/2019/PRE3/
https://www.nfl.com/schedules/2019/PRE4/
https://www.nfl.com/schedules/2019/REG1/
https://www.nfl.com/schedules/2019/REG2/
https://www.nfl.com/schedules/2019/REG3/
https://www.nfl.com/schedules/2019/REG4/
https://www.nfl.com/schedules/2019/REG5/
https://www.nfl.com/schedules/2019/REG6/
https://www.nfl.com/schedules/2019/REG7/
https://www.nfl.com/schedules/2019/REG8/
https://www.nfl.com/schedules/2019/REG9/
https://www.nfl.com/schedules/2019/REG10/
https://www.nfl.com/schedules/2019/REG11/
https://www.nfl.com/schedules/2019/REG12/
https://www.nfl.com/schedules/2019/REG13/
https://www.nfl.com/schedules/2019/REG14/
https://www.nfl.com/schedules/2019/REG15/
https://www.nfl.com/schedules/2019/REG16/
https://www.nfl.com/schedules/2019/REG17/
https://www.nfl.com/schedules/2019/POST1/
https://www.nfl.com/schedules/2019/POST2/
https:

In [11]:
# serialise the schedule frame as a JSON array of records and save it to disk
matchupsJson = df2.to_json(orient='records')

with open("matchups.json", mode="w") as jsonFile:
    jsonFile.write(matchupsJson)

In [65]:
# maps the season-section code found in schedule URLs to its display name
weekTypeD = dict(
    PRE="Pre-Season",
    REG="Regular Season",
    POST="Post-Season",
)

# function to get the different weeks of a season's schedule
def getWeeks(url,payload):
    """List the weeks offered by a schedule page's week dropdown.

    The page built from ``url`` and ``payload`` is fetched once. The option
    values of its second ``d3-o-dropdown`` ``<select>`` are parsed into a
    season-section code and a week number. Only sections present in
    ``weekTypeD`` are kept.

    Parameters
    ----------
    url : str
        Address the request URL is built on. Every truthy value of
        ``payload`` is appended to it, each followed by ``'/'``.
    payload : dict
        URL path segments (values must be strings). Falsy values are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``weekType`` (e.g. ``'REG'``), ``weekNum`` (string),
        ``weekTypeF`` (display name from ``weekTypeD``) and ``weekF``
        (``'<weekTypeF> Week <weekNum>'``), in dropdown order.

    Raises
    ------
    requests.HTTPError
        If the page answers with an error status.
    ValueError
        If the page has fewer than two ``d3-o-dropdown`` selects.
    """
    for key in payload:
        if payload[key]:
            url += payload[key]
            url += '/'
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text,'html.parser')
    columnHeader = ['weekType','weekNum','weekTypeF','weekF']

    dropdowns = soup.find_all('select', {"class":"d3-o-dropdown"})
    if len(dropdowns) < 2:
        raise ValueError("week dropdown not found at " + response.url)

    weeks = []
    for option in dropdowns[1].find_all("option"):
        week = option.get("value")
        if not week:
            # option without a value: nothing to parse
            continue
        # last path segment, e.g. "REG12"; works with or without a trailing slash
        weekName = week.rstrip('/').rsplit('/', 1)[-1]
        typeMatch = re.search("[a-zA-Z]+", weekName)
        numMatch = re.search(r'\d+', weekName)
        if not (typeMatch and numMatch):
            # not a "<letters><digits>" week entry
            continue
        weekType = typeMatch.group()
        weekNum = numMatch.group()
        weekTypeF = weekTypeD.get(weekType)
        if weekTypeF is None:
            # section code with no display name: not listed
            continue
        weeks.append([weekType, weekNum, weekTypeF, weekTypeF + " Week " + weekNum])
    weeksDf = pd.DataFrame(weeks,columns=columnHeader)
    return weeksDf

In [66]:
# list the schedule weeks and save them as a JSON array of records
# NOTE(review): the season here is "2018" while the schedule cell uses "2019" - confirm this is intended
schedulePayload = dict(
    type="/schedules",
    season="2018",
    seasonType="REG",
)
df3 = getWeeks(baseUrl, schedulePayload)
weeksJson = df3.to_json(orient='records')

with open("weeks.json", mode="w") as jsonFile:
    jsonFile.write(weeksJson)