In [None]:
# Build the list of seasons (years) to scrape and set up the tools needed for web scraping

In [71]:
# Seasons to scrape: every year from 2002 through 2022 (the range end is exclusive).
years = [season for season in range(2002, 2023)]

In [72]:
from selenium import webdriver

In [73]:
import io
import os
import time

import requests

In [None]:
# initializing driver - using MacOS, so I used Safari

In [75]:
# Open a Safari WebDriver session; the scraping loops below drive the browser through `driver`.
driver = webdriver.Safari()

In [None]:
# automating Safari to scrape the data from Pro Football Reference (pro-football-reference.com)
# also working around JavaScript-loaded content by letting the browser load the whole page

In [76]:
url_games = "https://www.pro-football-reference.com/years/{}/games.htm"

# open() does not create missing folders, so make sure the output directory exists.
os.makedirs("games", exist_ok=True)

# Load the games page of every season in `years` in the browser and save the
# resulting HTML to games/<year>.html so it can be parsed offline later.
for year in years:
    url = url_games.format(year)

    driver.get(url)
    driver.execute_script("window.scrollTo(1,100)")
    # Pause before reading the DOM -- presumably to let the page finish rendering
    # (it also spaces out successive requests).
    time.sleep(2)

    html = driver.page_source
    # Plain write mode is enough (nothing is read back); the explicit encoding keeps
    # the saved file independent of the platform's default locale.
    with open("games/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(html)

In [77]:
from bs4 import BeautifulSoup

In [78]:
import pandas as pd

In [None]:
# transforming .html into usable dataframes
# parsing for usable data -- and deleting unwanted rows

In [79]:
# Parse each saved season page into a DataFrame (one per year) and collect them in `dfs`.
dfs = []
for year in years:
    with open("games/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # Remove <tr class="thead"> rows (presumably header rows repeated inside the
    # table body) so they are not parsed as data.
    for thead_row in soup.find_all('tr', class_="thead"):
        thead_row.decompose()

    games_table = soup.find(id="games")
    if games_table is None:
        # Fail with a clear message instead of an opaque parser error on the string "None".
        raise ValueError("no element with id 'games' in games/{}.html".format(year))

    # Wrap the markup in StringIO: passing literal HTML text to read_html is
    # deprecated in recent pandas releases.
    games = pd.read_html(io.StringIO(str(games_table)))[0]
    games["Year"] = year
    dfs.append(games)

In [80]:
# Stack the per-season frames into one table. ignore_index=True gives the result a
# single continuous 0..n-1 index instead of repeating every season's own row labels.
games = pd.concat(dfs, ignore_index=True)

In [None]:
# saved into library as a .csv

In [81]:
# Write the combined games table to games.csv in the working directory.
games.to_csv(path_or_buf="games.csv")

In [None]:
# repeat steps for more detailed stats on NFL franchises

In [82]:
url_teamoff = "https://www.pro-football-reference.com/years/{}/#team_stats"

# open() does not create missing folders, so make sure the output directory exists.
os.makedirs("statistics", exist_ok=True)

# Load the season summary page of every year in `years` in the browser and save the
# resulting HTML to statistics/<year>.html so it can be parsed offline later.
for year in years:
    url = url_teamoff.format(year)

    driver.get(url)
    driver.execute_script("window.scrollTo(1,100)")
    # Pause before reading the DOM -- presumably to let the page finish rendering
    # (it also spaces out successive requests).
    time.sleep(2)

    html = driver.page_source
    # Plain write mode is enough (nothing is read back); the explicit encoding keeps
    # the saved file independent of the platform's default locale.
    with open("statistics/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(html)

In [102]:
# Parse the passing table out of each saved season page and collect one DataFrame
# per year in `dfs`.
dfs = []
for year in years:
    with open("statistics/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # Strip non-data rows (matched by their class attribute, in this order) so that
    # they do not end up as rows of the parsed table.
    for row_class in ("thead", "average_table no_ranker", "league_total", "average_line"):
        for row in soup.find_all('tr', class_=row_class):
            row.decompose()

    passing_table = soup.find(id="div_passing")
    if passing_table is None:
        # Fail with a clear message instead of an opaque parser error on the string "None".
        raise ValueError("no element with id 'div_passing' in statistics/{}.html".format(year))

    # Wrap the markup in StringIO: passing literal HTML text to read_html is
    # deprecated in recent pandas releases.
    passing = pd.read_html(io.StringIO(str(passing_table)))[0]
    passing["Year"] = year
    dfs.append(passing)

In [103]:
# Stack the per-season frames into one table. ignore_index=True gives the result a
# single continuous 0..n-1 index instead of repeating every season's own row labels.
passing = pd.concat(dfs, ignore_index=True)

In [104]:
# Write the combined passing table to passing.csv in the working directory.
passing.to_csv(path_or_buf="passing.csv")

In [105]:
# Parse the rushing table out of each saved season page and collect one DataFrame
# per year in `dfs`.
dfs = []
for year in years:
    with open("statistics/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # Strip non-data rows (matched by their class attribute, in this order) so that
    # they do not end up as rows of the parsed table.
    for row_class in ("thead", "average_table no_ranker", "league_total", "average_line"):
        for row in soup.find_all('tr', class_=row_class):
            row.decompose()

    rushing_table = soup.find(id="div_rushing")
    if rushing_table is None:
        # Fail with a clear message instead of an opaque parser error on the string "None".
        raise ValueError("no element with id 'div_rushing' in statistics/{}.html".format(year))

    # Wrap the markup in StringIO: passing literal HTML text to read_html is
    # deprecated in recent pandas releases.
    rushing = pd.read_html(io.StringIO(str(rushing_table)))[0]
    rushing["Year"] = year
    dfs.append(rushing)

In [106]:
# Stack the per-season frames into one table. ignore_index=True gives the result a
# single continuous 0..n-1 index instead of repeating every season's own row labels.
rushing = pd.concat(dfs, ignore_index=True)

In [107]:
# Write the combined rushing table to rushing.csv in the working directory.
rushing.to_csv(path_or_buf="rushing.csv")

In [108]:
# Parse the team scoring table out of each saved season page and collect one
# DataFrame per year in `dfs`.
dfs = []
for year in years:
    with open("statistics/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # Strip non-data rows (matched by their class attribute, in this order) so that
    # they do not end up as rows of the parsed table.
    for row_class in ("thead", "average_table no_ranker", "league_total", "average_line"):
        for row in soup.find_all('tr', class_=row_class):
            row.decompose()

    scoring_table = soup.find(id="div_team_scoring")
    if scoring_table is None:
        # Fail with a clear message instead of an opaque parser error on the string "None".
        raise ValueError("no element with id 'div_team_scoring' in statistics/{}.html".format(year))

    # Wrap the markup in StringIO: passing literal HTML text to read_html is
    # deprecated in recent pandas releases.
    scoring = pd.read_html(io.StringIO(str(scoring_table)))[0]
    scoring["Year"] = year
    dfs.append(scoring)

In [109]:
# Stack the per-season frames into one table. ignore_index=True gives the result a
# single continuous 0..n-1 index instead of repeating every season's own row labels.
scoring = pd.concat(dfs, ignore_index=True)

In [110]:
# Write the combined scoring table to scoring.csv in the working directory.
scoring.to_csv(path_or_buf="scoring.csv")

In [115]:
# Parse the team conversions table out of each saved season page and collect one
# DataFrame per year in `dfs`.
dfs = []
for year in years:
    with open("statistics/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # Strip non-data rows (matched by their class attribute, in this order) so that
    # they do not end up as rows of the parsed table. "over_header" is removed as
    # well here -- presumably the grouped header row above the column names.
    for row_class in ("thead", "average_table no_ranker", "league_total",
                      "average_line", "over_header"):
        for row in soup.find_all('tr', class_=row_class):
            row.decompose()

    conversions_table = soup.find(id="div_team_conversions")
    if conversions_table is None:
        # Fail with a clear message instead of an opaque parser error on the string "None".
        raise ValueError("no element with id 'div_team_conversions' in statistics/{}.html".format(year))

    # Wrap the markup in StringIO: passing literal HTML text to read_html is
    # deprecated in recent pandas releases.
    conversions = pd.read_html(io.StringIO(str(conversions_table)))[0]
    conversions["Year"] = year
    dfs.append(conversions)

In [116]:
# Stack the per-season frames into one table. ignore_index=True gives the result a
# single continuous 0..n-1 index instead of repeating every season's own row labels.
conversions = pd.concat(dfs, ignore_index=True)

In [117]:
# Write the combined conversions table to conversions.csv in the working directory.
conversions.to_csv(path_or_buf="conversions.csv")

In [118]:
# Parse the drives table out of each saved season page and collect one DataFrame
# per year in `dfs`.
dfs = []
for year in years:
    with open("statistics/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # Strip non-data rows (matched by their class attribute, in this order) so that
    # they do not end up as rows of the parsed table. "over_header" is removed as
    # well here -- presumably the grouped header row above the column names.
    for row_class in ("thead", "average_table no_ranker", "league_total",
                      "average_line", "over_header"):
        for row in soup.find_all('tr', class_=row_class):
            row.decompose()

    drives_table = soup.find(id="div_drives")
    if drives_table is None:
        # Fail with a clear message instead of an opaque parser error on the string "None".
        raise ValueError("no element with id 'div_drives' in statistics/{}.html".format(year))

    # Wrap the markup in StringIO: passing literal HTML text to read_html is
    # deprecated in recent pandas releases.
    drives = pd.read_html(io.StringIO(str(drives_table)))[0]
    drives["Year"] = year
    dfs.append(drives)

In [119]:
# Stack the per-season frames into one table. ignore_index=True gives the result a
# single continuous 0..n-1 index instead of repeating every season's own row labels.
drives = pd.concat(dfs, ignore_index=True)

In [120]:
# Write the combined drives table to drives.csv in the working directory.
drives.to_csv(path_or_buf="drives.csv")

In [121]:
# Parse the AFC and NFC tables out of each saved season page. `dfs` receives two
# DataFrames per year, AFC first and then NFC.
dfs = []
for year in years:
    with open("statistics/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    # Parse and clean the page once; both conference tables are taken from this
    # same tree, so re-parsing the file for the second table is unnecessary.
    soup = BeautifulSoup(page, "html.parser")

    # Strip non-data rows (matched by their class attribute) before parsing.
    for row_class in ("thead onecell", "over_header"):
        for row in soup.find_all('tr', class_=row_class):
            row.decompose()

    for conference_id in ("all_AFC", "all_NFC"):
        team_table = soup.find(id=conference_id)
        if team_table is None:
            # Fail with a clear message instead of an opaque parser error on the string "None".
            raise ValueError(
                "no element with id '{}' in statistics/{}.html".format(conference_id, year)
            )

        # Wrap the markup in StringIO: passing literal HTML text to read_html is
        # deprecated in recent pandas releases.
        team = pd.read_html(io.StringIO(str(team_table)))[0]
        team["Year"] = year
        dfs.append(team)

In [122]:
# Stack the per-season, per-conference frames into one table. ignore_index=True gives
# the result a single continuous 0..n-1 index instead of repeating each frame's own row labels.
team = pd.concat(dfs, ignore_index=True)

In [125]:
# Write the combined team table to team.csv in the working directory.
team.to_csv(path_or_buf="team.csv")