# Steelers Games Summary Data Scrape

[pro-football reference](https://www.pro-football-reference.com/) Pro-football reference includes NFL data, dating back to 1967. This data includes player statistics, all-time leaders, draft history, coaches, and much more. Statistics are updated by every week, no later than Tuesday at 6pm. Additional data can be found behind a paid subscription.

*this overview comes from [Ohio State's Sports and Society Initiative](https://sportsandsociety.osu.edu/sports-data-sets)*

In [1]:
# packages
import pandas as pd
import warnings

# scraping
import requests
from bs4 import BeautifulSoup
import re
import time
import lxml # used for parsing html

# bigquery
import os
from dotenv import load_dotenv
from google.cloud import bigquery
from datetime import datetime
from google.cloud import bigquery
import db_dtypes

### Web Scraping

In [26]:
# Base URL for the Steelers' main page
base_url = "https://www.pro-football-reference.com/teams/pit/"
headers = {"User-Agent": "Mozilla/5.0"}

# Step 1: Scrape the main page to get links for each year
response = requests.get(base_url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# check for successful request
if response.status_code != 200:
    print("Failed to retrieve the page")
    exit()

If no Failure message we are good to continue

In [27]:
# Collect all year links from the main team page
year_links_start = [
    base_url + a['href']
    for a in soup.select('table#team_index a[href]')
    if re.match(r'/teams/pit/\d{4}\.htm', a['href'])
]

In [28]:
# function to remove duplicates and preserve the order

def removeduplicate(data):
    countdict = {}
    for element in data:
        if element in countdict.keys():
            
            # increasing the count if the key(or element)
            # is already in the dictionary
            countdict[element] += 1
        else:
            # inserting the element as key  with count = 1
            countdict[element] = 1
    data.clear()
    for key in countdict.keys():
        data.append(key)
        
removeduplicate(year_links_start)

# remove //teams/pit from the urls
year_links = [link.replace("//teams/pit", "") for link in year_links_start]

year_links[:5]

['https://www.pro-football-reference.com/teams/pit/2024.htm',
 'https://www.pro-football-reference.com/teams/pit/2023.htm',
 'https://www.pro-football-reference.com/teams/pit/2022.htm',
 'https://www.pro-football-reference.com/teams/pit/2021.htm',
 'https://www.pro-football-reference.com/teams/pit/2020.htm']

In [None]:
# loop through each year

all_games = []

for link in year_links:
    year = re.search(r'\d{4}', link).group()  # Extract the year from the link
    print(f"Scraping data for year {year}...")
    year_url = f"{link}"

    # Request the year's page
    year_response = requests.get(year_url, headers=headers)
    year_soup = BeautifulSoup(year_response.text, "html.parser")

    # Find the "Schedule & Game Results" table
    table = year_soup.find("table", id="games")
    if table:
        # Load table into a DataFrame
        games_df = pd.read_html(str(table))[0]

        # Add a column for the year
        games_df["Year"] = year

        # Append to the list
        all_games.append(games_df)

    # Be polite and avoid overloading the server
    time.sleep(1)

Scraping data for year 2024...
Scraping data for year 2023...
Scraping data for year 2022...
Scraping data for year 2021...
Scraping data for year 2020...
Scraping data for year 2019...
Scraping data for year 2018...
Scraping data for year 2017...
Scraping data for year 2016...
Scraping data for year 2015...
Scraping data for year 2014...
Scraping data for year 2013...
Scraping data for year 2012...
Scraping data for year 2011...
Scraping data for year 2010...
Scraping data for year 2009...
Scraping data for year 2008...
Scraping data for year 2007...
Scraping data for year 2006...
Scraping data for year 2005...
Scraping data for year 2004...
Scraping data for year 2003...
Scraping data for year 2002...
Scraping data for year 2001...
Scraping data for year 2000...
Scraping data for year 1999...
Scraping data for year 1998...
Scraping data for year 1997...
Scraping data for year 1996...
Scraping data for year 1995...
Scraping data for year 1994...
Scraping data for year 1993...
Scraping

In [44]:
# step 3: Concatenate all dataframes into a single DataFrame
schedule_df = pd.concat(all_games, ignore_index=True)

# make a copy in case we want to go back
schedule_df2 = schedule_df.copy()

# assuming df is your DataFrame after reading the table
# flatten the multi-level column index
schedule_df2.columns = ['_'.join(col).strip() for col in schedule_df2.columns.values]

# rename columns containing 'Unnamed' to add 'Misc_' prefix and clean the rest
schedule_df2.columns = [
    'Misc_' + re.sub(r'^Unnamed:.*?_level_0_', '', col) if 'Unnamed' in col else col
    for col in schedule_df2.columns
]

# rename cols without headers on website
schedule_df2.rename(columns={'Misc_Unnamed: 3_level_1': 'Misc_Time', 'Misc_Unnamed: 4_level_1': 'Misc_Boxscore', 'Misc_Unnamed: 8_level_1': 'Misc_Location',
                             'Misc_Unnamed: 5_level_1': 'Misc_Outcome', 'Expected Points_Sp. Tms': 'Expected Points_Sp_Tms', 'Year_': 'Misc_Year'}, inplace=True)

# remove boxscore and and fix 'Expected Points_Sp. Tms'

# move year to front 
first_col = schedule_df2.pop('Misc_Year')
# insert column using insert(position,column_name, first_column) function 
schedule_df2.insert(0, 'Misc_Year', first_col) 

# drop boxscore
schedule_df3 = schedule_df2.drop('Misc_Boxscore', axis=1)

# cleanup OT overtime and Home/Away

# replace '@' with 'Away' and NaN with 'Home'
schedule_df3['Misc_Location'] = schedule_df3['Misc_Location'].replace('@', 'Away').fillna('Home')

# replace 'OT' with 'Y' and NaN with 'N'
schedule_df3['Misc_OT'] = schedule_df3['Misc_OT'].replace('OT', 'Y').fillna('N')

# add underscores to column headers instead of spaces
schedule_df3.columns = [re.sub(r' ', '_', col) for col in schedule_df3.columns]

schedule_df3.head()
#schedule_df3.columns

Unnamed: 0,Misc_Year,Misc_Week,Misc_Day,Misc_Date,Misc_Time,Misc_Outcome,Misc_OT,Misc_Rec,Misc_Location,Misc_Opp,...,Offense_RushY,Offense_TO,Defense_1stD,Defense_TotYd,Defense_PassY,Defense_RushY,Defense_TO,Expected_Points_Offense,Expected_Points_Defense,Expected_Points_Sp_Tms
0,2024,1,Sun,September 8,1:00PM ET,W,N,1-0,Away,Atlanta Falcons,...,137.0,,15.0,226.0,137.0,89.0,3.0,-6.25,13.7,2.05
1,2024,2,Sun,September 15,4:25PM ET,W,N,2-0,Away,Denver Broncos,...,141.0,,13.0,295.0,231.0,64.0,2.0,-1.2,6.86,-1.25
2,2024,3,Sun,September 22,1:00PM ET,W,N,3-0,Home,Los Angeles Chargers,...,114.0,1.0,10.0,166.0,105.0,61.0,,10.48,6.58,-4.19
3,2024,4,Sun,September 29,1:00PM ET,L,N,3-1,Away,Indianapolis Colts,...,122.0,2.0,22.0,358.0,225.0,133.0,,2.8,-12.91,4.94
4,2024,5,Sun,October 6,8:20PM ET,L,N,3-2,Home,Dallas Cowboys,...,92.0,1.0,25.0,445.0,336.0,109.0,3.0,2.75,-11.22,2.98


In [45]:
schedule_df3.dtypes

Misc_Year                   object
Misc_Week                   object
Misc_Day                    object
Misc_Date                   object
Misc_Time                   object
Misc_Outcome                object
Misc_OT                     object
Misc_Rec                    object
Misc_Location               object
Misc_Opp                    object
Score_Tm                   float64
Score_Opp                  float64
Offense_1stD               float64
Offense_TotYd              float64
Offense_PassY              float64
Offense_RushY              float64
Offense_TO                 float64
Defense_1stD               float64
Defense_TotYd              float64
Defense_PassY              float64
Defense_RushY              float64
Defense_TO                 float64
Expected_Points_Offense    float64
Expected_Points_Defense    float64
Expected_Points_Sp_Tms     float64
dtype: object

### Load scraped data as a table to BigQuery

In [49]:
# used for both BQ read/write

# setting environmental variable directly in your code
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'bq-crudek-data.json'

# initialize the BigQuery Client
client = bigquery.Client()

# set table_id to the ID of the table to create
table_id = 'crudek-data.practice_data.steelers_games'

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("Misc_Year", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("Misc_Week", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("Misc_Day", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("Misc_Date", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("Misc_Time", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("Misc_Outcome", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("Misc_OT", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("Misc_Rec", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("Misc_Location", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("Misc_Opp", bigquery.enums.SqlTypeNames.STRING),
    ],
    write_disposition="WRITE_TRUNCATE",
)

# make API request
job = client.load_table_from_dataframe(
    schedule_df3, table_id, job_config=job_config
)  
# wait for the job to complete.
job.result()  

Error converting Pandas column with name: "Misc_Week" and datatype: "object" to an appropriate pyarrow datatype: Array, ListArray, or StructArray


ArrowTypeError: Error converting Pandas column with name: "Misc_Week" and datatype: "object" to an appropriate pyarrow datatype: Array, ListArray, or StructArray

In [None]:
# confirm with shape
table = client.get_table(table_id)
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)