# Actual Analysis of 2014-2022 Data
The previous tutorial walks through the data-scraping process, which we want to keep separate from the actual analysis performed here.

In [1]:
import numpy as np
import pandas as pd
import random
import time

In [2]:
import sys
# Print the path of the Python interpreter running this kernel, to confirm
# which environment the notebook is using.
print(sys.executable)

c:\Users\Hunter\NFLPlayerStats\.venv\Scripts\python.exe


In [3]:
# Build the list of 2024 stat pages, one URL per stat category, in the
# same order as the categories are listed below.
StatsPages = [
    'https://www.pro-football-reference.com/years/2024/' + page_name + '.htm'
    for page_name in (
        'passing',
        'rushing',
        'receiving',
        'defense',
        'kicking',
        'punting',
        'returns',
        'scoring',
    )
]

In [14]:
import pandas as pd
import time
import random
import requests
from bs4 import BeautifulSoup

# List of statistic types and seasons
StatsID = ['passing', 'rushing', 'receiving',
           'defense', 'kicking', 'punting',
           'returns', 'scoring']

csvEnding = '.csv'
Seasons = ['2020', '2021', '2022', '2023', '2024']


def _read_stat_table(url, stat, season):
    """Load the stats table for one season/stat page as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the season/stat page to read.
    stat : str
        Stat type (an entry of ``StatsID``). For every type except
        ``"passing"`` it is also used as the HTML ``id`` of the table to read.
    season : str
        Season label; only used to build the error message.

    Returns
    -------
    pandas.DataFrame
        The selected table. For passing pages this is a copy of the first
        table that has both a "Player" and a "Team" column.

    Raises
    ------
    ValueError
        If a passing page contains no table with both "Player" and "Team"
        columns. Any exception raised by ``pd.read_html`` propagates as well.
    """
    if stat == "passing":
        # pd.read_html downloads and parses the page itself, so the page is
        # requested exactly once here (no separate requests/BeautifulSoup
        # fetch, whose result would go unused and double the traffic).
        for table in pd.read_html(url):
            if "Player" in table.columns and "Team" in table.columns:
                return table.copy()
        raise ValueError(f"No valid table found for {season} {stat}")

    # Non-passing pages: select the table by its id and use the second
    # header row (header=1) as the column names.
    return pd.read_html(url, header=1, attrs={'id': stat})[0]


# Dictionary to store structured data for each stat type
data_by_stat = {}

# Loop through each statistic type
for stat in StatsID:
    stat_data = []  # DataFrames collected for the current stat, one per season
    headers = None  # Column names of the first successfully loaded season

    for season in Seasons:
        # Construct the URL for the given season and statistic type
        url = f'https://www.pro-football-reference.com/years/{season}/{stat}.htm'
        print(f"Fetching data from: {url}")

        try:
            df = _read_stat_table(url, stat, season)

            # Capture headers from the first valid season
            if headers is None:
                headers = df.columns.tolist()

            # Tag every row with the season it came from
            df.insert(0, 'Season', season)
            df.insert(1, 'SeasonType', 'Regular Season')

            # Force all seasons onto the first season's column layout; columns
            # missing in this season are added empty, extra ones are dropped.
            df = df.reindex(columns=['Season', 'SeasonType'] + headers, fill_value=None)

            stat_data.append(df)

        except Exception as e:
            # Best effort: report the failure and continue with the next page.
            print(f"Error retrieving data for {season} {stat}: {e}")

        # Respectful delay to avoid being blocked
        time.sleep(random.randint(7, 8))

    # Concatenate all seasons for this stat type (skipped if every season failed)
    if stat_data:
        final_stat_df = pd.concat(stat_data, ignore_index=True)
        data_by_stat[stat] = final_stat_df

# Generate the properly structured CSV files
file_prefix = f"{Seasons[0]}To{Seasons[-1]}"

for stat, df in data_by_stat.items():
    fileName = f"{file_prefix}{stat.capitalize()}{csvEnding}"  # e.g., "2020To2024Passing.csv"
    df.to_csv(fileName, index=False)
    print(f"Saved correctly formatted data for {stat} to {fileName}")


Fetching data from: https://www.pro-football-reference.com/years/2020/passing.htm
Fetching data from: https://www.pro-football-reference.com/years/2021/passing.htm
Fetching data from: https://www.pro-football-reference.com/years/2022/passing.htm
Fetching data from: https://www.pro-football-reference.com/years/2023/passing.htm
Fetching data from: https://www.pro-football-reference.com/years/2024/passing.htm
Fetching data from: https://www.pro-football-reference.com/years/2020/rushing.htm
Fetching data from: https://www.pro-football-reference.com/years/2021/rushing.htm
Fetching data from: https://www.pro-football-reference.com/years/2022/rushing.htm
Fetching data from: https://www.pro-football-reference.com/years/2023/rushing.htm
Fetching data from: https://www.pro-football-reference.com/years/2024/rushing.htm
Fetching data from: https://www.pro-football-reference.com/years/2020/receiving.htm
Fetching data from: https://www.pro-football-reference.com/years/2021/receiving.htm
Fetching dat

In [21]:
# Clean up passing data frame
# Single source for the season so the URL, the Season column and the output
# file name cannot drift apart.
testSeason = '2024'

urlTest = f'https://www.pro-football-reference.com/years/{testSeason}/passing.htm'

# pd.read_html returns one DataFrame per table found on the page; the first
# one is used here.
TestDF = pd.read_html(urlTest)[0]

# Lift pandas' display truncation for the rest of the session so that frames
# are shown in full when displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Name the index so it gets a labelled column when written to CSV below.
TestDF.index.name = "PassingIndex"

# Insert season just for reference
TestDF.insert(loc = 0, column = 'Season', value = testSeason)

# Insert season type column
TestDF.insert(loc = 1, column = 'SeasonType', value = 'Regular Season')

# The index is written too (default index=True), under the name set above.
TestDF.to_csv(f'{testSeason}passing.csv')
