In [1]:
from typing import Dict, List, Tuple
import pandas as pd
import numpy as np
import os
import time
from tqdm import tqdm as tqdm_
import requests
import ast
from datetime import datetime

In [2]:
# Filesystem locations used by the notebook. The defaults are the original
# machine-specific paths; set CFB_DATA_PATH / CFB_MODULE_PATH / CFB_LOGO_PATH
# to run the notebook elsewhere without editing this cell.
data_path = os.environ.get(
    "CFB_DATA_PATH",
    "/Users/connorkitchings/Library/CloudStorage/GoogleDrive-connor.kitchings@gmail.com/My Drive/Colab Notebooks/CFB Notebooks/Data/",
)
module_path = os.environ.get(
    "CFB_MODULE_PATH",
    "/Users/connorkitchings/Desktop/Repositories/CFB-Model/Module/",
)
logo_path = os.environ.get(
    "CFB_LOGO_PATH",
    "/Users/connorkitchings/Desktop/Repositories/CFB-Model/Data/Logos/",
)

# Lists and Dictionaries

In [3]:
# Conference labels treated as FBS when filtering games and betting lines.
FBS_list = [
    "SEC", "American Athletic", "FBS Independents", "Big Ten",
    "Conference USA", "Big 12", "Mid-American", "ACC",
    "Sun Belt", "Pac-12", "Mountain West",
]

In [4]:
# School -> conference lookup, built from per-conference membership groups.
# Groups are listed in the order the keys should appear in the mapping.
# NOTE(review): no ACC schools are listed even though "ACC" is in FBS_list, and
# "Independent" differs from the "FBS Independents" label used there — confirm
# whether both are intentional.
P5schoolsandconferences = {
    school: conference
    for conference, schools in (
        (
            "SEC",
            (
                "Alabama", "Arkansas", "Auburn", "Florida", "Georgia",
                "Kentucky", "LSU", "Mississippi State", "Missouri", "Ole Miss",
                "South Carolina", "Tennessee", "Texas A&M", "Vanderbilt",
            ),
        ),
        (
            "Big Ten",
            (
                "Illinois", "Indiana", "Iowa", "Maryland", "Michigan",
                "Michigan State", "Minnesota", "Nebraska", "Northwestern",
                "Ohio State", "Penn State", "Purdue", "Rutgers", "Wisconsin",
            ),
        ),
        (
            "Big 12",
            (
                "Baylor", "Iowa State", "Kansas", "Kansas State", "Oklahoma",
                "Oklahoma State", "TCU", "Texas", "Texas Tech", "West Virginia",
            ),
        ),
        (
            "Pac-12",
            (
                "Arizona", "Arizona State", "California", "Colorado", "Oregon",
                "Oregon State", "Stanford", "UCLA", "USC", "Utah",
                "Washington", "Washington State",
            ),
        ),
        # Kept as a separate trailing group so BYU stays after the Pac-12 keys.
        ("Big 12", ("BYU",)),
        ("Independent", ("Notre Dame",)),
    )
    for school in schools
}

In [None]:
# Maps each statistic's column name to the human-readable label shown for it.
stats_dictionary = dict(
    # Overall play efficiency
    ypp="Yards Per Play",
    success_rate="Success Rate",
    explosive_rate="Explosive Rate",
    TFL_rate="Tackle For Loss Rate",
    turnover_rate="Turnover Rate",
    third_down_conversion_rate="Third Down Conversion Rate",
    fourth_down_conversion_rate="Fourth Down Conversion Rate",
    rushpass_split="Rush/Pass Play Split",
    # Rushing
    r_ypp="Rushing Yards Per Play",
    r_success_rate="Rushing Success Rate",
    r_explosive_rate="Rushing Explosive Rate",
    r_successyards_per_success="Rushing Success Yards Per Success",
    r_explosiveyards_per_explosive="Rushing Explosive Yards Per Explosive",
    stuff_rate="Stuff Rate",
    # Passing
    p_ypp="Passing Yards Per Play",
    p_success_rate="Passing Success Rate",
    p_explosive_rate="Passing Explosive Rate",
    p_successyards_per_success="Passing Success Yards Per Success",
    p_explosiveyards_per_explosive="Passing Explosive Yards Per Explosive",
    completion_rate="Completion Rate",
    sack_rate="Sack Rate",
    havoc_rate="Havoc Rate",
    # Drives
    avg_available_yards="Average Available Yards",
    avg_available_yards_gained="Average Available Yards Gained",
    redzone_drive_rate="Redzone Drive Rate",
    eckel_drive_rate="Eckel Drive Rate",
    drive_success_rate="Drive Success Rate",
    scoring_drive_rate="Scoring Drive Rate",
    pointsscored_per_drive="Points Scored Per Drive",
    TD_drive_rate="Touchdown Drive Rate",
    FG_drive_rate="Field Goal Drive Rate",
    firstdown_drive_rate="First Down Drive Rate",
    busted_drive_rate="Busted Drive Rate",
    longdrive_rate="Long Drive Rate",
    ppa_per_drive="Predicted Points Added Per Drive",
    giveaway_drive_rate="Giveaway Drive Rate",
    # Penalties
    o_penalties_per_drive="Offensive Penalties Per Drive",
    o_penaltyyards_per_drive="Offensive Penalty Yards Per Drive",
    d_penalties_per_drive="Defensive Penalties Per Drive",
    d_penaltyyards_per_drive="Defensive Penalty Yards Per Drive",
)

# Functions

## Team Data

In [6]:
def fetch_year_data(
    year: int, headers: Dict[str, str], timeout: float = 30.0
) -> pd.DataFrame:
    """Fetch the team list for one season from the ``/teams`` endpoint.

    Args:
        year: Season to request; also written to the ``season`` column.
        headers: HTTP headers sent with the request (the callers pass the
            bearer-token ``Authorization`` header).
        timeout: Seconds to wait on the server before giving up.

    Returns:
        A DataFrame built from the JSON response with an added ``season``
        column, or an empty DataFrame when the response is not HTTP 200 or any
        exception (including a timeout) occurs. Failures are printed, not raised.
    """
    url = "https://api.collegefootballdata.com/teams"
    params = {"year": year}

    try:
        # requests has no default timeout; without one a stalled connection
        # would block the notebook forever.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Error fetching data for year {year}: HTTP {response.status_code}")
            return pd.DataFrame()

        df = pd.DataFrame(response.json())
        df["season"] = year
        return df
    except Exception as e:
        # Deliberately broad: one bad season should not abort a multi-year pull.
        print(f"Exception occurred while fetching data for year {year}: {str(e)}")
        return pd.DataFrame()

In [7]:
def call_api_FBSteams(first_year: int, last_year: int, api_key: str) -> pd.DataFrame:
    """Download team metadata for a range of seasons and keep only FBS teams.

    Calls ``fetch_year_data`` once per season (pausing one second between
    calls), keeps rows whose ``classification`` is ``"fbs"``, drops descriptive
    columns, sorts by season and school, and normalises the spellings of
    Hawaii and San Jose State. Timing information is printed.

    Args:
        first_year: First season to fetch (inclusive).
        last_year: Last season to fetch (inclusive).
        api_key: API key sent as a bearer token.

    Returns:
        One row per FBS team-season, or an empty DataFrame when no season
        returned any data.
    """
    start = time.time()

    headers = {"Authorization": f"Bearer {api_key}"}

    season_frames = []
    for year in range(first_year, last_year + 1):
        season_df = fetch_year_data(year, headers)
        # A failed request comes back as an empty frame; skip it so a partial
        # outage does not leave column-less frames in the concat.
        if not season_df.empty:
            season_frames.append(season_df)
        time.sleep(1)  # Add a small delay to avoid overwhelming the API

    if season_frames:
        all_teams = pd.concat(season_frames, ignore_index=True)
        final_data = (
            all_teams[all_teams["classification"] == "fbs"]
            .drop(
                columns=[
                    "id",
                    "mascot",
                    "alt_name1",
                    "alt_name2",
                    "alt_name3",
                    "alt_color",
                    "twitter",
                    "location",
                ],
                # Tolerate responses that omit some of these optional columns.
                errors="ignore",
            )
            .sort_values(by=["season", "school"])
            .reset_index(drop=True)
        )
        # Normalise after sorting, matching the original row order.
        final_data["school"] = final_data["school"].replace(
            {"Hawai'i": "Hawaii", "San José State": "San Jose State"}
        )
    else:
        # Nothing was fetched (all requests failed or the year range is empty).
        final_data = pd.DataFrame()

    end = time.time()
    total_time = end - start
    # max(..., 1) avoids ZeroDivisionError when last_year == first_year - 1.
    total_time_per = total_time / max(last_year - first_year + 1, 1)

    minutes, seconds = divmod(int(total_time), 60)
    minutes_per, seconds_per = divmod(int(total_time_per), 60)

    print(
        f"call_api_FBSteams took {minutes} minutes and {seconds} seconds to complete."
    )
    print(
        f"It took {minutes_per} minutes and {seconds_per} seconds to complete per season."
    )

    return final_data

## Roster Data

In [8]:
def fetch_roster_data(
    year: int, headers: Dict[str, str], timeout: float = 30.0
) -> pd.DataFrame:
    """Fetch roster rows for one season from the ``/roster`` endpoint.

    Args:
        year: Season to request; also written to the ``season`` column.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        A DataFrame built from the JSON response with an added ``season``
        column, or an empty DataFrame when the response is not HTTP 200 or any
        exception (including a timeout) occurs. Failures are printed, not raised.
    """
    url = "https://api.collegefootballdata.com/roster"
    params = {"year": year}

    try:
        # requests has no default timeout; set one so a stalled connection
        # cannot hang the notebook.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(
                f"Error fetching roster data for year {year}: HTTP {response.status_code}"
            )
            return pd.DataFrame()

        df = pd.DataFrame(response.json())
        df["season"] = year
        return df
    except Exception as e:
        # Deliberately broad: one bad season should not abort a multi-year pull.
        print(
            f"Exception occurred while fetching roster data for year {year}: {str(e)}"
        )
        return pd.DataFrame()

In [9]:
def safe_literal_eval(val):
    """Coerce a raw ``recruit_ids`` cell into a list without raising.

    Handles values that are already sequences, missing values, bare numbers,
    and strings holding a Python literal (e.g. ``"[123, 456]"``).

    Args:
        val: The raw cell value.

    Returns:
        Always a list:
        * lists are returned unchanged; tuples and NumPy arrays are converted;
        * missing values, the empty string and unsupported objects give ``[]``;
        * a bare number gives ``[number]`` (previously it was silently dropped);
        * a string that parses as a literal gives that literal as a list
          (a scalar literal such as ``"123"`` is wrapped: ``[123]``);
        * an unparseable string gives ``[val]``, stripped when it is all digits.
    """
    if isinstance(val, list):
        return val
    if isinstance(val, tuple):
        return list(val)
    if isinstance(val, np.ndarray):
        # pd.isna on an array returns an array, whose truth value is ambiguous,
        # so arrays must be handled before any missing-value check.
        return val.ravel().tolist()
    if isinstance(val, (int, float, np.integer, np.floating)) and not isinstance(
        val, bool
    ):
        return [] if pd.isna(val) else [val]
    if not isinstance(val, str) or val == "":
        # None, pd.NA, NaT and any other non-string object carry no ids.
        return []

    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. "007" or free text): keep the text itself.
        stripped = val.strip()
        return [stripped] if stripped.isdigit() else [val]

    if parsed is None:
        return []
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return [parsed]

In [10]:
def clean_roster_data(roster_data):
    """Reshape raw roster rows into one row per (player, recruit id).

    Builds a ``name`` column from first and last name, parses ``recruit_ids``
    with ``safe_literal_eval``, explodes it to one id per row, and normalises
    the team spellings of Hawaii and San Jose State.

    Args:
        roster_data: DataFrame containing at least ``id``, ``first_name``,
            ``last_name``, ``team``, ``year``, ``position``, ``recruit_ids``
            and ``season``.

    Returns:
        DataFrame with columns ``id, name, team, year, position, season,
        recruit_id``; ``recruit_id`` is the text before the first ``"."`` of
        the stringified id.
    """
    source_columns = [
        "id",
        "first_name",
        "last_name",
        "team",
        "year",
        "position",
        "recruit_ids",
        "season",
    ]

    exploded = (
        roster_data[source_columns]
        .assign(
            name=lambda frame: frame["first_name"] + " " + frame["last_name"],
            recruit_ids=lambda frame: frame["recruit_ids"].apply(safe_literal_eval),
        )
        .drop(columns=["first_name", "last_name"])
        .explode("recruit_ids")
    )

    # Stringify each id and keep only the part before a decimal point.
    exploded["recruit_id"] = exploded["recruit_ids"].astype(str).str.split(".").str[0]

    exploded["team"] = exploded["team"].replace(
        {"Hawai'i": "Hawaii", "San José State": "San Jose State"}
    )

    return exploded[["id", "name", "team", "year", "position", "season", "recruit_id"]]


# Example usage:
# roster_data = clean_roster_data(roster_data)

In [11]:
def call_api_rosters(first_year: int, last_year: int, api_key: str) -> pd.DataFrame:
    """Download and clean roster data for a range of seasons.

    Calls ``fetch_roster_data`` once per season (pausing one second between
    calls), concatenates the non-empty results, drops physical/hometown
    columns and passes the frame through ``clean_roster_data``. Timing
    information is printed.

    Args:
        first_year: First season to fetch (inclusive).
        last_year: Last season to fetch (inclusive).
        api_key: API key sent as a bearer token.

    Returns:
        The cleaned roster DataFrame, or an empty DataFrame when no season
        returned any data.
    """
    start = time.time()

    headers = {"Authorization": f"Bearer {api_key}"}

    season_frames = []
    for year in range(first_year, last_year + 1):
        season_df = fetch_roster_data(year, headers)
        if not season_df.empty:
            season_frames.append(season_df)
        time.sleep(1)  # Add a small delay to avoid overwhelming the API

    if season_frames:
        # Remove columns that are not needed downstream before cleaning.
        columns_to_drop = [
            "weight",
            "height",
            "home_city",
            "home_state",
            "home_country",
            "home_latitude",
            "home_longitude",
            "home_county_fips",
        ]
        combined = pd.concat(season_frames, ignore_index=True).drop(
            columns=columns_to_drop, errors="ignore"
        )
        cleaned_roster_data = clean_roster_data(combined)
    else:
        # pd.concat([]) raises ValueError, so short-circuit when every request
        # failed or the year range is empty.
        cleaned_roster_data = pd.DataFrame()

    end = time.time()
    total_time = end - start
    # max(..., 1) avoids ZeroDivisionError when last_year == first_year - 1.
    total_time_per = total_time / max(last_year - first_year + 1, 1)

    minutes, seconds = divmod(int(total_time), 60)
    minutes_per, seconds_per = divmod(int(total_time_per), 60)

    print(f"call_api_rosters took {minutes} minutes and {seconds} seconds to complete.")
    print(
        f"It took {minutes_per} minutes and {seconds_per} seconds to complete per season."
    )

    return cleaned_roster_data


# Example usage:
# api_key = "your_api_key_here"
# roster_data = call_api_rosters(2014, 2023, api_key)

In [12]:
def clean_roster_data(roster_data):
    """Reshape raw roster rows into one row per (player, recruit id).

    This definition comes later in the notebook than another function of the
    same name, so it is the one in effect after a top-to-bottom run. It
    therefore has to apply the same Hawaii / San Jose State spelling
    normalisation as the other cleaners, otherwise roster team names stop
    matching the other datasets.

    Args:
        roster_data: DataFrame containing at least ``id``, ``first_name``,
            ``last_name``, ``team``, ``year``, ``position``, ``recruit_ids``
            and ``season``.

    Returns:
        DataFrame with columns ``id, name, team, year, position, season,
        recruit_id``; ``recruit_id`` is the text before the first ``"."`` of
        the stringified id (a missing id becomes the string ``"nan"``).
    """
    # Select the desired columns
    columns = [
        "id",
        "first_name",
        "last_name",
        "team",
        "year",
        "position",
        "recruit_ids",
        "season",
    ]
    df = roster_data[columns].copy()

    # Combine first_name and last_name into a single name column
    df["name"] = df["first_name"] + " " + df["last_name"]
    df = df.drop(columns=["first_name", "last_name"])

    # Parse the recruit_ids value into a list
    df["recruit_ids"] = df["recruit_ids"].apply(safe_literal_eval)

    # Explode the recruit_ids column to one id per row
    df = df.explode("recruit_ids")

    # Remove decimals from recruit_ids
    df["recruit_id"] = df["recruit_ids"].astype(str).str.split(".").str[0]

    # Use the same team spellings as the rest of the notebook's datasets
    df["team"] = df["team"].replace(
        {"Hawai'i": "Hawaii", "San José State": "San Jose State"}
    )

    # Reorder the columns
    column_order = ["id", "name", "team", "year", "position", "season", "recruit_id"]
    return df[column_order]


# Example usage:
# roster_data = clean_roster_data(roster_data)

## Coaches

In [13]:
def fetch_coaches_data(
    first_year: int, last_year: int, api_key: str, timeout: float = 30.0
) -> List[Dict]:
    """Fetch coach records for a span of seasons from the ``/coaches`` endpoint.

    Args:
        first_year: Sent as the ``minYear`` query parameter.
        last_year: Sent as the ``maxYear`` query parameter.
        api_key: API key sent as a bearer token.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON response, or ``[]`` when the request fails (the error
        is printed, not raised).
    """
    url = "https://api.collegefootballdata.com/coaches"
    headers = {"Authorization": f"Bearer {api_key}"}
    params = {"minYear": first_year, "maxYear": last_year}

    try:
        # requests has no default timeout; set one so a stalled connection
        # cannot hang the notebook. Timeouts are RequestException subclasses.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching coaches data: {e}")
        return []

In [14]:
def process_coaches_data(data: List[Dict]) -> pd.DataFrame:
    """Flatten nested coach records into one row per coach-season.

    Each input record is expected to carry ``first_name``, ``last_name``,
    ``hire_date`` and a ``seasons`` list; every season entry becomes a row.

    Args:
        data: Coach records as returned by the coaches endpoint.

    Returns:
        DataFrame with columns ``season, school, first_name, last_name,
        hire_date, games, wins, losses, ties, preseason_rank,
        postseason_rank``, with Hawaii and San Jose State spellings normalised.
    """
    season_fields = [
        "school",
        "games",
        "wins",
        "losses",
        "ties",
        "preseason_rank",
        "postseason_rank",
    ]
    # json_normalize prefixes every season field; strip the prefix again and
    # call the season's "year" field "season".
    renames = {"season_year": "season"}
    renames.update({f"season_{field}": field for field in season_fields})

    flat = pd.json_normalize(
        data,
        record_path="seasons",
        meta=["first_name", "last_name", "hire_date"],
        record_prefix="season_",
    ).rename(columns=renames)

    flat["school"] = flat["school"].replace(
        {"Hawai'i": "Hawaii", "San José State": "San Jose State"}
    )

    return flat[
        [
            "season",
            "school",
            "first_name",
            "last_name",
            "hire_date",
            "games",
            "wins",
            "losses",
            "ties",
            "preseason_rank",
            "postseason_rank",
        ]
    ]

In [15]:
def call_api_coaches(first_year: int, last_year: int, api_key: str) -> pd.DataFrame:
    """Fetch coach records for a span of seasons and flatten them.

    Delegates to ``fetch_coaches_data`` and ``process_coaches_data`` and
    prints timing plus the number of coach-season rows retrieved.

    Args:
        first_year: First season requested.
        last_year: Last season requested.
        api_key: API key forwarded to ``fetch_coaches_data``.

    Returns:
        The processed coach-season DataFrame, or an empty DataFrame when the
        fetch returned nothing.
    """
    started_at = time.time()

    payload = fetch_coaches_data(first_year, last_year, api_key)
    if not payload:
        print("No data retrieved. Returning empty DataFrame.")
        return pd.DataFrame()

    result = process_coaches_data(payload)

    elapsed = time.time() - started_at
    elapsed_per_season = elapsed / (last_year - first_year + 1)

    # Whole minutes and leftover seconds, overall and per season.
    minutes, seconds = divmod(int(elapsed), 60)
    minutes_per, seconds_per = divmod(int(elapsed_per_season), 60)

    print(f"call_api_coaches took {minutes} minutes and {seconds} seconds to complete.")
    print(
        f"It took {minutes_per} minutes and {seconds_per} seconds to complete per season."
    )
    print(f"Retrieved data for {len(result)} coach-season combinations.")

    return result


# Example usage:
# api_key = "your_api_key_here"  # load the real key from the environment; never commit it
# coaches_data = call_api_coaches(2014, 2023, api_key)

In [16]:
def clean_coaches_data(df, first_season_infinal):
    """Compute each coach's rolling three-row win percentage per season.

    Args:
        df: Coach-season rows with at least ``season``, ``school``,
            ``first_name``, ``last_name``, ``wins``, ``losses`` and ``ties``.
            A ``hire_date`` column is dropped if present. The input frame is
            not modified.
        first_season_infinal: Earliest season kept in the output. Earlier
            seasons still feed the rolling average.

    Returns:
        DataFrame with columns ``season, team, coach_id,
        win_percentage_prev3yr``, sorted by season then coach.
        ``coach_id`` is ``"<first_name> <last_name>"``.

    Notes:
        * The rolling window covers the current row plus up to two preceding
          rows for the same coach (``min_periods=1``), counted in rows rather
          than calendar years.
          NOTE(review): the ``prev3yr`` name suggests the current season
          should be excluded — confirm before using it as a pre-season feature.
        * A season with zero recorded games yields NaN for that row's win
          percentage.
    """
    coaches = df.rename(columns={"school": "team"}).drop(
        columns=["hire_date"], errors="ignore"
    )

    # Create a coach identifier from the full name
    coaches["coach_id"] = coaches["first_name"] + " " + coaches["last_name"]

    # Rows without a usable name cannot be grouped; the previous
    # groupby-apply implementation dropped them implicitly.
    coaches = coaches.dropna(subset=["coach_id"])

    # Chronological order within each coach so the rolling window looks back
    coaches = coaches.sort_values(["coach_id", "season"]).reset_index(drop=True)

    # Calculate win percentage
    coaches["win_percentage"] = coaches["wins"] / (
        coaches["wins"] + coaches["losses"] + coaches["ties"]
    )

    # transform keeps the original row alignment and avoids
    # groupby.apply(include_groups=...), which only exists in pandas >= 2.2.
    coaches["win_percentage_prev3yr"] = coaches.groupby("coach_id")[
        "win_percentage"
    ].transform(lambda pct: pct.rolling(window=3, min_periods=1).mean())

    # Remove seasons before the first season wanted in the final dataset
    coaches = coaches[coaches["season"] >= first_season_infinal]

    # Reorder columns
    column_order = ["season", "team", "coach_id", "win_percentage_prev3yr"]
    return (
        coaches[column_order]
        .sort_values(["season", "coach_id"])
        .reset_index(drop=True)
    )


# Example usage:
# cleaned_coaches_data = clean_coaches_data(coaches_data, 2014)

## Games

In [17]:
def fetch_calendar(
    year: int, headers: Dict[str, str], timeout: float = 30.0
) -> List[Dict]:
    """Fetch the season calendar for one year from the ``/calendar`` endpoint.

    Args:
        year: Season to request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON response, or ``[]`` when the request fails (the error
        is printed, not raised).
    """
    url = "https://api.collegefootballdata.com/calendar"
    params = {"year": year}

    try:
        # requests has no default timeout; set one so a stalled connection
        # cannot hang the notebook. Timeouts are RequestException subclasses.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching calendar data for year {year}: {e}")
        return []

In [18]:
def fetch_games(
    year: int, week: int, headers: Dict[str, str], timeout: float = 30.0
) -> List[Dict]:
    """Fetch the games of one week of one season from the ``/games`` endpoint.

    Args:
        year: Season to request.
        week: Week number to request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON response, or ``[]`` when the request fails (the error
        is printed, not raised).
    """
    url = "https://api.collegefootballdata.com/games"
    params = {"year": year, "week": week}

    try:
        # requests has no default timeout; set one so a stalled connection
        # cannot hang the notebook. Timeouts are RequestException subclasses.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching games data for year {year}, week {week}: {e}")
        return []

In [19]:
def call_api_games(first_year: int, last_year: int, api_key: str) -> pd.DataFrame:
    """Download regular-season games between FBS teams for a range of seasons.

    For each season the calendar is fetched, its ``"regular"`` weeks are
    requested one by one, and the combined games are filtered to rows where
    both conferences are FBS and ``season_type`` is ``"regular"``. Team
    spellings for Hawaii and San Jose State are normalised and Elo,
    win-probability and other unused columns are dropped. Timing information
    is printed.

    Args:
        first_year: First season to fetch (inclusive).
        last_year: Last season to fetch (inclusive).
        api_key: API key sent as a bearer token.

    Returns:
        One row per FBS-vs-FBS regular-season game, or an empty DataFrame when
        nothing could be fetched.
    """
    start = time.time()

    headers = {"Authorization": f"Bearer {api_key}"}
    all_games = []

    for year in range(first_year, last_year + 1):
        calendar = fetch_calendar(year, headers)
        regular_season_weeks = [
            entry["week"] for entry in calendar if entry["seasonType"] == "regular"
        ]

        for week in tqdm_(
            regular_season_weeks, desc=f"Fetching {year} games", unit="week"
        ):
            all_games.extend(fetch_games(year, week, headers))

    games_df = pd.DataFrame(all_games)

    if games_df.empty:
        # Every request failed or returned nothing: there are no columns to
        # filter on, so hand back an empty frame instead of raising KeyError.
        FBS_games = pd.DataFrame()
    else:
        # Conference labels that count as FBS (local copy keeps this function
        # self-contained).
        fbs_conferences = [
            "SEC",
            "American Athletic",
            "FBS Independents",
            "Big Ten",
            "Conference USA",
            "Big 12",
            "Mid-American",
            "ACC",
            "Sun Belt",
            "Pac-12",
            "Mountain West",
        ]
        FBS_games = games_df[
            (games_df["home_conference"].isin(fbs_conferences))
            & (games_df["away_conference"].isin(fbs_conferences))
            & (games_df["season_type"] == "regular")
        ].reset_index(drop=True)

        team_aliases = {"Hawai'i": "Hawaii", "San José State": "San Jose State"}
        for side in ("home_team", "away_team"):
            FBS_games[side] = FBS_games[side].replace(team_aliases)

        # Remove specified columns
        columns_to_remove = [
            "notes",
            "highlights",
            "excitement_index",
            "home_pregame_elo",
            "away_pregame_elo",
            "home_postgame_elo",
            "away_postgame_elo",
            "home_win_prob",
            "away_win_prob",
            "home_division",
            "away_division",
            "attendance",
        ]
        columns_to_remove += [
            col
            for col in FBS_games.columns
            if col.startswith("elo_") or col.startswith("win_prob_")
        ]
        FBS_games = FBS_games.drop(columns=columns_to_remove, errors="ignore")

    end = time.time()
    total_time = end - start
    # max(..., 1) avoids ZeroDivisionError when last_year == first_year - 1.
    total_time_per = total_time / max(last_year - first_year + 1, 1)

    minutes, seconds = divmod(int(total_time), 60)
    minutes_per, seconds_per = divmod(int(total_time_per), 60)

    print(f"call_api_games took {minutes} minutes and {seconds} seconds to complete.")
    print(
        f"It took {minutes_per} minutes and {seconds_per} seconds to complete per season."
    )
    print(f"Retrieved {len(FBS_games)} FBS games from {first_year} to {last_year}.")

    return FBS_games


# Example usage:
# api_key = "your_api_key_here"  # load the real key from the environment; never commit it
# games_data = call_api_games(2014, 2023, api_key)

## Betting Lines

In [20]:
def extract_betting_data(lines: List[Dict]) -> tuple:
    """Pick the over/under and spread from a game's list of betting lines.

    The first entry whose ``provider`` is ``"consensus"`` or ``"Bovada"`` wins;
    entries from other providers, or without a ``provider`` key, are skipped.
    NOTE(review): the two providers are not ranked — whichever appears first
    in ``lines`` is used. Confirm whether "consensus" should take priority.

    Args:
        lines: Betting-line entries for one game; ``None`` is treated as empty.

    Returns:
        ``(overUnder, spread)`` from the chosen entry (either value may be
        ``None`` if the entry lacks it), or ``(None, None)`` when no accepted
        provider is present.
    """
    accepted_providers = ("consensus", "Bovada")
    for entry in lines or []:
        # .get() so a malformed entry is skipped instead of raising KeyError.
        if entry.get("provider") in accepted_providers:
            return entry.get("overUnder"), entry.get("spread")
    return None, None

In [21]:
def fetch_betting_lines(
    year: int, headers: Dict[str, str], timeout: float = 30.0
) -> List[Dict]:
    """Fetch betting lines for one season from the ``/lines`` endpoint.

    Args:
        year: Season to request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON response, or ``[]`` when the request fails (the error
        is printed, not raised).
    """
    url = "https://api.collegefootballdata.com/lines"
    params = {"year": year}

    try:
        # requests has no default timeout; set one so a stalled connection
        # cannot hang the notebook. Timeouts are RequestException subclasses.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching betting lines for year {year}: {e}")
        return []

In [22]:
def call_api_bettinglines(
    first_year: int, last_year: int, api_key: str
) -> pd.DataFrame:
    """Download betting lines for FBS-vs-FBS games over a range of seasons.

    For every game whose home and away conferences are both FBS, the
    over/under and spread are taken via ``extract_betting_data``. The spread
    is converted to a number and multiplied by -1, team spellings for Hawaii
    and San Jose State are normalised, and rows are sorted by season, week
    and game id. Timing information is printed.

    Args:
        first_year: First season to fetch (inclusive).
        last_year: Last season to fetch (inclusive).
        api_key: API key sent as a bearer token.

    Returns:
        DataFrame with columns ``season, week, game_id, home_team, away_team,
        official_overunder, official_line``; it has those columns but no rows
        when nothing could be fetched.
    """
    start = time.time()

    headers = {"Authorization": f"Bearer {api_key}"}
    fbs_conferences = [
        "SEC",
        "American Athletic",
        "FBS Independents",
        "Big Ten",
        "Conference USA",
        "Big 12",
        "Mid-American",
        "ACC",
        "Sun Belt",
        "Pac-12",
        "Mountain West",
    ]
    output_columns = [
        "season",
        "week",
        "game_id",
        "home_team",
        "away_team",
        "official_overunder",
        "official_line",
    ]

    all_lines = []

    for year in tqdm_(
        range(first_year, last_year + 1), desc="Fetching betting lines", unit="year"
    ):
        for game in fetch_betting_lines(year, headers):
            if (
                game["homeConference"] not in fbs_conferences
                or game["awayConference"] not in fbs_conferences
            ):
                continue
            # `or []` also covers an explicit null "lines" value.
            overunder, spread = extract_betting_data(game.get("lines") or [])
            all_lines.append(
                {
                    "season": game["season"],
                    "week": game["week"],
                    "game_id": game["id"],
                    "home_team": game["homeTeam"],
                    "away_team": game["awayTeam"],
                    "official_overunder": overunder,
                    "official_line": spread,
                }
            )

    # Passing the columns explicitly keeps the schema stable even when no game
    # was collected, so the column operations below cannot raise KeyError.
    betting_lines_df = pd.DataFrame(all_lines, columns=output_columns)

    # Convert spread to float and flip sign
    betting_lines_df["official_line"] = (
        pd.to_numeric(betting_lines_df["official_line"], errors="coerce") * -1
    )

    team_aliases = {"Hawai'i": "Hawaii", "San José State": "San Jose State"}
    for side in ("home_team", "away_team"):
        betting_lines_df[side] = betting_lines_df[side].replace(team_aliases)

    # Sort the DataFrame
    betting_lines_df = betting_lines_df.sort_values(
        ["season", "week", "game_id"]
    ).reset_index(drop=True)

    end = time.time()
    total_time = end - start
    # max(..., 1) avoids ZeroDivisionError when last_year == first_year - 1.
    total_time_per = total_time / max(last_year - first_year + 1, 1)

    minutes, seconds = divmod(int(total_time), 60)
    minutes_per, seconds_per = divmod(int(total_time_per), 60)

    print(
        f"call_api_bettinglines took {minutes} minutes and {seconds} seconds to complete."
    )
    print(
        f"It took {minutes_per} minutes and {seconds_per} seconds to complete per season."
    )
    print(
        f"Retrieved betting lines for {len(betting_lines_df)} FBS games from {first_year} to {last_year}."
    )

    return betting_lines_df


# Example usage:
# api_key = "your_api_key_here"  # load the real key from the environment; never commit it
# betting_lines_data = call_api_bettinglines(2014, 2023, api_key)

## Recruiting

In [23]:
def fetch_recruiting_data(
    year: int, headers: Dict[str, str], timeout: float = 30.0
) -> List[Dict]:
    """Fetch recruits for one class year from ``/recruiting/players``.

    Args:
        year: Recruiting year to request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON response, or ``[]`` when the request fails (the error
        is printed, not raised).
    """
    url = "https://api.collegefootballdata.com/recruiting/players"
    params = {"year": year}

    try:
        # requests has no default timeout; set one so a stalled connection
        # cannot hang the notebook. Timeouts are RequestException subclasses.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching recruiting data for year {year}: {e}")
        return []

In [24]:
def call_api_recruiting(first_year: int, last_year: int, api_key: str) -> pd.DataFrame:
    """Download recruiting data for a range of class years.

    Combines the recruits of every requested year, drops physical/hometown
    columns, renames ``committedTo`` to ``team``, normalises the spellings of
    Hawaii and San Jose State, and sorts by ``year`` then ``ranking``. Timing
    information is printed.

    Args:
        first_year: First recruiting year to fetch (inclusive).
        last_year: Last recruiting year to fetch (inclusive).
        api_key: API key sent as a bearer token.

    Returns:
        One row per recruit, or an empty DataFrame when nothing could be
        fetched.
    """
    start = time.time()

    headers = {"Authorization": f"Bearer {api_key}"}

    all_recruits = []

    for year in tqdm_(
        range(first_year, last_year + 1), desc="Fetching recruiting data", unit="year"
    ):
        all_recruits.extend(fetch_recruiting_data(year, headers))

    recruits_df = pd.DataFrame(all_recruits)

    if recruits_df.empty:
        # No rows means no columns either; skip the column operations that
        # would otherwise raise KeyError.
        final_data = pd.DataFrame()
    else:
        # Drop specified columns and rename 'committedTo' to 'team'
        columns_to_drop = [
            "height",
            "weight",
            "city",
            "stateProvince",
            "country",
            "hometownInfo",
        ]
        final_data = recruits_df.drop(columns=columns_to_drop, errors="ignore").rename(
            columns={"committedTo": "team"}
        )

        final_data["team"] = final_data["team"].replace(
            {"Hawai'i": "Hawaii", "San José State": "San Jose State"}
        )

        # Sort the DataFrame
        final_data = final_data.sort_values(["year", "ranking"]).reset_index(drop=True)

    end = time.time()
    total_time = end - start
    # max(..., 1) avoids ZeroDivisionError when last_year == first_year - 1.
    total_time_per = total_time / max(last_year - first_year + 1, 1)

    minutes, seconds = divmod(int(total_time), 60)
    minutes_per, seconds_per = divmod(int(total_time_per), 60)

    print(
        f"call_api_recruiting took {minutes} minutes and {seconds} seconds to complete."
    )
    print(
        f"It took {minutes_per} minutes and {seconds_per} seconds to complete per season."
    )
    print(
        f"Retrieved recruiting data for {len(final_data):,} players from {first_year} to {last_year}."
    )

    return final_data


# Example usage:
# api_key = "your_api_key_here"  # load the real key from the environment; never commit it
# recruiting_data = call_api_recruiting(2014, 2023, api_key)

## Returning Production

In [25]:
def fetch_return_production_data(
    year: int, headers: Dict[str, str], timeout: float = 30.0
) -> List[Dict]:
    """Fetch returning-production records for one season from ``/player/returning``.

    Args:
        year: Season to request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON response, or ``[]`` when the request fails (the error
        is printed, not raised).
    """
    url = "https://api.collegefootballdata.com/player/returning"
    params = {"year": year}

    try:
        # requests has no default timeout; set one so a stalled connection
        # cannot hang the notebook. Timeouts are RequestException subclasses.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching return production data for year {year}: {e}")
        return []

In [26]:
def call_api_returnproduction(
    first_year: int, last_year: int, api_key: str
) -> pd.DataFrame:
    """Download returning-production figures for a range of seasons.

    Each API record is flattened to snake_case columns, team spellings for
    Hawaii and San Jose State are normalised, and rows are sorted by season,
    conference and team. Timing information is printed.

    Args:
        first_year: First season to fetch (inclusive).
        last_year: Last season to fetch (inclusive).
        api_key: API key sent as a bearer token.

    Returns:
        One row per team-season with the columns ``season``, ``team``,
        ``conference`` and the PPA / usage measures; it has those columns but
        no rows when nothing could be fetched.

    Raises:
        KeyError: If an API record lacks one of the expected fields.
    """
    start = time.time()

    headers = {"Authorization": f"Bearer {api_key}"}

    # (output column, key in the API record), in output column order.
    field_map = (
        ("team", "team"),
        ("conference", "conference"),
        ("total_ppa", "totalPPA"),
        ("total_passing_ppa", "totalPassingPPA"),
        ("total_receiving_ppa", "totalReceivingPPA"),
        ("total_rushing_ppa", "totalRushingPPA"),
        ("percent_ppa", "percentPPA"),
        ("percent_passing_ppa", "percentPassingPPA"),
        ("percent_receiving_ppa", "percentReceivingPPA"),
        ("percent_rushing_ppa", "percentRushingPPA"),
        ("usage", "usage"),
        ("passing_usage", "passingUsage"),
        ("receiving_usage", "receivingUsage"),
        ("rushing_usage", "rushingUsage"),
    )
    output_columns = ["season"] + [column for column, _ in field_map]

    all_production = []

    for year in tqdm_(
        range(first_year, last_year + 1),
        desc="Fetching return production data",
        unit="year",
    ):
        for team in fetch_return_production_data(year, headers):
            record = {"season": year}
            record.update({column: team[key] for column, key in field_map})
            all_production.append(record)

    # Passing the columns explicitly keeps the schema stable even when nothing
    # was fetched, so the column operations below cannot raise KeyError.
    production_df = pd.DataFrame(all_production, columns=output_columns)

    production_df["team"] = production_df["team"].replace(
        {"Hawai'i": "Hawaii", "San José State": "San Jose State"}
    )

    # Sort the DataFrame
    production_df = production_df.sort_values(
        ["season", "conference", "team"]
    ).reset_index(drop=True)

    end = time.time()
    total_time = end - start
    # max(..., 1) avoids ZeroDivisionError when last_year == first_year - 1.
    total_time_per = total_time / max(last_year - first_year + 1, 1)

    minutes, seconds = divmod(int(total_time), 60)
    minutes_per, seconds_per = divmod(int(total_time_per), 60)

    print(
        f"call_api_returnproduction took {minutes} minutes and {seconds} seconds to complete."
    )
    print(
        f"It took {minutes_per} minutes and {seconds_per} seconds to complete per season."
    )
    print(
        f"Retrieved return production data for {len(production_df)} team-seasons from {first_year} to {last_year}."
    )

    return production_df


# Example usage:
# api_key = "your_api_key_here"  # load the real key from the environment; never commit it
# return_production_data = call_api_returnproduction(2014, 2023, api_key)

## SP+ Ratings

In [27]:
def fetch_sp_ratings_data(
    year: int, headers: Dict[str, str], timeout: float = 30.0
) -> List[Dict]:
    """Fetch SP+ ratings for one season from the ``/ratings/sp`` endpoint.

    Args:
        year: Season to request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON response, or ``[]`` when the request fails (the error
        is printed, not raised).
    """
    url = "https://api.collegefootballdata.com/ratings/sp"
    params = {"year": year}

    try:
        # requests has no default timeout; set one so a stalled connection
        # cannot hang the notebook. Timeouts are RequestException subclasses.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching SP+ ratings data for year {year}: {e}")
        return []

In [28]:
def call_api_spratings(first_year: int, last_year: int, api_key: str) -> pd.DataFrame:
    """Download SP+ offense, defense and special-teams ratings per team-season.

    Records without a usable ``offense`` or ``defense`` object are reported
    and skipped; a missing or null ``specialTeams`` object gives a missing
    ``st_rating``. Team spellings for Hawaii and San Jose State are
    normalised and rows are sorted by year, conference and team. Timing
    information is printed.

    Args:
        first_year: First season to fetch (inclusive).
        last_year: Last season to fetch (inclusive).
        api_key: API key sent as a bearer token.

    Returns:
        DataFrame with columns ``year, team, conference, o_rating, d_rating,
        st_rating``; it has those columns but no rows when nothing could be
        fetched.
    """
    start = time.time()

    headers = {"Authorization": f"Bearer {api_key}"}
    output_columns = ["year", "team", "conference", "o_rating", "d_rating", "st_rating"]

    all_ratings = []

    for year in tqdm_(
        range(first_year, last_year + 1), desc="Fetching SP+ ratings data", unit="year"
    ):
        for team in fetch_sp_ratings_data(year, headers):
            try:
                # `or {}` turns an explicit null specialTeams into "no rating"
                # instead of an AttributeError on None.
                special_teams = team.get("specialTeams") or {}
                team_data = {
                    "year": year,
                    "team": team.get("team", ""),
                    "conference": team.get("conference", "Unknown"),
                    "o_rating": team["offense"].get("rating"),
                    "d_rating": team["defense"].get("rating"),
                    "st_rating": special_teams.get("rating"),
                }
            except (KeyError, AttributeError) as e:
                # KeyError: offense/defense missing; AttributeError: present
                # but null. Either way the record is unusable, so skip it.
                print(f"Error processing team data for year {year}: {e}")
                print(f"Problematic team data: {team}")
                continue
            all_ratings.append(team_data)

    # Passing the columns explicitly keeps the schema stable even when nothing
    # was collected, so the column operations below cannot raise KeyError.
    ratings_df = pd.DataFrame(all_ratings, columns=output_columns)

    ratings_df["team"] = ratings_df["team"].replace(
        {"Hawai'i": "Hawaii", "San José State": "San Jose State"}
    )

    ratings_df = ratings_df.sort_values(["year", "conference", "team"]).reset_index(
        drop=True
    )

    end = time.time()
    total_time = end - start
    # max(..., 1) avoids ZeroDivisionError when last_year == first_year - 1.
    total_time_per = total_time / max(last_year - first_year + 1, 1)

    minutes, seconds = divmod(int(total_time), 60)
    minutes_per, seconds_per = divmod(int(total_time_per), 60)

    print(
        f"call_api_spratings took {minutes} minutes and {seconds} seconds to complete."
    )
    print(
        f"It took {minutes_per} minutes and {seconds_per} seconds to complete per season."
    )
    print(
        f"Retrieved SP+ ratings data for {len(ratings_df)} team-seasons from {first_year} to {last_year}."
    )

    return ratings_df


# Example usage:
# api_key = "your_api_key_here"  # load the real key from the environment; never commit it
# sp_ratings_data = call_api_spratings(2014, 2023, api_key)

## Talent

In [29]:
def fetch_talent_data(
    year: int, headers: Dict[str, str], timeout: float = 30.0
) -> List[Dict]:
    """Fetch team talent figures for one season from the ``/talent`` endpoint.

    Args:
        year: Season to request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON response, or ``[]`` when the request fails (the error
        is printed, not raised).
    """
    url = "https://api.collegefootballdata.com/talent"
    params = {"year": year}

    try:
        # requests has no default timeout; set one so a stalled connection
        # cannot hang the notebook. Timeouts are RequestException subclasses.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching talent data for year {year}: {e}")
        return []

In [30]:
def call_api_talent(first_year: int, last_year: int, api_key: str) -> pd.DataFrame:
    """Collect talent data for every season in ``[first_year, last_year]``.

    Each season is requested through ``fetch_talent_data``; a season whose
    request fails contributes no rows. Elapsed time and the number of rows
    retrieved are printed.

    Args:
        first_year: First season to request (inclusive).
        last_year: Last season to request (inclusive).
        api_key: API key sent as a ``Bearer`` token in the ``Authorization`` header.

    Returns:
        DataFrame with columns ``year``, ``school`` and ``talent``, sorted by
        ``year`` then ``school``. When nothing could be retrieved the frame is
        empty but still carries those three columns.
    """
    start = time.time()

    headers = {"Authorization": f"Bearer {api_key}"}

    all_talent = []

    for year in tqdm_(
        range(first_year, last_year + 1), desc="Fetching talent data", unit="year"
    ):
        for team in fetch_talent_data(year, headers):
            all_talent.append(
                {
                    "year": year,  # Use the requested year, not a field of the payload
                    "school": team.get("school", ""),
                    "talent": team.get("talent", None),
                }
            )

    # Explicit columns keep the schema stable even when every request failed;
    # without them an empty result has no "school" column and the lookups
    # below raise KeyError.
    talent_df = pd.DataFrame(all_talent, columns=["year", "school", "talent"])

    if not talent_df.empty:
        # Normalize the API spellings to the names used elsewhere in the notebook.
        talent_df.loc[talent_df["school"] == "Hawai'i", "school"] = "Hawaii"
        talent_df.loc[talent_df["school"] == "San José State", "school"] = (
            "San Jose State"
        )
        talent_df = talent_df.sort_values(["year", "school"]).reset_index(drop=True)

    end = time.time()
    total_time = end - start
    # Guard against a reversed or empty year range (would divide by zero).
    season_count = max(last_year - first_year + 1, 1)
    total_time_per = total_time / season_count

    minutes, seconds = divmod(int(total_time), 60)
    minutes_per, seconds_per = divmod(int(total_time_per), 60)

    print(f"call_api_talent took {minutes} minutes and {seconds} seconds to complete.")
    print(
        f"It took {minutes_per} minutes and {seconds_per} seconds to complete per season."
    )
    print(
        f"Retrieved talent data for {len(talent_df)} team-seasons from {first_year} to {last_year}."
    )

    return talent_df


# Example usage (read the key from the environment; never paste a real key into the notebook):
# api_key = os.environ["CFBD_API_KEY"]
# talent_data = call_api_talent(2014, 2023, api_key)

## Plays

In [31]:
def fetch_calendar(year: int, headers: Dict[str, str]) -> List[Dict]:
    """Request the season calendar for one year from the collegefootballdata.com API.

    Args:
        year: Season to request (sent as the ``year`` query parameter).
        headers: HTTP headers for the request (the caller supplies the
            ``Authorization`` header).

    Returns:
        The decoded JSON body of the response, or an empty list when the
        request fails (connection error, timeout, or HTTP error status).
    """
    url = "https://api.collegefootballdata.com/calendar"
    params = {"year": year}

    try:
        # requests waits forever when no timeout is given; a stalled connection
        # would otherwise hang the whole notebook run. A timeout surfaces as a
        # RequestException and is handled below like any other failure.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching calendar data for year {year}: {e}")
        return []

In [32]:
def fetch_plays(year: int, week: int, headers: Dict[str, str]) -> List[Dict]:
    """Request the plays of one week from the collegefootballdata.com API.

    Args:
        year: Season to request (sent as the ``year`` query parameter).
        week: Week to request (sent as the ``week`` query parameter).
        headers: HTTP headers for the request (the caller supplies the
            ``Authorization`` header).

    Returns:
        The decoded JSON body of the response, or an empty list when the
        request fails (connection error, timeout, or HTTP error status).
    """
    url = "https://api.collegefootballdata.com/plays"
    params = {"year": year, "week": week}

    try:
        # requests waits forever when no timeout is given; a stalled connection
        # would otherwise hang the whole notebook run. A timeout surfaces as a
        # RequestException and is handled below like any other failure.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching plays data for year {year}, week {week}: {e}")
        return []

In [33]:
def call_api_plays(first_year: int, last_year: int, api_key: str) -> pd.DataFrame:
    """Download regular-season plays and reshape them into one play-level table.

    For every season in ``[first_year, last_year]`` the calendar is fetched,
    the weeks whose ``seasonType`` is ``"regular"`` are selected, and the plays
    of each of those weeks are downloaded. The combined frame is then:

    * restricted to plays where both the offense and defense conference are in
      the local FBS conference list,
    * given ``season``/``week`` as its first two columns,
    * renamed (``distance`` -> ``yards_to_first``, ``period`` -> ``quarter``),
    * extended with ``adj_yd_line``, ``clock_minutes`` and ``clock_seconds``,
    * normalized for the team spellings "Hawai'i" and "San José State",
    * sorted by season (descending), game, drive and play number.

    Args:
        first_year: First season to download (inclusive).
        last_year: Last season to download (inclusive).
        api_key: API key sent as a ``Bearer`` token in the ``Authorization`` header.

    Returns:
        The processed play-level DataFrame. If no plays could be downloaded an
        empty DataFrame is returned instead of raising.
    """
    start = time.time()

    headers = {"Authorization": f"Bearer {api_key}"}
    # Conferences treated as FBS; a play is kept only when both sides are listed.
    fbs_conferences = [
        "SEC",
        "American Athletic",
        "FBS Independents",
        "Big Ten",
        "Conference USA",
        "Big 12",
        "Mid-American",
        "ACC",
        "Sun Belt",
        "Pac-12",
        "Mountain West",
    ]

    all_plays = []

    for year in range(first_year, last_year + 1):
        calendar = fetch_calendar(year, headers)
        regular_season_weeks = [
            week["week"] for week in calendar if week.get("seasonType") == "regular"
        ]

        for week in tqdm_(
            regular_season_weeks, desc=f"Fetching {year} plays", unit="week"
        ):
            plays = fetch_plays(year, week, headers)
            for play in plays:
                play["season"] = year  # Add the year to each play
                play["week"] = week  # Add the week to each play
            all_plays.extend(plays)

    plays_df = pd.DataFrame(all_plays)

    # Every request failed (or the year range was empty): the frame has no
    # columns, so the filters below would raise KeyError.
    if plays_df.empty:
        print(f"No plays were retrieved from {first_year} to {last_year}.")
        return plays_df

    # Filter for FBS games
    cfpera_plays = plays_df[
        plays_df["offense_conference"].isin(fbs_conferences)
        & plays_df["defense_conference"].isin(fbs_conferences)
    ].reset_index(drop=True)

    # Rename and adjust columns
    cfpera_plays = cfpera_plays.rename(
        columns={"distance": "yards_to_first", "period": "quarter"}
    )

    # adj_yd_line: yard_line as-is when the home team has the ball, mirrored
    # (100 - yard_line) otherwise. Must run before the team names are normalized.
    cfpera_plays["adj_yd_line"] = np.where(
        cfpera_plays["offense"] == cfpera_plays["home"],
        cfpera_plays["yard_line"],
        100 - cfpera_plays["yard_line"],
    )

    # Interceptions carry return yardage in yards_gained; zero it out.
    interception_types = [
        "Interception",
        "Interception Return Touchdown",
        "Pass Interception Return",
    ]
    cfpera_plays.loc[
        cfpera_plays["play_type"].isin(interception_types), "yards_gained"
    ] = 0

    # Split the clock dict into separate minute / second columns
    cfpera_plays["clock_minutes"] = cfpera_plays["clock"].apply(lambda x: x["minutes"])
    cfpera_plays["clock_seconds"] = cfpera_plays["clock"].apply(lambda x: x["seconds"])
    cfpera_plays = cfpera_plays.drop(columns=["clock", "wallclock"], errors="ignore")

    # Normalize team spellings in every team column. home/away are included so
    # that later `offense == home` comparisons still match for these schools.
    name_fixes = {"Hawai'i": "Hawaii", "San José State": "San Jose State"}
    for team_col in ("offense", "defense", "home", "away"):
        if team_col not in cfpera_plays.columns:
            continue
        for api_name, clean_name in name_fixes.items():
            cfpera_plays.loc[cfpera_plays[team_col] == api_name, team_col] = clean_name

    # Move season and week to the front, keeping the other columns in order
    leading = ["season", "week"]
    columns = leading + [col for col in cfpera_plays.columns if col not in leading]
    cfpera_plays = cfpera_plays[columns]

    cfpera_plays = cfpera_plays.sort_values(
        by=["season", "game_id", "drive_number", "play_number"],
        ascending=[False, True, True, True],
    ).reset_index(drop=True)

    end = time.time()
    total_time = end - start
    # Guard against a reversed year range (would divide by zero).
    season_count = max(last_year - first_year + 1, 1)
    total_time_per = total_time / season_count

    minutes, seconds = divmod(int(total_time), 60)
    minutes_per, seconds_per = divmod(int(total_time_per), 60)

    print(f"call_api_plays took {minutes} minutes and {seconds} seconds to complete.")
    print(
        f"It took {minutes_per} minutes and {seconds_per} seconds to complete per season."
    )

    return cfpera_plays


# Example usage:
# plays_data = call_api_plays(2014, 2023, api_key)

## Fixing Data

In [34]:
def apply_updates_to_dataframe(df, conditions_and_updates):
    """Apply manual corrections to play rows, in place, and return the frame.

    Args:
        df: Play-level DataFrame with ``game_id`` and ``drive_number`` columns
            (and ``play_number`` when any condition names a play).
        conditions_and_updates: Iterable of ``(condition, updates)`` pairs.
            ``condition`` is ``(game_id, drive_number)`` to target every row of
            a drive, or ``(game_id, drive_number, play_number)`` to target one
            play. ``updates`` maps column names to the value to write.

    Returns:
        The same ``df`` object, modified in place. Pairs are applied in order,
        so a later condition is matched against values written by earlier ones.
    """
    for condition, updates in conditions_and_updates:
        game_id, drive_number, *play_number = condition
        condition_mask = (df["game_id"] == game_id) & (
            df["drive_number"] == drive_number
        )
        if play_number:
            # A third element narrows the match to a single play of the drive.
            condition_mask = condition_mask & (df["play_number"] == play_number[0])

        # Write each column as a scalar. Assigning a list of values to a list
        # of columns depends on positional broadcasting and fails with
        # "Must have equal len keys and value" when a one-column update
        # matches more than one row.
        for column, value in updates.items():
            df.loc[condition_mask, column] = value
    return df


# Example usage:
# updated_plays = apply_updates_to_dataframe(plays_data, conditions_and_updates)

In [35]:
# Manual corrections for specific play rows, in the format consumed by
# apply_updates_to_dataframe. Each entry is (condition, updates):
#   condition: (game_id, drive_number) to target every play of a drive, or
#              (game_id, drive_number, play_number) to target a single play.
#   updates:   {column name: replacement value}.
# apply_updates_to_dataframe walks this list in order, so a condition is matched
# against values already rewritten by earlier entries (this matters for the
# play_number renumbering of game 401237102 below).
conditions_and_updates = [
    ((400937467, 1, 5), {"yards_gained": 15, "play_type": "Penalty"}),
    ((400547851, 11, 6), {"yards_gained": 15, "play_type": "Penalty"}),
    ((400547737, 9, 1), {"yards_to_first": 1}),
    (
        (400547737, 9, 2),
        {
            "yard_line": 2,
            "yards_to_goal": 98,
            "yards_to_first": 9,
            "yards_gained": -1,
            "adj_yd_line": 98,
        },
    ),
    ((400547737, 9, 3), {"yards_to_first": 10}),
    ((400547739, 11, 14), {"yards_gained": -5}),
    (
        (400547739, 11, 15),
        {"yard_line": 82, "yards_to_goal": 18, "yards_to_first": 18, "adj_yd_line": 18},
    ),
    (
        (400547739, 11, 16),
        {"yard_line": 82, "yards_to_goal": 18, "yards_to_first": 18, "adj_yd_line": 18},
    ),
    ((400869843, 27, 9), {"yards_gained": -5}),
    ((400869843, 27, 10), {"yard_line": 78, "yards_to_goal": 22}),
    # Game 401237102, drive 4: first reassign the teams for the whole drive
    # (two-element condition), then renumber its plays one by one.
    ((401237102, 4), {"offense": "Texas A&M", "defense": "Florida"}),
    ((401237102, 4, 8), {"play_number": 5}),
    ((401237102, 4, 9), {"play_number": 6}),
    ((401237102, 4, 10), {"play_number": 7}),
    ((401237102, 4, 11), {"play_number": 8}),
    ((401237102, 4, 12), {"play_number": 9}),
    ((401237102, 4, 13), {"play_number": 10}),
    ((401237102, 4, 14), {"play_number": 11}),
    ((401237102, 4, 15), {"play_number": 12}),
    ((401237102, 4, 16), {"play_number": 13}),
    ((401237102, 4, 17), {"play_number": 14}),
    ((401237102, 4, 18), {"play_number": 15}),
    # NOTE(review): the source play numbers above run 8..18 consecutively, so
    # 29 may be a typo for 19 — verify against the raw play data.
    ((401237102, 4, 29), {"play_number": 16}),
    ((400869850, 26, 15), {"yards_gained": -5}),
    ((401013353, 23, 4), {"yards_gained": -5}),
    ((400869264, 9, 1), {"yards_gained": -5}),
    ((401114260, 11, 5), {"yards_gained": -5}),
    ((401282206, 9, 6), {"yards_gained": -15}),
    ((401405102, 14, 4), {"yards_gained": -15}),
    ((401282189, 6, 7), {"yards_gained": -15}),
    ((401309577, 22, 1), {"yards_gained": -15}),
    ((400548020, 16, 10), {"yards_gained": -5}),
    ((401403927, 9, 3), {"yards_gained": -15}),
    ((400787353, 3, 5), {"yards_gained": -5}),
    ((400869721, 19, 5), {"yards_gained": -15}),
    ((401310733, 6, 8), {"yards_gained": 0}),
    ((401287949, 4, 3), {"yards_gained": -9}),
    ((401309639, 19, 5), {"yards_gained": 0}),
    ((401282215, 26, 10), {"yards_gained": 0}),
    ((401119278, 26, 7), {"yards_gained": 0}),
    ((401121957, 28, 4), {"yards_gained": -10}),
    ((400941829, 25, 2), {"play_type": "Penalty", "yards_gained": -10}),
    ((400869041, 18, 8), {"yards_gained": -9}),
    ((400547743, 21, 3), {"yards_gained": 15}),
    ((401117876, 8, 5), {"yards_gained": 0}),
    ((401022561, 20, 11), {"yards_gained": 15}),
    (
        (401643724, 22, 15),
        {"yard_line": 25, "yards_to_goal": 25, "yards_to_first": 25, "adj_yd_line": 25},
    ),  # Added 10/14/24
]

## Explosives

In [36]:
def calculate_explosive(row):
    """Flag a play as explosive.

    :param row: Mapping / pandas Series with ``play_type``, ``rush_result``,
        ``pass_attempt`` and ``yards_gained``.
    :return: 1 for a run-type result of at least 15 yards or a pass attempt of
        at least 20 yards; 0 for any other scrimmage play and for turnover
        plays; None for every remaining play type.
    """
    scrimmage_types = (
        "Rush",
        "Rushing Touchdown",
        "Fumble Recovery (Own)",
        "Pass",
        "Pass Reception",
        "Pass Incompletion",
        "Passing Touchdown",
        "Sack",
        "Safety",
    )
    # "Safety" is graded with the scrimmage plays above, so it is not repeated here.
    turnover_types = (
        "Fumble Recovery (Opponent)",
        "Fumble Return Touchdown",
        "Pass Interception Return",
        "Interception Return Touchdown",
        "Interception",
    )

    play_type = row["play_type"]

    if play_type in scrimmage_types:
        if row["rush_result"] == 1 and row["yards_gained"] >= 15:
            return 1
        long_pass = row["pass_attempt"] == 1 and row["yards_gained"] >= 20
        return 1 if long_pass else 0

    if play_type in turnover_types:
        return 0

    # Special teams, penalties, etc. are not graded.
    return None

## Success

In [37]:
def calculate_play_success(row):
    """Grade a single play as successful (1), unsuccessful (0) or ungraded (None).

    A scrimmage play succeeds when it gains the required share of the distance
    to go: 50% on 1st down, 70% on 2nd down, 100% on 3rd and 4th down. The
    distance is ``yards_to_first``; gaining the same share of ``yards_to_goal``
    also counts. When ``yards_to_first`` is 0 only ``yards_to_goal`` is used.
    A negative or missing ``yards_to_first``, or a down outside 1-4, grades as 0.

    :param row: Mapping / pandas Series with ``play_type``, ``down``,
        ``yards_to_first``, ``yards_to_goal``, ``yards_gained`` and ``turnover``.
    :return: 1 or 0 for scrimmage plays, 0 for other plays flagged as a
        turnover, None for everything else (penalties, special teams, ...).
    """
    graded_types = (
        "Rush",
        "Rushing Touchdown",
        "Fumble Recovery (Own)",
        "Pass",
        "Pass Reception",
        "Pass Incompletion",
        "Passing Touchdown",
        "Sack",
        "Safety",
    )

    if row["play_type"] in graded_types:
        yards_to_first = row["yards_to_first"]
        if yards_to_first == 0:
            targets = (row["yards_to_goal"],)
        elif yards_to_first > 0:
            targets = (yards_to_first, row["yards_to_goal"])
        else:
            # Negative or missing distance: cannot be graded as a success.
            return 0

        down = row["down"]
        if down == 1:
            required_share = 0.5
        elif down == 2:
            required_share = 0.7
        elif down == 3 or down == 4:
            required_share = 1
        else:
            return 0

        gained = row["yards_gained"]
        reached = any(gained >= required_share * target for target in targets)
        return 1 if reached else 0

    if row["turnover"] == 1:
        return 0

    # Penalties and all remaining play types are left ungraded.
    return None

In [38]:
def calculate_drive_success(row):
    """
    Grade one drive as successful (1) or not (0).

    Rules, evaluated in this order:
      1. a giveaway is never a success;
      2. a touchdown is always a success;
      3. a field goal is a success when starting_position <= 30;
      4. starting_position <= 25 needs available_yards_pct >= 0.5;
      5. 25 < starting_position <= 50 needs available_yards_pct >= 0.75;
      6. anything else is not a success.

    NOTE(review): create_drive_start_df renames the first play's yards_to_goal
    to starting_position; if rows come from that frame these cut-offs are
    distances from the opponent's goal line — confirm that is the intent.

    :param row: A pandas Series (or mapping) describing a single drive
    :return: 1 if the drive is considered successful, 0 otherwise
    """
    if row["giveaway"] == 1:
        return 0

    if row["TD"] == 1:
        return 1

    if row["FG"] == 1 and row["starting_position"] <= 30:
        return 1

    # Pick the share of available yards this starting position requires.
    start_pos = row["starting_position"]
    if start_pos <= 25:
        required_pct = 0.5
    elif start_pos <= 50:
        required_pct = 0.75
    else:
        return 0

    return 1 if row["available_yards_pct"] >= required_pct else 0


# Example usage:
# by_drive_data['drive_success'] = by_drive_data.apply(calculate_drive_success, axis=1)

## Fixing Yards Gained

In [39]:
def update_yards_gained(row):
    """Return the play's yards gained, with lost-fumble plays forced to 0.

    :param row: Mapping / pandas Series with ``play_type`` and ``yards_gained``.
    :return: 0 for "Fumble Recovery (Opponent)" and for a
        "Fumble Return Touchdown" with non-zero yardage; otherwise the row's
        own ``yards_gained``.
    """
    play_type = row["play_type"]

    if play_type == "Fumble Recovery (Opponent)":
        return 0

    if play_type == "Fumble Return Touchdown" and row["yards_gained"] != 0:
        return 0

    return row["yards_gained"]

## Play Duration

In [40]:
def calculate_time_features(data):
    """Add game-clock features and an estimated per-play duration.

    Works on a sorted copy of ``data`` (the input frame is not modified) and adds:

    * ``clock_minutes_in_secs`` - seconds represented by the remaining full
      quarters plus the clock minutes; 0 for quarters beyond the 4th.
    * ``time_remaining_after`` - ``clock_minutes_in_secs`` + ``clock_seconds``.
    * ``time_remaining_before`` - the previous row's ``time_remaining_after``
      within the same game; 3600 for play 1 of drive 1.
    * ``play_duration`` - before minus after, blanked for penalties, special
      teams, two-point tries, and values <= 0 or >= 60.

    :param data: Play-level DataFrame with the sort keys below plus
        ``clock_minutes``, ``clock_seconds``, ``penalty``, ``st`` and ``twopoint``.
    :return: New DataFrame, sorted and re-indexed, with the columns above added.
    """
    sort_keys = ["season", "week", "game_id", "quarter", "drive_number", "play_number"]
    data = data.sort_values(by=sort_keys, ascending=[True] * len(sort_keys))
    data = data.reset_index(drop=True)

    # Sanity-check the raw clock values
    if data["clock_minutes"].max() > 15 or data["clock_seconds"].max() >= 60:
        print("Warning: Some clock values seem to be out of range.")

    # Regulation quarters: remaining full quarters (15 min each) plus the clock
    # minutes. Overtime periods contribute 0.
    in_regulation = data["quarter"] <= 4
    regulation_secs = (4 - data["quarter"]) * 15 * 60 + (data["clock_minutes"] * 60)
    data["clock_minutes_in_secs"] = np.where(in_regulation, regulation_secs, 0)

    data["time_remaining_after"] = data["clock_minutes_in_secs"] + data["clock_seconds"]

    # Time before a play is the time after the previous row of the same game
    previous_after = data.groupby("game_id")["time_remaining_after"].shift(1)
    data["time_remaining_before"] = previous_after
    opening_play = (data["drive_number"] == 1) & (data["play_number"] == 1)
    data.loc[opening_play, "time_remaining_before"] = 3600

    data["play_duration"] = data["time_remaining_before"] - data["time_remaining_after"]

    if data["play_duration"].min() < 0:
        print(
            "Warning: Some play durations are negative. This might indicate an issue with the time calculation."
        )

    # Blank durations that are not meaningful: penalties, special teams and
    # two-point tries, plus non-positive or 60+ second values (the source
    # comment puts the latter at roughly 10% of plays and treats it as bad data).
    not_timed = (data["penalty"] == 1) | (data["st"] == 1) | (data["twopoint"] == 1)
    implausible = (data["play_duration"] <= 0) | (data["play_duration"] >= 60)
    data.loc[not_timed | implausible, "play_duration"] = None

    return data


# Usage:
# data = calculate_time_features(data)

## All Plays to By Play

In [41]:
def allplays_to_byplay(data):
    """Derive play-level flags and metrics from the raw play table.

    Works on a copy of ``data``. Adds score/half context, penalty flags,
    one 0/1 column per play-type grouping, red-zone / eckel / passing-down
    flags, success and explosive grades (via ``calculate_play_success`` and
    ``calculate_explosive``), havoc / TFL / stuff flags, third- and fourth-down
    conversion columns, clock features (via ``calculate_time_features``), a
    garbage-time flag and a field-position bin. ``yards_gained`` is replaced by
    the output of ``update_yards_gained``.

    The frame is then reduced to the columns in ``column_order``, rows whose
    ``play_type`` is in ``playtype_delete`` are dropped, and ``ppa`` is coerced
    to numeric. The elapsed time is printed.

    :param data: Play-level DataFrame. The code reads, among others,
        ``offense_score``, ``defense_score``, ``quarter``, ``play_type``,
        ``yards_gained``, ``offense``, ``home``, ``yards_to_goal``, ``down``,
        ``yards_to_first``, ``yard_line``, ``clock_minutes``, ``clock_seconds``
        and every raw column listed in ``column_order``.
    :return: New DataFrame with the columns of ``column_order``.
    """
    start = time.time()

    # Play-type groupings used to derive the flag columns below.
    # NOTE(review): FBS_list is defined here but never referenced in this function.
    FBS_list = [
        "SEC",
        "American Athletic",
        "FBS Independents",
        "Big Ten",
        "Conference USA",
        "Big 12",
        "Mid-American",
        "ACC",
        "Sun Belt",
        "Pac-12",
        "Mountain West",
    ]
    st_kickoffs = ["Kickoff", "Kickoff Return (Offense)", "Kickoff Return Touchdown"]
    st_punts = [
        "Punt",
        "Blocked Punt",
        "Punt Return Touchdown",
        "Blocked Punt Touchdown",
    ]
    st_fg = [
        "Field Goal Good",
        "Field Goal Missed",
        "Blocked Field Goal",
        "Missed Field Goal Return",
        "Missed Field Goal Return Touchdown",
        "Blocked Field Goal Touchdown",
    ]
    st_extrapoint = ["Extra Point Good", "Extra Point Missed"]
    twopoint_list = ["Two Point Pass", "Two Point Rush", "Defensive 2pt Conversion"]
    # All special-teams play types (two-point tries are tracked separately).
    st_list = st_kickoffs + st_punts + st_fg + st_extrapoint
    endofdrive = [
        "Punt",
        "Field Goal Good",
        "Field Goal Missed",
        "Blocked Field Goal",
        "Blocked Punt",
        "Punt Return Touchdown",
        "Blocked Punt Touchdown",
        "Missed Field Goal Return",
        "Blocked Field Goal Touchdown",
        "Missed Field Goal Return Touchdown",
    ]
    turnover_list = [
        "Interception",
        "Interception Return Touchdown",
        "Pass Interception Return",
        "Fumble Recovery (Opponent)",
        "Fumble Return Touchdown",
    ]
    rushattempt_list = ["Rush", "Rushing Touchdown"]
    rushresult_list = ["Rush", "Rushing Touchdown", "Sack", "Safety"]
    dropback_list = [
        "Pass",
        "Interception",
        "Interception Return Touchdown",
        "Pass Interception Return",
        "Passing Touchdown",
        "Pass Incompletion",
        "Pass Reception",
        "Sack",
    ]
    passattempt_list = [
        "Pass",
        "Pass Completion",
        "Interception",
        "Interception Return Touchdown",
        "Pass Interception Return",
        "Passing Touchdown",
        "Pass Incompletion",
        "Pass Reception",
    ]
    completion_list = ["Pass Reception", "Pass Completion", "Pass", "Passing Touchdown"]
    # Rows with these play types are removed from the returned frame.
    playtype_delete = ["Timeout", "Uncategorized", "placeholder", "End Period"]
    havoc_list = [
        "Fumble Recovery (Opponent)",
        "Fumble Return Touchdown",
        "Fumble Recovery (Own)",
        "Pass Incompletion",
        "Interception",
        "Interception Return Touchdown",
        "Pass Interception Return",
    ]

    # Create a copy of the input data so the caller's frame is left untouched
    allFBSplays = data.copy()

    # Score margin from the offense's point of view
    allFBSplays["relative_score"] = (
        allFBSplays["offense_score"] - allFBSplays["defense_score"]
    )
    # half: 1 for quarters 1-2, 2 for quarters 3-4, 3 for anything else (overtime)
    allFBSplays["half"] = np.select(
        [allFBSplays["quarter"].isin([1, 2]), allFBSplays["quarter"].isin([3, 4])],
        [1, 2],
        default=3,
    )
    allFBSplays["penalty"] = (allFBSplays["play_type"] == "Penalty").astype(int)
    # A penalty's side is inferred from the sign of yards_gained:
    # positive -> defensive penalty, negative -> offensive penalty.
    allFBSplays["defensive_penalty"] = (
        (allFBSplays["play_type"] == "Penalty") & (allFBSplays["yards_gained"] > 0)
    ).astype(int)
    allFBSplays["offensive_penalty"] = (
        (allFBSplays["play_type"] == "Penalty") & (allFBSplays["yards_gained"] < 0)
    ).astype(int)
    # "home" when the offense name equals the home column, otherwise "away"
    allFBSplays["home_away"] = np.where(
        allFBSplays["offense"] == allFBSplays["home"], "home", "away"
    )

    # One 0/1 column per play-type grouping (column name -> list of play types)
    binary_columns = {
        "st_kickoff": st_kickoffs,
        "st_punt": st_punts,
        "st_fg": st_fg,
        "st": st_list,
        "endofdrive": endofdrive,
        "twopoint": twopoint_list,
        "turnover": turnover_list,
        "rush_attempt": rushattempt_list,
        "rush_result": rushresult_list,
        "dropback": dropback_list,
        "pass_attempt": passattempt_list,
    }

    for col, play_types in binary_columns.items():
        allFBSplays[col] = allFBSplays["play_type"].isin(play_types).astype(int)

    # red_zone: 20 or fewer yards to goal; eckel: first down with under 40 yards to goal
    allFBSplays["red_zone"] = (allFBSplays["yards_to_goal"] <= 20).astype(int)
    allFBSplays["eckel"] = (
        (allFBSplays["yards_to_goal"] < 40) & (allFBSplays["down"] == 1)
    ).astype(int)

    # Corrected yardage is kept in a separate column for now; the original
    # yards_gained is still read by the success/explosive graders below.
    allFBSplays["updated_yards_gained"] = allFBSplays.apply(update_yards_gained, axis=1)
    allFBSplays["TFL"] = (
        (allFBSplays["penalty"] == 0) & (allFBSplays["updated_yards_gained"] < 0)
    ).astype(int)
    allFBSplays["sack"] = (allFBSplays["play_type"] == "Sack").astype(int)
    allFBSplays["completion"] = (allFBSplays["play_type"].isin(completion_list)).astype(
        int
    )

    # Calculate success and explosive grades, and the yards gained on such plays
    allFBSplays["success"] = allFBSplays.apply(calculate_play_success, axis=1)
    allFBSplays["explosive"] = allFBSplays.apply(calculate_explosive, axis=1)
    allFBSplays["success_yards"] = np.where(
        allFBSplays["success"] == 1, allFBSplays["updated_yards_gained"], 0
    )
    allFBSplays["explosive_yards"] = np.where(
        allFBSplays["explosive"] == 1, allFBSplays["updated_yards_gained"], 0
    )
    # stuff: rush attempt for no gain or a loss. Computed before rush_attempt is
    # patched for fumble recoveries at the end of this function.
    allFBSplays["stuff"] = (
        (allFBSplays["rush_attempt"] == 1) & (allFBSplays["updated_yards_gained"] <= 0)
    ).astype(int)

    # havoc: a play type in havoc_list, or any play that lost yards
    # (penalties are reset to 0 a few lines below)
    allFBSplays["havoc"] = (
        allFBSplays["play_type"].isin(havoc_list)
        | (allFBSplays["updated_yards_gained"] < 0)
    ).astype(int)

    # Third / fourth down conversion: 1 or 0 on that down, None on every other down
    allFBSplays["thirddown_conversion"] = np.where(
        allFBSplays["down"] == 3, (np.where(allFBSplays["success"] == 1, 1, 0)), None
    )
    allFBSplays["fourthdown_conversion"] = np.where(
        allFBSplays["down"] == 4, (np.where(allFBSplays["success"] == 1, 1, 0)), None
    )

    # Penalties never count as havoc
    allFBSplays.loc[allFBSplays["penalty"] == 1, "havoc"] = 0

    # Replace the original yards_gained with the corrected values
    allFBSplays = allFBSplays.drop(columns=["yards_gained"]).rename(
        columns={"updated_yards_gained": "yards_gained"}
    )

    # Sort values
    allFBSplays = allFBSplays.sort_values(
        by=["season", "week", "game_id", "quarter", "drive_number", "play_number"]
    )

    # Add clock features and play_duration (this also re-sorts and resets the index)
    allFBSplays = calculate_time_features(allFBSplays)

    # Garbage time: margin of 35+ in the 3rd quarter or 27+ in the 4th
    def calculate_garbage_time(df):
        """Return a 0/1 Series flagging garbage-time plays.

        Writes a boolean ``garbage`` column on ``df`` as a side effect; the
        caller overwrites it with the returned integer Series.
        """
        # Initialize garbage column with False
        df["garbage"] = False

        # No garbage time in first half
        second_half = df["quarter"] > 2

        # Third quarter garbage time
        df.loc[
            second_half & (df["quarter"] == 3) & (abs(df["relative_score"]) >= 35),
            "garbage",
        ] = True

        # Fourth quarter garbage time
        fourth_quarter = second_half & (df["quarter"] == 4)
        df.loc[fourth_quarter & (abs(df["relative_score"]) >= 27), "garbage"] = True

        return df["garbage"].astype(int)

    # Apply garbage time calculation
    allFBSplays["garbage"] = calculate_garbage_time(allFBSplays)

    # Categorize field position from yard_line. pd.cut intervals are
    # right-closed by default, so a yard_line of exactly 0 falls outside the
    # first bin and becomes NaN.
    # NOTE(review): the bins use yard_line rather than adj_yd_line or
    # yards_to_goal — confirm the "Own"/"Opponent" labels are oriented as intended.
    allFBSplays["field_position_bin"] = pd.cut(
        allFBSplays["yard_line"],
        bins=[0, 20, 50, 80, 100],
        labels=["Own Red Zone", "Own", "Opponent", "Red Zone"],
    )

    # Passing down: 2nd and 8+, or 3rd/4th and 5+
    allFBSplays["passing_down"] = np.where(
        ((allFBSplays["down"] == 2) & (allFBSplays["yards_to_first"] >= 8))
        | ((allFBSplays["down"].isin([3, 4])) & (allFBSplays["yards_to_first"] >= 5)),
        1,
        0,
    )

    # Columns kept in the output, in this order; every other column is dropped
    column_order = [
        "season",
        "week",
        "game_id",
        "offense",
        "defense",
        "home_away",
        "relative_score",
        "offense_score",
        "defense_score",
        "half",
        "quarter",
        "offense_timeouts",
        "defense_timeouts",
        "drive_id",
        "drive_number",
        "play_number",
        "garbage",
        "yard_line",
        "yards_to_goal",
        "adj_yd_line",
        "field_position_bin",
        "down",
        "yards_to_first",
        "passing_down",
        "red_zone",
        "eckel",
        "scoring",
        "play_type",
        "play_text",
        "penalty",
        "offensive_penalty",
        "defensive_penalty",
        "st",
        "st_kickoff",
        "st_punt",
        "st_fg",
        "endofdrive",
        "twopoint",
        "turnover",
        "rush_attempt",
        "rush_result",
        "dropback",
        "pass_attempt",
        "yards_gained",
        "ppa",
        "TFL",
        "sack",
        "completion",
        "success",
        "success_yards",
        "explosive",
        "explosive_yards",
        "stuff",
        "havoc",
        "thirddown_conversion",
        "fourthdown_conversion",
        "clock_minutes",
        "clock_minutes_in_secs",
        "clock_seconds",
        "time_remaining_after",
        "time_remaining_before",
        "play_duration",
    ]
    allFBSplays = allFBSplays[column_order].copy()
    # Remove non-play rows (timeouts, period markers, ...) and make ppa numeric
    by_play = allFBSplays[~allFBSplays["play_type"].isin(playtype_delete)].copy()
    by_play["ppa"] = pd.to_numeric(by_play["ppa"], errors="coerce")

    # Fumble recoveries are counted as rushes ("Because I have to put them
    # somewhere"). This happens after stuff/success were computed, so those
    # columns are not affected.
    by_play.loc[
        by_play["play_type"].isin(
            ["Fumble Recovery (Own)", "Fumble Recovery (Opponent)"]
        ),
        "rush_attempt",
    ] = 1
    by_play.loc[
        by_play["play_type"].isin(
            ["Fumble Recovery (Own)", "Fumble Recovery (Opponent)"]
        ),
        "rush_result",
    ] = 1
    # update_yards_gained already returns 0 for this play type; kept as a safeguard.
    by_play.loc[
        by_play["play_type"].isin(["Fumble Recovery (Opponent)"]), "yards_gained"
    ] = 0

    end = time.time()
    total_time = end - start
    minutes, seconds = divmod(int(total_time), 60)
    print(
        f"allplays_to_byplay took {minutes} minutes and {seconds} seconds to complete."
    )

    return by_play


# Example usage:
# processed_plays = allplays_to_byplay(plays_data)

# By Play to By Drive

In [42]:
# Constants: play-type groupings used when collapsing plays into drives.

# Play types that mark the end of a game/half rather than a snap;
# byplay_to_bydrive subtracts one from total_plays for drives ending on them.
END_LIST = ["End of Game", "End of Half", "End of Regulation"]
# Kickoff, two-point and extra-point play types; filter_plays removes all three
# groups before drives are summarized.
KICKOFFS_LIST = ["Kickoff", "Kickoff Return (Offense)", "Kickoff Return Touchdown"]
TWOPOINT_LIST = [
    "Two Point Pass",
    "Two Point Rush",
    "Defensive 2pt Conversion",
    "2pt Conversion",
]
EXTRAPOINT_LIST = [
    "Extra Point Good",
    "Extra Point Missed",
    "Extra Point Blocked",
    "Extra Point Blocked Missed",
    "Extra Point Missed Blocked",
]
# Punt and field-goal play types; filter_plays additionally removes these to
# build its "no 4th-down give-ups" frame.
FOURTHDOWNGIVEUP_LIST = [
    "Punt",
    "Blocked Punt",
    "Punt Return Touchdown",
    "Blocked Punt Touchdown",
    "Field Goal Good",
    "Field Goal Missed",
    "Blocked Field Goal",
    "Missed Field Goal Return",
    "Missed Field Goal Return Touchdown",
    "Blocked Field Goal Touchdown",
]
# Offensive touchdown and field-goal-attempt play types.
# NOTE(review): not referenced by the functions visible in this section —
# presumably used by the drive-end helpers further down; verify.
TD_AND_FG_LIST = [
    "Rushing Touchdown",
    "Passing Touchdown",
    "Field Goal Good",
    "Field Goal Missed",
    "Blocked Field Goal",
    "Blocked Field Goal Touchdown",
    "Missed Field Goal Return",
    "Missed Field Goal Return Touchdown",
]

In [44]:
def byplay_to_bydrive(data: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """
    Summarize play-by-play data into one row per drive.

    The plays are preprocessed and filtered, several drive-level frames are
    built from them (identity, start, end, penalties, offense, rush, pass),
    the frames are merged, final statistics are computed, and drives with no
    remaining plays are dropped.

    Args:
        data (pd.DataFrame): Input play-by-play data.
        verbose (bool): If True, print execution time.

    Returns:
        pd.DataFrame: Summarized drive-by-drive data. If any step raises, the
        error is printed and an empty DataFrame is returned instead.
    """
    started_at = time.time()

    # Columns of the returned frame, in output order.
    output_columns = [
        "season",
        "game_id",
        "offense",
        "drive_number",
        "week",
        "home_away",
        "defense",
        "half",
        "quarter",
        "garbage",
        "starting_position",
        "score_condition",
        "available_yards",
        "ending_position",
        "red_zone",
        "eckel",
        "ending_play",
        "score_result",
        "drive_success",
        "points_scored",
        "firstdown_drive",
        "busted_drive",
        "long_drive",
        "total_plays",
        "available_yards_gained",
        "available_yards_rate",
        "total_yards",
        "total_success",
        "success_yards",
        "total_explosive",
        "explosive_yards",
        "total_TFL",
        "havocs",
        "total_ppa",
        "drive_duration",
        "runpass_split",
        "available_yards_pct",
        "ypp",
        "success_rate",
        "explosive_rate",
        "rush_plays",
        "rush_yards",
        "r_successes",
        "r_explosive",
        "rush_attempts",
        "stuffs",
        "rush_ppa",
        "r_play_duration",
        "pass_plays",
        "pass_yards",
        "p_successes",
        "p_explosives",
        "sacks",
        "pass_ppa",
        "p_play_duration",
        "completions",
        "giveaway",
        "TD",
        "FG",
        "o_penalties",
        "o_penalty_yards",
        "d_penalties",
        "d_penalty_yards",
        "first_downs",
        "second_downs",
        "third_downs",
        "thirddown_conversions",
        "fourth_downs",
        "fourth_down_attempts",
        "fourthdown_conversions",
        "thirddown_conversion_rate",
        "fourthdown_conversion_rate",
        "starting_play_number",
        "ending_play_number",
        "starting_time",
        "ending_time",
        "drive_time",
        "plays_per_minute",
    ]

    try:
        plays = preprocess_data(data)

        # Three progressively narrower views of the plays; the middle one
        # (no 4th-down give-ups) is not used directly here.
        drive_plays, _no_giveups, offensive_plays = filter_plays(plays)

        # Build the per-drive pieces (same call order as the pipeline has
        # always used, in case a helper touches its input).
        identity_part = create_basic_drive_df(drive_plays)
        start_part = create_drive_start_df(drive_plays)
        end_part = create_drive_end_df(drive_plays)
        penalty_part = calculate_drive_penalties(plays)
        offense_part = calculate_offensive_stats(offensive_plays)
        rush_part = calculate_rush_stats(offensive_plays)
        pass_part = calculate_pass_stats(offensive_plays)

        merge_order = [
            identity_part,
            start_part,
            offense_part,
            rush_part,
            pass_part,
            end_part,
            penalty_part,
        ]
        drives = calculate_final_stats(merge_drive_dataframes(merge_order))

        # A drive that ends on an end-of-period marker has one non-play row
        # counted in total_plays; take it back out.
        marker_rows = drives["ending_play"].apply(
            lambda play_type: 1 if play_type in END_LIST else 0
        )
        drives["total_plays"] = drives["total_plays"] - marker_rows

        # Keep the output columns and only drives with at least one play
        selected = drives[output_columns]
        has_plays = drives["total_plays"] > 0
        drives = selected[has_plays].copy().reset_index(drop=True)

        if verbose:
            elapsed = time.time() - started_at
            print(
                f"byplay_to_bydrive took {int(elapsed // 60)} minutes and {int(elapsed % 60)} seconds to complete."
            )

        return drives

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return pd.DataFrame()

In [45]:
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Return the plays sorted by drive order with ``ppa`` coerced to numeric.

    The result is a new frame with a fresh 0..n-1 index; values of ``ppa``
    that cannot be parsed become NaN.
    """
    sort_keys = ["season", "game_id", "offense", "drive_number", "play_number"]
    ordered = data.sort_values(by=sort_keys).reset_index(drop=True)
    return ordered.assign(ppa=pd.to_numeric(ordered["ppa"], errors="coerce"))


def filter_plays(all_plays: pd.DataFrame) -> tuple:
    """Derive three progressively narrower play tables from ``all_plays``.

    Returns a 3-tuple of independent frames, each sorted by season, game,
    offense, drive number and play number with a fresh 0..n-1 index:

    1. plays whose ``play_type`` is not in KICKOFFS_LIST, TWOPOINT_LIST or
       EXTRAPOINT_LIST;
    2. the plays from (1) whose ``play_type`` is not in FOURTHDOWNGIVEUP_LIST;
    3. the plays from (2) where ``penalty`` equals 0.
    """
    sort_keys = ["season", "game_id", "offense", "drive_number", "play_number"]

    def _without_play_types(frame: pd.DataFrame, play_types: list) -> pd.DataFrame:
        # Keep only rows whose play_type is absent from the given list.
        keep = ~frame["play_type"].isin(play_types)
        return frame[keep].copy().reset_index(drop=True)

    def _ordered(frame: pd.DataFrame) -> pd.DataFrame:
        # Canonical row order plus a rebuilt positional index.
        return frame.sort_values(by=sort_keys).reset_index(drop=True)

    no_kicksandtwopoints = _without_play_types(
        all_plays, KICKOFFS_LIST + TWOPOINT_LIST + EXTRAPOINT_LIST
    )
    no_4thdowngiveups = _without_play_types(
        no_kicksandtwopoints, FOURTHDOWNGIVEUP_LIST
    )
    penalty_free = no_4thdowngiveups["penalty"] == 0
    off_plays = no_4thdowngiveups[penalty_free].copy().reset_index(drop=True)

    return tuple(
        _ordered(frame)
        for frame in (no_kicksandtwopoints, no_4thdowngiveups, off_plays)
    )

In [46]:
def create_basic_drive_df(no_kicksandtwopoints: pd.DataFrame) -> pd.DataFrame:
    """Return the distinct drive-identifying rows found in the play table.

    Only the season, game, offense, drive number, week, home/away and defense
    columns are kept; repeated combinations are collapsed to their first
    occurrence, which keeps its original index label.
    """
    identity_columns = [
        "season",
        "game_id",
        "offense",
        "drive_number",
        "week",
        "home_away",
        "defense",
    ]
    drive_identity = no_kicksandtwopoints.loc[:, identity_columns]
    return drive_identity.drop_duplicates()

In [47]:
def create_drive_start_df(no_kicksandtwopoints: pd.DataFrame) -> pd.DataFrame:
    """Summarise the opening state of every drive.

    For each (season, game_id, offense, drive_number) group the first value of
    the quarter, half, yards-to-goal, relative score, play number and
    time-remaining-before columns is taken, in the row order of the input.
    Four of those are exposed under drive-level names (``starting_position``,
    ``score_condition``, ``starting_play_number``, ``starting_time``) and an
    ``available_yards`` column is appended.
    """
    drive_keys = ["season", "game_id", "offense", "drive_number"]
    per_drive = no_kicksandtwopoints.groupby(drive_keys)
    opening = per_drive.agg(
        quarter=("quarter", "first"),
        half=("half", "first"),
        starting_position=("yards_to_goal", "first"),
        score_condition=("relative_score", "first"),
        starting_play_number=("play_number", "first"),
        starting_time=("time_remaining_before", "first"),
    ).reset_index()

    # NOTE(review): available_yards is 100 minus the starting yards_to_goal.
    # If yards_to_goal is the distance to the opponent's end zone, this is the
    # distance already behind the offense rather than the yards left to gain -
    # confirm the intended definition against the upstream data.
    return opening.assign(available_yards=100 - opening["starting_position"])

In [48]:
def create_drive_end_df(no_kicksandtwopoints: pd.DataFrame) -> pd.DataFrame:
    """Summarise how every drive finished.

    For each (season, game_id, offense, drive_number) group the last play
    type, yards-to-goal, turnover flag, play number and time-remaining-after
    are taken, together with the maximum of the ``red_zone`` and ``eckel``
    flags.  Three indicator columns are then derived from the final play type:

    * ``TD`` - 1 when the drive ended on "Passing Touchdown" or
      "Rushing Touchdown";
    * ``FG`` - 1 when it ended on "Field Goal Good";
    * ``score_result`` - 1 when either of the above is set.

    Drives flagged as a touchdown have ``red_zone`` and ``eckel`` forced to 1.
    """
    drive_keys = ["season", "game_id", "offense", "drive_number"]
    closing = (
        no_kicksandtwopoints.groupby(drive_keys)
        .agg(
            ending_play=("play_type", "last"),
            ending_position=("yards_to_goal", "last"),
            giveaway=("turnover", "last"),
            red_zone=("red_zone", "max"),
            eckel=("eckel", "max"),
            ending_play_number=("play_number", "last"),
            ending_time=("time_remaining_after", "last"),
        )
        .reset_index()
    )

    touchdown_plays = ["Passing Touchdown", "Rushing Touchdown"]
    ended_in_td = closing["ending_play"].isin(touchdown_plays)
    ended_in_fg = closing["ending_play"] == "Field Goal Good"

    closing["TD"] = ended_in_td.astype(int)
    closing["FG"] = ended_in_fg.astype(int)
    closing["score_result"] = (ended_in_td | ended_in_fg).astype(int)

    # A touchdown drive counts as a red-zone and eckel drive regardless of
    # what the per-play flags recorded.
    closing.loc[ended_in_td, ["red_zone", "eckel"]] = 1

    return closing

In [53]:
def calculate_drive_penalties(all_plays: pd.DataFrame) -> pd.DataFrame:
    """Count penalties and penalty yardage per drive, split by side.

    Plays flagged with ``offensive_penalty == 1`` feed the ``o_penalties`` /
    ``o_penalty_yards`` columns and plays flagged with
    ``defensive_penalty == 1`` feed ``d_penalties`` / ``d_penalty_yards``.
    The yardage is the absolute value of the summed ``yards_gained`` of the
    flagged plays.  The two summaries are outer-joined on the drive keys, so a
    drive with only one kind of penalty gets 0 for the other.
    """
    drive_keys = ["season", "game_id", "offense", "drive_number"]

    def _summarise(plays: pd.DataFrame, penalty_type: str) -> pd.DataFrame:
        # The first letter of the penalty type ("o"/"d") prefixes the outputs.
        prefix = penalty_type[0]
        flagged = plays.loc[plays[f"{penalty_type}_penalty"] == 1]
        summary = (
            flagged.groupby(drive_keys)
            .agg(
                {
                    "play_number": "count",
                    "yards_gained": lambda yards: abs(yards.sum()),
                }
            )
            .reset_index()
        )
        summary.columns = drive_keys + [
            f"{prefix}_penalties",
            f"{prefix}_penalty_yards",
        ]
        return summary

    offensive = _summarise(all_plays, "offensive")
    defensive = _summarise(all_plays, "defensive")

    return offensive.merge(defensive, on=drive_keys, how="outer").fillna(0)

In [49]:
def calculate_offensive_stats(off_plays: pd.DataFrame) -> pd.DataFrame:
    """Aggregate play-level offensive numbers into one row per drive.

    Per-down indicator columns (``first_down`` .. ``fourth_down``) and a
    ``fourth_down_attempt`` indicator (a fourth-down play whose ``play_type``
    is not in FOURTHDOWNGIVEUP_LIST) are derived from ``down`` and
    ``play_type``.  They are added to an internal copy, so the caller's
    ``off_plays`` frame is left unchanged.

    Args:
        off_plays: Play-level frame containing the drive keys (``season``,
            ``game_id``, ``offense``, ``drive_number``) plus ``down``,
            ``play_type``, ``play_number``, ``yards_gained``, ``success``,
            ``success_yards``, ``explosive``, ``explosive_yards``, ``TFL``,
            ``havoc``, ``ppa``, ``play_duration``, ``thirddown_conversion``
            and ``fourthdown_conversion``.

    Returns:
        One row per drive with the drive keys followed by ``total_plays``
        (count of ``play_number``) and the per-drive sums: ``total_yards``,
        ``total_success``, ``success_yards``, ``total_explosive``,
        ``explosive_yards``, ``total_TFL``, ``havocs``, ``total_ppa``,
        ``drive_duration``, ``first_downs``, ``second_downs``,
        ``third_downs``, ``thirddown_conversions``, ``fourth_downs``,
        ``fourth_down_attempts`` and ``fourthdown_conversions``.
    """
    drive_keys = ["season", "game_id", "offense", "drive_number"]
    down_labels = ("first", "second", "third", "fourth")

    # Build the indicator columns separately and attach them with assign(),
    # which returns a new frame instead of writing into the caller's data.
    indicators = {
        f"{label}_down": (off_plays["down"] == number).astype(int)
        for number, label in enumerate(down_labels, start=1)
    }
    indicators["fourth_down_attempt"] = (
        (off_plays["down"] == 4)
        & (~off_plays["play_type"].isin(FOURTHDOWNGIVEUP_LIST))
    ).astype(int)
    plays = off_plays.assign(**indicators)

    # Named aggregation ties each output column to its source column and
    # function explicitly, rather than relabelling the result by position.
    by_drive_off = (
        plays.groupby(drive_keys)
        .agg(
            total_plays=("play_number", "count"),
            total_yards=("yards_gained", "sum"),
            total_success=("success", "sum"),
            success_yards=("success_yards", "sum"),
            total_explosive=("explosive", "sum"),
            explosive_yards=("explosive_yards", "sum"),
            total_TFL=("TFL", "sum"),
            havocs=("havoc", "sum"),
            total_ppa=("ppa", "sum"),
            drive_duration=("play_duration", "sum"),
            first_downs=("first_down", "sum"),
            second_downs=("second_down", "sum"),
            third_downs=("third_down", "sum"),
            thirddown_conversions=("thirddown_conversion", "sum"),
            fourth_downs=("fourth_down", "sum"),
            fourth_down_attempts=("fourth_down_attempt", "sum"),
            fourthdown_conversions=("fourthdown_conversion", "sum"),
        )
        .reset_index()
    )

    return by_drive_off

In [50]:
def calculate_rush_stats(off_plays: pd.DataFrame) -> pd.DataFrame:
    """Aggregate rushing plays into one row per drive.

    Only plays with ``rush_attempt == 1`` are considered.  The result holds
    the drive keys followed by ``rush_plays`` (count of ``play_number``) and
    the per-drive sums ``rush_yards``, ``r_successes``, ``r_explosive``,
    ``rush_attempts``, ``stuffs``, ``rush_ppa`` and ``r_play_duration``.
    Drives without a rushing play do not appear in the result.
    """
    drive_keys = ["season", "game_id", "offense", "drive_number"]
    rushing_plays = off_plays.loc[off_plays["rush_attempt"] == 1]

    return (
        rushing_plays.groupby(drive_keys)
        .agg(
            rush_plays=("play_number", "count"),
            rush_yards=("yards_gained", "sum"),
            r_successes=("success", "sum"),
            r_explosive=("explosive", "sum"),
            rush_attempts=("rush_attempt", "sum"),
            stuffs=("stuff", "sum"),
            rush_ppa=("ppa", "sum"),
            r_play_duration=("play_duration", "sum"),
        )
        .reset_index()
    )

In [51]:
def calculate_pass_stats(off_plays: pd.DataFrame) -> pd.DataFrame:
    """Aggregate dropback plays into one row per drive.

    Only plays with ``dropback == 1`` are considered.  The result holds the
    drive keys followed by ``pass_plays`` (count of ``play_number``) and the
    per-drive sums ``pass_yards``, ``p_successes``, ``p_explosives``,
    ``sacks``, ``pass_ppa``, ``p_play_duration`` and ``completions``.
    Drives without a dropback do not appear in the result.
    """
    drive_keys = ["season", "game_id", "offense", "drive_number"]

    # source column -> (aggregation, name of the drive-level output column)
    pass_spec = {
        "play_number": ("count", "pass_plays"),
        "yards_gained": ("sum", "pass_yards"),
        "success": ("sum", "p_successes"),
        "explosive": ("sum", "p_explosives"),
        "sack": ("sum", "sacks"),
        "ppa": ("sum", "pass_ppa"),
        "play_duration": ("sum", "p_play_duration"),
        "completion": ("sum", "completions"),
    }
    how_by_source = {source: how for source, (how, _) in pass_spec.items()}
    label_by_source = {source: label for source, (_, label) in pass_spec.items()}

    dropbacks = off_plays[off_plays["dropback"] == 1]
    totals = dropbacks.groupby(drive_keys).agg(how_by_source)
    totals = totals.rename(columns=label_by_source)

    return totals.reset_index()

In [52]:
def merge_drive_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
    """Left-join every drive-level frame onto the first one.

    The first frame in ``dataframes`` defines the set of drives; each later
    frame is joined on (season, game_id, offense, drive_number).  After the
    joins, missing values in the penalty, rushing, passing, giveaway and
    rush/pass play-duration columns are replaced with 0.
    """
    drive_keys = ["season", "game_id", "offense", "drive_number"]

    merged = dataframes[0]
    for extra in dataframes[1:]:
        merged = merged.merge(extra, on=drive_keys, how="left")

    # Columns where a missing value means "none recorded on this drive".
    zero_when_missing = (
        "o_penalties",
        "o_penalty_yards",
        "d_penalties",
        "d_penalty_yards",
        "rush_plays",
        "rush_yards",
        "r_successes",
        "r_explosive",
        "rush_attempts",
        "stuffs",
        "pass_plays",
        "pass_yards",
        "p_successes",
        "p_explosives",
        "sacks",
        "completions",
        "giveaway",
        "r_play_duration",
        "p_play_duration",
    )
    for column in zero_when_missing:
        merged[column] = merged[column].fillna(0)

    return merged

In [56]:
def calculate_final_stats(by_drive: pd.DataFrame) -> pd.DataFrame:
    """Add derived drive-level metrics to the merged drive table.

    ``by_drive`` is modified in place and also returned.

    Columns added:
        * ``garbage`` - 1 when any of the garbage-time rules below matches.
        * ``available_yards_gained`` - total yards minus offensive penalty
          yards plus defensive penalty yards.
        * ``available_yards_rate`` / ``available_yards_pct`` - available yards
          gained divided by available yards; NaN when available yards is not
          positive.
        * ``drive_success`` - 1 when the ending play is in TD_AND_FG_LIST.
        * ``points_scored`` - 7 for a TD drive, 3 for a FG drive, otherwise 0.
        * ``firstdown_drive`` - 1 for a TD drive or when ``first_downs`` >= 1.
        * ``long_drive`` - 1 when the drive has at least 7 plays.
        * ``busted_drive`` - 1 when total yards is negative.
        * ``runpass_split``, ``ypp``, ``success_rate``, ``explosive_rate`` -
          per-play ratios; NaN when the drive has no plays.
        * ``thirddown_conversion_rate`` / ``fourthdown_conversion_rate`` -
          conversions divided by third/fourth downs; NaN when there were none.
        * ``drive_time`` - starting time minus ending time.
        * ``plays_per_minute`` - plays divided by ``drive_time / 60``; 0 when
          ``drive_time`` is not positive.

    Finally every numeric column in the frame (identifier columns included)
    is cast to float.
    """

    def _ratio_or_nan(numerator: pd.Series, denominator: pd.Series) -> np.ndarray:
        """Return numerator / denominator, NaN wherever denominator is not > 0."""
        usable = denominator > 0
        # Masking the denominator first means no division by zero is evaluated.
        return np.where(usable, numerator / denominator.where(usable), np.nan)

    # --- Garbage-time flag -------------------------------------------------
    score_gap = by_drive["score_condition"].abs()
    ended_game = by_drive["ending_play"] == "End of Game"
    no_score = by_drive["score_result"] == 0

    garbage_rules = [
        # First-half drive of at most two plays that ran into the end of the half.
        (by_drive["half"] == 1)
        & (by_drive["total_plays"] <= 2)
        & (by_drive["ending_play"] == "End of Half"),
        # Short, scoreless, turnover-free drive that ended a tied game.
        ended_game
        & (by_drive["score_condition"] == 0)
        & (by_drive["total_plays"] < 4)
        & no_score
        & (by_drive["giveaway"] == 0),
        # Third-quarter drive that started with a margin of 32 or more.
        (by_drive["quarter"] == 3) & (score_gap >= 32),
        # Fourth-quarter drive that started with a margin of 24 or more.
        (by_drive["quarter"] == 4) & (score_gap >= 24),
        # Scoreless game-ending drive that started with a margin above 8.
        ended_game & (score_gap > 8) & no_score,
    ]
    by_drive["garbage"] = np.any(garbage_rules, axis=0).astype(int)

    # --- Yardage and outcome indicators ------------------------------------
    by_drive["available_yards_gained"] = (
        by_drive["total_yards"]
        - by_drive["o_penalty_yards"]
        + by_drive["d_penalty_yards"]
    )
    # Guarded like available_yards_pct below, so a drive with zero available
    # yards yields NaN rather than inf.
    by_drive["available_yards_rate"] = _ratio_or_nan(
        by_drive["available_yards_gained"], by_drive["available_yards"]
    )
    by_drive["drive_success"] = by_drive["ending_play"].isin(TD_AND_FG_LIST).astype(int)
    by_drive["points_scored"] = np.where(
        by_drive["TD"] == 1, 7, np.where(by_drive["FG"] == 1, 3, 0)
    )
    by_drive["firstdown_drive"] = (
        (by_drive["TD"] == 1) | (by_drive["first_downs"] >= 1)
    ).astype(int)
    by_drive["long_drive"] = (by_drive["total_plays"] >= 7).astype(int)
    by_drive["busted_drive"] = (by_drive["total_yards"] < 0).astype(int)

    # --- Ratios (NaN when the denominator is missing or not positive) -------
    by_drive["runpass_split"] = _ratio_or_nan(
        by_drive["rush_plays"], by_drive["total_plays"]
    )
    by_drive["available_yards_pct"] = _ratio_or_nan(
        by_drive["available_yards_gained"], by_drive["available_yards"]
    )
    by_drive["ypp"] = _ratio_or_nan(by_drive["total_yards"], by_drive["total_plays"])
    by_drive["success_rate"] = _ratio_or_nan(
        by_drive["total_success"], by_drive["total_plays"]
    )
    by_drive["explosive_rate"] = _ratio_or_nan(
        by_drive["total_explosive"], by_drive["total_plays"]
    )
    by_drive["thirddown_conversion_rate"] = _ratio_or_nan(
        by_drive["thirddown_conversions"], by_drive["third_downs"]
    )
    by_drive["fourthdown_conversion_rate"] = _ratio_or_nan(
        by_drive["fourthdown_conversions"], by_drive["fourth_downs"]
    )

    # --- Tempo --------------------------------------------------------------
    # drive_time is divided by 60 to express the pace per minute, which
    # presumes the time columns are in seconds - confirm against the source.
    by_drive["drive_time"] = by_drive["starting_time"] - by_drive["ending_time"]
    by_drive["plays_per_minute"] = np.where(
        by_drive["drive_time"] > 0,
        by_drive["total_plays"] / (by_drive["drive_time"] / 60),
        0,
    )

    # Ensure all numeric columns are float
    numeric_columns = by_drive.select_dtypes(include=[np.number]).columns
    by_drive[numeric_columns] = by_drive[numeric_columns].astype(float)

    return by_drive