# Data Collection

This notebook collects data from [Basketball Reference](https://basketball-reference.com).

For each game in each configured season, it writes a row to a CSV containing the team, its offensive stats, its defensive stats (the opposing team's box-score totals), whether it played at home or away, and the outcome of the game.

Because each game involves two teams, two rows are written per game, one from each team's perspective.

The data from this CSV will be used to train a machine learning model.


# Imports


In [1]:
import datetime
import os
import time
from dataclasses import dataclass

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


# Helper Functions


In [2]:
GAME_LINKS_URL_TEMPLATE = (
    "https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html"
)


def get_game_links(year: str, month: str) -> list[str]:
    """
    Collect the identifier of every game linked from a monthly schedule page

    Args:
        year (str): The season year inserted into the schedule URL
        month (str): The month name inserted into the schedule URL (e.g. "october")

    Returns:
        list[str]: One identifier per box score link found on the page, i.e.
            the link target with "/boxscores/" and ".html" removed
    """
    schedule_page = requests.get(
        GAME_LINKS_URL_TEMPLATE.format(year=year, month=month)
    )
    soup = BeautifulSoup(schedule_page.text, "html.parser")

    # pause after every request so the site does not rate limit us
    time.sleep(5)

    game_links = []

    for cell in soup.find_all("td", attrs={"data-stat": "box_score_text"}):
        anchor = cell.find("a")

        # a cell without an anchor carries no box score link
        if not anchor:
            continue

        game_links.append(
            anchor["href"].replace("/boxscores/", "").replace(".html", "")
        )

    return game_links

In [3]:
def get_game_date(game_link: str) -> datetime.date:
    """
    Parse the date encoded at the start of a game link

    A game link looks like this:
    202110190MIL

    Characters 0-3 hold the year, characters 4-5 the month and characters 6-7
    the day. Everything after the eighth character is ignored.

    Args:
        game_link (str): The link to the game

    Returns:
        datetime.date: The date of the game
    """
    # (start, stop) slice bounds of the year, month and day fields
    field_bounds = ((0, 4), (4, 6), (6, 8))

    year, month, day = (int(game_link[start:stop]) for start, stop in field_bounds)

    return datetime.date(year, month, day)

In [4]:
@dataclass
class TeamsAndPoints:
    """
    The two teams of a single game together with the points each one scored
    """

    # name of the away team
    away_team: str
    # points scored by the away team
    away_team_points: int
    # name of the home team
    home_team: str
    # points scored by the home team
    home_team_points: int


def get_teams_and_points(soup: BeautifulSoup) -> TeamsAndPoints | None:
    """
    Get the away team, away team points, home team, and home team points for a given game

    The values are read from the page's "og:description" meta tag. The text
    before the first period is expected to consist of five whitespace
    separated parts: away team, "(away points)", a separator word, home team,
    "(home points)".

    Args:
        soup (BeautifulSoup): The soup of the game page

    Returns:
        TeamsAndPoints | None: The away team, away team points, home team, and
            home team points, or None when the page has no og:description
            meta tag or the tag has no content

    Raises:
        ValueError: If the description does not consist of exactly five parts
            or the points are not integers
    """
    meta_tag = soup.find("meta", {"property": "og:description"})

    if not meta_tag:
        return None

    description = meta_tag.get("content")

    # a meta tag without (or with empty) content carries no score line; treat
    # it like a missing tag instead of failing on None.split
    if not description:
        return None

    # the score line is everything up to the first period
    score_line = description.split(".")[0]

    parts = score_line.split()

    if len(parts) != 5:
        raise ValueError(f"Unexpected score line format: {score_line!r}")

    away_team, away_points_text, _, home_team, home_points_text = parts

    # the points are in the format "(X)"; strip only the parentheses so digits
    # are never cut off when they are missing
    away_team_points = int(away_points_text.strip("()"))
    home_team_points = int(home_points_text.strip("()"))

    return TeamsAndPoints(away_team, away_team_points, home_team, home_team_points)


In [5]:
def get_game_stats_for_team(soup: BeautifulSoup, team: str) -> pd.DataFrame:
    """
    Build a one-row dataframe with a team's totals for a single game

    Args:
        soup (BeautifulSoup): The soup of the game page
        team (str): The team name used in the id of the team's basic box score table

    Returns:
        pd.DataFrame: A single row whose columns are the upper-cased
            "data-stat" names of the table footer cells, without GAME_SCORE,
            PLUS_MINUS, MP and PTS
    """
    box_score = soup.find("table", attrs={"id": f"box-{team}-game-basic"})

    # the footer row holds the team totals rather than per-player lines
    total_cells = box_score.find("tfoot").find_all("td")

    values = [cell.text for cell in total_cells]
    labels = [cell["data-stat"].upper() for cell in total_cells]

    totals = pd.DataFrame(values, index=labels).T

    # these columns are not wanted in the team-level statistics
    return totals.drop(columns=["GAME_SCORE", "PLUS_MINUS", "MP", "PTS"])


In [6]:
def add_opposing_team_stats(
    team_stats: pd.DataFrame, opposing_team_stats: pd.DataFrame
) -> pd.DataFrame:
    """
    Add the statistics for the opposing team to the given team's statistics

    Neither input dataframe is modified.

    Args:
        team_stats (pd.DataFrame): The statistics for the team
        opposing_team_stats (pd.DataFrame): The statistics for the opposing team

    Returns:
        pd.DataFrame: A dataframe that has the team's stats followed by the
            opposing team's stats, whose column names are suffixed with "_OPP"
    """
    # add_suffix returns a new dataframe, so the caller's frame keeps its
    # original column names
    renamed_opposing_stats = opposing_team_stats.add_suffix("_OPP")

    return pd.concat([team_stats, renamed_opposing_stats], axis=1)


In [7]:
GAME_DATA_URL_TEMPLATE = "https://www.basketball-reference.com/boxscores/{game}.html"


def get_game_data(game: str, season: str, index: int) -> pd.DataFrame | None:
    """
    Download one game page and turn it into two rows, one per team

    Args:
        game (str): The game to get the data for
        season (str): The season stored in the SEASON column of both rows
        index (int): Position of the game in the caller's list; a progress
            line is printed for every tenth game

    Returns:
        pd.DataFrame | None: The away team's row followed by the home team's
            row, or None when the teams and points could not be read
    """
    page = requests.get(GAME_DATA_URL_TEMPLATE.format(game=game))
    soup = BeautifulSoup(page.text, "html.parser")

    # pause after every request so the site does not rate limit us
    time.sleep(5)

    date = get_game_date(game)

    teams_and_points = get_teams_and_points(soup)
    if not teams_and_points:
        return None

    away_team = teams_and_points.away_team
    home_team = teams_and_points.home_team
    away_points = teams_and_points.away_team_points
    home_points = teams_and_points.home_team_points

    score_line_columns = [
        "SEASON",
        "DATE",
        "HOME/AWAY",
        "TEAM",
        "PTS",
        "TEAM_OPP",
        "PTS_OPP",
        "WIN/LOSS",
    ]

    def build_score_line(
        side: str, team: str, points: int, opponent: str, opponent_points: int
    ) -> pd.DataFrame:
        # one-row frame describing the game from a single team's perspective
        values = [
            season,
            date,
            side,
            team,
            points,
            opponent,
            opponent_points,
            "WIN" if points > opponent_points else "LOSS",
        ]
        return pd.DataFrame(values, index=score_line_columns).T

    away_score_line = build_score_line(
        "AWAY", away_team, away_points, home_team, home_points
    )
    home_score_line = build_score_line(
        "HOME", home_team, home_points, away_team, away_points
    )

    away_stats = get_game_stats_for_team(soup, away_team)
    home_stats = get_game_stats_for_team(soup, home_team)

    # hand over copies so each team's own frame is left untouched
    away_row = pd.concat(
        [
            away_score_line,
            add_opposing_team_stats(away_stats, home_stats.copy(deep=True)),
        ],
        axis=1,
    )
    home_row = pd.concat(
        [
            home_score_line,
            add_opposing_team_stats(home_stats, away_stats.copy(deep=True)),
        ],
        axis=1,
    )

    game_data = pd.concat([away_row, home_row], axis=0)

    # report progress on every tenth game only
    if index % 10 == 0:
        print(f"Game {game} on {date}: {home_team} vs {away_team}")

    return game_data


In [8]:
def get_game_data_for_month(season: str, month: str) -> None:
    """
    Get the data for all games in a given month and season and save it as a CSV

    The file is written to scraped_data/season_data/{season}_{month}.csv.
    Nothing is written when the month has no game links or when no game
    produced any data.

    Args:
        season (str): The season to get the data for
        month (str): The month of the season to get the data for
    """
    game_links = get_game_links(season, month)

    if not game_links:
        return

    monthly_data = [
        get_game_data(game, season, index) for index, game in enumerate(game_links)
    ]

    # games that could not be read come back as None
    monthly_data = [data for data in monthly_data if data is not None]

    # pd.concat raises on an empty list, so stop here if every game was skipped
    if not monthly_data:
        return

    monthly_data = pd.concat(monthly_data, axis=0)

    output_directory = "scraped_data/season_data"

    # make sure the target directory exists before writing into it
    os.makedirs(output_directory, exist_ok=True)

    file_name = f"{season}_{month}.csv"
    file_path = f"{output_directory}/{file_name}"

    monthly_data.to_csv(file_path)


In [9]:
# Month names used in the schedule URLs and in the monthly CSV file names.
# NOTE(review): the list ends at May; if the site also publishes later month
# pages for a season, those games are not collected - confirm this is intended.
MONTHS = [
    "october",
    "november",
    "december",
    "january",
    "february",
    "march",
    "april",
    "may",
]


def get_games_for_season(season: str) -> None:
    """
    Collect the game data of one season, one month at a time

    Args:
        season (str): The season to get the data for
    """
    for month_name in MONTHS:
        get_game_data_for_month(season, month_name)


In [10]:
SEASONS = [
    "2019",
    "2020",
    "2021",
    "2022",
    "2023",
    "2024",
]


def get_games_for_multiple_seasons() -> None:
    """
    Get the data for all games in a given season

    Args:
        seasons (list[str]): The seasons to get the data for
    """
    for season in SEASONS:
        get_games_for_season(season)


In [11]:
def combine_csv_files(csv_file_paths: list[str], output_file_path: str) -> pd.DataFrame:
    """
    Combine multiple CSV files into a single CSV file

    The first column of every input file is read as its index and discarded;
    the combined rows are renumbered from 0.

    Args:
        csv_file_paths (list[str]): The paths to the CSV files to combine
        output_file_path (str): The path to the output CSV file

    Returns:
        pd.DataFrame: The combined data that was written to the output file

    Raises:
        ValueError: If csv_file_paths is empty (raised by pd.concat)
    """
    csv_df_list = [pd.read_csv(file, index_col=0) for file in csv_file_paths]

    # ignore_index renumbers the rows instead of keeping each file's own index
    csv_df = pd.concat(csv_df_list, axis=0, ignore_index=True)

    # make sure the target directory exists before writing into it
    output_directory = os.path.dirname(output_file_path)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    csv_df.to_csv(output_file_path)

    return csv_df


In [12]:
def congregate_season_data(season: str) -> None:
    """
    Congregate the monthly CSV files of a given season into one CSV file

    Months without a CSV file are skipped. Nothing is written when the season
    has no monthly files at all.

    Args:
        season (str): The season to combine the data for
    """
    files = [f"scraped_data/season_data/{season}_{month}.csv" for month in MONTHS]

    files = [file for file in files if os.path.exists(file)]

    # combining an empty list would raise inside pd.concat
    if not files:
        return

    combine_csv_files(files, f"scraped_data/combined_season_data/{season}.csv")


def congregate_all_season_data() -> None:
    """
    Build the combined per-season CSV for every season in SEASONS
    """
    for season_year in SEASONS:
        congregate_season_data(season_year)

In [13]:
def combine_all_seasons_into_one_csv() -> None:
    """
    Combine all seasons into a single CSV file

    Seasons without a combined CSV file are skipped. Nothing is written when
    no season file exists.
    """
    files = [f"scraped_data/combined_season_data/{season}.csv" for season in SEASONS]

    # skip seasons that produced no file, as is done for the monthly files
    files = [file for file in files if os.path.exists(file)]

    # combining an empty list would raise inside pd.concat
    if not files:
        return

    combine_csv_files(files, "scraped_data/combined_historic_data/data.csv")


# Runner Code


In [None]:
# step 1: scrape every season and write one CSV per season and month
get_games_for_multiple_seasons()

# step 2: merge the monthly CSV files into one CSV per season
congregate_all_season_data()

# step 3: merge the per-season CSV files into a single CSV
combine_all_seasons_into_one_csv()