# Imports

In [3]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from datetime import datetime
import pandas as pd
from tqdm import tqdm
import random
import time
from collections import deque
import sys
import os

# Format url (function)

In [4]:
def create_schedule_url(year: str, league: str, m_link: str = None) -> str:
    """
    Create a basketball-reference URL for the basketball schedule page.

    Parameters:
    year (str): The season year.
    league (str): The basketball league.
    m_link (str, optional): The full ending link, specific for a month.

    Returns:
    (str): A basketball-reference URL pointing to the desired basketball schedule page.
    """
    if m_link == None:
        url = f"https://www.basketball-reference.com/leagues/{league}_{year}_games.html"
    else:
        url = f"https://www.basketball-reference.com{m_link}"

    return url

# Request HTML (function)

In [5]:
# Monotonic timestamps of the most recent requests, oldest first.
_request_times = deque()


def get_request_soup(url: str, timeout: float = 30.0) -> BeautifulSoup | None:
    """
    Sends a GET request to a URL and returns a BeautifulSoup object. Throttles itself so that
    basketball-reference does not answer with a rate limited response (429).

    Parameters:
    url (str): A URL pointing to the desired page.
    timeout (float, optional): Seconds to wait for the server before giving up. Without a
        timeout a stalled connection would block the scrape indefinitely.

    Returns:
    BeautifulSoup | None: Parsed HTML content of the requested page, or None when the
        request failed (connection error, timeout, HTTP error status, ...).

    Exceptions:
    Terminates the entire python script (SystemExit) if the response status code is 429.
    Any other request failure, including an HTTP error status, is printed and reported
    to the caller by returning None.
    """
    ## !!! Bot Limit: 20 requests per min !!!
    # The deque is only mutated in place, so no `global` declaration is needed.

    # Delete timestamps older than a minute
    a_minute_ago = time.monotonic() - 60
    while _request_times and _request_times[0] < a_minute_ago:
        _request_times.popleft()

    # Stay below the limit: allow at most 15 requests per rolling minute
    if len(_request_times) >= 15:
        oldest_request = _request_times[0]
        # Time left until the oldest request falls out of the 60 second window
        sleep_time = oldest_request - a_minute_ago
        if sleep_time > 0:
            print(f"Too many requests. Pausing for: {sleep_time:.2f}")
            time.sleep(sleep_time)

    # Request HTML
    try:
        # Random pause between requests on top of the rolling-window limit
        wait_time = random.uniform(3, 5)
        time.sleep(wait_time)
        response = requests.get(url, timeout=timeout)
        _request_times.append(time.monotonic())

        # Check response
        if response.status_code == 429:
            print("Too many requests (response code 429) - You are in jail for an hour :(")
            print("Saving collected data into dataframe named df")
            # SystemExit is not a RequestException, so it passes through the handlers below.
            sys.exit()
        response.raise_for_status()

        return BeautifulSoup(response.text, "html.parser")

    except requests.exceptions.ConnectionError:
        print("Failed to connect to basketball-reference site")
    except requests.exceptions.Timeout:
        print("The request timed out")
    except requests.exceptions.RequestException as e:
        # Also covers the HTTPError raised by raise_for_status()
        print(f"An error occured: {e}")

    return None

# Get starting dates of playoffs (function)

In [None]:
def get_starting_dates_of_playoffs() -> dict:
    """
    Collects, for each of the major american basketball leagues (NBA, ABA, BAA), the date on
    which the playoffs started in every season listed on basketball-reference's playoff
    series page. The seasons found here also determine which seasons get scraped.

    Returns:
    dict: A nested dictionary where:
        key (str): The league shortcut.
        nested key (str): The season year.
        value (datetime): The earliest series start date found for that season.

    Notes:
    The page is fetched through the helper `get_request_soup(url)`.
    """
    series_page = get_request_soup(
        "https://www.basketball-reference.com/playoffs/series.html"
    )
    series_rows = (
        series_page.find("table", id="playoffs_series").find("tbody").find_all("tr")
    )

    header_classes = ("thead", "overheader")
    earliest_start = {"NBA": {}, "ABA": {}, "BAA": {}}

    for row in series_rows:
        # Rows carrying a `csk` attribute are treated as not yet finished series
        if row.has_attr("csk"):
            continue

        # Repeated header rows inside the table body carry no series data
        if any(name in header_classes for name in row.get("class", [])):
            continue

        season = row.find("th", {"data-stat": "season"}).text.strip()
        lg = row.find("td", {"data-stat": "lg"}).text.strip()
        date_range = row.find("td", {"data-stat": "date_range"}).text.strip()

        # Only the first six characters (month + day of the first game) are needed
        first_game = datetime.strptime(f"{date_range[:6]} {season}", "%b %d %Y")

        # Keep a running minimum per league and season
        league_dates = earliest_start[lg]
        if season not in league_dates or first_game < league_dates[season]:
            league_dates[season] = first_game

    return earliest_start

# Get match data for a season (multiple functions)

In [7]:
def get_months_of_games_in_season(soup: Tag) -> list:
    """
    Collects the month-specific schedule links from a season's schedule page.

    Parameters:
    soup (bs4.BeautifulSoup): A parsed BeautifulSoup object containing the HTML of the basketball schedule page.

    Returns:
    list of str: The `href` of the first link found in every <div> nested inside the
    page's "filter" <div>, in document order.
    """
    month_filter = soup.find("div", class_="filter")
    return [entry.find("a")["href"] for entry in month_filter.find_all("div")]

In [8]:
def parse_data_point(key: str, text: str) -> datetime | int | str:
    """
    Parses match data point based on its key.

    Parameters:
    key (str): Name of the data point.
    text (str): Text content of the data point

    Returns:
    (datetime): If the key is 'date'.
    (int): If the key is 'visitor_pts', 'home_pts' or 'overtime' (representing number of overtimes played).
    (string): For all other keys.
    """
    if key == "date":
        try:
            return datetime.strptime(text, "%a, %b %d, %Y")
        except ValueError:
            print(f"Could not convert {key} with value: ({text}) to datetime.")
            return datetime(2000, 1, 1)

    elif key in ("visitor_pts", "home_pts"):
        try:
            return int(text)
        except ValueError:
            print(f"Could not convert {key} with value: {text} to integer.")
            return 0

    elif key == "overtime":
        if text == "":
            return 0
        elif text == "OT":
            return 1
        else:
            try:
                return int(text[:-2])
            except ValueError:
                print(f"Could not convert {key} with value: {text} to integer by omitting the last two characters.")
                return 0

    else:
        return text

In [9]:
def get_match_data(trow: Tag) -> dict | None:
    """
    Builds a match record from one <tr> HTML element of a schedule table.

    Parameters:
    trow (bs4.element.Tag): A BeautifulSoup <tr> tag representing one row with data points about a single basketball match.

    Returns:
    dict | None:
        dict: A dictionary with the following keys:
            - 'date' (datetime): The date of the game.
            - 'visitor_name' (str): Name of the visiting team.
            - 'visitor_pts' (int): Points scored by the visiting team.
            - 'home_name' (str): Name of the home team.
            - 'home_pts' (int): Points scored by the home team.
            - 'overtime' (int): Number of overtime periods.
            - 'home_win' (bool): Whether the home team scored more points.
            - 'margin_of_victory' (int): Absolute point difference.
        None: If a cell is missing from the row or the match date is today or later.

    Notes:
    Value conversion is delegated to the helper `parse_data_point(key, text)`.
    """
    # (record key, html tag, value of the cell's data-stat attribute); date must come first
    cell_specs = (
        ("date", "th", "date_game"),
        ("visitor_name", "td", "visitor_team_name"),
        ("visitor_pts", "td", "visitor_pts"),
        ("home_name", "td", "home_team_name"),
        ("home_pts", "td", "home_pts"),
        ("overtime", "td", "overtimes"),
    )

    record = {}
    for key, tag, data_stat in cell_specs:
        try:
            raw_text = trow.find(tag, {"data-stat": data_stat}).text.strip()
        except AttributeError:
            print(f"Attribute error for {key} in this trow: \n", trow)
            return None

        value = parse_data_point(key, raw_text)

        # Matches dated today or later are skipped before any score is parsed
        if key == "date" and value.date() >= datetime.now().date():
            return None

        record[key] = value

    # Derived values needed for elo
    point_diff = record["home_pts"] - record["visitor_pts"]
    record["home_win"] = record["home_pts"] > record["visitor_pts"]
    record["margin_of_victory"] = abs(point_diff)

    return record

In [10]:
def get_match_data_for_season(year: str, league: str, playoff_starting_date: datetime) -> list:
    """
    Scrapes all relevant match data for an entire basketball season from basketball-reference.

    Parameters:
    year (str): The season year.
    league (str): The basketball league.
    playoff_starting_date (datetime): The starting date of the playoffs that year for given league.

    Returns:
    list: A list of match dictionaries (for an entire season). Each one is the result of
    `get_match_data` extended with the keys 'postseason', 'season' and 'league'.
    """
    # The season's default schedule page lists the links to every month page
    season_page = get_request_soup(create_schedule_url(year, league))
    month_links = get_months_of_games_in_season(season_page)

    season_games = []
    for link in month_links:
        month_page = get_request_soup(create_schedule_url(year, league, m_link=link))
        schedule_rows = (
            month_page.find("table", id="schedule").find("tbody").find_all("tr")
        )

        for row in schedule_rows:
            # Header rows repeated inside the table body
            if "thead" in row.get("class", []):
                continue

            game = get_match_data(row)
            if game is None:
                # Faulty match or match that hasn't been played yet
                continue

            game.update(
                postseason=playoff_starting_date <= game["date"],
                season=year,
                league=league,
            )
            season_games.append(game)

    return season_games

In [11]:
def calculate_number_of_batches(window_size: int, num_of_years_to_process: int) -> int:
    """
    Calculates how many batches the scraping is split into. Exporting batch by batch keeps
    the progress made so far when network or other errors arise.

    Parameters:
    window_size (int): Number of years each batch should contain.
    num_of_years_to_process (int): Number of years needed to process.

    Returns:
    int: The number of batches the scraping will be split into.
    """
    full_batches, leftover = divmod(num_of_years_to_process, window_size)
    # A partially filled last batch still counts as one batch
    return full_batches + (leftover > 0)

In [None]:
def process_batch(
    batch_year_league: list, batch: int, num_of_batches: int, playoffs_starting_dates: dict
) -> list:
    """
    Scrapes every season of one batch (different years or different leagues) while showing
    a progress bar.

    Parameters:
    batch_year_league (list): List of tuples containing (season year, league name).
    batch (int): Zero-based batch number (displayed as batch + 1).
    num_of_batches (int): Total number of batches.
    playoffs_starting_dates (dict): A nested dictionary with league names (str) as keys containing
    key, value pairs of season year (str) and starting date of playoffs (datetime).

    Returns:
    list: Match data of all seasons in the batch, concatenated in processing order.

    Note:
    Each season is collected by the function `get_match_data_for_season`.
    """
    seasons_with_progress = tqdm(
        batch_year_league, desc=f"Processing batch {batch+1}/{num_of_batches}"
    )

    collected = []
    for year, league in seasons_with_progress:
        playoff_start = playoffs_starting_dates[league][year]
        collected.extend(get_match_data_for_season(year, league, playoff_start))

    return collected

# Run entire code

In [None]:
# Playoff starting dates per league and season; its keys also define what gets scraped
playoffs_starting_dates = get_starting_dates_of_playoffs()

# Flatten into (season year, league) pairs, league by league
years_to_process = []
for league in ("NBA", "ABA", "BAA"):
    for year in playoffs_starting_dates[league]:
        years_to_process.append((year, league))

# Seasons per batch and resulting number of batches
WINDOW_SIZE = 7
num_of_batches = calculate_number_of_batches(WINDOW_SIZE, len(years_to_process))

# Output directory for the per-batch CSV files
directory = "Data"
directory2 = "partial_match_data"
combined_directory = os.path.join(directory, directory2)
os.makedirs(combined_directory, exist_ok=True)

# Scrape batch by batch and export each batch as soon as it is complete
for batch in range(num_of_batches):
    batch_start = batch * WINDOW_SIZE
    batch_year_league = years_to_process[batch_start : batch_start + WINDOW_SIZE]
    all_batch_matches = process_batch(
        batch_year_league, batch, num_of_batches, playoffs_starting_dates
    )

    # One CSV file per batch
    df = pd.DataFrame(all_batch_matches)
    filepath = os.path.join(combined_directory, f"match_data_{batch}.csv")
    df.to_csv(filepath, index=False)

{'NBA': {'2025': datetime.datetime(2025, 4, 19, 0, 0), '2024': datetime.datetime(2024, 4, 20, 0, 0), '2023': datetime.datetime(2023, 4, 15, 0, 0), '2022': datetime.datetime(2022, 4, 16, 0, 0), '2021': datetime.datetime(2021, 5, 22, 0, 0), '2020': datetime.datetime(2020, 8, 17, 0, 0), '2019': datetime.datetime(2019, 4, 13, 0, 0), '2018': datetime.datetime(2018, 4, 14, 0, 0), '2017': datetime.datetime(2017, 4, 15, 0, 0), '2016': datetime.datetime(2016, 4, 16, 0, 0), '2015': datetime.datetime(2015, 4, 18, 0, 0), '2014': datetime.datetime(2014, 4, 19, 0, 0), '2013': datetime.datetime(2013, 4, 20, 0, 0), '2012': datetime.datetime(2012, 4, 28, 0, 0), '2011': datetime.datetime(2011, 4, 16, 0, 0), '2010': datetime.datetime(2010, 4, 17, 0, 0), '2009': datetime.datetime(2009, 4, 18, 0, 0), '2008': datetime.datetime(2008, 4, 19, 0, 0), '2007': datetime.datetime(2007, 4, 21, 0, 0), '2006': datetime.datetime(2006, 4, 22, 0, 0), '2005': datetime.datetime(2005, 4, 23, 0, 0), '2004': datetime.datetime

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
