In [199]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from datetime import datetime
import pandas as pd
from tqdm import tqdm
import random
import time
from collections import deque
import sys

In [174]:
def create_schedule_url(year: str, month: str = "") -> str:
    """
    Create a basketball-reference URL for the NBA schedule page.

    Parameters:
    year (str): The NBA season year.
    month (str, optional): The NBA season month (defaults to an empty string for the default page).

    Returns:
    (str): A basketball-reference URL pointing to the desired NBA schedule page.
    """
    template_url = (
        "https://www.basketball-reference.com/leagues/NBA_{year}_games{month}.html"
    )
    formatted_month = f"-{month.lower()}" if month else ""
    url = template_url.format(year=year, month=formatted_month)
    return url

In [200]:
_request_times = deque()

def get_request_soup(url: str) -> BeautifulSoup:
    """
    Sends a GET request to a URL and returns a BeautifulSoup object. Handles not sending too many
    requests to not get a rate limited request (429) from basketball-reference.

    Parameters:
    url (str): A URL pointing to the desired page.

    Returns:
    BeautifulSoup: Parsed HTML content of the requested page.

    Exceptions:
    Terminates the entire python script if the response status code is 429.
    Raises an HTTP error if response status code is problematic.
    Prints an error message if the request fails due to connection, timeout, or other issues.
    """
    ## !!! Bot Limit: 20 reqeusts per min !!!
    global _request_times

    # Delete timestamps older than a minute
    a_minute_ago = time.monotonic() - 60
    while _request_times and _request_times[0] < a_minute_ago:
        _request_times.popleft()

    # Check if less than 15 requests have been made in the last minute
    if len(_request_times) >= 15:
        oldest_request = _request_times[0]
        sleep_time = oldest_request - a_minute_ago
        if sleep_time > 0:
            time.sleep(sleep_time)

    # Request HTML
    try:
        wait_time = random.uniform(0.1, 0.2)
        time.sleep(wait_time)
        response = requests.get(url)
        _request_times.append(time.monotonic())

        # Check response
        if response.status_code == 429:
            print("Rate Limited Request - You are in jail for an hour :(")
            sys.exit()
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        return soup

    except requests.exceptions.ConnectionError:
        print("Failed to connect to basketball-reference site")
    except requests.exceptions.Timeout:
        print("The request timed out")
    except requests.exceptions.RequestException as e:
        print(f"An error occured: {e}")

    return None

In [176]:
def get_months_of_games_in_season(soup: Tag) -> list:
    """
    Extracts list of months during which NBA games where played.

    Parameters:
    soup (bs4.BeautifulSoup): A parsed BeautifulSoup object containing the HTML of the NBA schedule page.

    Returns:
    list of str: A list of month names.
    """
    months = []
    filter_div = soup.find("div", class_="filter")
    for div in filter_div.find_all("div"):
        month = div.text.strip()
        months.append(month)
    return months

In [177]:
def parse_data_point(key: str, text: str) -> datetime | int | str:
    """
    Parses match data point based on its key.

    Parameters:
    key (str): Name of the data point.
    text (str): Text content of the data point

    Returns:
    (datetime): If the key is 'date'.
    (int): If the key is 'visitor_pts', 'home_pts' or 'overtime' (representing number of overtimes played).
    (string): For all other keys.
    """
    if key == "date":
        return datetime.strptime(text, "%a, %b %d, %Y")

    elif key in ("visitor_pts", "home_pts"):
        return int(text)

    elif key == "overtime":
        if text is None:
            return 0
        elif text == "OT":
            return 1
        else:
            try:
                return int(text[:-2])
            except ValueError:
                return 0

    else:
        return text

In [178]:
def get_match_data(trow: Tag) -> dict:
    """
    Extracts NBA match data from a <tr> HTML element.

    Parameters:
    trow (bs4.element.Tag): A BeautifulSoup <tr> tag representing one row with data points about a single NBA match.

    Returns:
    dict: A dictionary with the following keys:
        - 'date' (datetime): The date of the game.
        - 'visitor_name' (str): Name of the visiting team.
        - 'visitor_pts' (int): Points scored by the visiting team.
        - 'home_name' (str): Name of the home team.
        - 'home_pts' (int): Points scored by the home team.
        - 'overtime' (int): Number of overtime periods.

    Notes:
    It relies on an external helper function `parse_data_point(key, text)` to handle value conversion.
    """
    match_data = {}
    data_fields = {
        "date": ("th", "date_game"),
        "visitor_name": ("td", "visitor_team_name"),
        "visitor_pts": ("td", "visitor_pts"),
        "home_name": ("td", "home_team_name"),
        "home_pts": ("td", "home_pts"),
        "overtime": ("td", "overtimes"),
    }

    for key, (tag, data_stat) in data_fields.items():
        text = trow.find(tag, {"data-stat": data_stat}).text.strip()
        data_point = parse_data_point(key, text)
        match_data[key] = data_point
        
    return match_data

In [190]:
def get_starting_dates_of_playoffs(years_played_NBA: list) -> dict:
    """
    Returns the starting date of the NBA playoffs for every year using basketball-reference's playoff series list.

    Parameters:
    years_played_NBA (list): LIst of ending years (str) in which the NBA was played.

    Returns:
    dict: A dictionary where:
        key (str): The NBA season year.
        value (datetime): The date of the beggining of the playoffs.

    Notes:
    It relies on an external helper function `get_request_soup(url)` to receive the parsed HTML content.
    """
    starting_dates = {year: [] for year in years_played_NBA}

    playoffs_url = "https://www.basketball-reference.com/playoffs/series.html"
    soup = get_request_soup(playoffs_url)

    playoffs_series_table = soup.find("table", id="playoffs_series")
    print("playoffs_series" in soup.prettify())  # might say False
    print("playoffs_series" in soup.text)        # might say False
    print("playoffs_series" in str(soup))
    tbody = playoffs_series_table.find("tbody")

    for trow in tbody.find_all("tr"):

        if trow.has_attr("csk"):  # Not yet finished playoff series
            continue

        if "class" in trow.attrs and any(  # Header of table
            class_name in ("thead", "overheader") for class_name in trow.get("class")
        ):
            continue

        year = trow.find("th", {"data-stat": "season"}).text.strip()
        month_day = trow.find("td", {"data-stat": "date_range"}).text.strip()[:6]
        full_date = f"{month_day} {year}"
        s_date = datetime.strptime(full_date, "%b %d %Y")
        starting_dates[year].append(s_date)

    playoffs_starting_dates = {
        year: min(dates) for year, dates in starting_dates.items()
    }

    return playoffs_starting_dates

In [180]:
def get_match_data_for_season(year: str) -> list:
    """
    Scrapes all relevant match data for an entire NBA season from basketball-reference.

    Parameters:
    year (str): The NBA season year.

    Returns:
    list: A list of matches (for an entire season) with game data.
    """
    all_matches = []

    # Find months when games are played
    default_url = create_schedule_url(year)
    soup = get_request_soup(default_url)
    months = get_months_of_games_in_season(soup)

    # Get game data
    for month in months:
        month_url = create_schedule_url(year, month=month)
        soup_month = get_request_soup(month_url)

        table = soup_month.find("table", id="schedule")
        tbody = table.find("tbody")
        for trow in tbody.find_all("tr"):
            if "thead" in trow.get("class"):  # Header of table
                continue

            match_data = get_match_data(trow)
            match_data["season"] = year
            all_matches.append(match_data)

    return all_matches

In [None]:
years_played_NBA = [str(year) for year in range(1947, 2026)]

playoffs_starting_dates = get_starting_dates_of_playoffs(years_played_NBA)

all_seasons_matches = []

for year in tqdm(years_played_NBA[-10:]):
    print(year)
    season_matches = get_match_data_for_season(year)
    all_seasons_matches.extend(season_matches)

df = pd.DataFrame(all_seasons_matches)

# Add columns needed for calculating elo
df["postseason"] = True if playoffs_starting_dates[df["season"]] <= df["date"] else False
df["home_win"] = True if df["home_pts"] > df["visitor_pts"] else False
df["margin_of_victory"] = abs(df["home_pts"] - df["visitor_pts"])

AttributeError: 'NoneType' object has no attribute 'find'

In [197]:
get_starting_dates_of_playoffs(years_played_NBA)

429
https://www.basketball-reference.com/playoffs/series.html

<!DOCTYPE html>
<html data-version="klecko-" data-root="" lang="en" class="no-js" >
<head id="suppress_all_ads">
    <meta charset="utf-8">
    <meta http-equiv="x-ua-compatible" content="ie=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=2.0" />

    <title>Rate Limited Request (429 error) | Sports-Reference.com</title>

    <link rel="canonical" href="https://www.sports-reference.com/429.html" />

<!-- CSS start -->
 <style>body,html{color:#000;font:14px/1.25 Helvetica Neue,helvetica,arial,sans-serif;margin:0;padding:0}html:not(.backstop){scroll-behavior:smooth}html:not(.backstop) div,html:not(.backstop) span{scroll-snap-margin:2.5em 0 0 0;scroll-margin:2.5em 0 0 0}body{-webkit-text-size-adjust:none;-moz-text-size-adjust:none;-ms-text-size-adjust:none;background:#c9cbcd;position:relative;z-index:0}a img{border:0}li,ol,ul{list-style-type:none;margin:0;padding:0}table td,table 

AttributeError: 'NoneType' object has no attribute 'find'

In [None]:
df.to_csv('raw_data.csv', index = False)

In [None]:
df.head()

Unnamed: 0,date,visitor_name,visitor_pts,home_name,home_pts,overtime,home_win,margin_of_victory,year,postseason
0,2023-10-24,Los Angeles Lakers,107,Denver Nuggets,119,0,True,12,2024,False
1,2023-10-24,Phoenix Suns,108,Golden State Warriors,104,0,False,4,2024,False
2,2023-10-25,Houston Rockets,86,Orlando Magic,116,0,True,30,2024,False
3,2023-10-25,Boston Celtics,108,New York Knicks,104,0,False,4,2024,False
4,2023-10-25,Washington Wizards,120,Indiana Pacers,143,0,True,23,2024,False
