In [29]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from datetime import datetime
import pandas as pd
from tqdm import tqdm

In [30]:
# URL template for a season's schedule page. get_request_soup fills '{year}'
# with the season and '{month}' with either '' (default page) or
# '-<lowercase month name>'.
template_url = 'https://www.basketball-reference.com/leagues/NBA_{year}_games{month}.html'

In [None]:
def get_request_soup(template_url: str, year: str, month: str = '', timeout: float = 10.0) -> BeautifulSoup:
    """
    Sends a GET request to a formatted basketball-reference URL and returns a BeautifulSoup object.

    Parameters:
    template_url (str): A URL template with placeholders for 'year' and 'month'.
    year (str): The NBA season year.
    month (str, optional): The NBA season month (defaults to an empty string for the default page).
    timeout (float, optional): Value passed to `requests.get` as its `timeout` argument (seconds).
        Without a timeout the request could block indefinitely and the Timeout handler
        below could never trigger.

    Returns:
    BeautifulSoup: Parsed HTML content of the requested page.

    Raises:
    requests.exceptions.RequestException: If the request fails (connection error, timeout,
        HTTP error status, or any other requests failure). A short message is printed
        first, then the original exception is re-raised so the caller never receives
        an undefined or misleading result.
    """

    # Build the URL outside the try block: formatting cannot raise a requests exception.
    formatted_month = f'-{month.lower()}' if month else ''
    page_url = template_url.format(year = year, month = formatted_month)

    try:
        response = requests.get(page_url, timeout = timeout)
        # Turn 4xx/5xx responses into exceptions instead of parsing an error page
        # as if it were a schedule.
        response.raise_for_status()

    except requests.exceptions.ConnectionError:
        print('Failed to connect to basketball-reference site')
        raise
    except requests.exceptions.Timeout:
        print('The request timed out')
        raise
    except requests.exceptions.RequestException as e:
        print(f'An error occured: {e}')
        raise

    return BeautifulSoup(response.text, 'html.parser')

In [None]:
def find_months_of_games(soup: Tag) -> list:
    """
    Collects the month labels listed in the schedule page's month filter.

    Parameters:
    soup (bs4.element.Tag): Parsed HTML of an NBA schedule page; it is searched for
        the first <div> with class 'filter'.

    Returns:
    list of str: The stripped text of every <div> found inside the filter <div>,
        in document order.
    """

    month_filter = soup.find('div', class_='filter')
    return [entry.text.strip() for entry in month_filter.find_all('div')]

['October', 'November', 'December', 'January', 'February', 'March', 'April', 'May', 'June']


In [None]:
def parse_data_point(key: str, text: str) -> datetime | int | str:
    """ 
    Parses match data point based on its key.

    Parameters:
    key (str): Name of the data point.
    text (str): Text content of the data point

    Returns:
    (datetime): If the key is 'date'.
    (int): If the key is 'visitor_pts', 'home_pts' or 'overtime' (representing number of overtimes played).
    (string): For all other keys.
    """

    if key == 'date':
        return datetime.strptime(text, '%a, %b %d, %Y')
    
    elif key in ('visitor_pts', 'home_pts'):
        return int(text)
    
    elif key == 'overtime':
        if text is None:
            return 0
        elif text == 'OT':
            return 1
        else:
            try:
                return int(text[:-2])
            except ValueError:
                return 0
            
    else:
        return text

In [None]:
def get_match_data(trow: Tag) -> dict:
    """
    Builds a dictionary of typed match values from one schedule table row.

    Parameters:
    trow (bs4.element.Tag): A <tr> tag whose cells carry 'data-stat' attributes
        identifying each data point of a single NBA match.

    Returns:
    dict: A dictionary with the following keys, in this order:
        - 'date': from <th data-stat="date_game">.
        - 'visitor_name': from <td data-stat="visitor_team_name">.
        - 'visitor_pts': from <td data-stat="visitor_pts">.
        - 'home_name': from <td data-stat="home_team_name">.
        - 'home_pts': from <td data-stat="home_pts">.
        - 'overtime': from <td data-stat="overtimes">.
      Each value is the cell's stripped text converted by `parse_data_point(key, text)`.

    Notes:
    Value conversion is delegated to the external helper `parse_data_point`.
    """

    # (output key, cell tag name, value of the cell's 'data-stat' attribute)
    cell_specs = (
        ('date',         'th', 'date_game'),
        ('visitor_name', 'td', 'visitor_team_name'),
        ('visitor_pts',  'td', 'visitor_pts'),
        ('home_name',    'td', 'home_team_name'),
        ('home_pts',     'td', 'home_pts'),
        ('overtime',     'td', 'overtimes'),
    )

    def cell_text(tag_name, stat):
        # Stripped text of the first matching cell in this row.
        return trow.find(tag_name, {'data-stat': stat}).text.strip()

    return {
        key: parse_data_point(key, cell_text(tag_name, stat))
        for key, tag_name, stat in cell_specs
    }

In [None]:
# Season to scrape; defined once so the default page and the monthly pages
# can never drift apart.
SEASON_YEAR = '2024'

all_matches = []
soup = get_request_soup(template_url, SEASON_YEAR)
months = find_months_of_games(soup)

for month in tqdm(months, desc='Scraping months'):
    soup_month = get_request_soup(template_url, SEASON_YEAR, month = month)

    table = soup_month.find('table', id='schedule')
    if table is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise RuntimeError(f"No table with id 'schedule' found on the page for {month}")

    tbody = table.find('tbody')
    for trow in tbody.find_all('tr'):
        # NOTE(review): the site appears to place header/separator rows with class
        # 'thead' inside <tbody>; they hold no match cells and would break
        # get_match_data — confirm against the live page.
        if 'thead' in (trow.get('class') or []):
            continue
        match_data = get_match_data(trow)
        all_matches.append(match_data)

# Build the frame once from the list of per-match dictionaries.
df = pd.DataFrame(all_matches)

Scraping match data: 100%|██████████| 54/54 [00:00<00:00, 4157.96it/s]
Scraping match data: 100%|██████████| 219/219 [00:00<00:00, 3423.11it/s]
Scraping match data: 100%|██████████| 208/208 [00:00<00:00, 3495.53it/s]
Scraping match data: 100%|██████████| 231/231 [00:00<00:00, 2997.68it/s]
Scraping match data: 100%|██████████| 174/174 [00:00<00:00, 2972.98it/s]
Scraping match data: 100%|██████████| 230/230 [00:00<00:00, 3405.28it/s]
Scraping match data: 100%|██████████| 157/157 [00:00<00:00, 3365.66it/s]
Scraping match data: 100%|██████████| 41/41 [00:00<00:00, 3715.94it/s]
Scraping match data: 100%|██████████| 5/5 [00:00<00:00, 4996.79it/s]


In [39]:
# Persist the scraped matches to 'raw_data.csv'; index = False leaves the
# DataFrame index out of the file.
df.to_csv('raw_data.csv', index = False)

In [40]:
# Preview the first rows of the scraped data as a quick sanity check.
df.head()

Unnamed: 0,date,visitor_name,visitor_pts,home_name,home_pts,overtime
0,2023-10-24,Los Angeles Lakers,107,Denver Nuggets,119,0
1,2023-10-24,Phoenix Suns,108,Golden State Warriors,104,0
2,2023-10-25,Houston Rockets,86,Orlando Magic,116,0
3,2023-10-25,Boston Celtics,108,New York Knicks,104,0
4,2023-10-25,Washington Wizards,120,Indiana Pacers,143,0
