In [3]:
import asyncio
import os
import time

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout


In [4]:
# NBA seasons to scrape, keyed by the year used in the schedule URL (2018 through 2023).
SEASONS = [*range(2018, 2024)]


In [5]:
# Everything that gets scraped is stored under DATA_DIR: monthly schedule pages
# go in MONTHS_DIR and individual box-score pages go in SCORES_DIR.
DATA_DIR = 'data'
MONTHS_DIR, SCORES_DIR = (os.path.join(DATA_DIR, leaf) for leaf in ('months', 'scores'))

In [6]:
async def get_html(url, selector, sleep=3, retries=3):
    """
    Fetch the inner HTML of one element of a web page using Playwright.

    Before every attempt (including the first) the coroutine waits
    ``sleep * attempt`` seconds, so the delay grows linearly with each retry.
    The wait uses ``asyncio.sleep`` so the event loop is not blocked while
    throttling.

    Args:
        url (str): The URL of the page to load.
        selector (str): CSS selector of the element whose inner HTML is returned.
        sleep (int, optional): Base delay in seconds; attempt ``i`` waits
            ``sleep * i`` seconds. Defaults to 3.
        retries (int, optional): Maximum number of attempts. Defaults to 3.

    Returns:
        str | None: The inner HTML of the selected element, or None if every
        attempt timed out (or ``retries`` is less than 1).

    Note:
        Playwright timeouts are caught, reported with ``print`` and retried;
        they are not raised to the caller. Any other exception propagates.
    """
    html = None
    for attempt in range(1, retries + 1):
        # Throttle requests so we do not scrape too fast and get banned.
        await asyncio.sleep(sleep * attempt)

        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    print(await page.title())
                    html = await page.inner_html(selector)
                finally:
                    # Release the browser whether the fetch succeeded or failed.
                    await browser.close()
        except PlaywrightTimeout:
            print(f"Timeout on {url}")
            continue
        else:
            break
    return html


In [7]:
async def scrape_season(season):
    """
    Download and cache the monthly schedule pages of one NBA season.

    The season index page is fetched first; every link found in its
    ``#content .filter`` element is treated as a monthly schedule page. Each
    month's ``#all_schedule`` HTML is written to ``MONTHS_DIR`` under the last
    path segment of its URL. Months that already have a file are skipped, so
    the function can be re-run to resume an interrupted scrape.

    Args:
        season (int): Season year as used in the site's URL (e.g. 2018).

    Returns:
        None

    A page that could not be fetched (``get_html`` returned nothing) is
    reported and skipped; no file is created for it, so a later run retries it.
    """
    season_url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    index_html = await get_html(season_url, '#content .filter')
    if not index_html:
        # Without the index there are no month links to follow.
        print(f"Failed to fetch {season_url}")
        return

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(index_html, 'html.parser')
    hrefs = [link.get('href') for link in soup.find_all('a')]
    # hrefs are site-relative paths; anchors without an href are ignored.
    months_pages = [f'https://www.basketball-reference.com{href}' for href in hrefs if href]

    os.makedirs(MONTHS_DIR, exist_ok=True)

    for month_url in months_pages:
        save_path = os.path.join(MONTHS_DIR, month_url.split('/')[-1])  # naming by month
        if os.path.exists(save_path):
            print(f"Skipping {month_url}")
            continue

        month_html = await get_html(month_url, '#all_schedule')
        if not month_html:
            # Check before opening the file: an empty file would be treated as
            # already downloaded on the next run.
            print(f"Failed to fetch {month_url}")
            continue
        with open(save_path, 'w+') as f:
            f.write(month_html)

In [8]:
# Download the monthly schedule pages for every season in SEASONS, one season
# at a time. Top-level `await` works in IPython/Jupyter, not in a plain script.
for season in SEASONS:
    await scrape_season(season)

Timeout on https://www.basketball-reference.com/leagues/NBA_2018_games.html
2017-18 NBA Schedule | Basketball-Reference.com
Skipping https://www.basketball-reference.com/leagues/NBA_2018_games-october.html
Skipping https://www.basketball-reference.com/leagues/NBA_2018_games-november.html
Skipping https://www.basketball-reference.com/leagues/NBA_2018_games-december.html
Skipping https://www.basketball-reference.com/leagues/NBA_2018_games-january.html
Skipping https://www.basketball-reference.com/leagues/NBA_2018_games-february.html
Skipping https://www.basketball-reference.com/leagues/NBA_2018_games-march.html
Skipping https://www.basketball-reference.com/leagues/NBA_2018_games-april.html
Skipping https://www.basketball-reference.com/leagues/NBA_2018_games-may.html
Skipping https://www.basketball-reference.com/leagues/NBA_2018_games-june.html
2018-19 NBA Schedule | Basketball-Reference.com
Skipping https://www.basketball-reference.com/leagues/NBA_2019_games-october.html
Skipping https:/

In [9]:
# Names of every entry currently in MONTHS_DIR (unfiltered, in os.listdir order).
months_files = os.listdir(MONTHS_DIR)

### Next, scrape the box score page for every game listed in the saved monthly schedules

In [10]:
# Download every box score linked from one saved monthly schedule page.
async def scrape_game(months_file):
    """
    Scrape the box scores linked from a saved monthly schedule HTML file.

    Args:
        months_file (str): The path to the HTML file containing links to box scores.

    Returns:
        None

    The file is read and every anchor whose href contains both 'boxscore' and
    '.html' is treated as a box-score link. The ``#content`` HTML of each
    linked page is saved in ``SCORES_DIR`` (created if missing) under the last
    path segment of its URL. A box score that already has a file is skipped,
    and one that could not be fetched is reported and skipped without creating
    a file.
    """
    with open(months_file, 'r') as f:
        schedule_html = f.read()

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(schedule_html, 'html.parser')
    hrefs = [link.get('href') for link in soup.find_all('a')]
    # The hrefs are just the trailing path of each page, so prepend the site root.
    box_scores = [
        f'https://www.basketball-reference.com{href}'
        for href in hrefs
        if href and 'boxscore' in href and '.html' in href
    ]

    # Make sure the output directory exists before the first save.
    os.makedirs(SCORES_DIR, exist_ok=True)

    # Now that we have the URLs, scrape each box score.
    for url in box_scores:
        save_path = os.path.join(SCORES_DIR, url.split('/')[-1])
        if os.path.exists(save_path):
            print(f"Skipping {url}")
            continue
        html = await get_html(url, '#content')
        if not html:
            print(f"Failed to fetch {url}")
            continue
        with open(save_path, 'w+') as f:
            f.write(html)
        print(f"Saved {url}")

In [11]:
# Keep only saved schedule pages. Match the extension at the end of the name so
# entries such as "x.html.bak" are not picked up by a substring match.
months_files = [name for name in months_files if name.endswith('.html')]

In [17]:
# Scrape the box scores linked from each saved monthly schedule file.
# scrape_game skips box scores whose file already exists, so this loop can be re-run.
for f in months_files:
    filepath = os.path.join(MONTHS_DIR, f)

    await scrape_game(filepath)
    

Skipping https://www.basketball-reference.com/boxscores/202110190MIL.html
Skipping https://www.basketball-reference.com/boxscores/202110190LAL.html
Skipping https://www.basketball-reference.com/boxscores/202110200CHO.html
Skipping https://www.basketball-reference.com/boxscores/202110200DET.html
Skipping https://www.basketball-reference.com/boxscores/202110200NYK.html
Skipping https://www.basketball-reference.com/boxscores/202110200TOR.html
Skipping https://www.basketball-reference.com/boxscores/202110200MEM.html
Skipping https://www.basketball-reference.com/boxscores/202110200MIN.html
Skipping https://www.basketball-reference.com/boxscores/202110200NOP.html
Skipping https://www.basketball-reference.com/boxscores/202110200SAS.html
Skipping https://www.basketball-reference.com/boxscores/202110200UTA.html
Skipping https://www.basketball-reference.com/boxscores/202110200POR.html
Skipping https://www.basketball-reference.com/boxscores/202110200PHO.html
Skipping https://www.basketball-refere