In [1]:
import os
from bs4 import BeautifulSoup                                                             # parse html
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout    # enables us to open web browser and grab html
import time 

In [2]:
SEASONS = list(range(2016, 2023))

In [3]:
DATADIR = 'data'
STANDINGS_DIR = os.path.join(DATADIR, 'standings')
SCORES_DIR = os.path.join(DATADIR, 'scores')

In [4]:
async def get_html(url, selector, sleep = 5, retries = 3):
    html = None
    for i in range(1, retries + 1):                         
        time.sleep(sleep * i)                                     # Avoid website server banning us

        try:
            async with async_playwright() as p:                   # Initialize playwright instance
                browser = await p.chromium.launch()               # Launch browser. Await turns async method into a synchronous method. Waits for browser to launch before continuing
                page = await browser.new_page()                    # Create new tab in browser
                await page.goto(url)                              # Send browser in tab to given url
                print(await page.title())                         # Check scraping progress
                html = await page.inner_html(selector)            # Assign html to selected html
        except PlaywrightTimeout:
            print(f'Timeout error on {url}')                      # Try again with "continue" if error
            continue
        else:
            break                                                 # If web scraping successful, exit loop
    return html 
                

In [5]:
async def scrape_season(season):
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    html = await get_html(url, "#content .filter")                           # Get html before continuing. Content filter is CSS selector to target element ids, starting with id="content" and class = "filter"
    
    # Parse NBA standings with BeautifulSoup
    soup = BeautifulSoup(html)                                               # Initialize BeautifulSoup Class
    links = soup.find_all("a")                                               # Find_all() "a" tags which are the links
    href = [l["href"] for l in links]                                        # Get hrefs (href="/leagues/NBA 2024 games-november.html")
    standings_pages = [f'https://basketball-reference.com{l}' for l in href] # Add header to links

    for url in standings_pages:                                              # Loop through standings_pages
        save_path = os.path.join(STANDINGS_DIR, url.split("/")[-1])          # Save to disc. We want to save all data first, then process later.
        if os.path.exists(save_path):                                        # Check if already scraped, does not scrape again
            continue

        html = await get_html(url, "#all_schedule")                         # Grab "all_schedule" table which contains box scores
        with open(save_path, "w+") as f:                                    # Save html before we do any processing
            f.write(html)                                                   # Write html to file

In [6]:
for season in SEASONS:
    await scrape_season(season)       # Ensure we finish scraping each season before continuing

Timeout error on https://www.basketball-reference.com/leagues/NBA_2016_games.html
Timeout error on https://www.basketball-reference.com/leagues/NBA_2016_games.html
Timeout error on https://www.basketball-reference.com/leagues/NBA_2016_games.html


TypeError: object of type 'NoneType' has no len()

In [None]:
# Parse box score links with BeautifulSoup
standings_files = os.listdir(STANDINGS_DIR)    # Verify scraping completion

In [None]:
standings_files

In [None]:
async def scrape_game(standings_file):
    with open(standings_file, 'r') as f:                                             # Open standings file to scrape box score data
        html = f.read()                                                              # Read file
     
    soup = BeautifulSoup(standings_file)                                             # Process file using BeautifulSoup
    links = soup.find_all("a")                                                       # Clean data starting with finding "a" tags
    hrefs = [l.get("href") for l in links]                                           # Then get elements with "href" tags
    box_scores = [l for l in hrefs if l and "boxscore" in l and ".html" in l]        # List comprehension to filter null, contains "boxscore", and ".html"
    box_scores = [f"https://www.basketball-reference.com{l}" for l in box_scores]    # Add header to complete box score links
     
    # Download box scores with Playwright
    for url in box_scores:                                                           # Loop through box score pages
        save_path = os.path.join(SCORES_DIR, url.split("/")[-1])                     # Save box scores in scores directory
        if os.path.exists(save_path):                                                # Ignore duplicate files
            continue
        
        html = await get_html(url, "#content")                                       # Get html with selector "content"
        if not html:                                                                 # Check if html is null
            continue
        with open(save_path, "w+") as f:                                             # Save html before we do any processing
            f.write(html)                                                            # Write html to file

In [None]:
standings_file = [s for s in stands_file if.".html" in s]

In [None]:
for f in standings_files:                           # Scrape each game
    filepath = os.path.join(STANDINGS_DIR, f)

    await scrape_game(filepath)

In [None]:
standings_files