## Web Scraping NBA Games With Python
### Downloading NBA Games Data (Scores, Standings)

In [11]:
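# One-time setup (uncomment to run): install Playwright into the kernel's
# environment, then download the browser binaries it drives.
# %pip install playwright
# !playwright install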

Downloading Chromium 110.0.5481.38 (playwright build v1045) from https://playwright.azureedge.net/builds/chromium/1045/chromium-mac.zip
Chromium 110.0.5481.38 (playwright build v1045) downloaded to /Users/cristianmurillo/Library/Caches/ms-playwright/chromium-1045
Downloading FFMPEG playwright build v1008 from https://playwright.azureedge.net/builds/ffmpeg/1008/ffmpeg-mac.zip
FFMPEG playwright build v1008 downloaded to /Users/cristianmurillo/Library/Caches/ms-playwright/ffmpeg-1008
Downloading Firefox 108.0.2 (playwright build v1372) from https://playwright.azureedge.net/builds/firefox/1372/firefox-mac-11.zip
Firefox 108.0.2 (playwright build v1372) downloaded to /Users/cristianmurillo/Library/Caches/ms-playwright/firefox-1372
Downloading Webkit 16.4 (playwright build v1767) from https://playwright.azureedge.net/builds/webkit/1767/webkit-mac-12.zip
Webkit 16.4 (playwright build v1767) downloaded to /Users/cristianmurillo/Library/Caches/ms-playwright/w

In [1]:
import os
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import time

In [2]:
# Seasons to scrape: 2016 through 2022 (the end of the range is exclusive).
seasons = [year for year in range(2016, 2023)]
seasons

[2016, 2017, 2018, 2019, 2020, 2021, 2022]

In [3]:
# Output layout: all downloads live under DATA_DIR, with separate sub-folders
# for the standings pages and the scores pages.
DATA_DIR = 'data'
STANDINGS_DIR, SCORES_DIR = (
  os.path.join(DATA_DIR, subdir) for subdir in ('standings', 'scores')
)

In [37]:
async def get_html(url: str, selector: str, sleep:int=5, retries:int=5) -> str:
  """Load a page in headless Chromium and return the inner HTML of one element.

  Each attempt launches a fresh browser, navigates to `url`, pauses, prints
  the page title and reads the inner HTML of `selector`. An attempt that
  raises a Playwright timeout is reported and retried.

  Args:
    url: Address of the page to load.
    selector: Selector of the element whose inner HTML is returned.
    sleep: Base pause in seconds; attempt number n pauses for `sleep * n` seconds.
    retries: Maximum number of attempts.

  Returns:
    The element's inner HTML, or None if every attempt timed out
    (callers must handle the None case despite the `str` annotation).
  """
  html = None
  for attempt in range(1, retries + 1):
    try:
      async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
          page = await browser.new_page()
          await page.goto(url)
          # Increase the pause on each retry attempt to avoid being blocked
          # by the server. The call must be awaited: without `await` the
          # coroutine is created but never run, so no pause happens at all.
          await page.wait_for_timeout(sleep * attempt * 1000)
          print(await page.title())
          html = await page.inner_html(selector)
        finally:
          # Release the browser even when the attempt fails part-way through.
          await browser.close()
    except PlaywrightTimeout:
      print(f'Timeout error on {url}')
      continue
    else:
      # No retry needed once the HTML has been retrieved successfully.
      break
  return html

In [51]:
async def scrape_season(season: int) -> None:
  """Download the schedule pages linked from one season's games page.

  Loads the season's games page, collects every link found inside its
  '#content .filter' element, and saves the '#all_schedule' HTML of each
  linked page into STANDINGS_DIR. Pages already present on disk are skipped,
  so an interrupted run can be resumed.

  Args:
    season: Season year used to build the games-page URL.
  """
  url = f'https://www.basketball-reference.com/leagues/NBA_{season}_games.html'
  filter_html = await get_html(url, '#content .filter')
  if not filter_html:
    # get_html returns None when every attempt timed out; nothing to parse.
    print(f'Could not retrieve the schedule links for season {season}')
    return

  # Name the parser explicitly so the result does not depend on which
  # optional parsers happen to be installed.
  soup = BeautifulSoup(filter_html, 'html.parser')
  # href=True skips anchors without an href, which would raise KeyError below.
  hrefs = [anchor['href'] for anchor in soup.find_all('a', href=True)]
  standings_pages = [f'https://www.basketball-reference.com{href}' for href in hrefs]

  # Make sure the output folder exists before the first write.
  os.makedirs(STANDINGS_DIR, exist_ok=True)

  for link in standings_pages:
    save_path = os.path.join(STANDINGS_DIR, link.split('/')[-1])
    if os.path.exists(save_path):
      continue

    page_html = await get_html(link, '#all_schedule')
    if not page_html:
      # Download failed; leave no file behind so a later run retries it.
      continue
    with open(save_path, 'w+') as f:
      f.write(page_html)

In [53]:
# Scrape the seasons one at a time; each season is awaited to completion
# before the next one starts. Top-level `await` works here because the cell
# runs under IPython; it is not valid in a plain Python script.
for season in seasons:
  await scrape_season(season)

In [61]:
async def scrape_game(standings_file: str) -> None:
  """Download every box-score page linked from one saved schedule file.

  Reads the HTML stored in `standings_file`, keeps the links whose href
  contains both 'boxscore' and '.html', and saves the '#content' HTML of each
  linked page into SCORES_DIR. Pages already present on disk are skipped, and
  pages that could not be retrieved are left for a later run.

  Args:
    standings_file: Path of a schedule HTML file saved earlier.
  """
  # Read the file and close it right away instead of keeping the handle open
  # for the whole duration of the network requests below.
  with open(standings_file, 'r') as f:
    html = f.read()

  # Name the parser explicitly so the result does not depend on which
  # optional parsers happen to be installed.
  soup = BeautifulSoup(html, 'html.parser')
  hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
  box_scores = [
    f'https://www.basketball-reference.com{href}'
    for href in hrefs
    if href and 'boxscore' in href and '.html' in href
  ]

  # Make sure the output folder exists before the first write.
  os.makedirs(SCORES_DIR, exist_ok=True)

  for url in box_scores:
    save_path = os.path.join(SCORES_DIR, url.split('/')[-1])
    if os.path.exists(save_path):
      continue

    box_html = await get_html(url, '#content')
    if not box_html:
      # get_html returns None when every attempt timed out.
      continue
    with open(save_path, 'w+') as out:
      out.write(box_html)

In [63]:
standing_files = [os.path.join(STANDINGS_DIR, f) for f in os.listdir(STANDINGS_DIR) if f.endswith('.html')]
for file in standing_files:
  await scrape_game(file)