Initial scrape and testing. Set `FULL_SCRAPE = True` to re-scrape all of the data starting from `START_SEASON` (2006). If it is set to `False`, the notebook will try to determine the last date already scraped and will scrape only the newer data.

In [1]:
import os

import pandas as pd


from datetime import datetime, timedelta
from pytz import timezone

# change working directory to project root when running from notebooks folder to make it easier to import modules
# and to access sibling folders.
# Guarded so the cell is idempotent: re-running it (or starting the kernel from the project root)
# must not climb one directory higher each time. Only move up when the `src` package
# imported below is not already visible from the current directory.
if not os.path.isdir('src'):
    os.chdir('..')


from src.webscraping import (
    determine_scrape_start,
    activate_web_driver,
    load_scraped_data,
    scrape_to_dataframe,
    scrape_sub_seasons,
    validate_scraped_dataframes,
)

from pathlib import Path  #for Windows/Linux compatibility
# Data locations, expressed relative to the current working directory
DATAPATH = Path('data')
NEWLY_SCRAPED_PATH = DATAPATH.joinpath('newly_scraped')  # destination of the CSVs written by the scraping cell

In [2]:
# Scrape mode: True scrapes all seasons, False scrapes from where the last scrape left off
FULL_SCRAPE = False

# First season scraped when FULL_SCRAPE is True
# (some seasons prior to 2006 may be missing some of the additional stats)
START_SEASON = 2006

# nba.com serves 5 different boxscore screens; each entry below is scraped and saved separately
STAT_TYPES = [
    'traditional',
    'advanced',
    'four-factors',
    'misc',
    'scoring',
]



**Determine how current the existing data is**

In [3]:

if FULL_SCRAPE:
    # Scrape all seasons from START_SEASON up to and including the season in progress.
    # A season is identified by the year it starts in, and it usually starts in October,
    # so from October onward the current calendar year is itself a season start year
    # and must be included; earlier in the year the latest season began the year before.
    today = datetime.now()
    latest_season_year = today.year if today.month >= 10 else today.year - 1

    # format season as '2006-07' which is required for nba.com advanced boxscores
    seasons = [
        f"{season}-{str(season + 1)[-2:]}"
        for season in range(START_SEASON, latest_season_year + 1)
    ]

    first_start_date = "10/1/" + str(START_SEASON)  # usually starts in October

else:
    # Incremental scrape: work out where the previous scrape stopped.
    scraped_data = load_scraped_data(new=False)  # list of dataframes

    # check the latest game in the dataset to see what needs to be scraped
    first_start_date, seasons = determine_scrape_start(scraped_data)

    if first_start_date is None:
        # Raise instead of calling exit(): the failure then shows up as an ordinary
        # cell error and stops a "Run All", rather than asking the kernel to exit.
        raise RuntimeError("Error - previous scraped data has inconsistent dates")





  df = pd.read_csv(scraped_path / file)


Last date in dataset:  2024-05-10 00:00:00
Last season in dataset:  2023
Current season:  2023
Seasons to scrape:  ['2023-24']
Start date:  05/11/2024


**Activate Webdriver**

In [4]:
# Create the web driver (requested as 'chromium'); the same handle is passed to
# scrape_sub_seasons for every stat type and closed in a later cell.
driver = activate_web_driver('chromium')
    

**Scrape and Save New Completed Games**

In [5]:

# Make sure the output folder exists up front, so the save step cannot fail on a
# missing directory after the scraping work has already been done.
NEWLY_SCRAPED_PATH.mkdir(parents=True, exist_ok=True)

for stat_type in STAT_TYPES:

    # Collect one dataframe per season and concatenate once at the end, instead of
    # re-copying a growing dataframe on every iteration.
    season_frames = []

    # if there are multiple seasons, start date will be reset to the beginning of the next season,
    # so we need to keep track of the original start date because we are scraping multiple stats categories, each with their own seasons
    start_date = first_start_date

    for season in seasons:
        season_year = int(season[:4])
        end_date = "08/01/" + str(season_year + 1)  # use August 1st to get all games from the current season
        df_season = scrape_sub_seasons(driver, str(season), str(start_date), str(end_date), stat_type)
        # NOTE(review): the return value for a season with no data is not visible here;
        # skipping None keeps the previous concat behaviour, which ignored None entries.
        if df_season is not None:
            season_frames.append(df_season)
        start_date = end_date  # if more than 1 season, reset start date to beginning of next season

    # pd.concat raises on an empty list, so fall back to an empty dataframe when nothing was collected
    new_games = pd.concat(season_frames, axis=0) if season_frames else pd.DataFrame()

    file_name = "games_" + stat_type + ".csv"
    print(f"Saving {file_name}")
    print()
    new_games.to_csv(NEWLY_SCRAPED_PATH / file_name, index=False)


Scraping 2023-24 from 05/11/2024 to 08/01/2024 for traditional stats
Scraping https://www.nba.com/stats/teams/boxscores?SeasonType=Regular+Season&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024
No data found
Scraping https://www.nba.com/stats/teams/boxscores?SeasonType=PlayIn&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024
No data found
Scraping https://www.nba.com/stats/teams/boxscores?SeasonType=Playoffs&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024


  dfs = pd.read_html(str(data_table), header=0)


Saving games_traditional.csv

Scraping 2023-24 from 05/11/2024 to 08/01/2024 for advanced stats
Scraping https://www.nba.com/stats/teams/boxscores-advanced?SeasonType=Regular+Season&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024
No data found
Scraping https://www.nba.com/stats/teams/boxscores-advanced?SeasonType=PlayIn&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024
No data found
Scraping https://www.nba.com/stats/teams/boxscores-advanced?SeasonType=Playoffs&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024


  dfs = pd.read_html(str(data_table), header=0)


Saving games_advanced.csv

Scraping 2023-24 from 05/11/2024 to 08/01/2024 for four-factors stats
Scraping https://www.nba.com/stats/teams/boxscores-four-factors?SeasonType=Regular+Season&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024
No data found
Scraping https://www.nba.com/stats/teams/boxscores-four-factors?SeasonType=PlayIn&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024
No data found
Scraping https://www.nba.com/stats/teams/boxscores-four-factors?SeasonType=Playoffs&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024


  dfs = pd.read_html(str(data_table), header=0)


Saving games_four-factors.csv

Scraping 2023-24 from 05/11/2024 to 08/01/2024 for misc stats
Scraping https://www.nba.com/stats/teams/boxscores-misc?SeasonType=Regular+Season&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024
No data found
Scraping https://www.nba.com/stats/teams/boxscores-misc?SeasonType=PlayIn&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024
No data found
Scraping https://www.nba.com/stats/teams/boxscores-misc?SeasonType=Playoffs&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024


  dfs = pd.read_html(str(data_table), header=0)


Saving games_misc.csv

Scraping 2023-24 from 05/11/2024 to 08/01/2024 for scoring stats
Scraping https://www.nba.com/stats/teams/boxscores-scoring?SeasonType=Regular+Season&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024
No data found
Scraping https://www.nba.com/stats/teams/boxscores-scoring?SeasonType=PlayIn&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024
No data found
Scraping https://www.nba.com/stats/teams/boxscores-scoring?SeasonType=Playoffs&Season=2023-24&DateFrom=05/11/2024&DateTo=08/01/2024
Saving games_scoring.csv



  dfs = pd.read_html(str(data_table), header=0)


**Close Webdriver**

In [6]:
# Release the web driver now that the scraping cell has finished with it.
# NOTE(review): if `driver` is a Selenium WebDriver, close() only closes the current window,
# whereas quit() also ends the session and the driver process — confirm which is intended.
driver.close() 

**Reload and Validate the Scraped Data**


In [7]:
# Load the scraped dataframes with new=True (presumably the files just written to the
# newly-scraped folder — confirm against load_scraped_data) and cross-check them.
scraped_data = load_scraped_data(new=True)
response = validate_scraped_dataframes(scraped_data)

# Anything other than the literal "Pass" is reported as a failure, followed by the response itself.
if response != "Pass":
    print("Error - scraped dataframes are inconsistent")
    print(response)
else:
    print("All scraped dataframes are consistent")

All scraped dataframes are consistent
