This is a simple utility notebook to update the games.csv file with the latest data from NBA.com. You want to run this notebook before backfilling the data to Hopsworks.ai for the first time.

In [1]:
import os

import pandas as pd

from datetime import datetime, timedelta
from pytz import timezone

# change working directory to project root when running from notebooks folder to make it easier to import modules
# and to access sibling folders
os.chdir('..') 

from src.webscraping import (
    activate_web_driver,
    scrape_to_dataframe,
    convert_columns,
    combine_home_visitor,  
)

from pathlib import Path  #for Windows/Linux compatibility
DATAPATH = Path(r'data')

In [2]:
WEBSCRAPER = 'SELENIUM'  # choose between 'SCRAPINGANT' and 'SELENIUM'



**Determine How current is the existing data**

In [3]:
games_old = pd.read_csv(DATAPATH / "games.csv")

# Find the last date and season in the current dataset

games_old["GAME_DATE_EST"] = pd.to_datetime(games_old["GAME_DATE_EST"])
last_date = games_old["GAME_DATE_EST"].max()
last_season = int(games_old["SEASON"].max())

# Determine the date of the next day to begin scraping from
start_date = last_date + timedelta(days=1)

# determine what season we are in currently
today = datetime.now(timezone('EST')) #nba.com uses US Eastern Standard Time
if today.month >= 10:
    current_season = today.year
else:
    current_season = today.year - 1

# determine which seasons we need to scrape to catch up the data
seasons = list(range(last_season, current_season+1))


print("Last date in dataset: ", last_date)
print("Last season in dataset: ", last_season)
print("Current season: ", current_season)
print("Seasons to scrape: ", seasons)
print("Start date: ", start_date)

# if the last date in the dataset is today, then we don't need to scrape any new data
if start_date > datetime.now():
    print("No new data to scrape")
    exit()

Last date in dataset:  2022-03-12 00:00:00
Last season in dataset:  2021
Current season:  2023
Seasons to scrape:  [2021, 2022, 2023]
Start date:  2022-03-13 00:00:00


**Activate Webdriver**

In [4]:

# if scrapingant is chosen then set the api key, otherwise load the selenium webdriver
if WEBSCRAPER == 'SCRAPINGANT':
    try:
        SCRAPINGANT_API_KEY = os.environ['SCRAPINGANT_API_KEY']
    except:
        raise Exception('Set environment variable SCRAPINGANT_API_KEY')
    driver = None
    
elif WEBSCRAPER == 'SELENIUM':
    driver = activate_web_driver('chromium')
    SCRAPINGANT_API_KEY = ""

**Scrape New Completed Games and Format Them**

In [5]:
def update_games(driver, season, start_date, end_date)-> pd.DataFrame:

    season_types = ["Regular+Season", "PlayIn", "Playoffs"]
      
    all_season_types = pd.DataFrame()

    for season_type in season_types:
        
        df = scrape_to_dataframe(api_key=SCRAPINGANT_API_KEY, driver=driver, Season=season, DateFrom=start_date, DateTo=end_date, season_type=season_type)

        if not(df.empty):
            df = convert_columns(df)
            df = combine_home_visitor(df)
            all_season_types = pd.concat([all_season_types, df], axis=0)


    return all_season_types
    

In [6]:
new_games = pd.DataFrame()
df_season = pd.DataFrame()

for season in seasons:
    end_date = datetime.strptime(f"{season+1}-08-01", "%Y-%m-%d") # use August 1st to get all games from the current season
    print(f"Scraping season {season} from {start_date} to {end_date}")
    df_season = update_games(driver, str(season), str(start_date), str(end_date))
    new_games = pd.concat([new_games, df_season], axis=0)
    start_date = datetime.strptime(f"{season+1}-10-01", "%Y-%m-%d") # if more than 1 season, reset start date to beginning of next season


new_games

Scraping season 2021 from 2022-03-13 00:00:00 to 2022-08-01 00:00:00
Scraping https://www.nba.com/stats/teams/boxscores?SeasonType=Regular+Season&Season=2021&DateFrom=2022-03-13 00:00:00&DateTo=2022-08-01 00:00:00
0      1610612754
1      1610612748
2      1610612761
3      1610612752
4      1610612743
          ...    
435    1610612755
436    1610612740
437    1610612745
438    1610612737
439    1610612754
Length: 440, dtype: object
HOME_TEAM_ID 0      1610612752
1      1610612743
2      1610612742
3      1610612750
4      1610612739
          ...    
215    1610612738
216    1610612756
217    1610612753
218    1610612740
219    1610612737
Name: HOME_TEAM_ID, Length: 220, dtype: object
GAME_ID 0      0022101226
1      0022101220
2      0022101219
3      0022101224
4      0022101218
          ...    
215    0022101014
216    0022101019
217    0022101015
218    0022101017
219    0022101016
Name: GAME_ID, Length: 220, dtype: object
VISITOR_TEAM_ID 0      1610612761
1      1610612747
2  

Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,PTS_away,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,AST_away,VISITOR_TEAM_ID,SEASON
0,2022-04-10,1,105,46.0,34.9,83.3,48,32,1.610613e+09,22101226.0,94,39.1,28.6,90.9,43,27,1.610613e+09,2021.0
1,2022-04-10,0,141,49.0,31.9,77.8,45,33,1.610613e+09,22101220.0,146,46.8,37.2,89.4,50,26,1.610613e+09,2021.0
2,2022-04-10,1,130,54.1,55.6,78.3,43,34,1.610613e+09,22101219.0,120,48.3,35.5,100.0,35,26,1.610613e+09,2021.0
3,2022-04-10,0,120,50.5,35.5,77.3,32,30,1.610613e+09,22101224.0,124,53.0,47.6,78.8,48,22,1.610613e+09,2021.0
4,2022-04-10,1,133,54.3,50.0,70.6,48,39,1.610613e+09,22101218.0,115,44.3,40.0,78.1,41,27,1.610613e+09,2021.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,2024-04-21,1,114,47.6,44.9,87.5,44,27,1.610613e+09,42300101.0,94,46.9,32.4,100.0,34,23,1.610613e+09,2023.0
22,2024-04-20,1,114,46.1,35.7,83.3,49,27,1.610613e+09,42300151.0,103,49.4,27.6,89.5,40,22,1.610613e+09,2023.0
23,2024-04-20,1,111,39.6,45.7,82.1,55,20,1.610613e+09,42300111.0,104,44.4,34.3,90.9,33,17,1.610613e+09,2023.0
24,2024-04-20,1,120,50.0,37.5,90.9,52,26,1.610613e+09,42300161.0,95,44.0,32.1,80.0,28,16,1.610613e+09,2023.0


**Close Webdriver**

In [7]:
if WEBSCRAPER == 'SELENIUM':
    driver.close() 

**Append to Games.csv**

In [8]:
games = pd.concat([games_old, new_games], axis=0)

games.to_csv(DATAPATH / "games.csv", index=False)

games


Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-03-12,22101005.0,Final,1.610613e+09,1.610613e+09,2021.0,1.610613e+09,104.0,0.398,0.760,...,23.0,53.0,1.610613e+09,113.0,0.422,0.875,0.357,21.0,46.0,0
1,2022-03-12,22101006.0,Final,1.610613e+09,1.610613e+09,2021.0,1.610613e+09,101.0,0.443,0.933,...,20.0,46.0,1.610613e+09,91.0,0.419,0.824,0.208,19.0,40.0,1
2,2022-03-12,22101007.0,Final,1.610613e+09,1.610613e+09,2021.0,1.610613e+09,108.0,0.412,0.813,...,28.0,52.0,1.610613e+09,119.0,0.489,1.000,0.389,23.0,47.0,0
3,2022-03-12,22101008.0,Final,1.610613e+09,1.610613e+09,2021.0,1.610613e+09,122.0,0.484,0.933,...,33.0,55.0,1.610613e+09,109.0,0.413,0.696,0.386,27.0,39.0,1
4,2022-03-12,22101009.0,Final,1.610613e+09,1.610613e+09,2021.0,1.610613e+09,115.0,0.551,0.750,...,32.0,39.0,1.610613e+09,127.0,0.471,0.760,0.387,28.0,50.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,2024-04-21,42300101.0,,1.610613e+09,1.610613e+09,2023.0,,114.0,47.600,87.500,...,27.0,44.0,,94.0,46.900,100.000,32.400,23.0,34.0,1
22,2024-04-20,42300151.0,,1.610613e+09,1.610613e+09,2023.0,,114.0,46.100,83.300,...,27.0,49.0,,103.0,49.400,89.500,27.600,22.0,40.0,1
23,2024-04-20,42300111.0,,1.610613e+09,1.610613e+09,2023.0,,111.0,39.600,82.100,...,20.0,55.0,,104.0,44.400,90.900,34.300,17.0,33.0,1
24,2024-04-20,42300161.0,,1.610613e+09,1.610613e+09,2023.0,,120.0,50.000,90.900,...,26.0,52.0,,95.0,44.000,80.000,32.100,16.0,28.0,1
