In [1]:
import os

import pandas as pd
import numpy as np

import hopsworks

from datetime import datetime, timedelta
from pytz import timezone

from src.webscraping import (
    activate_web_driver,
    scrape_to_dataframe,
    convert_columns,
    combine_home_visitor,  
    get_todays_matchups,
)

from src.data_processing import (
    process_games,
    add_TARGET,
)

from src.feature_engineering import (
    process_features,
)

from src.hopsworks_utils import (
    save_feature_names,
    convert_feature_names,
)

import json

from pathlib import Path  #for Windows/Linux compatibility
DATAPATH = Path(r'data')

**Load API keys**

In [2]:
from dotenv import load_dotenv

load_dotenv()

try:
    HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']
except:
    raise Exception('Set environment variable HOPSWORKS_API_KEY')

SCRAPINGANT_API_KEY = ""

**Activate Webdriver**

In [3]:
# initiate a webdriver in selenium 
# since website data is dynamically generated

driver = activate_web_driver('firefox')

2023-04-03 07:55:33,539 INFO: Get LATEST geckodriver version for 111.0 firefox


[WDM] - Downloading: 19.1kB [00:00, 19.6MB/s]                   


2023-04-03 07:55:34,178 INFO: There is no [win64] geckodriver for browser 111.0 in cache
2023-04-03 07:55:34,179 INFO: Getting latest mozilla release info for v0.33.0


[WDM] - Downloading: 19.1kB [00:00, 9.80MB/s]                   

2023-04-03 07:55:34,418 INFO: About to download new driver from https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-win64.zip



[WDM] - Downloading: 100%|██████████| 1.59M/1.59M [00:00<00:00, 8.49MB/s]


2023-04-03 07:55:35,448 INFO: Driver has been saved in cache [C:\Users\Chris\.wdm\drivers\geckodriver\win64\0.33]




**Scrape New Completed Games and Format Them**

In [4]:
def get_2021_leftovers(driver)-> pd.DataFrame:


    SEASON = "2021-22"
    DATETO = "07/01/2022"
    DATEFROM = "10/01/2021"


    #df = scrape_to_dataframe(driver, Season=SEASON, DateFrom=DATEFROM, DateTo=DATETO)
    df = scrape_to_dataframe(api_key=SCRAPINGANT_API_KEY, driver=driver, Season=SEASON, DateFrom=DATEFROM, DateTo=DATETO, )

    df = convert_columns(df)

    print(df.info())
    df = combine_home_visitor(df)

    return df

def get_2022_to_today(driver)-> pd.DataFrame:


    TODAY = datetime.now(timezone('EST')) #nba.com uses US Eastern Standard Time

    SEASON = "2022-23"
    DATETO = TODAY.strftime("%m/%d/%y")
    DATEFROM = "10/18/2022"


    #df = scrape_to_dataframe(driver, Season=SEASON, DateFrom=DATEFROM, DateTo=DATETO)
    df = scrape_to_dataframe(api_key=SCRAPINGANT_API_KEY, driver=driver, Season=SEASON, DateFrom=DATEFROM, DateTo=DATETO, )

    df = convert_columns(df)

    print(df.info())
    df = combine_home_visitor(df)

    return df

df_1 = get_2021_leftovers(driver)

df_2 = get_2022_to_today(driver)

df_combined = pd.concat([df_1, df_2], axis=0)

df_combined 




<class 'pandas.core.frame.DataFrame'>
Int64Index: 2460 entries, 0 to 2459
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   HOME            2460 non-null   int64         
 1   GAME_DATE_EST   2460 non-null   datetime64[ns]
 2   HOME_TEAM_WINS  2460 non-null   int64         
 3   PTS             2460 non-null   int64         
 4   FG_PCT          2460 non-null   float64       
 5   FG3_PCT         2460 non-null   float64       
 6   FT_PCT          2460 non-null   float64       
 7   REB             2460 non-null   int64         
 8   AST             2460 non-null   int64         
 9   TEAM_ID         2460 non-null   object        
 10  GAME_ID         2460 non-null   object        
dtypes: datetime64[ns](1), float64(3), int64(5), object(2)
memory usage: 230.6+ KB
None
     GAME_DATE_EST  HOME_TEAM_WINS  PTS_home  FG_PCT_home  FG3_PCT_home  \
0       2022-04-10               0       120         4



<class 'pandas.core.frame.DataFrame'>
Int64Index: 2350 entries, 0 to 2349
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   HOME            2350 non-null   int64         
 1   GAME_DATE_EST   2350 non-null   datetime64[ns]
 2   HOME_TEAM_WINS  2350 non-null   int64         
 3   PTS             2350 non-null   int64         
 4   FG_PCT          2350 non-null   float64       
 5   FG3_PCT         2350 non-null   float64       
 6   FT_PCT          2350 non-null   float64       
 7   REB             2350 non-null   int64         
 8   AST             2350 non-null   int64         
 9   TEAM_ID         2350 non-null   object        
 10  GAME_ID         2350 non-null   object        
dtypes: datetime64[ns](1), float64(3), int64(5), object(2)
memory usage: 220.3+ KB
None
     GAME_DATE_EST  HOME_TEAM_WINS  PTS_home  FG_PCT_home  FG3_PCT_home  \
0       2023-04-02               0       110         4

Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,PTS_away,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,AST_away,VISITOR_TEAM_ID,SEASON
0,2022-04-10,0,120,48.3,35.5,100.0,35,26,1610612759,22101219,130,54.1,55.6,78.3,43,34,1610612742,2021
1,2022-04-10,1,139,54.5,37.5,100.0,56,34,1610612738,22101223,110,38.2,31.9,63.0,45,27,1610612763,2021
2,2022-04-10,0,126,45.2,41.3,68.4,30,32,1610612754,22101216,134,64.2,36.4,88.0,50,30,1610612751,2021
3,2022-04-10,1,146,46.8,37.2,89.4,50,26,1610612747,22101220,141,49.0,31.9,77.8,45,33,1610612743,2021
4,2022-04-10,1,116,52.6,53.8,73.3,40,26,1610612758,22101229,109,40.8,29.8,73.3,50,27,1610612756,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170,2022-10-19,1,114,45.7,35.5,79.2,53,21,1610612764,22200004,107,40.2,35.7,66.7,42,21,1610612754,2022
1171,2022-10-19,1,115,44.3,39.3,78.8,44,20,1610612757,22200014,108,45.9,38.6,68.4,41,27,1610612758,2022
1172,2022-10-19,0,109,48.8,36.7,73.7,48,21,1610612753,22200003,113,42.6,36.8,79.2,41,31,1610612765,2022
1173,2022-10-18,0,109,42.6,25.0,76.0,48,23,1610612747,22200002,123,45.5,35.6,73.9,48,31,1610612744,2022


**Close Webdriver**

In [5]:
driver.close() 



**Append to Games.csv**

In [6]:
games_old = pd.read_csv(DATAPATH / "games_old.csv")

#drop 2021-22 season
games_old = games_old[games_old['SEASON'] != 2021]

games = pd.concat([games_old, df_combined], axis=0)

games.to_csv(DATAPATH / "games.csv", index=False)

games


Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
1076,2021-07-20,42000406,Final,1610612749,1610612756,2020,1.610613e+09,105.0,0.451,0.862,...,20.0,53.0,1.610613e+09,98.0,0.442,0.842,0.240,14.0,37.0,1
1077,2021-07-17,42000405,Final,1610612756,1610612749,2020,1.610613e+09,119.0,0.552,0.909,...,23.0,35.0,1.610613e+09,123.0,0.575,0.529,0.500,26.0,37.0,0
1078,2021-07-14,42000404,Final,1610612749,1610612756,2020,1.610613e+09,109.0,0.402,0.828,...,22.0,48.0,1.610613e+09,103.0,0.513,0.842,0.304,18.0,40.0,1
1079,2021-07-11,42000403,Final,1610612749,1610612756,2020,1.610613e+09,120.0,0.478,0.769,...,28.0,47.0,1.610613e+09,100.0,0.482,0.688,0.290,21.0,36.0,1
1080,2021-07-08,42000402,Final,1610612756,1610612749,2020,1.610613e+09,118.0,0.489,0.857,...,26.0,43.0,1.610613e+09,108.0,0.452,0.652,0.290,21.0,46.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170,2022-10-19 00:00:00,22200004,,1610612764,1610612754,2022,,114.0,45.700,79.200,...,21.0,53.0,,107.0,40.200,66.700,35.700,21.0,42.0,1
1171,2022-10-19 00:00:00,22200014,,1610612757,1610612758,2022,,115.0,44.300,78.800,...,20.0,44.0,,108.0,45.900,68.400,38.600,27.0,41.0,1
1172,2022-10-19 00:00:00,22200003,,1610612753,1610612765,2022,,109.0,48.800,73.700,...,21.0,48.0,,113.0,42.600,79.200,36.800,31.0,41.0,0
1173,2022-10-18 00:00:00,22200002,,1610612747,1610612744,2022,,109.0,42.600,76.000,...,23.0,48.0,,123.0,45.500,73.900,35.600,31.0,48.0,0
