In [1]:
import os

import pandas as pd
import numpy as np

import hopsworks

from datetime import datetime, timedelta
from pytz import timezone

from src.webscraping import (
    activate_web_driver,
    scrape_to_dataframe,
    convert_columns,
    combine_home_visitor,  
    get_todays_matchups,
)

from src.data_processing import (
    process_games,
    add_TARGET,
)

from src.feature_engineering import (
    process_features,
)

from src.hopsworks_utils import (
    save_feature_names,
    convert_feature_names,
)

import json

from pathlib import Path  #for Windows/Linux compatibility
DATAPATH = Path(r'data')

**Load API keys**

In [2]:
from dotenv import load_dotenv

load_dotenv()

try:
    HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']
except:
    raise Exception('Set environment variable HOPSWORKS_API_KEY')

**Activate Webdriver**

In [3]:
# initiate a webdriver in selenium 
# since website data is dynamically generated

driver = activate_web_driver('firefox')

2023-01-02 12:08:15,493 INFO: Get LATEST geckodriver version for 108.0 firefox


[WDM] - Downloading: 19.0kB [00:00, 19.0MB/s]                   


2023-01-02 12:08:16,098 INFO: Driver [C:\Users\Chris\.wdm\drivers\geckodriver\win64\0.32\geckodriver.exe] found in cache




**Scrape New Completed Games and Format Them**

In [4]:
def get_2021_leftovers(driver)-> pd.DataFrame:


    SEASON = "2021-22"
    DATETO = "07/01/2022"
    DATEFROM = "03/13/2022"


    df = scrape_to_dataframe(driver, Season=SEASON, DateFrom=DATEFROM, DateTo=DATETO)

    df = convert_columns(df)

    print(df.info())
    df = combine_home_visitor(df)

    return df

def get_2022_to_today(driver)-> pd.DataFrame:


    TODAY = datetime.now(timezone('EST')) #nba.com uses US Eastern Standard Time

    SEASON = "2022-23"
    DATETO = TODAY.strftime("%m/%d/%y")
    DATEFROM = "10/18/2022"


    df = scrape_to_dataframe(driver, Season=SEASON, DateFrom=DATEFROM, DateTo=DATETO)

    df = convert_columns(df)

    print(df.info())
    df = combine_home_visitor(df)

    return df

df_1 = get_2021_leftovers(driver)

df_2 = get_2022_to_today(driver)

df_combined = pd.concat([df_1, df_2], axis=0)

df_combined




<class 'pandas.core.frame.DataFrame'>
Int64Index: 440 entries, 0 to 439
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   HOME            440 non-null    int64         
 1   GAME_DATE_EST   440 non-null    datetime64[ns]
 2   HOME_TEAM_WINS  440 non-null    int64         
 3   PTS             440 non-null    int64         
 4   FG_PCT          440 non-null    float64       
 5   FG3_PCT         440 non-null    float64       
 6   FT_PCT          440 non-null    float64       
 7   REB             440 non-null    int64         
 8   AST             440 non-null    int64         
 9   TEAM_ID         440 non-null    object        
 10  GAME_ID         440 non-null    object        
dtypes: datetime64[ns](1), float64(3), int64(5), object(2)
memory usage: 41.2+ KB
None




<class 'pandas.core.frame.DataFrame'>
Int64Index: 1098 entries, 0 to 1097
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   HOME            1098 non-null   int64         
 1   GAME_DATE_EST   1098 non-null   datetime64[ns]
 2   HOME_TEAM_WINS  1098 non-null   int64         
 3   PTS             1098 non-null   int64         
 4   FG_PCT          1098 non-null   float64       
 5   FG3_PCT         1098 non-null   float64       
 6   FT_PCT          1098 non-null   float64       
 7   REB             1098 non-null   int64         
 8   AST             1098 non-null   int64         
 9   TEAM_ID         1098 non-null   object        
 10  GAME_ID         1098 non-null   object        
dtypes: datetime64[ns](1), float64(3), int64(5), object(2)
memory usage: 102.9+ KB
None


Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,PTS_away,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,AST_away,VISITOR_TEAM_ID,SEASON
0,2022-04-10,0,108,44.7,34.8,61.5,43,25,1610612764,22101217,124,53.9,54.5,71.4,48,34,1610612766,2021
1,2022-04-10,0,94,39.1,28.6,90.9,43,27,1610612761,22101226,105,46.0,34.9,83.3,48,32,1610612752,2021
2,2022-04-10,0,111,42.6,38.0,75.0,47,20,1610612748,22101227,125,49.5,39.7,85.7,48,33,1610612753,2021
3,2022-04-10,0,115,44.3,40.0,78.1,41,27,1610612749,22101218,133,54.3,50.0,70.6,48,39,1610612739,2021
4,2022-04-10,0,106,45.8,32.4,65.5,42,26,1610612765,22101228,118,52.3,20.0,91.3,42,25,1610612755,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544,2022-10-19,1,115,44.3,39.3,78.8,44,20,1610612757,22200014,108,45.9,38.6,68.4,41,27,1610612758,2022
545,2022-10-19,1,114,45.7,35.5,79.2,53,21,1610612764,22200004,107,40.2,35.7,66.7,42,21,1610612754,2022
546,2022-10-19,0,105,46.7,40.0,61.8,40,17,1610612742,22200013,107,47.1,36.4,86.4,40,25,1610612756,2022
547,2022-10-18,0,109,42.6,25.0,76.0,48,23,1610612747,22200002,123,45.5,35.6,73.9,48,31,1610612744,2022


**Close Webdriver**

In [5]:
driver.close() 



**Append to Games.csv**

In [6]:
games_old = pd.read_csv(DATAPATH / "games_old.csv")

games = pd.concat([games_old, df_combined], axis=0)

games.to_csv(DATAPATH / "games.csv", index=False)

games


Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-03-12,22101005,Final,1610612748,1610612750,2021,1.610613e+09,104.0,0.398,0.760,...,23.0,53.0,1.610613e+09,113.0,0.422,0.875,0.357,21.0,46.0,0
1,2022-03-12,22101006,Final,1610612741,1610612739,2021,1.610613e+09,101.0,0.443,0.933,...,20.0,46.0,1.610613e+09,91.0,0.419,0.824,0.208,19.0,40.0,1
2,2022-03-12,22101007,Final,1610612759,1610612754,2021,1.610613e+09,108.0,0.412,0.813,...,28.0,52.0,1.610613e+09,119.0,0.489,1.000,0.389,23.0,47.0,0
3,2022-03-12,22101008,Final,1610612744,1610612749,2021,1.610613e+09,122.0,0.484,0.933,...,33.0,55.0,1.610613e+09,109.0,0.413,0.696,0.386,27.0,39.0,1
4,2022-03-12,22101009,Final,1610612743,1610612761,2021,1.610613e+09,115.0,0.551,0.750,...,32.0,39.0,1.610613e+09,127.0,0.471,0.760,0.387,28.0,50.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544,2022-10-19 00:00:00,22200014,,1610612757,1610612758,2022,,115.0,44.300,78.800,...,20.0,44.0,,108.0,45.900,68.400,38.600,27.0,41.0,1
545,2022-10-19 00:00:00,22200004,,1610612764,1610612754,2022,,114.0,45.700,79.200,...,21.0,53.0,,107.0,40.200,66.700,35.700,21.0,42.0,1
546,2022-10-19 00:00:00,22200013,,1610612742,1610612756,2022,,105.0,46.700,61.800,...,17.0,40.0,,107.0,47.100,86.400,36.400,25.0,40.0,0
547,2022-10-18 00:00:00,22200002,,1610612747,1610612744,2022,,109.0,42.600,76.000,...,23.0,48.0,,123.0,45.500,73.900,35.600,31.0,48.0,0
