In [2]:
import os

import pandas as pd
import numpy as np

import hopsworks

from datetime import datetime, timedelta
from pytz import timezone

from src.webscraping import (
    activate_web_driver,
    scrape_to_dataframe,
    convert_columns,
    combine_home_visitor,  
)

from src.data_processing import (
    process_games,
    add_TARGET,
)

from src.feature_engineering import (
    process_features,
)

import json

from pathlib import Path  #for Windows/Linux compatibility
# Relative location of the project's data directory, built with pathlib so the
# same path works on Windows and Linux.
# NOTE(review): DATAPATH is not referenced by any other cell shown in this notebook.
DATAPATH = Path(r'data')

**Load API keys**

In [3]:
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment so the API key does not have to be hard-coded in the notebook.
load_dotenv()

# Fail fast with an actionable message when the key is missing.
# Only KeyError is caught: a bare `except:` would also intercept
# KeyboardInterrupt/SystemExit and hide unrelated failures.
try:
    HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']
except KeyError as err:
    raise Exception('Set environment variable HOPSWORKS_API_KEY') from err

**Scrape Data and Format**

In [4]:

# Search window: the last DAYS days, ending today.
DAYS = 7
SEASON = ""  # empty string makes the website default to the current season; format is "2022-23"

# nba.com dates follow US Eastern time. 'America/New_York' tracks daylight
# saving time, whereas pytz's 'EST' is a fixed UTC-5 offset that is one hour
# off (and can yield the wrong date around midnight) while EDT is in effect.
TODAY = datetime.now(timezone('America/New_York'))
LASTWEEK = TODAY - timedelta(days=DAYS)
DATETO = TODAY.strftime("%m/%d/%y")
DATEFROM = LASTWEEK.strftime("%m/%d/%y")

# Initiate a Selenium webdriver, since the website data is dynamically generated.
driver = activate_web_driver('firefox')
try:
    df = scrape_to_dataframe(driver, Season=SEASON, DateFrom=DATEFROM, DateTo=DATETO)
finally:
    # Always release the browser, even when the scrape raises.
    driver.close()

df = convert_columns(df)
df = combine_home_visitor(df)

df

2022-12-08 07:41:51,698 INFO: Get LATEST geckodriver version for 107.0 firefox


[WDM] - Downloading: 19.0kB [00:00, 6.49MB/s]                   


2022-12-08 07:41:52,482 INFO: Getting latest mozilla release info for v0.32.0


[WDM] - Downloading: 19.0kB [00:00, 4.88MB/s]                   

2022-12-08 07:41:53,315 INFO: About to download new driver from https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-win64.zip



[WDM] - Downloading: 100%|██████████| 1.58M/1.58M [00:00<00:00, 2.15MB/s]


2022-12-08 07:41:55,886 INFO: Driver has been saved in cache [C:\Users\Chris\.wdm\drivers\geckodriver\win64\0.32]




Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,PTS_away,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,AST_away,VISITOR_TEAM_ID,SEASON
0,2022-12-07,1,125,48.5,35.6,84.6,53,29,1610612738,22200373,98,39.8,25.0,84.2,49,19,1610612756,2022
1,2022-12-07,0,123,45.9,33.3,76.0,42,26,1610612744,22200372,124,53.5,42.9,81.0,40,26,1610612762,2022
2,2022-12-07,0,111,48.8,29.0,69.0,38,22,1610612764,22200367,115,50.0,43.5,88.2,41,29,1610612741,2022
3,2022-12-07,0,115,48.1,45.2,75.0,31,22,1610612754,22200370,121,47.3,34.8,92.6,51,25,1610612750,2022
4,2022-12-07,0,98,38.9,35.7,81.3,42,17,1610612765,22200371,104,45.9,29.6,78.3,50,21,1610612740,2022
5,2022-12-07,0,102,37.0,36.8,87.0,48,25,1610612760,22200368,123,51.1,36.8,81.3,50,32,1610612763,2022
6,2022-12-07,0,111,41.1,36.6,69.2,53,26,1610612746,22200363,116,42.1,28.6,90.3,53,20,1610612753,2022
7,2022-12-07,0,116,53.4,41.7,52.2,40,25,1610612766,22200364,122,57.0,38.7,75.0,39,33,1610612751,2022
8,2022-12-07,0,113,47.3,31.6,93.8,40,19,1610612758,22200369,126,47.9,42.4,83.3,50,34,1610612749,2022
9,2022-12-07,0,89,37.6,16.7,79.2,39,16,1610612737,22200365,113,44.6,47.2,91.7,55,26,1610612752,2022


**Data Processing**

In [5]:
# Run the scraped games through both processing steps in one pass; the
# displayed result gains PLAYOFF and TARGET columns compared with the scrape output.
df = add_TARGET(process_games(df))
df

Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,PTS_away,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,AST_away,VISITOR_TEAM_ID,SEASON,PLAYOFF,TARGET
0,2022-12-07,1,125,48.5,35.6,84.6,53,29,1610612738,22200373,98,39.8,25.0,84.2,49,19,1610612756,2022,0,1
1,2022-12-07,0,123,45.9,33.3,76.0,42,26,1610612744,22200372,124,53.5,42.9,81.0,40,26,1610612762,2022,0,0
2,2022-12-07,0,111,48.8,29.0,69.0,38,22,1610612764,22200367,115,50.0,43.5,88.2,41,29,1610612741,2022,0,0
3,2022-12-07,0,115,48.1,45.2,75.0,31,22,1610612754,22200370,121,47.3,34.8,92.6,51,25,1610612750,2022,0,0
4,2022-12-07,0,98,38.9,35.7,81.3,42,17,1610612765,22200371,104,45.9,29.6,78.3,50,21,1610612740,2022,0,0
5,2022-12-07,0,102,37.0,36.8,87.0,48,25,1610612760,22200368,123,51.1,36.8,81.3,50,32,1610612763,2022,0,0
6,2022-12-07,0,111,41.1,36.6,69.2,53,26,1610612746,22200363,116,42.1,28.6,90.3,53,20,1610612753,2022,0,0
7,2022-12-07,0,116,53.4,41.7,52.2,40,25,1610612766,22200364,122,57.0,38.7,75.0,39,33,1610612751,2022,0,0
8,2022-12-07,0,113,47.3,31.6,93.8,40,19,1610612758,22200369,126,47.9,42.4,83.3,50,34,1610612749,2022,0,0
9,2022-12-07,0,89,37.6,16.7,79.2,39,16,1610612737,22200365,113,44.6,47.2,91.7,55,26,1610612752,2022,0,0


**Access Feature Store**

In [6]:
# Authenticate against Hopsworks with the API key loaded earlier, then get a
# handle to the project's feature store for the feature-group calls below.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/3350




Connected. Call `.close()` to terminate connection gracefully.


**Access Feature Group**

In [7]:
# Handle to version 1 of the "rolling_stats" feature group.
rolling_stats_fg = fs.get_feature_group(name="rolling_stats", version=1)

In [10]:

# Build a query selecting every feature of the group, execute it, and display
# the resulting rows (the stored history of engineered features).
# NOTE(review): df_query is not used by any later cell shown here — confirm
# whether it is only for inspection or should feed the feature-engineering step.
ds_query = rolling_stats_fg.select_all()
df_query = ds_query.read()
df_query


2022-12-08 08:26:54,516 INFO: USE `nba_predictor_featurestore`
2022-12-08 08:26:54,941 INFO: SELECT `fg0`.`game_date_est` `game_date_est`, `fg0`.`game_id` `game_id`, `fg0`.`home_team_id` `home_team_id`, `fg0`.`visitor_team_id` `visitor_team_id`, `fg0`.`season` `season`, `fg0`.`pts_home` `pts_home`, `fg0`.`fg_pct_home` `fg_pct_home`, `fg0`.`ft_pct_home` `ft_pct_home`, `fg0`.`fg3_pct_home` `fg3_pct_home`, `fg0`.`ast_home` `ast_home`, `fg0`.`reb_home` `reb_home`, `fg0`.`pts_away` `pts_away`, `fg0`.`fg_pct_away` `fg_pct_away`, `fg0`.`ft_pct_away` `ft_pct_away`, `fg0`.`fg3_pct_away` `fg3_pct_away`, `fg0`.`ast_away` `ast_away`, `fg0`.`reb_away` `reb_away`, `fg0`.`home_team_wins` `home_team_wins`, `fg0`.`target` `target`, `fg0`.`month` `month`, `fg0`.`home_team_win_streak` `home_team_win_streak`, `fg0`.`home_team_wins_avg_last_3_home` `home_team_wins_avg_last_3_home`, `fg0`.`home_team_wins_avg_last_7_home` `home_team_wins_avg_last_7_home`, `fg0`.`home_team_wins_avg_last_10_home` `home_team_wi



Unnamed: 0,game_date_est,game_id,home_team_id,visitor_team_id,season,pts_home,fg_pct_home,ft_pct_home,fg3_pct_home,ast_home,...,fg3_pct_avg_last_10_all_x_minus_y,fg3_pct_avg_last_15_all_x_minus_y,ast_avg_last_3_all_x_minus_y,ast_avg_last_7_all_x_minus_y,ast_avg_last_10_all_x_minus_y,ast_avg_last_15_all_x_minus_y,reb_avg_last_3_all_x_minus_y,reb_avg_last_7_all_x_minus_y,reb_avg_last_10_all_x_minus_y,reb_avg_last_15_all_x_minus_y
0,2017-12-08,21700374,1610612759,1610612738,2017,105,0.468994,0.875000,0.295898,16,...,-0.044751,0.008040,-2.666667,-2.571429,-0.8,2.000000,-1.666667,0.857143,1.0,-1.066667
1,2013-03-01,21200874,1610612756,1610612737,2012,92,0.444092,0.833008,0.455078,16,...,-0.099951,-0.097567,-2.666667,-2.285714,-4.5,-4.466667,12.000000,3.857143,4.1,2.933333
2,2005-11-30,20500210,1610612738,1610612755,2005,110,0.447998,0.784180,0.250000,24,...,0.033496,0.041073,3.333333,2.000000,2.1,2.000000,-11.333333,-3.571429,-4.1,-0.533333
3,2018-12-10,21800395,1610612749,1610612739,2018,108,0.437988,0.817871,0.416992,22,...,-0.029150,-0.025781,0.666667,6.714286,7.7,8.266667,6.666667,4.285714,5.2,3.533333
4,2007-03-12,20600946,1610612756,1610612745,2006,103,0.500000,0.727051,0.600098,18,...,0.070865,0.032823,-4.666667,-0.285714,4.1,3.466667,-3.000000,-4.000000,-1.7,-1.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22542,2017-12-10,21700387,1610612754,1610612743,2017,126,0.526855,0.817871,0.399902,24,...,0.031763,0.041113,0.666667,0.142857,2.3,0.600000,-0.333333,-2.142857,-2.3,-2.066667
22543,2015-04-10,21401175,1610612753,1610612761,2014,99,0.460938,1.000000,0.350098,21,...,-0.075330,-0.050407,2.666667,3.142857,3.0,1.400000,3.000000,3.428571,3.2,2.600000
22544,2005-01-15,20400537,1610612745,1610612759,2004,73,0.333008,0.730957,0.399902,15,...,0.008850,-0.006299,8.000000,3.714286,0.8,-0.600000,1.000000,1.857143,-1.4,-0.333333
22545,2012-03-07,21100575,1610612749,1610612741,2011,104,0.477051,0.789062,0.333008,29,...,-0.114600,-0.073340,-2.666667,-1.000000,-3.2,-3.000000,-7.666667,-7.857143,-6.2,-5.133333


**Feature Engineering**

In [None]:
# Feature engineering to add:
#   - rolling averages of key stats
#   - win/lose streaks
#   - home/away streaks
#   - specific matchup (team X vs team Y) rolling averages and streaks
#
# NOTE(review): df at this point holds only the games scraped above (the last
# DAYS days), while the feature group has rolling columns up to 15 games
# (e.g. *_avg_last_15_*). Confirm process_features has enough history here,
# or whether the stored rows need to be combined with df first.

df = process_features(df)
df

**Insert Data back to Feature Group**

In [None]:
# Write the newly engineered rows back to the "rolling_stats" feature group.
# wait_for_job=False presumably lets the call return without blocking on the
# ingestion job — confirm against the Hopsworks insert documentation.
rolling_stats_fg.insert(df, write_options={"wait_for_job" : False})

**Save original feature names to JSON**

In [None]:
def save_feature_names(df, path="feature_names.json"):
    """Save the DataFrame's original column names to a JSON file.

    Hopsworks "sanitizes" feature names by converting them to all lowercase.
    Storing the original names lets them be re-mapped later, for code
    re-usability.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are written out, in column order.
    path : str or os.PathLike, optional
        Destination file. Defaults to "feature_names.json" in the current
        working directory, matching the previous hard-coded behavior.

    Returns
    -------
    str
        The literal message "File Saved.".
    """
    feature_names = df.columns.tolist()
    # Explicit encoding keeps the file identical across platforms.
    with open(path, "w", encoding="utf-8") as fp:
        json.dump(feature_names, fp)

    return "File Saved."

# Persist the current column names of df (original casing) to feature_names.json;
# the returned status string is displayed as the cell output.
save_feature_names(df)
