In [1]:
import os

import pandas as pd
import numpy as np

import hopsworks

from datetime import datetime, timedelta
from pytz import timezone

from src.webscraping import (
    activate_web_driver,
    scrape_to_dataframe,
    convert_columns,
    combine_home_visitor,  
)

from src.data_processing import (
    process_games,
    add_TARGET,
)

from src.feature_engineering import (
    fix_datatypes,
    add_date_features,
    remove_playoff_games,
    add_rolling_home_visitor,
    process_games_consecutively,
    add_matchups,
    add_past_performance_all,
    combine_new_features,
    process_x_minus_y,  
    remove_non_rolling,
    process_features,
)

import json



from pathlib import Path  #for Windows/Linux compatibility
# Data directory, expressed as a relative Path so it works on both Windows and Linux.
# NOTE(review): DATAPATH is not referenced by any cell visible here - confirm it is still needed.
DATAPATH = Path(r'data')

**Load API keys**

In [2]:
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment before the key is looked up.
load_dotenv()

try:
    HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']
except KeyError as err:
    # Only a missing variable is translated into the setup hint; a bare
    # `except:` would also swallow KeyboardInterrupt/SystemExit.
    raise Exception('Set environment variable HOPSWORKS_API_KEY') from err

**Scrape Data and Format**

In [3]:

# Search window: the last DAYS days, ending today.
DAYS = 7
SEASON = ""  # empty string makes the website default to the current season; format is "2022-23"

# The dates are built in US Eastern time (the time zone nba.com is noted to use).
# 'US/Eastern' follows daylight saving; the fixed-offset 'EST' zone is an hour
# behind during DST and can produce yesterday's date shortly after midnight.
TODAY = datetime.now(timezone('US/Eastern'))
LASTWEEK = TODAY - timedelta(days=DAYS)
DATETO = TODAY.strftime("%m/%d/%y")
DATEFROM = LASTWEEK.strftime("%m/%d/%y")

# Initiate a webdriver in selenium, since the website data is dynamically generated.
driver = activate_web_driver('firefox')
try:
    df = scrape_to_dataframe(driver, Season=SEASON, DateFrom=DATEFROM, DateTo=DATETO)
finally:
    # Always end the browser session, even when scraping raises; quit() also
    # shuts down the driver process, which close() alone does not guarantee.
    driver.quit()

# Post-process the scraped table with the project's helpers.
df = convert_columns(df)
df = combine_home_visitor(df)

df

2022-12-04 11:29:28,463 INFO: Get LATEST geckodriver version for 107.0 firefox


[WDM] - Downloading: 19.0kB [00:00, 9.77MB/s]                   


2022-12-04 11:29:29,131 INFO: Getting latest mozilla release info for v0.32.0
2022-12-04 11:29:29,395 INFO: About to download new driver from https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-win64.zip


[WDM] - Downloading: 100%|██████████| 1.58M/1.58M [00:00<00:00, 4.24MB/s]


2022-12-04 11:29:30,684 INFO: Driver has been saved in cache [C:\Users\Chris\.wdm\drivers\geckodriver\win64\0.32]




Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,PTS_away,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,AST_away,VISITOR_TEAM_ID,SEASON
0,2022-12-03,1,116,48.2,40.0,77.4,49,19,1610612757,22200343,111,45.2,36.1,78.6,44,20,1610612762,2022
1,2022-12-03,1,121,43.0,39.3,78.6,50,27,1610612742,22200337,100,45.5,32.3,71.4,45,18,1610612752,2022
2,2022-12-03,0,108,48.7,39.3,80.8,29,23,1610612753,22200340,121,56.0,31.0,72.0,43,31,1610612761,2022
3,2022-12-03,1,123,51.7,34.4,66.7,46,31,1610612758,22200338,96,35.5,27.3,80.8,50,15,1610612746,2022
4,2022-12-03,1,135,51.6,42.3,80.0,42,26,1610612760,22200341,128,52.8,33.3,80.0,44,25,1610612750,2022
5,2022-12-03,0,101,44.0,10.7,90.0,45,17,1610612745,22200342,120,52.9,48.1,50.0,35,34,1610612744,2022
6,2022-12-03,1,105,47.0,37.5,78.9,47,24,1610612749,22200339,96,39.1,20.8,88.5,44,15,1610612766,2022
7,2022-12-02,0,109,44.0,42.4,88.2,42,23,1610612755,22200332,117,43.3,36.7,78.6,57,22,1610612763,2022
8,2022-12-02,0,96,49.4,34.8,66.7,40,19,1610612753,22200330,107,48.8,38.1,100.0,33,22,1610612739,2022
9,2022-12-02,1,120,47.5,31.4,72.2,50,19,1610612748,22200328,116,46.0,33.3,75.0,48,26,1610612738,2022


### Data Processing

In [4]:
# Apply the two processing steps in order: process_games first,
# then add_TARGET on its result.
df = add_TARGET(process_games(df))
df

Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,PTS_away,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,AST_away,VISITOR_TEAM_ID,SEASON,PLAYOFF,TARGET
0,2022-12-03,1,116,48.2,40.0,77.4,49,19,1610612757,22200343,111,45.2,36.1,78.6,44,20,1610612762,2022,0,1
1,2022-12-03,1,121,43.0,39.3,78.6,50,27,1610612742,22200337,100,45.5,32.3,71.4,45,18,1610612752,2022,0,1
2,2022-12-03,0,108,48.7,39.3,80.8,29,23,1610612753,22200340,121,56.0,31.0,72.0,43,31,1610612761,2022,0,0
3,2022-12-03,1,123,51.7,34.4,66.7,46,31,1610612758,22200338,96,35.5,27.3,80.8,50,15,1610612746,2022,0,1
4,2022-12-03,1,135,51.6,42.3,80.0,42,26,1610612760,22200341,128,52.8,33.3,80.0,44,25,1610612750,2022,0,1
5,2022-12-03,0,101,44.0,10.7,90.0,45,17,1610612745,22200342,120,52.9,48.1,50.0,35,34,1610612744,2022,0,0
6,2022-12-03,1,105,47.0,37.5,78.9,47,24,1610612749,22200339,96,39.1,20.8,88.5,44,15,1610612766,2022,0,1
7,2022-12-02,0,109,44.0,42.4,88.2,42,23,1610612755,22200332,117,43.3,36.7,78.6,57,22,1610612763,2022,0,0
8,2022-12-02,0,96,49.4,34.8,66.7,40,19,1610612753,22200330,107,48.8,38.1,100.0,33,22,1610612739,2022,0,0
9,2022-12-02,1,120,47.5,31.4,72.2,50,19,1610612748,22200328,116,46.0,33.3,75.0,48,26,1610612738,2022,0,1


### Feature Engineering

In [5]:
# Feature engineering adds:
#   - rolling averages of key stats
#   - win/lose streaks
#   - home/away streaks
#   - specific matchup (team X vs team Y) rolling averages and streaks

df = process_features(df)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


HOME_TEAM_ID
['GAME_DATE_EST', 'HOME_TEAM_WINS', 'PTS_home', 'FG_PCT_home', 'FG3_PCT_home', 'FT_PCT_home', 'REB_home', 'AST_home', 'HOME_TEAM_ID', 'GAME_ID', 'PTS_away', 'FG_PCT_away', 'FG3_PCT_away', 'FT_PCT_away', 'REB_away', 'AST_away', 'VISITOR_TEAM_ID', 'SEASON', 'TARGET', 'MONTH', 'HOME_TEAM_WIN_STREAK', 'HOME_TEAM_WINS_AVG_LAST_3_HOME', 'HOME_TEAM_WINS_AVG_LAST_7_HOME', 'HOME_TEAM_WINS_AVG_LAST_10_HOME', 'HOME_PTS_home_AVG_LAST_3_HOME', 'HOME_PTS_home_AVG_LAST_7_HOME', 'HOME_PTS_home_AVG_LAST_10_HOME', 'HOME_FG_PCT_home_AVG_LAST_3_HOME', 'HOME_FG_PCT_home_AVG_LAST_7_HOME', 'HOME_FG_PCT_home_AVG_LAST_10_HOME', 'HOME_FT_PCT_home_AVG_LAST_3_HOME', 'HOME_FT_PCT_home_AVG_LAST_7_HOME', 'HOME_FT_PCT_home_AVG_LAST_10_HOME', 'HOME_FG3_PCT_home_AVG_LAST_3_HOME', 'HOME_FG3_PCT_home_AVG_LAST_7_HOME', 'HOME_FG3_PCT_home_AVG_LAST_10_HOME', 'HOME_AST_home_AVG_LAST_3_HOME', 'HOME_AST_home_AVG_LAST_7_HOME', 'HOME_AST_home_AVG_LAST_10_HOME', 'HOME_REB_home_AVG_LAST_3_HOME', 'HOME_REB_home_AVG_LAS

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,GAME_DATE_EST,HOME_TEAM_ID,GAME_ID,VISITOR_TEAM_ID,SEASON,TARGET,MONTH,HOME_TEAM_WIN_STREAK,HOME_TEAM_WINS_AVG_LAST_3_HOME,HOME_TEAM_WINS_AVG_LAST_7_HOME,...,FG3_PCT_AVG_LAST_10_ALL_x_minus_y,FG3_PCT_AVG_LAST_15_ALL_x_minus_y,AST_AVG_LAST_3_ALL_x_minus_y,AST_AVG_LAST_7_ALL_x_minus_y,AST_AVG_LAST_10_ALL_x_minus_y,AST_AVG_LAST_15_ALL_x_minus_y,REB_AVG_LAST_3_ALL_x_minus_y,REB_AVG_LAST_7_ALL_x_minus_y,REB_AVG_LAST_10_ALL_x_minus_y,REB_AVG_LAST_15_ALL_x_minus_y
0,2022-11-27,1610612748,22200293,1610612737,2022,1,11,1,,,...,,,,,,,,,,
1,2022-11-27,1610612757,22200290,1610612751,2022,0,11,1,,,...,,,,,,,,,,
2,2022-11-27,1610612755,22200297,1610612753,2022,1,11,-1,,,...,,,,,,,,,,
3,2022-11-27,1610612744,22200291,1610612750,2022,1,11,-1,,,...,,,,,,,,,,
4,2022-11-27,1610612742,22200298,1610612749,2022,0,11,-1,,,...,,,,,,,,,,
5,2022-11-27,1610612754,22200292,1610612746,2022,0,11,-1,,,...,,,,,,,,,,
6,2022-11-27,1610612763,22200296,1610612752,2022,1,11,-1,,,...,,,,,,,,,,
7,2022-11-27,1610612764,22200294,1610612738,2022,0,11,-1,,,...,,,,,,,,,,
8,2022-11-27,1610612739,22200295,1610612765,2022,1,11,1,,,...,,,,,,,,,,
9,2022-11-28,1610612741,22200306,1610612762,2022,1,11,1,,,...,,,,,,,,,,


### Add Data to Feature Store

In [6]:
# Authenticate against Hopsworks with the API key read from the environment,
# then obtain the project's feature store handle used by the following cells.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/3350




Connected. Call `.close()` to terminate connection gracefully.


In [7]:
# Fetch version 1 of the "rolling_stats" feature group from the feature store.
rolling_stats_fg = fs.get_feature_group(name="rolling_stats", version=1)

# Upload the engineered rows.
# NOTE(review): "wait_for_job": False presumably returns without waiting for
# the offline backfill job to finish - confirm against the Hopsworks docs.
rolling_stats_fg.insert(
    df,
    write_options={"wait_for_job": False},
)



Uploading Dataframe: 0.00% |          | Rows 0/54 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/3350/jobs/named/rolling_stats_1_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x20f1a1f1e20>, None)

In [8]:
def save_feature_names(df):
    """Write the DataFrame's column names to ``feature_names.json``.

    Hopsworks "sanitizes" feature names by converting them to all lowercase,
    so the original spellings are saved here and can be re-mapped later
    (for code reusability).

    Args:
        df: DataFrame whose column labels are saved.

    Returns:
        The string "File Saved." once the file has been written.
    """
    original_columns = df.columns.tolist()

    # The file name is relative, so it lands in the current working directory.
    with open("feature_names.json", "w") as out_file:
        out_file.write(json.dumps(original_columns))

    return "File Saved."

# Persist the original (mixed-case) column names of the engineered feature frame.
save_feature_names(df)


'File Saved.'