In [10]:
import os

import pandas as pd
import numpy as np

import hopsworks

from datetime import datetime, timedelta
from pytz import timezone

from src.webscraping import (
    activate_web_driver,
    scrape_to_dataframe,
    convert_columns,
    combine_home_visitor,  
)

from src.data_processing import (
    process_games,
    add_TARGET,
)

from src.feature_engineering import (
    fix_datatypes,
    add_date_features,
    remove_playoff_games,
    add_rolling_home_visitor,
    process_games_consecutively,
    add_matchups,
    add_past_performance_all,
    combine_new_features,
    process_x_minus_y,  
    remove_non_rolling,
    process_features,
)

import json



from pathlib import Path  # pathlib keeps path handling portable across Windows and Linux
DATAPATH = Path(r'data')  # relative directory named 'data' (resolved against the working directory)

### Load API Keys

In [11]:
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()

try:
    HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']
except KeyError as err:
    # Only a missing variable is expected here. A bare `except:` would also
    # trap KeyboardInterrupt/SystemExit and hide the original cause.
    raise Exception('Set environment variable HOPSWORKS_API_KEY') from err

### Scrape Data and Format

In [12]:

# Set search strings for the last DAYS days, ending today.
DAYS = 7
SEASON = ""  # no season will cause website to default to current season, format is "2022-23"

# nba.com uses US Eastern time. pytz's 'EST' zone is a fixed UTC-5 offset that
# never switches to daylight time, so use the DST-aware 'US/Eastern' zone to
# get the correct calendar date all year round.
TODAY = datetime.now(timezone('US/Eastern'))
LASTWEEK = TODAY - timedelta(days=DAYS)
DATETO = TODAY.strftime("%m/%d/%y")
DATEFROM = LASTWEEK.strftime("%m/%d/%y")

# Initiate a webdriver in selenium,
# since the website data is dynamically generated.
driver = activate_web_driver('chromium')
try:
    df = scrape_to_dataframe(driver, Season=SEASON, DateFrom=DATEFROM, DateTo=DATETO)
finally:
    # Always release the browser, even when scraping raises. quit() ends the
    # whole WebDriver session, whereas close() only closes the current window.
    driver.quit()

df = convert_columns(df)
df = combine_home_visitor(df)

df

2022-12-04 11:00:55,197 INFO: Get LATEST chromedriver version for chromium 108.0.5359
2022-12-04 11:00:55,599 INFO: Driver [/home/cmunch1/.wdm/drivers/chromedriver/linux64/108.0.5359/chromedriver] found in cache


WebDriverException: Message: unknown error: DevToolsActivePort file doesn't exist
Stacktrace:
#0 0x558b4f2412a3 <unknown>
#1 0x558b4effff77 <unknown>
#2 0x558b4f02bfc4 <unknown>
#3 0x558b4f027b0c <unknown>
#4 0x558b4f0247d0 <unknown>
#5 0x558b4f0650b7 <unknown>
#6 0x558b4f064a5f <unknown>
#7 0x558b4f05c903 <unknown>
#8 0x558b4f02fece <unknown>
#9 0x558b4f030fde <unknown>
#10 0x558b4f29163e <unknown>
#11 0x558b4f294b79 <unknown>
#12 0x558b4f27789e <unknown>
#13 0x558b4f295a83 <unknown>
#14 0x558b4f26a505 <unknown>
#15 0x558b4f2b6ca8 <unknown>
#16 0x558b4f2b6e36 <unknown>
#17 0x558b4f2d2333 <unknown>
#18 0x7ff92707bb43 <unknown>


### Data Processing

In [None]:
# Pipe df through process_games first, then add_TARGET, and display the result.
df = add_TARGET(process_games(df))
df

### Feature Engineering

In [None]:
# Feature engineering (per the author's notes, process_features adds):
#   - rolling averages of key stats
#   - win/lose streaks
#   - home/away streaks
#   - specific matchup (team X vs team Y) rolling averages and streaks

df = process_features(df)
df

### Add Data to Feature Store

In [None]:
# Log in to Hopsworks with the API key read from the environment,
# then obtain the project's feature store handle for the cells below.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

In [None]:
# Look up version 1 of the "rolling_stats" feature group.
rolling_stats_fg = fs.get_feature_group(name="rolling_stats", version=1)

# Insert the engineered rows; with wait_for_job=False the call presumably
# returns without waiting for the ingestion job (confirm in the Hopsworks docs).
rolling_stats_fg.insert(
    df,
    write_options={"wait_for_job": False},
)

In [None]:
def save_feature_names(df, path="feature_names.json"):
    """Save the column names of ``df`` to a JSON file.

    Hopsworks "sanitizes" feature names by converting them to all lowercase.
    Writing the original names to disk lets them be re-mapped later, for code
    re-usability.

    Args:
        df: Object exposing ``columns.tolist()`` (e.g. a pandas DataFrame).
        path: Destination file. Defaults to "feature_names.json", resolved
            against the current working directory. An existing file is
            overwritten.

    Returns:
        The string "File Saved." once the file has been written.
    """
    feature_names = df.columns.tolist()
    with open(path, "w") as fp:
        json.dump(feature_names, fp)

    return "File Saved."

# Write df's column names to feature_names.json; the returned status string is shown as the cell output.
save_feature_names(df)
