In [13]:
!pip install selenium webdriver-manager lxml


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import json
import os
from datetime import datetime, timedelta
from pathlib import Path  # for Windows/Linux compatibility

import hopsworks
import numpy as np
import pandas as pd
from pytz import timezone

from src.data_processing import (
    process_games,
    add_TARGET,
)
from src.feature_engineering import (
    fix_datatypes,
    add_date_features,
    remove_playoff_games,
    add_rolling_home_visitor,
    process_games_consecutively,
    add_matchups,
    add_past_performance_all,
    combine_new_features,
    process_x_minus_y,
    remove_non_rolling,
    process_features,
)
from src.webscraping import (
    activate_web_driver,
    scrape_to_dataframe,
    convert_columns,
    combine_home_visitor,
)

# Data directory, expressed relative to the working directory.
DATAPATH = Path("data")

In [2]:
# The Hopsworks API key is read from the environment so it is never stored in the notebook.
try:
    HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']
except KeyError as err:
    # Only a missing variable is expected here; the original KeyError is kept as the cause.
    raise Exception('Set environment variable HOPSWORKS_API_KEY') from err

### Scrape Data and Format

In [3]:
# set search strings for the last DAYS days
DAYS = 3
SEASON = ""  # no season will cause website to default to current season, format is "2022-23"
# nba.com uses US Eastern time. "America/New_York" follows daylight saving,
# whereas the fixed-offset "EST" zone is an hour behind while daylight saving is in effect.
TODAY = datetime.now(timezone('America/New_York'))
LASTWEEK = TODAY - timedelta(days=DAYS)  # start of the search window (DAYS back, despite the name)
DATETO = TODAY.strftime("%m/%d/%y")
DATEFROM = LASTWEEK.strftime("%m/%d/%y")

# initiate a webdriver in selenium
# since website data is dynamically generated
driver = activate_web_driver()
try:
    df = scrape_to_dataframe(driver, Season=SEASON, DateFrom=DATEFROM, DateTo=DATETO)
finally:
    # quit() ends the whole browser session, and the finally block makes sure
    # that happens even when scraping raises.
    driver.quit()

df = convert_columns(df)
df = combine_home_visitor(df)


2022-11-21 13:19:05,548 INFO: Get LATEST chromedriver version for google-chrome 107.0.5304
2022-11-21 13:19:06,751 INFO: Driver [/Users/paulabartabajo/.wdm/drivers/chromedriver/mac64/107.0.5304/chromedriver] found in cache


In [4]:
# Preview the first 10 rows of the scraped and reshaped games.
df.head(10)

Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,PTS_away,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,AST_away,VISITOR_TEAM_ID,SEASON
0,2022-11-20,0,92,39.0,20.6,30.0,44,26,1610612759,22200247,123,52.9,35.3,87.5,54,29,1610612747,2022
1,2022-11-20,1,98,47.4,33.3,78.9,44,22,1610612743,22200246,97,44.9,35.7,57.1,36,24,1610612742,2022
2,2022-11-20,1,127,53.5,51.1,73.3,40,38,1610612744,22200245,120,45.6,33.3,85.2,45,24,1610612745,2022
3,2022-11-20,0,115,43.4,32.4,54.8,44,28,1610612763,22200243,127,60.2,47.1,73.3,45,33,1610612751,2022
4,2022-11-20,0,87,38.8,19.4,86.4,30,17,1610612748,22200244,113,53.0,35.1,75.0,49,31,1610612739,2022
5,2022-11-20,0,129,52.8,51.7,87.0,35,20,1610612765,22200242,137,55.3,34.3,88.6,41,29,1610612758,2022
6,2022-11-20,0,95,40.0,28.0,80.0,39,20,1610612752,22200240,116,42.6,43.6,72.2,60,35,1610612756,2022
7,2022-11-20,0,102,39.0,25.0,69.6,51,19,1610612766,22200241,106,43.0,38.2,72.2,49,19,1610612764,2022
8,2022-11-19,0,97,41.4,26.9,85.7,43,22,1610612759,22200239,119,54.3,52.5,52.6,35,32,1610612746,2022
9,2022-11-19,1,112,47.1,33.3,83.3,43,28,1610612750,22200237,109,47.9,43.8,84.4,39,21,1610612755,2022


### Data Processing

In [5]:
# Run the games through process_games, then add_TARGET, in a single step.
df = add_TARGET(process_games(df))

In [6]:
# Inspect which columns are present at this point in the pipeline.
df.columns

Index(['GAME_DATE_EST', 'HOME_TEAM_WINS', 'PTS_home', 'FG_PCT_home',
       'FG3_PCT_home', 'FT_PCT_home', 'REB_home', 'AST_home', 'HOME_TEAM_ID',
       'GAME_ID', 'PTS_away', 'FG_PCT_away', 'FG3_PCT_away', 'FT_PCT_away',
       'REB_away', 'AST_away', 'VISITOR_TEAM_ID', 'SEASON', 'PLAYOFF',
       'TARGET'],
      dtype='object')

### Feature Engineering

In [7]:
# Feature engineering is delegated to src.feature_engineering.process_features,
# which is imported in the notebook's import cell.
df = process_features(df)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

### Add Data to Feature Store

In [8]:
# When the account has several projects, hopsworks.login() stops at an interactive
# prompt, which blocks unattended runs. Setting HOPSWORKS_PROJECT (for example
# "NBA_predictor") picks the project up front; when it is unset, None is passed
# and the login behaves exactly as before.
project = hopsworks.login(
    project=os.environ.get("HOPSWORKS_PROJECT"),
    api_key_value=HOPSWORKS_API_KEY,
)
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Multiple projects found. 

	 (1) paulescu
	 (2) NBA_predictor

Enter project to access: 2

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/3350




Connected. Call `.close()` to terminate connection gracefully.


In [9]:
# Fetch version 1 of the "rolling_stats_pau" feature group from the feature store.
rolling_stats_fg = fs.get_feature_group(name="rolling_stats_pau", version=1)

# Upload the engineered rows; "wait_for_job": False is passed as a write option.
rolling_stats_fg.insert(
    df,
    write_options={"wait_for_job": False},
)



Uploading Dataframe: 0.00% |          | Rows 0/24 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/3350/jobs/named/rolling_stats_pau_1_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x129a06af0>, None)

In [10]:
def save_feature_names(df, path="feature_names.json"):
    """Write the column names of ``df`` to a JSON file.

    Hopsworks "sanitizes" feature names by converting them to all lowercase.
    Saving the original names lets them be re-mapped later, for code
    reusability.

    Args:
        df: DataFrame whose column names are saved, in column order.
        path: Destination file. Defaults to "feature_names.json" in the
            current working directory.

    Returns:
        The string "File Saved." once the file has been written.
    """
    feature_names = df.columns.tolist()
    with open(path, "w", encoding="utf-8") as fp:
        json.dump(feature_names, fp)

    return "File Saved."

# Save the original (case-preserving) column names so they can be re-mapped later.
save_feature_names(df)


'File Saved.'