In [2]:
import polars as pl
import pandas as pd
import nba_api.stats.endpoints as nba_stats
import nba_api.live.nba.endpoints as nba_live
from nba_api.stats.static import players, teams
import time
from pathlib import Path
from tqdm import tqdm
import statistics


In [3]:
# relevant pbp2 columns
pbp2_cols = [
            "EVENTNUM",
            "PLAYER1_ID",
            "PLAYER1_NAME",
            "PLAYER1_TEAM_ID",
            "PLAYER2_ID",
            "PLAYER2_NAME",
            "PLAYER2_TEAM_ID",
            "PLAYER3_ID",
            "PLAYER3_NAME",
            "PLAYER3_TEAM_ID",
            "PERIOD",
            "PCTIMESTRING",
            # "HOMEDESCRIPTION",
            # "NEUTRALDESCRIPTION",
            # "VISITORDESCRIPTION",
            ]

# relevant pbp3 columns
pbp3_cols = [
            "actionNumber",
            "description",
            "actionType",
            "subType",
            "location",
            "scoreHome",
            "scoreAway",
            "shotDistance"
            ]

def createPBPData(g_id : str) -> pl.DataFrame:
    
    try:        
        # pull initial data for pbp tables, need V3 for action description and V2 for player identification
        pbp_3_raw = nba_stats.PlayByPlayV3(game_id=g_id)
        pbp_2_raw = nba_stats.PlayByPlayV2(game_id=g_id)

        # creating filtered pbp2 df
        pbp2_df = pbp_2_raw.get_data_frames()[0]
        pbp2_df_filtered = pl.from_pandas(pbp2_df).select(pbp2_cols)

        # creating filtered pbp3 df
        pbp3_df = pbp_3_raw.get_data_frames()[0]
        pbp3_df_filtered = pl.from_pandas(pbp3_df).select(pbp3_cols).rename({"actionNumber" : "EVENTNUM"})

        # merging two dfs for player and action details
        play_by_play_df = pbp3_df_filtered.join(pbp2_df_filtered, on="EVENTNUM", how="inner").lazy()
        
        df_clean = play_by_play_df.group_by("EVENTNUM").agg([
            pl.col("PERIOD").first(),
            pl.col("PCTIMESTRING").first(),
            pl.col("description").str.to_uppercase().str.concat(";"), # Concatenate all,
            pl.col("actionType").str.to_uppercase().str.concat(";"), # Concatenate all,
            pl.col("subType").str.to_uppercase().str.concat(";"), # Concatenate all,
            
            pl.col("PLAYER1_ID").first(),  # first P1 ID
            pl.col("PLAYER1_TEAM_ID").first(),  # first P1 team ID
            
            pl.col("PLAYER2_ID").last(), # last P2 ID
            pl.col("PLAYER2_TEAM_ID").first(),  # first P2 team ID
            
            pl.col("PLAYER3_ID").last(), # last P3 ID
            pl.col("PLAYER3_TEAM_ID").first(),  # first P3 team ID

            pl.col("scoreHome").max(), # post-action,
            pl.col("scoreAway").max(), # post-action,
            pl.col("shotDistance").max() # post-action
        ]).sort("EVENTNUM")
        
        pbp_downcast = df_clean.with_columns(
                                                    pl.col("PLAYER1_TEAM_ID").cast(pl.Int32),
                                                    pl.col("PLAYER2_TEAM_ID").cast(pl.Int32),
                                                    pl.col("PLAYER3_TEAM_ID").cast(pl.Int32)
                                                    )

        
        pbp_df_final = pbp_downcast.collect()
        return pbp_df_final
    except Exception as e:
        # print(f"ERROR PROCESSING GAME - {g_id}: {e} ; PROCEEDING TO NEXT")
        return -1

In [8]:
def process_group(group):
    home_id = None
    result = None
    
    at_sym_check = False
    for row in group.itertuples():
        matchup_str = row.MATCHUP
        if "@" in matchup_str:
            if at_sym_check:
                home_id = None
            else:
                at_sym_check = True
        else:
            home_id = row.TEAM_ID
            result = 0 if row.WL == "L" else 1                
    
    # Return a Series or DataFrame with your new columns
    return pd.Series({
        'HOME_ID': home_id,
        'RESULT': result
    })

game_details = nba_stats.leaguegamefinder.LeagueGameFinder(season_nullable="2022-23", season_type_nullable="Regular Season", league_id_nullable="00").get_data_frames()[0]
g_dict = game_details.groupby("GAME_ID").apply(process_group).to_dict(orient="index")


  g_dict = game_details.groupby("GAME_ID").apply(process_group).to_dict(orient="index")


In [9]:
#  g_dict = {
#         '0022400001': {'HOME_ID': 1610612738.0, 'RESULT': 0.0}, 
#         # '0022400002': {'HOME_ID': 1610612765.0, 'RESULT': 1.0}, 
#         # '0022400003': {'HOME_ID': 1610612753.0, 'RESULT': 1.0}, 
#         # '0022400004': {'HOME_ID': 1610612755.0, 'RESULT': 0.0},
            # '0022400594' : {'HOME_ID' : 1610612749.0, 'RESULT' : 1.0},
    #  }


cols_to_train = ["PERIOD", "PCTIMESTRING", "scoreHome", "scoreAway", "PLAY_ACTION", "HOME_AWAY_BOOL", "RESULT"]
processed_games_path = "processed_games_22.txt"
training_output_path = "training_data_22.csv"

'''

STL = 1
BLK = 2
MAKE = 3
MISS = 4
TURNOVER = 5
FOUL = 6
FT_MAKE = 7
FT_MISS = 8
REB = 9
TECH_FOUL = 10

'''


def createTrainingData(games_dict):
    
    err_count = 0 # need to sleep after every 10   
    processed_games = set()
    if Path(processed_games_path).exists():
        with open(processed_games_path, 'r') as f:
            processed_games = set(f.read().splitlines())
    
    
    pbar = tqdm(games_dict.items(), desc="CREATING TRAINING DATA CSV", total=len(games_dict), position=0, leave=True)
    for game_id, values in pbar:
        if game_id in processed_games:
            continue
        home_team = values.get("HOME_ID")
        result = values.get("RESULT")
        pbp_df = createPBPData(game_id)
        if not isinstance(pbp_df, pl.DataFrame):
            pbar.write(f"ERROR GETTING PBP DATA FOR GAME: {game_id}")
            err_count += 1
            if err_count > 2:
                print("RELOADING DATA_CREATION")
                return -1
            continue
        try:
            pbp_df = pbp_df.with_columns(
                
                pl.when(pl.col("PLAYER1_TEAM_ID") == home_team)
                .then(1)
                .otherwise(0)
                .cast(pl.Int8)
                .alias("HOME_AWAY_BOOL")
                ,
                
                pl.when(pl.col("actionType").str.contains("MISS"))
                .then(
                    pl.when(pl.col("actionType").str.contains(";"))
                    .then(2)
                    .otherwise(4)
                )
                .when(pl.col("actionType").str.contains("FREE"))
                .then(
                    pl.when(pl.col("description").str.contains("MISS"))
                    .then(7)
                    .otherwise(8)
                )
                .when(pl.col("actionType").str.contains("TURNOVER"))
                .then(
                    pl.when(pl.col("actionType").str.contains(";"))
                    .then(1)
                    .otherwise(5)
                )
                
                .when(pl.col("actionType").str.contains("FOUL"))
                .then(
                    pl.when(pl.col("subType").str.contains("TECH"))
                    .then(10)
                    .when(pl.col("subType").str.contains("OFFENSIVE"))
                    .then(5)
                    .otherwise(6)
                )
                
                .when(pl.col("actionType").str.contains("REBOUND"))
                .then(9)
                
                .when(pl.col("actionType").str.contains("MADE"))
                .then(3)
                
                .alias("PLAY_ACTION").cast(pl.Int8),



                pl.col("scoreHome").replace("", None).cast(pl.Int16).forward_fill(),
                pl.col("scoreAway").replace("", None).cast(pl.Int16).forward_fill(),
                
                pl.lit(result).alias("RESULT")
                
            ).select(cols_to_train).drop_nulls()
            
            if Path(training_output_path).exists():
                with open(training_output_path, 'a+', encoding='utf-8') as f:
                    pbp_df.write_csv(f, include_header=False)
            else:
                pbp_df.write_csv(training_output_path)
            
            with open(processed_games_path, 'a') as f:
                f.write(f"{game_id}\n")
                
            processed_games.add(game_id)
            
        except Exception as e:
            pbar.postfix(f"ERROR CREATING TRAINING DF FOR GAME: {game_id} ;  ERROR MESSAGE: {e}")
            
        # count += 1
        # if (count % 20) == 0:
        #     pbar.write(f"RAN ON 20 GAMES ; NOW HAS DATA FOR {len(processed_games)} GAMES")
        
    return 0

In [1]:
# x = createTrainingData(g_dict)
# absolute gross way to avoid the retry timeout error (443 I think)
# brute force baby
#
games_completed = False
while not games_completed:
    x = createTrainingData(g_dict)
    if not x:
        games_completed = True
    elif x == -1:
        continue

CREATING TRAINING DATA CSV:   1%|          | 13/1230 [00:03<04:35,  4.42it/s]