In [2]:
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import leaguegamelog
from nba_api.stats.endpoints import boxscoreplayertrackv2
from nba_api.stats.endpoints import teamgamelog
from nba_api.stats.endpoints import playergamelogs
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.static import teams
from json import JSONDecodeError
import pandas as pd
import numpy as np
import random
import requests
import math
import time

In [3]:
def str_to_mins(inp):
    lst = inp.split(":")
    sec = int(lst[0])*60 + int(lst[1])
    return sec

In [26]:
games = pd.read_csv("all_games_since_1997.csv")
boxscores = pd.read_csv("all_boxscores_concatenated.csv")
season_stats_per_poss = pd.read_csv("player-seasons_per_possession.csv")
season_stats_per_poss = pd.read_csv("player-seasons_per_possession.csv")
playerinfo = pd.read_csv("commonplayerinfo.csv")
playernames = list(pd.read_csv("relevant_players.csv")["playernames"])

In [27]:
playernames_set = set(playernames)
list_all_game_ids = games[games['SEASON_ID'] > 21997]["GAME_ID"].unique()

In [28]:
games.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M',
       'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'VIDEO_AVAILABLE'],
      dtype='object')

In [29]:
boxscores.columns

Index(['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'PLAYER_ID',
       'PLAYER_NAME', 'START_POSITION', 'COMMENT', 'MIN', 'SPD', 'DIST',
       'ORBC', 'DRBC', 'RBC', 'TCHS', 'SAST', 'FTAST', 'PASS', 'AST', 'CFGM',
       'CFGA', 'CFG_PCT', 'UFGM', 'UFGA', 'UFG_PCT', 'FG_PCT', 'DFGM', 'DFGA',
       'DFG_PCT'],
      dtype='object')

In [30]:
season_stats.columns

Index(['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'AGE', 'GP',
       'W', 'L', 'W_PCT', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'TOV',
       'STL', 'BLK', 'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS',
       'NBA_FANTASY_PTS', 'DD2', 'TD3', 'GP_RANK', 'W_RANK', 'L_RANK',
       'W_PCT_RANK', 'MIN_RANK', 'FGM_RANK', 'FGA_RANK', 'FG_PCT_RANK',
       'FG3M_RANK', 'FG3A_RANK', 'FG3_PCT_RANK', 'FTM_RANK', 'FTA_RANK',
       'FT_PCT_RANK', 'OREB_RANK', 'DREB_RANK', 'REB_RANK', 'AST_RANK',
       'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK', 'PFD_RANK',
       'PTS_RANK', 'PLUS_MINUS_RANK', 'NBA_FANTASY_PTS_RANK', 'DD2_RANK',
       'TD3_RANK', 'CFID', 'CFPARAMS', 'SEASON'],
      dtype='object')

In [31]:
playerinfo.columns

Index(['PERSON_ID', 'FIRST_NAME', 'LAST_NAME', 'DISPLAY_FIRST_LAST',
       'DISPLAY_LAST_COMMA_FIRST', 'DISPLAY_FI_LAST', 'PLAYER_SLUG',
       'BIRTHDATE', 'SCHOOL', 'COUNTRY', 'LAST_AFFILIATION', 'HEIGHT',
       'WEIGHT', 'SEASON_EXP', 'JERSEY', 'POSITION', 'ROSTERSTATUS', 'TEAM_ID',
       'TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CODE', 'TEAM_CITY',
       'PLAYERCODE', 'FROM_YEAR', 'TO_YEAR', 'DLEAGUE_FLAG', 'NBA_FLAG',
       'GAMES_PLAYED_FLAG', 'DRAFT_YEAR', 'DRAFT_ROUND', 'DRAFT_NUMBER'],
      dtype='object')

In [32]:
player_info_dict = {k:dict() for k in playernames_set}
for playername in player_info_dict.keys():
    season_infos = season_stats[season_stats["PLAYER_NAME"] == playername]
    player_info_dict[playername] = dict()
    #first, store the ones that don't change from season to season
    common_info =  playerinfo[playerinfo["DISPLAY_FIRST_LAST"] == playername].reset_index()
    
    try:
        ht_lst = common_info.at[0,"HEIGHT"].split("-")
    except AttributeError:
        ht_lst = [6, 6]
        
    try:
        pick = int(common_info.at[0,"DRAFT_NUMBER"])        
    except:
        pick = 61
        
    first_year = int(common_info.at[0,"FROM_YEAR"])     
    player_info_dict[playername]["seasons_available"] = set()
    for row in season_infos.itertuples():
        player_info_dict[playername]["seasons_available"].add(row.SEASON)
        player_info_dict[playername][row.SEASON] = dict()
        player_season = player_info_dict[playername][row.SEASON]
        
        player_season["FG3M"] = row.FG3M
        player_season["FG3A"] = row.FG3A
        player_season["FGM"] = row.FGM
        player_season["FGA"] = row.FGA
        player_season["FTM"] = row.FTM
        player_season["FTA"] = row.FTA
        player_season["OREB"] = row.OREB
        player_season["DREB"] = row.DREB
        player_season["AST"] = row.AST
        player_season["TOV"] = row.TOV
        player_season["BLK"] = row.BLK
        player_season["STL"] = row.STL
        player_season["BLKA"] = row.BLKA
        player_season["PF"] = row.PF
        player_season["PFD"] = row.PFD
        player_season["AGE"] = row.AGE
        player_season["HEIGHT"] = int(ht_lst[0])*12 + int(ht_lst[1]) 
        
        #now, we put in the pick # and the rookie tag + undrafted tag
        player_season["PICK"] = pick
        player_season["UNDRAFTED"] = 0 if pick != 61 else 1
        player_season["ROOKIE"] = 1 if first_year == row.SEASON else 0

In [33]:
"""this will generate a dictionary that maps game ids to a lot of information about the game. From game id, it maps to
home and away, which maps to all the different (relevant (relevant means played 3 minutes in a game at some point in their
career)) players, which then maps to the second they played in that game + their stats from the previous season
"""
game_info_dict = {k:{"home":dict(), "away":dict()} for k in list_all_game_ids}
for game_id in list_all_game_ids:
    relevant_games = games[(games['GAME_ID'] == game_id)]
    relevant_boxscores = boxscores[(boxscores['GAME_ID'] == game_id)]
    relevant_games.reset_index(drop=True, inplace=True)
    current_season = int(str(relevant_games.loc[0, "SEASON_ID"])[1:])
    
    # second step.... add the appropriate team id to home vs away
    if "@" in relevant_games.loc[0, "MATCHUP"]:
        away_team_row = 0
        home_team_row = 1      
    else:
        away_team_row = 1
        home_team_row = 0
        
    game_info_dict[game_id]["winner"] = relevant_games.loc[0, "TEAM_ID"] if relevant_games.loc[0, "WL"] == "W" else relevant_games.loc[1, "TEAM_ID"]
    game_info_dict[game_id]["away"]["team_id"] = relevant_games.loc[away_team_row, "TEAM_ID"]
    away_team_id = relevant_games.loc[away_team_row, "TEAM_ID"]
    game_info_dict[game_id]["home"]["team_id"] = relevant_games.loc[home_team_row, "TEAM_ID"]
    home_team_id = relevant_games.loc[home_team_row, "TEAM_ID"]
    
    #start logging the players
    game_info_dict[game_id]["home"]["players"] = dict()
    game_info_dict[game_id]["away"]["players"] = dict()
    for row in relevant_boxscores.itertuples(): #we are essentially iterating through the players here 
        sec_played = str_to_mins(row.MIN) #seconds played that match
        start_pos = row.START_POSITION #psoition they started in (nan if not a starter)
        playername = row.PLAYER_NAME #name of the player
        
        if sec_played > 0 and playername in playernames_set: #played in the games, is a relevant player
            loc_tag = "home" if row.TEAM_ID == home_team_id else "away" #quick way t osee if we are home or away
            game_info_dict[game_id][loc_tag]["players"][playername] = dict() #ety dictionary to store stats
            game_info_dict[game_id][loc_tag]["players"][playername] = {"sec_played":sec_played, "start_pos":start_pos}
            try:
                game_info_dict[game_id][loc_tag]["players"][playername]["stats"] = player_info_dict[playername][current_season - 1]
            except KeyError:
                lst_prev_seasons = [i for i in player_info_dict[playername]["seasons_available"] if i < current_season]
                if lst_prev_seasons:
                    closest_season = max(lst_prev_seasons)
                    game_info_dict[game_id][loc_tag]["players"][playername]["stats"] = player_info_dict[playername][closest_season]
                else:
                    try:
                        game_info_dict[game_id][loc_tag]["players"][playername]["stats"] = dict()
                        game_info_dict[game_id][loc_tag]["players"][playername]["stats"]["AGE"] = player_info_dict[playername][current_season]["AGE"]
                        game_info_dict[game_id][loc_tag]["players"][playername]["stats"]["PICK"] = player_info_dict[playername][current_season]["PICK"]
                        game_info_dict[game_id][loc_tag]["players"][playername]["stats"]["UNDRAFTED"] = player_info_dict[playername][current_season]["UNDRAFTED"]
                        game_info_dict[game_id][loc_tag]["players"][playername]["stats"]["ROOKIE"] = player_info_dict[playername][current_season]["ROOKIE"]
                    except:
                        print(playername, current_season)

Jamel Thomas 1999
Jamel Thomas 1999
Jamel Thomas 1999
Jamel Thomas 1999
Jamel Thomas 1999
Jamel Thomas 1999
Jamel Thomas 1999


In [34]:
base_columns = ["shooting", "turnovers", "OREB%", "DREB%", "free_throws"]
real_columns = []
for colname in base_columns:
    real_columns.append("hm_" + colname)
    real_columns.append("aw_" + colname)
real_columns = sorted(real_columns)
real_columns.append("result")
unencoded_df = pd.DataFrame(columns = real_columns, index = range(len(game_info_dict.keys())))

In [35]:
unencoded_df

Unnamed: 0,aw_DREB%,aw_OREB%,aw_free_throws,aw_shooting,aw_turnovers,hm_DREB%,hm_OREB%,hm_free_throws,hm_shooting,hm_turnovers,result
0,,,,,,,,,,,
1,,,,,,,,,,,
2,,,,,,,,,,,
3,,,,,,,,,,,
4,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
24875,,,,,,,,,,,
24876,,,,,,,,,,,
24877,,,,,,,,,,,
24878,,,,,,,,,,,


In [36]:
bad_data = []
for index, (game_id, obj) in enumerate(game_info_dict.items()):
    winner_id = obj["winner"]
    home_dict = obj["home"]
    away_dict = obj["away"]
    unencoded_df.at[index, "result"] = 1 if winner_id == home_dict["team_id"] else 0
    reb_dict = {"hm_":{"OREB":None, "DREB":None}, "aw_":{"OREB":None, "DREB":None}}
    for dic in [home_dict, away_dict]:
        loc_prefix = "hm_" if dic == home_dict else "aw_"
        fg = 0
        threep = 0
        fga = 0
        tov = 0
        oreb = 0
        dreb = 0
        fta = 0   
        for player_name, player_obj in dic["players"].items():
            if player_obj["start_pos"] in {"F", "C", "G"}:
                try: #this is an individual player loop.... if a single player can't be accessed, its ok to skip him
                    fg += player_obj["stats"]["FGM"]
                    threep += player_obj["stats"]["FG3M"]
                    fga += player_obj["stats"]["FGA"]
                    tov += player_obj["stats"]["TOV"]
                    oreb += player_obj["stats"]["OREB"]
                    dreb += player_obj["stats"]["DREB"]
                    fta += player_obj["stats"]["FG3M"]
                except:
                    continue
        try: #this is at a team level... if we fail here, we should not have this trianing example
            unencoded_df.at[index, loc_prefix + "shooting"] = (fg+0.5*threep)/fga
            unencoded_df.at[index, loc_prefix + "turnovers"] = tov
            unencoded_df.at[index, loc_prefix + "free_throws"] = fta/fga
            reb_dict[loc_prefix]["OREB"] = oreb
            reb_dict[loc_prefix]["DREB"] = dreb
        except:
            bad_data.append(index)
            continue #skip this training example if tehres too many
    try: #this is at a team level... if we fail here, we should not have this trianing example
        unencoded_df.at[index, "hm_OREB%"] = reb_dict["hm_"]["OREB"] / (reb_dict["hm_"]["OREB"] + reb_dict["aw_"]["OREB"])
        unencoded_df.at[index, "hm_DREB%"] = reb_dict["aw_"]["OREB"] / (reb_dict["hm_"]["OREB"] + reb_dict["aw_"]["OREB"])
        unencoded_df.at[index, "aw_OREB%"] = reb_dict["hm_"]["DREB"] / (reb_dict["hm_"]["DREB"] + reb_dict["aw_"]["DREB"])
        unencoded_df.at[index, "aw_DREB%"] = reb_dict["aw_"]["DREB"] / (reb_dict["hm_"]["DREB"] + reb_dict["aw_"]["DREB"])
    except:
        bad_data.append(index)
        continue #skip this training example if tehres too many
unencoded_df = unencoded_df.drop(unencoded_df.index[bad_data])

In [37]:
unencoded_df

Unnamed: 0,aw_DREB%,aw_OREB%,aw_free_throws,aw_shooting,aw_turnovers,hm_DREB%,hm_OREB%,hm_free_throws,hm_shooting,hm_turnovers,result
0,0.445561,0.554439,0.0614849,0.471578,0.139,0.513944,0.486056,0.0742407,0.519685,0.142,1
1,0.441971,0.558029,0.0188679,0.481132,0.137,0.524715,0.475285,0.0503919,0.5,0.178,1
2,0.596386,0.403614,0.0417559,0.465203,0.145,0.605691,0.394309,0.0491368,0.449535,0.113,0
3,0.444089,0.555911,0.0650069,0.469571,0.141,0.530686,0.469314,0.0503145,0.52327,0.166,1
4,0.50078,0.49922,0.0711354,0.493844,0.133,0.43295,0.56705,0.0606061,0.5338,0.161,1
...,...,...,...,...,...,...,...,...,...,...,...
24875,0.538206,0.461794,0.125763,0.552503,0.117,0.479638,0.520362,0.0820734,0.550756,0.091,0
24876,0.53578,0.46422,0.075188,0.518797,0.154,0.701422,0.298578,0.123288,0.530822,0.11,0
24877,0.43299,0.56701,0.112654,0.510031,0.094,0.46114,0.53886,0.136082,0.537113,0.158,1
24878,0.52349,0.47651,0.123724,0.51977,0.117,0.617486,0.382514,0.109375,0.4875,0.107,0


In [39]:
unencoded_df.to_csv("raw_training_dataset_four_factors_per_possession.csv",index=False)

In [None]:
d = pd.DataFrame(index=np.arange(30))
i = 0
for season in games["SEASON_ID"].unique():
    season_str = str(season)[1:]
    relevant_games = games[games['SEASON_ID'] == season]
    tup_teams = set()
    for row in relevant_games.itertuples():
        tup = (row.TEAM_NAME, row.TEAM_ID, row.TEAM_ABBREVIATION)
        tup_teams.add(tup)
    d.insert(i, season_str, pd.Series(list(tup_teams)), True)
    i += 1

In [1]:
def 
(season_start=1997, season_end = 2018, per_mode = "PerPossession", verbose = False, saveas = None):
    lst = []
    time_taken = [10]
    start = timeit.default_timer()
    for season in range(season_start, season_end + 1):
        time.sleep(0.36)

        successful = False
        while not successful:
            try:
                get_start = timeit.default_timer()
                games = leaguedashplayerstats.LeagueDashPlayerStats(per_mode_detailed = per_mode, season = f"{season}-{str(season+1)[-2:]}", timeout=round(2*sum(time_taken)/len(time_taken), 0))
                get_end = timeit.default_timer()
                time_taken.append(get_end-get_start)
                successful = True
            except Exception as e:
                if verbose:
                    print(f"Timeout error on season {season}-{str(season+1)[-2:]}, taking a 3 second break before resuming")
                time.sleep(3)

        gamelog = games.get_data_frames()[0]
        gamelog["SEASON"] = season
        lst.append(gamelog)
    season_infos = pd.concat(lst, axis = 0, ignore_index = True)
    if saveas:
        season_infos.to_csv(f"{saveas}.csv",index=False)
    return season_infos