In [2]:
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import leaguegamelog
from nba_api.stats.endpoints import boxscoreplayertrackv2
from nba_api.stats.endpoints import teamgamelog
from nba_api.stats.endpoints import playergamelogs
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.static import teams
from json import JSONDecodeError
import pandas as pd
import numpy as np
import random
import requests
import math
import time

In [3]:
def str_to_mins(inp):
    """Convert a box-score playing-time string into a total number of seconds.

    Despite the function name, the returned value is in seconds.

    Parameters
    ----------
    inp : str
        Playing time written as "MM:SS". A fractional minutes field such as
        "12.000000:34" is also accepted. Anything that is not a non-empty
        string (None, NaN, "") is treated as no playing time.

    Returns
    -------
    int
        minutes * 60 + seconds, or 0 for a missing value.
    """
    # Missing box-score entries arrive as NaN/None rather than a string.
    if not isinstance(inp, str) or not inp.strip():
        return 0
    lst = inp.strip().split(":")
    # int(float(...)) tolerates a decimal minutes field while still
    # truncating to whole minutes/seconds.
    return int(float(lst[0])) * 60 + int(float(lst[1]))

In [10]:
# Load the four CSV exports this notebook works from.
games = pd.read_csv("../common-datasets/all_gamelogs.csv")
boxscores = pd.read_csv("../common-datasets/all_boxscores_concatenated.csv")
season_stats = pd.read_csv("../common-datasets/player-seasons_per_possession.csv")
playerinfo = pd.read_csv("../common-datasets/commonplayerinfo.csv")

# Keep only box-score rows that carry a START_POSITION, then collect the
# distinct player names found in those rows.
starters_boxscores = boxscores[boxscores["START_POSITION"].notna()]
playernames = starters_boxscores["PLAYER_NAME"].unique().tolist()

In [5]:
# Set form of the starter names, used for membership checks later on.
playernames_set = set(playernames)
# Distinct GAME_ID values among game-log rows whose SEASON is greater than 2002.
list_all_game_ids = games.loc[games["SEASON"] > 2002, "GAME_ID"].unique()

In [6]:
# Show the column names of the game-log table.
games.columns

Index(['Team_ID', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'W', 'L', 'W_PCT',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'SEASON', 'TEAM_ID'],
      dtype='object')

In [7]:
# Show the column names of the box-score table.
boxscores.columns

Index(['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'PLAYER_ID',
       'PLAYER_NAME', 'START_POSITION', 'COMMENT', 'MIN', 'SPD', 'DIST',
       'ORBC', 'DRBC', 'RBC', 'TCHS', 'SAST', 'FTAST', 'PASS', 'AST', 'CFGM',
       'CFGA', 'CFG_PCT', 'UFGM', 'UFGA', 'UFG_PCT', 'FG_PCT', 'DFGM', 'DFGA',
       'DFG_PCT'],
      dtype='object')

In [8]:
# Show the column names of the player season-stats table.
season_stats.columns

Index(['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'AGE', 'GP',
       'W', 'L', 'W_PCT', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'TOV',
       'STL', 'BLK', 'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS',
       'NBA_FANTASY_PTS', 'DD2', 'TD3', 'GP_RANK', 'W_RANK', 'L_RANK',
       'W_PCT_RANK', 'MIN_RANK', 'FGM_RANK', 'FGA_RANK', 'FG_PCT_RANK',
       'FG3M_RANK', 'FG3A_RANK', 'FG3_PCT_RANK', 'FTM_RANK', 'FTA_RANK',
       'FT_PCT_RANK', 'OREB_RANK', 'DREB_RANK', 'REB_RANK', 'AST_RANK',
       'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK', 'PFD_RANK',
       'PTS_RANK', 'PLUS_MINUS_RANK', 'NBA_FANTASY_PTS_RANK', 'DD2_RANK',
       'TD3_RANK', 'CFID', 'CFPARAMS', 'SEASON'],
      dtype='object')

In [11]:
# Show the column names of the common player-info table.
playerinfo.columns

Index(['PERSON_ID', 'FIRST_NAME', 'LAST_NAME', 'DISPLAY_FIRST_LAST',
       'DISPLAY_LAST_COMMA_FIRST', 'DISPLAY_FI_LAST', 'PLAYER_SLUG',
       'BIRTHDATE', 'SCHOOL', 'COUNTRY', 'LAST_AFFILIATION', 'HEIGHT',
       'WEIGHT', 'SEASON_EXP', 'JERSEY', 'POSITION', 'ROSTERSTATUS', 'TEAM_ID',
       'TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CODE', 'TEAM_CITY',
       'PLAYERCODE', 'FROM_YEAR', 'TO_YEAR', 'DLEAGUE_FLAG', 'NBA_FLAG',
       'GAMES_PLAYED_FLAG', 'DRAFT_YEAR', 'DRAFT_ROUND', 'DRAFT_NUMBER'],
      dtype='object')

In [14]:
# Build a per-player lookup keyed by player name. Each entry holds:
#   "seasons_available" -> set of SEASON values that have a season_stats row
#   "first_season"      -> FROM_YEAR taken from playerinfo
#   "pick"              -> draft slot as an int (1-60) or the string "Undrafted"
#   <SEASON value>      -> dict with that season's stat line plus AGE and HEIGHT (inches)
# NOTE(review): players are matched by display name only, so two players who
# share a name would be merged into one entry -- confirm this cannot happen.
player_info_dict = {}
for playername in playernames_set:
    season_infos = season_stats[season_stats["PLAYER_NAME"] == playername]
    # first, store the ones that don't change from season to season
    common_info = playerinfo[playerinfo["DISPLAY_FIRST_LAST"] == playername].reset_index()

    # HEIGHT is expected as "feet-inches"; a missing or unparseable value
    # falls back to 6-6. Computed once here because it is the same for every season.
    try:
        ht_lst = common_info.at[0, "HEIGHT"].split("-")
        height_inches = int(ht_lst[0]) * 12 + int(ht_lst[1])
    except (AttributeError, ValueError, IndexError):
        height_inches = 6 * 12 + 6

    # Non-numeric or missing draft numbers, and slots beyond 60, count as undrafted.
    try:
        pick = int(common_info.at[0, "DRAFT_NUMBER"])
        if pick > 60:
            pick = "Undrafted"
    except (TypeError, ValueError):
        pick = "Undrafted"

    player_entry = {
        "seasons_available": set(),
        "first_season": int(common_info.at[0, "FROM_YEAR"]),
        "pick": pick,
    }

    for row in season_infos.itertuples():
        player_entry["seasons_available"].add(row.SEASON)
        player_entry[row.SEASON] = {
            "FG3M": row.FG3M,
            "FG3A": row.FG3A,
            # two-point makes/attempts are the totals minus the three-point share
            "FG2M": row.FGM - row.FG3M,
            "FG2A": row.FGA - row.FG3A,
            "OREB": row.OREB,
            "DREB": row.DREB,
            "AST": row.AST,
            "TOV": row.TOV,
            "BLK": row.BLK,
            "STL": row.STL,
            "BLKA": row.BLKA,
            "PF": row.PF,
            "PFD": row.PFD,
            "AGE": row.AGE,
            "HEIGHT": height_inches,
        }

    player_info_dict[playername] = player_entry
        

In [15]:
# For every draft slot (1-60 plus "Undrafted"), average the first-season stat
# line of all players taken at that slot whose first season has a stats entry.
# Each slot maps to {"num_pl": number of players averaged, "stats": {stat: mean}}.
# A slot with no qualifying players keeps an empty "stats" dict.
draft_pick_dict = {k: dict() for k in range(1, 61)}
draft_pick_dict["Undrafted"] = dict()
for draft_pick, slot in draft_pick_dict.items():
    num_pl = 0
    totals = dict()
    for info in player_info_dict.values():
        if info["pick"] == draft_pick and info["first_season"] in info["seasons_available"]:
            num_pl += 1
            for k, v in info[info["first_season"]].items():
                # explicit accumulate-or-initialise; a real error (e.g. a
                # non-numeric value) now surfaces instead of being swallowed
                totals[k] = totals.get(k, 0) + v
    slot["num_pl"] = num_pl
    # totals is empty when num_pl is 0, so no division by zero can occur
    slot["stats"] = {k: total / num_pl for k, total in totals.items()}

In [19]:
"""this will generate a dictionary that maps game ids to a lot of information about the game. From game id, it maps to
home and away, which maps to all the different (relevant (relevant means played 3 minutes in a game at some point in their
career)) players, which then maps to the second they played in that game + their stats from the previous season
"""
# game_info_dict layout:
#   GAME_ID -> {"winner": team id,
#               "home"/"away": {"team_id", "W_PCT", "players"}}
#   "players" maps a player name to {"sec_played", "start_pos", "stats"}.
# Only players in playernames_set who logged time in the game are recorded.
# "stats" is the player's previous-season stat line, else the most recent
# earlier season, else the rookie average for the player's draft slot.
#
# NOTE(review): W_PCT is copied from this game's own log row. If that column is
# the cumulative record *including* this game (as in the nba_api team game
# log), it leaks the result into the features -- confirm, and switch to the
# pre-game record if so.

# Group both frames once so each game's rows can be fetched directly instead
# of re-scanning the full tables on every iteration.
games_grouped = games.groupby("GAME_ID")
boxscores_grouped = boxscores.groupby("GAME_ID")
games_with_boxscores = boxscores_grouped.groups

game_info_dict = {k: {"home": dict(), "away": dict()} for k in list_all_game_ids}
for game_id in list_all_game_ids:
    # reset_index returns a new frame, so no slice of `games` is modified in place
    relevant_games = games_grouped.get_group(game_id).reset_index(drop=True)
    if game_id in games_with_boxscores:
        relevant_boxscores = boxscores_grouped.get_group(game_id)
    else:
        # no box-score rows for this game: it ends up with no players
        relevant_boxscores = boxscores.iloc[0:0]
    current_season = relevant_games.loc[0, "SEASON"]

    # an "@" in the first row's MATCHUP means that row belongs to the away team
    if "@" in relevant_games.loc[0, "MATCHUP"]:
        away_team_row = 0
        home_team_row = 1
    else:
        away_team_row = 1
        home_team_row = 0

    away_team_id = relevant_games.loc[away_team_row, "TEAM_ID"]
    home_team_id = relevant_games.loc[home_team_row, "TEAM_ID"]

    game_info_dict[game_id]["winner"] = relevant_games.loc[0, "TEAM_ID"] if relevant_games.loc[0, "WL"] == "W" else relevant_games.loc[1, "TEAM_ID"]
    game_info_dict[game_id]["away"]["team_id"] = away_team_id
    game_info_dict[game_id]["home"]["team_id"] = home_team_id

    # log the win percentages
    game_info_dict[game_id]["away"]["W_PCT"] = relevant_games.loc[away_team_row, "W_PCT"]
    game_info_dict[game_id]["home"]["W_PCT"] = relevant_games.loc[home_team_row, "W_PCT"]

    # start logging the players
    game_info_dict[game_id]["home"]["players"] = dict()
    game_info_dict[game_id]["away"]["players"] = dict()
    for row in relevant_boxscores.itertuples():  # one row per player in this game
        sec_played = str_to_mins(row.MIN)  # seconds played in this game
        start_pos = row.START_POSITION  # starting position (NaN if not a starter)
        playername = row.PLAYER_NAME

        if sec_played > 0 and playername in playernames_set:  # played, and is a tracked player
            loc_tag = "home" if row.TEAM_ID == home_team_id else "away"
            player_entry = {"sec_played": sec_played, "start_pos": start_pos}
            game_info_dict[game_id][loc_tag]["players"][playername] = player_entry
            try:
                player_entry["stats"] = player_info_dict[playername][current_season - 1]
            except KeyError:
                # no entry for the previous season: use the latest earlier one
                lst_prev_seasons = [i for i in player_info_dict[playername]["seasons_available"] if i < current_season]
                if lst_prev_seasons:
                    closest_season = max(lst_prev_seasons)
                    player_entry["stats"] = player_info_dict[playername][closest_season]
                else:
                    # no earlier season at all: use the draft-slot rookie average
                    try:
                        player_entry["stats"] = draft_pick_dict[player_info_dict[playername]["pick"]]["stats"]
                    except Exception as e:
                        # best effort: report and leave this player without "stats"
                        print(playername, current_season, e)

In [23]:
# Stat names are read from the first recorded player that carries a non-empty
# stat line, rather than from one hard-coded game id and player name.
base_columns = next(
    list(player["stats"].keys())
    for game in game_info_dict.values()
    for side in ("home", "away")
    for player in game[side]["players"].values()
    if player.get("stats")
)

# One feature column per (side, starting slot, stat): two guards, two forwards
# and one center for each of the home ("hm") and away ("aw") teams.
slot_prefixes = [
    "hm_g_1_", "hm_g_2_", "hm_f_1_", "hm_f_2_", "hm_c_",
    "aw_g_1_", "aw_g_2_", "aw_f_1_", "aw_f_2_", "aw_c_",
]
real_columns = sorted(
    prefix + colname for colname in base_columns for prefix in slot_prefixes
)
# Team win percentages and the label go last, after the sorted stat columns.
real_columns += ["aw_W_PCT", "hm_W_PCT", "result"]

# Empty (all-NaN) frame with one row per game, filled in by a later cell.
unencoded_df = pd.DataFrame(columns=real_columns, index=range(len(game_info_dict)))

In [24]:
# Display the freshly created feature table (every cell is still NaN here).
unencoded_df

Unnamed: 0,aw_c_AGE,aw_c_AST,aw_c_BLK,aw_c_BLKA,aw_c_DREB,aw_c_FG2A,aw_c_FG2M,aw_c_FG3A,aw_c_FG3M,aw_c_HEIGHT,...,hm_g_2_FG3M,hm_g_2_HEIGHT,hm_g_2_OREB,hm_g_2_PF,hm_g_2_PFD,hm_g_2_STL,hm_g_2_TOV,aw_W_PCT,hm_W_PCT,result
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19394,,,,,,,,,,,...,,,,,,,,,,
19395,,,,,,,,,,,...,,,,,,,,,,
19396,,,,,,,,,,,...,,,,,,,,,,
19397,,,,,,,,,,,...,,,,,,,,,,


In [25]:
# Fill one row of unencoded_df per game: the label ("result" is 1 when the home
# team won, else 0), both teams' W_PCT, and the stat line of every starter.
# The two guards and the two forwards of a team are assigned to slots 1/2 at random.

# Dedicated, seeded generator so the slot assignment is reproducible across runs.
slot_rng = random.Random(0)

for index, (game_id, obj) in enumerate(game_info_dict.items()):
    winner_id = obj["winner"]
    home_dict = obj["home"]
    away_dict = obj["away"]
    unencoded_df.at[index, "result"] = 1 if winner_id == home_dict["team_id"] else 0
    # pair each side with its prefix directly instead of comparing dict contents
    for loc_prefix, dic in (("hm_", home_dict), ("aw_", away_dict)):
        unencoded_df.at[index, loc_prefix + "W_PCT"] = dic["W_PCT"]
        first_or_second_g = {1, 2}
        first_or_second_f = {1, 2}

        for player_name, player_obj in dic["players"].items():
            # non-starters have a missing start_pos and are skipped
            if player_obj["start_pos"] in {"F", "C", "G"}:
                pos_modifier = player_obj["start_pos"].lower() + "_"
                if pos_modifier == 'g_':
                    # random.sample() no longer accepts a set (TypeError on
                    # Python 3.11+), so draw from a sorted list instead
                    nm = slot_rng.choice(sorted(first_or_second_g))
                    first_or_second_g = first_or_second_g - {nm}
                    pos_modifier += str(nm) + "_"
                elif pos_modifier == 'f_':
                    nm = slot_rng.choice(sorted(first_or_second_f))
                    first_or_second_f = first_or_second_f - {nm}
                    pos_modifier += str(nm) + "_"
                prefix = loc_prefix + pos_modifier
                for stat in base_columns:
                    col_to_be_changed = prefix + stat
                    if stat in player_obj["stats"].keys():
                        unencoded_df.at[index, col_to_be_changed] = player_obj["stats"][stat]
                    else:
                        # stat missing from this player's line: record 0
                        unencoded_df.at[index, col_to_be_changed] = 0

In [26]:
# Display the feature table after it has been filled in.
unencoded_df

Unnamed: 0,aw_c_AGE,aw_c_AST,aw_c_BLK,aw_c_BLKA,aw_c_DREB,aw_c_FG2A,aw_c_FG2M,aw_c_FG3A,aw_c_FG3M,aw_c_HEIGHT,...,hm_g_2_FG3M,hm_g_2_HEIGHT,hm_g_2_OREB,hm_g_2_PF,hm_g_2_PFD,hm_g_2_STL,hm_g_2_TOV,aw_W_PCT,hm_W_PCT,result
0,31,0.041,0.032,0.01,0.097,0.243,0.139,0,0,85,...,0.018,70,0.016,0.027,0,0.016,0.033,0.683,0.5,0
1,28,0.01,0.058,0.007,0.089,0.097,0.052,0,0,82,...,0.009,76,0.013,0.03,0,0.022,0.031,0.444,0.679,1
2,31,0.041,0.032,0.01,0.097,0.243,0.139,0,0,85,...,0.015,74,0.009,0.025,0,0.019,0.034,0.675,0.688,1
3,24,0.012,0.016,0.024,0.072,0.127,0.057,0,0,86,...,0.019,78,0.016,0.033,0,0.027,0.043,0.633,0.684,1
4,30,0.015,0.054,0.008,0.093,0.122,0.056,0,0,82,...,0.009,76,0.013,0.03,0,0.022,0.031,0.519,0.679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19394,20.9545,0.0380909,0.0109545,0.0121364,0.059,0.128591,0.0583636,0.0427727,0.0148182,79.3182,...,0.035,78,0.008,0.02,0.015,0.013,0.013,0.217,0.632,1
19395,32,0.031,0.018,0.009,0.079,0.253,0.134,0.019,0.005,83,...,0.031,78,0.007,0.04,0.048,0.018,0.031,0.5,0.238,0
19396,24,0.044,0.024,0.011,0.159,0.163,0.087,0.002,0,82,...,0.029,73,0.015,0.036,0.021,0.021,0.03,1,0,0
19397,24,0.044,0.024,0.011,0.159,0.163,0.087,0.002,0,82,...,0.029,74,0.004,0.035,0.021,0.009,0.012,0.483,0.54,1


In [27]:
# Names of the columns that contain at least one missing value.
null_columns = unencoded_df.columns[unencoded_df.isnull().any()]

In [None]:
# Debugging aid: request the player-tracking box score for game 29800231 from
# the stats API (15 s timeout) and display its first result frame.
bx = boxscoreplayertrackv2.BoxScorePlayerTrackV2(29800231, timeout=15)
df = bx.get_data_frames()[0]
df

In [113]:
# Total number of missing cells in the feature table.
print(unencoded_df.isnull().sum().sum())
# Rows with at least one missing value, restricted to the columns that have any.
wack = unencoded_df[unencoded_df.isnull().any(axis=1)][null_columns]
wack.head()

1170


Unnamed: 0,aw_c_AGE,aw_c_AST,aw_c_BLK,aw_c_BLKA,aw_c_DREB,aw_c_FG2A,aw_c_FG2M,aw_c_FG3A,aw_c_FG3M,aw_c_HEIGHT,...,hm_g_2_FG3M,hm_g_2_HEIGHT,hm_g_2_OREB,hm_g_2_PF,hm_g_2_PFD,hm_g_2_PICK,hm_g_2_ROOKIE,hm_g_2_STL,hm_g_2_TOV,hm_g_2_UNDRAFTED
215,32,2.0,0.7,0.4,4.0,10.7,5.6,0.1,0,78,...,1.6,76,0.9,2.4,0,2,0,2.3,2.8,0
228,22,0.8,1.3,0.8,6.2,7.9,3.5,0.0,0,83,...,1.4,78,0.7,2.7,0,61,0,0.7,1.3,1
240,31,1.1,1.6,0.6,5.1,6.9,3.0,0.0,0,83,...,1.6,76,0.9,2.4,0,2,0,2.3,2.8,0
503,35,1.0,0.5,0.7,5.5,12.8,6.6,0.1,0,84,...,1.4,78,1.7,2.0,0,8,0,1.7,1.4,0
543,31,1.1,1.6,0.6,5.1,6.9,3.0,0.0,0,83,...,0.0,71,0.4,1.9,0,61,0,1.1,2.2,1


In [114]:
# Take the second incomplete row and report the first column in which it is null.
x = list(wack[1:2].values)[0]
# x is laid out in wack's column order (the null_columns subset), so the
# position must be looked up in wack.columns, not in unencoded_df.columns.
wack.columns[np.argwhere(pd.isnull(x))[0][0]]

'aw_g_2_AGE'

In [115]:
# Game id stored at position 228 of list_all_game_ids.
list_all_game_ids[228]

29800231

In [28]:
# Drop every row that still has a missing value, then confirm that no missing
# cells remain. Note that this rebinds unencoded_df to the filtered frame.
unencoded_df = unencoded_df.dropna(axis=0)
unencoded_df.isnull().sum().sum()

0

In [29]:
# Write the cleaned feature table to disk without the row index.
unencoded_df.to_csv("training_dataset_f_plus_naive.csv",index=False)

In [None]:
# Build a table with one column per season; each column lists the
# (TEAM_NAME, TEAM_ID, TEAM_ABBREVIATION) tuples seen in that season's rows.
# NOTE(review): SEASON_ID, TEAM_NAME and TEAM_ABBREVIATION do not appear in the
# games.columns output shown earlier in this notebook -- this cell presumably
# targets a different game-log export; confirm before running.
d = pd.DataFrame(index=np.arange(30))
i = 0
for season in games["SEASON_ID"].unique():
    # the column label is the season id with its first character removed
    season_str = str(season)[1:]
    relevant_games = games[games['SEASON_ID'] == season]
    tup_teams = set()
    for row in relevant_games.itertuples():
        tup = (row.TEAM_NAME, row.TEAM_ID, row.TEAM_ABBREVIATION)
        tup_teams.add(tup)
    # The frame has 30 index rows and the Series is aligned on that index.
    # The tuples come from a set, so their order within a column is arbitrary.
    # The trailing True is insert()'s allow_duplicates argument.
    d.insert(i, season_str, pd.Series(list(tup_teams)), True)
    i += 1