In [None]:
# Scratch cell: a trivial expression, presumably left in to confirm the kernel responds.
1+1

In [None]:
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import leaguegamelog
from nba_api.stats.endpoints import boxscoreplayertrackv2
from nba_api.stats.endpoints import teamgamelog
from nba_api.stats.endpoints import playergamelogs
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.static import teams
from json import JSONDecodeError
import pandas as pd
import numpy as np
import random
import requests
import math
import time

In [None]:
def str_to_mins(inp):
    """Convert a playing-time string into a whole number of seconds.

    Despite the name, the returned value is in seconds, not minutes.

    Accepted forms:
      * "MM:SS"            -> MM * 60 + SS
      * "MM.000000:SS"     -> fractional text in either part is truncated
      * "MM" (no colon)    -> MM * 60

    Anything after a second colon is ignored.

    Raises:
        ValueError: if a part is not numeric (including "nan").
        AttributeError: if ``inp`` is not a string.
    """
    parts = inp.strip().split(":")
    # Going through float() lets "35.000000" parse; int() then truncates it.
    minutes = int(float(parts[0]))
    seconds = int(float(parts[1])) if len(parts) > 1 else 0
    return minutes * 60 + seconds

In [None]:
# Read every CSV the notebook works from. Paths are relative to the notebook's
# working directory; names on the left pair up, in order, with the paths below.
(
    games,
    boxscores,
    season_stats_per_poss,
    season_stats_totals,
    season_stats_advanced_totals,
    playerinfo,
) = map(
    pd.read_csv,
    (
        "../common-datasets/all_gamelogs.csv",                    # games
        "../common-datasets/all_boxscores_concatenated.csv",      # boxscores
        "../common-datasets/player-seasons_per_possession.csv",   # season_stats_per_poss
        "../common-datasets/player-seasons_totals.csv",           # season_stats_totals
        "../common-datasets/player-seasons-advanced_totals.csv",  # season_stats_advanced_totals
        "../common-datasets/commonplayerinfo.csv",                # playerinfo
    ),
)

In [None]:
# Keep only box-score rows that have a START_POSITION; these are treated as starters.
starters_boxscores = boxscores[boxscores["START_POSITION"].notna()]
# Distinct player ids appearing among those rows.
starters_ids = list(pd.unique(starters_boxscores["PLAYER_ID"]))
# Distinct game ids for every season after 2002.
list_all_game_ids = games.loc[games["SEASON"] > 2002, "GAME_ID"].unique()

In [None]:
# PLAYER_ID -> PLAYER_NAME; when an id repeats, the last row's name wins.
starters_dict = dict(
    zip(starters_boxscores["PLAYER_ID"], starters_boxscores["PLAYER_NAME"])
)

In [None]:
"games", games.columns, "boxscores", boxscores.columns, "season_stats_per_poss", season_stats_per_poss.columns, "season_stats_totals", season_stats_totals.columns, "season_stats_advanced_totals", season_stats_advanced_totals.columns, "playerinfo", playerinfo.columns

In [None]:
# Height in inches assumed when a player's listed height cannot be read (6'6").
DEFAULT_HEIGHT_INCHES = 6 * 12 + 6
# Draft slots above this number are recorded the same way as going undrafted.
LAST_DRAFT_PICK = 60


def _rows_by_season(frame, id_column, player_id):
    """Return one player's rows from ``frame``, with the index relabelled to each row's SEASON."""
    rows = frame[frame[id_column] == player_id].reset_index(drop=True)
    rows.index = rows["SEASON"].tolist()
    return rows


def _parse_draft_pick(advanced):
    """Return the first DRAFT_NUMBER in ``advanced`` as an int, or "Undrafted".

    "Undrafted" is returned when there are no rows, when the value is not an
    integer-like number (text, NaN, None), or when it exceeds LAST_DRAFT_PICK.
    Other errors (e.g. a missing column) are deliberately not swallowed.
    """
    try:
        pick = int(advanced["DRAFT_NUMBER"].iloc[0])
    except (IndexError, ValueError, TypeError):
        return "Undrafted"
    return pick if pick <= LAST_DRAFT_PICK else "Undrafted"


def _build_player_season(per_poss, totals, advanced, season, height):
    """Collect one player-season's stats into a flat dict.

    All three frames must be indexed by season (see ``_rows_by_season``); a
    season missing from any of them raises KeyError. Keys ending in ``_PP``
    come from ``per_poss``, keys ending in ``_TOT`` (plus the percentages and
    win/loss fields) from ``totals``, and the rating/percentage block at the
    end from ``advanced``.
    """
    def pp(col):
        return per_poss.at[season, col]

    def tot(col):
        return totals.at[season, col]

    def adv(col):
        return advanced.at[season, col]

    # Two-point makes/attempts are not stored directly: all field goals minus threes.
    fg2m_tot = tot("FGM") - tot("FG3M")
    fg2a_tot = tot("FGA") - tot("FG3A")

    return {
        # stats from per_poss
        "FG3M_PP": pp("FG3M"),
        "FG3A_PP": pp("FG3A"),
        "FTM_PP": pp("FTM"),
        "FTA_PP": pp("FTA"),
        "FG2M_PP": pp("FGM") - pp("FG3M"),
        "FG2A_PP": pp("FGA") - pp("FG3A"),
        "OREB_PP": pp("OREB"),
        "DREB_PP": pp("DREB"),
        "AST_PP": pp("AST"),
        "TOV_PP": pp("TOV"),
        "STL_PP": pp("STL"),
        "BLK_PP": pp("BLK"),
        "PF_PP": pp("PF"),
        "PFD_PP": pp("PFD"),
        "AGE": pp("AGE"),
        # stats from totals
        "MIN_TOT": tot("MIN"),
        "FG3M_TOT": tot("FG3M"),
        "FG3A_TOT": tot("FG3A"),
        "FG3_PCT": tot("FG3_PCT"),
        # max(..., 1) guards the division when there were no attempts
        "FG3_FREQ": tot("FG3A") / max(tot("FGA"), 1),
        "FTM_TOT": tot("FTM"),
        "FTA_TOT": tot("FTA"),
        "FT_PCT": tot("FT_PCT"),
        "FG2M_TOT": fg2m_tot,
        "FG2A_TOT": fg2a_tot,
        "FG2_PCT": fg2m_tot / max(fg2a_tot, 1),
        "OREB_TOT": tot("OREB"),
        "DREB_TOT": tot("DREB"),
        "AST_TOT": tot("AST"),
        "TOV_TOT": tot("TOV"),
        "STL_TOT": tot("STL"),
        "BLK_TOT": tot("BLK"),
        "PF_TOT": tot("PF"),
        "PFD_TOT": tot("PFD"),
        "W": tot("W"),
        "L": tot("L"),
        "W_PCT": tot("W_PCT"),
        "PLUS_MINUS": tot("PLUS_MINUS"),
        "AST/TOV": tot("AST") / max(tot("TOV"), 1),
        # stats from advanced
        "NET_RATING": adv("NET_RATING"),
        "OREB_PCT": adv("OREB_PCT"),
        "DREB_PCT": adv("DREB_PCT"),
        "USG_PCT": adv("USG_PCT"),
        "TS_PCT": adv("TS_PCT"),
        "AST_PCT": adv("AST_PCT"),
        # static stats
        "HEIGHT": height,
    }


# player_info_dict maps PLAYER_ID -> {"seasons_avail", "FIRST_SEASON", "PICK",
# "HEIGHT", and one stats dict per available season keyed by the season value}.
player_info_dict = {}
for player_id in starters_dict:  # starters_dict is ID -> NAME
    # Trim the season datasets down to this player, indexed by season.
    per_poss = _rows_by_season(season_stats_per_poss, "PLAYER_ID", player_id)
    totals = _rows_by_season(season_stats_totals, "PLAYER_ID", player_id)
    advanced = _rows_by_season(season_stats_advanced_totals, "PLAYER_ID", player_id)
    pinfo = playerinfo[playerinfo["PERSON_ID"] == player_id].reset_index(drop=True)

    # Best effort: report the player and fall back to a default height.
    try:
        ht = int(advanced["PLAYER_HEIGHT_INCHES"].iloc[0])
    except Exception as e:
        print(player_id, starters_dict[player_id], e)
        ht = DEFAULT_HEIGHT_INCHES

    pick = _parse_draft_pick(advanced)

    player_info = {
        "seasons_avail": sorted(totals["SEASON"]),
        # Raises KeyError if the player has no row in playerinfo.
        "FIRST_SEASON": int(pinfo.at[0, "FROM_YEAR"]),
        "PICK": pick,
        "HEIGHT": ht,
    }
    for season in player_info["seasons_avail"]:
        player_info[season] = _build_player_season(per_poss, totals, advanced, season, ht)

    player_info_dict[player_id] = player_info

In [194]:
# draft_pick_dict maps each draft slot (1-60, plus "Undrafted") to:
#   "num_pl": number of players with that slot whose FIRST_SEASON has stats available
#   "stats":  the mean of those players' first-season stats (empty if num_pl is 0)
draft_pick_dict = {k: {"num_pl": 0, "stats": dict()} for k in range(1, 61)}
draft_pick_dict["Undrafted"] = {"num_pl": 0, "stats": dict()}

# Single pass over the players: add each rookie season into its draft slot's totals.
for pid, d in player_info_dict.items():
    draft_pick = d["PICK"]
    if draft_pick not in draft_pick_dict:
        continue  # pick value outside the tracked slots
    if d["FIRST_SEASON"] not in d["seasons_avail"]:
        continue  # no stats for the player's first season
    slot = draft_pick_dict[draft_pick]
    slot["num_pl"] += 1
    for k, v in d[d["FIRST_SEASON"]].items():
        if k in slot["stats"]:
            slot["stats"][k] += v
        else:
            slot["stats"][k] = v

# Turn the totals into per-player averages. Slots with no players have no stats,
# so there is nothing to divide (and no division by zero).
for slot in draft_pick_dict.values():
    for k in slot["stats"]:
        slot["stats"][k] /= slot["num_pl"]

In [195]:
"""this will generate a dictionary that maps game ids to a lot of information about the game. From game id, it maps to
home and away, which maps to all the different (relevant) players, which then maps to the seconds they played in that game + 
their stats from the previous season
"""
# Group both frames by GAME_ID once, instead of re-scanning them for every game.
games_by_id = games.groupby("GAME_ID", sort=False)
starters_by_game = starters_boxscores.groupby("GAME_ID", sort=False)
starter_game_ids = set(starters_by_game.groups)
no_starters = starters_boxscores.iloc[0:0]  # empty frame for games with no starter rows

game_info_dict = {k: {"home": dict(), "away": dict()} for k in list_all_game_ids}
for game_id in list_all_game_ids:
    relevant_games = games_by_id.get_group(game_id).reset_index(drop=True)
    if game_id in starter_game_ids:
        relevant_boxscores = starters_by_game.get_group(game_id)
    else:
        relevant_boxscores = no_starters
    current_season = relevant_games.loc[0, "SEASON"]

    # Row 0's MATCHUP containing "@" means row 0 is the away team; otherwise row 0 is home.
    # Rows 0 and 1 are both read below, so each game needs two rows in `games`.
    if "@" in relevant_games.loc[0, "MATCHUP"]:
        away_team_row = 0
        home_team_row = 1
    else:
        away_team_row = 1
        home_team_row = 0

    game = game_info_dict[game_id]
    game["winner"] = (
        relevant_games.loc[0, "TEAM_ID"]
        if relevant_games.loc[0, "WL"] == "W"
        else relevant_games.loc[1, "TEAM_ID"]
    )
    away_team_id = relevant_games.loc[away_team_row, "TEAM_ID"]
    home_team_id = relevant_games.loc[home_team_row, "TEAM_ID"]
    game["away"]["team_id"] = away_team_id
    game["home"]["team_id"] = home_team_id

    # Log the win percentages and games played (wins + losses) per side.
    for loc_tag, team_row in (("away", away_team_row), ("home", home_team_row)):
        game[loc_tag]["W_PCT"] = relevant_games.loc[team_row, "W_PCT"]
        game[loc_tag]["GP"] = relevant_games.loc[team_row, "W"] + relevant_games.loc[team_row, "L"]

    # Start logging the players.
    game["home"]["players"] = dict()
    game["away"]["players"] = dict()
    for row in relevant_boxscores.itertuples():  # one row per starter
        pid = row.PLAYER_ID
        loc_tag = "home" if row.TEAM_ID == home_team_id else "away"
        player_info = player_info_dict[pid]

        if current_season in player_info:
            stats = player_info[current_season]
        else:
            # No stats for this season: use the latest earlier season, else the
            # average first-season stats for the player's draft slot.
            lst_prev_seasons = [s for s in player_info["seasons_avail"] if s < current_season]
            if lst_prev_seasons:
                stats = player_info[max(lst_prev_seasons)]
            else:
                try:
                    stats = draft_pick_dict[player_info["PICK"]]["stats"]
                except KeyError as e:
                    print(row.PLAYER_NAME, current_season, e)
                    stats = {}  # keep the "stats" key present for later cells

        game[loc_tag]["players"][pid] = {
            "sec_played": str_to_mins(row.MIN),
            "start_pos": row.START_POSITION,
            "stats": stats,
        }

In [196]:
# Stat names come from the first player that has a non-empty stats dict,
# rather than from one hard-coded game id / player id.
base_columns = next(
    (
        list(player["stats"])
        for game in game_info_dict.values()
        for side in ("home", "away")
        for player in game[side]["players"].values()
        if player.get("stats")
    ),
    None,
)
if base_columns is None:
    raise ValueError("no player with stats found in game_info_dict")

# One column per (side, lineup slot, stat): two guards, two forwards and a centre per side.
lineup_slots = ("g_1_", "g_2_", "f_1_", "f_2_", "c_")
real_columns = sorted(
    side_prefix + slot + colname
    for colname in base_columns
    for side_prefix in ("hm_", "aw_")
    for slot in lineup_slots
)
# Game-level columns go after the sorted player columns.
real_columns += ["W_PCT_DIFF", "GP", "result"]

# One initially-empty row per game; filled in by a later cell.
unencoded_df = pd.DataFrame(columns=real_columns, index=range(len(game_info_dict)))

In [197]:
# Fixed seed so the random guard/forward slot numbering is the same on every run.
SLOT_SHUFFLE_SEED = 0
slot_rng = random.Random(SLOT_SHUFFLE_SEED)

# Fill one row of unencoded_df per game: the game-level columns first, then every
# starter's stats under a "<side>_<position>[_<slot>]_<stat>" column.
for index, (game_id, obj) in enumerate(game_info_dict.items()):
    home_dict = obj["home"]
    away_dict = obj["away"]
    unencoded_df.at[index, "result"] = 1 if obj["winner"] == home_dict["team_id"] else 0
    unencoded_df.at[index, "W_PCT_DIFF"] = home_dict["W_PCT"] - away_dict["W_PCT"]
    unencoded_df.at[index, "GP"] = home_dict["GP"] + away_dict["GP"]

    for loc_prefix, dic in (("hm_", home_dict), ("aw_", away_dict)):
        # Slot numbers 1 and 2 in random order for guards and forwards.
        # random.sample needs a sequence here: set populations are rejected by Python 3.11+.
        open_slots = {
            "g": slot_rng.sample([1, 2], 2),
            "f": slot_rng.sample([1, 2], 2),
        }

        for pid, player_obj in dic["players"].items():
            position = player_obj["start_pos"].lower()
            pos_modifier = position + "_"
            if position in open_slots:
                # A third guard/forward on one side raises IndexError here.
                pos_modifier += str(open_slots[position].pop()) + "_"
            prefix = loc_prefix + pos_modifier

            stats = player_obj.get("stats", {})
            for stat in base_columns:
                # Stats the player lacks are written as 0.
                unencoded_df.at[index, prefix + stat] = stats.get(stat, 0)

In [200]:
# Discard every game row that still has at least one missing value.
unencoded_df = unencoded_df.dropna(axis=0, how="any")

In [201]:
# Display the assembled table (one row per game) as the cell output.
unencoded_df

Unnamed: 0,aw_c_AGE,aw_c_AST/TOV,aw_c_AST_PCT,aw_c_AST_PP,aw_c_AST_TOT,aw_c_BLK_PP,aw_c_BLK_TOT,aw_c_DREB_PCT,aw_c_DREB_PP,aw_c_DREB_TOT,...,hm_g_2_STL_TOT,hm_g_2_TOV_PP,hm_g_2_TOV_TOT,hm_g_2_TS_PCT,hm_g_2_USG_PCT,hm_g_2_W,hm_g_2_W_PCT,W_PCT_DIFF,GP,result
0,32,1.00513,0.14,0.041,196,0.034,166,0.203,0.108,523,...,99,0.031,180,0.508,0.187,41,0.5,-0.183,164,0
1,29,0.809524,0.05,0.016,17,0.042,46,0.197,0.104,113,...,112,0.035,171,0.551,0.281,48,0.738,0.235,162,1
2,32,1.00513,0.14,0.041,196,0.034,166,0.203,0.108,523,...,151,0.028,155,0.544,0.147,55,0.671,0.013,160,1
3,25,0.782609,0.054,0.017,18,0.021,22,0.184,0.087,91,...,112,0.035,171,0.551,0.281,48,0.738,0.051,158,1
4,31,0.591667,0.043,0.014,71,0.062,307,0.152,0.084,419,...,112,0.035,171,0.551,0.281,48,0.738,0.16,155,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19394,20,1.2,0.118,0.034,78,0.025,58,0.198,0.096,220,...,40,0.016,65,0.563,0.154,27,0.429,0.415,42,1
19395,33,1.34722,0.115,0.035,194,0.019,107,0.18,0.089,493,...,60,0.048,215,0.574,0.298,16,0.254,-0.262,41,0
19396,25,0.64,0.068,0.021,112,0.025,138,0.311,0.149,809,...,60,0.048,215,0.574,0.298,16,0.254,-1,4,0
19397,25,0.64,0.068,0.021,112,0.025,138,0.311,0.149,809,...,67,0.027,97,0.555,0.178,41,0.612,0.057,123,1


In [202]:
# Columns that still contain at least one missing value.
null_columns = unencoded_df.columns[unencoded_df.isna().any(axis=0)]
# Missing-value count for each of those columns, shown as the cell output.
unencoded_df.loc[:, null_columns].isna().sum().tolist()

[]

In [None]:
# Show the rows that have any missing value, limited to the columns listed in null_columns.
display(unencoded_df.loc[unencoded_df.isna().any(axis=1), null_columns])

In [204]:
# Write the table to a CSV in the current working directory, without the row index.
output_csv_path = "training_data_mixed_model_v2.csv"
unencoded_df.to_csv(output_csv_path, index=False)

In [None]:
# Build `d`: one column per SEASON_ID, each cell a (TEAM_NAME, TEAM_ID, TEAM_ABBREVIATION)
# tuple for a distinct team seen in that season's game logs.
# The frame has 30 rows: seasons with fewer distinct teams are padded with NaN,
# and any teams beyond the 30th are dropped when the column is aligned to the index.
d = pd.DataFrame(index=np.arange(30))
team_columns = ["TEAM_NAME", "TEAM_ID", "TEAM_ABBREVIATION"]
for season in games["SEASON_ID"].unique():
    season_str = str(season)[1:]  # column label: SEASON_ID without its first character
    season_teams = (
        games.loc[games["SEASON_ID"] == season, team_columns]
        .drop_duplicates()
        # Sorted so the row order is the same on every run.
        .sort_values(team_columns)
    )
    team_tuples = list(season_teams.itertuples(index=False, name=None))
    # Append as the last column; duplicate labels are allowed, as before.
    d.insert(d.shape[1], season_str, pd.Series(team_tuples), True)