# This file will reshape the shift dataset as follows:

We want one row for each shift, and some ~500 columns for each NBA player. All values will be zero except for the 10 players that participate in the given shift. 

In [1]:
import pandas as pd
import numpy as np

shifts = pd.read_csv("../data/shifts_data.csv")
shifts.head()

Unnamed: 0.1,Unnamed: 0,point_differential,num_possessions,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5
0,0,-7,19.2,202694,1628369,201143,202681,203935,1627750,203914,203115,200794,203999
1,1,3,7.68,1628369,203382,201143,202681,203935,1627750,203914,203115,200794,203999
2,2,-1,1.3824,1627759,203382,201143,202681,203935,1627750,203914,203115,200794,203999
3,3,-1,2.7648,1627759,203382,201143,202681,203935,1628420,203914,203115,200794,203999
4,4,2,5.2224,1627759,203382,201143,202681,203935,1628420,203115,1627736,203486,203999


## Method:

We will construct a dictionary with the key = player_id and the value = the index at which this player will be kept in the final dataframe. So we will get all unique players for each column ('home_player_1', 'home_player_2', etc.) and iterate through these unique lists checking if a player is already in the dict - if so, continue, if not, add player to the dict and increment index.

Once we have this dictionary, we can initialize a player matrix with N rows and M columns where N is the number of shifts and M is the number of unique players, and we initialize this matrix with all zeros. Then we iterate through the original shifts dataset and specifically inspect each of the player columns, and for each player we see we get his index via the dictionary created above and we set that index at that row equal to 1. This may be computationally intensive but should be feasible.

In [6]:
index = 0 # initialize index to zero
player_index_map = dict() # initialize player to index mapping as dictionary
# first iterate through home columns (this should cover everything, but we will also do away columns just in case)
for i in range(1, 6):
    name = "home_player_" + str(i)
    players = pd.unique(shifts[name])
    for player in players:
        if player not in player_index_map:
            player_index_map[player] = index
            index += 1
        else:
            continue # if player is already in the index then continue
            
# now iterate through away columns to catch any special cases of players who only participated in away games
for i in range(1,6):
    name = "away_player_" + str(i)
    players = pd.unique(shifts[name])
    for player in players:
        if player not in player_index_map:
            player_index_map[player] = index
            index += 1
        else:
            continue # if player is already in the index then continue

In [10]:
M =len(player_index_map.keys()) # 529 unique NBA players in 2018-19 season according to our data. This seems reasonable
M

529

In [22]:
N = shifts.shape[0] # number of unique shifts in 2018-19 season
players_mat = np.zeros((N,M)) # initialize matrix

for i in range(N):
    # first handle home players
    for j in range(1,6):
        name = "home_player_" + str(j)
        player_id = shifts[name].iloc[i]
        index = player_index_map[player_id]
        players_mat[i, index] = 1
    # Now handle away players
    for j in range(1,6):
        name = "away_player_" + str(j)
        player_id = shifts[name].iloc[i]
        index = player_index_map[player_id]
        players_mat[i, index] = 1
    

In [30]:
players_df = pd.DataFrame(players_mat)
point_diff = shifts['point_differential']
num_poss = shifts['num_possessions']
point_diff_per_100 = point_diff / num_poss
# point_diff_per_100
# players_df.insert(loc = 0, column = "point_differential", value = point_diff)
# players_df.insert(loc = 1, column = "num_possessions", value = num_poss)
players_df.insert(loc = 0, column = "point_diff_per_100", value = point_diff_per_100)
players_df # This is the final dataframe that we will use to train our model

Unnamed: 0,point_diff_per_100,0,1,2,3,4,5,6,7,8,...,519,520,521,522,523,524,525,526,527,528
0,-0.364583,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.390625,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.723380,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.361690,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.382966,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33886,0.388682,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33887,0.605620,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33888,-0.723380,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33889,0.754831,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
player_index_df = pd.DataFrame(list(player_index_map.items()))
player_index_df.columns = ["player_id", "index"]
player_index_df
player_index_df.to_csv(r'../data/player_index_map.csv')

In [51]:
players_df.to_csv(r'../data/shifts_data_final_2018_19.csv')