# Creating Player Priors

This notebook creates player prior distributions from contract values and team ratings. Other variables may be added later; for simplicity, we start with contract values and team ratings only.

In [37]:
import pymc3 as pm
import pandas as pd
import numpy as np

# Load the 2018-19 shifts dataset and discard its first column before previewing it.
data = pd.read_csv("../data/shifts_data_final_2018_19.csv")
data = data.drop(columns=data.columns[0])

data.head()

Unnamed: 0,point_diff_per_100,home_team,away_team,0,1,2,3,4,5,6,...,519,520,521,522,523,524,525,526,527,528
0,-0.364583,Celtics,Nuggets,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.390625,Celtics,Nuggets,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.72338,Celtics,Nuggets,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.36169,Celtics,Nuggets,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.382966,Celtics,Nuggets,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## First - create a mapping from player ID to name

In [38]:
# Load the player list and discard its first column before previewing it.
playerlist = pd.read_csv("../data/playerlist.csv")
playerlist = playerlist.drop(columns=playerlist.columns[0])

playerlist.head()

Unnamed: 0,DISPLAY_FIRST_LAST,DISPLAY_LAST_COMMA_FIRST,FROM_YEAR,GAMES_PLAYED_FLAG,OTHERLEAGUE_EXPERIENCE_CH,PERSON_ID,PLAYERCODE,ROSTERSTATUS,TEAM_ABBREVIATION,TEAM_CITY,TEAM_CODE,TEAM_ID,TEAM_NAME,TO_YEAR
0,Alaa Abdelnaby,"Abdelnaby, Alaa",1990,Y,0,76001,HISTADD_alaa_abdelnaby,0,,,,0,,1994
1,Zaid Abdul-Aziz,"Abdul-Aziz, Zaid",1968,Y,0,76002,HISTADD_zaid_abdul-aziz,0,,,,0,,1977
2,Kareem Abdul-Jabbar,"Abdul-Jabbar, Kareem",1969,Y,0,76003,HISTADD_kareem_abdul-jabbar,0,,,,0,,1988
3,Mahmoud Abdul-Rauf,"Abdul-Rauf, Mahmoud",1990,Y,0,51,mahmoud_abdul-rauf,0,,,,0,,2000
4,Tariq Abdul-Wahad,"Abdul-Wahad, Tariq",1997,Y,0,1505,tariq_abdul-wahad,0,,,,0,,2003


In [39]:
# Load the player_id <-> index table and discard its first column.
player_index_map = pd.read_csv("../data/player_index_map.csv")
player_index_map = player_index_map.drop(columns=player_index_map.columns[0])

# Two lookups keyed on row position:
#   index_player_dict: row position -> player id
#   player_index_dict: player id    -> row position
index_player_dict = {}
player_index_dict = {}
for row_position, pid in enumerate(player_index_map["player_id"].to_numpy()):
    index_player_dict[row_position] = pid
    player_index_dict[pid] = row_position

player_index_map.head()

Unnamed: 0,player_id,index
0,202694.0,0
1,1628369.0,1
2,1627759.0,2
3,1626179.0,3
4,201577.0,4


In [50]:
# Note - player_index_map is already sorted by index so we can just take the player_id column as all of the id's of players we need
player_ids = player_index_map.player_id

# Build the id -> display-name lookup once instead of filtering playerlist for
# every player. keep="first" mirrors taking the first matching row per id.
name_by_id = (
    playerlist.drop_duplicates(subset="PERSON_ID", keep="first")
    .set_index("PERSON_ID")["DISPLAY_FIRST_LAST"]
    .to_dict()
)

# Fail with an explicit message (rather than an opaque IndexError) when an id
# used in the dataset has no entry in playerlist.
missing_ids = [pid for pid in player_ids if pid not in name_by_id]
if missing_ids:
    raise ValueError(f"player ids missing from playerlist: {missing_ids}")

# player_names stays a plain list, in the same order as player_index_map.
player_names = [name_by_id[pid] for pid in player_ids]

player_index_map["Name"] = player_names
player_index_map.head()

Unnamed: 0,player_id,index,Name
0,202694.0,0,Marcus Morris
1,1628369.0,1,Jayson Tatum
2,1627759.0,2,Jaylen Brown
3,1626179.0,3,Terry Rozier
4,201577.0,4,Robin Lopez


## Now that we have a list of names in the proper order, we can link these names to contract values

In [51]:
# Load the contract data and discard its first column before previewing it.
contracts_df = pd.read_csv("../data/contract_data.csv")
contracts_df = contracts_df.drop(columns=contracts_df.columns[0])

contracts_df.head()

Unnamed: 0,Year,Name,Age,Pos,Contract Value,Type
0,2018,Kent Bazemore,29,SG,"$18,089,887",Cap Space
1,2018,Miles Plumlee,30,C,"$12,500,000",Bird
2,2018,Dewayne Dedmon,29,C,"$7,200,000",Cap Space
3,2018,Trae Young,20,PG,"$5,356,440",Rookie
4,2018,Alex Len,25,C,"$4,350,000",Room


In [52]:
# Drop rows whose 'Contract Value' is a placeholder ('- ') or one of the short
# codes below (they look like position abbreviations) instead of a dollar
# amount; those rows cannot be cast to float.
non_monetary_values = ['- ', 'PG', 'G', 'C', 'SG', 'SF']
has_amount = ~contracts_df['Contract Value'].isin(non_monetary_values)

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the original (avoids SettingWithCopyWarning).
contracts_df = contracts_df.loc[has_amount].copy()

# Get rid of '$' and ',' in the contract value column and cast to float.
# Raw string: '\$' is not a valid escape sequence in a normal string literal.
contracts_df['Contract Value'] = (
    contracts_df['Contract Value'].replace(r'[\$,]', '', regex=True).astype(float)
)
contracts_df

Unnamed: 0,Year,Name,Age,Pos,Contract Value,Type
0,2018,Kent Bazemore,29,SG,18089887.0,Cap Space
1,2018,Miles Plumlee,30,C,12500000.0,Bird
2,2018,Dewayne Dedmon,29,C,7200000.0,Cap Space
3,2018,Trae Young,20,PG,5356440.0,Rookie
4,2018,Alex Len,25,C,4350000.0,Room
...,...,...,...,...,...,...
882,2019,Shabazz Napier,28,PG,1845301.0,Cap Space
883,2019,Isaac Bonga,19,PG,1416852.0,Cap Space
884,2019,Gary Payton II,26,PG,1052909.0,Hardship
885,2019,Admiral Schofield,22,SF,1000000.0,NT-MLE


In [65]:
# Now that we have player contracts, we can iterate through player_names and collect contract data
contracts_df = contracts_df.loc[contracts_df.Year == 2018]

# Player names as they appear in player_names (keys) mapped to the spelling
# used in the contract data (values).
wrong_match = {
    "Marcus Morris": "Marcus Morris Sr.",
    "Harry Giles III": "Harry Giles",
    "D.J. Wilson": "DJ Wilson",
    "Bam Adebayo": "Edrice Adebayo",
    "Nene": "Nene Hilario",
    "Patty Mills": "Patrick Mills",
    "Juancho Hernangomez": "Juan Hernangomez",
    "Robert Williams III": "Robert Williams",
    "Danuel House Jr.": "Danuel House",
    "PJ Tucker": "P.J. Tucker",
    "Mo Bamba": "Mohamed Bamba",
    "Svi Mykhailiuk": "Sviatoslav Mykhailiuk",
    "Ish Smith": "Ishmael Smith",
    "CJ McCollum": "C.J. McCollum",
    "CJ Miles": "C.J. Miles",
    "Lou Williams": "Louis Williams",
    "JJ Redick": "J.J. Redick",
    "JR Smith": "J.R. Smith",
    "Frank Mason": "Frank Mason III",
    "J.J. Barea": "Jose Barea",
}

# Keep the first contract row per name (same row `.iloc[0]` would select) and
# index by name so every player is resolved with a single lookup.
first_contract = contracts_df.drop_duplicates(subset="Name", keep="first").set_index("Name")
value_by_name = first_contract["Contract Value"]
type_by_name = first_contract["Type"]

zeros_list = set()   # players for whom no contract was found (contract recorded as 0)
contract_lst = []    # contract value per player, aligned with player_names
contract_types = []  # contract type per player ("NA" when no contract was found)
for player in player_names:
    # Try the name as-is first, then its known alternative spelling.
    # Explicit membership tests replace the previous bare `except:`, which hid
    # unrelated errors and crashed when an alias itself was absent from the data.
    alias = wrong_match.get(player)
    if player in value_by_name.index:
        contract_name = player
    elif alias is not None and alias in value_by_name.index:
        contract_name = alias
    else:
        contract_name = None

    if contract_name is None:
        # No contract found: record 0 / "NA" and remember the player.
        contract_lst.append(0)
        contract_types.append("NA")
        zeros_list.add(player)
    else:
        contract_lst.append(value_by_name.at[contract_name])
        contract_types.append(type_by_name.at[contract_name])

# Both lists must stay aligned with player_names for the index-based lookups that follow.
assert len(contract_lst) == len(contract_types) == len(player_names)

## At this point, `contract_lst` is aligned with `player_names`, which in turn follows the player index order used in the main dataset.

Sanity check below - Stephen Curry has the highest contract value. LeBron James also has a very high contract value. This seems to check out.

In [60]:
# Sanity check: look up LeBron James and the player with the largest contract.
lebron_index = player_names.index("LeBron James")

# A contract of 0 means the player was not matched to any contract row.
assert contract_lst[lebron_index] > 0, "LeBron James was not matched to a contract"

# Derive the position of the largest contract instead of hard-coding it, so the
# check keeps working if the player ordering changes.
max_contract_index = int(np.nanargmax(np.array(contract_lst, dtype=np.float64)))
player_names[max_contract_index]

'Stephen Curry'

# Now - convert contract value into an estimate of +/- at a rate of 3 million dollars of contract value per point

In [61]:
# Conversion rate: 3 million of contract value is treated as one point of +/-.
DOLLARS_PER_POINT = 3e6
prior_means = np.array(contract_lst) / DOLLARS_PER_POINT

# Display the largest prior mean (the player with the largest contract),
# computed from the data rather than read from a hard-coded position.
np.nanmax(prior_means)

12.485718

In [69]:
# Every player gets the same large prior standard deviation; the specific value is arbitrary.
prior_sd = np.full(len(prior_means), 3)

# One row per player: prior mean, prior sd, player name and contract type.
priors_df = pd.DataFrame(
    {
        'mu': prior_means,
        'sd': prior_sd,
        'name': player_names,
        'type': contract_types,
    }
)

# Keep only players whose prior mean is non-zero, then save them.
has_nonzero_mean = priors_df['mu'] != 0
priors_df = priors_df[has_nonzero_mean]
priors_df.to_csv(r'../data/priors_contract_only_zerorem.csv')

# Players on a 'Rookie' contract type are also saved to their own file.
is_rookie = priors_df['type'] == 'Rookie'
priors_rookie = priors_df[is_rookie]
priors_rookie.to_csv(r'../data/priors_rookies.csv')

Unnamed: 0,team,rating
0,HOU,13.219058
1,GSW,12.688947
2,TOR,11.463679
3,PHI,9.700574
4,UTA,9.675653
5,OKC,8.526489
6,BOS,8.267972
7,SAS,8.012503
8,MIN,7.507834
9,POR,7.476475
