# Creating Priors

This notebook will build on earlier notebooks to create team priors based on team ratings and contract value.

In [81]:
import pandas as pd
import numpy as np

# Load the combined contract+team priors and the contract-only priors.
# The first CSV column is dropped from each frame right after loading
# (presumably a saved row index -- verify against the notebook that wrote the files).
data = pd.read_csv("../data/priors_contract+team.csv")
data = data.drop(columns=data.columns[0])

contract_priors = pd.read_csv("../data/priors_contract_only.csv")
contract_priors = contract_priors.drop(columns=contract_priors.columns[0])

# Only the last expression of a cell is rendered, so just the contract priors preview shows.
data.head()
contract_priors.head()

Unnamed: 0,mu,sd,name
0,0.0,5,Marcus Morris
1,2.2336,5,Jayson Tatum
2,1.72332,5,Jaylen Brown
3,1.016797,5,Terry Rozier
4,4.785917,5,Robin Lopez


In [82]:
# Map each player name to its row position in `contract_priors`, then look up that
# position for every row of `data` so `data` can later be sorted into the same order.
# If a name repeats in `contract_priors`, the last occurrence wins; a name in `data`
# that is absent from `contract_priors` raises KeyError.
d = {player: position for position, player in enumerate(contract_priors['name'])}

new_col = [d[player] for player in data['name']]
    



In [83]:
# Attach the target row position, order the rows by it (matching `contract_priors`),
# and fill every missing value (in any column) with 0.
data = (
    data
    .assign(index=new_col)
    .sort_values(by=['index'])
    .fillna(0)
)
data

Unnamed: 0,rating,Team,mu,sd,name,index
0,0.000000,0,0.000000,5,Marcus Morris,0
118,8.267972,Boston Celtics,2.233600,5,Jayson Tatum,1
119,8.267972,Boston Celtics,1.723320,5,Jaylen Brown,2
120,8.267972,Boston Celtics,1.016797,5,Terry Rozier,3
130,-1.539042,Chicago Bulls,4.785917,5,Robin Lopez,4
...,...,...,...,...,...,...
113,0.000000,0,0.000000,5,RJ Hunter,524
114,0.000000,0,0.000000,5,Kalin Lucas,525
115,0.000000,0,0.000000,5,Angel Delgado,526
116,0.000000,0,0.000000,5,Dusty Hannahs,527


### Now center contract priors 'mu'

In [84]:
# Center the contract prior means ('mu').
# Zeros are first replaced by the mean of the nonzero values, so after centering
# those rows end up at (approximately) 0.
# NOTE(review): a genuine 0.0 contract prior is indistinguishable from a filled-in
# missing value here -- confirm that treating both the same is intended.
tmp_mean = data.loc[data['mu'] != 0, 'mu'].mean()

# Assign the result back instead of calling replace(..., inplace=True) on data['mu']:
# the in-place call acts on a temporary Series, which raises a chained-assignment
# warning in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['mu'] = data['mu'].replace(0, tmp_mean)

data['mu'] = data['mu'] - data['mu'].mean()  # center the contract priors
data

Unnamed: 0,rating,Team,mu,sd,name,index
0,0.000000,0,0.000000,5,Marcus Morris,0
118,8.267972,Boston Celtics,-0.210400,5,Jayson Tatum,1
119,8.267972,Boston Celtics,-0.720680,5,Jaylen Brown,2
120,8.267972,Boston Celtics,-1.427204,5,Terry Rozier,3
130,-1.539042,Chicago Bulls,2.341916,5,Robin Lopez,4
...,...,...,...,...,...,...
113,0.000000,0,0.000000,5,RJ Hunter,524
114,0.000000,0,0.000000,5,Kalin Lucas,525
115,0.000000,0,0.000000,5,Angel Delgado,526
116,0.000000,0,0.000000,5,Dusty Hannahs,527


In [85]:
# Report the spread of the centered contract priors: smallest, largest, and their range.
mu_values = data['mu'].tolist()
min_mu, max_mu = min(mu_values), max(mu_values)
print(f"{min_mu} {max_mu}")

range_mu = max_mu - min_mu
print(f"{range_mu}")

-2.442421248175182 10.041717751824816
12.484138999999999


### Now center team ratings

In [86]:
# Center the team ratings.
# Zero ratings are first replaced by the mean of the nonzero ratings, so after
# centering those rows end up at (approximately) 0.
# NOTE(review): a team whose real rating is exactly 0 would be treated the same as a
# filled-in missing value -- confirm that this is intended.
tmp_mean = data.loc[data['rating'] != 0, 'rating'].mean()

# Assign the result back instead of calling replace(..., inplace=True) on data['rating']:
# the in-place call acts on a temporary Series, which raises a chained-assignment
# warning in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['rating'] = data['rating'].replace(0, tmp_mean)

data['rating'] = data['rating'] - data['rating'].mean()  # center team ratings
data

Unnamed: 0,rating,Team,mu,sd,name,index
0,0.000000,0,0.000000,5,Marcus Morris,0
118,2.753041,Boston Celtics,-0.210400,5,Jayson Tatum,1
119,2.753041,Boston Celtics,-0.720680,5,Jaylen Brown,2
120,2.753041,Boston Celtics,-1.427204,5,Terry Rozier,3
130,-7.053973,Chicago Bulls,2.341916,5,Robin Lopez,4
...,...,...,...,...,...,...
113,0.000000,0,0.000000,5,RJ Hunter,524
114,0.000000,0,0.000000,5,Kalin Lucas,525
115,0.000000,0,0.000000,5,Angel Delgado,526
116,0.000000,0,0.000000,5,Dusty Hannahs,527


In [87]:
# Report the spread of the centered team ratings: smallest, largest, and their range.
rating_values = data['rating'].tolist()
min_rating, max_rating = min(rating_values), max(rating_values)
print(f"{min_rating} {max_rating}")

range_rating = max_rating - min_rating
print(f"{range_rating}")

-8.985215934303032 7.70412714369697
16.689343078


## Combine Contract Prior (stored as mu) and Team Rating Prior

We need some weighted average of the contract prior and the team rating prior to get a mean for the final prior distribution for each player. However, I am unsure of how to come up with a reasonable weighting other than using cross validation and selecting from a few choices of weights (80/20, 70/30, 60/40, 50/50, etc.). Perhaps Kostas or Brian has some input on how we should weight these, or perhaps cross validation is simply best.

For now, we tried a simple average, but this gives very unintuitive and likely bad priors (James Harden has a negative prior mean since the Rockets are quite good, and Steph Curry's mean is barely positive). Instead, let's try an 80/20 split in favor of contract priors. These results seem a bit more reasonable, though for the final split we should use cross validation to determine the optimal weighting.

**Note** - this does a very poor job of dealing with rookie contracts, as expected. Jayson Tatum should not have a negative prior, but he does due to his low contract and his good team.

In [93]:
# Blend the two centered priors into the final prior mean using an 80/20 split in
# favor of the contract prior.
# NOTE - the team rating is subtracted to capture the notion that +/- should be
# penalized for players on good teams and boosted for players on bad teams.
contract_weight = 0.8
team_weight = 0.2
blended_mu = contract_weight * data['mu'] - team_weight * data['rating']

final_priors_df = pd.DataFrame({'name': data['name'], 'mu': blended_mu, 'sd': data['sd']})
final_priors_df

Unnamed: 0,name,mu,sd
0,Marcus Morris,0.000000,5
118,Jayson Tatum,-0.718928,5
119,Jaylen Brown,-1.127152,5
120,Terry Rozier,-1.692371,5
130,Robin Lopez,3.284328,5
...,...,...,...
113,RJ Hunter,0.000000,5
114,Kalin Lucas,0.000000,5
115,Angel Delgado,0.000000,5
116,Dusty Hannahs,0.000000,5


In [101]:
# Spot-check a single player's final prior, selected by positional row number.
spot_check_position = 275
final_priors_df.iloc[spot_check_position]

name    Jared Dudley
mu           1.36346
sd                 5
Name: 281, dtype: object

In [102]:
# Persist the final priors. With to_csv defaults the DataFrame index is written as
# the first CSV column.
output_path = r'../data/priors_team_contract_formatted.csv'
final_priors_df.to_csv(output_path)