In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [2]:
home = pd.read_csv('../data/NBAHomePer100.csv')

team_dict = {
    'ATL': 'Hawks',
    'BOS': 'Celtics',
    'CHA': 'Hornets', 
    'CHI': 'Bulls',
    'CLE': 'Cavaliers',
    'DAL': 'Mavericks',
    'DEN': 'Nuggets',
    'DET': 'Pistons',
    'GSW': 'Warriors',
    'HOU': 'Rockets',
    'IND': 'Pacers',
    'LAC': 'Clippers',
    'LAL': 'Lakers',
    'MEM': 'Grizzlies',
    'MIA': 'Heat',
    'MIL': 'Bucks',
    'MIN': 'Timberwolves',
    'NJN': 'Nets', 
    'NOH': 'Pelicans', 
    'NYK': 'Knicks',
    'OKC': 'Thunder',
    'ORL': 'Magic',
    'PHI': '76ers',
    'PHX': 'Suns',
    'POR': 'Trail Blazers',
    'SAC': 'Kings',
    'SAS': 'Spurs',
    'TOR': 'Raptors',
    'UTA': 'Jazz',
    'WAS': 'Wizards',
    'BKN': 'Nets',
    'NOP': 'Pelicans'
}

home['season'] = home['TeamSeason'].str[:4]
home['team'] = home['TeamSeason'].str[4:]
home = home.drop(columns=['TeamSeason'])

def rename_columns(col):
    if col.endswith('1'):
        return col[:-1]
    elif col.endswith('2'):
        return 'opponent_' + col[:-1]
    else:
        return col

home.columns = [rename_columns(col) for col in home.columns]

away = pd.read_csv('../data/NBAAwayPer100.csv')

away['season'] = away['TeamSeason'].str[:4]
away['team'] = away['TeamSeason'].str[4:]
away = away.drop(columns=['TeamSeason'])

away.columns = [rename_columns(col) for col in away.columns]

In [3]:
master_df = pd.merge(home, away, on=['team', 'season'], suffixes=('_home', '_away'))

team_rapm = pd.read_csv('../data/TeamRAPMMinutesWeighted.csv')
player_rapm = pd.read_csv('../data/PlayerRAPM.csv')

team_rapm[['season', 'team']] = team_rapm['team_season'].str.split(' - ', expand=True)
team_rapm['season'] = team_rapm['season'].str.split('-').str[0]

team_rapm = team_rapm.drop(columns=['team_season'])

team_rapm['team'] = team_rapm['team'].map(team_dict)

df = pd.merge(master_df, team_rapm, on=['team', 'season'])

df['total_wins'] = df['Wins_home'] + df['Wins_away']

df = df.drop(columns =  ['Wins_home','Wins_away','Unnamed: 0','Team_Stats_home','Team_Stats_away','Opponent_Stats_home','Opponent_Stats_away'])

desired_columns = ['season', 'team'] + [col for col in df.columns if col not in ['season', 'team', 'total_wins']] + ['total_wins']
df = df[desired_columns]

## Create dummies for fixed effects model ##
team_dummies = pd.get_dummies(df['team'], prefix='team', drop_first=True)
season_dummies = pd.get_dummies(df['season'], prefix='season', drop_first=True)

df = pd.concat([df, team_dummies, season_dummies], axis=1)

df = pd.read_csv('../data/master_df_with_lags.csv')
df = df.drop(columns = 'Column 1')
df = df.drop(columns = 'lag 2')

df = df.dropna()

In [4]:
df

Unnamed: 0,season,team,team_score_home,assists_home,blocks_home,defensive_rebounds_home,fast_break_points_home,field_goals_made_home,field_goals_attempted_home,flagrant_fouls_home,...,season_2013,season_2014,season_2015,season_2016,season_2017,season_2018,season_2019,season_2020,season_2021,season_2022
1,2010,76ers,106.379759,21.826870,6.007636,32.556666,19.203449,40.610571,88.592959,0.026234,...,0,0,0,0,0,0,0,0,0,0
2,2011,76ers,104.245712,23.362721,4.197989,33.792507,17.104849,40.441496,88.809629,0.078223,...,0,0,0,0,0,0,0,0,0,0
3,2012,76ers,103.461156,23.249698,5.812424,34.177056,15.743367,41.284821,91.603810,0.033214,...,0,0,0,0,0,0,0,0,0,0
4,2013,76ers,117.513286,26.916451,5.720918,38.952262,13.598904,46.580153,107.196876,0.031262,...,1,0,0,0,0,0,0,0,0,0
5,2014,76ers,117.262895,24.282297,4.873866,37.105206,17.841831,44.793149,104.672074,0.029011,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409,2018,Wizards,125.057466,4.747174,25.017318,38.152138,15.231731,46.889269,103.185516,9.494349,...,0,0,0,0,0,1,0,0,0,0
410,2019,Wizards,126.401906,29.273765,4.694849,36.343655,16.956690,46.589473,102.264859,0.027617,...,0,0,0,0,0,0,1,0,0,0
411,2020,Wizards,129.485215,29.035509,5.011523,34.830082,16.600669,46.920380,102.485637,0.156610,...,0,0,0,0,0,0,0,1,0,0
412,2021,Wizards,126.932556,28.262086,4.751486,38.968356,14.501289,46.743515,100.213160,0.123415,...,0,0,0,0,0,0,0,0,1,0


In [5]:
# Split 1 (Random Split )

# X = df.drop(columns = ['total_wins','season','team'])
# y = df['total_wins']

# X = df.drop(columns = ['total_wins','season','team'])
# y = df['total_wins']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=42)

In [6]:
# Split 2 (Time Series Split)

df['season'] = df['season'].astype(int)
X = df.drop(columns=['total_wins', 'team'])
y = df['total_wins']

X_train = X.loc[X['season'] <= 2019]
X_test = X.loc[X['season'] > 2019]
y_train = y.loc[X['season'] <= 2019]
y_test = y.loc[X['season'] > 2019]

X_train = X_train.drop(columns=['season'])
X_test = X_test.drop(columns=['season'])


In [7]:
# Linear Regression #

model = LinearRegression()
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

lr_coefs = model.coef_
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error on Test Data:", mse)
r2 = r2_score(y_test, y_pred)
print('R2 score for LR is ', r2)
print( ' ')

Mean Squared Error on Test Data: 13183.356620488805
R2 score for LR is  -102.31694219346683
 


In [8]:
#Ridge

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

alphas = [1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.5, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 500.0, 1000.0]

param_grid = {'alpha': alphas}

ridge_model = Ridge()
grid_search = GridSearchCV(estimator=ridge_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train_scaled, y_train)

best_alpha = grid_search.best_params_['alpha']
ridge_model = Ridge(alpha=best_alpha)
ridge_model.fit(X_train_scaled, y_train)
y_pred = ridge_model.predict(X_test_scaled)

ridge_coefs = ridge_model.coef_

print("Best Alpha:", grid_search.best_params_['alpha'])
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error on Test Data:", mse)
r2 = r2_score(y_test, y_pred)
print('R2 score for Ridge is ', r2)
print( ' ' )

Best Alpha: 5.0
Mean Squared Error on Test Data: 21.59126477052915
R2 score for Ridge is  0.8307909344791822
 


In [9]:
## Lasso ##

lasso_model = Lasso(max_iter = 100000, tol=6e-2)

grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)

grid_search.fit(X_train, y_train)
best_lasso_model = grid_search.best_estimator_

y_pred = best_lasso_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

test_error = mean_squared_error(y_test, y_pred)
print("Best Alpha:", grid_search.best_params_['alpha'])
print("Lasso Regression Test Error (MSE):", test_error)
print('Lasso R2 Score: ', r2)
print( ' ' ' ' ' ' ' ' ' ')
lasso_coefs = best_lasso_model.coef_

Best Alpha: 1.0
Lasso Regression Test Error (MSE): 15.254912790767476
Lasso R2 Score:  0.8804484329491147
     


In [10]:
# Elastic Net

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

elastic_net_model = ElasticNet(tol = 6e-2)

param_grid = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.5, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 500.0, 
              1000.0],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 1.0]
}

grid_search = GridSearchCV(estimator=elastic_net_model, param_grid=param_grid, scoring='neg_mean_squared_error', 
                           cv=5)

grid_search.fit(X_train_scaled, y_train)

best_elastic_net_model = grid_search.best_estimator_

y_pred = best_elastic_net_model.predict(X_test_scaled)

r2 = r2_score(y_test, y_pred)
test_error = mean_squared_error(y_test, y_pred)

print("Best Parameters:", grid_search.best_params_)
print("Elastic Net Regression Test Error (MSE):", test_error)
print('Elastic Net R2 Score: ', r2)
print('')

elastic_net_coefs = best_elastic_net_model.coef_

Best Parameters: {'alpha': 0.1, 'l1_ratio': 0.1}
Elastic Net Regression Test Error (MSE): 34.58979156188198
Elastic Net R2 Score:  0.7289224893052675



In [11]:
feature_names = X.columns[1:]
coefs_df = pd.DataFrame({
    'Feature': feature_names,
    'Lasso Coefficient': (lasso_coefs),
    'Ridge Coefficient': (ridge_coefs),
    'ElasticNet Coefficient': (elastic_net_coefs)
})

coefs_df.to_csv('../data/coefficients_df.csv')

coefs_df

Unnamed: 0,Feature,Lasso Coefficient,Ridge Coefficient,ElasticNet Coefficient
0,team_score_home,0.892144,3.342494,1.758067
1,assists_home,0.000000,0.632691,0.651592
2,blocks_home,-0.000000,-0.104930,-0.349813
3,defensive_rebounds_home,0.000000,1.873159,0.895915
4,fast_break_points_home,-0.014332,-0.252972,-0.180731
...,...,...,...,...
138,season_2018,0.000000,-0.156041,-0.136443
139,season_2019,-0.000000,-0.108110,-0.575750
140,season_2020,0.000000,0.000000,0.000000
141,season_2021,0.000000,0.000000,0.000000


In [12]:
players_df = pd.read_csv('../data/players.csv')
players_df = players_df.loc[players_df['GP'] > 30]
players_df = players_df.drop(columns=['Unnamed: 0','X','Player'], errors='ignore')
players_df.rename(columns = {'Team':'team'}, inplace=True)
players_df['team'] = players_df['team'].map(team_dict)
players_df['season'] = players_df['season'].str[:4]
players_df = pd.get_dummies(players_df, columns=['season', 'team'], drop_first={'team': True, 'season': False})
players_df['playerId'] = players_df['playerId'].astype(str)

numeric_columns = players_df.select_dtypes(include=['number']).columns
scaler = StandardScaler()

players_df[numeric_columns] = scaler.fit_transform(players_df[numeric_columns])

players_normalized_df = pd.DataFrame(columns=df.columns)

players_normalized_df['playerId'] = players_df['playerId']
players_normalized_df['playerName'] = players_df['playerName']
players_normalized_df['team'] = np.nan
players_normalized_df['team_score_home'] = players_df['PTS']
players_normalized_df['assists_home'] = players_df['AST']
players_normalized_df['blocks_home'] = players_df['BLK']
players_normalized_df['defensive_rebounds_home'] = players_df['DREB']
players_normalized_df['fast_break_points_home'] = np.nan
players_normalized_df['field_goals_made_home'] = players_df['FGM']
players_normalized_df['field_goals_attempted_home'] = players_df['FGA']
players_normalized_df['flagrant_fouls_home'] = np.nan
players_normalized_df['fouls_home'] = players_df['PF']
players_normalized_df['free_throws_made_home'] = players_df['FTM']
players_normalized_df['free_throws_attempted_home'] = players_df['FTA']
players_normalized_df['offensive_rebounds_home'] = players_df['ORB']
players_normalized_df['points_in_paint_home'] = np.nan
players_normalized_df['steals_home'] = players_df['STL']
players_normalized_df['team_turnovers_home'] = players_df['TO']
players_normalized_df['technical_fouls_home'] = np.nan
players_normalized_df['three_point_field_goals_made_home'] = players_df['3PM']
players_normalized_df['three_point_field_goals_attempted_home'] = players_df['3PA']
players_normalized_df['total_rebounds_home'] = players_df['ORB'] + players_df['DREB']
players_normalized_df['total_technical_fouls_home'] = np.nan
players_normalized_df['turnover_points_home'] = np.nan
players_normalized_df['turnovers_home'] = players_df['TO']
players_normalized_df['opponent_team_score_home'] = np.nan
players_normalized_df['opponent_assists_home'] = np.nan
players_normalized_df['opponent_blocks_home'] = np.nan
players_normalized_df['opponent_defensive_rebounds_home'] = np.nan
players_normalized_df['opponent_fast_break_points_home'] = np.nan
players_normalized_df['opponent_field_goals_made_home'] = np.nan
players_normalized_df['opponent_field_goals_attempted_home'] = np.nan
players_normalized_df['opponent_flagrant_fouls_home'] = np.nan
players_normalized_df['opponent_fouls_home'] = np.nan
players_normalized_df['opponent_free_throws_made_home'] = np.nan
players_normalized_df['opponent_free_throws_attempted_home'] = np.nan
players_normalized_df['opponent_offensive_rebounds_home'] = np.nan
players_normalized_df['opponent_points_in_paint_home'] = np.nan
players_normalized_df['opponent_steals_home'] = np.nan
players_normalized_df['opponent_team_turnovers_home'] = np.nan
players_normalized_df['opponent_technical_fouls_home'] = np.nan
players_normalized_df['opponent_three_point_field_goals_made_home'] = np.nan
players_normalized_df['opponent_three_point_field_goals_attempted_home'] = np.nan
players_normalized_df['opponent_total_rebounds_home'] = np.nan
players_normalized_df['opponent_total_technical_fouls_home'] = np.nan
players_normalized_df['opponent_turnover_points_home'] = np.nan
players_normalized_df['opponent_turnovers_home'] = np.nan
players_normalized_df['Games_home'] = np.nan
players_normalized_df['Possessions_home'] = np.nan

players_normalized_df['team_score_away'] = players_df['PTS']
players_normalized_df['assists_away'] = players_df['AST']
players_normalized_df['blocks_away'] = players_df['BLK']
players_normalized_df['defensive_rebounds_away'] = players_df['DREB']
players_normalized_df['fast_break_points_away'] = np.nan
players_normalized_df['field_goals_made_away'] = players_df['FGM']
players_normalized_df['field_goals_attempted_away'] = players_df['FGA']
players_normalized_df['flagrant_fouls_away'] = np.nan
players_normalized_df['fouls_away'] = players_df['PF']
players_normalized_df['free_throws_made_away'] = players_df['FTM']
players_normalized_df['free_throws_attempted_away'] = players_df['FTA']
players_normalized_df['offensive_rebounds_away'] = players_df['ORB']
players_normalized_df['points_in_paint_away'] = np.nan
players_normalized_df['steals_away'] = players_df['STL']
players_normalized_df['team_turnovers_away'] = players_df['TO']
players_normalized_df['technical_fouls_away'] = np.nan
players_normalized_df['three_point_field_goals_made_away'] = players_df['3PM']
players_normalized_df['three_point_field_goals_attempted_away'] = players_df['3PA']
players_normalized_df['total_rebounds_away'] = players_df['ORB'] + players_df['DREB']
players_normalized_df['total_technical_fouls_away'] = np.nan
players_normalized_df['turnover_points_away'] = np.nan
players_normalized_df['turnovers_away'] = players_df['TO']
players_normalized_df['opponent_team_score_away'] = np.nan
players_normalized_df['opponent_assists_away'] = np.nan
players_normalized_df['opponent_blocks_away'] = np.nan
players_normalized_df['opponent_defensive_rebounds_away'] = np.nan
players_normalized_df['opponent_fast_break_points_away'] = np.nan
players_normalized_df['opponent_field_goals_made_away'] = np.nan
players_normalized_df['opponent_field_goals_attempted_away'] = np.nan
players_normalized_df['opponent_flagrant_fouls_away'] = np.nan
players_normalized_df['opponent_fouls_away'] = np.nan
players_normalized_df['opponent_free_throws_made_away'] = np.nan
players_normalized_df['opponent_free_throws_attempted_away'] = np.nan
players_normalized_df['opponent_offensive_rebounds_away'] = np.nan
players_normalized_df['opponent_points_in_paint_away'] = np.nan
players_normalized_df['opponent_steals_away'] = np.nan
players_normalized_df['opponent_team_turnovers_away'] = np.nan
players_normalized_df['opponent_technical_fouls_away'] = np.nan
players_normalized_df['opponent_three_point_field_goals_made_away'] = np.nan
players_normalized_df['opponent_three_point_field_goals_attempted_away'] = np.nan
players_normalized_df['opponent_total_rebounds_away'] = np.nan
players_normalized_df['opponent_total_technical_fouls_away'] = np.nan
players_normalized_df['opponent_turnover_points_away'] = np.nan
players_normalized_df['opponent_turnovers_away'] = np.nan
players_normalized_df['Games_away'] = np.nan
players_normalized_df['Possessions_away'] = np.nan
players_normalized_df['MP'] = players_df['Min']
players_normalized_df['new_LA_RAPM'] = players_df['LA_RAPM']
players_normalized_df['new_LA_RAPM__Def'] = players_df['LA_RAPM__Def']
players_normalized_df['new_LA_RAPM__Off'] = players_df['LA_RAPM__Off']
players_normalized_df['new_RAPM'] = players_df['RAPM']
players_normalized_df['new_RAPM__Def'] = players_df['RAPM__Def']
players_normalized_df['new_RAPM__Off'] = players_df['RAPM__Off']
players_normalized_df['total_wins'] = np.nan
players_normalized_df['lag 1'] = np.nan
# players_normalized_df['lag 2'] = np.nan
players_normalized_df['team_change'] = np.nan
players_normalized_df['team_Bucks'] = players_df['team_Bucks']
players_normalized_df['team_Bulls'] = players_df['team_Bulls']
players_normalized_df['team_Cavaliers'] = players_df['team_Cavaliers']
players_normalized_df['team_Celtics'] = players_df['team_Celtics']
players_normalized_df['team_Clippers'] = players_df['team_Clippers']
players_normalized_df['team_Grizzlies'] = players_df['team_Grizzlies']
players_normalized_df['team_Hawks'] = players_df['team_Hawks']
players_normalized_df['team_Heat'] = players_df['team_Heat']

players_normalized_df = players_normalized_df[['playerId', 'playerName'] + [col for col in players_normalized_df.columns if col not in ['playerId', 'playerName']]]

features = players_normalized_df.drop(columns=['playerId', 'playerName', 'season', 'team','total_wins']).columns
players_normalized_df[features] = players_normalized_df[features].fillna(0)
players_normalized_df['ridge_projection'] = players_normalized_df[features].dot(ridge_coefs)
players_normalized_df['lasso_projection'] = players_normalized_df[features].dot(lasso_coefs)
players_normalized_df['elastic_net_projection'] = players_normalized_df[features].dot(elastic_net_coefs)

player_coefficients = players_normalized_df[['playerId', 'playerName','ridge_projection', 'lasso_projection', 'elastic_net_projection']]

In [13]:
player_coefficients

Unnamed: 0,playerId,playerName,ridge_projection,lasso_projection,elastic_net_projection
0,201985,AJ Price,-1.127863,-0.185230,-0.872495
1,201166,Aaron Brooks,0.101664,-0.210987,-0.787982
2,201189,Aaron Gray,-0.656595,0.067534,-1.875201
3,1733,Al Harrington,2.677690,-0.021720,1.968420
4,201143,Al Horford,1.197090,0.413666,1.307962
...,...,...,...,...,...
6557,1629139,Yuta Watanabe,-1.726979,-0.099884,-1.466170
6558,1628380,Zach Collins,2.185716,0.143244,0.259020
6559,203897,Zach LaVine,5.403041,-0.083168,5.896084
6560,1630192,Zeke Nnaji,-0.798826,-0.032913,-2.101333


In [14]:
player_coefficients.loc[:, ['ridge_scaled', 'lasso_scaled', 'elastic_net_scaled']] = scaler.fit_transform(
    player_coefficients[['ridge_projection', 'lasso_projection', 'elastic_net_projection']]
)

# Shift the standardized values to have an average of 100
player_coefficients['ridge_normalized'] = round(player_coefficients['ridge_scaled'] * 10 + 100)
player_coefficients['lasso_normalized'] = round(player_coefficients['lasso_scaled'] * 10 + 100)
player_coefficients['elastic_net_normalized'] = round(player_coefficients['elastic_net_scaled'] * 10 + 100)

# Drop the intermediate scaled columns
player_coefficients.drop(columns=['ridge_scaled', 'lasso_scaled', 'elastic_net_scaled'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_coefficients.loc[:, ['ridge_scaled', 'lasso_scaled', 'elastic_net_scaled']] = scaler.fit_transform(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_coefficients['ridge_normalized'] = round(player_coefficients['ridge_scaled'] * 10 + 100)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pl

In [15]:
player_coefficients_norm = player_coefficients[['playerId','playerName','ridge_normalized','lasso_normalized','elastic_net_normalized']]

In [16]:
player_coefficients_norm

Unnamed: 0,playerId,playerName,ridge_normalized,lasso_normalized,elastic_net_normalized
0,201985,AJ Price,95.0,89.0,96.0
1,201166,Aaron Brooks,100.0,88.0,97.0
2,201189,Aaron Gray,97.0,104.0,92.0
3,1733,Al Harrington,113.0,99.0,108.0
4,201143,Al Horford,106.0,123.0,105.0
...,...,...,...,...,...
6557,1629139,Yuta Watanabe,92.0,94.0,94.0
6558,1628380,Zach Collins,110.0,108.0,101.0
6559,203897,Zach LaVine,125.0,95.0,124.0
6560,1630192,Zeke Nnaji,96.0,98.0,91.0


In [17]:
positions = pd.read_csv('../data/positions1.csv')

positions = positions[['LeagueDashPlayerStats.PLAYER_ID', 'Position']]
positions.rename(columns={'LeagueDashPlayerStats.PLAYER_ID': 'playerId', 'Position': 'position'}, inplace=True)
positions['playerId'] = positions['playerId'].astype(str)

player_coefficients_norm['playerId'] = player_coefficients_norm['playerId'].astype(str)

player_master_df = player_coefficients_norm.merge(positions, on='playerId', how='left')
player_master_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_coefficients_norm['playerId'] = player_coefficients_norm['playerId'].astype(str)


Unnamed: 0,playerId,playerName,ridge_normalized,lasso_normalized,elastic_net_normalized,position
0,201985,AJ Price,95.0,89.0,96.0,Guard
1,201985,AJ Price,95.0,89.0,96.0,Guard
2,201985,AJ Price,95.0,89.0,96.0,Guard
3,201985,AJ Price,95.0,89.0,96.0,Guard
4,201985,AJ Price,95.0,89.0,96.0,Guard
...,...,...,...,...,...,...
56184,1630192,Zeke Nnaji,96.0,98.0,91.0,Forward
56185,1630192,Zeke Nnaji,96.0,98.0,91.0,Center
56186,1630533,Ziaire Williams,110.0,92.0,107.0,Forward
56187,1630533,Ziaire Williams,110.0,92.0,107.0,Forward


In [18]:
pip install cvxpy

Note: you may need to restart the kernel to use updated packages.


In [19]:
import cvxpy as cp

performance_metrics = [
    'ridge_normalized',
    'elastic_net_normalized',
    'lasso_normalized',
    'graeme+'
]

player_master_df['graeme+'] = (player_master_df['elastic_net_normalized'] + player_master_df['ridge_normalized']) / 2

rosters = {}
solve_values = {}

for metric in performance_metrics:
    performance_metric = player_master_df[metric]
    
    n = len(player_master_df)
    x = cp.Variable(n, boolean=True)

    obj = cp.Maximize(cp.sum(cp.multiply(performance_metric, x)))

    constraints = [
        cp.sum(x) == 10,
        cp.sum(cp.multiply(x, (player_master_df['position'] == 'Guard').astype(int).values)) >= 2,
        cp.sum(cp.multiply(x, (player_master_df['position'] == 'Center').astype(int).values)) >= 2,
        cp.sum(cp.multiply(x, (player_master_df['position'] == 'Forward').astype(int).values)) >= 2,
    ]
    
    unique_player_ids = player_master_df['playerId'].unique()
    for player_id in unique_player_ids:
        indices = (player_master_df['playerId'] == player_id).astype(int).values
        constraints.append(cp.sum(cp.multiply(x, indices)) <= 1)
        
    problem = cp.Problem(obj, constraints)

    solve_value = problem.solve()
    solve_values[metric] = solve_value
    
    roster = np.where(x.value >= 0.5)[0]
    roster_names = player_master_df.iloc[roster][['playerId', 'playerName', 'position']]
    rosters[metric] = roster_names

for metric, roster in rosters.items():
    print(f"Roster for {metric}:")
    print(roster)
    print(f"Solve value for {metric}: {solve_values[metric]}")
    print("\n")

Roster for ridge_normalized:
      playerId          playerName position
4529      2216       Zach Randolph   Center
38911   201935        James Harden    Guard
40864   201939       Stephen Curry    Guard
42433  1628415       Dillon Brooks  Forward
43124  1628991   Jaren Jackson Jr.   Center
43476  1626157  Karl-Anthony Towns  Forward
49066   203897         Zach LaVine    Guard
53619  1630217        Desmond Bane    Guard
53684  1628378    Donovan Mitchell    Guard
54865  1630163         LaMelo Ball    Guard
Solve value for ridge_normalized: 1356.0


Roster for elastic_net_normalized:
      playerId         playerName position
4529      2216      Zach Randolph   Center
27552   201939      Stephen Curry    Guard
38911   201935       James Harden    Guard
40473   202331        Paul George    Guard
43124  1628991  Jaren Jackson Jr.   Center
46271  1628415      Dillon Brooks  Forward
50002  1630217       Desmond Bane    Guard
50778  1628369       Jayson Tatum  Forward
53684  1628378   Donov

In [20]:
salaries = pd.read_csv('../data/salaries.csv')
salaries.drop(columns=['Player'], inplace=True)
salaries.replace(np.nan, 10000000000000, inplace=True)

performance_metrics = [
    'ridge_normalized',
    'elastic_net_normalized',
    'lasso_normalized',
    'graeme+'
]

salaries['graeme+'] = (salaries['elastic_net_normalized'] + salaries['ridge_normalized']) / 2

salaries.drop(columns=['2020/21', '2021/22', '2022/23'], inplace=True)
salaries['2023/24'] = salaries['2023/24'].astype(int)

salary_caps = {
    "Big Market Optimal Roster": 200_000_000,
    "Mid Market Optimal Roster": 170_000_000,
    "Small Market Optimal Roster": 130_000_000,
}

for team_type, salary_cap in salary_caps.items():
    rosters = {}
    solve_values = {}

    for metric in performance_metrics:
        performance_metric = salaries[metric]

        n = len(salaries)
        x = cp.Variable(n, boolean=True)

        obj = cp.Maximize(cp.sum(cp.multiply(performance_metric, x)))

        constraints = [
            cp.sum(x) == 10,
            cp.sum(cp.multiply(x, (salaries['position'] == 'Guard').astype(int).values)) >= 2,
            cp.sum(cp.multiply(x, (salaries['position'] == 'Center').astype(int).values)) >= 2,
            cp.sum(cp.multiply(x, (salaries['position'] == 'Forward').astype(int).values)) >= 2,
            cp.sum(cp.multiply(x, salaries['2023/24'].values)) <= salary_cap
        ]

        unique_player_ids = salaries['playerId'].unique()
        for player_id in unique_player_ids:
            indices = (salaries['playerId'] == player_id).astype(int).values
            constraints.append(cp.sum(cp.multiply(x, indices)) <= 1)

        problem = cp.Problem(obj, constraints)

        solve_value = problem.solve()
        solve_values[metric] = solve_value

        roster = np.where(x.value >= 0.5)[0]
        roster_names = salaries.iloc[roster][['playerId', 'playerName', 'position']]
        rosters[metric] = roster_names

    print(f"Optimal roster construction at {salary_cap} ({team_type}):\n")
    for metric, roster in rosters.items():
        print(f"Roster for {metric}:")
        print(roster)
        print(f"Solve value for {metric}: {solve_values[metric]}")
        print("\n")

Optimal roster construction at 200000000 (Big Market Optimal Roster):

Roster for ridge_normalized:
     playerId         playerName position
56     201572        Brook Lopez   Center
146   1630217       Desmond Bane    Guard
151   1628415      Dillon Brooks    Guard
156   1628378   Donovan Mitchell    Guard
163   1629130    Duncan Robinson  Forward
257   1628991  Jaren Jackson Jr.  Forward
374   1630568         Luka Garza   Center
409    201144        Mike Conley    Guard
485    200752           Rudy Gay    Guard
580   1630533    Ziaire Williams  Forward
Solve value for ridge_normalized: 1200.000000000001


Roster for elastic_net_normalized:
     playerId       playerName position
56     201572      Brook Lopez   Center
144    201565     Derrick Rose    Guard
146   1630217     Desmond Bane    Guard
163   1629130  Duncan Robinson  Forward
178    201569      Eric Gordon    Guard
183   1630596      Evan Mobley   Center
228   1629630        Ja Morant    Guard
369      2544     LeBron Jame

In [21]:
salaries = pd.read_csv('../data/salaries.csv')
salaries.drop(columns=['Player'], inplace=True)
salaries.replace(np.nan, 10000000000000, inplace=True)

performance_metrics = [
    'ridge_normalized',
    'elastic_net_normalized',
    'lasso_normalized',
    'graeme+'
]

salaries['graeme+'] = (salaries['elastic_net_normalized'] + salaries['ridge_normalized']) / 2

salaries.drop(columns=['2020/21', '2021/22', '2022/23'], inplace=True)
salaries['2023/24'] = salaries['2023/24'].astype(int)

salary_caps = {
    "Offseason Targets with 30 Mil Cap Space - Detroit, Utah": 30_000_000,
    "Offseason Targets with 20 Mil Cap Space - Orlando, San Antonio": 20_000_000
}

for team_type, salary_cap in salary_caps.items():
    rosters = {}
    solve_values = {}

    for metric in performance_metrics:
        performance_metric = salaries[metric]

        n = len(salaries)
        x = cp.Variable(n, boolean=True)

        obj = cp.Maximize(cp.sum(cp.multiply(performance_metric, x)))

        constraints = [
            cp.sum(x) == 4,
            cp.sum(cp.multiply(x, (salaries['position'] == 'Guard').astype(int).values)) >= 1,
            cp.sum(cp.multiply(x, (salaries['position'] == 'Center').astype(int).values)) >= 1,
            cp.sum(cp.multiply(x, (salaries['position'] == 'Forward').astype(int).values)) >= 1,
            cp.sum(cp.multiply(x, salaries['2023/24'].values)) <= salary_cap
        ]

        unique_player_ids = salaries['playerId'].unique()
        for player_id in unique_player_ids:
            indices = (salaries['playerId'] == player_id).astype(int).values
            constraints.append(cp.sum(cp.multiply(x, indices)) <= 1)

        problem = cp.Problem(obj, constraints)

        solve_value = problem.solve()
        solve_values[metric] = solve_value

        roster = np.where(x.value >= 0.5)[0]
        roster_names = salaries.iloc[roster][['playerId', 'playerName', 'position']]
        rosters[metric] = roster_names

    print(f"Optimal roster construction at {salary_cap} ({team_type}):\n")
    for metric, roster in rosters.items():
        print(f"Roster for {metric}:")
        print(roster)
        print(f"Solve value for {metric}: {solve_values[metric]}")
        print("\n")

Optimal roster construction at 30000000 (Offseason Targets with 30 Mil Cap Space - Detroit, Utah):

Roster for ridge_normalized:
     playerId       playerName position
146   1630217     Desmond Bane    Guard
163   1629130  Duncan Robinson  Forward
374   1630568       Luka Garza   Center
580   1630533  Ziaire Williams  Forward
Solve value for ridge_normalized: 476.0


Roster for elastic_net_normalized:
     playerId       playerName position
144    201565     Derrick Rose    Guard
183   1630596      Evan Mobley   Center
485    200752         Rudy Gay    Guard
580   1630533  Ziaire Williams  Forward
Solve value for elastic_net_normalized: 468.0


Roster for lasso_normalized:
     playerId      playerName position
341    201567      Kevin Love  Forward
423   1629650     Moses Brown   Center
496   1630567  Scottie Barnes    Guard
566   1631117  Walker Kessler   Center
Solve value for lasso_normalized: 508.0000000000001


Roster for graeme+:
     playerId       playerName position
144    2