In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [92]:
# Load the cleaned per-player season stats produced by the earlier cleaning step.
df = pd.read_csv(filepath_or_buffer='../data/cleaned_nba_player_data.csv')
df

Unnamed: 0,Season,Player,MP,PTS,AST,TRB,STL,BLK,TS%,PER,WS,BPM,VORP,USG%,W/L%,Share
0,2003,Tracy McGrady,39.4,32.1,5.5,6.5,1.7,0.8,0.564,30.3,16.1,10.5,9.3,35.2,0.512,0.359
1,2003,Kobe Bryant,41.5,30.0,5.9,6.9,2.2,0.8,0.550,26.2,14.9,7.1,7.7,32.9,0.610,0.417
2,2003,Allen Iverson,42.5,27.6,5.5,4.2,2.7,0.2,0.500,21.2,9.2,3.5,4.8,32.9,0.585,0.070
3,2003,Shaquille O'Neal,37.8,27.5,3.1,11.1,0.6,2.4,0.602,29.5,13.2,6.5,5.5,30.2,0.610,0.106
4,2003,Paul Pierce,39.2,25.9,4.4,7.3,1.8,0.8,0.532,22.7,10.1,4.9,5.4,33.2,0.537,0.001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9264,2025,John Konchar,12.1,2.4,0.9,3.3,0.7,0.3,0.592,12.1,1.5,1.7,0.5,8.0,0.585,0.000
9265,2025,Miles Norris,11.7,2.3,0.0,3.0,0.7,0.3,0.354,8.5,0.0,-1.9,0.0,12.4,0.744,0.000
9266,2025,Dwight Powell,10.0,2.1,1.0,2.1,0.3,0.4,0.713,12.9,1.7,0.0,0.3,7.6,0.476,0.000
9267,2025,Jericho Sims,11.9,1.8,0.6,3.7,0.2,0.4,0.639,10.0,1.3,-3.0,-0.2,7.6,0.500,0.000


VORP × Win Percentage should capture players who provide high value on winning teams, which is a key indicator of MVP vote share.

In [112]:
# Work on a copy so the raw frame `df` displayed above stays untouched.
df_features = df.copy()
# New feature - VORP * Win Percentage
df_features['VORP_W/L'] = df_features['VORP'] * df_features['W/L%']
# Reorder columns by name (not by position) so the new feature sits directly
# before the target column 'Share', which stays last. Selecting by name keeps
# the reorder correct even if the input CSV gains or loses a column.
trailing_columns = ['VORP_W/L', 'Share']
leading_columns = [col for col in df_features.columns if col not in trailing_columns]
df_features = df_features[leading_columns + trailing_columns]
df_features

Unnamed: 0,Season,Player,MP,PTS,AST,TRB,STL,BLK,TS%,PER,WS,BPM,VORP,USG%,W/L%,VORP_W/L,Share
0,2003,Tracy McGrady,39.4,32.1,5.5,6.5,1.7,0.8,0.564,30.3,16.1,10.5,9.3,35.2,0.512,4.7616,0.359
1,2003,Kobe Bryant,41.5,30.0,5.9,6.9,2.2,0.8,0.550,26.2,14.9,7.1,7.7,32.9,0.610,4.6970,0.417
2,2003,Allen Iverson,42.5,27.6,5.5,4.2,2.7,0.2,0.500,21.2,9.2,3.5,4.8,32.9,0.585,2.8080,0.070
3,2003,Shaquille O'Neal,37.8,27.5,3.1,11.1,0.6,2.4,0.602,29.5,13.2,6.5,5.5,30.2,0.610,3.3550,0.106
4,2003,Paul Pierce,39.2,25.9,4.4,7.3,1.8,0.8,0.532,22.7,10.1,4.9,5.4,33.2,0.537,2.8998,0.001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9264,2025,John Konchar,12.1,2.4,0.9,3.3,0.7,0.3,0.592,12.1,1.5,1.7,0.5,8.0,0.585,0.2925,0.000
9265,2025,Miles Norris,11.7,2.3,0.0,3.0,0.7,0.3,0.354,8.5,0.0,-1.9,0.0,12.4,0.744,0.0000,0.000
9266,2025,Dwight Powell,10.0,2.1,1.0,2.1,0.3,0.4,0.713,12.9,1.7,0.0,0.3,7.6,0.476,0.1428,0.000
9267,2025,Jericho Sims,11.9,1.8,0.6,3.7,0.2,0.4,0.639,10.0,1.3,-3.0,-0.2,7.6,0.500,-0.1000,0.000


- Stats will be standardized relative only to other players in the same season
- This should eliminate era bias; for instance, players in the 90s scored fewer points on average
- This also reduces data leakage: scaling globally would mean stats from the 2021 season affect the scaled stats of the 2016 season
  

In [113]:
# Split the feature table into one DataFrame per season, keyed by the season value.
# Iterating a groupby yields (key, group) pairs, which dict() consumes directly.
season_dict = dict(iter(df_features.groupby('Season')))
season_dict[2024]

Unnamed: 0,Season,Player,MP,PTS,AST,TRB,STL,BLK,TS%,PER,WS,BPM,VORP,USG%,W/L%,VORP_W/L,Share
8364,2024,Joel Embiid,33.6,34.7,5.6,11.0,1.2,1.7,0.644,34.1,7.5,11.6,4.5,39.6,0.573,2.5785,0.000
8365,2024,Luka Dončić,37.5,33.9,9.8,9.2,1.4,0.5,0.617,28.1,12.0,9.9,8.0,36.0,0.610,4.8800,0.572
8366,2024,Giannis Antetokounmpo,35.2,30.4,6.5,11.5,1.2,1.1,0.649,29.9,13.2,9.0,7.2,33.0,0.598,4.3056,0.194
8367,2024,Shai Gilgeous-Alexander,34.0,30.1,6.2,5.5,2.0,0.9,0.636,29.3,14.6,9.0,7.1,32.8,0.695,4.9345,0.646
8368,2024,Jalen Brunson,35.4,28.7,6.7,3.6,0.9,0.2,0.592,23.4,11.2,5.8,5.4,32.5,0.610,3.2940,0.143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8804,2024,Jason Preston,10.1,1.7,2.3,2.4,0.3,0.1,0.316,10.5,0.0,-2.6,0.0,12.4,0.378,0.0000,0.000
8805,2024,Ibou Badji,10.3,1.5,0.6,2.3,0.1,0.9,0.623,8.1,0.2,-5.8,-0.2,8.0,0.256,-0.0512,0.000
8806,2024,Dariq Whitehead,12.0,1.5,1.5,2.0,0.0,0.5,0.255,4.9,0.0,-7.9,0.0,10.6,0.390,0.0000,0.000
8807,2024,Jack White,16.0,1.5,0.3,3.0,1.0,0.3,0.188,1.1,-0.1,-10.3,-0.1,12.1,0.329,-0.0329,0.000


In [115]:
# Numeric columns standardized within each season.
# 'Season', 'Player', 'W/L%' and the target 'Share' are deliberately left out.
scaled_columns = [
    'MP', 'PTS', 'AST', 'TRB', 'STL', 'BLK',  # per-game box-score stats
    'TS%', 'PER', 'WS', 'BPM', 'VORP', 'USG%',  # advanced metrics
    'VORP_W/L',  # engineered feature
]

In [116]:
# Standardize each season independently: every column in `scaled_columns` is
# rescaled to zero mean / unit variance using only that season's players.
# Iterate over the seasons actually present in `season_dict` (instead of a
# hard-coded year range) so a missing season cannot raise KeyError and a newly
# added season is not silently skipped.
for season in list(season_dict):
    # Explicit copy: the groupby pieces may be tied to `df_features`, and
    # assigning into them directly can trigger pandas' SettingWithCopyWarning.
    df_season = season_dict[season].copy()
    # A fresh scaler per season keeps one season's statistics from leaking into another.
    scaler = StandardScaler()
    df_season[scaled_columns] = scaler.fit_transform(df_season[scaled_columns])
    season_dict[season] = df_season

In [117]:
# Collect the per-season frames in chronological order, then stack them
# row-wise into a single table with a fresh 0..n-1 index.
df_list = [season_dict[year] for year in range(2003, 2026)]
df_overall = pd.concat(df_list, axis=0).reset_index(drop=True)
df_overall

Unnamed: 0,Season,Player,MP,PTS,AST,TRB,STL,BLK,TS%,PER,WS,BPM,VORP,USG%,W/L%,VORP_W/L,Share
0,2003,Tracy McGrady,1.752518,3.906772,1.976588,0.989477,2.029217,0.547788,1.094251,3.679828,3.819312,3.533756,5.219498,3.400096,0.512,4.542465,0.359
1,2003,Kobe Bryant,1.989500,3.547491,2.208540,1.160492,3.127797,0.547788,0.822543,2.769521,3.455285,2.480413,4.230047,2.923437,0.610,4.473849,0.417
2,2003,Allen Iverson,2.102348,3.136883,1.976588,0.006143,4.226377,-0.554120,-0.147845,1.659390,1.726161,1.365108,2.436666,2.923437,0.585,2.467411,0.070
3,2003,Shaquille O'Neal,1.571961,3.119774,0.584878,2.956146,-0.387660,3.486209,1.831746,3.502207,2.939582,2.294529,2.869551,2.363881,0.610,3.048417,0.106
4,2003,Paul Pierce,1.729949,2.846035,1.338721,1.331507,2.248933,0.547788,0.473203,1.992429,1.999180,1.798838,2.807710,2.985610,0.537,2.564918,0.001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9264,2025,John Konchar,-1.395727,-1.304718,-0.850703,-0.383064,-0.204874,-0.408694,0.377048,-0.469053,-0.463602,0.790282,-0.127939,-1.964855,0.585,-0.129821,0.000
9265,2025,Miles Norris,-1.448642,-1.320769,-1.343209,-0.513075,-0.204874,-0.408694,-3.342469,-1.300337,-1.038213,-0.378206,-0.515632,-1.163180,0.744,-0.497531,0.000
9266,2025,Dwight Powell,-1.673532,-1.352871,-0.795980,-0.903111,-1.249105,-0.168899,2.268063,-0.284323,-0.386988,0.238496,-0.283016,-2.037734,0.476,-0.318013,0.000
9267,2025,Jericho Sims,-1.422184,-1.401025,-1.014872,-0.209715,-1.510163,-0.168899,1.111574,-0.953968,-0.540217,-0.735245,-0.670709,-2.037734,0.500,-0.623244,0.000


In [118]:
# Persist the enriched, per-season-standardized table; the row index is not written.
df_overall.to_csv(path_or_buf='../data/enriched_nba_player_data.csv', index=False)