<a href="https://colab.research.google.com/github/dejiandrew/nba-award-predictor/blob/deji/lightgbm_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
!pip install wget
import pandas as pd
import wget
import duckdb
import lightgbm as lgb



In [37]:
wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/features-overall-weekly.csv')
features_overall_weekly_df = pd.read_csv('features-overall-weekly.csv')

In [38]:
features_overall_weekly_df.columns

Index(['player_id', 'full_name', 'team', 'season', 'week', 'week_start',
       'conference', 'pow_conference', 'games_played_this_week', 'numMinutes',
       'points', 'assists', 'blocks', 'steals', 'reboundsTotal',
       'reboundsDefensive', 'reboundsOffensive', 'fieldGoalsAttempted',
       'fieldGoalsMade', 'threePointersAttempted', 'threePointersMade',
       'freeThrowsAttempted', 'freeThrowsMade', 'turnovers', 'foulsPersonal',
       'plusMinusPoints', 'wins_this_week', 'wins_vs_team_with_all_nba_player',
       'is_win_vs_over_500', 'opponent_has_all_nba', 'avg_opp_score',
       'avg_opp_winrate_prior', 'avg_opp_wins_prior', 'avg_opp_losses_prior',
       'away_games_prior', 'away_losses_prior', 'away_win_streak_prior',
       'away_wins_prior', 'home_games_prior', 'home_losses_prior',
       'home_win_streak_prior', 'home_wins_prior', 'losses_prior',
       'wins_vs_over_500_prior', 'won_player_of_the_week',
       'all_star_this_season', 'mvp_this_season',
       'all_nba_f

In [39]:
# numeric_df = features_overall_weekly_df.select_dtypes(include='number')
# numeric_df

In [40]:
# View the class imbalance (99.4% positive class, 0.6% negative class)
print(features_overall_weekly_df['won_player_of_the_week'].value_counts())
print(features_overall_weekly_df['won_player_of_the_week'].value_counts(normalize=True))

won_player_of_the_week
0    233104
1      1393
Name: count, dtype: int64
won_player_of_the_week
0    0.99406
1    0.00594
Name: proportion, dtype: float64


In [41]:
# Drop columns that contain the answer. Also drop mvp, all-star, all-nba because they are from the future when we try to do inference during season
leakage_cols = ['pow_player_id', 'player_of_the_week', 'pow_conference',
                'all_star_this_season', 'mvp_this_season',
                'all_nba_first_team_this_season', 'all_nba_second_team_this_season',
                'all_nba_third_team_this_season']

# Set aside identifier columns
id_cols = ['player_id', 'full_name', 'team', 'season', 'week', 'week_start', 'conference']

features_overall_weekly_df_clean = features_overall_weekly_df.drop(columns=leakage_cols + id_cols)
df_encoded = pd.get_dummies(features_overall_weekly_df_clean, drop_first=True)
df_encoded

Unnamed: 0,games_played_this_week,numMinutes,points,assists,blocks,steals,reboundsTotal,reboundsDefensive,reboundsOffensive,fieldGoalsAttempted,...,breakout_score,league_pts_mean,league_pts_std,league_ast_mean,league_ast_std,league_pm_mean,league_pm_std,z_s_pts,z_s_ast,z_s_pm
0,3,101.00,61.0,3.0,5.0,1.0,35.0,29.0,6.0,43.0,...,-0.623703,,,,,,,0.000000,0.000000,0.000000
1,2,82.00,54.0,14.0,6.0,0.0,19.0,16.0,3.0,41.0,...,-1.218188,,,,,,,0.000000,0.000000,0.000000
2,1,11.00,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,...,-1.365436,,,,,,,0.000000,0.000000,0.000000
3,2,64.00,27.0,7.0,2.0,4.0,12.0,0.0,0.0,21.0,...,-0.576144,,,,,,,0.000000,0.000000,0.000000
4,4,41.00,15.0,2.0,2.0,0.0,5.0,4.0,1.0,16.0,...,0.643651,,,,,,,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234492,1,4.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.008380,37.443734,37.846821,8.712458,10.37058,-0.670054,30.233355,-0.989349,-0.840113,-0.275522
234493,3,68.75,30.0,1.0,0.0,4.0,14.0,9.0,5.0,23.0,...,-0.045008,37.443734,37.846821,8.712458,10.37058,-0.670054,30.233355,-0.196681,-0.743686,0.948292
234494,3,128.70,44.0,12.0,2.0,2.0,10.0,8.0,2.0,26.0,...,0.580753,37.443734,37.846821,8.712458,10.37058,-0.670054,30.233355,0.173232,0.317007,-1.102423
234495,1,5.48,3.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,-0.385517,37.443734,37.846821,8.712458,10.37058,-0.670054,30.233355,-0.910083,-0.840113,0.154467


In [42]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop(columns=['won_player_of_the_week'])
y = df_encoded['won_player_of_the_week']
identifiers = features_overall_weekly_df[id_cols]

X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(
    X, y, identifiers,
    test_size=0.2,
    stratify=y,          # Maintain the same class imbalance ratio within both y_train and y_test
    random_state=42
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Positive class in y_train: {100 * round(y_train.mean(),4)}%")
print(f"Positive class in y_test: {100 * round(y_test.mean(),4)}%")

Training samples: 187597
Test samples: 46900
Positive class in y_train: 0.59%
Positive class in y_test: 0.59%


In [43]:
# from sklearn.model_selection import GroupShuffleSplit

# X = df_encoded.drop(columns=['won_player_of_the_week'])
# y = df_encoded['won_player_of_the_week']
# identifiers = features_overall_weekly_df[id_cols]

# # Create groups for each season-week combination
# groups = (features_overall_weekly_df['season'].astype(str) + '_' +
#           features_overall_weekly_df['week'].astype(str))

# # Use GroupShuffleSplit instead of train_test_split
# gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# train_idx, test_idx = next(gss.split(X, y, groups=groups))

# # Split using the indices
# X_train = X.iloc[train_idx]
# X_test = X.iloc[test_idx]
# y_train = y.iloc[train_idx]
# y_test = y.iloc[test_idx]
# id_train = identifiers.iloc[train_idx]
# id_test = identifiers.iloc[test_idx]

# print(f"Training samples: {len(X_train)}")
# print(f"Test samples: {len(X_test)}")
# print(f"Positive class in y_train: {100 * round(y_train.mean(),4)}%")
# print(f"Positive class in y_test: {100 * round(y_test.mean(),4)}%")

In [44]:
!pip install lightgbm



In [45]:

# Train LightGBM model
lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    num_leaves=31,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# Train LightGBM model
lgb_model.fit(X_train, y_train)

# Get predictions
y_pred_proba = lgb_model.predict_proba(X_test)[:, 1]

In [46]:
evaluation_df = id_test.copy()
evaluation_df['actual_won'] = y_test.values
evaluation_df['pred_proba'] = y_pred_proba
evaluation_df

Unnamed: 0,player_id,full_name,team,season,week,week_start,conference,actual_won,pred_proba
158437,203561,Brandon Davies,Nets,2014,50,2014-12-08,East,0,0.000282
186539,1626170,Jerian Grant,Magic,2018,45,2018-11-05,East,0,0.000433
229946,203110,Draymond Green,Warriors,2024,15,2025-04-07,West,0,0.000488
158419,203492,Ray McCallum,Kings,2014,50,2014-12-08,West,0,0.000088
115867,200770,Jordan Farmar,Lakers,2008,49,2008-12-01,West,0,0.000445
...,...,...,...,...,...,...,...,...,...
120531,201954,Darren Collison,Hornets,2009,11,2010-03-15,East,0,0.000885
116364,1630,Mikki Moore,Kings,2008,51,2008-12-15,West,0,0.000037
162720,201581,JJ Hickson,Wizards,2015,12,2016-03-21,East,0,0.000150
96423,200749,Shelden Williams,Hawks,2006,5,2007-01-29,East,0,0.000324


In [47]:
actual_winners_df = evaluation_df[evaluation_df['actual_won'] == 1]
actual_winners_df.head()

Unnamed: 0,player_id,full_name,team,season,week,week_start,conference,actual_won,pred_proba
225597,1629636,Darius Garland,Cavaliers,2024,2,2025-01-06,East,1,0.358548
123156,101114,Deron Williams,Jazz,2009,50,2009-12-07,West,1,0.746777
186576,1627783,Pascal Siakam,Raptors,2018,45,2018-11-05,East,1,0.554024
11797,76504,Adrian Dantley,Jazz,1983,50,1983-12-12,West,1,0.472757
145504,200794,Paul Millsap,Hawks,2013,4,2014-01-20,East,1,0.591213


In [48]:
# How many season,conference,week combinations were there in actual_winners_df? This is our denominator for accuracy.

query = """
SELECT COUNT(DISTINCT(season,conference,week)) AS total_season_conference_weeks_we_couldve_predicted
FROM actual_winners_df
"""

total_season_conference_weeks_we_couldve_predicted = duckdb.query(query).df()['total_season_conference_weeks_we_couldve_predicted'].sum()
total_season_conference_weeks_we_couldve_predicted

np.int64(279)

In [49]:
# How many of the actual winners did we predict?
# Aka how many of the actual winners did our model assign a top-k probability for that [season,conference,week] combo?
for k in [1, 3, 5, 10]:
  query = f"""

  WITH Highest_Probabilities AS (
  SELECT *
  ,RANK() OVER(PARTITION BY season, conference, week ORDER BY pred_proba DESC) AS rnk
  FROM evaluation_df
  )
  ,
  Actual_Winners AS (
    SELECT * FROM evaluation_df
    WHERE actual_won = 1
  )


  SELECT * FROM Highest_Probabilities
  JOIN Actual_Winners
  ON
  Highest_Probabilities.season = Actual_Winners.season
  AND
  Highest_Probabilities.week = Actual_Winners.week
  AND
  Highest_Probabilities.conference = Actual_Winners.conference
  WHERE rnk <= {k}
  AND Highest_Probabilities.pred_proba = Actual_Winners.pred_proba
  """
  duckdb.query(query).df()
  correct_predictions_df = duckdb.query(query).df()
  total_correct_predictions = correct_predictions_df["actual_won"].sum()
  print(f"Top-{k} accuracy:", round(100*total_correct_predictions/total_season_conference_weeks_we_couldve_predicted,2),"%")

Top-1 accuracy: 84.23 %
Top-3 accuracy: 97.49 %
Top-5 accuracy: 99.28 %
Top-10 accuracy: 99.64 %


In [50]:
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': lgb_model.feature_importances_
}).sort_values('importance', ascending=False)

feature_importance['importance_pct'] = (feature_importance['importance'] / feature_importance['importance'].sum() * 100)

print("LightGBM Top 20 features:")
print(feature_importance.head(20)[['feature','importance_pct']])

LightGBM Top 20 features:
                 feature  importance_pct
2                 points        6.933333
36              team_pts        6.800000
41  fieldGoalsPercentage        4.800000
18        wins_this_week        3.966667
6          reboundsTotal        3.033333
3                assists        2.866667
44    points_mean_season        2.833333
7      reboundsDefensive        2.533333
10        fieldGoalsMade        2.533333
37              team_ast        2.466667
1             numMinutes        2.366667
50                 z_pts        2.200000
61               z_s_ast        2.133333
60               z_s_pts        2.033333
39              team_stl        2.033333
21  opponent_has_all_nba        2.000000
14        freeThrowsMade        1.966667
53        breakout_score        1.966667
46   assists_mean_season        1.933333
4                 blocks        1.900000


In [51]:
import joblib

# Save model
joblib.dump(lgb_model, 'lightgbm_potw_model.pkl')

# Save feature names
joblib.dump(X_train.columns.tolist(), 'lightgbm_model_features.pkl')

['lightgbm_model_features.pkl']

**The below steps will use the above model for inference.**

In [52]:
def get_potw_predictions(week_start: str):
  """
  get_potw_predictions() does inference to predict POW.
  week_start must be a Monday in 'yyyy-mm-dd' format
  """
  import duckdb
  import joblib
  import wget

  # features-overall-weekly-for-inference.csv is calculated in a separate script that runs within the daily production pipeline
  # features-overall-weekly-for-inference.csv is needed to make predictions while the current week is in-progress (i.e. no POW announced yet)
  wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/features-overall-weekly-for-inference.csv')

  # Read in model & its feature names
  model = joblib.load('lightgbm_potw_model.pkl')
  feature_names = joblib.load('lightgbm_model_features.pkl')

  # Read in player-week stats
  overall_weekly_agg_df = pd.read_csv('features-overall-weekly-for-inference.csv')

  # Query all players who played for the input week
  query = f"""
  SELECT * FROM overall_weekly_agg_df
  WHERE week_start = '{week_start}'
  """
  df = duckdb.query(query).df()

  #Keep player info separate
  player_info = df[['player_id', 'full_name', 'conference', 'season', 'week']]
  X = df[feature_names]
  probabilities = model.predict_proba(X)[:, 1]

  # Attach probabilities to the player identities
  results = player_info.copy()
  results['probability'] = probabilities

  # Now partition by season, week, and conference and return top 5 for each conference
  results
  query = f"""
  WITH CTE AS (SELECT *,
  RANK() OVER(PARTITION BY season, week, conference ORDER BY probability DESC) AS rank
  FROM results
  )

  ,CTE2 AS (
  SELECT * FROM CTE WHERE conference = 'East' AND rank <= 5
  UNION ALL
  SELECT * FROM CTE WHERE conference = 'West' AND rank <= 5

  )
  SELECT
  '{week_start}' AS week_start
  ,rank
  ,full_name as name
  ,conference
  ,probability
  FROM CTE2
  """

  top_five_per_conference = duckdb.query(query).df()

  return top_five_per_conference

get_potw_predictions('2025-12-01')

Unnamed: 0,week_start,rank,name,conference,probability
0,2025-12-01,1,Scottie Barnes,East,0.835398
1,2025-12-01,2,Jaylen Brown,East,0.805545
2,2025-12-01,3,Donovan Mitchell,East,0.72383
3,2025-12-01,4,Jalen Johnson,East,0.684765
4,2025-12-01,5,Tyrese Maxey,East,0.662581
5,2025-12-01,1,De'Aaron Fox,West,0.818891
6,2025-12-01,2,Alperen Sengun,West,0.816536
7,2025-12-01,3,Shai Gilgeous-Alexander,West,0.755111
8,2025-12-01,4,Deni Avdija,West,0.66943
9,2025-12-01,5,Anthony Edwards,West,0.614331
