<a href="https://colab.research.google.com/github/dejiandrew/nba-award-predictor/blob/deji/lightgbm_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [516]:
!rm *.csv
!rm *.pkl

In [517]:
!pip install wget
import pandas as pd
import wget
import duckdb
import lightgbm as lgb
import joblib



In [518]:
wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/features-overall-weekly.csv')
features_overall_weekly_df = pd.read_csv('features-overall-weekly.csv')

In [519]:
# Remove unwanted features
features_overall_weekly_df = features_overall_weekly_df.drop(columns=['league_pts_mean', 'league_pts_std',
       'league_ast_mean', 'league_ast_std', 'league_pm_mean', 'league_pm_std',
       'z_s_pts', 'z_s_ast', 'z_s_pm'])
features_overall_weekly_df

Unnamed: 0,player_id,full_name,team,season,week,week_start,conference,pow_conference,games_played_this_week,numMinutes,...,points_mean_season,points_std_season,assists_mean_season,assists_std_season,plusMinusPoints_mean_season,plusMinusPoints_std_season,z_pts,z_ast,z_pm,breakout_score
0,305,Robert Parish,Warriors,1979,1,1979-12-31,West,West,3,101.00,...,63.00,7.874008,8.60,3.382307,0.0,0.000000,-0.254000,-1.655675,0.000000,-0.623703
1,76003,Kareem Abdul-Jabbar,Lakers,1979,1,1979-12-31,West,West,2,82.00,...,105.00,25.674890,19.40,7.200000,0.0,0.000000,-1.986377,-0.750000,0.000000,-1.218188
2,76005,Tom Abernethy,Warriors,1979,1,1979-12-31,West,West,1,11.00,...,22.00,13.190906,4.75,2.680951,0.0,0.000000,-1.667816,-1.771759,0.000000,-1.365436
3,76011,Alvan Adams,Suns,1979,1,1979-12-31,West,West,2,64.00,...,44.20,27.095387,13.80,7.884161,0.0,0.000000,-0.634794,-0.862489,0.000000,-0.576144
4,76085,James Bailey,SuperSonics,1979,1,1979-12-31,West,West,4,41.00,...,18.00,6.745369,0.75,0.433013,0.0,0.000000,-0.444750,2.886751,0.000000,0.643651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234887,1642959,Chris Youngblood,Thunder,2025,49,2025-12-01,West,West,2,16.62,...,4.75,4.205651,1.00,0.707107,-1.5,10.874282,0.297219,0.000000,-0.965581,-0.044507
234888,1642962,Drake Powell,Nets,2025,49,2025-12-01,East,East,3,114.24,...,32.40,14.554724,9.60,4.630335,-24.0,26.773121,0.247342,1.814124,1.568738,0.981656
234889,1642964,Brooks Barnhizer,Thunder,2025,49,2025-12-01,West,West,2,18.74,...,4.75,2.772634,1.25,0.433013,-7.0,11.379807,1.172171,1.732051,-1.230249,0.859651
234890,1643007,Taelon Peter,Pacers,2025,49,2025-12-01,East,East,3,8.66,...,4.00,2.756810,2.60,2.497999,-3.8,8.541663,-1.088214,-1.040833,0.327805,-0.790796


In [520]:
# Only train on 2021+ seasons to account for modern scoring patterns.
#features_overall_weekly_df = features_overall_weekly_df[features_overall_weekly_df['season'] >= 2021]
#features_overall_weekly_df = features_overall_weekly_df[features_overall_weekly_df['week_start'] != '2025-12-08']
#features_overall_weekly_df = features_overall_weekly_df[features_overall_weekly_df['week_start'] != '2025-12-01']
# features_overall_weekly_df = features_overall_weekly_df[features_overall_weekly_df['week_start'] != '2025-11-24']
# features_overall_weekly_df = features_overall_weekly_df[features_overall_weekly_df['week_start'] != '2025-11-17']
# features_overall_weekly_df = features_overall_weekly_df[features_overall_weekly_df['week_start'] != '2025-11-10']

In [521]:
features_overall_weekly_df.columns

Index(['player_id', 'full_name', 'team', 'season', 'week', 'week_start',
       'conference', 'pow_conference', 'games_played_this_week', 'numMinutes',
       'points', 'assists', 'blocks', 'steals', 'reboundsTotal',
       'reboundsDefensive', 'reboundsOffensive', 'fieldGoalsAttempted',
       'fieldGoalsMade', 'threePointersAttempted', 'threePointersMade',
       'freeThrowsAttempted', 'freeThrowsMade', 'turnovers', 'foulsPersonal',
       'plusMinusPoints', 'wins_this_week', 'wins_vs_team_with_all_nba_player',
       'is_win_vs_over_500', 'opponent_has_all_nba', 'avg_opp_score',
       'avg_opp_winrate_prior', 'avg_opp_wins_prior', 'avg_opp_losses_prior',
       'away_games_prior', 'away_losses_prior', 'away_win_streak_prior',
       'away_wins_prior', 'home_games_prior', 'home_losses_prior',
       'home_win_streak_prior', 'home_wins_prior', 'losses_prior',
       'wins_vs_over_500_prior', 'won_player_of_the_week',
       'all_star_this_season', 'mvp_this_season',
       'all_nba_f

In [522]:
# View the class imbalance (99.4% positive class, 0.6% negative class)
print(features_overall_weekly_df['won_player_of_the_week'].value_counts())
print(features_overall_weekly_df['won_player_of_the_week'].value_counts(normalize=True))

won_player_of_the_week
0    233497
1      1395
Name: count, dtype: int64
won_player_of_the_week
0    0.994061
1    0.005939
Name: proportion, dtype: float64


In [523]:
# Drop columns that contain the answer. Also drop mvp, all-star, all-nba because they are from the future when we try to do inference during season
leakage_cols = ['pow_player_id', 'player_of_the_week', 'pow_conference',
                'all_star_this_season', 'mvp_this_season',
                'all_nba_first_team_this_season', 'all_nba_second_team_this_season',
                'all_nba_third_team_this_season']

# Set aside identifier columns
id_cols = ['player_id', 'full_name', 'team', 'season', 'week', 'week_start', 'conference']

features_overall_weekly_df_clean = features_overall_weekly_df.drop(columns=leakage_cols + id_cols)
df_encoded = pd.get_dummies(features_overall_weekly_df_clean, drop_first=True)
df_encoded

Unnamed: 0,games_played_this_week,numMinutes,points,assists,blocks,steals,reboundsTotal,reboundsDefensive,reboundsOffensive,fieldGoalsAttempted,...,points_mean_season,points_std_season,assists_mean_season,assists_std_season,plusMinusPoints_mean_season,plusMinusPoints_std_season,z_pts,z_ast,z_pm,breakout_score
0,3,101.00,61.0,3.0,5.0,1.0,35.0,29.0,6.0,43.0,...,63.00,7.874008,8.60,3.382307,0.0,0.000000,-0.254000,-1.655675,0.000000,-0.623703
1,2,82.00,54.0,14.0,6.0,0.0,19.0,16.0,3.0,41.0,...,105.00,25.674890,19.40,7.200000,0.0,0.000000,-1.986377,-0.750000,0.000000,-1.218188
2,1,11.00,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,...,22.00,13.190906,4.75,2.680951,0.0,0.000000,-1.667816,-1.771759,0.000000,-1.365436
3,2,64.00,27.0,7.0,2.0,4.0,12.0,0.0,0.0,21.0,...,44.20,27.095387,13.80,7.884161,0.0,0.000000,-0.634794,-0.862489,0.000000,-0.576144
4,4,41.00,15.0,2.0,2.0,0.0,5.0,4.0,1.0,16.0,...,18.00,6.745369,0.75,0.433013,0.0,0.000000,-0.444750,2.886751,0.000000,0.643651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234887,2,16.62,6.0,1.0,0.0,0.0,7.0,6.0,1.0,5.0,...,4.75,4.205651,1.00,0.707107,-1.5,10.874282,0.297219,0.000000,-0.965581,-0.044507
234888,3,114.24,36.0,18.0,0.0,6.0,6.0,6.0,0.0,34.0,...,32.40,14.554724,9.60,4.630335,-24.0,26.773121,0.247342,1.814124,1.568738,0.981656
234889,2,18.74,8.0,2.0,0.0,1.0,2.0,2.0,0.0,6.0,...,4.75,2.772634,1.25,0.433013,-7.0,11.379807,1.172171,1.732051,-1.230249,0.859651
234890,3,8.66,1.0,0.0,0.0,0.0,1.0,1.0,0.0,3.0,...,4.00,2.756810,2.60,2.497999,-3.8,8.541663,-1.088214,-1.040833,0.327805,-0.790796


In [524]:
from sklearn.model_selection import GroupShuffleSplit

X = df_encoded.drop(columns=['won_player_of_the_week'])
y = df_encoded['won_player_of_the_week']
identifiers = features_overall_weekly_df[id_cols]

# Create groups for each season-week-conference combination
groups = (features_overall_weekly_df['season'].astype(str) + '_' +
          features_overall_weekly_df['week'].astype(str) + '_' +
          features_overall_weekly_df['conference'].astype(str))

# Use GroupShuffleSplit instead of train_test_split
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

# Split using the indices
X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]
id_train = identifiers.iloc[train_idx]
id_test = identifiers.iloc[test_idx]

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Positive class in y_train: {100 * round(y_train.mean(),4)}%")
print(f"Positive class in y_test: {100 * round(y_test.mean(),4)}%")

Training samples: 187735
Test samples: 47157
Positive class in y_train: 0.59%
Positive class in y_test: 0.59%


In [525]:
!pip install lightgbm



In [526]:

# Train LightGBM model
lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    num_leaves=31,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# Train LightGBM model
lgb_model.fit(X_train, y_train)

# Get predictions
y_pred_proba = lgb_model.predict_proba(X_test)[:, 1]

In [527]:
evaluation_df = id_test.copy()
evaluation_df['actual_won'] = y_test.values
evaluation_df['pred_proba'] = y_pred_proba
evaluation_df

Unnamed: 0,player_id,full_name,team,season,week,week_start,conference,actual_won,pred_proba
1946,305,Robert Parish,Warriors,1979,51,1979-12-17,West,0,0.398226
1947,1453,Walter Davis,Suns,1979,51,1979-12-17,West,0,0.372005
1948,76003,Kareem Abdul-Jabbar,Lakers,1979,51,1979-12-17,West,0,0.915900
1949,76011,Alvan Adams,Suns,1979,51,1979-12-17,West,0,0.123803
1950,76085,James Bailey,SuperSonics,1979,51,1979-12-17,West,0,0.000748
...,...,...,...,...,...,...,...,...,...
234116,1642959,Chris Youngblood,Thunder,2025,47,2025-11-17,West,0,0.000128
234117,1642962,Drake Powell,Nets,2025,47,2025-11-17,East,0,0.000311
234118,1642964,Brooks Barnhizer,Thunder,2025,47,2025-11-17,West,0,0.000132
234119,1643007,Taelon Peter,Pacers,2025,47,2025-11-17,East,0,0.000290


In [528]:
actual_winners_df = evaluation_df[evaluation_df['actual_won'] == 1]
actual_winners_df.head()

Unnamed: 0,player_id,full_name,team,season,week,week_start,conference,actual_won,pred_proba
2016,77689,Swen Nater,Clippers,1979,51,1979-12-17,West,1,0.394728
3538,77952,Micheal Ray Richardson,Knicks,1980,12,1981-03-16,East,1,0.980805
3934,305,Robert Parish,Celtics,1980,48,1980-11-24,East,1,0.846006
4639,78549,Gus Williams,SuperSonics,1981,1,1982-01-04,West,1,0.980817
4773,76091,Greg Ballard,Bullets,1981,3,1982-01-18,East,1,0.9556


In [529]:
print(f"Total samples in evaluation_df: {len(evaluation_df)}")
print(f"Actual winners: {len(actual_winners_df)}")

Total samples in evaluation_df: 47157
Actual winners: 278


In [530]:
# How many season,conference,week combinations were there in actual_winners_df? This is our denominator for accuracy.

query = """
SELECT COUNT(DISTINCT(season,conference,week)) AS total_season_conference_weeks_we_couldve_predicted
FROM actual_winners_df
"""

total_season_conference_weeks_we_couldve_predicted = duckdb.query(query).df()['total_season_conference_weeks_we_couldve_predicted'].sum()
total_season_conference_weeks_we_couldve_predicted

np.int64(274)

In [531]:
# How many of the actual winners did we predict?
# Aka how many of the actual winners did our model assign a top-k probability for that [season,conference,week] combo?
for k in [1, 2, 3, 4, 5, 10]:
  query = f"""

  WITH Highest_Probabilities AS (
  SELECT *
  ,RANK() OVER(PARTITION BY season, conference, week ORDER BY pred_proba DESC) AS rnk
  FROM evaluation_df
  )
  ,
  Actual_Winners AS (
    SELECT * FROM evaluation_df
    WHERE actual_won = 1
  )


  SELECT * FROM Highest_Probabilities
  JOIN Actual_Winners
  ON
  Highest_Probabilities.season = Actual_Winners.season
  AND
  Highest_Probabilities.week = Actual_Winners.week
  AND
  Highest_Probabilities.conference = Actual_Winners.conference
  WHERE rnk <= {k}
  AND Highest_Probabilities.pred_proba = Actual_Winners.pred_proba
  """
  duckdb.query(query).df()
  correct_predictions_df = duckdb.query(query).df()
  total_correct_predictions = correct_predictions_df["actual_won"].sum()
  print(f"Top-{k} accuracy:", round(100*total_correct_predictions/total_season_conference_weeks_we_couldve_predicted,2),"%")

Top-1 accuracy: 47.45 %
Top-2 accuracy: 72.99 %
Top-3 accuracy: 81.02 %
Top-4 accuracy: 85.04 %
Top-5 accuracy: 90.15 %
Top-10 accuracy: 97.45 %


In [532]:
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': lgb_model.feature_importances_
}).sort_values('importance', ascending=False)

feature_importance['importance_pct'] = (feature_importance['importance'] / feature_importance['importance'].sum() * 100)

print("LightGBM Top 20 features:")
print(feature_importance.head(20)[['feature','importance_pct']])

LightGBM Top 20 features:
                   feature  importance_pct
2                   points        8.900000
36                team_pts        5.800000
41    fieldGoalsPercentage        4.900000
3                  assists        4.333333
18          wins_this_week        3.800000
6            reboundsTotal        3.566667
37                team_ast        3.100000
44      points_mean_season        2.833333
17         plusMinusPoints        2.533333
53          breakout_score        2.466667
50                   z_pts        2.366667
1               numMinutes        2.366667
13     freeThrowsAttempted        2.200000
14          freeThrowsMade        2.166667
7        reboundsDefensive        2.133333
10          fieldGoalsMade        2.133333
39                team_stl        1.966667
46     assists_mean_season        1.966667
21    opponent_has_all_nba        1.966667
0   games_played_this_week        1.800000


In [533]:
# import joblib

# # Save model
# joblib.dump(lgb_model, 'lightgbm_potw_model.pkl')

# # Save feature names
# joblib.dump(X_train.columns.tolist(), 'lightgbm_model_features.pkl')

In [534]:
print("\nTraining final model on 100% of data for deployment...")
final_model = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    num_leaves=31,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

final_model.fit(X, y)  # ALL data

# 5. Save the final model
joblib.dump(final_model, 'lightgbm_potw_model.pkl')
joblib.dump(X.columns.tolist(), 'lightgbm_model_features.pkl')


Training final model on 100% of data for deployment...


['lightgbm_model_features.pkl']

In [535]:
# Feature importance from final model
final_feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': final_model.feature_importances_
}).sort_values('importance', ascending=False)

final_feature_importance['importance_pct'] = (final_feature_importance['importance'] / final_feature_importance['importance'].sum() * 100)

print("Final Model Top 20 features:")
print(final_feature_importance.head(20)[['feature','importance_pct']])

Final Model Top 20 features:
                 feature  importance_pct
2                 points        7.733333
36              team_pts        6.966667
3                assists        4.933333
41  fieldGoalsPercentage        4.800000
18        wins_this_week        4.433333
6          reboundsTotal        3.133333
37              team_ast        2.733333
1             numMinutes        2.700000
46   assists_mean_season        2.600000
17       plusMinusPoints        2.600000
7      reboundsDefensive        2.566667
10        fieldGoalsMade        2.366667
44    points_mean_season        2.366667
4                 blocks        2.200000
13   freeThrowsAttempted        2.200000
14        freeThrowsMade        2.200000
16         foulsPersonal        2.133333
39              team_stl        2.100000
50                 z_pts        2.066667
53        breakout_score        2.000000


**The below steps will use the above model for inference.**

In [536]:
def get_potw_predictions(week_start: str):
  """
  get_potw_predictions() does inference to predict POW.
  week_start must be a Monday in 'yyyy-mm-dd' format
  """
  import duckdb
  import joblib
  import wget

  # features-overall-weekly-for-inference.csv is calculated in a separate script that runs within the daily production pipeline
  # features-overall-weekly-for-inference.csv is needed to make predictions while the current week is in-progress (i.e. no POW announced yet)
  wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/features-overall-weekly-for-inference.csv')

  # Read in model & its feature names
  model = joblib.load('lightgbm_potw_model.pkl')
  feature_names = joblib.load('lightgbm_model_features.pkl')

  # Read in player-week stats
  overall_weekly_agg_df = pd.read_csv('features-overall-weekly-for-inference.csv')

  # Remove unwanted features
  overall_weekly_agg_df = overall_weekly_agg_df.drop(columns=['league_pts_mean', 'league_pts_std',
       'league_ast_mean', 'league_ast_std', 'league_pm_mean', 'league_pm_std',
       'z_s_pts', 'z_s_ast', 'z_s_pm'])

  # Query all players who played for the input week
  query = f"""
  SELECT * FROM overall_weekly_agg_df
  WHERE week_start = '{week_start}'
  """
  df = duckdb.query(query).df()

  #Keep player info separate
  player_info = df[['player_id', 'full_name', 'conference', 'season', 'week']]
  X = df[feature_names]
  probabilities = model.predict_proba(X)[:, 1]

  # Attach probabilities to the player identities
  results = player_info.copy()
  results['probability'] = probabilities

  # Now partition by season, week, and conference and return top 5 for each conference
  results
  query = f"""
  WITH CTE AS (SELECT *,
  RANK() OVER(PARTITION BY season, week, conference ORDER BY probability DESC) AS rank
  FROM results
  )

  ,CTE2 AS (
  SELECT * FROM CTE WHERE conference = 'East' AND rank <= 5
  UNION ALL
  SELECT * FROM CTE WHERE conference = 'West' AND rank <= 5

  )
  SELECT
  '{week_start}' AS week_start
  ,rank
  ,full_name as name
  ,conference
  ,probability
  FROM CTE2
  """

  top_five_per_conference = duckdb.query(query).df()

  return top_five_per_conference

get_potw_predictions('2025-12-08')

Unnamed: 0,week_start,rank,name,conference,probability
0,2025-12-08,1,Jalen Brunson,East,0.701225
1,2025-12-08,2,Brandon Ingram,East,0.638874
2,2025-12-08,3,Jaylen Brown,East,0.472538
3,2025-12-08,4,Joel Embiid,East,0.307531
4,2025-12-08,5,Jalen Suggs,East,0.07802
5,2025-12-08,1,Stephon Castle,West,0.807452
6,2025-12-08,2,Julius Randle,West,0.650672
7,2025-12-08,3,Nikola Jokic,West,0.649164
8,2025-12-08,4,Luka Doncic,West,0.221675
9,2025-12-08,5,Kon Knueppel,West,0.218039
