<a href="https://colab.research.google.com/github/dejiandrew/nba-award-predictor/blob/deji/lightgbm_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wget
import pandas as pd
import wget
import duckdb
import lightgbm as lgb

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=245f2e5e4bcdc0775df12181263ebff07250f583e0f629f6bad83ad00e01038b
  Stored in directory: /root/.cache/pip/wheels/01/46/3b/e29ffbe4ebe614ff224bad40fc6a5773a67a163251585a13a9
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
# Download the weekly player feature table and load it into a DataFrame.
# wget.download() returns the path it actually wrote to. If a file with the same
# name already exists (for example when this cell is re-run), it saves the new copy
# under a suffixed name such as "features-overall-weekly (1).csv" instead of
# overwriting. Reading the returned path guarantees we load the copy that was just
# downloaded rather than a stale one.
features_csv_path = wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/features-overall-weekly.csv')
features_overall_weekly_df = pd.read_csv(features_csv_path)

In [3]:
# List every column of the weekly feature table (inputs for choosing features,
# identifier columns and leakage columns in the cells below).
features_overall_weekly_df.columns

Index(['player_id', 'full_name', 'team', 'season', 'week', 'week_start',
       'conference', 'pow_conference', 'games_played_this_week', 'numMinutes',
       'points', 'assists', 'blocks', 'steals', 'reboundsTotal',
       'reboundsDefensive', 'reboundsOffensive', 'fieldGoalsAttempted',
       'fieldGoalsMade', 'threePointersAttempted', 'threePointersMade',
       'freeThrowsAttempted', 'freeThrowsMade', 'turnovers', 'foulsPersonal',
       'plusMinusPoints', 'wins_this_week', 'wins_vs_team_with_all_nba_player',
       'is_win_vs_over_500', 'opponent_has_all_nba', 'avg_opp_score',
       'avg_opp_winrate_prior', 'avg_opp_wins_prior', 'avg_opp_losses_prior',
       'away_games_prior', 'away_losses_prior', 'away_win_streak_prior',
       'away_wins_prior', 'home_games_prior', 'home_losses_prior',
       'home_win_streak_prior', 'home_wins_prior', 'losses_prior',
       'wins_vs_over_500_prior', 'won_player_of_the_week',
       'all_star_this_season', 'mvp_this_season',
       'all_nba_f

In [4]:
# numeric_df = features_overall_weekly_df.select_dtypes(include='number')
# numeric_df

In [5]:
# View the class imbalance (99.4% negative class, 0.6% positive class)
print(features_overall_weekly_df['won_player_of_the_week'].value_counts())
print(features_overall_weekly_df['won_player_of_the_week'].value_counts(normalize=True))

won_player_of_the_week
0    233103
1      1394
Name: count, dtype: int64
won_player_of_the_week
0    0.994055
1    0.005945
Name: proportion, dtype: float64


In [6]:
# Columns that contain the answer (the award outcome itself).
answer_cols = ['pow_player_id', 'player_of_the_week', 'pow_conference']

# Season-level honours (MVP, All-Star, All-NBA) come from the future relative to an
# in-season prediction, so they are removed as well.
future_award_cols = ['all_star_this_season', 'mvp_this_season',
                     'all_nba_first_team_this_season', 'all_nba_second_team_this_season',
                     'all_nba_third_team_this_season']

leakage_cols = answer_cols + future_award_cols

# Identifier columns: kept aside to label predictions, never used as features.
id_cols = ['player_id', 'full_name', 'team', 'season', 'week', 'week_start', 'conference']

# Remove both groups in one pass, then one-hot encode whatever categorical columns remain.
columns_to_drop = [*leakage_cols, *id_cols]
features_overall_weekly_df_clean = features_overall_weekly_df.drop(columns=columns_to_drop)
df_encoded = pd.get_dummies(features_overall_weekly_df_clean, drop_first=True)
df_encoded

Unnamed: 0,games_played_this_week,numMinutes,points,assists,blocks,steals,reboundsTotal,reboundsDefensive,reboundsOffensive,fieldGoalsAttempted,...,points_mean_season,points_std_season,assists_mean_season,assists_std_season,plusMinusPoints_mean_season,plusMinusPoints_std_season,z_pts,z_ast,z_pm,breakout_score
0,3,45.00,14.0,11.0,1.0,1.0,5.0,5.0,0.0,22.0,...,27.000000,0.000000,4.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
1,4,39.00,24.0,7.0,0.0,1.0,5.0,3.0,2.0,20.0,...,20.500000,6.500000,7.500000,3.500000,0.00,0.000000,0.538462,-0.142857,0.000000,0.226374
2,3,51.00,22.0,5.0,0.0,3.0,3.0,2.0,1.0,23.0,...,21.666667,5.557777,7.333333,2.867442,0.00,0.000000,0.059976,-0.813733,0.000000,-0.214132
3,3,50.00,11.0,13.0,1.0,0.0,2.0,2.0,0.0,11.0,...,21.750000,4.815340,6.750000,2.680951,0.00,0.000000,-2.232449,2.331262,0.000000,-0.416846
4,2,28.00,19.0,1.0,0.0,1.0,2.0,1.0,1.0,15.0,...,19.600000,6.086050,8.000000,3.464102,0.00,0.000000,-0.098586,-2.020726,0.000000,-0.655511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234492,2,22.40,8.0,5.0,1.0,1.0,7.0,5.0,2.0,10.0,...,4.500000,1.500000,3.000000,3.000000,-10.00,6.000000,2.333333,0.666667,0.000000,1.366667
234493,2,3.23,0.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,...,5.666667,2.054805,3.666667,2.624669,-10.00,4.898979,-2.757764,-0.635001,3.470110,-0.875360
234494,1,5.48,3.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,4.250000,3.031089,3.250000,2.384848,-5.75,8.496323,-0.412393,-1.362770,1.147555,-0.385517
234495,3,32.79,0.0,0.0,0.0,3.0,3.0,3.0,0.0,6.0,...,3.000000,0.000000,3.000000,0.000000,30.00,0.000000,0.000000,0.000000,0.000000,0.000000


In [7]:
from sklearn.model_selection import train_test_split

# Separate the label from the predictors. The identifier columns are split alongside
# them so that every prediction can be traced back to a player and week.
target_col = 'won_player_of_the_week'
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])
identifiers = features_overall_weekly_df[id_cols]

# NOTE(review): this is a random row-level split, so weeks from the same player and
# season can land on both sides of it. Confirm this is intended, or consider holding
# out whole seasons if the metric should reflect unseen seasons.
split_parts = train_test_split(
    X, y, identifiers,
    test_size=0.2,
    stratify=y,          # keep the winner / non-winner ratio the same in train and test
    random_state=42,
)
X_train, X_test, y_train, y_test, id_train, id_test = split_parts

# Sanity check: split sizes and the share of winners on each side.
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Positive class in y_train: {100 * round(y_train.mean(),4)}%")
print(f"Positive class in y_test: {100 * round(y_test.mean(),4)}%")

Training samples: 187597
Test samples: 46900
Positive class in y_train: 0.59%
Positive class in y_test: 0.59%


In [8]:
# NOTE(review): lightgbm is already imported in the first cell (`import lightgbm as lgb`),
# so by the time this cell runs the package must already be installed. Consider moving
# this install next to the other install at the top of the notebook.
!pip install lightgbm



In [9]:

# Configure the LightGBM classifier
lgb_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    num_leaves=31,
    class_weight='balanced',   # presumably chosen because winners are well under 1% of rows
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)

# Fit on the training split
lgb_model.fit(X_train, y_train)

# Score the test split; the second predict_proba column is kept as the win probability
positive_class_idx = 1
y_pred_proba = lgb_model.predict_proba(X_test)[:, positive_class_idx]

In [10]:
# Attach the true label and the predicted probability to the test-set identifiers.
# assign() returns a new frame, so id_test itself is left untouched.
evaluation_df = id_test.assign(
    actual_won=y_test.values,
    pred_proba=y_pred_proba,
)
evaluation_df

Unnamed: 0,player_id,full_name,team,season,week,week_start,conference,actual_won,pred_proba
158527,201973,Jonas Jerebko,Pistons,2009,47,2009-11-16,East,0,0.000136
186610,203613,Jonathon Simmons,Magic,2018,2,2019-01-07,East,0,0.000291
229969,1631110,Jeremy Sochan,Spurs,2023,10,2024-03-04,West,0,0.001606
104490,76441,Wayne Cooper,Warriors,1979,7,1980-02-11,West,0,0.000110
115947,77719,Kurt Nimphius,Mavericks,1984,7,1985-02-11,West,0,0.000168
...,...,...,...,...,...,...,...,...,...
120612,78325,Corny Thompson,Mavericks,1982,3,1983-01-17,West,0,0.000157
15100,262,Michael Cage,Clippers,1987,50,1987-12-07,West,0,0.021908
61379,1889,Andre Miller,Nuggets,2011,6,2012-02-06,West,0,0.002503
96519,2744,Al Jefferson,Pacers,2016,46,2016-11-14,East,0,0.000052


In [11]:
# Keep only the test rows where the player actually won the award.
won_mask = evaluation_df['actual_won'].eq(1)
actual_winners_df = evaluation_df.loc[won_mask]
actual_winners_df.head()

Unnamed: 0,player_id,full_name,team,season,week,week_start,conference,actual_won,pred_proba
216791,1629627,Zion Williamson,Pelicans,2022,49,2022-12-05,West,1,0.989505
110951,77142,Magic Johnson,Lakers,1986,7,1987-02-09,West,1,0.972145
171380,202711,Bojan Bogdanovic,Jazz,2020,18,2021-05-03,West,1,0.692686
14607,252,Karl Malone,Jazz,1994,50,1994-12-12,West,1,0.974988
133912,200750,Brandon Roy,Trail Blazers,2007,50,2007-12-10,West,1,0.895685


In [12]:
# Denominator for the top-k accuracy: the number of distinct (season, conference, week)
# combinations that appear in actual_winners_df.

query = """
SELECT COUNT(DISTINCT(season,conference,week)) AS total_season_conference_weeks_we_couldve_predicted
FROM actual_winners_df
"""

# The query returns a single-row frame; pull the count out of its only column.
count_col = 'total_season_conference_weeks_we_couldve_predicted'
count_df = duckdb.query(query).df()
total_season_conference_weeks_we_couldve_predicted = count_df[count_col].sum()
total_season_conference_weeks_we_couldve_predicted

np.int64(278)

In [13]:
# How many of the actual winners did we predict?
# A winner counts as a top-k hit when the model ranks them k-th or better among the
# test-set players in the same [season,conference,week] combo.
# NOTE(review): evaluation_df holds only the randomly sampled test rows, so each winner
# is ranked against a subset of that week's players. These figures are likely optimistic
# compared with ranking against the full weekly field; confirm before reporting them.

# The ranking does not depend on k, so compute it once instead of once per k.
query = """
SELECT *
,RANK() OVER(PARTITION BY season, conference, week ORDER BY pred_proba DESC) AS rnk
FROM evaluation_df
"""
ranked_df = duckdb.query(query).df()

# Select the winners' own rows directly. This replaces a self-join on pred_proba
# equality, which matched on a float value rather than on row identity and could
# count co-winners with tied probabilities more than once.
winner_ranks_df = ranked_df[ranked_df["actual_won"] == 1]

for k in [1, 3, 5, 10]:
  correct_predictions_df = winner_ranks_df[winner_ranks_df["rnk"] <= k]
  total_correct_predictions = len(correct_predictions_df)
  print(f"Top-{k} accuracy:", round(100*total_correct_predictions/total_season_conference_weeks_we_couldve_predicted,2),"%")

Top-1 accuracy: 75.54 %
Top-3 accuracy: 95.32 %
Top-5 accuracy: 97.84 %
Top-10 accuracy: 100.0 %


In [14]:
# Pair each training column with the importance reported by the fitted model,
# most important first, and express each as a percentage of the total.
importance_values = lgb_model.feature_importances_
feature_importance = (
    pd.DataFrame({'feature': X_train.columns, 'importance': importance_values})
    .sort_values('importance', ascending=False)
    .assign(importance_pct=lambda tbl: tbl['importance'].div(tbl['importance'].sum()).mul(100))
)

top_features = feature_importance.head(20)
print("LightGBM Top 20 features:")
print(top_features[['feature','importance_pct']])

LightGBM Top 20 features:
                 feature  importance_pct
2                 points        7.666667
36              team_pts        6.133333
41  fieldGoalsPercentage        4.966667
3                assists        3.833333
18        wins_this_week        3.433333
37              team_ast        3.366667
44    points_mean_season        3.200000
6          reboundsTotal        3.066667
1             numMinutes        2.933333
46   assists_mean_season        2.600000
7      reboundsDefensive        2.466667
10        fieldGoalsMade        2.366667
39              team_stl        2.333333
50                 z_pts        2.266667
21  opponent_has_all_nba        2.166667
17       plusMinusPoints        2.133333
4                 blocks        2.100000
51                 z_ast        2.066667
14        freeThrowsMade        2.066667
53        breakout_score        1.933333


In [15]:
import joblib

# File names that the inference function below loads back.
model_artifact_path = 'lightgbm_potw_model.pkl'
feature_list_path = 'lightgbm_model_features.pkl'

# Persist the fitted classifier
joblib.dump(lgb_model, model_artifact_path)

# Persist the training column names, in training order, so inference can select the same columns
trained_feature_names = X_train.columns.tolist()
joblib.dump(trained_feature_names, feature_list_path)

['lightgbm_model_features.pkl']

**The steps below use the model saved above for inference.**

In [22]:
def get_potw_predictions(week_start: str):
  """
  Predict Player of the Week (POW) candidates for a single week.

  Downloads the latest inference feature file, scores every player that has a row
  for `week_start` with the saved LightGBM model, and returns the top five players
  per conference by predicted probability.

  Args:
    week_start: the Monday that starts the week, in 'yyyy-mm-dd' format.

  Returns:
    A DataFrame with columns week_start, rank, name, conference, probability.
    It holds the rows ranked 1-5 within the 'East' and 'West' conferences,
    ordered by conference and then rank. Ties share a rank, so a conference can
    contribute more than five rows.

  Raises:
    ValueError: if the inference file has no rows for `week_start`.
    KeyError: if the inference file lacks columns the model was trained on.
  """
  import duckdb
  import joblib
  import pandas as pd
  import wget

  # features-overall-weekly-for-inference.csv is calculated in a separate script that runs within the daily production pipeline
  # features-overall-weekly-for-inference.csv is needed to make predictions while the current week is in-progress (i.e. no POW announced yet)
  # wget.download() does not overwrite an existing file: it saves the new copy under a
  # suffixed name and returns that path. Read the returned path so repeated calls use
  # the fresh download, not the first one left on disk.
  inference_csv_path = wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/features-overall-weekly-for-inference.csv')

  # Read in model & its feature names
  # joblib.load unpickles arbitrary objects; only load artifacts written by this notebook.
  model = joblib.load('lightgbm_potw_model.pkl')
  feature_names = joblib.load('lightgbm_model_features.pkl')

  # Read in player-week stats
  overall_weekly_agg_df = pd.read_csv(inference_csv_path)

  # Keep all players who played for the input week. The filter is done in pandas so the
  # caller-supplied string is never spliced into SQL text.
  week_mask = overall_weekly_agg_df['week_start'] == week_start
  df = overall_weekly_agg_df[week_mask].reset_index(drop=True)
  if df.empty:
    raise ValueError(
      f"No rows found for week_start={week_start!r}; "
      "week_start must be a Monday in 'yyyy-mm-dd' format that exists in the inference file"
    )

  # Fail with a readable message if the inference file drifted from the training schema.
  missing_features = [col for col in feature_names if col not in df.columns]
  if missing_features:
    raise KeyError(f"Inference file is missing model feature columns: {missing_features}")

  # Keep player info separate
  player_info = df[['player_id', 'full_name', 'conference', 'season', 'week']]
  X = df[feature_names]
  probabilities = model.predict_proba(X)[:, 1]

  # Attach probabilities to the player identities
  results = player_info.copy()
  results['probability'] = probabilities

  # Rank within each season, week and conference, and keep the top 5 of each conference.
  # `results` is resolved by duckdb from this function's local variables.
  query = """
  WITH ranked AS (
    SELECT *,
    RANK() OVER(PARTITION BY season, week, conference ORDER BY probability DESC) AS rank
    FROM results
  )
  SELECT
  rank
  ,full_name AS name
  ,conference
  ,probability
  FROM ranked
  WHERE conference IN ('East', 'West') AND rank <= 5
  ORDER BY conference, rank, full_name
  """

  top_five_per_conference = duckdb.query(query).df()

  # Add the requested week as the first column, keeping the original column layout
  top_five_per_conference.insert(0, 'week_start', week_start)

  return top_five_per_conference

# Example run: top-five POW candidates per conference for the week starting Monday 2025-12-01.
get_potw_predictions('2025-12-01')

Unnamed: 0,week_start,rank,name,conference,probability
0,2025-12-01,1,Jalen Johnson,East,0.643083
1,2025-12-01,2,Desmond Bane,East,0.630744
2,2025-12-01,3,Giannis Antetokounmpo,East,0.499823
3,2025-12-01,4,Tyrese Maxey,East,0.463047
4,2025-12-01,5,Jaylen Brown,East,0.433554
5,2025-12-01,1,Deni Avdija,West,0.728505
6,2025-12-01,2,Shai Gilgeous-Alexander,West,0.724511
7,2025-12-01,3,Alperen Sengun,West,0.690428
8,2025-12-01,4,De'Aaron Fox,West,0.599276
9,2025-12-01,5,Anthony Edwards,West,0.526884
