<a href="https://colab.research.google.com/github/dejiandrew/nba-award-predictor/blob/deji/lightgbm_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wget
import pandas as pd
import wget
import duckdb
import lightgbm as lgb



In [2]:
wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/features-overall-weekly.csv')
features_overall_weekly_df = pd.read_csv('features-overall-weekly.csv')

In [3]:
features_overall_weekly_df.columns

Index(['player_id', 'full_name', 'team', 'season', 'week', 'week_start',
       'conference', 'pow_conference', 'games_played_this_week', 'numMinutes',
       'points', 'assists', 'blocks', 'steals', 'reboundsTotal',
       'reboundsDefensive', 'reboundsOffensive', 'fieldGoalsAttempted',
       'fieldGoalsMade', 'threePointersAttempted', 'threePointersMade',
       'freeThrowsAttempted', 'freeThrowsMade', 'turnovers', 'foulsPersonal',
       'plusMinusPoints', 'wins_this_week', 'wins_vs_team_with_all_nba_player',
       'is_win_vs_over_500', 'opponent_has_all_nba', 'avg_opp_score',
       'avg_opp_winrate_prior', 'avg_opp_wins_prior', 'avg_opp_losses_prior',
       'away_games_prior', 'away_losses_prior', 'away_win_streak_prior',
       'away_wins_prior', 'home_games_prior', 'home_losses_prior',
       'home_win_streak_prior', 'home_wins_prior', 'losses_prior',
       'wins_vs_over_500_prior', 'won_player_of_the_week',
       'all_star_this_season', 'mvp_this_season',
       'all_nba_f

In [4]:
# numeric_df = features_overall_weekly_df.select_dtypes(include='number')
# numeric_df

In [5]:
# View the class imbalance (99.4% positive class, 0.6% negative class)
print(features_overall_weekly_df['won_player_of_the_week'].value_counts())
print(features_overall_weekly_df['won_player_of_the_week'].value_counts(normalize=True))

won_player_of_the_week
0    231975
1      1390
Name: count, dtype: int64
won_player_of_the_week
0    0.994044
1    0.005956
Name: proportion, dtype: float64


In [6]:
# Drop columns that contain the answer. Also drop mvp, all-star, all-nba because they are from the future when we try to do inference during season
leakage_cols = ['pow_player_id', 'player_of_the_week', 'pow_conference',
                'all_star_this_season', 'mvp_this_season',
                'all_nba_first_team_this_season', 'all_nba_second_team_this_season',
                'all_nba_third_team_this_season']

# Set aside identifier columns
id_cols = ['player_id', 'full_name', 'team', 'season', 'week', 'week_start', 'conference']

features_overall_weekly_df_clean = features_overall_weekly_df.drop(columns=leakage_cols + id_cols)
df_encoded = pd.get_dummies(features_overall_weekly_df_clean, drop_first=True)
df_encoded

Unnamed: 0,games_played_this_week,numMinutes,points,assists,blocks,steals,reboundsTotal,reboundsDefensive,reboundsOffensive,fieldGoalsAttempted,...,points_mean_season,points_std_season,assists_mean_season,assists_std_season,plusMinusPoints_mean_season,plusMinusPoints_std_season,z_pts,z_ast,z_pm,breakout_score
0,3,45.00,14.0,11.0,1.0,1.0,5.0,5.0,0.0,22.0,...,27.000000,0.000000,4.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1,4,39.00,24.0,7.0,0.0,1.0,5.0,3.0,2.0,20.0,...,20.500000,6.500000,7.500000,3.500000,0.0,0.0,0.538462,-0.142857,0.000000,0.226374
2,3,51.00,22.0,5.0,0.0,3.0,3.0,2.0,1.0,23.0,...,21.666667,5.557777,7.333333,2.867442,0.0,0.0,0.059976,-0.813733,0.000000,-0.214132
3,3,50.00,11.0,13.0,1.0,0.0,2.0,2.0,0.0,11.0,...,21.750000,4.815340,6.750000,2.680951,0.0,0.0,-2.232449,2.331262,0.000000,-0.416846
4,2,28.00,19.0,1.0,0.0,1.0,2.0,1.0,1.0,15.0,...,19.600000,6.086050,8.000000,3.464102,0.0,0.0,-0.098586,-2.020726,0.000000,-0.655511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233360,3,124.08,36.0,16.0,0.0,8.0,16.0,14.0,2.0,34.0,...,23.000000,19.000000,6.000000,4.000000,-34.0,36.0,0.684211,2.500000,1.000000,1.292105
233361,3,12.45,0.0,1.0,1.0,0.0,6.0,5.0,1.0,1.0,...,6.000000,0.000000,1.000000,0.000000,-5.0,0.0,0.000000,0.000000,0.000000,0.000000
233362,3,34.87,6.0,1.0,1.0,1.0,10.0,8.0,2.0,4.0,...,3.000000,3.000000,1.000000,0.000000,1.5,6.5,1.000000,0.000000,-3.923077,-0.284615
233363,3,32.89,6.0,6.0,1.0,0.0,5.0,4.0,1.0,10.0,...,3.000000,0.000000,0.000000,0.000000,-4.0,0.0,0.000000,0.000000,0.000000,0.000000


In [7]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop(columns=['won_player_of_the_week'])
y = df_encoded['won_player_of_the_week']
identifiers = features_overall_weekly_df[id_cols]

X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(
    X, y, identifiers,
    test_size=0.2,
    stratify=y,          # Maintain the same class imbalance ratio within both y_train and y_test
    random_state=42
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Positive class in y_train: {100 * round(y_train.mean(),4)}%")
print(f"Positive class in y_test: {100 * round(y_test.mean(),4)}%")

Training samples: 186692
Test samples: 46673
Positive class in y_train: 0.6%
Positive class in y_test: 0.6%


In [8]:
!pip install lightgbm



In [9]:

# Train LightGBM model
lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    num_leaves=31,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# Train LightGBM model
lgb_model.fit(X_train, y_train)

# Get predictions
y_pred_proba = lgb_model.predict_proba(X_test)[:, 1]

In [10]:
evaluation_df = id_test.copy()
evaluation_df['actual_won'] = y_test.values
evaluation_df['pred_proba'] = y_pred_proba
evaluation_df

Unnamed: 0,player_id,full_name,team,season,week,week_start,conference,actual_won,pred_proba
151050,201591,DJ White,Bobcats,2010,11,2011-03-14,East,0,0.000230
210793,1628984,Devonte' Graham,Hornets,2020,16,2021-04-19,East,0,0.000117
55487,1724,Bryce Drew,Hornets,2001,14,2002-04-01,West,0,0.000079
139259,201142,Kevin Durant,Suns,2023,13,2024-03-25,West,0,0.092048
40374,959,Steve Nash,Mavericks,1998,8,1999-02-22,West,0,0.000130
...,...,...,...,...,...,...,...,...,...
193035,1626145,Tyus Jones,Grizzlies,2020,18,2021-05-03,West,0,0.000049
19782,353,Darrell Armstrong,Hornets,2003,14,2004-03-29,East,0,0.000115
9548,174,John Battle,Cavaliers,1992,5,1993-02-01,East,0,0.000091
53664,1713,Vince Carter,Hawks,2018,6,2019-02-04,East,0,0.000231


In [11]:
actual_winners_df = evaluation_df[evaluation_df['actual_won'] == 1]
actual_winners_df.head()

Unnamed: 0,player_id,full_name,team,season,week,week_start,conference,actual_won,pred_proba
86370,2548,Dwyane Wade,Heat,2008,47,2008-11-17,East,1,0.64481
46374,1453,Walter Davis,Suns,1982,8,1983-02-21,West,1,0.866899
154632,201942,DeMar DeRozan,Bulls,2021,6,2022-02-07,East,1,0.929789
14740,255,Grant Hill,Pistons,1996,3,1997-01-13,East,1,0.97541
167707,202689,Kemba Walker,Hornets,2018,14,2019-04-01,East,1,0.693781


In [12]:
# How many season,conference,week combinations were there in actual_winners_df? This is our denominator for accuracy.

query = """
SELECT COUNT(DISTINCT(season,conference,week)) AS total_season_conference_weeks_we_couldve_predicted
FROM actual_winners_df
"""

total_season_conference_weeks_we_couldve_predicted = duckdb.query(query).df()['total_season_conference_weeks_we_couldve_predicted'].sum()
total_season_conference_weeks_we_couldve_predicted

np.int64(277)

In [13]:
# How many of the actual winners did we predict?
# Aka how many of the actual winners did our model assign a top-k probability for that [season,conference,week] combo?
for k in [1, 3, 5, 10]:
  query = f"""

  WITH Highest_Probabilities AS (
  SELECT *
  ,RANK() OVER(PARTITION BY season, conference, week ORDER BY pred_proba DESC) AS rnk
  FROM evaluation_df
  )
  ,
  Actual_Winners AS (
    SELECT * FROM evaluation_df
    WHERE actual_won = 1
  )


  SELECT * FROM Highest_Probabilities
  JOIN Actual_Winners
  ON
  Highest_Probabilities.season = Actual_Winners.season
  AND
  Highest_Probabilities.week = Actual_Winners.week
  AND
  Highest_Probabilities.conference = Actual_Winners.conference
  WHERE rnk <= {k}
  AND Highest_Probabilities.pred_proba = Actual_Winners.pred_proba
  """
  duckdb.query(query).df()
  correct_predictions_df = duckdb.query(query).df()
  total_correct_predictions = correct_predictions_df["actual_won"].sum()
  print(f"Top-{k} accuracy:", round(100*total_correct_predictions/total_season_conference_weeks_we_couldve_predicted,2),"%")

Top-1 accuracy: 75.81 %
Top-3 accuracy: 93.86 %
Top-5 accuracy: 97.11 %
Top-10 accuracy: 98.92 %


In [14]:
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': lgb_model.feature_importances_
}).sort_values('importance', ascending=False)

feature_importance['importance_pct'] = (feature_importance['importance'] / feature_importance['importance'].sum() * 100)

print("LightGBM Top 20 features:")
print(feature_importance.head(20)[['feature','importance_pct']])

LightGBM Top 20 features:
                       feature  importance_pct
2                       points        8.033333
36                    team_pts        7.966667
3                      assists        4.700000
41        fieldGoalsPercentage        4.666667
18              wins_this_week        3.833333
1                   numMinutes        2.800000
37                    team_ast        2.766667
53              breakout_score        2.566667
10              fieldGoalsMade        2.566667
6                reboundsTotal        2.500000
13         freeThrowsAttempted        2.433333
44          points_mean_season        2.400000
4                       blocks        2.366667
7            reboundsDefensive        2.300000
14              freeThrowsMade        2.133333
39                    team_stl        2.000000
49  plusMinusPoints_std_season        2.000000
50                       z_pts        1.966667
17             plusMinusPoints        1.966667
51                       z_ast    

In [15]:
import joblib

# Save model
joblib.dump(lgb_model, 'lightgbm_potw_model.pkl')

# Save feature names
joblib.dump(X_train.columns.tolist(), 'lightgbm_model_features.pkl')

['lightgbm_model_features.pkl']