<a href="https://colab.research.google.com/github/dejiandrew/nba-award-predictor/blob/deji/autogluon_automl_data_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wget
!pip install h2o
!pip install autogluon
import wget
import pandas as pd
import h2o
import joblib
import numpy as np

In [19]:
# wget.download() never overwrites an existing file: when the target name is
# already taken (e.g. on a re-run of this cell) it saves the fresh copy under a
# suffixed name such as "features-overall-weekly (1).csv" and returns that name.
# Read the path it returns so a re-run does not silently load the stale copy
# left behind by an earlier download.
features_csv_path = wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/features-overall-weekly.csv')
features_overall_weekly_df = pd.read_csv(features_csv_path)

In [None]:
# Display every column name of the raw weekly feature table as a plain list.
features_overall_weekly_df.columns.tolist()

In [None]:
# Season filter left disabled by the author:
#features_overall_weekly_df = features_overall_weekly_df[features_overall_weekly_df['season'] >= 2021]

# League-wide aggregates and season z-scores that are removed before modelling.
unused_league_cols = [
    'league_pts_mean', 'league_pts_std',
    'league_ast_mean', 'league_ast_std',
    'league_pm_mean', 'league_pm_std',
    'z_s_pts', 'z_s_ast', 'z_s_pm',
]

# This cell reassigns its own input, so it must be safe to run twice:
# errors='ignore' skips columns that an earlier run already removed instead of
# raising KeyError.
features_overall_weekly_df = features_overall_weekly_df.drop(
    columns=unused_league_cols, errors='ignore'
)
features_overall_weekly_df

In [None]:
# Columns that give away the answer, plus the season-end awards (MVP, All-Star,
# All-NBA): those come from the future when inference runs during a season.
leakage_cols = [
    'pow_player_id',
    'player_of_the_week',
    'pow_conference',
    'all_star_this_season',
    'mvp_this_season',
    'all_nba_first_team_this_season',
    'all_nba_second_team_this_season',
    'all_nba_third_team_this_season',
]

# Identifier columns, set aside rather than used as features.
id_cols = [
    'player_id',
    'full_name',
    'team',
    'season',
    'week',
    'week_start',
    'conference',
]

# Remove both groups in one pass, then one-hot encode what is left
# (drop_first=True drops the first level of each encoded column).
non_feature_cols = [*leakage_cols, *id_cols]
features_overall_weekly_df_clean = features_overall_weekly_df.drop(columns=non_feature_cols)
df_encoded = pd.get_dummies(features_overall_weekly_df_clean, drop_first=True)
df_encoded

In [23]:
# Grouped train/test split: rows sharing a season/week/conference key are kept
# together on one side of the split.
from sklearn.model_selection import GroupShuffleSplit

# Features, target, and the identifier columns kept for later reporting.
X = df_encoded.drop(columns=['won_player_of_the_week'])
y = df_encoded['won_player_of_the_week']
identifiers = features_overall_weekly_df[id_cols]

# Group key of the form "<season>_<week>_<conference>".
season_key = features_overall_weekly_df['season'].astype(str)
week_key = features_overall_weekly_df['week'].astype(str)
conference_key = features_overall_weekly_df['conference'].astype(str)
groups = season_key + '_' + week_key + '_' + conference_key

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

# Build the train and test tables: infinities become NaN, and the label is
# appended as the last column.
train_data = (
    X.iloc[train_idx]
    .replace([np.inf, -np.inf], np.nan)
    .assign(won_player_of_the_week=y.iloc[train_idx])
)

test_data = (
    X.iloc[test_idx]
    .replace([np.inf, -np.inf], np.nan)
    .assign(won_player_of_the_week=y.iloc[test_idx])
)


In [None]:
from autogluon.tabular import TabularPredictor

# Configure the predictor: binary label, ROC-AUC as the metric, models saved
# under ./ag_models.
predictor = TabularPredictor(
    label='won_player_of_the_week',
    eval_metric='roc_auc',
    path='./ag_models',
)

# Train AutoGluon (one-hour budget, best_quality preset, one GPU requested).
fit_options = {
    'time_limit': 3600,
    'presets': 'best_quality',
    'num_gpus': 1,
    'ag_args_fit': {'num_gpus': 1},
    'verbosity': 2,
}
predictor = predictor.fit(train_data=train_data, **fit_options)

# Class probabilities for the held-out rows.
test_pred_proba = predictor.predict_proba(test_data)

# Evaluation table: identifiers of the test rows next to the true label and the
# predicted probability of class 1.
evaluation_df = (
    identifiers.iloc[test_idx]
    .reset_index(drop=True)
    .assign(
        actual_won=test_data['won_player_of_the_week'].values,
        pred_proba=test_pred_proba[1].values,
    )
)

evaluation_df.head()

In [None]:
import duckdb

# Total events: distinct (season, conference, week) groups in the test set that
# contain at least one actual winner. This is the denominator for top-k accuracy.
query = """
SELECT COUNT(DISTINCT(season, conference, week))
       AS total_season_conference_weeks_we_couldve_predicted
FROM evaluation_df
WHERE actual_won = 1
"""

total_season_conference_weeks_we_couldve_predicted = (
    duckdb.query(query).df()['total_season_conference_weeks_we_couldve_predicted'].item()
)

print("Total POTW events in test set:", total_season_conference_weeks_we_couldve_predicted)

# Top-k accuracy: share of events whose actual winner is ranked within the top k
# predicted probabilities of its (season, conference, week) group.
#
# An event is counted at most once. Winners are identified by their own
# actual_won flag on the ranked row, not by joining back on equal float
# probabilities, so co-winners of one event cannot be double counted and the
# result cannot exceed 100%. RANK() gives tied probabilities the same rank.
if total_season_conference_weeks_we_couldve_predicted == 0:
    print("No POTW events in test set; top-k accuracy is undefined.")
else:
    for k in [1, 2, 3, 4, 5, 10]:
        query = f"""
        WITH Highest_Probabilities AS (
          SELECT season, conference, week, actual_won,
            RANK() OVER (
               PARTITION BY season, conference, week
               ORDER BY pred_proba DESC
            ) AS rnk
          FROM evaluation_df
        )

        SELECT COUNT(DISTINCT(season, conference, week)) AS events_hit
        FROM Highest_Probabilities
        WHERE rnk <= {k}
          AND actual_won = 1
        """

        total_correct_predictions = duckdb.query(query).df()["events_hit"].item()

        top_k_accuracy = 100 * total_correct_predictions / total_season_conference_weeks_we_couldve_predicted

        print(f"Top-{k} accuracy: {top_k_accuracy:.2f}%")

In [None]:
# Name of the model the predictor reports as best.
best_model_name = predictor.model_best
print(f"Best model: {best_model_name}")

# Leaderboard of the trained models, evaluated on the held-out test data.
leaderboard = predictor.leaderboard(data=test_data)
print(leaderboard)

In [None]:
# Feature importance on the test data, restricted to the best model.
feature_importance = predictor.feature_importance(
    data=test_data,
    model=predictor.model_best,
)

# Print the first 20 rows of the importance table.
print(f"Top 20 Features for {predictor.model_best}:")
top_features = feature_importance.head(20)
print(top_features)

In [14]:
import joblib

# Save the training feature columns (everything except the label), in order,
# so inference can select the same columns.
# The file name is defined once and reused in the message below, so the printed
# path cannot drift from the file that is actually written.
features_path = 'autogluon_model_features.pkl'
feature_names = list(train_data.drop(columns=['won_player_of_the_week']).columns)
joblib.dump(feature_names, features_path)

print(f"Best model: {best_model_name}")
print(f"Features saved to: ./{features_path}")

Best model: WeightedEnsemble_L3
Features saved to: ./autogluon_model_features.pkl


In [None]:
def get_potw_predictions(week_start: str):
  """
  get_potw_predictions() does inference to predict POW.

  Downloads the in-progress weekly feature table, scores every player that has
  a row for `week_start` with the AutoGluon predictor stored in ./ag_models,
  and returns the highest-ranked players per conference.

  Args:
    week_start: week to score; must be a Monday in 'yyyy-mm-dd' format.

  Returns:
    pandas.DataFrame with columns week_start, rank, name, conference and
    probability, holding the rows ranked 1-5 for 'East' and for 'West'
    (tied probabilities share a rank), ordered by conference then rank.
    The frame is empty when no player has a row for `week_start`.

  Raises:
    ValueError: if `week_start` is not a valid 'yyyy-mm-dd' date.
  """
  import datetime

  # week_start is interpolated into SQL below, so accept nothing but a real
  # date and normalise it to zero-padded 'yyyy-mm-dd' first.
  try:
    week_start = datetime.datetime.strptime(week_start, '%Y-%m-%d').strftime('%Y-%m-%d')
  except (TypeError, ValueError) as exc:
    raise ValueError(
        f"week_start must be a date in 'yyyy-mm-dd' format, got {week_start!r}"
    ) from exc

  # Third-party imports are deferred until the input is known to be valid.
  import duckdb
  import joblib
  import wget
  import pandas as pd
  from autogluon.tabular import TabularPredictor

  # features-overall-weekly-for-inference.csv is calculated in a separate script that runs within the daily production pipeline
  # features-overall-weekly-for-inference.csv is needed to make predictions while the current week is in-progress (i.e. no POW announced yet)
  # wget.download() does not overwrite an existing file; it saves under a
  # suffixed name and returns it. Read the returned path so repeated calls
  # always use the copy that was just downloaded, not an older one.
  inference_csv_path = wget.download('https://storage.googleapis.com/nba_award_predictor/nba_data/features-overall-weekly-for-inference.csv')

  # Read in model (load the full predictor, not just the pkl)
  predictor = TabularPredictor.load('./ag_models')
  feature_names = joblib.load('autogluon_model_features.pkl')

  # Read in player-week stats
  overall_weekly_agg_df = pd.read_csv(inference_csv_path)

  # Remove unwanted features
  overall_weekly_agg_df = overall_weekly_agg_df.drop(columns=['league_pts_mean', 'league_pts_std',
       'league_ast_mean', 'league_ast_std', 'league_pm_mean', 'league_pm_std',
       'z_s_pts', 'z_s_ast', 'z_s_pm'])

  # Query all players who played for the input week
  # (DuckDB resolves overall_weekly_agg_df from the local variable of that name)
  query = f"""
  SELECT * FROM overall_weekly_agg_df
  WHERE week_start = '{week_start}'
  """
  df = duckdb.query(query).df()

  # Nothing to score for this week: return an empty table with the usual columns.
  if df.empty:
    return pd.DataFrame(columns=['week_start', 'rank', 'name', 'conference', 'probability'])

  # Keep player info separate
  player_info = df[['player_id', 'full_name', 'conference', 'season', 'week']]
  X = df[feature_names]

  # Use predictor.predict_proba instead
  probabilities = predictor.predict_proba(X)[1].values  # Get probability of class 1

  # Attach probabilities to the player identities
  results = player_info.copy()
  results['probability'] = probabilities

  # Now partition by season, week, and conference and return top 5 for each conference.
  # ORDER BY makes the row order deterministic (East before West, best rank first).
  query = f"""
  WITH CTE AS (SELECT *,
  RANK() OVER(PARTITION BY season, week, conference ORDER BY probability DESC) AS rank
  FROM results
  )

  ,CTE2 AS (
  SELECT * FROM CTE WHERE conference = 'East' AND rank <= 5
  UNION ALL
  SELECT * FROM CTE WHERE conference = 'West' AND rank <= 5

  )
  SELECT
  '{week_start}' AS week_start
  ,rank
  ,full_name as name
  ,conference
  ,probability
  FROM CTE2
  ORDER BY conference, rank
  """

  top_five_per_conference = duckdb.query(query).df()

  return top_five_per_conference

# Example run: rank the candidates for the week starting 2025-12-15.
get_potw_predictions('2025-12-15')

In [None]:
from google.cloud import storage
import os

def upload_directory_to_gcs(local_path, bucket_name, gcs_path, credentials_path):
    """Upload a directory to GCS preserving structure.

    Walks `local_path` recursively and uploads each file to
    gs://<bucket_name>/<gcs_path>/<path relative to local_path>.

    Args:
        local_path: local directory to upload.
        bucket_name: name of the destination bucket.
        gcs_path: object-name prefix inside the bucket, written with '/'.
        credentials_path: path to the service-account JSON key used to create
            the storage client.

    Raises:
        FileNotFoundError: if `local_path` is not an existing directory
            (os.walk would otherwise yield nothing and the call would appear
            to succeed without uploading anything).
    """
    import posixpath

    if not os.path.isdir(local_path):
        raise FileNotFoundError(f"Local directory not found: {local_path}")

    storage_client = storage.Client.from_service_account_json(credentials_path)
    bucket = storage_client.bucket(bucket_name)

    for root, _dirs, files in os.walk(local_path):
        for file in files:
            local_file = os.path.join(root, file)
            relative_path = os.path.relpath(local_file, local_path)
            # Object names use '/' regardless of the local OS separator, so
            # build the name with posixpath instead of os.path.join.
            blob_path = posixpath.join(gcs_path, *relative_path.split(os.sep))

            blob = bucket.blob(blob_path)
            blob.upload_from_filename(local_file)
            print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")

# Push the local ./ag_models directory to the project bucket.
# Arguments, in order: local_path, bucket_name, gcs_path, credentials_path.
upload_directory_to_gcs(
    './ag_models',
    'nba_award_predictor',
    'nba_data/models/ag_models',
    'cis-5450-final-project-ff442a88ac1b.json',
)