In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA

import requests
import os

In [2]:
# Seed passed as `random_state` to the row-shuffling and PCA steps further down.
RANDOM_STATE = 42

In [11]:
# Set to True to download data from GitHub, False to load from local processed directory
DOWNLOAD = False

# Parquet files that make up the processed dataset (same names remotely and locally)
filenames = [
    "master_data.parquet",
    "master_data_dropped.parquet",
    "master_data_imputed.parquet"
]

if DOWNLOAD:
    # Download the parquet files from the GitHub repository
    directory = "./downloads/"
    base_url = "https://github.com/fbec76/sas-curiosity-cup-2026/raw/refs/heads/main/datasets/processed/"

    # Create the target directory once, up front, instead of re-checking it for every file
    os.makedirs(directory, exist_ok=True)

    failed_downloads = []
    for fname in filenames:
        url = base_url + fname + "?download="
        # The timeout keeps a stalled connection from hanging the notebook indefinitely
        response = requests.get(url, timeout=60)
        if response.ok:
            with open(directory + fname, "wb") as f:
                f.write(response.content)
            print(f"Downloaded: {fname}")
        else:
            print(f"Failed to download: {fname}")
            failed_downloads.append(fname)

    # Stop here instead of falling through to read_parquet, which would either fail with an
    # unrelated-looking error or silently load a stale file left over from an earlier run
    if failed_downloads:
        raise RuntimeError(f"Could not download: {', '.join(failed_downloads)}")

    data_dir = directory
    source_label = "downloads"
else:
    data_dir = "../datasets/processed/"
    source_label = "processed"

# Both branches load the same three files; only the directory differs
master_data_orig = pd.read_parquet(data_dir + "master_data.parquet")
master_data_dropped_orig = pd.read_parquet(data_dir + "master_data_dropped.parquet")
master_data_imputed_orig = pd.read_parquet(data_dir + "master_data_imputed.parquet")
print(f"Data loaded from {source_label} directory.")


Data loaded from processed directory.


In [12]:
# Work on copies so the frames read from disk (`*_orig`) are never modified by later cells
master_data, master_data_dropped, master_data_imputed = (
    loaded_frame.copy()
    for loaded_frame in (master_data_orig, master_data_dropped_orig, master_data_imputed_orig)
)

In [13]:
# drop GAME_DATE column from master_data_dropped and master_data_imputed
# errors="ignore" keeps this cell re-runnable: on a second run the column is already gone
# and a plain drop() would raise KeyError
master_data_dropped = master_data_dropped.drop(columns=["GAME_DATE"], errors="ignore")
master_data_imputed = master_data_imputed.drop(columns=["GAME_DATE"], errors="ignore")

# convert POS_ columns to boolean in master_data_dropped
# NOTE(review): master_data_imputed is not converted here; presumably it carries no POS_ columns — confirm
pos_columns = [col for col in master_data_dropped.columns if col.startswith("POS_")]
# astype with a per-column mapping returns a new frame and is a no-op when pos_columns is empty
master_data_dropped = master_data_dropped.astype({col: bool for col in pos_columns})

In [14]:
# define target variable and features list
target_variable = "MADE_SHOT"

# columns excluded from every feature list (includes the target itself)
cols_to_exclude = [
    target_variable, "GAME_ID", "PLAYER_ID", "PLAYER_NAME", "TEAM_ID", "TEAM_NAME",
    "LAT", "LON", "D_LAT", "D_LON", "FLIGHT_TIME_MIN", "HOME_TEAM", "AWAY_TEAM", "IS_3PT"
]

# Continuous features to normalize (z-score)
continuous_features = [
    "BODY_FAT_PCT",
    "DISTANCE_KM",
    "HAND_LENGTH_CM",
    "HAND_WIDTH_CM",
    "HEIGHT_CM",
    "LANE_AGILITY_TIME_S",
    "LOC_X_CM",
    "LOC_Y_CM",
    "MAX_VERTICAL_LEAP_CM",
    "REST_D",
    "SEASON",  # normalize to allow learning an overall trend
    "SHOT_DISTANCE_CM",
    "STANDING_REACH_CM",
    "STANDING_VERTICAL_LEAP_CM",
    "THREE_QUARTER_SPRINT_S",
    "TIME_LEFT_S",
    "TZ_SHIFT",
    "WEIGHT_KG",
    "WINGSPAN_CM",
]


def _one_hot_quarter(df):
    """Return ``df`` with the QUARTER column replaced by QUARTER_<value> indicator columns.

    A frame without a QUARTER column (for example one already encoded by an
    earlier run of this cell) is returned unchanged, so the cell can be re-run.
    """
    if "QUARTER" not in df.columns:
        return df
    return pd.get_dummies(df, columns=["QUARTER"], prefix="QUARTER", drop_first=False)


# One-hot encode QUARTER
master_data_dropped = _one_hot_quarter(master_data_dropped)
master_data_imputed = _one_hot_quarter(master_data_imputed)

# Ensure we only scale columns that exist (robust to missing columns)
continuous_features_present = [c for c in continuous_features if c in master_data_dropped.columns]

# Build the feature lists once, after one-hot encoding, so they contain the QUARTER_* columns
features_list = master_data_dropped.columns.difference(cols_to_exclude).tolist()

features_list_dropped = features_list
features_list_imputed = [col for col in features_list if not col.startswith("POS_")]

# features_list_imputed is derived from master_data_dropped's columns; fail here with a clear
# message if master_data_imputed lacks any of them, rather than later inside a model/PCA call
missing_in_imputed = [col for col in features_list_imputed if col not in master_data_imputed.columns]
if missing_in_imputed:
    raise KeyError(f"master_data_imputed is missing feature columns: {missing_in_imputed}")

features_list_imputed

['BODY_FAT_PCT',
 'DISTANCE_KM',
 'HAND_LENGTH_CM',
 'HAND_WIDTH_CM',
 'HEIGHT_CM',
 'LANE_AGILITY_TIME_S',
 'LOC_X_CM',
 'LOC_Y_CM',
 'MAX_VERTICAL_LEAP_CM',
 'QUARTER_1',
 'QUARTER_2',
 'QUARTER_3',
 'QUARTER_4',
 'QUARTER_5',
 'QUARTER_6',
 'QUARTER_7',
 'QUARTER_8',
 'REST_D',
 'SEASON',
 'SHOT_DISTANCE_CM',
 'STANDING_REACH_CM',
 'STANDING_VERTICAL_LEAP_CM',
 'THREE_QUARTER_SPRINT_S',
 'TIME_LEFT_S',
 'TZ_SHIFT',
 'WEIGHT_KG',
 'WINGSPAN_CM']

In [15]:
# randomize order in master_data_dropped and master_data_imputed
def _shuffle_rows(frame):
    """Return every row of ``frame`` in a seeded random order with a fresh 0..n-1 index."""
    reordered = frame.sample(frac=1, random_state=RANDOM_STATE)
    return reordered.reset_index(drop=True)


master_data_dropped = _shuffle_rows(master_data_dropped)
master_data_imputed = _shuffle_rows(master_data_imputed)

In [16]:
# perform PCA for 90% variance retention on master_data_dropped and master_data_imputed
def perform_pca(df, features_list, variance_threshold=0.9, standardize=False):
    """Project ``df[features_list]`` onto its leading principal components.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named in ``features_list``.
    features_list : list of str
        Columns fed to the PCA.
    variance_threshold : float, default 0.9
        Passed unchanged as ``PCA(n_components=...)``; a value strictly between
        0 and 1 keeps the fewest components whose cumulative explained variance
        exceeds that fraction.
    standardize : bool, default False
        When True, z-score every feature with ``StandardScaler`` before fitting.
        PCA centers its input but does not rescale it, so without this the
        components are dominated by the features with the largest numeric range.
        The default keeps the previous (unscaled) behaviour.

    Returns
    -------
    numpy.ndarray
        Transformed data, one row per row of ``df`` and one column per retained
        component.
    """
    X = df[features_list]
    if standardize:
        X = StandardScaler().fit_transform(X)
    pca = PCA(n_components=variance_threshold, random_state=RANDOM_STATE)
    X_pca = pca.fit_transform(X)
    print(
        f"PCA reduced {len(features_list)} features to {X_pca.shape[1]} components to retain {variance_threshold * 100}% variance.")
    # Print the explained-variance ratio of the first two components; slicing instead of
    # indexing [0] and [1] avoids an IndexError when only one component is retained
    for component_rank, ratio in enumerate(pca.explained_variance_ratio_[:2], start=1):
        print(f"PCA Component {component_rank} explained variance ratio:", ratio)
    return X_pca


# Run the PCA on each dataset variant with its own feature list (dropped first, then imputed)
pca_dropped, pca_imputed = (
    perform_pca(frame, feature_columns)
    for frame, feature_columns in (
        (master_data_dropped, features_list_dropped),
        (master_data_imputed, features_list_imputed),
    )
)


PCA reduced 32 features to 3 components to retain 90.0% variance.
PCA Component 1 explained variance ratio: 0.7085438911463618
PCA Component 2 explained variance ratio: 0.13519525016841136
PCA reduced 27 features to 3 components to retain 90.0% variance.
PCA Component 1 explained variance ratio: 0.7082841512138878
PCA Component 2 explained variance ratio: 0.13340619831473907
