In [1]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import requests
import os

In [2]:
# Seed passed as random_state to every DataFrame.sample call in this notebook
# (full-frame shuffling and sub-sampling before model fitting).
RANDOM_STATE = 42

In [4]:
# Set to True to download data from GitHub, False to load from local processed directory
DOWNLOAD = True

# Seconds to wait for GitHub before giving up on a single file
# (without a timeout, requests.get can block indefinitely).
REQUEST_TIMEOUT_S = 60

# Parquet files that make up the processed dataset
filenames = [
    "master_data.parquet",
    "master_data_dropped.parquet",
    "master_data_imputed.parquet",
]

if DOWNLOAD:
    # Download the Parquet files from the GitHub repository
    directory = "./downloads/"
    data_dir = directory

    # Common URL parts
    base_url = "https://github.com/fbec76/sas-curiosity-cup-2026/raw/refs/heads/main/datasets/processed/"

    # Create the target directory once, up front
    os.makedirs(directory, exist_ok=True)

    for fname in filenames:
        url = base_url + fname + "?download="
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
            # Turn HTTP error statuses into exceptions so that network errors and
            # HTTP errors are reported through the same path.
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Failed to download: {fname} ({exc})")
            continue

        with open(os.path.join(directory, fname), "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {fname}")
else:
    data_dir = "../datasets/processed/"

# Fail early with one clear message if any expected file is absent
# (e.g. a download failed and no earlier copy exists on disk).
missing_files = [
    fname for fname in filenames
    if not os.path.exists(os.path.join(data_dir, fname))
]
if missing_files:
    raise FileNotFoundError(f"Missing data files in {data_dir}: {missing_files}")

# Load the three datasets from whichever directory was selected above
master_data_orig = pd.read_parquet(os.path.join(data_dir, "master_data.parquet"))
master_data_dropped_orig = pd.read_parquet(os.path.join(data_dir, "master_data_dropped.parquet"))
master_data_imputed_orig = pd.read_parquet(os.path.join(data_dir, "master_data_imputed.parquet"))

if DOWNLOAD:
    print("Data loaded from downloads directory.")
else:
    print("Data loaded from processed directory.")


Downloaded: master_data.parquet
Downloaded: master_data_dropped.parquet
Downloaded: master_data_imputed.parquet
Data loaded from downloads directory.


In [5]:
# Work on copies so the freshly loaded *_orig frames are never modified
master_data, master_data_dropped, master_data_imputed = (
    original.copy()
    for original in (master_data_orig, master_data_dropped_orig, master_data_imputed_orig)
)

In [6]:
# Drop the GAME_DATE column from master_data_dropped and master_data_imputed.
# errors="ignore" keeps this cell re-runnable: because the frames are reassigned
# in place, a second run would otherwise raise KeyError on the already-removed column.
master_data_dropped = master_data_dropped.drop(columns=["GAME_DATE"], errors="ignore")
master_data_imputed = master_data_imputed.drop(columns=["GAME_DATE"], errors="ignore")

# Convert POS_ columns to boolean in master_data_dropped
# NOTE(review): astype(bool) maps NaN to True; this assumes the POS_ columns of
# master_data_dropped contain no missing values — confirm.
pos_columns = [col for col in master_data_dropped.columns if col.startswith("POS_")]
master_data_dropped[pos_columns] = master_data_dropped[pos_columns].astype(bool)

In [7]:
# Define the target variable and the feature lists
target_variable = "MADE_SHOT"

# Columns excluded from the feature set (includes the target itself)
cols_to_exclude = [
    target_variable, "GAME_ID", "PLAYER_ID", "PLAYER_NAME", "TEAM_ID", "TEAM_NAME",
    "LAT", "LON", "D_LAT", "D_LON", "FLIGHT_TIME_MIN", "HOME_TEAM", "AWAY_TEAM", "IS_3PT"
]

# Continuous features to normalize (z-score)
continuous_features = [
    "BODY_FAT_PCT",
    "DISTANCE_KM",
    "HAND_LENGTH_CM",
    "HAND_WIDTH_CM",
    "HEIGHT_CM",
    "LANE_AGILITY_TIME_S",
    "LOC_X_CM",
    "LOC_Y_CM",
    "MAX_VERTICAL_LEAP_CM",
    "REST_D",
    "SEASON",  # normalize to allow learning an overall trend
    "SHOT_DISTANCE_CM",
    "STANDING_REACH_CM",
    "STANDING_VERTICAL_LEAP_CM",
    "THREE_QUARTER_SPRINT_S",
    "TIME_LEFT_S",
    "TZ_SHIFT",
    "WEIGHT_KG",
    "WINGSPAN_CM",
]

# One-hot encode QUARTER. The membership guard makes the cell safe to re-run:
# once QUARTER has been replaced by its QUARTER_* dummies the step is skipped.
if "QUARTER" in master_data_dropped.columns:
    master_data_dropped = pd.get_dummies(
        master_data_dropped,
        columns=["QUARTER"],
        prefix="QUARTER",
        drop_first=False
    )

if "QUARTER" in master_data_imputed.columns:
    master_data_imputed = pd.get_dummies(
        master_data_imputed,
        columns=["QUARTER"],
        prefix="QUARTER",
        drop_first=False
    )

# Continuous columns that actually exist in master_data_dropped.
# Not referenced elsewhere in the visible notebook (svm_classifier_with_scaling
# filters the list itself); kept so the name stays available.
continuous_features_present = [c for c in continuous_features if c in master_data_dropped.columns]

# Build the feature lists once, AFTER one-hot encoding, so the new QUARTER_*
# columns are included. (Index.difference returns the names in sorted order.)
features_list = master_data_dropped.columns.difference(cols_to_exclude).tolist()

# Independent copy, so mutating one list can never silently change the other
features_list_dropped = list(features_list)

# The imputed dataset uses the same features minus the POS_ columns.
# NOTE(review): this list is derived from master_data_dropped's columns; confirm
# master_data_imputed contains every one of them (e.g. the same QUARTER_* dummies).
features_list_imputed = [col for col in features_list if not col.startswith("POS_")]

In [8]:
def svm_classifier_with_scaling(
        df,
        target,
        features_list,
        continuous_features,
        cv_folds=5,
        scoring="f1",
        kernel="rbf",  # "rbf" or "poly"
):
    """Grid-search an SVC with scaled continuous features and print CV metrics.

    A pipeline is built that z-scores the continuous columns with
    ``StandardScaler`` (all remaining feature columns are passed through
    unchanged) and feeds the result to ``SVC``. Hyperparameters are tuned with
    ``GridSearchCV``; cross-validated predictions of the selected pipeline are
    then used to print a classification report and a confusion matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    target : str
        Name of the target column in ``df``.
    features_list : list of str
        Columns of ``df`` used as model inputs.
    continuous_features : list of str
        Candidate columns to standardize; only those that also appear in
        ``features_list`` are scaled.
    cv_folds : int, default 5
        Passed as ``cv`` to both ``GridSearchCV`` and ``cross_val_predict``.
    scoring : str, default "f1"
        Scoring metric used by the grid search to pick the best parameters.
    kernel : {"rbf", "poly"}, default "rbf"
        SVC kernel; it also selects which parameter grid is searched.

    Returns
    -------
    The best pipeline found by the grid search (``best_estimator_``).

    Raises
    ------
    ValueError
        If ``kernel`` is neither "rbf" nor "poly".
    KeyError
        If ``target`` or any entry of ``features_list`` is not a column of ``df``.

    Notes
    -----
    The reported metrics come from ``cross_val_predict`` run on the same data
    and with the same ``cv`` setting that selected the hyperparameters (this is
    not nested cross-validation), so they may be optimistic.
    """
    # Validate the cheap arguments first, before any data is touched or timed
    if kernel not in {"rbf", "poly"}:
        raise ValueError("kernel must be 'rbf' or 'poly'")

    # Report every missing column at once instead of failing inside pandas indexing
    missing_columns = [c for c in [*features_list, target] if c not in df.columns]
    if missing_columns:
        raise KeyError(f"Columns not found in df: {missing_columns}")

    start_time = pd.Timestamp.now()

    X = df[features_list]
    y = df[target]

    # Only scale columns that are actually used and exist
    continuous_present = [c for c in continuous_features if c in features_list]

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), continuous_present),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )

    # Scaling lives inside the pipeline, so it is re-fitted on each training fold
    pipe = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("svc", SVC(kernel=kernel)),
        ]
    )

    # C and gamma are searched for both kernels; the polynomial kernel
    # additionally searches its degree and coef0.
    param_grid = {
        "svc__C": [0.1, 1, 10, 100],
        "svc__gamma": ["scale", "auto", 1e-3, 1e-2, 1e-1],
    }
    if kernel == "poly":
        param_grid["svc__degree"] = [2, 3, 4, 5]
        param_grid["svc__coef0"] = [0.0, 0.5, 1.0]

    grid_search = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        cv=cv_folds,
        scoring=scoring,
        n_jobs=-1,
    )
    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_

    # CV predictions from the selected (tuned) pipeline
    y_pred_cv = cross_val_predict(best_model, X, y, cv=cv_folds, n_jobs=-1)

    print("Scaled continuous columns (present):", continuous_present)
    print("Best Hyperparameters:", grid_search.best_params_)
    print("Classification Report (CV preds):\n", classification_report(y, y_pred_cv))
    print("Confusion Matrix (CV preds):\n", confusion_matrix(y, y_pred_cv))
    print("Time taken:", pd.Timestamp.now() - start_time)

    return best_model

In [9]:
# Shuffle the rows of both working frames (seeded) and renumber the index from 0
master_data_dropped, master_data_imputed = (
    frame.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
    for frame in (master_data_dropped, master_data_imputed)
)

In [15]:
# Run SVM with RBF kernel on a 1% sample of master_data_dropped, using 2-fold CV
# (a smaller sample and fewer folds than the 25% / 5-fold runs in the cells below)
best_svm_rbf = svm_classifier_with_scaling(
    df=master_data_dropped.sample(frac=0.01, random_state=RANDOM_STATE),
    target=target_variable,
    features_list=features_list_dropped,
    continuous_features=continuous_features,
    cv_folds=2,
    scoring="f1",
    kernel="rbf",
)

Scaled continuous columns (present): ['BODY_FAT_PCT', 'DISTANCE_KM', 'HAND_LENGTH_CM', 'HAND_WIDTH_CM', 'HEIGHT_CM', 'LANE_AGILITY_TIME_S', 'LOC_X_CM', 'LOC_Y_CM', 'MAX_VERTICAL_LEAP_CM', 'REST_D', 'SEASON', 'SHOT_DISTANCE_CM', 'STANDING_REACH_CM', 'STANDING_VERTICAL_LEAP_CM', 'THREE_QUARTER_SPRINT_S', 'TIME_LEFT_S', 'TZ_SHIFT', 'WEIGHT_KG', 'WINGSPAN_CM']
Best Hyperparameters: {'svc__C': 1, 'svc__gamma': 0.001}
Classification Report (CV preds):
               precision    recall  f1-score   support

       False       0.61      0.58      0.60     13834
        True       0.55      0.59      0.57     12112

    accuracy                           0.58     25946
   macro avg       0.58      0.58      0.58     25946
weighted avg       0.58      0.58      0.58     25946

Confusion Matrix (CV preds):
 [[8001 5833]
 [5015 7097]]
Time taken: 0 days 00:04:03.388683


In [None]:
# RBF-kernel SVM on the imputed data: 25% seeded sample, 5-fold grid search scored by F1
best_svm_rbf_imputed = svm_classifier_with_scaling(
    kernel="rbf",
    df=master_data_imputed.sample(frac=0.25, random_state=RANDOM_STATE),
    target=target_variable,
    features_list=features_list_imputed,
    continuous_features=continuous_features,
    scoring="f1",
    cv_folds=5,
)

In [None]:
# Polynomial-kernel SVM on the dropped data: 25% seeded sample, 5-fold grid search scored by F1
best_svm_poly = svm_classifier_with_scaling(
    kernel="poly",
    df=master_data_dropped.sample(frac=0.25, random_state=RANDOM_STATE),
    target=target_variable,
    features_list=features_list_dropped,
    continuous_features=continuous_features,
    scoring="f1",
    cv_folds=5,
)

In [None]:
# Polynomial-kernel SVM on the imputed data: 25% seeded sample, 5-fold grid search scored by F1
best_svm_poly_imputed = svm_classifier_with_scaling(
    kernel="poly",
    df=master_data_imputed.sample(frac=0.25, random_state=RANDOM_STATE),
    target=target_variable,
    features_list=features_list_imputed,
    continuous_features=continuous_features,
    scoring="f1",
    cv_folds=5,
)