In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA

import requests
import os

In [3]:
# Seed passed as `random_state` to the DataFrame.sample() calls below (shuffling and sub-sampling).
RANDOM_STATE = 42

In [4]:
# Set to True to download data from GitHub, False to load from local processed directory
DOWNLOAD = False

# Parquet files that make up the processed dataset
filenames = [
    "master_data.parquet",
    "master_data_dropped.parquet",
    "master_data_imputed.parquet",
]

if DOWNLOAD:
    # Download the parquet files from the GitHub repository
    directory = "./downloads/"
    base_url = "https://github.com/fbec76/sas-curiosity-cup-2026/raw/refs/heads/main/datasets/processed/"

    os.makedirs(directory, exist_ok=True)
    for fname in filenames:
        url = base_url + fname + "?download="
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(url, timeout=60)
        if not response.ok:
            # Stop here: continuing would either fail later in read_parquet with a
            # less helpful error or silently load a stale file from a previous run.
            raise RuntimeError(f"Failed to download: {fname} (HTTP {response.status_code})")
        with open(os.path.join(directory, fname), "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {fname}")

    data_dir = directory
    source_label = "downloads"
else:
    data_dir = "../datasets/processed/"
    source_label = "processed"

# Load the three frames from whichever directory was selected above
master_data_orig = pd.read_parquet(data_dir + "master_data.parquet")
master_data_dropped_orig = pd.read_parquet(data_dir + "master_data_dropped.parquet")
master_data_imputed_orig = pd.read_parquet(data_dir + "master_data_imputed.parquet")
print(f"Data loaded from {source_label} directory.")


Data loaded from processed directory.


In [7]:
# Work on copies so the frames loaded from disk (*_orig) stay untouched.
master_data, master_data_dropped, master_data_imputed = (
    loaded.copy()
    for loaded in (master_data_orig, master_data_dropped_orig, master_data_imputed_orig)
)

In [8]:
def _fit_linear_classifier(
        df,
        target,
        features_list,
        continuous_features,
        cv_folds=5,
        scoring="f1",
        model_type="logreg",  # "logreg" or "sgd"
        penalty_type="lasso",  # "lasso" or "ridge"
        sgd_max_iter=2000,
        sgd_tol=1e-3,
        random_state=42,
):
    """Tune and evaluate a penalised linear classifier with cross-validation.

    The features in ``continuous_features`` that also appear in
    ``features_list`` are z-scored with ``StandardScaler``; all other features
    are passed through unchanged. The regularisation strength is selected with
    ``GridSearchCV`` (``C`` for ``"logreg"``, ``alpha`` for ``"sgd"``), and
    cross-validated predictions of the selected pipeline are printed as a
    classification report and confusion matrix.

    Args:
        df: DataFrame containing ``target`` and every column in ``features_list``.
        target: Name of the label column.
        features_list: Columns used as model inputs.
        continuous_features: Candidate columns to standardise.
        cv_folds: ``cv`` argument for both the grid search and ``cross_val_predict``.
        scoring: ``scoring`` argument for the grid search.
        model_type: ``"logreg"`` (``LogisticRegression`` with the saga solver)
            or ``"sgd"`` (``SGDClassifier`` with ``loss="log_loss"``).
        penalty_type: ``"lasso"`` (L1) or ``"ridge"`` (L2).
        sgd_max_iter: ``max_iter`` for the SGD classifier.
        sgd_tol: ``tol`` for the SGD classifier.
        random_state: Seed handed to the classifier. Both supported solvers are
            stochastic, so seeding keeps repeated runs comparable.

    Returns:
        The grid search's ``best_estimator_``: a ``Pipeline`` with the steps
        ``"preprocess"`` and ``"clf"``. Coefficients live on the ``"clf"``
        step, ordered as the ``"preprocess"`` step outputs its columns.

    Raises:
        ValueError: If ``model_type`` or ``penalty_type`` is not a supported value.
    """
    # Validate the string options before importing sklearn or touching the data,
    # so a typo fails immediately. Previously any penalty_type other than
    # "lasso" was silently treated as ridge.
    if model_type not in ("logreg", "sgd"):
        raise ValueError("model_type must be 'logreg' or 'sgd'")
    if penalty_type not in ("lasso", "ridge"):
        raise ValueError("penalty_type must be 'lasso' or 'ridge'")

    import pandas as pd

    from sklearn.compose import ColumnTransformer
    from sklearn.linear_model import LogisticRegression, SGDClassifier
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.model_selection import GridSearchCV, cross_val_predict
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    X = df[features_list]
    y = df[target]

    start_time = pd.Timestamp.now()

    # Only scale continuous columns that are actually part of the feature set
    continuous_present = [c for c in continuous_features if c in features_list]

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), continuous_present),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )

    use_l1 = penalty_type == "lasso"
    if model_type == "logreg":
        # Elastic-net LogisticRegression via saga; l1_ratio 1.0 is pure L1, 0.0 is pure L2
        clf = LogisticRegression(
            penalty="elasticnet",
            solver="saga",
            l1_ratio=1.0 if use_l1 else 0.0,
            max_iter=10000,
            random_state=random_state,
        )
        param_grid = {"clf__C": [0.01, 0.1, 1, 10, 100]}
    else:
        # SGDClassifier approximates logistic regression when loss='log_loss'
        clf = SGDClassifier(
            loss="log_loss",
            penalty="l1" if use_l1 else "l2",
            max_iter=sgd_max_iter,
            tol=sgd_tol,
            random_state=random_state,
        )
        param_grid = {"clf__alpha": [1e-2, 1e-3, 1e-4, 1e-5]}

    pipe = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("clf", clf),
        ]
    )

    grid_search = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        cv=cv_folds,
        scoring=scoring,
        n_jobs=-1,
    )
    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_

    # CV predictions from the selected (tuned) pipeline
    y_pred_cv = cross_val_predict(best_model, X, y, cv=cv_folds, n_jobs=-1)

    print("Scaled continuous columns (present):", continuous_present)
    print("Best Hyperparameters:", grid_search.best_params_)
    print("Classification Report (CV preds):\n", classification_report(y, y_pred_cv))
    print("Confusion Matrix (CV preds):\n", confusion_matrix(y, y_pred_cv))
    print("Time taken:", pd.Timestamp.now() - start_time)

    return best_model


def lasso_logistic_regression(
        df,
        target,
        features_list,
        continuous_features,
        cv_folds=5,
        model_type="logreg",
        scoring="f1",
        sgd_max_iter=2000,
        sgd_tol=1e-3,
        random_state=42,
):
    """Fit the L1-penalised (LASSO) variant of ``_fit_linear_classifier``.

    All arguments are forwarded unchanged; ``penalty_type`` is fixed to
    ``"lasso"``. Returns the tuned pipeline produced by the helper.
    """
    return _fit_linear_classifier(
        df=df,
        target=target,
        features_list=features_list,
        continuous_features=continuous_features,
        cv_folds=cv_folds,
        scoring=scoring,
        model_type=model_type,
        penalty_type="lasso",
        sgd_max_iter=sgd_max_iter,
        sgd_tol=sgd_tol,
        random_state=random_state,
    )


def ridge_logistic_regression(
        df,
        target,
        features_list,
        continuous_features,
        cv_folds=5,
        model_type="logreg",
        scoring="f1",
        sgd_max_iter=2000,
        sgd_tol=1e-3,
        random_state=42,
):
    """Fit the L2-penalised (ridge) variant of ``_fit_linear_classifier``.

    All arguments are forwarded unchanged; ``penalty_type`` is fixed to
    ``"ridge"``. Returns the tuned pipeline produced by the helper.
    """
    return _fit_linear_classifier(
        df=df,
        target=target,
        features_list=features_list,
        continuous_features=continuous_features,
        cv_folds=cv_folds,
        scoring=scoring,
        model_type=model_type,
        penalty_type="ridge",
        sgd_max_iter=sgd_max_iter,
        sgd_tol=sgd_tol,
        random_state=random_state,
    )

In [9]:
# Drop the GAME_DATE column from master_data_dropped and master_data_imputed.
# errors="ignore" keeps this cell re-runnable: a second execution would otherwise
# raise KeyError because the column has already been removed.
master_data_dropped = master_data_dropped.drop(columns=["GAME_DATE"], errors="ignore")
master_data_imputed = master_data_imputed.drop(columns=["GAME_DATE"], errors="ignore")

# Convert POS_ columns to boolean in master_data_dropped
# NOTE(review): astype(bool) maps NaN to True — presumably this frame has no
# missing values in the POS_ columns; confirm against the preprocessing step.
pos_columns = [col for col in master_data_dropped.columns if col.startswith("POS_")]
master_data_dropped[pos_columns] = master_data_dropped[pos_columns].astype(bool)

In [10]:
# Target variable
target_variable = "MADE_SHOT"

# Columns that are never used as predictors (the target itself plus the columns listed here)
cols_to_exclude = [
    target_variable, "GAME_ID", "PLAYER_ID", "PLAYER_NAME", "TEAM_ID", "TEAM_NAME",
    "LAT", "LON", "D_LAT", "D_LON", "FLIGHT_TIME_MIN", "HOME_TEAM", "AWAY_TEAM", "IS_3PT"
]

# Continuous features to normalize (z-score)
continuous_features = [
    "BODY_FAT_PCT",
    "DISTANCE_KM",
    "HAND_LENGTH_CM",
    "HAND_WIDTH_CM",
    "HEIGHT_CM",
    "LANE_AGILITY_TIME_S",
    "LOC_X_CM",
    "LOC_Y_CM",
    "MAX_VERTICAL_LEAP_CM",
    "REST_D",
    "SEASON",  # normalize to allow learning an overall trend
    "SHOT_DISTANCE_CM",
    "STANDING_REACH_CM",
    "STANDING_VERTICAL_LEAP_CM",
    "THREE_QUARTER_SPRINT_S",
    "TIME_LEFT_S",
    "TZ_SHIFT",
    "WEIGHT_KG",
    "WINGSPAN_CM",
]

# One-hot encode QUARTER in both frames. The membership check makes the cell
# safe to re-run: once encoded, the QUARTER column no longer exists.
if "QUARTER" in master_data_dropped.columns:
    master_data_dropped = pd.get_dummies(
        master_data_dropped,
        columns=["QUARTER"],
        prefix="QUARTER",
        drop_first=False
    )

if "QUARTER" in master_data_imputed.columns:
    master_data_imputed = pd.get_dummies(
        master_data_imputed,
        columns=["QUARTER"],
        prefix="QUARTER",
        drop_first=False
    )

# Ensure we only scale columns that exist (robust to missing columns)
continuous_features_present = [c for c in continuous_features if c in master_data_dropped.columns]

# Build the feature lists once, after one-hot encoding, so they contain the
# QUARTER_* dummy columns instead of the raw QUARTER column.
features_list = master_data_dropped.columns.difference(cols_to_exclude).tolist()

features_list_dropped = features_list
features_list_imputed = [col for col in features_list if not col.startswith("POS_")]

# features_list_imputed is derived from master_data_dropped's columns; fail here
# with a clear message rather than with a KeyError deep inside model fitting.
missing_in_imputed = [col for col in features_list_imputed if col not in master_data_imputed.columns]
assert not missing_in_imputed, f"master_data_imputed is missing feature columns: {missing_in_imputed}"

In [11]:
# Shuffle the rows of both modelling frames with a fixed seed and renumber the index from 0.
master_data_dropped, master_data_imputed = (
    frame.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
    for frame in (master_data_dropped, master_data_imputed)
)

In [13]:
# Fit the LASSO (L1) SGD classifier on a seeded 25% sample of master_data_dropped
print("Running LASSO logistic regression on master_data_dropped (25% sample)...")

lasso_model_dropped = lasso_logistic_regression(
    master_data_dropped.sample(frac=0.25, random_state=RANDOM_STATE),
    target_variable,
    features_list_dropped,
    continuous_features,
    model_type="sgd",
    scoring="f1",
    cv_folds=5,
)

Running LASSO logistic regression on master_data_dropped (25% sample)...


KeyboardInterrupt: 

In [None]:
# Get the coefficients of the LASSO model and sort them by absolute value.
# lasso_model_dropped is a Pipeline, so the coefficients live on its "clf" step,
# and the "preprocess" ColumnTransformer reorders columns (scaled columns first,
# passthrough columns after). The labels therefore come from the fitted
# transformer rather than from features_list_dropped.
lasso_coefficients = pd.Series(
    lasso_model_dropped.named_steps["clf"].coef_[0],
    index=lasso_model_dropped.named_steps["preprocess"].get_feature_names_out(),
)
lasso_coefficients_sorted = lasso_coefficients.reindex(lasso_coefficients.abs().sort_values(ascending=False).index)
print("LASSO Coefficients sorted by absolute value:")
lasso_coefficients_sorted

In [None]:
# Fit the Ridge (L2) SGD classifier on a seeded 25% sample of master_data_dropped
print("Running Ridge logistic regression on master_data_dropped (25% sample)...")

ridge_model_dropped = ridge_logistic_regression(
    master_data_dropped.sample(frac=0.25, random_state=RANDOM_STATE),
    target_variable,
    features_list_dropped,
    continuous_features,
    model_type="sgd",
    scoring="f1",
    cv_folds=5,
)

In [None]:
# Get the coefficients of the Ridge model and sort them by absolute value.
# ridge_model_dropped is a Pipeline, so the coefficients live on its "clf" step,
# and the "preprocess" ColumnTransformer reorders columns (scaled columns first,
# passthrough columns after). The labels therefore come from the fitted
# transformer rather than from features_list_dropped.
ridge_coefficients = pd.Series(
    ridge_model_dropped.named_steps["clf"].coef_[0],
    index=ridge_model_dropped.named_steps["preprocess"].get_feature_names_out(),
)
ridge_coefficients_sorted = ridge_coefficients.reindex(ridge_coefficients.abs().sort_values(ascending=False).index)
print("Ridge Coefficients sorted by absolute value:")
ridge_coefficients_sorted

In [None]:
# Fit the LASSO (L1) SGD classifier on a seeded 25% sample of master_data_imputed
print("Running LASSO logistic regression on master_data_imputed (25% sample)...")

lasso_model_imputed = lasso_logistic_regression(
    master_data_imputed.sample(frac=0.25, random_state=RANDOM_STATE),
    target_variable,
    features_list_imputed,
    continuous_features,
    model_type="sgd",
    scoring="f1",
    cv_folds=5,
)

In [None]:
# Get the coefficients of the LASSO model (imputed data) and sort them by absolute value.
# lasso_model_imputed is a Pipeline, so the coefficients live on its "clf" step,
# and the "preprocess" ColumnTransformer reorders columns (scaled columns first,
# passthrough columns after). The labels therefore come from the fitted
# transformer rather than from features_list_imputed.
lasso_coefficients_imputed = pd.Series(
    lasso_model_imputed.named_steps["clf"].coef_[0],
    index=lasso_model_imputed.named_steps["preprocess"].get_feature_names_out(),
)
lasso_coefficients_imputed_sorted = lasso_coefficients_imputed.reindex(
    lasso_coefficients_imputed.abs().sort_values(ascending=False).index)
print("LASSO Coefficients (imputed) sorted by absolute value:")
lasso_coefficients_imputed_sorted

In [None]:
# Run Ridge logistic regression on a seeded 25% sample of master_data_imputed
# (the sample fraction below and the printed message are both 25%).
print("Running Ridge logistic regression on master_data_imputed (25% sample)...")

ridge_model_imputed = ridge_logistic_regression(
    master_data_imputed.sample(frac=0.25, random_state=RANDOM_STATE),
    target_variable,
    features_list_imputed,
    continuous_features,
    model_type="sgd",
    scoring="f1",
    cv_folds=5,
)

In [None]:
# Get the coefficients of the Ridge model (imputed data) and sort them by absolute value.
# ridge_model_imputed is a Pipeline, so the coefficients live on its "clf" step,
# and the "preprocess" ColumnTransformer reorders columns (scaled columns first,
# passthrough columns after). The labels therefore come from the fitted
# transformer rather than from features_list_imputed.
ridge_coefficients_imputed = pd.Series(
    ridge_model_imputed.named_steps["clf"].coef_[0],
    index=ridge_model_imputed.named_steps["preprocess"].get_feature_names_out(),
)
ridge_coefficients_imputed_sorted = ridge_coefficients_imputed.reindex(
    ridge_coefficients_imputed.abs().sort_values(ascending=False).index)
print("Ridge Coefficients (imputed) sorted by absolute value:")
ridge_coefficients_imputed_sorted