In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA

import requests
import os

In [2]:
# Seed passed to DataFrame.sample when the frames are shuffled, so reruns give the same row order.
RANDOM_STATE = 42

In [3]:
# Set to True to download data from GitHub, False to load from local processed directory
DOWNLOAD = False

if DOWNLOAD:
    # Download the processed parquet files from the GitHub repository
    directory = "./downloads/"
    filenames = [
        "master_data.parquet",
        "master_data_dropped.parquet",
        "master_data_imputed.parquet"
    ]

    # Create the target directory once; exist_ok replaces the per-file check-then-create.
    os.makedirs(directory, exist_ok=True)

    # Common URL parts
    base_url = "https://github.com/fbec76/sas-curiosity-cup-2026/raw/refs/heads/main/datasets/processed/"
    for fname in filenames:
        url = base_url + fname + "?download="
        # Without a timeout a stalled connection would block the notebook indefinitely.
        response = requests.get(url, timeout=60)
        if response.ok:
            with open(os.path.join(directory, fname), "wb") as f:
                f.write(response.content)
            print(f"Downloaded: {fname}")
        else:
            # Best-effort: only report the failure. The read below raises if the
            # file is not already present from an earlier run.
            print(f"Failed to download: {fname}")
    data_dir = directory
    source_label = "downloads"
else:
    data_dir = "../datasets/processed/"
    source_label = "processed"

# Single load path for both branches so the three file names are spelled out only once.
master_data_orig = pd.read_parquet(os.path.join(data_dir, "master_data.parquet"))
master_data_dropped_orig = pd.read_parquet(os.path.join(data_dir, "master_data_dropped.parquet"))
master_data_imputed_orig = pd.read_parquet(os.path.join(data_dir, "master_data_imputed.parquet"))
print(f"Data loaded from {source_label} directory.")


Data loaded from processed directory.


In [4]:
# Take independent working copies so later cells can modify them without
# touching the *_orig frames that were read from disk.
master_data, master_data_dropped, master_data_imputed = (
    loaded_frame.copy()
    for loaded_frame in (master_data_orig, master_data_dropped_orig, master_data_imputed_orig)
)

In [5]:
def _fit_linear_classifier(
        df,
        target,
        features_list,
        continuous_features,
        cv_folds=5,
        scoring="f1",
        model_type="logreg",  # "logreg" or "sgd"
        penalty_type="lasso",  # "lasso" or "ridge"
        sgd_max_iter=2000,
        sgd_tol=1e-3,
):
    import pandas as pd

    from sklearn.compose import ColumnTransformer
    from sklearn.linear_model import LogisticRegression, SGDClassifier
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.model_selection import GridSearchCV, cross_val_predict
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    X = df[features_list]
    y = df[target]

    start_time = pd.Timestamp.now()

    # Only scale columns that are actually used and exist
    continuous_present = [c for c in continuous_features if c in features_list]

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), continuous_present),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )

    if model_type == "logreg":
        # LogisticRegression with elasticnet via saga
        l1_ratio = 1.0 if penalty_type == "lasso" else 0.0
        clf = LogisticRegression(
            penalty="elasticnet",
            solver="saga",
            l1_ratio=l1_ratio,
            max_iter=10000,
        )
        pipe = Pipeline(
            steps=[
                ("preprocess", preprocessor),
                ("clf", clf),
            ]
        )
        param_grid = {"clf__C": [0.01, 0.1, 1, 10, 100]}

    elif model_type == "sgd":
        # SGDClassifier approximates logistic regression when loss='log_loss'
        penalty = "l1" if penalty_type == "lasso" else "l2"
        clf = SGDClassifier(
            loss="log_loss",
            penalty=penalty,
            max_iter=sgd_max_iter,
            tol=sgd_tol,
            random_state=42,
        )
        pipe = Pipeline(
            steps=[
                ("preprocess", preprocessor),
                ("clf", clf),
            ]
        )
        param_grid = {"clf__alpha": [1e-2, 1e-3, 1e-4, 1e-5]}

    else:
        raise ValueError("model_type must be 'logreg' or 'sgd'")

    grid_search = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        cv=cv_folds,
        scoring=scoring,
        n_jobs=-1,
        verbose=2
    )
    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_

    # CV predictions from the selected (tuned) pipeline
    y_pred_cv = cross_val_predict(best_model, X, y, cv=cv_folds, n_jobs=-1)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Classification Report (CV preds):\n", classification_report(y, y_pred_cv))
    print("Confusion Matrix (CV preds):\n", confusion_matrix(y, y_pred_cv))
    print("Time taken:", pd.Timestamp.now() - start_time)

    return best_model


def lasso_logistic_regression(
        df,
        target,
        features_list,
        continuous_features,
        cv_folds=5,
        model_type="logreg",
        scoring="f1",
):
    """Fit the linear classifier with the "lasso" penalty.

    Thin wrapper: forwards every argument to ``_fit_linear_classifier`` with
    ``penalty_type`` fixed to ``"lasso"`` and returns whatever it returns.
    """
    fit_options = {
        "cv_folds": cv_folds,
        "scoring": scoring,
        "model_type": model_type,
        "penalty_type": "lasso",
    }
    return _fit_linear_classifier(
        df, target, features_list, continuous_features, **fit_options
    )


def ridge_logistic_regression(
        df,
        target,
        features_list,
        continuous_features,
        cv_folds=5,
        model_type="logreg",
        scoring="f1",
):
    """Fit the linear classifier with the "ridge" penalty.

    Thin wrapper: forwards every argument to ``_fit_linear_classifier`` with
    ``penalty_type`` fixed to ``"ridge"`` and returns whatever it returns.
    """
    fit_options = {
        "cv_folds": cv_folds,
        "scoring": scoring,
        "model_type": model_type,
        "penalty_type": "ridge",
    }
    return _fit_linear_classifier(
        df, target, features_list, continuous_features, **fit_options
    )
def _get_sorted_feature_coefficients(model, feature_names):
    preprocessor = model.named_steps["preprocess"]
    classifier = model.named_steps["clf"]
    try:
        expanded_features = preprocessor.get_feature_names_out(feature_names)
    except AttributeError:
        expanded_features = feature_names
    coeffs = pd.Series(classifier.coef_.ravel(), index=expanded_features)
    return coeffs.reindex(coeffs.abs().sort_values(ascending=False).index)

In [6]:
# drop GAME_DATE column from master_data_dropped and master_data_imputed
# errors="ignore" keeps this cell re-runnable: once the column is gone a plain
# drop would raise KeyError on the second execution.
master_data_dropped = master_data_dropped.drop(columns=["GAME_DATE"], errors="ignore")
master_data_imputed = master_data_imputed.drop(columns=["GAME_DATE"], errors="ignore")

# convert POS_ columns to boolean in master_data_dropped
pos_columns = [col for col in master_data_dropped.columns if col.startswith("POS_")]
master_data_dropped[pos_columns] = master_data_dropped[pos_columns].astype(bool)

In [7]:
# define target variable and the columns that are never used as features
target_variable = "MADE_SHOT"

cols_to_exclude = [
    target_variable, "GAME_ID", "PLAYER_ID", "PLAYER_NAME", "TEAM_ID", "TEAM_NAME",
    "LAT", "LON", "D_LAT", "D_LON", "FLIGHT_TIME_MIN", "HOME_TEAM", "AWAY_TEAM", "IS_3PT"
]

# Continuous features to normalize (z-score)
continuous_features = [
    "BODY_FAT_PCT",
    "DISTANCE_KM",
    "HAND_LENGTH_CM",
    "HAND_WIDTH_CM",
    "HEIGHT_CM",
    "LANE_AGILITY_TIME_S",
    "LOC_X_CM",
    "LOC_Y_CM",
    "MAX_VERTICAL_LEAP_CM",
    "REST_D",
    "SEASON",  # normalize to allow learning an overall trend
    "SHOT_DISTANCE_CM",
    "STANDING_REACH_CM",
    "STANDING_VERTICAL_LEAP_CM",
    "THREE_QUARTER_SPRINT_S",
    "TIME_LEFT_S",
    "TZ_SHIFT",
    "WEIGHT_KG",
    "WINGSPAN_CM",
]

# One-hot encode QUARTER. The membership guard makes re-running the cell a
# no-op once the column has already been expanded.
if "QUARTER" in master_data_dropped.columns:
    master_data_dropped = pd.get_dummies(
        master_data_dropped,
        columns=["QUARTER"],
        prefix="QUARTER",
        drop_first=False
    )

if "QUARTER" in master_data_imputed.columns:
    master_data_imputed = pd.get_dummies(
        master_data_imputed,
        columns=["QUARTER"],
        prefix="QUARTER",
        drop_first=False
    )

# Ensure we only scale columns that exist (robust to missing columns)
continuous_features_present = [c for c in continuous_features if c in master_data_dropped.columns]

# Build the feature lists once, after one-hot encoding, so they contain the
# QUARTER_* dummy columns instead of the raw QUARTER column.
features_list = master_data_dropped.columns.difference(cols_to_exclude).tolist()

features_list_dropped = features_list
# NOTE(review): derived from master_data_dropped's columns; presumes
# master_data_imputed has the same non-POS_ columns - confirm.
features_list_imputed = [col for col in features_list if not col.startswith("POS_")]

In [8]:
# Put the rows of master_data_dropped and master_data_imputed in a seeded random order
def _shuffle_rows(frame):
    """Return every row of ``frame`` in a seeded random order with a fresh integer index."""
    reordered = frame.sample(frac=1, random_state=RANDOM_STATE)
    return reordered.reset_index(drop=True)


master_data_dropped = _shuffle_rows(master_data_dropped)
master_data_imputed = _shuffle_rows(master_data_imputed)

In [9]:
# Run LASSO logistic regression on master_data_dropped
# The frame is not subsampled anywhere above (the shuffle uses frac=1), so the
# message reports the actual number of rows instead of a fixed "25% sample".
print(f"Running LASSO logistic regression on master_data_dropped ({len(master_data_dropped):,} rows)...")

lasso_model_dropped = lasso_logistic_regression(
    df=master_data_dropped,
    target=target_variable,
    features_list=features_list_dropped,
    continuous_features=continuous_features,
    cv_folds=2,
    model_type="sgd",
    scoring="f1",
)

Running LASSO logistic regression on master_data_dropped (25% sample)...
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] END ....................................clf__alpha=0.01; total time=  13.6s
[CV] END ....................................clf__alpha=0.01; total time=  13.9s
[CV] END ..................................clf__alpha=0.0001; total time=  13.9s
[CV] END ...................................clf__alpha=0.001; total time=  14.5s
[CV] END ...................................clf__alpha=0.001; total time=  14.5s
[CV] END ..................................clf__alpha=0.0001; total time=  14.6s
[CV] END ...................................clf__alpha=1e-05; total time=  16.6s
[CV] END ...................................clf__alpha=1e-05; total time=  16.9s
Best Hyperparameters: {'clf__alpha': 0.0001}
Classification Report (CV preds):
               precision    recall  f1-score   support

       False       0.62      0.65      0.63   1400436
        True       0.56      0.

In [21]:
# Label the fitted LASSO coefficients with their feature names, largest magnitude first
lasso_coeffs_dropped = _get_sorted_feature_coefficients(
    model=lasso_model_dropped, feature_names=features_list_dropped
)
print("LASSO coefficients sorted by absolute value (master_data_dropped):")
lasso_coeffs_dropped

LASSO coefficients sorted by absolute value (master_data_dropped):


SHOT_DISTANCE_CM            -0.434860
QUARTER_4                   -0.064848
QUARTER_5                   -0.062905
SEASON                       0.050586
HEIGHT_CM                    0.043092
POS_C                        0.033267
POS_PG                      -0.026154
TIME_LEFT_S                  0.019248
LOC_Y_CM                    -0.018627
MAX_VERTICAL_LEAP_CM        -0.009968
WINGSPAN_CM                 -0.002064
POS_SG                       0.001093
QUARTER_6                    0.000000
QUARTER_3                    0.000000
POS_PF                       0.000000
QUARTER_7                    0.000000
QUARTER_2                    0.000000
QUARTER_1                    0.000000
POS_SF                       0.000000
BODY_FAT_PCT                 0.000000
TZ_SHIFT                     0.000000
WEIGHT_KG                    0.000000
DISTANCE_KM                  0.000000
THREE_QUARTER_SPRINT_S       0.000000
STANDING_VERTICAL_LEAP_CM    0.000000
STANDING_REACH_CM            0.000000
REST_D      

In [11]:
# Run Ridge logistic regression on master_data_dropped
# The frame is not subsampled anywhere above (the shuffle uses frac=1), so the
# message reports the actual number of rows instead of a fixed "25% sample".
print(f"Running Ridge logistic regression on master_data_dropped ({len(master_data_dropped):,} rows)...")

ridge_model_dropped = ridge_logistic_regression(
    df=master_data_dropped,
    target=target_variable,
    features_list=features_list_dropped,
    continuous_features=continuous_features,
    cv_folds=2,
    model_type="sgd",
    scoring="f1",
)

Running Ridge logistic regression on master_data_dropped (25% sample)...
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] END ...................................clf__alpha=0.001; total time=  12.0s
[CV] END ....................................clf__alpha=0.01; total time=  12.1s
[CV] END ....................................clf__alpha=0.01; total time=  12.3s
[CV] END ...................................clf__alpha=0.001; total time=  12.5s
[CV] END ..................................clf__alpha=0.0001; total time=  12.6s
[CV] END ..................................clf__alpha=0.0001; total time=  13.3s
[CV] END ...................................clf__alpha=1e-05; total time=  15.7s
[CV] END ...................................clf__alpha=1e-05; total time=  16.0s
Best Hyperparameters: {'clf__alpha': 0.001}
Classification Report (CV preds):
               precision    recall  f1-score   support

       False       0.62      0.65      0.64   1400436
        True       0.57      0.5

In [20]:
# Label the fitted Ridge coefficients with their feature names, largest magnitude first
ridge_coeffs_dropped = _get_sorted_feature_coefficients(
    model=ridge_model_dropped, feature_names=features_list_dropped
)
print("Ridge coefficients sorted by absolute value (master_data_dropped):")
ridge_coeffs_dropped

Ridge coefficients sorted by absolute value (master_data_dropped):


SHOT_DISTANCE_CM            -0.423191
QUARTER_5                   -0.062198
SEASON                       0.056035
POS_C                        0.043119
QUARTER_1                    0.040197
HEIGHT_CM                    0.039959
TIME_LEFT_S                  0.027535
QUARTER_3                    0.026504
QUARTER_2                    0.025853
WINGSPAN_CM                 -0.025154
QUARTER_6                   -0.023314
POS_PG                      -0.023088
LOC_Y_CM                    -0.021046
MAX_VERTICAL_LEAP_CM        -0.019543
WEIGHT_KG                   -0.018098
POS_PF                       0.015254
POS_SG                       0.013529
HAND_LENGTH_CM               0.010672
TZ_SHIFT                     0.010382
QUARTER_4                   -0.009525
BODY_FAT_PCT                 0.008894
POS_SF                       0.006785
QUARTER_7                   -0.006087
LANE_AGILITY_TIME_S          0.005658
STANDING_REACH_CM           -0.005007
THREE_QUARTER_SPRINT_S      -0.004499
HAND_WIDTH_C

In [13]:
# Run LASSO logistic regression on master_data_imputed
# The frame is not subsampled anywhere above (the shuffle uses frac=1), so the
# message reports the actual number of rows instead of a fixed "25% sample".
print(f"Running LASSO logistic regression on master_data_imputed ({len(master_data_imputed):,} rows)...")

lasso_model_imputed = lasso_logistic_regression(
    df=master_data_imputed,
    target=target_variable,
    features_list=features_list_imputed,
    continuous_features=continuous_features,
    cv_folds=2,
    model_type="sgd",
    scoring="f1",
)

Running LASSO logistic regression on master_data_imputed (25% sample)...
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] END ..................................clf__alpha=0.0001; total time= 2.3min
[CV] END ....................................clf__alpha=0.01; total time= 2.3min
[CV] END ..................................clf__alpha=0.0001; total time= 2.3min
[CV] END ...................................clf__alpha=0.001; total time= 2.3min
[CV] END ....................................clf__alpha=0.01; total time= 2.3min
[CV] END ...................................clf__alpha=0.001; total time= 2.3min
[CV] END ...................................clf__alpha=1e-05; total time= 2.3min
[CV] END ...................................clf__alpha=1e-05; total time= 2.3min
Best Hyperparameters: {'clf__alpha': 0.0001}
Classification Report (CV preds):
               precision    recall  f1-score   support

       False       0.62      0.66      0.64   2411426
        True       0.56      0.

In [19]:
# Label the fitted LASSO coefficients with their feature names, largest magnitude first
lasso_coeffs_imputed = _get_sorted_feature_coefficients(
    model=lasso_model_imputed, feature_names=features_list_imputed
)
print("LASSO coefficients sorted by absolute value (master_data_imputed):")
lasso_coeffs_imputed

LASSO coefficients sorted by absolute value (master_data_imputed):


SHOT_DISTANCE_CM            -0.390909
QUARTER_1                    0.059723
SEASON                       0.054998
HEIGHT_CM                    0.048181
QUARTER_4                   -0.045569
QUARTER_5                   -0.029552
TIME_LEFT_S                  0.022430
WINGSPAN_CM                 -0.012712
WEIGHT_KG                   -0.012086
LOC_Y_CM                    -0.002868
BODY_FAT_PCT                 0.000000
TZ_SHIFT                     0.000000
QUARTER_7                    0.000000
QUARTER_6                    0.000000
QUARTER_3                    0.000000
QUARTER_2                    0.000000
STANDING_VERTICAL_LEAP_CM    0.000000
THREE_QUARTER_SPRINT_S       0.000000
DISTANCE_KM                  0.000000
STANDING_REACH_CM            0.000000
REST_D                       0.000000
MAX_VERTICAL_LEAP_CM         0.000000
LOC_X_CM                     0.000000
LANE_AGILITY_TIME_S          0.000000
HAND_WIDTH_CM                0.000000
HAND_LENGTH_CM               0.000000
QUARTER_8   

In [15]:
# Run Ridge logistic regression on master_data_imputed
# The frame is not subsampled anywhere above (the shuffle uses frac=1), so the
# message reports the actual number of rows instead of a fixed "25% sample".
print(f"Running Ridge logistic regression on master_data_imputed ({len(master_data_imputed):,} rows)...")

ridge_model_imputed = ridge_logistic_regression(
    df=master_data_imputed,
    target=target_variable,
    features_list=features_list_imputed,
    continuous_features=continuous_features,
    cv_folds=2,
    model_type="sgd",
    scoring="f1",
)

Running Ridge logistic regression on master_data_imputed (25% sample)...
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] END ..................................clf__alpha=0.0001; total time= 1.2min
[CV] END ...................................clf__alpha=0.001; total time= 1.3min
[CV] END ....................................clf__alpha=0.01; total time= 1.3min
[CV] END ...................................clf__alpha=0.001; total time= 1.3min
[CV] END ....................................clf__alpha=0.01; total time= 1.3min
[CV] END ..................................clf__alpha=0.0001; total time= 1.3min
[CV] END ...................................clf__alpha=1e-05; total time= 1.3min
[CV] END ...................................clf__alpha=1e-05; total time= 1.3min
Best Hyperparameters: {'clf__alpha': 0.01}
Classification Report (CV preds):
               precision    recall  f1-score   support

       False       0.62      0.66      0.64   2411426
        True       0.56      0.52

In [18]:
# Label the fitted Ridge coefficients with their feature names, largest magnitude first
ridge_coeffs_imputed = _get_sorted_feature_coefficients(
    model=ridge_model_imputed, feature_names=features_list_imputed
)
print("Ridge coefficients sorted by absolute value (master_data_imputed):")
ridge_coeffs_imputed

Ridge coefficients sorted by absolute value (master_data_imputed):


SHOT_DISTANCE_CM            -3.834343e-01
SEASON                       5.069837e-02
LOC_Y_CM                    -3.097180e-02
QUARTER_1                    2.846386e-02
HEIGHT_CM                    2.795788e-02
QUARTER_4                   -2.642851e-02
TIME_LEFT_S                  2.212871e-02
QUARTER_5                   -1.427480e-02
WINGSPAN_CM                 -1.105098e-02
QUARTER_3                    8.704664e-03
QUARTER_2                    5.655772e-03
WEIGHT_KG                   -5.559587e-03
HAND_LENGTH_CM               4.033895e-03
LANE_AGILITY_TIME_S          3.909926e-03
HAND_WIDTH_CM                3.396122e-03
MAX_VERTICAL_LEAP_CM        -3.381002e-03
QUARTER_6                   -2.506209e-03
LOC_X_CM                    -2.052847e-03
TZ_SHIFT                     1.856047e-03
THREE_QUARTER_SPRINT_S       1.365715e-03
BODY_FAT_PCT                 1.231749e-03
DISTANCE_KM                 -1.145365e-03
QUARTER_7                   -5.336489e-04
STANDING_REACH_CM           -1.819