In [14]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

import requests
import os

In [4]:
RANDOM_STATE = 42

In [5]:
# Set to True to download data from GitHub, False to load from local processed directory
DOWNLOAD = True

if DOWNLOAD:
    # Download CSV files from GitHub repository
    directory = "./downloads/"
    filenames = [
        "master_data.parquet",
        "master_data_dropped.parquet",
        "master_data_imputed.parquet"
    ]

    # Common URL parts
    base_url = "https://github.com/fbec76/sas-curiosity-cup-2026/raw/refs/heads/main/datasets/processed/"
    for fname in filenames:
        url = base_url + fname + "?download="
        response = requests.get(url)
        if response.ok:

            if not os.path.exists(directory):
                os.makedirs(directory)
            with open(directory + fname, "wb") as f:
                f.write(response.content)
            print(f"Downloaded: {fname}")
        else:
            print(f"Failed to download: {fname}")
    data_dir = "./downloads/"
    master_data_orig = pd.read_parquet(data_dir + "master_data.parquet")
    master_data_dropped_orig = pd.read_parquet(data_dir + "master_data_dropped.parquet")
    master_data_imputed_orig = pd.read_parquet(data_dir + "master_data_imputed.parquet")
    print("Data loaded from downloads directory.")
else:
    master_data_orig = pd.read_parquet("../datasets/processed/master_data.parquet")
    master_data_dropped_orig = pd.read_parquet("../datasets/processed/master_data_dropped.parquet")
    master_data_imputed_orig = pd.read_parquet("../datasets/processed/master_data_imputed.parquet")
    data_dir = "../datasets/processed/"
    print("Data loaded from processed directory.")


Downloaded: master_data.parquet
Downloaded: master_data_dropped.parquet
Downloaded: master_data_imputed.parquet
Data loaded from downloads directory.


In [6]:
master_data = master_data_orig.copy()
master_data_dropped = master_data_dropped_orig.copy()
master_data_imputed = master_data_imputed_orig.copy()

In [7]:
# check the shape of the dataframes
print('master_data shape:', master_data.shape, "Count of NA values:", master_data.isna().sum().sum())
print('master_data_dropped shape:', master_data_dropped.shape, "Count of NA values:",
      master_data_dropped.isna().sum().sum())
print('master_data_imputed shape:', master_data_imputed.shape, "Count of NA values:",
      master_data_imputed.isna().sum().sum())

master_data shape: (4450350, 40) Count of NA values: 29692416
master_data_dropped shape: (2594574, 40) Count of NA values: 0
master_data_imputed shape: (4450350, 35) Count of NA values: 0


In [9]:
def _fit_linear_classifier(
        df,
        target,
        features_list,
        cv_folds=5,
        scoring="f1",
        model_type="logreg",  # "logreg" or "sgd"
        penalty_type="lasso",  # "lasso" or "ridge"
        sgd_max_iter=2000,
        sgd_tol=1e-3,
):
    X = df[features_list]
    y = df[target]

    start_time = pd.Timestamp.now()

    if model_type == "logreg":
        # Keep your current setup: LogisticRegression with elasticnet via saga
        l1_ratio = 1.0 if penalty_type == "lasso" else 0.0
        model = LogisticRegression(
            penalty="elasticnet",
            solver="saga",
            l1_ratio=l1_ratio,
            max_iter=10000,
        )
        param_grid = {"C": [0.01, 0.1, 1, 10, 100]}

    elif model_type == "sgd":
        # SGDClassifier approximates logistic regression when loss='log_loss'
        # Regularization strength is controlled by alpha (roughly inverse of C).
        penalty = "l1" if penalty_type == "lasso" else "l2"
        model = SGDClassifier(
            loss="log_loss",
            penalty=penalty,
            max_iter=sgd_max_iter,
            tol=sgd_tol,
            random_state=42,
        )
        param_grid = {"alpha": [1e-2, 1e-3, 1e-4, 1e-5]}

    else:
        raise ValueError("model_type must be 'logreg' or 'sgd'")

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv_folds,
        scoring=scoring,
        n_jobs=-1,
    )
    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_

    y_pred_cv = cross_val_predict(best_model, X, y, cv=cv_folds, n_jobs=-1)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Classification Report (CV preds):\n", classification_report(y, y_pred_cv))
    print("Confusion Matrix (CV preds):\n", confusion_matrix(y, y_pred_cv))
    print("Time taken:", pd.Timestamp.now() - start_time)

    return best_model


def lasso_logistic_regression(df, target, features_list, cv_folds=5, model_type="logreg", scoring="f1"):
    return _fit_linear_classifier(
        df=df,
        target=target,
        features_list=features_list,
        cv_folds=cv_folds,
        scoring=scoring,
        model_type=model_type,
        penalty_type="lasso",
    )


def ridge_logistic_regression(df, target, features_list, cv_folds=5, model_type="logreg", scoring="f1"):
    return _fit_linear_classifier(
        df=df,
        target=target,
        features_list=features_list,
        cv_folds=cv_folds,
        scoring=scoring,
        model_type=model_type,
        penalty_type="ridge",
    )

In [10]:
# drop GAME_DATE column from master_data_dropped and master_data_imputed
master_data_dropped = master_data_dropped.drop(columns=["GAME_DATE"])
master_data_imputed = master_data_imputed.drop(columns=["GAME_DATE"])

# conver POS_ columns to boolean in master_data_dropped
pos_columns = [col for col in master_data_dropped.columns if col.startswith("POS_")]
master_data_dropped[pos_columns] = master_data_dropped[pos_columns].astype(bool)

# show column data types for master_data_dropped and master_data_imputed in dataframe format
print("Data types for master_data_dropped:")
print(master_data_dropped.dtypes)
print("\nData types for master_data_imputed:")
print(master_data_imputed.dtypes)

Data types for master_data_dropped:
GAME_ID                         int64
SEASON                          int64
QUARTER                         int64
TIME_LEFT_S                     int64
HOME_TEAM                    category
AWAY_TEAM                    category
TEAM_ID                         int64
TEAM_NAME                    category
PLAYER_ID                       int64
PLAYER_NAME                    object
LOC_X_CM                      float64
LOC_Y_CM                      float64
SHOT_DISTANCE_CM              float64
IS_3PT                           bool
DISTANCE_KM                   float64
REST_D                        float64
TZ_SHIFT                      float64
FLIGHT_TIME_MIN               float64
LAT                           float64
LON                           float64
D_LAT                         float64
D_LON                         float64
HEIGHT_CM                     float64
WEIGHT_KG                     float64
WINGSPAN_CM                   float64
STANDING_REACH

In [None]:
# define target variable and features list
target_variable = "MADE_SHOT"

# features list for master_data_dropped
cols_to_exclude = [
    target_variable, "GAME_ID", "PLAYER_ID", "PLAYER_NAME", "TEAM_ID", "TEAM_NAME",
    "LAT", "LON", "D_LAT", "D_LON", "FLIGHT_TIME_MIN", "HOME_TEAM", "AWAY_TEAM", "IS_3PT"
]

features_list = master_data_dropped.columns.difference(cols_to_exclude).tolist()

features_list_dropped = features_list
features_list_imputed = [col for col in features_list if not col.startswith("POS_")]

# Continuous features to normalize (z-score)
continuous_features = [
    "BODY_FAT_PCT",
    "DISTANCE_KM",
    "HAND_LENGTH_CM",
    "HAND_WIDTH_CM",
    "HEIGHT_CM",
    "LANE_AGILITY_TIME_S",
    "LOC_X_CM",
    "LOC_Y_CM",
    "MAX_VERTICAL_LEAP_CM",
    "REST_D",
    "SEASON",  # normalize to allow learning an overall trend
    "SHOT_DISTANCE_CM",
    "STANDING_REACH_CM",
    "STANDING_VERTICAL_LEAP_CM",
    "THREE_QUARTER_SPRINT_S",
    "TIME_LEFT_S",
    "TZ_SHIFT",
    "WEIGHT_KG",
    "WINGSPAN_CM",
]

# One-hot encode QUARTER
if "QUARTER" in master_data_dropped.columns:
    master_data_dropped = pd.get_dummies(
        master_data_dropped,
        columns=["QUARTER"],
        prefix="QUARTER",
        drop_first=False
    )

if "QUARTER" in master_data_imputed.columns:
    master_data_imputed = pd.get_dummies(
        master_data_imputed,
        columns=["QUARTER"],
        prefix="QUARTER",
        drop_first=False
    )

# Ensure we only scale columns that exist (robust to missing columns)
continuous_features_present = [c for c in continuous_features if c in master_data_dropped.columns]

scaler = StandardScaler()
master_data_dropped[continuous_features_present] = scaler.fit_transform(
    master_data_dropped[continuous_features_present]
)

# Rebuild features_list after one-hot encoding QUARTER (new columns added)
features_list = master_data_dropped.columns.difference(cols_to_exclude).tolist()

features_list_dropped = features_list
features_list_imputed = [col for col in features_list if not col.startswith("POS_")]
features_list_imputed

In [12]:
# randomize order in master_data_dropped and master_data_imputed
master_data_dropped = master_data_dropped.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
master_data_imputed = master_data_imputed.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

In [None]:
# Run LASSO logistic regression on 10% sample of master_data_dropped
print("Running LASSO logistic regression on master_data_dropped (10% sample)...")

lasso_model_dropped = lasso_logistic_regression(
    master_data_dropped.sample(frac=1, random_state=RANDOM_STATE),
    target_variable,
    features_list_dropped,
    cv_folds=3,
    model_type="sgd",
    scoring="f1"
)

Running LASSO logistic regression on master_data_dropped (10% sample)...


In [67]:
# get the coefficients of the LASSO model and sort them by absolute value
lasso_coefficients = pd.Series(lasso_model_dropped.coef_[0], index=features_list_dropped)
lasso_coefficients_sorted = lasso_coefficients.reindex(lasso_coefficients.abs().sort_values(ascending=False).index)
print("LASSO Coefficients sorted by absolute value:")
lasso_coefficients_sorted

LASSO Coefficients sorted by absolute value:


Unnamed: 0,0
SHOT_DISTANCE_CM,-0.605753
QUARTER_5,-0.390435
IS_3PT,0.350273
QUARTER_1,0.140088
SEASON,0.096376
QUARTER_2,0.079888
POS_PF,0.047023
QUARTER_6,-0.034856
HAND_LENGTH_CM,0.023435
BODY_FAT_PCT,-0.014541


In [68]:
# Run Ridge logistic regression on 10% sample of master_data_dropped
print("Running Ridge logistic regression on master_data_dropped (10% sample)...")

ridge_model_dropped = ridge_logistic_regression(
    master_data_dropped.sample(frac=0.1, random_state=RANDOM_STATE),
    target_variable,
    features_list_dropped,
    cv_folds=3,
    model_type="sgd",
    scoring="f1"
)

Running Ridge logistic regression on master_data_dropped (1% sample)...
Best Hyperparameters: {'alpha': 0.001}
Classification Report (CV preds):
               precision    recall  f1-score   support

       False       0.62      0.66      0.64    139950
        True       0.57      0.52      0.54    119507

    accuracy                           0.60    259457
   macro avg       0.59      0.59      0.59    259457
weighted avg       0.60      0.60      0.60    259457

Confusion Matrix (CV preds):
 [[92716 47234]
 [57183 62324]]
Time taken: 0 days 00:00:04.531186


In [69]:
# get the coefficients of the Ridge model and sort them by absolute value
ridge_coefficients = pd.Series(ridge_model_dropped.coef_[0], index=features_list_dropped)
ridge_coefficients_sorted = ridge_coefficients.reindex(ridge_coefficients.abs().sort_values(ascending=False).index)
print("Ridge Coefficients sorted by absolute value:")
ridge_coefficients_sorted

Ridge Coefficients sorted by absolute value:


Unnamed: 0,0
SHOT_DISTANCE_CM,-0.55055
IS_3PT,0.354304
QUARTER_5,-0.107544
TIME_LEFT_S,0.058316
QUARTER_2,0.049662
SEASON,0.048475
WINGSPAN_CM,-0.039711
QUARTER_1,0.035121
QUARTER_3,0.030167
POS_C,0.029742


In [73]:
# Run LASSO logistic regression on 10% sample of master_data_imputed
print("Running LASSO logistic regression on master_data_imputed (10% sample)...")
lasso_model_imputed = lasso_logistic_regression(
    master_data_imputed.sample(frac=0.1, random_state=RANDOM_STATE),
    target_variable,
    features_list_imputed,
    cv_folds=3,
    model_type="sgd",
    scoring="f1"
)

Running LASSO logistic regression on master_data_imputed (1% sample)...
Best Hyperparameters: {'alpha': 0.01}
Classification Report (CV preds):
               precision    recall  f1-score   support

       False       0.56      0.43      0.49    240824
        True       0.47      0.60      0.53    204211

    accuracy                           0.51    445035
   macro avg       0.52      0.52      0.51    445035
weighted avg       0.52      0.51      0.51    445035

Confusion Matrix (CV preds):
 [[103224 137600]
 [ 80694 123517]]
Time taken: 0 days 00:00:47.082151


In [74]:
# get the coefficients of the LASSO model and sort them by absolute value
lasso_coefficients_imputed = pd.Series(lasso_model_imputed.coef_[0], index=features_list_imputed)
lasso_coefficients_imputed_sorted = lasso_coefficients_imputed.reindex(
    lasso_coefficients_imputed.abs().sort_values(ascending=False).index)
print("LASSO Coefficients (imputed) sorted by absolute value:")
lasso_coefficients_imputed_sorted

LASSO Coefficients (imputed) sorted by absolute value:


Unnamed: 0,0
SHOT_DISTANCE_CM,-0.003258
HEIGHT_CM,0.000964
HAND_LENGTH_CM,0.0
BODY_FAT_PCT,0.0
HAND_WIDTH_CM,0.0
IS_3PT,0.0
LANE_AGILITY_TIME_S,0.0
LOC_X_CM,0.0
LOC_Y_CM,0.0
MAX_VERTICAL_LEAP_CM,0.0


In [75]:
# Run Ridge logistic regression on 10% sample of master_data_imputed
print("Running Ridge logistic regression on master_data_imputed (10% sample)...")
ridge_model_imputed = ridge_logistic_regression(
    master_data_imputed.sample(frac=0.1, random_state=RANDOM_STATE),
    target_variable,
    features_list_imputed,
    cv_folds=3,
    model_type="sgd",
    scoring="f1"
)

Running Ridge logistic regression on master_data_imputed (1% sample)...
Best Hyperparameters: {'alpha': 0.0001}
Classification Report (CV preds):
               precision    recall  f1-score   support

       False       0.61      0.32      0.42    240824
        True       0.49      0.75      0.59    204211

    accuracy                           0.52    445035
   macro avg       0.55      0.54      0.51    445035
weighted avg       0.55      0.52      0.50    445035

Confusion Matrix (CV preds):
 [[ 78243 162581]
 [ 50097 154114]]
Time taken: 0 days 00:06:15.773784


In [76]:
# get the coefficients of the Ridge model and sort them by absolute value
ridge_coefficients_imputed = pd.Series(ridge_model_imputed.coef_[0], index=features_list_imputed)
ridge_coefficients_imputed_sorted = ridge_coefficients_imputed.reindex(
    ridge_coefficients_imputed.abs().sort_values(ascending=False).index)
print("Ridge Coefficients (imputed) sorted by absolute value:")
ridge_coefficients_imputed_sorted

Ridge Coefficients (imputed) sorted by absolute value:


Unnamed: 0,0
IS_3PT,20.563448
QUARTER_4,-2.70421
QUARTER_1,2.310883
QUARTER_5,-1.620637
QUARTER_3,1.24051
THREE_QUARTER_SPRINT_S,-1.214949
QUARTER_2,0.567599
HEIGHT_CM,0.504992
BODY_FAT_PCT,-0.413424
STANDING_REACH_CM,-0.224128
