In [14]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [15]:
# Load the three processed variants of the master dataset (raw, NA rows dropped, NA values imputed)
# from ../datasets/processed/ in a single unpacking expression.
master_data, master_data_dropped, master_data_imputed = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../datasets/processed/master_data.parquet',
        '../datasets/processed/master_data_dropped.parquet',
        '../datasets/processed/master_data_imputed.parquet',
    )
)

In [16]:
# Sanity check: report the shape and the total number of missing cells of each loaded dataframe.
def _print_shape_and_na_count(frame_name, frame):
    """Print `<frame_name> shape: (rows, cols) Count of NA values: <n>` for one dataframe."""
    total_missing = frame.isna().sum().sum()
    print(frame_name + ' shape:', frame.shape, "Count of NA values:", total_missing)


_print_shape_and_na_count('master_data', master_data)
_print_shape_and_na_count('master_data_dropped', master_data_dropped)
_print_shape_and_na_count('master_data_imputed', master_data_imputed)

master_data shape: (4450350, 40) Count of NA values: 29692416
master_data_dropped shape: (2594574, 40) Count of NA values: 0
master_data_imputed shape: (4450350, 35) Count of NA values: 0


In [17]:
# Penalised (LASSO / Ridge) logistic regression for a given dataframe.
# Both public entry points share one implementation: tune C with GridSearchCV, print
# cross-validated metrics for the selected model, and return the best (refitted) estimator.
# parameters: dataframe, target variable, list of features
def _penalized_logistic_regression(df, target, features_list, cv_folds, l1_ratio, label,
                                   scale_features, random_state):
    """Tune and evaluate a saga-based penalised logistic regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both the feature columns and the target column.
    target : str
        Name of the target column in `df`.
    features_list : list of str
        Names of the feature columns in `df`.
    cv_folds : int or CV splitter
        Passed unchanged as `cv` to GridSearchCV and cross_val_predict.
    l1_ratio : float
        Passed to LogisticRegression (1.0 is intended as LASSO, 0.0 as Ridge).
    label : str
        Human-readable model name used in the timing message.
    scale_features : bool
        When True, a StandardScaler is fitted inside every CV fold ahead of the classifier.
    random_state : int or None
        Seed for the stochastic saga solver.

    Returns
    -------
    The best estimator found by the grid search: a Pipeline (steps "scaler" and "clf")
    when `scale_features` is True, otherwise a bare LogisticRegression.

    NOTE(review): selecting the penalty through `l1_ratio` alone only takes effect on
    recent scikit-learn releases; older releases ignore `l1_ratio` unless
    penalty="elasticnet" is also given -- confirm against the installed version.
    """
    X = df[features_list]
    y = df[target]

    classifier = LogisticRegression(
        solver="saga",
        l1_ratio=l1_ratio,
        max_iter=10000,
        random_state=random_state,  # saga shuffles the data; seed it for reproducible fits
    )

    if scale_features:
        # saga only converges quickly when features share a similar scale, and the L1/L2
        # penalty treats coefficients evenly only on standardised inputs. Scaling lives in
        # the pipeline so each CV fold is scaled with its own training statistics.
        estimator = Pipeline([("scaler", StandardScaler()), ("clf", classifier)])
        c_param = "clf__C"
    else:
        estimator = classifier
        c_param = "C"

    param_grid = {c_param: [0.01, 0.1, 1, 10, 100]}

    start_time = pd.Timestamp.now()

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv_folds,
        scoring="f1",  # choose what fits your problem; e.g. "roc_auc"
        n_jobs=-1
    )
    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_

    # Out-of-fold predictions for the selected C. C was tuned on these same rows, so the
    # metrics below are mildly optimistic; use a held-out set or nested CV for an unbiased
    # estimate.
    y_pred_cv = cross_val_predict(best_model, X, y, cv=cv_folds, n_jobs=-1)

    # Report under the plain "C" key regardless of whether the pipeline prefix was used.
    print("Best Hyperparameters:", {"C": grid_search.best_params_[c_param]})
    print("Classification Report (CV preds):\n", classification_report(y, y_pred_cv))
    print("Confusion Matrix (CV preds):\n", confusion_matrix(y, y_pred_cv))
    print(f"Time taken for {label} logistic regression:", pd.Timestamp.now() - start_time)

    return best_model


# LASSO logistic regression for a given dataframe
# do CV and hyperparameter tuning using GridSearchCV, and return the best model and its performance metrics
# parameters: dataframe, target variable, list of features
def lasso_logistic_regression(df, target, features_list, cv_folds=5, scale_features=True, random_state=42):
    """Fit an L1-penalised (l1_ratio=1.0) logistic regression, tuning C by grid search.

    Prints the best C, a classification report, a confusion matrix (both from
    cross-validated predictions) and the elapsed time, then returns the best estimator.
    With the default `scale_features=True` the returned object is a Pipeline whose fitted
    classifier is available as `model.named_steps["clf"]`; pass `scale_features=False`
    to fit and return a bare LogisticRegression on the unscaled features.
    """
    return _penalized_logistic_regression(
        df, target, features_list, cv_folds,
        l1_ratio=1.0,  # LASSO
        label="LASSO",
        scale_features=scale_features,
        random_state=random_state,
    )


# ridge logistic regression for a given dataframe
# do CV and hyperparameter tuning using GridSearchCV, and return the best model and its performance metrics
# parameters: dataframe, target variable, list of features
def ridge_logistic_regression(df, target, features_list, cv_folds=5, scale_features=True, random_state=42):
    """Fit an L2-penalised (l1_ratio=0.0) logistic regression, tuning C by grid search.

    Prints the best C, a classification report, a confusion matrix (both from
    cross-validated predictions) and the elapsed time, then returns the best estimator.
    With the default `scale_features=True` the returned object is a Pipeline whose fitted
    classifier is available as `model.named_steps["clf"]`; pass `scale_features=False`
    to fit and return a bare LogisticRegression on the unscaled features.
    """
    return _penalized_logistic_regression(
        df, target, features_list, cv_folds,
        l1_ratio=0.0,  # Ridge
        label="Ridge",
        scale_features=scale_features,
        random_state=random_state,
    )

In [18]:
# drop GAME_DATE column from master_data_dropped and master_data_imputed
# errors="ignore" keeps this cell re-runnable: the frames are reassigned under the same
# names, so on a second run GAME_DATE is already gone and a plain drop would raise KeyError.
master_data_dropped = master_data_dropped.drop(columns=["GAME_DATE"], errors="ignore")
master_data_imputed = master_data_imputed.drop(columns=["GAME_DATE"], errors="ignore")


def _one_hot_encode_present(frame, columns):
    """One-hot encode (drop_first=True) whichever of `columns` are still in `frame`.

    Returns `frame` unchanged when none of them are present, so re-running the cell after
    the columns have already been expanded is a no-op instead of a KeyError.
    """
    present = [column for column in columns if column in frame.columns]
    if not present:
        return frame
    return pd.get_dummies(frame, columns=present, drop_first=True)


# one hot encode HOME_TEAM and AWAY_TEAM
master_data_dropped = _one_hot_encode_present(master_data_dropped, ["HOME_TEAM", "AWAY_TEAM"])
master_data_imputed = _one_hot_encode_present(master_data_imputed, ["HOME_TEAM", "AWAY_TEAM"])

# convert POS_ columns to boolean in master_data_dropped
pos_columns = [col for col in master_data_dropped.columns if col.startswith("POS_")]
master_data_dropped[pos_columns] = master_data_dropped[pos_columns].astype(bool)

# show column data types for master_data_dropped and master_data_imputed in dataframe format
print("Data types for master_data_dropped:")
print(master_data_dropped.dtypes)
print("\nData types for master_data_imputed:")
print(master_data_imputed.dtypes)

Data types for master_data_dropped:
GAME_ID          int64
SEASON           int64
QUARTER          int64
TIME_LEFT_S      int64
TEAM_ID          int64
                 ...  
AWAY_TEAM_SAC     bool
AWAY_TEAM_SAS     bool
AWAY_TEAM_TOR     bool
AWAY_TEAM_UTA     bool
AWAY_TEAM_WAS     bool
Length: 95, dtype: object

Data types for master_data_imputed:
GAME_ID          int64
SEASON           int64
QUARTER          int64
TIME_LEFT_S      int64
TEAM_ID          int64
                 ...  
AWAY_TEAM_SAC     bool
AWAY_TEAM_SAS     bool
AWAY_TEAM_TOR     bool
AWAY_TEAM_UTA     bool
AWAY_TEAM_WAS     bool
Length: 90, dtype: object


In [19]:
# Target column and the feature lists used for each dataframe variant.
target_variable = "MADE_SHOT"

# Features for master_data_dropped: every column except the target and any identifier-like
# column whose name ends in "_ID" or "_NAME".
features_list = [
    column
    for column in master_data_dropped.columns
    if not (column == target_variable or column.endswith(("_ID", "_NAME")))
]

features_list_dropped = features_list
# The imputed variant uses the same features minus the POS_ columns.
features_list_imputed = [
    column for column in features_list if not column.startswith("POS_")
]
features_list_imputed

['SEASON',
 'QUARTER',
 'TIME_LEFT_S',
 'LOC_X_CM',
 'LOC_Y_CM',
 'SHOT_DISTANCE_CM',
 'IS_3PT',
 'DISTANCE_KM',
 'REST_D',
 'TZ_SHIFT',
 'FLIGHT_TIME_MIN',
 'LAT',
 'LON',
 'D_LAT',
 'D_LON',
 'HEIGHT_CM',
 'WEIGHT_KG',
 'WINGSPAN_CM',
 'STANDING_REACH_CM',
 'HAND_LENGTH_CM',
 'HAND_WIDTH_CM',
 'BODY_FAT_PCT',
 'LANE_AGILITY_TIME_S',
 'THREE_QUARTER_SPRINT_S',
 'MAX_VERTICAL_LEAP_CM',
 'STANDING_VERTICAL_LEAP_CM',
 'HOME_TEAM_BKN',
 'HOME_TEAM_BOS',
 'HOME_TEAM_CHA',
 'HOME_TEAM_CHI',
 'HOME_TEAM_CLE',
 'HOME_TEAM_DAL',
 'HOME_TEAM_DEN',
 'HOME_TEAM_DET',
 'HOME_TEAM_GSW',
 'HOME_TEAM_HOU',
 'HOME_TEAM_IND',
 'HOME_TEAM_LAC',
 'HOME_TEAM_LAL',
 'HOME_TEAM_MEM',
 'HOME_TEAM_MIA',
 'HOME_TEAM_MIL',
 'HOME_TEAM_MIN',
 'HOME_TEAM_NOP',
 'HOME_TEAM_NYK',
 'HOME_TEAM_OKC',
 'HOME_TEAM_ORL',
 'HOME_TEAM_PHI',
 'HOME_TEAM_PHX',
 'HOME_TEAM_POR',
 'HOME_TEAM_SAC',
 'HOME_TEAM_SAS',
 'HOME_TEAM_TOR',
 'HOME_TEAM_UTA',
 'HOME_TEAM_WAS',
 'AWAY_TEAM_BKN',
 'AWAY_TEAM_BOS',
 'AWAY_TEAM_CHA',
 'AWA

In [20]:
# Shuffle the rows of both modelling frames with a fixed seed; ignore_index=True gives the
# shuffled frame a fresh 0..n-1 index.
master_data_dropped = master_data_dropped.sample(frac=1, random_state=42, ignore_index=True)
master_data_imputed = master_data_imputed.sample(frac=1, random_state=42, ignore_index=True)

In [23]:
# Fit the LASSO model on a seeded 1% random sample of master_data_dropped.
print("Running LASSO logistic regression on master_data_dropped...")
lasso_model_dropped = lasso_logistic_regression(
    df=master_data_dropped.sample(frac=0.01, random_state=42),
    target=target_variable,
    features_list=features_list_dropped,
)


Running LASSO logistic regression on master_data_dropped...


KeyboardInterrupt: 

In [None]:
# Fit the Ridge model on a seeded 1% random sample of master_data_dropped.
print("Running Ridge logistic regression on master_data_dropped...")
ridge_model_dropped = ridge_logistic_regression(
    df=master_data_dropped.sample(frac=0.01, random_state=42),
    target=target_variable,
    features_list=features_list_dropped,
)