In [1]:
# Clone your GitHub repo (you’ll be prompted to authorize if it's private)
!git clone https://github.com/colterwood/LHL-final-final-project.git

Cloning into 'LHL-final-final-project'...
remote: Enumerating objects: 119, done.[K
remote: Counting objects: 100% (119/119), done.[K
remote: Compressing objects: 100% (108/108), done.[K
remote: Total 119 (delta 64), reused 19 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (119/119), 1.27 MiB | 2.60 MiB/s, done.
Resolving deltas: 100% (64/64), done.


In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split

In [3]:
# load the player game logs CSV from the data folder
df = pd.read_csv("LHL-final-final-project/data/2024_merged_gamelogs.csv")


# preview
df.head()

Unnamed: 0,team,g_num,month,day,home_away,opp,win_loss,team_score,opp_score,team_fg,...,day_of_week_by_team_travel_distance,team_vs_opp_median_score_by_team_travel_distance,team_vs_opp_homeaway_median_score_by_team_travel_distance,team_home_or_away_median_score_by_team_travel_distance,team_home_or_away_median_allowed_by_team_travel_distance,team_day_median_score_by_team_travel_distance,team_day_median_allowed_by_team_travel_distance,travel_distance_by_team_travel_distance,median_score_for_by_team_travel_distance,median_score_against_by_team_travel_distance
0,ATL,1,5,15,2,LAS,1,92,81,34,...,3.0,75.0,70.0,78.5,79.5,76.0,80.0,0.0,77.0,79.5
1,ATL,2,5,18,2,PHO,2,85,88,27,...,2.0,76.0,76.0,78.0,80.5,76.0,80.0,1.0,77.5,77.5
2,ATL,3,5,21,1,DAL,1,83,78,30,...,4.0,81.0,75.5,78.0,80.5,73.0,78.0,3.0,81.0,85.0
3,ATL,4,5,26,1,MIN,2,79,92,31,...,3.0,75.0,70.0,78.5,79.5,76.0,80.0,0.0,77.0,79.5
4,ATL,5,5,29,2,WAS,1,73,67,26,...,4.0,75.0,76.5,78.0,80.5,76.0,80.0,2.0,78.0,80.0


In [4]:
# features (drop identifiers and targets)
features = df.drop(columns=['team_score', 'opp_score', 'team', 'opp'])

# targets
targets = df[['team_score', 'opp_score']]

# group-aware split
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(features, groups=df['team']))

X_train = features.iloc[train_idx]
X_test = features.iloc[test_idx]
y_train = targets.iloc[train_idx]
y_test = targets.iloc[test_idx]

# model
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

In [5]:
# get feature importances for team_score prediction
team_score_importance = model.estimators_[0].feature_importances_

# get feature importances for opp_score prediction
opp_score_importance = model.estimators_[1].feature_importances_

# match to feature names
feature_names = features.columns

# build DataFrame
importances_df = (
    pd.DataFrame({
        'feature': feature_names,
        'team_score_importance': team_score_importance,
        'opp_score_importance': opp_score_importance
    })
    .sort_values(by='team_score_importance', ascending=False)
)

importances_df.head(20)  # or whatever number you want to view

Unnamed: 0,feature,team_score_importance,opp_score_importance
37,advanced_ortg,0.771811,0.000807
57,team_vs_opp_homeaway_median_score,0.06306,0.001122
39,advanced_pace,0.051088,0.068585
5,team_fg,0.028227,0.001548
42,advanced_ts_pct,0.005816,0.000612
56,team_vs_opp_median_score,0.004108,0.000472
16,team_ast,0.003441,0.000471
19,team_tov,0.002953,0.001352
7,team_fg_pct,0.002157,0.000643
33,opponent_stl,0.001959,0.001807


In [6]:
# show all or more of the ranked features
importances_df.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
37,advanced_ortg,0.771811,0.000807
57,team_vs_opp_homeaway_median_score,0.06306,0.001122
39,advanced_pace,0.051088,0.068585
5,team_fg,0.028227,0.001548
42,advanced_ts_pct,0.005816,0.000612
56,team_vs_opp_median_score,0.004108,0.000472
16,team_ast,0.003441,0.000471
19,team_tov,0.002953,0.001352
7,team_fg_pct,0.002157,0.000643
33,opponent_stl,0.001959,0.001807


In [7]:
# show all or more of the ranked features
importances_df.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
38,advanced_drtg,0.000367,0.78682
39,advanced_pace,0.051088,0.068585
21,opponent_fg,0.001408,0.067323
23,opponent_fg_pct,0.000315,0.007825
51,defensive_four_factors_efg_pct,0.000313,0.002968
27,opponent_ft,0.000547,0.002232
17,team_stl,0.001365,0.001874
33,opponent_stl,0.001959,0.001807
5,team_fg,0.028227,0.001548
19,team_tov,0.002953,0.001352


In [8]:
# define substrings that identify counting stats
counting_keywords = ['fg', 'fga', '3p', '3pa', 'ft', 'fta', 'orb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']

# drop columns where the name contains any of those keywords but NOT pct or rate stats
counting_cols = [col for col in df.columns if any(k in col for k in counting_keywords)
                 and not col.endswith('_pct')
                 and 'four_factors' not in col
                 and 'advanced' not in col
                 and col not in ['team_score', 'opp_score']]

In [9]:
print(counting_cols)

['team_fg', 'team_fga', 'team_3p', 'team_3pa', 'team_ft', 'team_fta', 'team_orb', 'team_trb', 'team_ast', 'team_stl', 'team_blk', 'team_tov', 'team_pf', 'opponent_fg', 'opponent_fga', 'opponent_3p', 'opponent_3pa', 'opponent_ft', 'opponent_fta', 'opponent_orb', 'opponent_trb', 'opponent_ast', 'opponent_stl', 'opponent_blk', 'opponent_tov', 'opponent_pf', 'team_fg_by_team_home_away', 'team_fga_by_team_home_away', 'team_fg_pct_by_team_home_away', 'team_3p_by_team_home_away', 'team_3pa_by_team_home_away', 'team_3p_pct_by_team_home_away', 'team_ft_by_team_home_away', 'team_fta_by_team_home_away', 'team_ft_pct_by_team_home_away', 'team_orb_by_team_home_away', 'team_trb_by_team_home_away', 'team_ast_by_team_home_away', 'team_stl_by_team_home_away', 'team_blk_by_team_home_away', 'team_tov_by_team_home_away', 'team_pf_by_team_home_away', 'opponent_fg_by_team_home_away', 'opponent_fga_by_team_home_away', 'opponent_fg_pct_by_team_home_away', 'opponent_3p_by_team_home_away', 'opponent_3pa_by_team

In [10]:
df = df.drop(columns=counting_cols)

In [13]:
# make sure we only use numeric features (exclude targets and IDs)
X = df.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
X = X.select_dtypes(include='number')

# compute correlation matrix
corr_matrix = X.corr().abs()

# keep upper triangle only (to avoid duplicate pairs)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# flatten and filter for high correlation pairs
high_corr_pairs = (
    upper.stack()
    .reset_index()
    .rename(columns={'level_0': 'feature_1', 'level_1': 'feature_2', 0: 'correlation'})
    .query('correlation > 0.9')
    .sort_values(by='correlation', ascending=False)
)

high_corr_pairs

Unnamed: 0,feature_1,feature_2,correlation
436,home_away,home_away_by_team_home_away,1.0
3823,team_home_or_away_median_score,team_home_or_away_median_score_by_team_home_away,1.0
3543,day_of_week,day_of_week_by_team_day_of_week,1.0
4058,team_day_median_score,team_day_median_score_by_team_day_of_week,1.0
4484,median_score_against,median_score_against_by_team_travel_distance,1.0
4387,median_score_for,median_score_for_by_team_travel_distance,1.0
4289,travel_distance,travel_distance_by_team_travel_distance,1.0
3925,team_home_or_away_median_allowed,team_home_or_away_median_allowed_by_team_home_...,1.0
4158,team_day_median_allowed,team_day_median_allowed_by_team_day_of_week,1.0
0,g_num,month,0.974134


In [14]:
# always keep these no matter what
must_keep = ['advanced_ortg', 'advanced_drtg']

def auto_select_drops(high_corr_df):
    drop_cols = []

    for _, row in high_corr_df.iterrows():
        f1, f2 = row['feature_1'], row['feature_2']

        # skip if either feature is protected
        if f1 in must_keep or f2 in must_keep:
            continue

        # drop the more engineered-looking one
        if '_by_' in f1 or 'median' in f1:
            drop_cols.append(f1)
        elif '_by_' in f2 or 'median' in f2:
            drop_cols.append(f2)
        else:
            # fallback: drop longer one
            drop_cols.append(f1 if len(f1) > len(f2) else f2)

    return list(set(drop_cols))

In [16]:
# create drop list
auto_drop_cols = auto_select_drops(high_corr_pairs)

In [17]:
# View suggested drop list
print(auto_drop_cols)

['team_day_median_allowed_by_team_home_away', 'advanced_ts_pct_by_team_travel_distance', 'median_score_against', 'team_day_median_score_by_team_home_away', 'team_day_median_allowed', 'offensive_four_factors_efg_pct', 'month', 'defensive_four_factors_efg_pct', 'offensive_four_factors_ft_per_fga', 'median_score_against_by_team_home_away', 'advanced_ts_pct_by_team_day_of_week', 'travel_distance_by_team_travel_distance', 'advanced_ts_pct', 'team_vs_opp_homeaway_median_score_by_team_home_away', 'day_of_week_by_team_day_of_week', 'team_home_or_away_median_allowed', 'advanced_3par_by_team_home_away', 'advanced_ts_pct_by_team_home_away', 'team_home_or_away_median_allowed_by_team_home_away', 'home_away_by_team_home_away', 'team_day_median_score', 'team_home_or_away_median_allowed_by_team_day_of_week', 'team_home_or_away_median_score', 'median_score_for', 'advanced_drtg_by_team_home_away']


In [19]:
# Manually built drop list
drop_list = [
    'home_away_by_team_home_away',
    'team_home_or_away_median_score_by_team_home_away',
    'day_of_week_by_team_day_of_week',
    'team_day_median_score_by_team_day_of_week',
    'median_score_against_by_team_travel_distance',
    'median_score_for_by_team_travel_distance',
    'travel_distance_by_team_travel_distance',
    'team_home_or_away_median_allowed_by_team_home_away',
    'team_day_median_allowed_by_team_day_of_week',
    'g_num',
    'offensive_four_factors_efg_pct',
    'team_day_median_allowed_by_team_home_away',
    'team_home_or_away_median_allowed_by_team_home_away',
    'team_home_or_away_median_allowed',
    'team_home_or_away_median_allowed_by_team_travel_distance',
    'offensive_four_factors_ft_per_fga',
    'team_day_median_allowed_by_team_home_away',
    'team_home_or_away_median_allowed_by_team_day_of_week',
    'offensive_four_factors_efg_pct_by_team_travel_distance',
    'team_home_or_away_median_score_by_team_home_away',
    'team_vs_opp_homeaway_median_score_by_team_home_away',
    'team_fg_pct',
    'offensive_four_factors_efg_pct_by_team_day_of_week',
    'offensive_four_factors_efg_pct_by_team_home_away'
]

In [20]:
# drop from df
df = df.drop(columns=drop_list, errors='ignore')

In [21]:
# filter to numeric features only (excluding IDs and targets)
X = df.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
X = X.select_dtypes(include='number')

# compute VIFs
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_data = vif_data.sort_values(by='VIF', ascending=False)

vif_data

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,feature,VIF
2,home_away,inf
49,offensive_four_factors_ft_per_fga_by_team_home...,inf
34,month_by_team_home_away,inf
35,day_by_team_home_away,inf
36,win_loss_by_team_home_away,inf
...,...,...
16,advanced_ast_pct,1.842376
18,advanced_blk_pct,1.638229
5,team_ft_pct,1.480301
1,day,1.472835


In [22]:
X.corr()['home_away'].sort_values(ascending=False)

Unnamed: 0,home_away
home_away,1.000000
travel_distance_by_team_home_away,0.807283
home_away_by_team_travel_distance,0.508346
travel_distance,0.390108
home_away_by_team_day_of_week,0.323630
...,...
advanced_ortg_by_team_home_away,-0.217181
offensive_four_factors_orb_pct_by_team_home_away,-0.227798
defensive_four_factors_drb_pct_by_team_home_away,-0.253229
offensive_four_factors_ft_per_fga_by_team_home_away,-0.295397


In [23]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

# define features and targets
features = df.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets = df[['team_score', 'opp_score']]

# stratified split by team
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(features, groups=df['team']))

X_train = features.iloc[train_idx]
X_test = features.iloc[test_idx]
y_train = targets.iloc[train_idx]
y_test = targets.iloc[test_idx]

# train model
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

In [24]:
# get feature importances for team_score prediction
team_score_importance = model.estimators_[0].feature_importances_

# get feature importances for opp_score prediction
opp_score_importance = model.estimators_[1].feature_importances_

# match to feature names
feature_names = features.columns

# build DataFrame
importances_df = (
    pd.DataFrame({
        'feature': feature_names,
        'team_score_importance': team_score_importance,
        'opp_score_importance': opp_score_importance
    })
    .sort_values(by='team_score_importance', ascending=False)
)

importances_df.head(20)  # or whatever number you want to view

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.789104,0.00186
27,team_vs_opp_homeaway_median_score,0.071171,0.002426
11,advanced_pace,0.063676,0.08003
14,advanced_ts_pct,0.008506,0.001539
26,team_vs_opp_median_score,0.004471,0.001566
19,offensive_four_factors_tov_pct,0.003621,0.004165
4,team_3p_pct,0.002133,0.000915
79,defensive_four_factors_drb_pct_by_team_day_of_...,0.002,0.000753
66,advanced_pace_by_team_day_of_week,0.001366,0.002547
68,advanced_3par_by_team_day_of_week,0.001342,0.000196


In [25]:
# show all or more of the ranked features
importances_df.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.789104,0.00186
27,team_vs_opp_homeaway_median_score,0.071171,0.002426
11,advanced_pace,0.063676,0.08003
14,advanced_ts_pct,0.008506,0.001539
26,team_vs_opp_median_score,0.004471,0.001566
19,offensive_four_factors_tov_pct,0.003621,0.004165
4,team_3p_pct,0.002133,0.000915
79,defensive_four_factors_drb_pct_by_team_day_of_...,0.002,0.000753
66,advanced_pace_by_team_day_of_week,0.001366,0.002547
68,advanced_3par_by_team_day_of_week,0.001342,0.000196


In [27]:
# show all or more of the ranked features
importances_df.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.001143,0.842567
11,advanced_pace,0.063676,0.08003
6,opponent_fg_pct,0.000596,0.008254
19,offensive_four_factors_tov_pct,0.003621,0.004165
21,defensive_four_factors_efg_pct,0.000796,0.004106
66,advanced_pace_by_team_day_of_week,0.001366,0.002547
27,team_vs_opp_homeaway_median_score,0.071171,0.002426
24,defensive_four_factors_ft_per_fga,0.000916,0.002101
9,advanced_ortg,0.789104,0.00186
17,advanced_stl_pct,0.001242,0.001621


In [28]:
from google.colab import files
df.to_csv("wnba_model_ready_v1.csv", index=False)
files.download("wnba_model_ready_v1.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
from google.colab import files
# save feature importances
importances_df.to_csv("league_importances.csv", index=False)
files.download("league_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [40]:
# Create ATL data subset
df_atl = df[df['team'] == 'ATL'].copy()

# define features and target
features_atl = df_atl.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_atl = df_atl[['team_score', 'opp_score']]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(features_atl, targets_atl, test_size=0.2, random_state=42)

model_atl = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_atl.fit(X_train, y_train)

In [42]:
importances_atl = pd.DataFrame({
    'feature': features_atl.columns,
    'team_score_importance': model_atl.estimators_[0].feature_importances_,
    'opp_score_importance': model_atl.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

importances_atl.head(20)

Unnamed: 0,feature,team_score_importance,opp_score_importance
14,advanced_ts_pct,0.450337,0.004531
9,advanced_ortg,0.251102,0.012536
19,offensive_four_factors_tov_pct,0.05352,0.020985
16,advanced_ast_pct,0.018078,0.007542
20,offensive_four_factors_orb_pct,0.017525,0.011316
27,team_vs_opp_homeaway_median_score,0.017474,0.003593
7,opponent_3p_pct,0.017239,0.002882
10,advanced_drtg,0.013249,0.641198
8,opponent_ft_pct,0.012864,0.001647
4,team_3p_pct,0.012468,0.003753


In [45]:
# show all or more of the ranked features
importances_atl.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
14,advanced_ts_pct,0.450337,0.004531
9,advanced_ortg,0.251102,0.012536
19,offensive_four_factors_tov_pct,0.05352,0.020985
16,advanced_ast_pct,0.018078,0.007542
20,offensive_four_factors_orb_pct,0.017525,0.011316
27,team_vs_opp_homeaway_median_score,0.017474,0.003593
7,opponent_3p_pct,0.017239,0.002882
10,advanced_drtg,0.013249,0.641198
8,opponent_ft_pct,0.012864,0.001647
4,team_3p_pct,0.012468,0.003753


In [46]:
# show all or more of the ranked features
importances_atl.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.013249,0.641198
11,advanced_pace,0.00111,0.048323
6,opponent_fg_pct,0.003872,0.046115
17,advanced_stl_pct,0.009164,0.03374
0,month,0.00036,0.026202
19,offensive_four_factors_tov_pct,0.05352,0.020985
9,advanced_ortg,0.251102,0.012536
20,offensive_four_factors_orb_pct,0.017525,0.011316
12,advanced_ftr,0.010029,0.011276
21,defensive_four_factors_efg_pct,0.008195,0.009751


In [47]:
from google.colab import files
# save feature importances
importances_df.to_csv("atl_importances.csv", index=False)
files.download("atl_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>