In [2]:
# Clone your GitHub repo (you’ll be prompted to authorize if it's private)
!git clone https://github.com/colterwood/LHL-final-final-project.git

Cloning into 'LHL-final-final-project'...
remote: Enumerating objects: 115, done.[K
remote: Counting objects: 100% (115/115), done.[K
remote: Compressing objects: 100% (104/104), done.[K
remote: Total 115 (delta 61), reused 19 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (115/115), 1.24 MiB | 2.57 MiB/s, done.
Resolving deltas: 100% (61/61), done.


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

In [4]:
# load the player game logs CSV from the data folder
df = pd.read_csv("LHL-final-final-project/data/2024_merged_gamelogs.csv")


# preview
df.head()

Unnamed: 0,team,g_num,month,day,home_away,opp,win_loss,team_score,opp_score,team_fg,...,day_of_week_by_team_travel_distance,team_vs_opp_median_score_by_team_travel_distance,team_vs_opp_homeaway_median_score_by_team_travel_distance,team_home_or_away_median_score_by_team_travel_distance,team_home_or_away_median_allowed_by_team_travel_distance,team_day_median_score_by_team_travel_distance,team_day_median_allowed_by_team_travel_distance,travel_distance_by_team_travel_distance,median_score_for_by_team_travel_distance,median_score_against_by_team_travel_distance
0,ATL,1,5,15,2,LAS,1,92,81,34,...,3.0,75.0,70.0,78.5,79.5,76.0,80.0,0.0,77.0,79.5
1,ATL,2,5,18,2,PHO,2,85,88,27,...,2.0,76.0,76.0,78.0,80.5,76.0,80.0,1.0,77.5,77.5
2,ATL,3,5,21,1,DAL,1,83,78,30,...,4.0,81.0,75.5,78.0,80.5,73.0,78.0,3.0,81.0,85.0
3,ATL,4,5,26,1,MIN,2,79,92,31,...,3.0,75.0,70.0,78.5,79.5,76.0,80.0,0.0,77.0,79.5
4,ATL,5,5,29,2,WAS,1,73,67,26,...,4.0,75.0,76.5,78.0,80.5,76.0,80.0,2.0,78.0,80.0


In [5]:
# features (drop identifiers and targets)
features = df.drop(columns=['team_score', 'opp_score', 'team', 'opp'])

# targets
targets = df[['team_score', 'opp_score']]

# group-aware split
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(features, groups=df['team']))

X_train = features.iloc[train_idx]
X_test = features.iloc[test_idx]
y_train = targets.iloc[train_idx]
y_test = targets.iloc[test_idx]

# model
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

In [6]:
# get feature importances for team_score prediction
team_score_importance = model.estimators_[0].feature_importances_

# get feature importances for opp_score prediction
opp_score_importance = model.estimators_[1].feature_importances_

# match to feature names
feature_names = features.columns

# build DataFrame
importances_df = (
    pd.DataFrame({
        'feature': feature_names,
        'team_score_importance': team_score_importance,
        'opp_score_importance': opp_score_importance
    })
    .sort_values(by='team_score_importance', ascending=False)
)

importances_df.head(20)  # or whatever number you want to view

Unnamed: 0,feature,team_score_importance,opp_score_importance
37,advanced_ortg,0.771811,0.000807
57,team_vs_opp_homeaway_median_score,0.063060,0.001122
39,advanced_pace,0.051088,0.068585
5,team_fg,0.028227,0.001548
42,advanced_ts_pct,0.005816,0.000612
...,...,...,...
4,win_loss,0.000020,0.000007
65,month_by_team_home_away,0.000015,0.000017
132,win_loss_by_team_day_of_week,0.000008,0.000044
196,win_loss_by_team_travel_distance,0.000003,0.000004


In [7]:
# show all or more of the ranked features
importances_df.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
37,advanced_ortg,0.771811,0.000807
57,team_vs_opp_homeaway_median_score,0.06306,0.001122
39,advanced_pace,0.051088,0.068585
5,team_fg,0.028227,0.001548
42,advanced_ts_pct,0.005816,0.000612
56,team_vs_opp_median_score,0.004108,0.000472
16,team_ast,0.003441,0.000471
19,team_tov,0.002953,0.001352
7,team_fg_pct,0.002157,0.000643
33,opponent_stl,0.001959,0.001807


In [8]:
# show all or more of the ranked features
importances_df.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
38,advanced_drtg,0.000367,0.78682
39,advanced_pace,0.051088,0.068585
21,opponent_fg,0.001408,0.067323
23,opponent_fg_pct,0.000315,0.007825
51,defensive_four_factors_efg_pct,0.000313,0.002968
27,opponent_ft,0.000547,0.002232
17,team_stl,0.001365,0.001874
33,opponent_stl,0.001959,0.001807
5,team_fg,0.028227,0.001548
19,team_tov,0.002953,0.001352


In [9]:
# define substrings that identify counting stats
counting_keywords = ['fg', 'fga', '3p', '3pa', 'ft', 'fta', 'orb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']

# drop columns where the name contains any of those keywords but NOT pct or rate stats
counting_cols = [col for col in df.columns if any(k in col for k in counting_keywords)
                 and not col.endswith('_pct')
                 and 'four_factors' not in col
                 and 'advanced' not in col
                 and col not in ['team_score', 'opp_score']]

In [10]:
print(counting_cols)

['team_fg', 'team_fga', 'team_3p', 'team_3pa', 'team_ft', 'team_fta', 'team_orb', 'team_trb', 'team_ast', 'team_stl', 'team_blk', 'team_tov', 'team_pf', 'opponent_fg', 'opponent_fga', 'opponent_3p', 'opponent_3pa', 'opponent_ft', 'opponent_fta', 'opponent_orb', 'opponent_trb', 'opponent_ast', 'opponent_stl', 'opponent_blk', 'opponent_tov', 'opponent_pf', 'team_fg_by_team_home_away', 'team_fga_by_team_home_away', 'team_fg_pct_by_team_home_away', 'team_3p_by_team_home_away', 'team_3pa_by_team_home_away', 'team_3p_pct_by_team_home_away', 'team_ft_by_team_home_away', 'team_fta_by_team_home_away', 'team_ft_pct_by_team_home_away', 'team_orb_by_team_home_away', 'team_trb_by_team_home_away', 'team_ast_by_team_home_away', 'team_stl_by_team_home_away', 'team_blk_by_team_home_away', 'team_tov_by_team_home_away', 'team_pf_by_team_home_away', 'opponent_fg_by_team_home_away', 'opponent_fga_by_team_home_away', 'opponent_fg_pct_by_team_home_away', 'opponent_3p_by_team_home_away', 'opponent_3pa_by_team

In [11]:
df = df.drop(columns=counting_cols)

In [13]:
# compute correlation matrix
corr_matrix = X.corr().abs()

# keep upper triangle only (to avoid duplicate pairs)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# flatten and filter for high correlation pairs
high_corr_pairs = (
    upper.stack()
    .reset_index()
    .rename(columns={'level_0': 'feature_1', 'level_1': 'feature_2', 0: 'correlation'})
    .query('correlation > 0.9')
    .sort_values(by='correlation', ascending=False)
)

high_corr_pairs

Unnamed: 0,feature_1,feature_2,correlation
436,home_away,home_away_by_team_home_away,1.0
3823,team_home_or_away_median_score,team_home_or_away_median_score_by_team_home_away,1.0
3543,day_of_week,day_of_week_by_team_day_of_week,1.0
4058,team_day_median_score,team_day_median_score_by_team_day_of_week,1.0
4484,median_score_against,median_score_against_by_team_travel_distance,1.0
4387,median_score_for,median_score_for_by_team_travel_distance,1.0
4289,travel_distance,travel_distance_by_team_travel_distance,1.0
3925,team_home_or_away_median_allowed,team_home_or_away_median_allowed_by_team_home_...,1.0
4158,team_day_median_allowed,team_day_median_allowed_by_team_day_of_week,1.0
0,g_num,month,0.974134


In [14]:
# always keep these no matter what
must_keep = ['advanced_ortg', 'advanced_drtg']

def auto_select_drops(high_corr_df):
    drop_cols = []

    for _, row in high_corr_df.iterrows():
        f1, f2 = row['feature_1'], row['feature_2']

        # skip if either feature is protected
        if f1 in must_keep or f2 in must_keep:
            continue

        # drop the more engineered-looking one
        if '_by_' in f1 or 'median' in f1:
            drop_cols.append(f1)
        elif '_by_' in f2 or 'median' in f2:
            drop_cols.append(f2)
        else:
            # fallback: drop longer one
            drop_cols.append(f1 if len(f1) > len(f2) else f2)

    return list(set(drop_cols))

In [16]:
df = df.drop(columns=auto_drop_cols, errors='ignore')

In [17]:
# recompute VIFs
X = df.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
X = X.select_dtypes(include='number')

vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_data = vif_data.sort_values(by='VIF', ascending=False)

In [18]:
high_corr_pairs.shape

(42, 3)

In [19]:
auto_drop_cols = auto_select_drops(high_corr_pairs)
df = df.drop(columns=auto_drop_cols, errors='ignore')

In [20]:
len(auto_drop_cols)

25

In [21]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# rebuild X with numeric-only features (excluding targets and IDs)
X = df.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
X = X.select_dtypes(include='number')

# calculate VIFs
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_data = vif_data.sort_values(by='VIF', ascending=False)

vif_data

Unnamed: 0,feature,VIF
47,team_vs_opp_median_score_by_team_home_away,9.100268e+09
32,advanced_pace_by_team_home_away,7.857039e+09
50,median_score_for_by_team_home_away,3.839980e+09
48,team_home_or_away_median_score_by_team_home_away,1.523043e+09
34,advanced_trb_pct_by_team_home_away,6.401607e+08
...,...,...
9,opponent_ft_pct,1.952047e+00
16,advanced_ast_pct,1.808095e+00
18,advanced_blk_pct,1.647392e+00
1,day,1.393211e+00


In [27]:
must_keep = ['advanced_ortg', 'advanced_drtg']

# filter out protected features from VIF drop list
high_vif_cols = [
    col for col in vif_data[vif_data['VIF'] > 10]['feature']
    if col not in must_keep
]

df = df.drop(columns=high_vif_cols, errors='ignore')

In [23]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

# define features and targets
features = df.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets = df[['team_score', 'opp_score']]

# stratified split by team
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(features, groups=df['team']))

X_train = features.iloc[train_idx]
X_test = features.iloc[test_idx]
y_train = targets.iloc[train_idx]
y_test = targets.iloc[test_idx]

# train model
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

In [24]:
# get feature importances for team_score prediction
team_score_importance = model.estimators_[0].feature_importances_

# get feature importances for opp_score prediction
opp_score_importance = model.estimators_[1].feature_importances_

# match to feature names
feature_names = features.columns

# build DataFrame
importances_df = (
    pd.DataFrame({
        'feature': feature_names,
        'team_score_importance': team_score_importance,
        'opp_score_importance': opp_score_importance
    })
    .sort_values(by='team_score_importance', ascending=False)
)

importances_df.head(20)  # or whatever number you want to view

Unnamed: 0,feature,team_score_importance,opp_score_importance
17,team_vs_opp_homeaway_median_score,0.675823,0.19881
3,team_3p_pct,0.048631,0.01208
7,advanced_pace,0.01578,0.032269
0,g_num,0.015603,0.018247
2,win_loss,0.015579,0.26218
8,advanced_ftr,0.015168,0.033149
16,team_vs_opp_median_score,0.014359,0.01326
5,opponent_3p_pct,0.011167,0.114588
9,advanced_3par,0.010949,0.009987
14,defensive_four_factors_ft_per_fga,0.010545,0.018044


In [25]:
# show all or more of the ranked features
importances_df.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
17,team_vs_opp_homeaway_median_score,0.675823,0.19881
3,team_3p_pct,0.048631,0.01208
7,advanced_pace,0.01578,0.032269
0,g_num,0.015603,0.018247
2,win_loss,0.015579,0.26218
8,advanced_ftr,0.015168,0.033149
16,team_vs_opp_median_score,0.014359,0.01326
5,opponent_3p_pct,0.011167,0.114588
9,advanced_3par,0.010949,0.009987
14,defensive_four_factors_ft_per_fga,0.010545,0.018044


In [26]:
# show all or more of the ranked features
importances_df.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
17,team_vs_opp_homeaway_median_score,0.675823,0.19881
3,team_3p_pct,0.048631,0.01208
7,advanced_pace,0.01578,0.032269
0,g_num,0.015603,0.018247
2,win_loss,0.015579,0.26218
8,advanced_ftr,0.015168,0.033149
16,team_vs_opp_median_score,0.014359,0.01326
5,opponent_3p_pct,0.011167,0.114588
9,advanced_3par,0.010949,0.009987
14,defensive_four_factors_ft_per_fga,0.010545,0.018044
