In [1]:
# Clone your GitHub repo (you’ll be prompted to authorize if it's private)
!git clone https://github.com/colterwood/LHL-final-final-project.git

Cloning into 'LHL-final-final-project'...
remote: Enumerating objects: 131, done.[K
remote: Counting objects: 100% (131/131), done.[K
remote: Compressing objects: 100% (119/119), done.[K
remote: Total 131 (delta 70), reused 23 (delta 7), pack-reused 0 (from 0)[K
Receiving objects: 100% (131/131), 1.33 MiB | 2.28 MiB/s, done.
Resolving deltas: 100% (70/70), done.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split

In [None]:
# load the player game logs CSV from the data folder
df = pd.read_csv("LHL-final-final-project/data/2024_merged_gamelogs.csv")


# preview
df.head()

Unnamed: 0,team,g_num,month,day,home_away,opp,win_loss,team_score,opp_score,team_fg,...,day_of_week_by_team_travel_distance,team_vs_opp_median_score_by_team_travel_distance,team_vs_opp_homeaway_median_score_by_team_travel_distance,team_home_or_away_median_score_by_team_travel_distance,team_home_or_away_median_allowed_by_team_travel_distance,team_day_median_score_by_team_travel_distance,team_day_median_allowed_by_team_travel_distance,travel_distance_by_team_travel_distance,median_score_for_by_team_travel_distance,median_score_against_by_team_travel_distance
0,ATL,1,5,15,2,LAS,1,92,81,34,...,3.0,75.0,70.0,78.5,79.5,76.0,80.0,0.0,77.0,79.5
1,ATL,2,5,18,2,PHO,2,85,88,27,...,2.0,76.0,76.0,78.0,80.5,76.0,80.0,1.0,77.5,77.5
2,ATL,3,5,21,1,DAL,1,83,78,30,...,4.0,81.0,75.5,78.0,80.5,73.0,78.0,3.0,81.0,85.0
3,ATL,4,5,26,1,MIN,2,79,92,31,...,3.0,75.0,70.0,78.5,79.5,76.0,80.0,0.0,77.0,79.5
4,ATL,5,5,29,2,WAS,1,73,67,26,...,4.0,75.0,76.5,78.0,80.5,76.0,80.0,2.0,78.0,80.0


In [None]:
# features (drop identifiers and targets)
features = df.drop(columns=['team_score', 'opp_score', 'team', 'opp'])

# targets
targets = df[['team_score', 'opp_score']]

# group-aware split
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(features, groups=df['team']))

X_train = features.iloc[train_idx]
X_test = features.iloc[test_idx]
y_train = targets.iloc[train_idx]
y_test = targets.iloc[test_idx]

# model
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

In [None]:
# get feature importances for team_score prediction
team_score_importance = model.estimators_[0].feature_importances_

# get feature importances for opp_score prediction
opp_score_importance = model.estimators_[1].feature_importances_

# match to feature names
feature_names = features.columns

# build DataFrame
importances_df = (
    pd.DataFrame({
        'feature': feature_names,
        'team_score_importance': team_score_importance,
        'opp_score_importance': opp_score_importance
    })
    .sort_values(by='team_score_importance', ascending=False)
)

importances_df.head(20)  # or whatever number you want to view

Unnamed: 0,feature,team_score_importance,opp_score_importance
37,advanced_ortg,0.771811,0.000807
57,team_vs_opp_homeaway_median_score,0.06306,0.001122
39,advanced_pace,0.051088,0.068585
5,team_fg,0.028227,0.001548
42,advanced_ts_pct,0.005816,0.000612
56,team_vs_opp_median_score,0.004108,0.000472
16,team_ast,0.003441,0.000471
19,team_tov,0.002953,0.001352
7,team_fg_pct,0.002157,0.000643
33,opponent_stl,0.001959,0.001807


In [None]:
# show all or more of the ranked features
importances_df.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
37,advanced_ortg,0.771811,0.000807
57,team_vs_opp_homeaway_median_score,0.06306,0.001122
39,advanced_pace,0.051088,0.068585
5,team_fg,0.028227,0.001548
42,advanced_ts_pct,0.005816,0.000612
56,team_vs_opp_median_score,0.004108,0.000472
16,team_ast,0.003441,0.000471
19,team_tov,0.002953,0.001352
7,team_fg_pct,0.002157,0.000643
33,opponent_stl,0.001959,0.001807


In [None]:
# show all or more of the ranked features
importances_df.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
38,advanced_drtg,0.000367,0.78682
39,advanced_pace,0.051088,0.068585
21,opponent_fg,0.001408,0.067323
23,opponent_fg_pct,0.000315,0.007825
51,defensive_four_factors_efg_pct,0.000313,0.002968
27,opponent_ft,0.000547,0.002232
17,team_stl,0.001365,0.001874
33,opponent_stl,0.001959,0.001807
5,team_fg,0.028227,0.001548
19,team_tov,0.002953,0.001352


In [None]:
# define substrings that identify counting stats
counting_keywords = ['fg', 'fga', '3p', '3pa', 'ft', 'fta', 'orb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']

# drop columns where the name contains any of those keywords but NOT pct or rate stats
counting_cols = [col for col in df.columns if any(k in col for k in counting_keywords)
                 and not col.endswith('_pct')
                 and 'four_factors' not in col
                 and 'advanced' not in col
                 and col not in ['team_score', 'opp_score']]

In [None]:
print(counting_cols)

['team_fg', 'team_fga', 'team_3p', 'team_3pa', 'team_ft', 'team_fta', 'team_orb', 'team_trb', 'team_ast', 'team_stl', 'team_blk', 'team_tov', 'team_pf', 'opponent_fg', 'opponent_fga', 'opponent_3p', 'opponent_3pa', 'opponent_ft', 'opponent_fta', 'opponent_orb', 'opponent_trb', 'opponent_ast', 'opponent_stl', 'opponent_blk', 'opponent_tov', 'opponent_pf', 'team_fg_by_team_home_away', 'team_fga_by_team_home_away', 'team_fg_pct_by_team_home_away', 'team_3p_by_team_home_away', 'team_3pa_by_team_home_away', 'team_3p_pct_by_team_home_away', 'team_ft_by_team_home_away', 'team_fta_by_team_home_away', 'team_ft_pct_by_team_home_away', 'team_orb_by_team_home_away', 'team_trb_by_team_home_away', 'team_ast_by_team_home_away', 'team_stl_by_team_home_away', 'team_blk_by_team_home_away', 'team_tov_by_team_home_away', 'team_pf_by_team_home_away', 'opponent_fg_by_team_home_away', 'opponent_fga_by_team_home_away', 'opponent_fg_pct_by_team_home_away', 'opponent_3p_by_team_home_away', 'opponent_3pa_by_team

In [None]:
df = df.drop(columns=counting_cols)

In [None]:
# make sure we only use numeric features (exclude targets and IDs)
X = df.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
X = X.select_dtypes(include='number')

# compute correlation matrix
corr_matrix = X.corr().abs()

# keep upper triangle only (to avoid duplicate pairs)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# flatten and filter for high correlation pairs
high_corr_pairs = (
    upper.stack()
    .reset_index()
    .rename(columns={'level_0': 'feature_1', 'level_1': 'feature_2', 0: 'correlation'})
    .query('correlation > 0.9')
    .sort_values(by='correlation', ascending=False)
)

high_corr_pairs

Unnamed: 0,feature_1,feature_2,correlation
436,home_away,home_away_by_team_home_away,1.0
3823,team_home_or_away_median_score,team_home_or_away_median_score_by_team_home_away,1.0
3543,day_of_week,day_of_week_by_team_day_of_week,1.0
4058,team_day_median_score,team_day_median_score_by_team_day_of_week,1.0
4484,median_score_against,median_score_against_by_team_travel_distance,1.0
4387,median_score_for,median_score_for_by_team_travel_distance,1.0
4289,travel_distance,travel_distance_by_team_travel_distance,1.0
3925,team_home_or_away_median_allowed,team_home_or_away_median_allowed_by_team_home_...,1.0
4158,team_day_median_allowed,team_day_median_allowed_by_team_day_of_week,1.0
0,g_num,month,0.974134


In [None]:
# always keep these no matter what
must_keep = ['advanced_ortg', 'advanced_drtg']

def auto_select_drops(high_corr_df):
    drop_cols = []

    for _, row in high_corr_df.iterrows():
        f1, f2 = row['feature_1'], row['feature_2']

        # skip if either feature is protected
        if f1 in must_keep or f2 in must_keep:
            continue

        # drop the more engineered-looking one
        if '_by_' in f1 or 'median' in f1:
            drop_cols.append(f1)
        elif '_by_' in f2 or 'median' in f2:
            drop_cols.append(f2)
        else:
            # fallback: drop longer one
            drop_cols.append(f1 if len(f1) > len(f2) else f2)

    return list(set(drop_cols))

In [None]:
# create drop list
auto_drop_cols = auto_select_drops(high_corr_pairs)

In [None]:
# View suggested drop list
print(auto_drop_cols)

['team_day_median_allowed_by_team_home_away', 'advanced_ts_pct_by_team_travel_distance', 'median_score_against', 'team_day_median_score_by_team_home_away', 'team_day_median_allowed', 'offensive_four_factors_efg_pct', 'month', 'defensive_four_factors_efg_pct', 'offensive_four_factors_ft_per_fga', 'median_score_against_by_team_home_away', 'advanced_ts_pct_by_team_day_of_week', 'travel_distance_by_team_travel_distance', 'advanced_ts_pct', 'team_vs_opp_homeaway_median_score_by_team_home_away', 'day_of_week_by_team_day_of_week', 'team_home_or_away_median_allowed', 'advanced_3par_by_team_home_away', 'advanced_ts_pct_by_team_home_away', 'team_home_or_away_median_allowed_by_team_home_away', 'home_away_by_team_home_away', 'team_day_median_score', 'team_home_or_away_median_allowed_by_team_day_of_week', 'team_home_or_away_median_score', 'median_score_for', 'advanced_drtg_by_team_home_away']


In [None]:
# Manually built drop list
drop_list = [
    'home_away_by_team_home_away',
    'team_home_or_away_median_score_by_team_home_away',
    'day_of_week_by_team_day_of_week',
    'team_day_median_score_by_team_day_of_week',
    'median_score_against_by_team_travel_distance',
    'median_score_for_by_team_travel_distance',
    'travel_distance_by_team_travel_distance',
    'team_home_or_away_median_allowed_by_team_home_away',
    'team_day_median_allowed_by_team_day_of_week',
    'g_num',
    'offensive_four_factors_efg_pct',
    'team_day_median_allowed_by_team_home_away',
    'team_home_or_away_median_allowed_by_team_home_away',
    'team_home_or_away_median_allowed',
    'team_home_or_away_median_allowed_by_team_travel_distance',
    'offensive_four_factors_ft_per_fga',
    'team_day_median_allowed_by_team_home_away',
    'team_home_or_away_median_allowed_by_team_day_of_week',
    'offensive_four_factors_efg_pct_by_team_travel_distance',
    'team_home_or_away_median_score_by_team_home_away',
    'team_vs_opp_homeaway_median_score_by_team_home_away',
    'team_fg_pct',
    'offensive_four_factors_efg_pct_by_team_day_of_week',
    'offensive_four_factors_efg_pct_by_team_home_away'
]

In [None]:
# drop from df
df = df.drop(columns=drop_list, errors='ignore')

In [None]:
# filter to numeric features only (excluding IDs and targets)
X = df.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
X = X.select_dtypes(include='number')

# compute VIFs
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_data = vif_data.sort_values(by='VIF', ascending=False)

vif_data

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,feature,VIF
2,home_away,inf
49,offensive_four_factors_ft_per_fga_by_team_home...,inf
34,month_by_team_home_away,inf
35,day_by_team_home_away,inf
36,win_loss_by_team_home_away,inf
...,...,...
16,advanced_ast_pct,1.842376
18,advanced_blk_pct,1.638229
5,team_ft_pct,1.480301
1,day,1.472835


In [None]:
X.corr()['home_away'].sort_values(ascending=False)

Unnamed: 0,home_away
home_away,1.000000
travel_distance_by_team_home_away,0.807283
home_away_by_team_travel_distance,0.508346
travel_distance,0.390108
home_away_by_team_day_of_week,0.323630
...,...
advanced_ortg_by_team_home_away,-0.217181
offensive_four_factors_orb_pct_by_team_home_away,-0.227798
defensive_four_factors_drb_pct_by_team_home_away,-0.253229
offensive_four_factors_ft_per_fga_by_team_home_away,-0.295397


In [None]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

# define features and targets
features = df.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets = df[['team_score', 'opp_score']]

# stratified split by team
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(features, groups=df['team']))

X_train = features.iloc[train_idx]
X_test = features.iloc[test_idx]
y_train = targets.iloc[train_idx]
y_test = targets.iloc[test_idx]

# train model
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

In [None]:
# get feature importances for team_score prediction
team_score_importance = model.estimators_[0].feature_importances_

# get feature importances for opp_score prediction
opp_score_importance = model.estimators_[1].feature_importances_

# match to feature names
feature_names = features.columns

# build DataFrame
importances_df = (
    pd.DataFrame({
        'feature': feature_names,
        'team_score_importance': team_score_importance,
        'opp_score_importance': opp_score_importance
    })
    .sort_values(by='team_score_importance', ascending=False)
)

importances_df.head(20)  # or whatever number you want to view

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.789104,0.00186
27,team_vs_opp_homeaway_median_score,0.071171,0.002426
11,advanced_pace,0.063676,0.08003
14,advanced_ts_pct,0.008506,0.001539
26,team_vs_opp_median_score,0.004471,0.001566
19,offensive_four_factors_tov_pct,0.003621,0.004165
4,team_3p_pct,0.002133,0.000915
79,defensive_four_factors_drb_pct_by_team_day_of_...,0.002,0.000753
66,advanced_pace_by_team_day_of_week,0.001366,0.002547
68,advanced_3par_by_team_day_of_week,0.001342,0.000196


In [None]:
# show all or more of the ranked features
importances_df.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.789104,0.00186
27,team_vs_opp_homeaway_median_score,0.071171,0.002426
11,advanced_pace,0.063676,0.08003
14,advanced_ts_pct,0.008506,0.001539
26,team_vs_opp_median_score,0.004471,0.001566
19,offensive_four_factors_tov_pct,0.003621,0.004165
4,team_3p_pct,0.002133,0.000915
79,defensive_four_factors_drb_pct_by_team_day_of_...,0.002,0.000753
66,advanced_pace_by_team_day_of_week,0.001366,0.002547
68,advanced_3par_by_team_day_of_week,0.001342,0.000196


In [None]:
# show all or more of the ranked features
importances_df.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.001143,0.842567
11,advanced_pace,0.063676,0.08003
6,opponent_fg_pct,0.000596,0.008254
19,offensive_four_factors_tov_pct,0.003621,0.004165
21,defensive_four_factors_efg_pct,0.000796,0.004106
66,advanced_pace_by_team_day_of_week,0.001366,0.002547
27,team_vs_opp_homeaway_median_score,0.071171,0.002426
24,defensive_four_factors_ft_per_fga,0.000916,0.002101
9,advanced_ortg,0.789104,0.00186
17,advanced_stl_pct,0.001242,0.001621


In [None]:
from google.colab import files
df.to_csv("wnba_model_ready_v1.csv", index=False)
files.download("wnba_model_ready_v1.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files
# save feature importances
importances_df.to_csv("league_importances.csv", index=False)
files.download("league_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
# Create ATL data subset
df_atl = df[df['team'] == 'ATL'].copy()

# define features and target
features_atl = df_atl.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_atl = df_atl[['team_score', 'opp_score']]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(features_atl, targets_atl, test_size=0.2, random_state=42)

model_atl = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_atl.fit(X_train, y_train)

In [13]:
importances_atl = pd.DataFrame({
    'feature': features_atl.columns,
    'team_score_importance': model_atl.estimators_[0].feature_importances_,
    'opp_score_importance': model_atl.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

importances_atl.head(20)

Unnamed: 0,feature,team_score_importance,opp_score_importance
14,advanced_ts_pct,0.450337,0.004531
9,advanced_ortg,0.251102,0.012536
19,offensive_four_factors_tov_pct,0.05352,0.020985
16,advanced_ast_pct,0.018078,0.007542
20,offensive_four_factors_orb_pct,0.017525,0.011316
27,team_vs_opp_homeaway_median_score,0.017474,0.003593
7,opponent_3p_pct,0.017239,0.002882
10,advanced_drtg,0.013249,0.641198
8,opponent_ft_pct,0.012864,0.001647
4,team_3p_pct,0.012468,0.003753


In [None]:
# show all or more of the ranked features
importances_atl.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
14,advanced_ts_pct,0.450337,0.004531
9,advanced_ortg,0.251102,0.012536
19,offensive_four_factors_tov_pct,0.05352,0.020985
16,advanced_ast_pct,0.018078,0.007542
20,offensive_four_factors_orb_pct,0.017525,0.011316
27,team_vs_opp_homeaway_median_score,0.017474,0.003593
7,opponent_3p_pct,0.017239,0.002882
10,advanced_drtg,0.013249,0.641198
8,opponent_ft_pct,0.012864,0.001647
4,team_3p_pct,0.012468,0.003753


In [None]:
# show all or more of the ranked features
importances_atl.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.013249,0.641198
11,advanced_pace,0.00111,0.048323
6,opponent_fg_pct,0.003872,0.046115
17,advanced_stl_pct,0.009164,0.03374
0,month,0.00036,0.026202
19,offensive_four_factors_tov_pct,0.05352,0.020985
9,advanced_ortg,0.251102,0.012536
20,offensive_four_factors_orb_pct,0.017525,0.011316
12,advanced_ftr,0.010029,0.011276
21,defensive_four_factors_efg_pct,0.008195,0.009751


In [14]:
from google.colab import files
# save feature importances
importances_atl.to_csv("atl_importances.csv", index=False)
files.download("atl_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
# load the player game logs CSV from the data folder
df = pd.read_csv("LHL-final-final-project/data/wnba_model_ready_v1.csv")


# preview
df.head()

Unnamed: 0,team,month,day,home_away,opp,win_loss,team_score,opp_score,team_3p_pct,team_ft_pct,...,defensive_four_factors_efg_pct_by_team_travel_distance,defensive_four_factors_tov_pct_by_team_travel_distance,defensive_four_factors_drb_pct_by_team_travel_distance,defensive_four_factors_ft_per_fga_by_team_travel_distance,day_of_week_by_team_travel_distance,team_vs_opp_median_score_by_team_travel_distance,team_vs_opp_homeaway_median_score_by_team_travel_distance,team_home_or_away_median_score_by_team_travel_distance,team_day_median_score_by_team_travel_distance,team_day_median_allowed_by_team_travel_distance
0,ATL,5,15,2,LAS,1,92,81,0.471,0.842,...,0.473,14.05,77.6,0.1815,3.0,75.0,70.0,78.5,76.0,80.0
1,ATL,5,18,2,PHO,2,85,88,0.435,0.778,...,0.464,12.2,79.7,0.305,2.0,76.0,76.0,78.0,76.0,80.0
2,ATL,5,21,1,DAL,1,83,78,0.318,0.727,...,0.46,11.7,71.8,0.227,4.0,81.0,75.5,78.0,73.0,78.0
3,ATL,5,26,1,MIN,2,79,92,0.273,0.786,...,0.473,14.05,77.6,0.1815,3.0,75.0,70.0,78.5,76.0,80.0
4,ATL,5,29,2,WAS,1,73,67,0.286,0.714,...,0.5,14.4,79.5,0.2,4.0,75.0,76.5,78.0,76.0,80.0


In [7]:
# Create CHI data subset
df_chi = df[df['team'] == 'CHI'].copy()

# define features and target
features_chi = df_chi.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_chi = df_chi[['team_score', 'opp_score']]

# Train model
X_train, X_test, y_train, y_test = train_test_split(features_chi, targets_chi, test_size=0.2, random_state=42)

model_chi = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_chi.fit(X_train, y_train)


importances_chi = pd.DataFrame({
    'feature': features_chi.columns,
    'team_score_importance': model_chi.estimators_[0].feature_importances_,
    'opp_score_importance': model_chi.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

In [8]:
# show all or more of the ranked features
importances_chi.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.499736,0.002854
27,team_vs_opp_homeaway_median_score,0.136476,0.005037
11,advanced_pace,0.093623,0.003157
3,win_loss,0.059917,6.1e-05
14,advanced_ts_pct,0.041564,0.004965
15,advanced_trb_pct,0.017154,0.003654
22,defensive_four_factors_tov_pct,0.017074,0.001474
12,advanced_ftr,0.011171,0.002678
23,defensive_four_factors_drb_pct,0.010997,0.003666
10,advanced_drtg,0.010501,0.722572


In [9]:
# show all or more of the ranked features
importances_chi.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.010501,0.722572
7,opponent_3p_pct,0.005086,0.039947
1,day,0.00161,0.032701
21,defensive_four_factors_efg_pct,0.002858,0.021644
16,advanced_ast_pct,0.006288,0.020638
13,advanced_3par,0.005646,0.017185
6,opponent_fg_pct,0.003916,0.010219
4,team_3p_pct,0.001648,0.009573
26,team_vs_opp_median_score,0.001237,0.008241
24,defensive_four_factors_ft_per_fga,0.004065,0.007819


In [10]:
from google.colab import files
# save feature importances
importances_chi.to_csv("chi_importances.csv", index=False)
files.download("chi_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
# Create CON data subset
df_con = df[df['team'] == 'CON'].copy()

# define features and target
features_con = df_con.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_con = df_con[['team_score', 'opp_score']]

# Train model
X_train, X_test, y_train, y_test = train_test_split(features_con, targets_con, test_size=0.2, random_state=42)

model_con = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_con.fit(X_train, y_train)

importances_con = pd.DataFrame({
    'feature': features_con.columns,
    'team_score_importance': model_con.estimators_[0].feature_importances_,
    'opp_score_importance': model_con.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

In [16]:
# show all or more of the ranked features
importances_con.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.655248,0.005863
14,advanced_ts_pct,0.081441,0.005844
27,team_vs_opp_homeaway_median_score,0.061179,0.005091
11,advanced_pace,0.021311,0.054595
24,defensive_four_factors_ft_per_fga,0.017102,0.00534
15,advanced_trb_pct,0.011234,0.017871
26,team_vs_opp_median_score,0.00999,0.002055
16,advanced_ast_pct,0.008725,0.01027
23,defensive_four_factors_drb_pct,0.008,0.005196
5,team_ft_pct,0.006828,0.009705


In [17]:
# show all or more of the ranked features
importances_con.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.003325,0.416197
6,opponent_fg_pct,0.002812,0.203576
21,defensive_four_factors_efg_pct,0.001304,0.134012
11,advanced_pace,0.021311,0.054595
1,day,0.005939,0.021377
15,advanced_trb_pct,0.011234,0.017871
16,advanced_ast_pct,0.008725,0.01027
20,offensive_four_factors_orb_pct,0.006044,0.009749
5,team_ft_pct,0.006828,0.009705
7,opponent_3p_pct,0.00419,0.009004


In [18]:
from google.colab import files
# save feature importances
importances_con.to_csv("con_importances.csv", index=False)
files.download("con_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
# Create DAL data subset
df_dal = df[df['team'] == 'DAL'].copy()

# define features and target
features_dal = df_dal.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_dal = df_dal[['team_score', 'opp_score']]

# Train model
X_train, X_test, y_train, y_test = train_test_split(features_dal, targets_dal, test_size=0.2, random_state=42)

model_dal = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_dal.fit(X_train, y_train)

importances_dal = pd.DataFrame({
    'feature': features_dal.columns,
    'team_score_importance': model_dal.estimators_[0].feature_importances_,
    'opp_score_importance': model_dal.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

In [20]:
# show all or more of the ranked features
importances_dal.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.578885,0.008019
14,advanced_ts_pct,0.16018,0.005643
27,team_vs_opp_homeaway_median_score,0.097947,0.007748
15,advanced_trb_pct,0.013723,0.009715
26,team_vs_opp_median_score,0.010418,0.010628
10,advanced_drtg,0.009756,0.568623
19,offensive_four_factors_tov_pct,0.00936,0.001126
7,opponent_3p_pct,0.00781,0.052607
20,offensive_four_factors_orb_pct,0.007519,0.010718
21,defensive_four_factors_efg_pct,0.006993,0.13144


In [21]:
# show all or more of the ranked features
importances_dal.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.009756,0.568623
21,defensive_four_factors_efg_pct,0.006993,0.13144
7,opponent_3p_pct,0.00781,0.052607
13,advanced_3par,0.003207,0.021483
8,opponent_ft_pct,0.00092,0.014759
12,advanced_ftr,0.000823,0.014427
6,opponent_fg_pct,0.002432,0.011654
20,offensive_four_factors_orb_pct,0.007519,0.010718
26,team_vs_opp_median_score,0.010418,0.010628
18,advanced_blk_pct,0.006549,0.009818


In [22]:
from google.colab import files
# save feature importances
importances_dal.to_csv("dal_importances.csv", index=False)
files.download("dal_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
# Create IND data subset
df_ind = df[df['team'] == 'IND'].copy()

# define features and target
features_ind = df_ind.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_ind = df_ind[['team_score', 'opp_score']]

# Train model
X_train, X_test, y_train, y_test = train_test_split(features_ind, targets_ind, test_size=0.2, random_state=42)

model_ind = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_ind.fit(X_train, y_train)

importances_ind = pd.DataFrame({
    'feature': features_ind.columns,
    'team_score_importance': model_ind.estimators_[0].feature_importances_,
    'opp_score_importance': model_ind.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

In [24]:
# show all or more of the ranked features
importances_ind.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.397738,0.007333
27,team_vs_opp_homeaway_median_score,0.274837,0.004458
14,advanced_ts_pct,0.092523,0.013095
26,team_vs_opp_median_score,0.040816,0.000848
23,defensive_four_factors_drb_pct,0.028225,0.003855
4,team_3p_pct,0.013122,0.011997
0,month,0.011392,0.005623
19,offensive_four_factors_tov_pct,0.010079,0.031164
15,advanced_trb_pct,0.007675,0.002398
12,advanced_ftr,0.007523,0.002869


In [25]:
# show all or more of the ranked features
importances_ind.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.007371,0.682746
6,opponent_fg_pct,0.006026,0.041293
19,offensive_four_factors_tov_pct,0.010079,0.031164
21,defensive_four_factors_efg_pct,0.004361,0.025362
3,win_loss,0.001903,0.023821
24,defensive_four_factors_ft_per_fga,0.002402,0.019191
11,advanced_pace,0.00546,0.017108
14,advanced_ts_pct,0.092523,0.013095
7,opponent_3p_pct,0.004772,0.012133
4,team_3p_pct,0.013122,0.011997


In [26]:
from google.colab import files
# save feature importances
importances_ind.to_csv("ind_importances.csv", index=False)
files.download("ind_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
# Create LAS data subset
df_las = df[df['team'] == 'LAS'].copy()

# define features and target
features_las = df_las.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_las = df_las[['team_score', 'opp_score']]

# Train model
X_train, X_test, y_train, y_test = train_test_split(features_las, targets_las, test_size=0.2, random_state=42)

model_las = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_las.fit(X_train, y_train)

importances_las = pd.DataFrame({
    'feature': features_las.columns,
    'team_score_importance': model_las.estimators_[0].feature_importances_,
    'opp_score_importance': model_las.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

In [30]:
# show all or more of the ranked features
importances_las.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.683922,0.040604
27,team_vs_opp_homeaway_median_score,0.075575,0.014754
14,advanced_ts_pct,0.067601,0.026025
1,day,0.016163,0.008221
21,defensive_four_factors_efg_pct,0.015775,0.038267
6,opponent_fg_pct,0.012137,0.050634
22,defensive_four_factors_tov_pct,0.01081,0.038331
17,advanced_stl_pct,0.010647,0.002236
4,team_3p_pct,0.008335,0.008338
11,advanced_pace,0.006588,0.039459


In [31]:
# show all or more of the ranked features
importances_las.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.002713,0.525805
6,opponent_fg_pct,0.012137,0.050634
24,defensive_four_factors_ft_per_fga,0.002251,0.050494
9,advanced_ortg,0.683922,0.040604
11,advanced_pace,0.006588,0.039459
22,defensive_four_factors_tov_pct,0.01081,0.038331
21,defensive_four_factors_efg_pct,0.015775,0.038267
15,advanced_trb_pct,0.005157,0.03103
14,advanced_ts_pct,0.067601,0.026025
27,team_vs_opp_homeaway_median_score,0.075575,0.014754


In [32]:
from google.colab import files
# save feature importances
importances_las.to_csv("las_importances.csv", index=False)
files.download("las_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# Create MIN data subset
df_min = df[df['team'] == 'MIN'].copy()

# define features and target
features_min = df_min.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_min = df_min[['team_score', 'opp_score']]

# Train model
X_train, X_test, y_train, y_test = train_test_split(features_min, targets_min, test_size=0.2, random_state=42)

model_min = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_min.fit(X_train, y_train)

importances_min = pd.DataFrame({
    'feature': features_min.columns,
    'team_score_importance': model_min.estimators_[0].feature_importances_,
    'opp_score_importance': model_min.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

In [34]:
# show top features by team_score importance
importances_min.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
14,advanced_ts_pct,0.470444,0.003506
9,advanced_ortg,0.201857,0.009382
4,team_3p_pct,0.04212,0.001983
24,defensive_four_factors_ft_per_fga,0.034729,0.019637
27,team_vs_opp_homeaway_median_score,0.033483,0.015486
20,offensive_four_factors_orb_pct,0.016912,0.003647
5,team_ft_pct,0.014544,0.002624
12,advanced_ftr,0.013434,0.036434
13,advanced_3par,0.011331,0.020486
26,team_vs_opp_median_score,0.011133,0.011577


In [35]:
# show top features by opp_score importance
importances_min.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.002366,0.517255
6,opponent_fg_pct,0.001822,0.098068
12,advanced_ftr,0.013434,0.036434
22,defensive_four_factors_tov_pct,0.001984,0.033168
11,advanced_pace,0.006016,0.032831
21,defensive_four_factors_efg_pct,0.002102,0.031669
13,advanced_3par,0.011331,0.020486
24,defensive_four_factors_ft_per_fga,0.034729,0.019637
27,team_vs_opp_homeaway_median_score,0.033483,0.015486
66,advanced_pace_by_team_day_of_week,0.002089,0.014053


In [36]:
from google.colab import files
# save feature importances
importances_min.to_csv("min_importances.csv", index=False)
files.download("min_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [37]:
# Create NYL data subset
df_nyl = df[df['team'] == 'NYL'].copy()

# define features and target
features_nyl = df_nyl.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_nyl = df_nyl[['team_score', 'opp_score']]

# Train model
X_train, X_test, y_train, y_test = train_test_split(features_nyl, targets_nyl, test_size=0.2, random_state=42)

model_nyl = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_nyl.fit(X_train, y_train)

importances_nyl = pd.DataFrame({
    'feature': features_nyl.columns,
    'team_score_importance': model_nyl.estimators_[0].feature_importances_,
    'opp_score_importance': model_nyl.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

In [38]:
# show top features by team_score importance
importances_nyl.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.627155,0.004271
14,advanced_ts_pct,0.17783,0.003746
11,advanced_pace,0.04534,0.01236
7,opponent_3p_pct,0.026923,0.00432
27,team_vs_opp_homeaway_median_score,0.013177,0.00582
19,offensive_four_factors_tov_pct,0.010453,0.003125
4,team_3p_pct,0.007879,0.003722
26,team_vs_opp_median_score,0.007485,0.002103
21,defensive_four_factors_efg_pct,0.007314,0.031292
10,advanced_drtg,0.005522,0.734214


In [39]:
# show top features by opp_score importance
importances_nyl.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.005522,0.734214
16,advanced_ast_pct,0.001397,0.037709
21,defensive_four_factors_efg_pct,0.007314,0.031292
6,opponent_fg_pct,0.00247,0.02244
17,advanced_stl_pct,0.001777,0.015867
11,advanced_pace,0.04534,0.01236
0,month,0.000605,0.008794
22,defensive_four_factors_tov_pct,0.003318,0.007187
18,advanced_blk_pct,0.001217,0.006582
12,advanced_ftr,0.000906,0.006128


In [40]:
from google.colab import files
# save feature importances
importances_nyl.to_csv("nyl_importances.csv", index=False)
files.download("nyl_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [41]:
# Create PHO data subset
df_pho = df[df['team'] == 'PHO'].copy()

# define features and target
features_pho = df_pho.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_pho = df_pho[['team_score', 'opp_score']]

# Train model
X_train, X_test, y_train, y_test = train_test_split(features_pho, targets_pho, test_size=0.2, random_state=42)

model_pho = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_pho.fit(X_train, y_train)

importances_pho = pd.DataFrame({
    'feature': features_pho.columns,
    'team_score_importance': model_pho.estimators_[0].feature_importances_,
    'opp_score_importance': model_pho.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

In [42]:
# show top features by team_score importance
importances_pho.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.435872,0.003157
14,advanced_ts_pct,0.167835,0.001334
3,win_loss,0.062581,0.000619
27,team_vs_opp_homeaway_median_score,0.039856,0.002922
21,defensive_four_factors_efg_pct,0.026688,0.076908
6,opponent_fg_pct,0.025944,0.520782
4,team_3p_pct,0.022989,0.004575
26,team_vs_opp_median_score,0.022256,0.000562
10,advanced_drtg,0.014405,0.184516
12,advanced_ftr,0.013675,0.006463


In [43]:
# show top features by opp_score importance
importances_pho.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
6,opponent_fg_pct,0.025944,0.520782
10,advanced_drtg,0.014405,0.184516
21,defensive_four_factors_efg_pct,0.026688,0.076908
11,advanced_pace,0.010951,0.021577
24,defensive_four_factors_ft_per_fga,0.006639,0.019291
16,advanced_ast_pct,0.003166,0.015744
8,opponent_ft_pct,0.012878,0.014508
13,advanced_3par,0.003779,0.010586
19,offensive_four_factors_tov_pct,0.012818,0.009408
5,team_ft_pct,0.001368,0.009238


In [44]:
from google.colab import files
# save feature importances
importances_pho.to_csv("pho_importances.csv", index=False)
files.download("pho_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [45]:
# Create SEA data subset
df_sea = df[df['team'] == 'SEA'].copy()

# define features and target
features_sea = df_sea.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_sea = df_sea[['team_score', 'opp_score']]

# Train model
X_train, X_test, y_train, y_test = train_test_split(features_sea, targets_sea, test_size=0.2, random_state=42)

model_sea = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_sea.fit(X_train, y_train)

importances_sea = pd.DataFrame({
    'feature': features_sea.columns,
    'team_score_importance': model_sea.estimators_[0].feature_importances_,
    'opp_score_importance': model_sea.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

In [46]:
# show top features by team_score importance
importances_sea.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.493531,0.00598
14,advanced_ts_pct,0.152638,0.00148
5,team_ft_pct,0.062982,0.00361
11,advanced_pace,0.041423,0.009078
27,team_vs_opp_homeaway_median_score,0.021832,0.00088
12,advanced_ftr,0.01992,0.008709
4,team_3p_pct,0.0179,0.004929
20,offensive_four_factors_orb_pct,0.015178,0.001418
0,month,0.012406,0.000329
26,team_vs_opp_median_score,0.011711,0.008894


In [47]:
# show top features by opp_score importance
importances_sea.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.003021,0.593749
21,defensive_four_factors_efg_pct,0.002185,0.111957
6,opponent_fg_pct,0.002411,0.054638
16,advanced_ast_pct,0.001386,0.034779
17,advanced_stl_pct,0.007166,0.017622
1,day,0.002421,0.017465
19,offensive_four_factors_tov_pct,0.006191,0.011953
7,opponent_3p_pct,0.006269,0.01017
22,defensive_four_factors_tov_pct,0.010951,0.009976
11,advanced_pace,0.041423,0.009078


In [48]:
from google.colab import files
# save feature importances
importances_sea.to_csv("sea_importances.csv", index=False)
files.download("sea_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [49]:
# Create WAS data subset
df_was = df[df['team'] == 'WAS'].copy()

# define features and target
features_was = df_was.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_was = df_was[['team_score', 'opp_score']]

# Train model
X_train, X_test, y_train, y_test = train_test_split(features_was, targets_was, test_size=0.2, random_state=42)

model_was = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_was.fit(X_train, y_train)

importances_was = pd.DataFrame({
    'feature': features_was.columns,
    'team_score_importance': model_was.estimators_[0].feature_importances_,
    'opp_score_importance': model_was.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

In [50]:
# show top features by team_score importance
importances_was.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.799454,0.009387
27,team_vs_opp_homeaway_median_score,0.035831,0.000771
14,advanced_ts_pct,0.014024,0.008597
13,advanced_3par,0.011262,0.002027
4,team_3p_pct,0.010084,0.008752
26,team_vs_opp_median_score,0.008856,0.000939
8,opponent_ft_pct,0.008847,0.013089
16,advanced_ast_pct,0.007293,0.014156
11,advanced_pace,0.006306,0.036963
20,offensive_four_factors_orb_pct,0.006156,0.009457


In [51]:
# show top features by opp_score importance
importances_was.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.002573,0.702318
11,advanced_pace,0.006306,0.036963
21,defensive_four_factors_efg_pct,0.002473,0.036178
24,defensive_four_factors_ft_per_fga,0.004755,0.017554
6,opponent_fg_pct,0.001687,0.014602
16,advanced_ast_pct,0.007293,0.014156
8,opponent_ft_pct,0.008847,0.013089
17,advanced_stl_pct,0.00201,0.012181
12,advanced_ftr,0.00239,0.01065
20,offensive_four_factors_orb_pct,0.006156,0.009457


In [52]:
from google.colab import files
# save feature importances
importances_was.to_csv("was_importances.csv", index=False)
files.download("was_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [53]:
# Create LVA data subset
df_lva = df[df['team'] == 'LVA'].copy()

# define features and target
features_lva = df_lva.drop(columns=['team_score', 'opp_score', 'team', 'opp'], errors='ignore')
targets_lva = df_lva[['team_score', 'opp_score']]

# Train model
X_train, X_test, y_train, y_test = train_test_split(features_lva, targets_lva, test_size=0.2, random_state=42)

model_lva = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_lva.fit(X_train, y_train)

importances_lva = pd.DataFrame({
    'feature': features_lva.columns,
    'team_score_importance': model_lva.estimators_[0].feature_importances_,
    'opp_score_importance': model_lva.estimators_[1].feature_importances_,
}).sort_values(by='team_score_importance', ascending=False)

In [54]:
# show top features by team_score importance
importances_lva.sort_values(by='team_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
9,advanced_ortg,0.512396,0.005935
14,advanced_ts_pct,0.264178,0.002102
27,team_vs_opp_homeaway_median_score,0.093995,0.005149
26,team_vs_opp_median_score,0.02705,0.004712
15,advanced_trb_pct,0.005291,0.003077
19,offensive_four_factors_tov_pct,0.004751,0.00143
18,advanced_blk_pct,0.00474,0.000985
75,offensive_four_factors_orb_pct_by_team_day_of_...,0.004535,0.000342
13,advanced_3par,0.004345,0.003158
11,advanced_pace,0.003985,0.006585


In [55]:
# show top features by opp_score importance
importances_lva.sort_values(by='opp_score_importance', ascending=False).head(50)

Unnamed: 0,feature,team_score_importance,opp_score_importance
10,advanced_drtg,0.001431,0.822671
21,defensive_four_factors_efg_pct,0.002213,0.02857
6,opponent_fg_pct,0.000839,0.018685
60,month_by_team_day_of_week,0.001448,0.015393
8,opponent_ft_pct,0.002651,0.009053
4,team_3p_pct,0.000886,0.008292
0,month,0.000808,0.006878
11,advanced_pace,0.003985,0.006585
22,defensive_four_factors_tov_pct,0.001722,0.006335
9,advanced_ortg,0.512396,0.005935


In [56]:
from google.colab import files
# save feature importances
importances_lva.to_csv("lva_importances.csv", index=False)
files.download("lva_importances.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [57]:
# list of team abbreviations in alphabetical order
teams = ['ATL', 'CHI', 'CON', 'DAL', 'IND', 'LAS', 'LVA', 'MIN', 'NYL', 'PHO', 'SEA', 'WAS']

# load and tag each importance CSV
dfs = []
for team in teams:
    df_team = pd.read_csv(f"{team.lower()}_importances.csv")
    df_team['team'] = team
    dfs.append(df_team)

# vertically merge into one long DataFrame
all_importances = pd.concat(dfs, ignore_index=True)

In [58]:
print(all_importances)

                                              feature  team_score_importance  \
0                                     advanced_ts_pct               0.450337   
1                                       advanced_ortg               0.251102   
2                      offensive_four_factors_tov_pct               0.053520   
3                                    advanced_ast_pct               0.018078   
4                      offensive_four_factors_orb_pct               0.017525   
...                                               ...                    ...   
1363                       win_loss_by_team_home_away               0.000000   
1364          team_day_median_score_by_team_home_away               0.000000   
1365                 win_loss_by_team_travel_distance               0.000000   
1366    team_day_median_score_by_team_travel_distance               0.000000   
1367  team_day_median_allowed_by_team_travel_distance               0.000000   

      opp_score_importance team  
0    

In [60]:
# filter where either importance meets threshold
threshold = 0.005
filtered = all_importances[
    (all_importances['team_score_importance'] >= threshold) |
    (all_importances['opp_score_importance'] >= threshold)
]

# separate counts for each type
team_score_counts = (
    filtered[filtered['team_score_importance'] >= threshold]
    .groupby('team')['feature']
    .nunique()
    .reset_index(name='team_score_count')
)

opp_score_counts = (
    filtered[filtered['opp_score_importance'] >= threshold]
    .groupby('team')['feature']
    .nunique()
    .reset_index(name='opp_score_count')
)

# merge the two counts together
team_feature_counts = pd.merge(team_score_counts, opp_score_counts, on='team', how='outer').fillna(0)
team_feature_counts[['team_score_count', 'opp_score_count']] = team_feature_counts[['team_score_count', 'opp_score_count']].astype(int)

display(team_feature_counts)

Unnamed: 0,team,team_score_count,opp_score_count
0,ATL,18,20
1,CHI,16,14
2,CON,16,19
3,DAL,13,18
4,IND,14,18
5,LAS,12,17
6,LVA,5,11
7,MIN,19,23
8,NYL,10,13
9,PHO,19,18


In [62]:
# count how many teams each feature appears in at or above threshold
team_score_feature_counts = (
    filtered[filtered['team_score_importance'] >= threshold]
    .groupby('feature')['team']
    .nunique()
    .reset_index(name='team_score_team_count')
)

opp_score_feature_counts = (
    filtered[filtered['opp_score_importance'] >= threshold]
    .groupby('feature')['team']
    .nunique()
    .reset_index(name='opp_score_team_count')
)

# merge both counts into one view
feature_team_counts = pd.merge(team_score_feature_counts, opp_score_feature_counts, on='feature', how='outer').fillna(0)
feature_team_counts[['team_score_team_count', 'opp_score_team_count']] = feature_team_counts[['team_score_team_count', 'opp_score_team_count']].astype(int)

display(feature_team_counts.sort_values(by='team_score_team_count', ascending=False))

Unnamed: 0,feature,team_score_team_count,opp_score_team_count
7,advanced_ortg,12,9
14,advanced_ts_pct,12,5
38,team_vs_opp_homeaway_median_score,12,7
29,offensive_four_factors_tov_pct,10,7
34,team_3p_pct,10,7
10,advanced_pace,9,10
13,advanced_trb_pct,9,6
40,team_vs_opp_median_score,9,4
4,advanced_drtg,6,12
27,offensive_four_factors_orb_pct,6,7


In [72]:
feature_team_counts.shape[0]

44

In [89]:
# get max importance values for your final features
feature_importance_summary = (
    all_importances[all_importances['feature'].isin(feature_team_counts['feature'])]
    .groupby('feature')[['team_score_importance', 'opp_score_importance']]
    .max()
    .reset_index()
    .sort_values(by='team_score_importance', ascending=False)
)

display(feature_importance_summary)

Unnamed: 0,feature,team_score_importance,opp_score_importance
7,advanced_ortg,0.799454,0.040604
14,advanced_ts_pct,0.470444,0.026025
38,team_vs_opp_homeaway_median_score,0.274837,0.015486
10,advanced_pace,0.093623,0.054595
37,team_ft_pct,0.062982,0.009705
43,win_loss,0.062581,0.023821
29,offensive_four_factors_tov_pct,0.05352,0.031164
34,team_3p_pct,0.04212,0.011997
40,team_vs_opp_median_score,0.040816,0.011577
21,defensive_four_factors_ft_per_fga,0.034729,0.050494


In [90]:
features_to_check = [
    'advanced_ts_pct_by_team_day_of_week',
    'team_day_median_score',
    'day_by_team_day_of_week',
    'defensive_four_factors_ft_per_fga_by_team_day_of_week',
    'advanced_3par_by_team_day_of_week',
    'advanced_drtg_by_team_day_of_week',
    'advanced_ortg_by_team_travel_distance',
    'team_vs_opp_homeaway_median_score_by_team_day_of_week',
    'advanced_ortg_by_team_day_of_week',
    'offensive_four_factors_orb_pct_by_team_day_of_week',
    'offensive_four_factors_ft_per_fga_by_team_day_of_week',
    'day_of_week',
    'offensive_four_factors_tov_pct_by_team_day_of_week',
    'advanced_pace_by_team_day_of_week',
    'team_vs_opp_median_score_by_team_day_of_week',
    'month_by_team_day_of_week',
    'travel_distance_by_team_day_of_week'
]

In [92]:
# Compute max importance for the features you're checking
importance_check = (
    all_importances[all_importances['feature'].isin(features_to_check)]
    .groupby('feature')[['team_score_importance', 'opp_score_importance']]
    .max()
    .reset_index()
)

# Isolate features to drop (both importances < 0.01)
drop_list = importance_check[
    (importance_check['team_score_importance'] < 0.01) &
    (importance_check['opp_score_importance'] < 0.01)
]['feature'].tolist()

# Drop them from feature_team_counts
feature_team_counts = feature_team_counts[
    ~feature_team_counts['feature'].isin(drop_list)
].reset_index(drop=True)

In [93]:
feature_team_counts['feature'].tolist()

['advanced_3par',
 'advanced_ast_pct',
 'advanced_blk_pct',
 'advanced_drtg',
 'advanced_ftr',
 'advanced_ortg',
 'advanced_pace',
 'advanced_pace_by_team_day_of_week',
 'advanced_stl_pct',
 'advanced_trb_pct',
 'advanced_ts_pct',
 'day',
 'defensive_four_factors_drb_pct',
 'defensive_four_factors_efg_pct',
 'defensive_four_factors_ft_per_fga',
 'defensive_four_factors_tov_pct',
 'month',
 'month_by_team_day_of_week',
 'offensive_four_factors_orb_pct',
 'offensive_four_factors_tov_pct',
 'opponent_3p_pct',
 'opponent_fg_pct',
 'opponent_ft_pct',
 'team_3p_pct',
 'team_day_median_allowed',
 'team_ft_pct',
 'team_vs_opp_homeaway_median_score',
 'team_vs_opp_median_score',
 'win_loss']

In [97]:
# final feature list
model_features = feature_team_counts['feature'].tolist()

# columns needed for modeling
columns_to_keep = model_features + ['team', 'opp', 'team_score', 'opp_score']

# create the modeling-ready DataFrame
df_final = df[columns_to_keep].copy()

In [98]:
df_final.shape

(480, 33)

In [100]:
from google.colab import files

# re-save with final feature set
df_final.to_csv("wnba_model_ready_v2.csv", index=False)
files.download("wnba_model_ready_v2.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [102]:
from google.colab import files

# re-save with final feature set
df.to_csv("wnba_model_ready_v1.csv", index=False)
files.download("wnba_model_ready_v1.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>