In [None]:
import pandas as pd
from numpy import array
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Season labels used to iterate over every scraped season.
# Each label has the form "<start year>-<last two digits of the following year>",
# e.g. "2015-16"; the final label is "2023-24".
years = [f"{start}-{str(start + 1)[2:]}" for start in range(2015, 2024)]

# Season ids: the digit "2" followed by a four-digit year, for 2015 through 2024.
seas_ids = ['2' + str(start) for start in range(2015, 2025)]

years

In [None]:
# The following code is necessary for API scraping and was found at the following link: https://github.com/basketballrelativity/synergy/blob/master/synergy_exploration.ipynb

import pandas as pd
import itertools

import time

from py_ball import league_dash

# HTTP headers passed to py_ball with every stats.nba.com request.
# The User-Agent is assembled from adjacent string literals; the resulting
# value is a single space-separated browser identification string.
HEADERS = {
    'Connection': 'keep-alive',
    'Host': 'stats.nba.com',
    'Origin': 'http://stats.nba.com',
    'Upgrade-Insecure-Requests': '1',
    'Referer': 'stats.nba.com',
    'x-nba-stats-origin': 'stats',
    'x-nba-stats-token': 'true',
    'Accept-Language': 'en-US,en;q=0.9',
    'X-NewRelic-ID': 'VQECWF5UChAHUlNTBwgBVw==',
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6)'
        ' AppleWebKit/537.36 (KHTML, like Gecko)'
        ' Chrome/81.0.4044.129 Safari/537.36'
    ),
}

# Silence pandas SettingWithCopyWarning for the rest of the notebook.
pd.options.mode.chained_assignment = None

In [None]:
# Scraping lineup data for every season and every team

league_id = '00'
group_quantity = '5'
per_mode = 'Totals'
plus_minus = 'N'
rank = 'N'
pace_adjust = 'N'
measure_type = 'Advanced'
period = '0'
vs_conference = ''
last_n_games = '0'
location = ''
outcome = ''

for year in years:

    team_df = pd.read_csv(Read in csv for teams)
    team_ids = team_df['GROUP_SET'].tolist()
    team_net_ratings = team_df['NET_RATING'].tolist()

    season = year
    vs_division = ''
    game_segment = ''
    month = '0'
    season_type = 'Regular Season'
    game_scope = ''
    player_experience = ''
    player_position = ''
    starters_bench = ''

    lineup_data = league_dash.LeagueDash(headers=HEADERS,
                                endpoint='leaguedashlineups',
                                league_id = league_id,
                                group_quantity = group_quantity,
                                per_mode = per_mode,
                                plus_minus = plus_minus,
                                rank = rank,
                                pace_adjust = pace_adjust,
                                measure_type = measure_type,
                                period = period,
                                vs_conference = vs_conference,
                                last_n_games = last_n_games,
                                #team_id = team_id,
                                location = location,
                                outcome = outcome,
                                #date_from = date_from,
                                #date_to = date_to,
                                #opp_team_id = opp_team_id,
                                season = season,
                                vs_division = vs_division,
                                game_segment = game_segment,
                                month = month,
                                season_type = season_type,
                                game_scope = game_scope,
                                player_experience = player_experience,
                                player_position = player_position,
                                starters_bench = starters_bench)
    time.sleep(10)

    lineup_df = pd.DataFrame(lineup_data.data['Lineups'])
    rows_to_drop = []
    # print(len(lineup_df))

    for i in range(len(lineup_df)):
        minutes_together = int(lineup_df.loc[i, 'MIN'])
        possessions_together = lineup_df.loc[i, 'POSS']
        net_rating = lineup_df.loc[i, 'NET_RATING']
        team_identification = lineup_df.loc[i, 'TEAM_ID']
        if (possessions_together / 600) >= 1:
            lineup_df.loc[i, 'R'] = net_rating
        elif (possessions_together / 600) < 1:
            index = team_ids.index(team_identification)
            team_net_rating = team_net_ratings[index]
            lineup_df.loc[i, 'R'] = ((possessions_together / 600) * net_rating) + (1 - (possessions_together / 600)) * team_net_rating
        if minutes_together < 48:
            rows_to_drop.append(i)

    lineup_df = lineup_df.drop(rows_to_drop, axis = 0)
    print(f"{season} lineup information scraped.")

    lineup_df.to_csv(Save csv in each season's folder, index = False)

In [None]:
# Iterating through lineup data frames for each season and counting number of players in each cluster

for i, year in enumerate(years):
    df = pd.read_csv(Read in lineup csv for season)

    all_players_df = pd.read_csv(Read in season's csv with player archetypes)

    players_ids = all_players_df['PLAYER_ID'].tolist()
    players_pie = all_players_df['PIE'].tolist()

    for j in range(len(players_ids)):
        players_ids[j] = str(players_ids[j])
    players_clusters = all_players_df['Cluster'].tolist()

    for w in range(len(df)):
        group_ids = str(df.loc[w, 'GROUP_ID'])
        cleaned_string = group_ids.strip('-')
        integer_strings = cleaned_string.split('-')

        cluster_0 = 0
        cluster_1 = 0
        cluster_2 = 0
        cluster_3 = 0
        cluster_4 = 0
        cluster_5 = 0
        cluster_6 = 0
        cluster_7 = 0
        cluster_8 = 0

        for j, strings in enumerate(integer_strings):
            player_index = players_ids.index(strings)
            player_cluster = players_clusters[player_index]
            player_pie = players_pie[player_index]
            if player_cluster == 0:
                cluster_0 += 1
            if player_cluster == 1:
                cluster_1 += 1
            if player_cluster == 2:
                cluster_2 += 1
            if player_cluster == 3:
                cluster_3 += 1
            if player_cluster == 4:
                cluster_4 += 1
            if player_cluster == 5:
                cluster_5 += 1
            if player_cluster == 6:
                cluster_6 += 1
            if player_cluster == 7:
                cluster_7 += 1
            if player_cluster == 8:
                cluster_8 += 1
        
        df.loc[w, "Cluster_0"] = cluster_0
        df.loc[w, "Cluster_1"] = cluster_1
        df.loc[w, "Cluster_2"] = cluster_2
        df.loc[w, "Cluster_3"] = cluster_3
        df.loc[w, "Cluster_4"] = cluster_4
        df.loc[w, "Cluster_5"] = cluster_5
        df.loc[w, "Cluster_6"] = cluster_6
        df.loc[w, "Cluster_7"] = cluster_7
        df.loc[w, "Cluster_8"] = cluster_8
    df.to_csv(Save csv in season's folder, index = False)

In [None]:
# Concatenating all lineup information into one data frame

lineup_dfs = []

for year in years:
    df = pd.read_csv(Read in csv's lineup information)
    lineup_dfs.append(df)

lineup_df = pd.concat(lineup_dfs, axis = 0)
lineup_df.reset_index(drop=True, inplace=True)

lineup_df = lineup_df.sort_values(by='R', ascending=False)

lineup_df.to_csv(Save csv, index = False)

In [None]:
# Using Random Forest Regressor to predict MSE from correlation between features and target column

df = pd.read_csv(Read in csv with all lineups)

features = ['MIN', 'POSS', 'Cluster_0', 'Cluster_1', 'Cluster_2', 'Cluster_3', 'Cluster_4', 'Cluster_5', 'Cluster_6', 'Cluster_7', 'Cluster_8']
target_column = 'R'

# Separate features and target variable
X = df[features]
y = df[target_column]

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the model
model = RandomForestRegressor(random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate mean squared error (MSE)
test_mse = mean_squared_error(y_test, y_pred)

# Output the MSE value
print(f"Test MSE: {test_mse}")

In [None]:
# Using Random Forest Regressor to predict net ratings for all lineup combinations
df = pd.read_csv(Read in csv with all lineups)

# Separate features and target variable
X = df[features]
y = df[target_column]

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the model
model = RandomForestRegressor(random_state=42)

# Define the grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Set up the grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameters: ", grid_search.best_params_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Asking model for most and least efficient lineups

best_model.fit(X_train, y_train)

# Keeping minutes and possessions constant
min = 100000000
poss = 100000000

# Finding all possible five man lineup combinations
num_clusters = 9
combinations = [comb for comb in itertools.product(range(6), repeat=num_clusters) if sum(comb) == 5]
combinations_df = pd.DataFrame(combinations, columns=[f'cluster_{i}' for i in range(num_clusters)])

# Add constans to df
combinations_df['MIN'] = min
combinations_df['POSS'] = poss

# Predicting efficiency
predictions = best_model.predict(combinations_df)

# Add the predictions to the DataFrame
combinations_df['Predicted_Net_Rating'] = predictions

# Find most and least efficient lineups
most_efficient = combinations_df.loc[combinations_df['Predicted_Net_Rating'].idxmax()]
least_efficient = combinations_df.loc[combinations_df['Predicted_Net_Rating'].idxmin()]

combinations_df = combinations_df.sort_values(by='Predicted_Net_Rating', ascending=False)
combinations_df.to_csv(Save csv with lineup combinations and predicted net ratings, index = False)