## 0) Import Libraries

In [2]:
import pandas as pd
import numpy as np

In [None]:
from nba_api.stats.endpoints import leaguegamelog
from nba_api.stats.static import players

In [None]:
from sklearn.linear_model import Ridge

## 0.5) Helper functions

In [92]:
import ast
from collections import Counter

def fix_player_lists(player_str):
    """Turn a stringified Python literal (e.g. ``"[1, 2, 3]"``) back into an object.

    Parameters
    ----------
    player_str : object
        A string holding a Python literal, or any other value.

    Returns
    -------
    object
        ``ast.literal_eval(player_str)`` when a ``str`` is given; otherwise the
        argument itself, untouched.
    """
    if not isinstance(player_str, str):
        # Nothing to parse: hand non-string values straight back.
        return player_str
    return ast.literal_eval(player_str)

def replace_replacement_players(player_list, replacement_ids=None):
    """Swap every replacement-level player id in a lineup for the sentinel 9999999.

    Parameters
    ----------
    player_list : list or object
        Lineup of player ids. Anything that is not a ``list`` is returned as is.
    replacement_ids : container, optional
        Ids to be replaced. When omitted, the notebook-level name
        ``replacement_players`` is used, so existing one-argument calls
        (e.g. ``Series.apply(replace_replacement_players)``) keep working.

    Returns
    -------
    list or object
        A new list with replacement ids substituted, or ``player_list`` itself
        when it is not a list.

    Raises
    ------
    NameError
        If ``replacement_ids`` is omitted, ``player_list`` is a list, and
        ``replacement_players`` has not been defined yet.
    """
    if not isinstance(player_list, list):
        return player_list
    if replacement_ids is None:
        # Fall back to the global that the stint-processing loop assigns.
        replacement_ids = replacement_players
    return [9999999 if p in replacement_ids else p for p in player_list]

In [93]:
# Team abbreviations in code order: the 1-based position in this tuple,
# zero-padded to two digits, is the team's code ('ATL' -> '01', ..., 'WAS' -> '30').
_TEAM_ABBREVIATIONS = (
    'ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW',
    'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK',
    'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
)

nba_team_ids = {
    abbr: f"{code:02d}"
    for code, abbr in enumerate(_TEAM_ABBREVIATIONS, start=1)
}

In [94]:
def tag_team(player_list, team_abbr):
    """Append a team's two-digit code to every player id in a lineup.

    The code is looked up in ``nba_team_ids``. Each id is joined with the code
    as text and converted back to ``int`` (id 1626179 with code '04' becomes
    162617904).

    Parameters
    ----------
    player_list : iterable
        Player ids of one lineup.
    team_abbr : str
        Team abbreviation used as the key into ``nba_team_ids``.

    Returns
    -------
    list of int
        The team-tagged ids, in lineup order.
    """
    suffix = nba_team_ids.get(team_abbr)
    tagged = []
    for pid in player_list:
        tagged.append(int(f"{pid}{suffix}"))
    return tagged

In [95]:
all_players_api = players.get_players()

# id -> full name, built once so every lookup is a dict hit instead of a scan
# over all player records. setdefault keeps the first record seen for an id,
# which matches the "first match wins" result of a linear search.
_player_name_by_id = {}
for _player_record in all_players_api:
    _player_name_by_id.setdefault(_player_record['id'], _player_record['full_name'])


def get_name(i):
    """Return the full name for player id ``i``, or ``None`` when the id is unknown."""
    return _player_name_by_id.get(i)

## 1) Process Raw PBP -> Cleaned Stints

In [None]:
# Read the pre-processed play-by-play stints and discard the index column that
# was written out with the CSV.
raw_stints = pd.read_csv('../datasets/processed_pbp_stints.csv')
raw_stints = raw_stints.drop(columns = 'Unnamed: 0')

# Lineup columns that come back as strings are parsed into Python objects.
for lineup_col in ('home_players', 'away_players'):
    raw_stints[lineup_col] = raw_stints[lineup_col].apply(fix_player_lists)

In [None]:
# 'nid' is the stint's game id as text with a leading '00'; it is the key that
# is later compared against the game logs' GAME_ID values.
raw_stints['nid'] = [f"00{game_id}" for game_id in raw_stints['id']]

In [98]:
def _away_rows_for_season(season):
    """Fetch one season's league game log and keep only rows whose MATCHUP contains '@'.

    Parameters
    ----------
    season : str
        Season label passed to ``LeagueGameLog`` (e.g. ``'2023-24'``).

    Returns
    -------
    pandas.DataFrame
        The filtered game log with a fresh 0..n-1 index.
    """
    game_log = leaguegamelog.LeagueGameLog(season = season).get_data_frames()[0]
    has_at_sign = game_log['MATCHUP'].str.contains('@')
    return game_log[has_at_sign].reset_index(drop = True)


all23 = _away_rows_for_season('2023-24')
all22 = _away_rows_for_season('2022-23')
all21 = _away_rows_for_season('2021-22')
all20 = _away_rows_for_season('2020-21')
all19 = _away_rows_for_season('2019-20')
all18 = _away_rows_for_season('2018-19')
all17 = _away_rows_for_season('2017-18')
all16 = _away_rows_for_season('2016-17')
all15 = _away_rows_for_season('2015-16')
all14 = _away_rows_for_season('2014-15')

In [99]:
# One set of GAME_ID values per season, used for membership tests on the stints.
(
    unique_vals23, unique_vals22, unique_vals21, unique_vals20, unique_vals19,
    unique_vals18, unique_vals17, unique_vals16, unique_vals15, unique_vals14,
) = (
    set(game_log['GAME_ID'])
    for game_log in (
        all23, all22, all21, all20, all19,
        all18, all17, all16, all15, all14,
    )
)

In [None]:
def _matchup_table(game_log):
    """Return GAME_ID / MATCHUP plus 'Away' and 'Home' columns split out of MATCHUP on '@'."""
    matchup = game_log[['GAME_ID', 'MATCHUP']].copy()
    matchup[['Away', 'Home']] = matchup['MATCHUP'].str.split(r'\s*@\s*', expand=True)
    return matchup


def _stints_for_season(game_ids, matchup):
    """Select the stints whose 'nid' is in ``game_ids`` and attach that game's Home/Away teams."""
    in_season = raw_stints['nid'].isin(game_ids)
    season_stints = raw_stints[in_season].copy().reset_index(drop = True)
    merged = season_stints.merge(
        matchup[['GAME_ID', 'Home', 'Away']], left_on='nid', right_on = 'GAME_ID', how='left'
    )
    # GAME_ID duplicates 'nid' after the join, so it is dropped again.
    return merged.drop(columns = 'GAME_ID')


matchup23 = _matchup_table(all23)
matchup22 = _matchup_table(all22)
matchup21 = _matchup_table(all21)
matchup20 = _matchup_table(all20)
matchup19 = _matchup_table(all19)
matchup18 = _matchup_table(all18)
matchup17 = _matchup_table(all17)
matchup16 = _matchup_table(all16)
matchup15 = _matchup_table(all15)
matchup14 = _matchup_table(all14)

season23_stints = _stints_for_season(unique_vals23, matchup23)
season22_stints = _stints_for_season(unique_vals22, matchup22)
season21_stints = _stints_for_season(unique_vals21, matchup21)
season20_stints = _stints_for_season(unique_vals20, matchup20)
season19_stints = _stints_for_season(unique_vals19, matchup19)
season18_stints = _stints_for_season(unique_vals18, matchup18)
season17_stints = _stints_for_season(unique_vals17, matchup17)
season16_stints = _stints_for_season(unique_vals16, matchup16)
season15_stints = _stints_for_season(unique_vals15, matchup15)
season14_stints = _stints_for_season(unique_vals14, matchup14)

In [101]:
# Per-season stint frames, newest season first. Later cells rely on this order
# when they index parallel lists by position.
all_stints = [
    season23_stints,
    season22_stints,
    season21_stints,
    season20_stints,
    season19_stints,
    season18_stints,
    season17_stints,
    season16_stints,
    season15_stints,
    season14_stints,
]

In [None]:
# Number the seasons 1..10, starting at 1 for the season23 frame and ending at
# 10 for the season14 frame.
season_frames = (
    season23_stints, season22_stints, season21_stints, season20_stints, season19_stints,
    season18_stints, season17_stints, season16_stints, season15_stints, season14_stints,
)
for season_rank, season_frame in enumerate(season_frames, start=1):
    season_frame['season'] = season_rank

In [None]:
# Suffix every player id with the code of the team he played for in that game.
# NOTE: this overwrites the lineup columns in place, so running the cell a
# second time would append the team code to ids that are already tagged.
for stints_df in all_stints:

    # Zip the lineup column with its team column instead of a row-wise
    # DataFrame.apply: no per-row Series is built, and the result is a plain
    # list of lists regardless of how apply would shape list-valued results.
    stints_df['home_players'] = [
        tag_team(lineup, team)
        for lineup, team in zip(stints_df['home_players'], stints_df['Home'])
    ]
    stints_df['away_players'] = [
        tag_team(lineup, team)
        for lineup, team in zip(stints_df['away_players'], stints_df['Away'])
    ]

0
1
2
3
4
5
6
7
8
9


In [104]:
processed_stints = []

for stints_df in all_stints:

    kept_cols = ['home_players', 'away_players', 'points_scored', 'offensive_possessions']

    # Home-team perspective: the stint columns are used as they are.
    offense = stints_df.loc[:, kept_cols].copy()

    # Away-team perspective: exchange home/away lineups, points scored/allowed
    # and offensive/defensive possessions. The swap is done by relabelling the
    # columns, so it does not depend on the order in which column assignments
    # are applied.
    defense = stints_df.rename(columns={
        'home_players': 'away_players',
        'away_players': 'home_players',
        'points_scored': 'points_allowed',
        'points_allowed': 'points_scored',
        'offensive_possessions': 'defensive_possessions',
        'defensive_possessions': 'offensive_possessions',
    }).loc[:, kept_cols].copy()

    # Rows without any offensive possession cannot yield points-per-possession.
    offense = offense[offense['offensive_possessions'] != 0]
    defense = defense[defense['offensive_possessions'] != 0]

    # Stack both perspectives; in every row the "home" side is now the offense.
    output_df = pd.concat([offense, defense], ignore_index=True).rename(columns={
        "home_players": "offensive_players",  # Home team is always the offensive team now
        "away_players": "defensive_players",  # Away team is always the defensive team now
        "points_scored": "offensive_points",  # Renaming to clarify that this is offensive output
        "offensive_possessions": "possessions"
    })

    # No-op for lineups that are already lists; parses them if they are strings.
    output_df["offensive_players"] = output_df["offensive_players"].apply(fix_player_lists)
    output_df["defensive_players"] = output_df["defensive_players"].apply(fix_player_lists)

    # Flatten player appearances (offensive and defensive) into a single list
    all_players = list(output_df["offensive_players"].explode()) + list(output_df["defensive_players"].explode())

    # Count the lineup appearances of each player across the rows of output_df
    player_counts = Counter(all_players)

    # Replacement-level players: fewer than 225 appearances in output_df.
    # This must stay a notebook-level name because replace_replacement_players
    # reads it as a global.
    replacement_players = {player for player, count in player_counts.items() if count < 225}

    # Replace those players with the shared sentinel id 9999999
    output_df["offensive_players"] = output_df["offensive_players"].apply(replace_replacement_players)
    output_df["defensive_players"] = output_df["defensive_players"].apply(replace_replacement_players)

    # Drop rows where EITHER lineup consists solely of replacement-level players
    offense_has_regular = output_df["offensive_players"].apply(lambda x: set(x) != {9999999})
    defense_has_regular = output_df["defensive_players"].apply(lambda x: set(x) != {9999999})

    # .copy() so the later column assignment ('ppp') writes to an independent
    # frame rather than to a filtered slice (avoids SettingWithCopyWarning).
    output_df = output_df[offense_has_regular & defense_has_regular].copy()

    processed_stints.append(output_df)

processed_stints2 = []

# Points per possession, centred on the season's possession-weighted average.
# The frames are modified in place; processed_stints2 holds the same objects.
for stints_df in processed_stints:

    points = stints_df['offensive_points']
    poss = stints_df['possessions']

    league_avg_ppp = points.sum() / poss.sum()
    stints_df['ppp'] = points / poss - league_avg_ppp

    processed_stints2.append(stints_df)

def _has_five_int_ids(lineup):
    """True when exactly five entries of ``lineup`` are Python ``int`` objects."""
    return sum(isinstance(p, int) for p in lineup) == 5


processed_stints3 = []

for stints_df in processed_stints2:

    # Keep rows where both lineups pass the five-int check.
    offense_ok = stints_df['offensive_players'].apply(_has_five_int_ids)
    defense_ok = stints_df['defensive_players'].apply(_has_five_int_ids)
    new_df = stints_df[offense_ok & defense_ok].copy()

    # Discard rows with centred ppp above 5 or negative points.
    plausible = (new_df['ppp'] <= 5) & (new_df['offensive_points'] >= 0)
    processed_stints3.append(new_df[plausible].copy().reset_index(drop=True))

# Parallel per-season lists: sorted unique player ids, their count, and an
# id -> position lookup.
up_list, num_player_list, index_list = [], [], []

for stints_df in processed_stints3:
    offense_ids = set(stints_df['offensive_players'].explode())
    defense_ids = set(stints_df['defensive_players'].explode())
    unique_players = sorted(offense_ids | defense_ids)

    up_list.append(unique_players)
    num_player_list.append(len(unique_players))
    index_list.append({player: idx for idx, player in enumerate(unique_players)})

In [105]:
#setup ready, create RAPM

num_stints = []
X_rapms = []
Y_rapms = []
weights  = []

for i, stints in enumerate(processed_stints3):
    num = len(stints)
    n_players = num_player_list[i]
    player_col = index_list[i]

    # Columns [0, n_players) are offensive slots, [n_players, 2*n_players) defensive.
    X = np.zeros((num, n_players * 2))
    Y = stints['ppp'].values
    possessions = stints['possessions'].values

    # None when the season has no pooled replacement entry. It is only used
    # when a lineup actually contains 9999999, and then the id is in the index.
    replacement_idx = player_col.get(9999999)

    # Rows are addressed by position (enumerate), not by the frame's index
    # labels, so the matrix is filled correctly whatever index `stints` has.
    lineups = zip(stints["offensive_players"], stints["defensive_players"])
    for row_pos, (offensive_players, defensive_players) in enumerate(lineups):

        # NOTE(review): replacement entries are written as -0.02 (offense) and
        # +0.02 (defense), opposite in sign to the +1 / -1 used for regular
        # players - confirm this is intentional.
        for player in offensive_players:
            if player == 9999999:
                X[row_pos, replacement_idx] = -0.02
            else:
                X[row_pos, player_col[player]] = 1

        for player in defensive_players:
            if player == 9999999:
                X[row_pos, replacement_idx + n_players] = 0.02
            else:
                X[row_pos, player_col[player] + n_players] = -1

    print(f"Matrix shape: {X.shape}, Expected: ({num}, {num_player_list[i] * 2})")

    # Record this season's row count (the list used to be appended to itself).
    num_stints.append(num)
    X_rapms.append(X)
    Y_rapms.append(Y)
    weights.append(possessions)


Matrix shape: (68321, 950), Expected: (68321, 950)
Matrix shape: (69456, 950), Expected: (69456, 950)
Matrix shape: (69692, 992), Expected: (69692, 992)
Matrix shape: (60434, 970), Expected: (60434, 970)
Matrix shape: (63078, 912), Expected: (63078, 912)
Matrix shape: (71076, 956), Expected: (71076, 956)
Matrix shape: (69434, 904), Expected: (69434, 904)
Matrix shape: (69606, 872), Expected: (69606, 872)
Matrix shape: (70159, 884), Expected: (70159, 884)
Matrix shape: (69864, 924), Expected: (69864, 924)


In [106]:
# Reverse lookup: two-digit team code -> team abbreviation.
team_ids_to_abbr = dict((code, abbr) for abbr, code in nba_team_ids.items())

In [107]:
ridge_objects = []
rapm_values_list = []
orapm_values = []
drapm_values = []
rapm_results_list = []
rapm_results_list1 = []

for i in range(len(processed_stints3)):

    n_players = num_player_list[i]
    tagged_ids = up_list[i]

    # Possession-weighted ridge regression of centred ppp on the lineup matrix.
    ridge = Ridge(alpha = 3000)
    ridge.fit(X_rapms[i], Y_rapms[i], sample_weight = weights[i])

    ridge_objects.append(ridge)
    rapm_values_list.append(ridge.coef_)

    # First half of the coefficients: offensive slots; second half: defensive.
    orapm = ridge.coef_[:n_players]
    drapm = ridge.coef_[n_players:]

    # Strip the two-digit team suffix to recover the plain player id.
    player_ids = [tagged // 100 for tagged in tagged_ids]

    rapm_results = pd.DataFrame({
        "Player": player_ids,
        "Player_Name": [get_name(pid) for pid in player_ids],
        "Team": [team_ids_to_abbr.get(str(tagged)[-2:]) for tagged in tagged_ids],
        "Offensive_RAPM": orapm *100,
        "Defensive_RAPM": drapm * 100,
        "Total_RAPM": (orapm + drapm) *100  # Combined impact
    })

    # Same values keyed by the team-tagged id instead of the plain id.
    rapm_results1 = pd.DataFrame({
        "Player": list(tagged_ids),
        "Offensive_RAPM": orapm *100,
        "Defensive_RAPM": drapm * 100,
        "Total_RAPM": (orapm + drapm) *100  # Combined impact
    })

    rapm_results_list1.append(rapm_results1)
    rapm_results_list.append(rapm_results)

    # Store this season's coefficient vectors (the lists used to be appended
    # to themselves instead).
    orapm_values.append(orapm)
    drapm_values.append(drapm)

In [108]:
# Position 0 in rapm_results_list is labelled 23, position 1 is 22, and so on.
for seasons_back, rapm in enumerate(rapm_results_list):
    rapm['Season'] = 23 - seasons_back

In [109]:
# Stack every season, then drop the pooled replacement row: its sentinel id
# 9999999 becomes 99999 after the team suffix is stripped with // 100.
all_rapm_results = pd.concat(rapm_results_list)
is_real_player = all_rapm_results['Player'] != 99999
all_rapm_results = all_rapm_results[is_real_player].copy().reset_index(drop = True)

In [111]:
from collections import defaultdict
from itertools import combinations

def compute_player_possessions(df, min_possessions=150):
    """Total the possessions played by each offensive player and each offensive pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``offensive_players`` column (iterables of player ids) and a
        ``possessions`` column.
    min_possessions : int, default 150
        Pairs with fewer shared possessions than this are left out of the
        second result.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(player_table, pair_table)``: the first has columns ``Player`` and
        ``Total Possessions``; the second has ``Player Pair`` (sorted 2-tuples)
        and ``Total Possessions Together``.
    """
    solo_totals = defaultdict(int)
    duo_totals = defaultdict(int)

    # Walk the two columns side by side, one stint at a time.
    for lineup, n_poss in zip(df["offensive_players"], df["possessions"]):
        for member in lineup:
            solo_totals[member] += n_poss

        # Sorting makes (a, b) and (b, a) land on the same key.
        for duo in combinations(lineup, 2):
            duo_totals[tuple(sorted(duo))] += n_poss

    player_table = pd.DataFrame(
        list(solo_totals.items()), columns=["Player", "Total Possessions"]
    )

    frequent_duos = [
        (duo, total) for duo, total in duo_totals.items() if total >= min_possessions
    ]
    pair_table = pd.DataFrame(
        frequent_duos, columns=["Player Pair", "Total Possessions Together"]
    )

    return player_table, pair_table

In [112]:
player_possessions_list = []
teammate_possessions_list = []

for stints_df in processed_stints3:

    solo_table, duo_table = compute_player_possessions(stints_df)

    # Leave out the pooled replacement id, both on its own and inside pairs.
    solo_table = solo_table[solo_table['Player'] != 9999999]

    is_real_pair = duo_table['Player Pair'].apply(lambda pair: 9999999 not in pair)
    duo_table = duo_table[is_real_pair].reset_index(drop = True)

    player_possessions_list.append(solo_table)
    teammate_possessions_list.append(duo_table)

In [113]:
from itertools import combinations

def optimized_add_synergy_columns_to_X(X, stints, teammate_possessions, player_index):
    """Append two pair-synergy columns per qualifying player pair to a design matrix.

    For the pair at position ``k`` of ``teammate_possessions['Player Pair']``,
    column ``X.shape[1] + 2*k`` is set to 1 in rows where both players are in
    the offensive lineup, and column ``X.shape[1] + 2*k + 1`` is set to -1 in
    rows where both are in the defensive lineup.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix with one row per stint. Its column count is read as
        ``2 * num_players``.
    stints : pandas.DataFrame
        Needs ``offensive_players`` and ``defensive_players`` columns, with
        rows in the same order as the rows of ``X``.
    teammate_possessions : pandas.DataFrame
        Needs a ``Player Pair`` column of sorted 2-tuples.
    player_index : dict
        Accepted so existing calls keep working; the body does not use it.

    Returns
    -------
    tuple
        ``(X_expanded, synergy_index)``: a ``float16`` array made of ``X``
        followed by the synergy columns, and a dict mapping each pair to its
        position ``k``.
    """
    num_players = int(X.shape[1] / 2)
    num_stints = len(stints)
    qualifying_pairs = teammate_possessions['Player Pair'].tolist()
    synergy_index = {pair: idx for idx, pair in enumerate(qualifying_pairs)}

    # Expand the matrix with zeros for synergy terms
    num_synergy_terms = len(qualifying_pairs)
    X_expanded = np.hstack((
        X.astype(np.float16),
        np.zeros((num_stints, num_synergy_terms * 2), dtype=np.float16),
    ))
    first_synergy_col = 2 * num_players

    # Rows are addressed by position, not by the frame's index labels, so a
    # stints frame with a non-default index still fills the correct rows.
    lineups = zip(stints['offensive_players'], stints['defensive_players'])
    for row_pos, (offense, defense) in enumerate(lineups):
        # Offensive synergy
        for p1, p2 in combinations(offense, 2):
            slot = synergy_index.get(tuple(sorted((p1, p2))))
            if slot is not None:
                X_expanded[row_pos, first_synergy_col + slot * 2] = 1

        # Defensive synergy
        for p1, p2 in combinations(defense, 2):
            slot = synergy_index.get(tuple(sorted((p1, p2))))
            if slot is not None:
                X_expanded[row_pos, first_synergy_col + slot * 2 + 1] = -1

    return X_expanded, synergy_index


In [None]:
X_rapms_expanded = []
synergy_indexes = []
teammate_possessions_list2 = []

# Iterate over however many seasons were processed rather than a fixed 10, in
# line with the other per-season loops in this notebook.
for i in range(len(processed_stints3)):

    # Only pairs with more than 1200 shared possessions get synergy columns.
    season_pairs = teammate_possessions_list[i]
    pos = season_pairs[season_pairs['Total Possessions Together'] > 1200].copy().reset_index(drop = True)

    x, s = optimized_add_synergy_columns_to_X(X_rapms[i], processed_stints3[i], pos, index_list[i])

    X_rapms_expanded.append(x)
    synergy_indexes.append(s)
    teammate_possessions_list2.append(pos)

0
1
2
3
4
5
6
7
8
9


In [None]:
rapm_values_list_syn = []
orapm_values_syn      = []
drapm_values_syn      = []
rapm_results_list_syn = []
synergy_results_list  = []

for i in range(len(processed_stints3)):
    print(f"Season {i}")

    baseline_coefs = rapm_values_list[i]
    n_players      = num_player_list[i]

    # The first 2*n_players columns are the individual player slots and the
    # rest are the pair-synergy columns. Plain slices are used; the expanded
    # matrix is already float16, so re-casting would only copy it.
    X_ind   = X_rapms_expanded[i][:, :2*n_players]
    X_int   = X_rapms_expanded[i][:, 2*n_players:]

    # Residual of the target after the baseline player coefficients. The
    # baseline intercept is not subtracted here.
    y       = Y_rapms[i]
    y_resid = y - X_ind.dot(baseline_coefs[:2*n_players])

    # Second-stage ridge: explain the residual with the synergy columns only.
    ridge      = Ridge(alpha=1200)
    ridge.fit(X_int, y_resid, sample_weight=weights[i])
    synergy_coefs = ridge.coef_

    orapm = baseline_coefs[:n_players]
    drapm = baseline_coefs[n_players:2*n_players]

    rapm_values_list_syn.append(baseline_coefs)
    orapm_values_syn.append(orapm)
    drapm_values_syn.append(drapm)

    rapm_results = pd.DataFrame({
        "Player":          [player // 100 for player in up_list[i]],
        "Player_Name":     [get_name(player // 100) for player in up_list[i]],
        "Team":            [team_ids_to_abbr.get(str(pid)[-2:]) for pid in up_list[i]],
        "Offensive_RAPM":  orapm * 100,
        "Defensive_RAPM":  drapm * 100,
        "Total_RAPM":      (orapm + drapm) * 100,
    })
    rapm_results_list_syn.append(rapm_results)

    # Synergy columns alternate offense, defense per pair, hence the strides.
    pair_ids = teammate_possessions_list2[i]['Player Pair'].tolist()
    synergy_df = pd.DataFrame({
        "Player Pair":           pair_ids,
        "Possessions Played":    teammate_possessions_list2[i]['Total Possessions Together'].tolist(),
        "Hybrid Offensive RAPM": synergy_coefs[::2] * 100,
        "Hybrid Defensive RAPM": synergy_coefs[1::2] * 100,
    })
    synergy_df["Total Hybrid RAPM"] = (
        synergy_df["Hybrid Offensive RAPM"]
      + synergy_df["Hybrid Defensive RAPM"]
    )
    synergy_df["Season"] = 23 - i
    synergy_results_list.append(synergy_df)


Season 0
Season 1
Season 2
Season 3
Season 4
Season 5
Season 6
Season 7
Season 8
Season 9


In [None]:
def _pair_names(pair):
    """Look up both players of a team-tagged id pair by their plain ids (tag // 100)."""
    first_id = pair[0] // 100
    second_id = pair[1] // 100
    return (get_name(first_id), get_name(second_id))


# All seasons' synergy tables in one frame, plus readable names for each pair.
global_synergies = pd.concat(synergy_results_list, ignore_index = True)
global_synergies['Pair Names'] = global_synergies['Player Pair'].apply(_pair_names)

Unnamed: 0,Player Pair,Possessions Played,Hybrid Offensive RAPM,Hybrid Defensive RAPM,Total Hybrid RAPM,Season,Pair Names
0,"(162617904, 164170604)",1311,1.853961,0.426349,2.280310,23,"(Terry Rozier, Brandon Miller)"
1,"(162617904, 162897004)",1716,0.834587,-1.202082,-0.367495,23,"(Terry Rozier, Miles Bridges)"
2,"(20233004, 164170604)",1226,1.221301,1.797957,3.019258,23,"(Gordon Hayward, Brandon Miller)"
3,"(162897004, 164170604)",3702,1.596205,-0.431401,1.164804,23,"(Miles Bridges, Brandon Miller)"
4,"(162902304, 164170604)",1617,0.982174,0.081880,1.064054,23,"(P.J. Washington, Brandon Miller)"
...,...,...,...,...,...,...,...
7233,"(20193619, 20234719)",1928,1.171834,-0.577879,0.593956,14,"(Tyreke Evans, Quincy Pondexter)"
7234,"(20234719, 20307619)",1529,2.296789,0.262868,2.559657,14,"(Quincy Pondexter, Anthony Davis)"
7235,"(20156919, 20196719)",1764,-1.997506,0.608548,-1.388958,14,"(Eric Gordon, Dante Cunningham)"
7236,"(20268321, 20307921)",1400,0.924980,0.298580,1.223560,14,"(Enes Freedom, Dion Waiters)"


In [None]:
# Write both result tables to disk without their index column.
for result_frame, csv_path in (
    (all_rapm_results, '../datasets/all_rapm_results.csv'),
    (global_synergies, '../datasets/all_synergy_results.csv'),
):
    result_frame.to_csv(csv_path, index = False)

## 2) Clean Raw X to X Vector

In [None]:
# Reload the two result tables written by the previous section. After the CSV
# round trip the 'Player Pair' values are strings rather than tuples.
all_rapm_results = pd.read_csv('../datasets/all_rapm_results.csv')
global_synergies = pd.read_csv('../datasets/all_synergy_results.csv')

In [None]:
# Load the four raw per-player input tables (box, shooting, advanced, hustle).
box_data, shooting_data, advanced_data, hustle_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/all_player_stats.csv',
        '../datasets/all_shooting_data.csv',
        '../datasets/all_advanced_stats.csv',
        '../datasets/all_hustle_stats.csv',
    )
)

In [156]:
hustle_data_target_cols = [
    'CONTESTED_SHOTS_2PT',
    'CONTESTED_SHOTS_3PT',
    'DEFLECTIONS',
    'CHARGES_DRAWN',
    'SCREEN_ASSISTS',
    'OFF_LOOSE_BALLS_RECOVERED',
    'DEF_LOOSE_BALLS_RECOVERED',
    'OFF_BOXOUTS',
    'DEF_BOXOUTS',
]

# Divide each target column by the 'G' column, row by row (presumably season
# totals / games played - TODO confirm).
# NOTE: the columns are overwritten in place, so running this cell twice
# divides by 'G' twice.
hustle_per_game = hustle_data[hustle_data_target_cols].div(hustle_data['G'], axis=0)
hustle_data[hustle_data_target_cols] = hustle_per_game

In [157]:
# Every table is keyed by player and season; the remaining entries are the
# columns kept from each source.
_ID_COLS = ['PLAYER_ID', 'SEASON']

# TODO: need to divide FGA by games played
shooting_cols = _ID_COLS + ['GROUP_VALUE', 'FGA', 'FG_PCT', 'PCT_AST_FGM']

advanced_data_cols = _ID_COLS + [
    'E_AST_RATIO',
    'E_OREB_PCT',
    'E_DREB_PCT',
    'E_TOV_PCT',
    'E_USG_PCT',
]

hustle_data_cols = _ID_COLS + [
    'CONTESTED_SHOTS_2PT',
    'CONTESTED_SHOTS_3PT',
    'DEFLECTIONS',
    'CHARGES_DRAWN',
    'SCREEN_ASSISTS',
    'OFF_LOOSE_BALLS_RECOVERED',
    'DEF_LOOSE_BALLS_RECOVERED',
    'OFF_BOXOUTS',
    'DEF_BOXOUTS',
]

In [158]:
# Working copies: the full box table, and the selected columns of the others.
clean_box_data = box_data.copy()
clean_shooting_data = shooting_data.loc[:, shooting_cols].copy()
clean_advanced_data = advanced_data.loc[:, advanced_data_cols].copy()
# Hustle data only has data from 2015-16 onward
clean_hustle_data = hustle_data.loc[:, hustle_data_cols].copy()

In [159]:
# Shooting data has one row per (player, season, GROUP_VALUE). Pivot it so each
# (player, season) becomes one row with a column per statistic and GROUP_VALUE.
clean_shooting_data_flat = pd.pivot_table(
    clean_shooting_data,
    values=['FGA', 'FG_PCT', 'PCT_AST_FGM'],
    index=['PLAYER_ID', 'SEASON'],
    columns='GROUP_VALUE',
    aggfunc='first',
)

In [160]:
def _flat_column_name(col):
    """Join the non-empty levels of a tuple column label with '_'; other labels become ``str(col)``."""
    if not isinstance(col, tuple):
        return str(col)
    parts = [str(c) for c in col if c not in ("", None)]
    return "_".join(parts)


# Move PLAYER_ID / SEASON out of the index, then collapse the two-level column
# labels into single strings.
clean_shooting_data_flat = clean_shooting_data_flat.reset_index()
clean_shooting_data_flat.columns = [
    _flat_column_name(col) for col in clean_shooting_data_flat.columns
]

In [None]:
def _without_2014_15(frame):
    """Return a re-indexed copy of ``frame`` without the rows whose SEASON is '2014-15'."""
    return frame[frame['SEASON'] != '2014-15'].copy().reset_index(drop = True)


# Have to take out 2014-15 from all datasets
clean_box_data = _without_2014_15(clean_box_data)
clean_advanced_data = _without_2014_15(clean_advanced_data)
clean_shooting_data_flat = _without_2014_15(clean_shooting_data_flat)

In [171]:
# RAPM columns keyed like the other feature tables (PLAYER_ID, SEASON).
# NOTE(review): all_rapm_results is built from team-tagged ids, so a player who
# appears under two teams in a season has two rows for the same
# (PLAYER_ID, SEASON); a left merge on those keys would duplicate his rows -
# confirm whether that is intended.
player_rapms = (
    all_rapm_results[['Player', 'Offensive_RAPM', 'Defensive_RAPM', 'Season']]
    .copy()
    .rename(columns = {'Player' : 'PLAYER_ID', 'Season' : 'SEASON'})
)

# Two-digit season start (e.g. 23) -> season label (e.g. '2023-24').
player_rapms['SEASON'] = player_rapms['SEASON'].apply(lambda x: f"20{x}-{str(x+1)[-2:]}")

In [None]:
from functools import reduce

dfs_to_merge = [clean_box_data, clean_advanced_data, clean_shooting_data_flat, player_rapms]

# Left-join every table onto the box data, one after another, on player + season.
X = dfs_to_merge[0]
for right_table in dfs_to_merge[1:]:
    X = pd.merge(X, right_table, on=['PLAYER_ID', 'SEASON'], how='left')

In [178]:
# Values left missing by the left joins are filled with 0.
X = X.fillna(0)

In [None]:
# Persist the merged per-player feature table without its index column.
X.to_csv('../datasets/player_x.csv', index = False)

## 3) Clean into X and Y to feed into MLP

In [181]:
# Season 14 is left out here, just as '2014-15' is removed from the feature tables.
keep_season = global_synergies['Season'] != 14
global_synergies_clean = global_synergies[keep_season].copy().reset_index(drop = True)

In [182]:
import ast

# 'Player Pair' is a string after the CSV reload but still a tuple when the
# in-memory frame is used. fix_player_lists parses strings and passes other
# values through, so this cell works in both situations (ast.literal_eval
# alone raises on values that are already tuples).
global_synergies_clean['Player Pair'] = global_synergies_clean['Player Pair'].apply(fix_player_lists)

In [183]:
# Strip the two-digit team suffix from each member of the pair to get plain ids.
global_synergies_clean['Player_ID_A'] = [
    pair[0] // 100 for pair in global_synergies_clean['Player Pair']
]
global_synergies_clean['Player_ID_B'] = [
    pair[1] // 100 for pair in global_synergies_clean['Player Pair']
]

In [184]:
def _suffixed_copy(frame, suffix):
    """Copy ``frame`` and append ``suffix`` to every column name except PLAYER_ID and SEASON."""
    return frame.copy().rename(
        columns=lambda col: col if col in ['PLAYER_ID', 'SEASON'] else f"{col}{suffix}"
    )


# Two copies of the feature table, one per member of a pair.
X_A = _suffixed_copy(X, "_A")
X_B = _suffixed_copy(X, "_B")

In [185]:
# Display the cleaned synergy table, which now carries Player_ID_A / Player_ID_B.
global_synergies_clean

Unnamed: 0,Player Pair,Possessions Played,Hybrid Offensive RAPM,Hybrid Defensive RAPM,Total Hybrid RAPM,Season,Pair Names,Player_ID_A,Player_ID_B
0,"(162617904, 164170604)",1311,1.853961,0.426349,2.280310,23,"('Terry Rozier', 'Brandon Miller')",1626179,1641706
1,"(162617904, 162897004)",1716,0.834587,-1.202082,-0.367495,23,"('Terry Rozier', 'Miles Bridges')",1626179,1628970
2,"(20233004, 164170604)",1226,1.221301,1.797957,3.019258,23,"('Gordon Hayward', 'Brandon Miller')",202330,1641706
3,"(162897004, 164170604)",3702,1.596205,-0.431401,1.164804,23,"('Miles Bridges', 'Brandon Miller')",1628970,1641706
4,"(162902304, 164170604)",1617,0.982174,0.081880,1.064054,23,"('P.J. Washington', 'Brandon Miller')",1629023,1641706
...,...,...,...,...,...,...,...,...,...
6503,"(274404, 20268904)",1914,0.067790,0.091150,0.158940,15,"('Al Jefferson', 'Kemba Walker')",2744,202689
6504,"(274404, 10110704)",1277,0.706093,0.344169,1.050262,15,"('Al Jefferson', 'Marvin Williams')",2744,101107
6505,"(274404, 20158704)",1649,0.715432,-1.379336,-0.663904,15,"('Al Jefferson', 'Nicolas Batum')",2744,201587
6506,"(274404, 20239104)",1374,-1.232804,-0.755244,-1.988048,15,"('Al Jefferson', 'Jeremy Lin')",2744,202391


In [186]:
# Pair identifiers, season, sample size and the regression target.
pair_target_cols = [
    'Player_ID_A',
    'Player_ID_B',
    'Season',
    'Possessions Played',
    'Hybrid Offensive RAPM',
]
Y = global_synergies_clean.loc[:, pair_target_cols].copy()

In [187]:
# Two-digit season start (e.g. 23) -> season label (e.g. '2023-24'), the
# format of the SEASON key in the feature tables.
Y["Season"] = [f"20{x}-{str(x+1)[-2:]}" for x in Y["Season"]]

In [188]:
# Attach player A's features first, then player B's, matching each on
# (player id, season). Left joins keep every pair even without a feature row.
with_player_a = Y.merge(
    X_A, left_on=['Player_ID_A', 'Season'], right_on=['PLAYER_ID', 'SEASON'], how='left'
)
pair_df = with_player_a.merge(
    X_B, left_on=['Player_ID_B', 'Season'], right_on=['PLAYER_ID', 'SEASON'], how='left'
)

In [189]:
# Every column whose name ends in '_A' or '_B'.
# NOTE(review): the identifier columns 'Player_ID_A' / 'Player_ID_B' also match
# this suffix test and so end up among the features - confirm that is wanted.
feature_cols = [col for col in pair_df.columns if col.endswith(('_A', '_B'))]

In [190]:
# Model inputs (suffixed columns) and outputs (target plus possessions played).
X_pair = pair_df.loc[:, feature_cols].copy()
Y_pair = pair_df.loc[:, ['Hybrid Offensive RAPM', 'Possessions Played']]

In [None]:
# Write the pair-level inputs and targets to disk without their index column.
for pair_frame, csv_path in (
    (X_pair, '../datasets/X_pair.csv'),
    (Y_pair, '../datasets/Y_pair.csv'),
):
    pair_frame.to_csv(csv_path, index = False)