In [347]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score

In [348]:
contracts = pd.read_csv('nba_contracts.csv', encoding='utf-8')

# Formatting the strings with $ signs and columns to be floats
contracts['Value'] = contracts['Value'].replace({'\$': '', ',': ''}, regex=True).astype(float)
contracts['AAV'] = contracts['AAV'].replace({'\$': '', ',': ''}, regex=True).astype(float)

contracts['Player'] = contracts['Player'].str.replace('.', '', regex=False) # For players with C.J., etc.

# I want the three seasons of data before the contract signing
contracts['start_minus1'] = contracts['Start'] - 1
contracts['start_minus2'] = contracts['Start'] - 2

# These players missed one full season out of the three directly before they signed a contract,
# so I replaced the missing seasons with their next-most-recent season
contracts.loc[contracts['Player'] == 'Kawhi Leonard', 'start_minus2'] = 2021
contracts.loc[contracts['Player'] == 'Kevin Durant', 'start_minus2'] = 2019
contracts.loc[contracts['Player'] == 'Zion Williamson', 'start_minus1'] = 2021
contracts.loc[contracts['Player'] == 'Zion Williamson', 'start_minus2'] = 2020
contracts.loc[contracts['Player'] == 'Miles Bridges', 'start_minus1'] = 2022
contracts.loc[contracts['Player'] == 'Miles Bridges', 'start_minus2'] = 2021

contracts.rename(columns={'Age\nAt Signing': 'Age'}, inplace=True)
contracts.drop(columns=['RK', 'Team\nCurrently With'], inplace=True)

# Formatting issue between Spotrac and Basketball-Reference
contracts['Player'] = contracts['Player'].str.replace('Nicolas Claxton', 'Nic Claxton', regex=False)

In [349]:
# Adding the league-wide salary cap for each season in the data
salary_caps = {2021: 112414000, 2022: 123655000, 2023: 136021000, 2024: 140588000}

contracts = pd.merge(contracts, pd.DataFrame(list(salary_caps.items()), columns=['Start', 'salary_cap_at_sign']), on='Start', how='left')

In [350]:
# Merging the per-game and advanced stats
per_game_stats = pd.read_csv('nba_per_game_stats.csv', encoding='utf-8')
advanced_stats = pd.read_csv('nba_advanced_stats.csv', encoding='utf-8')

per_game_stats = per_game_stats[['name_display', 'Year', 'age', 'team_name_abbr', 'games', 'pts_per_g', 'ast_per_g', 'trb_per_g', 'efg_pct', 'awards']]
advanced_stats = advanced_stats[['name_display', 'Year', 'per', 'ws', 'vorp', 'usg_pct']]

# Some players have multiple rows due to switching teams, but this will only keep the first one (overall stats across teams)
per_game_stats = per_game_stats.drop_duplicates(subset=['name_display', 'Year'])
advanced_stats = advanced_stats.drop_duplicates(subset=['name_display', 'Year'])

stats = pd.merge(per_game_stats, advanced_stats, on=['name_display', 'Year'], how='left')
stats.rename(columns={'name_display': 'Player'}, inplace=True)

# Here I'm encoding whether the player made an All-Star team or won any awards, based on the formatting of the 'awards' BR column
stats['AS'] = np.where(stats['awards'].str.contains('AS', na=False), 1, 0)
stats['MVP'] = np.where((stats['awards'].str.contains(r'MVP-1(?!\d)', regex=True, na=False)), 1, 0)
stats['MVP-3'] = np.where(stats['awards'].str.contains('MVP-2', na=False, regex=False), 1, 
                          np.where(stats['awards'].str.contains('MVP-3', na=False, regex=False), 1, 0))
stats['MVP-5'] = np.where(stats['awards'].str.contains('MVP-4', na=False, regex=False), 1, 
                          np.where(stats['awards'].str.contains('MVP-5', na=False, regex=False), 1, 0))
stats['DPOY'] = np.where(stats['awards'].str.contains(r'DPOY-1(?!\d)', regex=True, na=False), 1, 0)
stats['NBA1'] = np.where(stats['awards'].str.contains('NBA1', na=False, regex=False), 1, 0)
stats['NBA2'] = np.where(stats['awards'].str.contains('NBA2', na=False, regex=False), 1, 0)
stats['NBA3'] = np.where(stats['awards'].str.contains('NBA3', na=False, regex=False), 1, 0)
stats['MIP'] = np.where(stats['awards'].str.contains(r'MIP-1(?!\d)', na=False, regex=False), 1, 0)
stats['CPOY'] = np.where(stats['awards'].str.contains(r'CPOY-1(?!\d)', regex=True, na=False), 1, 0)
stats.drop(columns=['awards'], inplace=True)

champions = {2019: 'TOR', 2020: 'LAL', 2021: 'MIL', 2022: 'GSW', 2023: 'DEN', 2024: 'BOS'}

# Create a new column that checks if the (Year, Team) pair is a champion
stats['Champ'] = np.where(stats.apply(lambda row: champions.get(row['Year']) == row['team_name_abbr'], axis=1), 1, 0)

# Could add draft position as well given more time, though unlikely to influence veteran contracts

In [351]:
# Since there are some inconsistencies in player spelling (Jokic vs Jokić, etc.), a normal merge doesn't work.
# This function will find the matches as long as the strings are relatively similar
def fuzzy_merge(df_1, df_2, key1, left_key, right_key, threshold):
    matches = {}
    for item in df_1[key1]:
        # Find the closest match in the second dataframe's key column
        match = process.extractOne(item, df_2[key1].tolist())
        # If the similarity score is above the threshold, store the mapping
        if match[1] >= threshold:
            matches[item] = match[0]
    
    # Create a new column in df_1 with the matches from df_2
    df_1['merge_key'] = df_1[key1].map(matches)
    
    df_merged = pd.merge(df_1, df_2, left_on=['merge_key',left_key], right_on=[key1,right_key], how='left')
    df_merged.drop('merge_key', axis=1, inplace=True)
    
    return df_merged


In [352]:
# Merging the contracts and stats dataframes together using the above function
df_initial = fuzzy_merge(contracts, stats, 'Player', 'Start', 'Year', threshold=80)
df_initial.rename(columns={'Player_y': 'Player'}, inplace=True)
df_initial.drop(columns=['Player_x'], inplace=True)

df = pd.merge(df_initial, stats, left_on=['Player', 'start_minus1'], right_on=['Player', 'Year'], how='left')
df = pd.merge(df, stats, left_on=['Player', 'start_minus2'], right_on=['Player', 'Year'], how='left')

df.drop(columns=['team_name_abbr_x', 'team_name_abbr_y', 'team_name_abbr', 'End', 'age_y', 'age', 'Year_y', 'Year'], inplace=True)
df.dropna(inplace=True)

In [None]:
# Creating an XGBoost model to predict contract total value from previous statistics
# I originally tested the model with AAV and contract length, but eventually switched the
# final target value to be total contract value (AAV * contract length)
def create_pred_model(df, y_var):
    y = df[y_var]

    X = df.drop(columns=['AAV', 'Yrs', 'Value']) # target variable options
    X = X.drop(columns=['Start', 'age_x', 'Year_x', 'start_minus1', 'start_minus2', 'Player']) # other non-relevant columns

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

    categorical_features = ['Pos'] 
    numerical_features = X.columns.difference(categorical_features).tolist()
        
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='drop')

    pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
        ])
    
    param_grid = {
        'model__n_estimators': [200, 300],
        'model__max_depth': [3, 5, 7],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__subsample': [0.8, 1.0],
        'model__colsample_bytree': [0.8, 1.0],
        'model__min_child_weight': [1, 3, 5]
    }
    
    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=1,
        n_jobs=-1
    )
        
    # Fit model
    grid_search.fit(X_train, y_train)
        
    # Get best model
    best_model = grid_search.best_estimator_
    fitted_preprocessor = best_model.named_steps['preprocessor']
        
    # Evaluate model
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
        
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"MAE: {mae:.2f}")
    print(f"R²: {r2:.4f}")

    return best_model, fitted_preprocessor, X_test, y_pred, y_test

# Running the function to create the full model
best_value_model, preprocessor_value, X_test_value, y_pred_value, y_test_value = create_pred_model(df, 'Value')

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__min_child_weight': 5, 'model__n_estimators': 200, 'model__subsample': 0.8}
MAE: 25654911.03
R²: 0.7322


In [None]:
# Creating the dataframe with 2025 free agents and their 2024-25 statistics
free_agents_raw = pd.read_csv('nba_free_agents.csv', encoding='utf-8')
champ_odds = pd.read_csv('champ_odds.csv', encoding='utf-8')

all_stars = ['LeBron James', 'Kyrie Irving', 'James Harden'] # Players named to the 2025 All-Star teams
second_team = ['LeBron James'] # Educated guess about All-NBA 1st through 3rd team players

free_agents = free_agents_raw.rename(columns={'Player (201)': 'Player'})
free_agents['Year'] = 2025
free_agents['Year_minus1'] = 2024
free_agents['Year_minus2'] = 2023

# Fixing some differences in player names across Spotrac and Basketball Reference
free_agents['Player'] = free_agents['Player'].str.replace("Nah'Shon", 'Bones', regex=False)
free_agents['Player'] = free_agents['Player'].str.replace("AJ", 'A.J.', regex=False)
free_agents['Player'] = free_agents['Player'].str.replace("Bruce Brown Jr.", 'Bruce Brown', regex=False)
free_agents['Player'] = free_agents['Player'].str.replace("Boston Jr", 'Boston Jr.', regex=False)
free_agents['Player'] = free_agents['Player'].str.replace("Jae’Sean", "Jae'Sean", regex=False)
free_agents['Player'] = free_agents['Player'].str.replace("Cameron Thomas", 'Cam Thomas', regex=False)
free_agents['Player'] = free_agents['Player'].str.replace("Cancar", 'Čančar', regex=False)
free_agents['Player'] = free_agents['Player'].str.replace("Saric", 'Šarić', regex=False)

# Adding the statistics from BR
free_agents = pd.merge(free_agents, stats, on=['Player', 'Year'], how='left')
free_agents = pd.merge(free_agents, stats, left_on=['Player', 'Year_minus1'], right_on=['Player', 'Year'], how='left')
free_agents = pd.merge(free_agents, stats, left_on=['Player', 'Year_minus2'], right_on=['Player', 'Year'], how='left')

free_agents.drop(columns=['team_name_abbr_x', 'team_name_abbr_y', 'Year_x', 'Year_y', 'Year', 'Champ_x'], inplace=True)
free_agents.rename(columns={'Champ': 'Champ_z'}, inplace=True)

# Adding championship percentages for this season based on sportsbooks' 2024-25 championship odds
# This is a proxy for whether the player's team will win the championship
free_agents = pd.merge(free_agents, champ_odds, left_on='Prev Team', right_on='team_name_abbr', how='left')

free_agents.rename(columns={'Champ': 'Champ_x'}, inplace=True)
free_agents.rename(columns={'Champ_z': 'Champ'}, inplace=True)
free_agents['Champ_x'] = free_agents['Champ_x'].fillna(0)

free_agents['AS'] = np.where(free_agents['Player'].isin(all_stars), 1, 0)
free_agents['NBA2'] = np.where(free_agents['Player'].isin(second_team), 1, 0)
free_agents['salary_cap_at_sign'] = 154647000

free_agents.drop(columns=['YOE', 'Prev AAV', 'Type','Prev Team', 'team_name_abbr_y'], inplace=True)

In [387]:
# Making the predictions for the 2025 free agents and printing out the top 10 in predicted total value
X_pred_value = preprocessor_value.transform(free_agents)

value_predictions = best_value_model.named_steps['model'].predict(X_pred_value)
free_agents['predicted_value'] = value_predictions

result_df = free_agents.dropna() # Filtering out players with less than 3 years of experience
result_df = result_df.sort_values('predicted_value', ascending=False)

result_df[['Player', 'predicted_value']].head(10).round(-5)

Unnamed: 0,Player,predicted_value
2,Kyrie Irving,139800000.0
0,LeBron James,129100000.0
3,James Harden,120700000.0
6,John Collins,119200000.0
42,Jonathan Kuminga,108500000.0
11,Myles Turner,108500000.0
1,Fred VanVleet,108300000.0
88,Cam Thomas,104700000.0
13,D'Angelo Russell,100200000.0
5,Julius Randle,94000000.0
