NBA Analysis
Analyzes advanced statistics, creates a regression model to predict salary, and offers preliminary assessment of under-and-over-valued players based on actual salaries relative to the predicted salaries 

In [1]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

In [2]:
# load csv files
# Note: Full script (available in Github repo) scrapes (i) advanced stats and (ii) salary data from Basketballreference.com 
# and puts them into two separate csv files.
# This summary analysis uploads the two csv files as the starting point
df_stats = pd.read_csv('nba_advanced_stats_2025.csv')
df_salary = pd.read_csv('nba_player_salaries_2025.csv')

In [3]:
# Explore data
print(df_stats.columns)
print(df_stats.shape)
print(df_stats.sample(1))
print(df_salary.columns)
print(df_salary.shape)
print(df_salary.sample(1))

Index(['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
       'Awards'],
      dtype='object')
(736, 29)
        Rk            Player   Age Team Pos     G   GS     MP   PER    TS%  \
478  378.0  Brandon Williams  25.0  DAL  PG  33.0  3.0  487.0  18.3  0.634   

     ...  USG%  OWS  DWS   WS  WS/48  OBPM  DBPM  BPM  VORP  Awards  
478  ...  22.6  0.9  0.4  1.3  0.129   1.0  -0.1  1.0   0.4     NaN  

[1 rows x 29 columns]
Index(['Player', 'Salary_2025_26'], dtype='object')
(470, 2)
             Player  Salary_2025_26
466  Mamadi Diakite        464050.0


In [None]:
# Clean the salary data sets 

# check for missing values
print(df_salary.isnull().sum())

# clean up salary data
df_salary_clean = df_salary.dropna(subset=['Salary_2025_26'])
print(df_salary_clean.isnull().sum())
print(f"Total players with salary: {len(df_salary_clean)}")

# check for duplicates in Player names in both datasets
print(f"Duplicate players in salary data: {df_salary_clean['Player'].duplicated().sum()}")

# remove duplicates by taking the first entry



Player             0
Salary_2025_26    22
dtype: int64
Player            0
Salary_2025_26    0
dtype: int64
Total players with salary: 448
Duplicate players in salary data: 9


In [None]:
# clean the stats data set

# check for duplicates in Player names
print(f"Duplicate players in stats data: {df_stats['Player'].duplicated().sum()}")

# identify players who appear multiple times
player_counts = df_stats['Player'].value_counts()
print(player_counts)
duplicated_players = player_counts[player_counts > 1].index
print(duplicated_players)

# if players have multile entries, keep the instance with 2TM or 3TM, otherwise keep the first occurrence
def keep_combined_stats(group):
    # check if there's a 2TM or 3TM entry
    combined_entry = group[group['Team'].isin(['2TM', '3TM'])]
    if len(combined_entry) > 0:
        return combined_entry
    else:
        # if no combined entry, return the original entry 
        return group 
    
# apply the function to each group of duplicated players
df_stats_cleaned = df_stats.groupby('Player', group_keys=False).apply(keep_combined_stats)

# reset index after groupby operation
df_stats_cleaned = df_stats_cleaned.reset_index(drop=True)

Player
Dennis Schröder     4
Colin Castleton     4
David Roddy         4
MarJon Beauchamp    4
Cody Martin         3
                   ..
James Wiseman       1
Alondes Williams    1
Zyon Pullin         1
Jahlil Okafor       1
Jayson Tatum        1
Name: count, Length: 570, dtype: int64
Index(['Dennis Schröder', 'Colin Castleton', 'David Roddy', 'MarJon Beauchamp',
       'Cody Martin', 'Kyle Kuzma', 'Zach Collins', 'Jake LaRavia',
       'Jonas Valančiūnas', 'Bogdan Bogdanović', 'Luka Dončić', 'Terance Mann',
       'Javonte Green', 'Nick Richards', 'Jusuf Nurkić', 'Caris LeVert',
       'Jimmy Butler', 'Kevin Porter Jr.', 'D'Angelo Russell', 'De'Aaron Fox',
       'De'Andre Hunter', 'Caleb Martin', 'Kyle Anderson', 'Ben Simmons',
       'Davion Mitchell', 'Andrew Wiggins', 'Quentin Grimes', 'Anthony Davis',
       'KJ Martin', 'Georges Niang', 'Thomas Bryant', 'Bruce Brown',
       'Kevin Huerter', 'Dorian Finney-Smith', 'Tre Jones', 'Kelly Olynyk',
       'Khris Middleton', 'Shake M

  df_stats_cleaned = df_stats.groupby('Player', group_keys=False).apply(keep_combined_stats)


In [13]:
# check for duplicated players again
print(f"Duplicate players in cleaned stats data: {df_stats_cleaned['Player'].duplicated().sum()}")


Duplicate players in cleaned stats data: 0


In [14]:
# Merge datasets on Player name
df_merged = pd.merge(df_stats_cleaned, df_salary_clean[['Player', 'Salary_2025_26']], on='Player', how='inner')
print(f"Merged dataset shape: {df_merged.shape}")

Merged dataset shape: (408, 30)


In [None]:
# scatter plot of salary vs. different features 

In [None]:
# correlation matrix 

In [None]:
# create a simple regression of salary vs. PER

In [None]:
# create a multi-linear regression of salary vs. multiple variables

In [None]:
# identify most overrated and underrated players based on the regression 