NBA player stats analysis
Q: Which stats are most correlated with salaries? 
Q: Can we forecast salaries to help determine players that may be over- and/or under-valued?

In [1]:
#1: import core packages
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import time 
import os

In [2]:
#1a: load existing data if available
if os.path.exists('nba_stats_with_salary_2025.csv'):
    print("Found existing data file! Loading...")
    df_merged = pd.read_csv('nba_stats_with_salary_2025.csv')
    df_merged_qualified = df_merged[df_merged['MP'] >= 15].copy()
    
    print(f"Loaded {len(df_merged)} players")
    print(f"Qualified players (15+ MPG): {len(df_merged_qualified)}")
    print("\nYou can now skip to Cell 20 to analyze the data!")
    print("Or continue from Cell 2 to re-scrape fresh data.")
else:
    print("No existing data found. Run cells 2+ to scrape data.")

Found existing data file! Loading...
Loaded 736 players
Qualified players (15+ MPG): 473

You can now skip to Cell 20 to analyze the data!
Or continue from Cell 2 to re-scrape fresh data.


In [3]:
#2: set up the url 
url = 'https://www.basketball-reference.com/leagues/NBA_2025_per_game.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

In [4]:
#3: request the data
print(f"Fetching data from {url}...")
response = requests.get(url, headers=headers)
print(f"Status code: {response.status_code}")


Fetching data from https://www.basketball-reference.com/leagues/NBA_2025_per_game.html...
Status code: 200


In [5]:
#4: Parse the HTML
soup = BeautifulSoup(response.content, 'lxml')
table = soup.find('table', {'id': 'per_game_stats'})
print("Table Found!" if table else "Table not found")

Table Found!


In [6]:
#5: extract column headers
headers_list = [ ]
for th in table.find('thead').find_all('th'):
    headers_list.append(th.text.strip())
print(f"Columns: {headers_list}")

Columns: ['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Awards']


In [7]:
#6: extract all rows 
rows = []
for tr in table.find('tbody').find_all('tr'):
    # Skip header rows that appear in the middle
    if tr.find('th', {'scope': 'row'}) is None:
        continue
    
    row = []
    for td in tr.find_all(['th', 'td']):
        row.append(td.text.strip())
    
    if row:
        rows.append(row)

print(f"Scraped {len(rows)} player records")


Scraped 736 player records


In [8]:
#7: create data frame
df = pd.DataFrame(rows, columns=headers_list)
print(df.head())


  Rk                   Player Age Team Pos   G  GS    MP    FG   FGA  ...  \
0  1  Shai Gilgeous-Alexander  26  OKC  PG  76  76  34.2  11.3  21.8  ...   
1  2    Giannis Antetokounmpo  30  MIL  PF  67  67  34.2  11.8  19.7  ...   
2  3             Nikola Jokić  29  DEN   C  70  70  36.7  11.2  19.5  ...   
3  4              Luka Dončić  25  2TM  PG  50  50  35.4   9.2  20.5  ...   
4  4              Luka Dončić  25  DAL  PG  22  22  35.7   9.8  21.2  ...   

   ORB  DRB   TRB   AST  STL  BLK  TOV   PF   PTS  \
0  0.9  4.1   5.0   6.4  1.7  1.0  2.4  2.2  32.7   
1  2.2  9.7  11.9   6.5  0.9  1.2  3.1  2.3  30.4   
2  2.9  9.9  12.7  10.2  1.8  0.6  3.3  2.3  29.6   
3  0.8  7.4   8.2   7.7  1.8  0.4  3.6  2.5  28.2   
4  0.7  7.6   8.3   7.8  2.0  0.4  3.4  2.6  28.1   

                         Awards  
0  MVP-1,DPOY-10,CPOY-8,AS,NBA1  
1          MVP-3,DPOY-8,AS,NBA1  
2          MVP-2,CPOY-2,AS,NBA1  
3                                
4                                

[5 rows x 31 

In [9]:
#8: Convert numeric columns
numeric_cols = df.columns[5:]  # Stats start after Pos column
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

print("\nData types after conversion:")
print(df.dtypes)


Data types after conversion:
Rk         object
Player     object
Age        object
Team       object
Pos        object
G         float64
GS        float64
MP        float64
FG        float64
FGA       float64
FG%       float64
3P        float64
3PA       float64
3P%       float64
2P        float64
2PA       float64
2P%       float64
eFG%      float64
FT        float64
FTA       float64
FT%       float64
ORB       float64
DRB       float64
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV       float64
PF        float64
PTS       float64
Awards    float64
dtype: object


In [10]:
# Cell 9: Explore the data
print(f"\nTotal players: {len(df)}")
print(f"\nBasic stats:\n{df[['Player', 'Team', 'PTS', 'TRB', 'AST']].describe()}")


Total players: 736

Basic stats:
              PTS         TRB         AST
count  735.000000  735.000000  735.000000
mean     8.564354    3.513469    2.060136
std      6.443135    2.357733    1.801264
min      0.000000    0.000000    0.000000
25%      3.750000    1.800000    0.800000
50%      7.100000    3.100000    1.500000
75%     11.400000    4.550000    2.700000
max     32.700000   13.900000   11.600000


In [11]:
# Cell 10: Top 10 scorers
print("\nTop 10 Scorers (PPG):")
top_scorers = df.nlargest(10, 'PTS')[['Player', 'Team', 'PTS', 'TRB', 'AST']]
print(top_scorers)


Top 10 Scorers (PPG):
                    Player Team   PTS   TRB   AST
0  Shai Gilgeous-Alexander  OKC  32.7   5.0   6.4
1    Giannis Antetokounmpo  MIL  30.4  11.9   6.5
2             Nikola Jokić  DEN  29.6  12.7  10.2
3              Luka Dončić  2TM  28.2   8.2   7.7
5              Luka Dončić  LAL  28.2   8.1   7.5
4              Luka Dončić  DAL  28.1   8.3   7.8
6          Anthony Edwards  MIN  27.6   5.7   4.5
7             Jayson Tatum  BOS  26.8   8.7   6.0
8             Kevin Durant  PHO  26.6   6.0   4.2
9             Tyrese Maxey  PHI  26.3   3.3   6.1


In [12]:
# Cell 11: Calculate simplified PER
def calculate_simple_per(row):
    """
    Calculate a simplified Player Efficiency Rating
    
    Formula components:
    - Points, Rebounds, Assists, Steals, Blocks (positive)
    - Missed FG, Missed FT, Turnovers (negative)
    - Adjusted per minute played
    """
    try:
        # Get stats (using .get() to handle missing values)
        pts = row['PTS']
        trb = row['TRB']
        ast = row['AST']
        stl = row['STL']
        blk = row['BLK']
        
        fga = row['FGA']
        fgm = row['FG']
        fta = row['FTA']
        ftm = row['FT']
        tov = row['TOV']
        
        mp = row['MP']
        
        # Calculate missed shots
        missed_fg = fga - fgm
        missed_ft = fta - ftm
        
        # Simplified PER formula
        per = (pts + trb + ast + stl + blk - missed_fg - missed_ft - tov) / mp if mp > 0 else 0
        
        # Scale it to look more like traditional PER (multiply by a factor)
        per = per * 10
        
        return round(per, 2)
    
    except:
        return None

In [13]:
# Apply the function to each row
df['Simple_PER'] = df.apply(calculate_simple_per, axis=1)

print("Simple PER calculated!")
print(f"\nPER Stats:")
print(f"Average PER: {df['Simple_PER'].mean():.2f}")
print(f"Max PER: {df['Simple_PER'].max():.2f}")
print(f"Min PER: {df['Simple_PER'].min():.2f}")

Simple PER calculated!

PER Stats:
Average PER: 4.98
Max PER: 16.67
Min PER: -3.00


In [14]:
# Filter for players with meaningful minutes (at least 15 MPG)
df_qualified = df[df['MP'] >= 15].copy()

print("Simple PER calculated!")
print(f"\nAll players - PER Stats:")
print(f"Average PER: {df['Simple_PER'].mean():.2f}")
print(f"\nQualified players (15+ MPG) - PER Stats:")
print(f"Average PER: {df_qualified['Simple_PER'].mean():.2f}")
print(f"Total qualified players: {len(df_qualified)}")

Simple PER calculated!

All players - PER Stats:
Average PER: 4.98

Qualified players (15+ MPG) - PER Stats:
Average PER: 5.32
Total qualified players: 473


In [15]:
# Cell 12: Top 10 by PER (qualified players only)
print("\nTop 10 Players by Simple PER (min 15 MPG):")
top_per = df_qualified.nlargest(10, 'Simple_PER')[['Player', 'Team', 'MP', 'PTS', 'TRB', 'AST', 'Simple_PER']]
print(top_per)


Top 10 Players by Simple PER (min 15 MPG):
                      Player Team    MP   PTS   TRB   AST  Simple_PER
2               Nikola Jokić  DEN  36.7  29.6  12.7  10.2       11.47
1      Giannis Antetokounmpo  MIL  34.2  30.4  11.9   6.5       10.47
448              Moses Brown  DAL  18.3  11.8   7.8   0.5       10.05
0    Shai Gilgeous-Alexander  OKC  34.2  32.7   5.0   6.4        9.65
17             Anthony Davis  LAL  34.3  25.7  11.9   3.4        9.36
16             Anthony Davis  2TM  33.5  24.7  11.6   3.5        9.19
20           Zion Williamson  NOP  28.6  24.6   7.2   5.3        9.16
24         Victor Wembanyama  SAS  33.2  24.3  11.0   3.7        9.13
62          Domantas Sabonis  SAC  34.7  19.1  13.9   6.0        8.90
106            Mark Williams  CHO  26.6  15.3  10.2   2.5        8.83


In [16]:
# Cell 13: Save to CSV with PER
df.to_csv('nba_player_stats_2025.csv', index=False)
print("\nData with Simple PER saved to nba_player_stats_2025.csv")


Data with Simple PER saved to nba_player_stats_2025.csv


In [17]:
# scrape player salaries for upcoming season
# Cell 14: Scrape player salaries
salary_url = 'https://www.basketball-reference.com/contracts/players.html'

print(f"Fetching salary data from {salary_url}...")
salary_response = requests.get(salary_url, headers=headers)
print(f"Status code: {salary_response.status_code}")

Fetching salary data from https://www.basketball-reference.com/contracts/players.html...
Status code: 200


In [18]:
# Cell 15: Parse salary table
salary_soup = BeautifulSoup(salary_response.content, 'lxml')
salary_table = salary_soup.find('table', {'id': 'player-contracts'})
print("Salary table found!" if salary_table else "Salary table not found")

Salary table found!


In [19]:
# Cell 16: Extract salary data with correct column names
correct_headers = ['Rk', 'Player', 'Tm', '2025-26', '2026-27', '2027-28', '2028-29', '2029-30', '2030-31', 'Guaranteed']

salary_rows = []
for tr in salary_table.find('tbody').find_all('tr'):
    row = []
    
    # Get all cells in the row
    cells = tr.find_all(['th', 'td'])
    
    for cell in cells:
        row.append(cell.text.strip())
    
    # Only keep rows that have 10 columns (matching our headers)
    if len(row) == 10:
        salary_rows.append(row)

print(f"Scraped {len(salary_rows)} salary records with 10 columns")
if salary_rows:
    print(f"Sample first row: {salary_rows[0]}")



Scraped 371 salary records with 10 columns
Sample first row: ['1', 'Stephen Curry', 'GSW', '$59,606,817', '$62,587,158', '', '', '', '', '$122,193,975']


In [20]:
# Cell 17: Create salary DataFrame with correct columns
df_salary = pd.DataFrame(salary_rows, columns=correct_headers)
print(f"\nSalary columns: {df_salary.columns.tolist()}")
print(f"Shape: {df_salary.shape}")
print(df_salary[['Player', 'Tm', '2025-26']].head(10))


Salary columns: ['Rk', 'Player', 'Tm', '2025-26', '2026-27', '2027-28', '2028-29', '2029-30', '2030-31', 'Guaranteed']
Shape: (371, 10)
                  Player   Tm      2025-26
0          Stephen Curry  GSW  $59,606,817
1            Joel Embiid  PHI  $55,224,526
2           Nikola Jokić  DEN  $55,224,526
3           Kevin Durant  HOU  $54,708,609
4           Jayson Tatum  BOS  $54,126,450
5  Giannis Antetokounmpo  MIL  $54,126,450
6          Anthony Davis  DAL  $54,126,450
7           Jimmy Butler  GSW  $54,126,450
8         Damian Lillard  MIL  $54,126,450
9           Bradley Beal  PHO  $53,666,270


In [21]:
# Cell 18: Clean salary data - just grab Player and 2025-26
df_salary_clean = df_salary[['Player', '2025-26']].copy()

# Clean the salary column
df_salary_clean['Salary_2025_26'] = df_salary_clean['2025-26'].str.replace('$', '').str.replace(',', '')
df_salary_clean['Salary_2025_26'] = pd.to_numeric(df_salary_clean['Salary_2025_26'], errors='coerce')

# Drop the raw column
df_salary_clean = df_salary_clean[['Player', 'Salary_2025_26']].copy()

print(f"Salary data cleaned. Sample:")
print(df_salary_clean.head(15))
print(f"\nTotal players with salary: {len(df_salary_clean)}")

Salary data cleaned. Sample:
                   Player  Salary_2025_26
0           Stephen Curry      59606817.0
1             Joel Embiid      55224526.0
2            Nikola Jokić      55224526.0
3            Kevin Durant      54708609.0
4            Jayson Tatum      54126450.0
5   Giannis Antetokounmpo      54126450.0
6           Anthony Davis      54126450.0
7            Jimmy Butler      54126450.0
8          Damian Lillard      54126450.0
9            Bradley Beal      53666270.0
10           Jaylen Brown      53142264.0
11           Devin Booker      53142264.0
12     Karl-Anthony Towns      53142264.0
13           LeBron James      52627153.0
14            Paul George      51666090.0

Total players with salary: 371


In [22]:
# Cell 19: Merge salary with stats (handling multiple team entries)

# For players with multiple team entries, we want to merge salary to ALL their rows
# The salary data has each player once, stats may have them multiple times

df_merged = df.merge(df_salary_clean, on='Player', how='left')

print(f"\nMerge complete!")
print(f"Total stat rows (including multi-team players): {len(df_merged)}")
print(f"Unique players: {df_merged['Player'].nunique()}")
print(f"Total rows with salary data: {df_merged['Salary_2025_26'].notna().sum()}")
print(f"Unique players with salary: {df_merged[df_merged['Salary_2025_26'].notna()]['Player'].nunique()}")

# Create qualified dataset - use TOT (total) rows for multi-team players when available
# If no TOT row, use their individual team rows
df_merged_qualified = df_merged[df_merged['MP'] >= 15].copy()

print(f"\nQualified players (15+ MPG):")
print(f"Total qualified rows: {len(df_merged_qualified)}")
print(f"Qualified rows with salary: {df_merged_qualified['Salary_2025_26'].notna().sum()}")

# Show examples of players without salary
missing_salary = df_merged_qualified[df_merged_qualified['Salary_2025_26'].isna()].sort_values('PTS', ascending=False)
print(f"\nTop 10 qualified players missing salary data:")
print(missing_salary[['Player', 'Team', 'MP', 'PTS']].head(10))


Merge complete!
Total stat rows (including multi-team players): 736
Unique players: 570
Total rows with salary data: 368
Unique players with salary: 316

Qualified players (15+ MPG):
Total qualified rows: 473
Qualified rows with salary: 307

Top 10 qualified players missing salary data:
               Player Team    MP   PTS
19       Kyrie Irving  DAL  36.1  24.7
28         Cam Thomas  BRK  31.2  24.0
38       James Harden  LAC  35.3  22.8
117    Quentin Grimes  PHI  33.7  21.9
68      Julius Randle  MIN  32.3  18.7
140   Dennis Schröder  BRK  33.6  18.4
696      Jacob Toppin  ATL  27.0  17.0
96      Malik Beasley  DET  27.8  16.3
102      Myles Turner  IND  30.2  15.6
104  Jonathan Kuminga  GSW  24.3  15.3


In [23]:
# Cell 20: Top 10 players by PER with salary
# For players on multiple teams, prefer their TOT (total) row
df_for_ranking = df_merged_qualified.copy()

# Create a preference: TOT rows first, then single team rows
df_for_ranking['is_tot'] = df_for_ranking['Team'] == 'TOT'

# Remove duplicate players, keeping TOT if available
df_for_ranking = df_for_ranking.sort_values('is_tot', ascending=False).drop_duplicates(subset='Player', keep='first')

print(f"\nUnique qualified players for ranking: {len(df_for_ranking)}")
print("\nTop 10 Players by Simple PER with Salary (min 15 MPG):")
top_per_salary = df_for_ranking.nlargest(10, 'Simple_PER')[['Player', 'Team', 'MP', 'PTS', 'Simple_PER', 'Salary_2025_26']]
print(top_per_salary)


Unique qualified players for ranking: 387

Top 10 Players by Simple PER with Salary (min 15 MPG):
                      Player Team    MP   PTS  Simple_PER  Salary_2025_26
2               Nikola Jokić  DEN  36.7  29.6       11.47      55224526.0
1      Giannis Antetokounmpo  MIL  34.2  30.4       10.47      54126450.0
448              Moses Brown  DAL  18.3  11.8       10.05             NaN
0    Shai Gilgeous-Alexander  OKC  34.2  32.7        9.65      38333050.0
20           Zion Williamson  NOP  28.6  24.6        9.16      39446090.0
24         Victor Wembanyama  SAS  33.2  24.3        9.13      13376880.0
62          Domantas Sabonis  SAC  34.7  19.1        8.90      43636000.0
106            Mark Williams  CHO  26.6  15.3        8.83       6276531.0
23        Karl-Anthony Towns  NYK  35.0  24.4        8.66      53142264.0
203        Jonas Valančiūnas  SAC  16.9   8.7        8.64      10395000.0


In [24]:
# Cell 21: Calculate value with better filters
df_for_ranking['Value'] = df_for_ranking['Simple_PER'] / (df_for_ranking['Salary_2025_26'] / 1_000_000)
df_for_ranking['Value'] = df_for_ranking['Value'].round(2)

# Filter for players making at least $5M
high_earners = df_for_ranking[df_for_ranking['Salary_2025_26'] >= 5_000_000].copy()

print("\nTop 10 Best Value Players (PER per $1M, min $5M salary):")
top_value = high_earners.nlargest(10, 'Value')[['Player', 'Team', 'Simple_PER', 'Salary_2025_26', 'Value']]
print(top_value)

print("\n\nTop 10 Players by Simple PER (min 15 MPG):")
top_performers = df_for_ranking.nlargest(10, 'Simple_PER')[['Player', 'Team', 'MP', 'PTS', 'TRB', 'AST', 'Simple_PER', 'Salary_2025_26']]
print(top_performers)

print("\n\nMost Overpaid (min $20M salary, sorted by worst PER):")
overpaid = df_for_ranking[df_for_ranking['Salary_2025_26'] >= 20_000_000].nsmallest(10, 'Simple_PER')[['Player', 'Team', 'Simple_PER', 'Salary_2025_26', 'Value']]
print(overpaid)


Top 10 Best Value Players (PER per $1M, min $5M salary):
               Player Team  Simple_PER  Salary_2025_26  Value
146     Moritz Wagner  ORL        7.50       5000000.0   1.50
294  Dereck Lively II  DAL        7.53       5253360.0   1.43
106     Mark Williams  CHO        8.83       6276531.0   1.41
252     Nick Richards  PHO        6.83       5000000.0   1.37
344    Andre Drummond  PHI        6.49       5000000.0   1.30
175       Jalen Duren  DET        8.31       6483144.0   1.28
254         Zach Edey  MEM        7.30       6045000.0   1.21
168        Tari Eason  HOU        6.43       5675766.0   1.13
44     Jalen Williams  OKC        6.94       6580997.0   1.05
399   Donovan Clingan  POR        6.82       7178400.0   0.95


Top 10 Players by Simple PER (min 15 MPG):
                      Player Team    MP   PTS   TRB   AST  Simple_PER  \
2               Nikola Jokić  DEN  36.7  29.6  12.7  10.2       11.47   
1      Giannis Antetokounmpo  MIL  34.2  30.4  11.9   6.5       10.47

In [25]:
# Cell 22: Save merged data
df_merged.to_csv('nba_stats_with_salary_2025.csv', index=False)
print("\nFull data with salary saved to nba_stats_with_salary_2025.csv")


Full data with salary saved to nba_stats_with_salary_2025.csv
