NBA Stats Web Scraping 
Data sourced from basketball-reference.com based on the 2024-2025 season
Refer to nba_stats_sum.ipynb for the data analysis and exploration
Refer to nba_stats_sum.py for the code (no visualizations or outputs)

In [2]:
#1: Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [3]:
#2: Set up the URL and headers for advanced stats
url = 'https://www.basketball-reference.com/leagues/NBA_2025_advanced.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

In [4]:
#3: Make the request for advanced stats
print(f"Fetching data from {url}...")
response = requests.get(url, headers=headers)
print(f"Status code: {response.status_code}")


Fetching data from https://www.basketball-reference.com/leagues/NBA_2025_advanced.html...
Status code: 200


In [5]:
#4: Parse the HTML
soup = BeautifulSoup(response.content, 'lxml')

# The advanced stats table likely has id='advanced'
table = soup.find('table', {'id': 'advanced'})

if not table:
    # Sometimes it's in the page differently, let's check all tables
    all_tables = soup.find_all('table')
    print(f"Found {len(all_tables)} tables")
    for t in all_tables:
        print(f"Table ID: {t.get('id')}")
    table = all_tables[0] if all_tables else None

print("Table found!" if table else "Table not found")

Table found!


In [6]:
#5: Extract column headers
headers_list = []
for th in table.find('thead').find_all('th'):
    headers_list.append(th.text.strip())
print(f"Columns: {headers_list}")

Columns: ['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Awards']


In [7]:
#6: Extract all rows
rows = []
for tr in table.find('tbody').find_all('tr'):
    # Skip header rows that appear in the middle
    if tr.find('th', {'scope': 'row'}) is None:
        continue
    
    row = []
    for td in tr.find_all(['th', 'td']):
        row.append(td.text.strip())
    
    if row:
        rows.append(row)

print(f"Scraped {len(rows)} player records")


Scraped 736 player records


In [8]:
#7: Create DataFrame
df = pd.DataFrame(rows, columns=headers_list)
print(df.head())

  Rk           Player Age Team Pos   G  GS    MP   PER   TS%  ...  USG%  OWS  \
0  1    Mikal Bridges  28  NYK  SF  82  82  3036  14.0  .585  ...  19.6  3.7   
1  2        Josh Hart  29  NYK  SG  77  77  2897  16.5  .611  ...  15.3  5.4   
2  3  Anthony Edwards  23  MIN  SG  79  79  2871  20.1  .595  ...  31.4  4.6   
3  4     Devin Booker  28  PHO  SG  75  75  2795  19.3  .589  ...  29.3  6.1   
4  5     James Harden  35  LAC  PG  79  79  2789  20.0  .582  ...  29.6  4.0   

   DWS   WS WS/48 OBPM  DBPM   BPM VORP                Awards  
0  2.0  5.7  .090  0.4  -0.9  -0.5  1.2                        
1  3.8  9.2  .153  1.1   1.8   2.8  3.6                        
2  3.8  8.4  .140  4.4   0.0   4.3  4.6  MVP-7,CPOY-3,AS,NBA2  
3  0.3  6.4  .111  2.8  -2.4   0.4  1.7                        
4  4.3  8.3  .143  3.5   0.8   4.3  4.4        MVP-10,AS,NBA3  

[5 rows x 29 columns]


In [9]:
#8: Convert numeric columns
# Advanced stats are mostly numeric after the first few columns
numeric_cols = df.columns[5:]  # Stats start after Pos column
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

print("\nData types after conversion:")
print(df.dtypes)


Data types after conversion:
Rk         object
Player     object
Age        object
Team       object
Pos        object
G         float64
GS        float64
MP        float64
PER       float64
TS%       float64
3PAr      float64
FTr       float64
ORB%      float64
DRB%      float64
TRB%      float64
AST%      float64
STL%      float64
BLK%      float64
TOV%      float64
USG%      float64
OWS       float64
DWS       float64
WS        float64
WS/48     float64
OBPM      float64
DBPM      float64
BPM       float64
VORP      float64
Awards    float64
dtype: object


In [10]:
#9: Filter for qualified players (at least 15 MPG)
df_qualified = df[df['MP'] >= 15].copy()

print(f"\nQualified players (15+ MPG): {len(df_qualified)}")
print(f"Average PER for qualified: {df_qualified['PER'].mean():.2f}")

print("\nTop 10 Qualified Players by PER:")
top_per_qual = df_qualified.nlargest(10, 'PER')[['Player', 'Team', 'MP', 'PER', 'TS%', 'WS', 'BPM']]
print(top_per_qual)


Qualified players (15+ MPG): 710
Average PER for qualified: 13.31

Top 10 Qualified Players by PER:
                      Player Team      MP   PER    TS%    WS   BPM
23              Nikola Jokić  DEN  2571.0  32.0  0.663  16.4  13.3
19   Shai Gilgeous-Alexander  OKC  2598.0  30.7  0.637  16.7  11.5
45     Giannis Antetokounmpo  MIL  2289.0  30.5  0.625  11.5   9.5
619         MarJon Beauchamp  NYK    17.0  29.8  0.588   0.1   3.7
367          Zion Williamson  NOP   857.0  27.3  0.600   2.5   7.0
170            Anthony Davis  LAL  1440.0  27.1  0.601   6.2   6.0
169            Anthony Davis  2TM  1706.0  26.3  0.588   6.7   5.4
642              Moses Brown  DAL    73.0  26.3  0.743   0.3   3.5
488                Kai Jones  DAL   260.0  24.8  0.837   1.4   4.8
279           Daniel Gafford  DAL  1226.0  24.7  0.716   5.9   3.8


In [11]:
#10: Save advanced stats to CSV
df.to_csv('nba_advanced_stats_2025.csv', index=False)
print("\nAdvanced stats saved to nba_advanced_stats_2025.csv")


Advanced stats saved to nba_advanced_stats_2025.csv


In [12]:
#11: Scrape player salaries
salary_url = 'https://www.basketball-reference.com/contracts/players.html'

print(f"\nFetching salary data from {salary_url}...")
salary_response = requests.get(salary_url, headers=headers)
print(f"Status code: {salary_response.status_code}")


Fetching salary data from https://www.basketball-reference.com/contracts/players.html...
Status code: 200


In [13]:
#12: Parse salary table
salary_soup = BeautifulSoup(salary_response.content, 'lxml')
salary_table = salary_soup.find('table', {'id': 'player-contracts'})
print("Salary table found!" if salary_table else "Salary table not found")

Salary table found!


In [14]:
#13: Extract salary data with correct column names
correct_headers = ['Rk', 'Player', 'Tm', '2025-26', '2026-27', '2027-28', '2028-29', '2029-30', '2030-31', 'Guaranteed']

salary_rows = []
for tr in salary_table.find('tbody').find_all('tr'):
    row = []
    
    # Get all cells in the row
    cells = tr.find_all(['th', 'td'])
    
    for cell in cells:
        row.append(cell.text.strip())
    
    # Only keep rows that have 10 columns (matching our headers)
    if len(row) == 10:
        salary_rows.append(row)

print(f"Scraped {len(salary_rows)} salary records with 10 columns")

Scraped 494 salary records with 10 columns


In [15]:
#14: Create salary DataFrame
df_salary = pd.DataFrame(salary_rows, columns=correct_headers)
print(f"\nSalary columns: {df_salary.columns.tolist()}")
print(f"Shape: {df_salary.shape}")



Salary columns: ['Rk', 'Player', 'Tm', '2025-26', '2026-27', '2027-28', '2028-29', '2029-30', '2030-31', 'Guaranteed']
Shape: (494, 10)


In [16]:
#15: Clean salary data
df_salary_clean = df_salary[['Player', '2025-26']].copy()

# Clean the salary column - ensure it's a string first
df_salary_clean['Salary_2025_26'] = df_salary_clean['2025-26'].astype(str).str.replace('$', '').str.replace(',', '')
df_salary_clean['Salary_2025_26'] = pd.to_numeric(df_salary_clean['Salary_2025_26'], errors='coerce')

# Drop the raw column
df_salary_clean = df_salary_clean[['Player', 'Salary_2025_26']].copy()

print(f"\nSalary data cleaned.")
print(f"Total players with salary: {len(df_salary_clean)}")
print(df_salary_clean.head(10))

# Create CSV file 
df_salary_clean.to_csv('nba_player_salaries_2025.csv', index=False)
print("\nSalary data saved to nba_player_salaries_2025.csv")


Salary data cleaned.
Total players with salary: 494
                  Player  Salary_2025_26
0          Stephen Curry      59606817.0
1            Joel Embiid      55224526.0
2           Nikola Jokić      55224526.0
3           Kevin Durant      54708609.0
4           Jayson Tatum      54126450.0
5  Giannis Antetokounmpo      54126450.0
6          Anthony Davis      54126450.0
7           Jimmy Butler      54126450.0
8         Damian Lillard      68230450.0
9           Bradley Beal      59020270.0

Salary data saved to nba_player_salaries_2025.csv
