In [1]:
import requests
import numpy as np
import pandas as pd
import time

In [29]:
#Method for getting season stats
def get_season_data(season):
    #Player season stats
    season_url = f"https://www.basketball-reference.com/leagues/NBA_{season}_per_game.html"
    tables = pd.read_html(season_url)

    df_season = tables[0]
    
    df_season.drop_duplicates(subset=['Player'], keep='first', inplace=True) 
    df_season.drop(df_season.tail(1).index,inplace=True)
    df_season.reset_index(inplace=True, drop=True)

    #Sleep to avoid request limit
    time.sleep(5)
    
    #Team standings stats
    standings_url = f"https://www.basketball-reference.com/leagues/NBA_{season}_standings.html"
    tables_team = pd.read_html(standings_url)

    df_east = tables_team[0]
    df_west = tables_team[1]

    df_east['Eastern Conference'] = df_east['Eastern Conference'].str.replace('*', '')
    df_west['Western Conference'] = df_west['Western Conference'].str.replace('*', '')
    df_east = df_east.rename({'Eastern Conference': 'Team'}, axis=1)
    df_west = df_west.rename({'Western Conference': 'Team'}, axis=1)
    df_east = df_east[['Team', 'W/L%']]
    df_west = df_west[['Team', 'W/L%']]

    df_east = df_east.replace({'Milwaukee Bucks': 'MIL',
                 'Boston Celtics': 'BOS',
                 'Philadelphia 76ers': 'PHI',
                 'Cleveland Cavaliers': 'CLE',
                 'New York Knicks': 'NYK',
                 'Brooklyn Nets': 'BRK',
                 'Miami Heat': 'MIA',
                 'Atlanta Hawks': 'ATL',
                 'Toronto Raptors': 'TOR',
                 'Chicago Bulls': 'CHI',
                 'Indiana Pacers': 'IND',
                 'Washington Wizards': 'WAS',
                 'Orlando Magic': 'ORL',
                 'Charlotte Hornets': 'CHO',
                 'Detroit Pistons': 'DET',
                 'Charlotte Bobcats': 'CHA',
                 'New Jersey Nets': 'NJN'})
    df_west = df_west.replace({'Denver Nuggets': 'DEN',
                 'Memphis Grizzlies': 'MEM',
                 'Sacramento Kings': 'SAC',
                 'Phoenix Suns': 'PHO',
                 'Los Angeles Clippers': 'LAC',
                 'Golden State Warriors': 'GSW',
                 'Los Angeles Lakers': 'LAL',
                 'Minnesota Timberwolves': 'MIN',
                 'New Orleans Pelicans': 'NOP',
                 'Oklahoma City Thunder': 'OKC',
                 'Dallas Mavericks': 'DAL',
                 'Utah Jazz': 'UTA',
                 'Portland Trail Blazers': 'POR',
                 'Houston Rockets': 'HOU',
                 'San Antonio Spurs': 'SAS',
                 'New Orleans Hornets': 'NOP',
                 'Seattle Supersonics': 'SEA',
                 'New Orleans/Oklahoma City Hornets': 'NOK'})

    #Join team stats with player stats
    df_team = pd.concat([df_west, df_east])    
    df_season = df_season.merge(df_team, how='left', on='Team')

    #Fill with 50% for null values
    df_season['W/L%'] = df_season['W/L%'].fillna(0.5)
    df_season.drop(['Rk', 'Age', 'Team', 'Pos', 'G', 'GS', 'FG', 'FGA', '3P', '3PA', '3P%', '2P', '2PA', '2P%',
                    'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TOV', 'PF', 'Awards'],
                     axis=1, inplace=True)

    #Sleep to avoid request limit
    time.sleep(5)
    
    #Advanced stats
    advanced_url = f"https://www.basketball-reference.com/leagues/NBA_{season}_advanced.html"
    tables_adv = pd.read_html(advanced_url)

    df_advanced = tables_adv[0]
    
    df_advanced.drop_duplicates(subset=['Player'], keep='first', inplace=True)
    df_advanced.drop(['Rk', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
                      'OWS', 'DWS', 'WS/48', 'OBPM', 'DBPM', 'Awards'],
                       axis=1, inplace=True)

    df_season = df_season.merge(df_advanced, how='inner', on='Player')
    df_season.dropna()

    #Sleep to avoid request limit
    time.sleep(5)
    
    mvp_url = f"https://www.basketball-reference.com/awards/awards_{season}.html"
    tables_mvp = pd.read_html(mvp_url)

    df_mvp = tables_mvp[0]
    df_mvp = df_mvp.set_axis(['rank', 'Player', 'Age', 'Team', 'First', 'Pts Won', 'Pts Max', 'Share', 'G', 'MP', 'PTS',
                              'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48'], 
                               axis=1)
    df_mvp.drop(['rank', 'Age', 'Team', 'First', 'Pts Won', 'Pts Max', 'G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK',
                 'FG%', '3P%', 'FT%', 'WS', 'WS/48'],
                  axis=1, inplace=True)

    df_season = df_season.merge(df_mvp, how='left', on='Player')
    df_season = df_season.fillna(0)
    df_season['Season'] = season
    df_season = df_season[['Season', 'Player', 'MP', 'PTS', 'AST', 'TRB', 'STL', 'BLK', 'TS%', 'PER', 'WS', 'BPM', 'VORP', 'USG%', 'W/L%', 'Share']]

    #Sleep to avoid request limit
    print('Sleeping', season)
    time.sleep(5)
    
    return df_season

In [30]:
df_list = []

for season in range(2003, 2026):  
    df_season = get_season_data(season)
    df_list.append(df_season)

df_overall = pd.concat(df_list, axis=0).reset_index(drop=True)
df_overall

Sleeping 2003
Sleeping 2004
Sleeping 2005
Sleeping 2006
Sleeping 2007
Sleeping 2008
Sleeping 2009
Sleeping 2010
Sleeping 2011
Sleeping 2012
Sleeping 2013
Sleeping 2014
Sleeping 2015
Sleeping 2016
Sleeping 2017
Sleeping 2018
Sleeping 2019
Sleeping 2020
Sleeping 2021
Sleeping 2022
Sleeping 2023
Sleeping 2024
Sleeping 2025


Unnamed: 0,Season,Player,MP,PTS,AST,TRB,STL,BLK,TS%,PER,WS,BPM,VORP,USG%,W/L%,Share
0,2003,Tracy McGrady,39.4,32.1,5.5,6.5,1.7,0.8,0.564,30.3,16.1,10.5,9.3,35.2,.512,0.359
1,2003,Kobe Bryant,41.5,30.0,5.9,6.9,2.2,0.8,0.550,26.2,14.9,7.1,7.7,32.9,.610,0.417
2,2003,Allen Iverson,42.5,27.6,5.5,4.2,2.7,0.2,0.500,21.2,9.2,3.5,4.8,32.9,.585,0.070
3,2003,Shaquille O'Neal,37.8,27.5,3.1,11.1,0.6,2.4,0.602,29.5,13.2,6.5,5.5,30.2,.610,0.106
4,2003,Paul Pierce,39.2,25.9,4.4,7.3,1.8,0.8,0.532,22.7,10.1,4.9,5.4,33.2,.537,0.001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11338,2025,Riley Minix,7.0,0.0,0.0,2.0,0.0,0.0,0.000,-1.7,0.0,-11.8,0.0,6.1,0.415,0.000
11339,2025,Jahlil Okafor,3.0,0.0,1.0,1.0,0.0,0.0,0.000,13.8,0.0,7.5,0.0,0.0,0.61,0.000
11340,2025,Zyon Pullin,1.0,0.0,0.0,0.0,0.0,0.0,0.000,-11.7,0.0,-22.7,0.0,13.4,0.585,0.000
11341,2025,Isaiah Stevens,2.0,0.0,0.0,0.7,0.3,0.0,0.000,0.1,0.0,-11.8,0.0,14.8,0.451,0.000


In [31]:
df_overall.to_csv('../data/historic_nba_player_data.csv', index=False)