Using Web Scraping to Get NBA MVP Data

In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [9]:
years = list(range(1991,2023))

In [2]:
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

Web Scraping Data from 1990-91 to 2021-2022 NBA Seasons

In [6]:
import requests
import time

for year in years:
    url = url_start.format(year)
    data = requests.get(url)
    time.sleep(5)
    with open("mvps/{}.html".format(year), "w+", encoding="utf-8") as f:
        f.write(data.text)

Using BeautifulSoup to Parse MVP Table from the HTML File

In [2]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [3]:
from bs4 import BeautifulSoup

In [2]:
with open("mvps/1991.html", encoding="utf-8") as f:
    page = f.read()

In [3]:
soup = BeautifulSoup(page, "html.parser")

In [4]:
soup.find('tr', class_="over_header").decompose()

In [5]:
mvp_table = soup.find(id="mvp")

In [4]:
import pandas as pd

In [13]:
mvp_1991 = pd.read_html(str(mvp_table))[0]

In [14]:
mvp_1991

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,31.5,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,19.4,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251
2,3,David Robinson,25,SAS,6.0,476.0,960,0.496,82,37.7,25.6,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264
3,4,Charles Barkley,27,PHI,2.0,222.0,960,0.231,67,37.3,27.6,10.1,4.2,1.6,0.5,0.57,0.284,0.722,13.4,0.258
4,5,Karl Malone,27,UTA,0.0,142.0,960,0.148,82,40.3,29.0,11.8,3.3,1.1,1.0,0.527,0.286,0.77,15.5,0.225
5,6,Clyde Drexler,28,POR,1.0,75.0,960,0.078,82,34.8,21.5,6.7,6.0,1.8,0.7,0.482,0.319,0.794,12.4,0.209
6,7,Kevin Johnson,24,PHO,0.0,32.0,960,0.033,77,36.0,22.2,3.5,10.1,2.1,0.1,0.516,0.205,0.843,12.7,0.22
7,8,Dominique Wilkins,31,ATL,0.0,29.0,960,0.03,81,38.0,25.9,9.0,3.3,1.5,0.8,0.47,0.341,0.829,11.4,0.177
8,9T,Larry Bird,34,BOS,0.0,25.0,960,0.026,60,38.0,19.4,8.5,7.2,1.8,1.0,0.454,0.389,0.891,6.6,0.14
9,9T,Terry Porter,27,POR,0.0,25.0,960,0.026,81,32.9,17.0,3.5,8.0,2.0,0.1,0.515,0.415,0.823,13.0,0.235


In [17]:
dfs = []
for year in years:
    with open("mvps/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_="over_header").decompose()
    mvp_table = soup.find(id="mvp")
    mvp = pd.read_html(str(mvp_table))[0]
    mvp["Year"] = year
    
    dfs.append(mvp)

In [18]:
mvps = pd.concat(dfs)

In [19]:
mvps.tail()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
7,8,Stephen Curry,33,GSW,0.0,4.0,1000,0.004,64,34.5,...,5.2,6.3,1.3,0.4,0.437,0.38,0.923,8.0,0.173,2022
8,9,Chris Paul,36,PHO,0.0,2.0,1000,0.002,65,32.9,...,4.4,10.8,1.9,0.3,0.493,0.317,0.837,9.4,0.21,2022
9,10T,DeMar DeRozan,32,CHI,0.0,1.0,1000,0.001,76,36.1,...,5.2,4.9,0.9,0.3,0.504,0.352,0.877,8.8,0.154,2022
10,10T,Kevin Durant,33,BRK,0.0,1.0,1000,0.001,55,37.2,...,7.4,6.4,0.9,0.9,0.518,0.383,0.91,8.4,0.198,2022
11,10T,LeBron James,37,LAL,0.0,1.0,1000,0.001,56,37.2,...,8.2,6.2,1.3,1.1,0.524,0.359,0.756,7.5,0.172,2022


In [20]:
mvps.head()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,...,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,...,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251,1991
2,3,David Robinson,25,SAS,6.0,476.0,960,0.496,82,37.7,...,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264,1991
3,4,Charles Barkley,27,PHI,2.0,222.0,960,0.231,67,37.3,...,10.1,4.2,1.6,0.5,0.57,0.284,0.722,13.4,0.258,1991
4,5,Karl Malone,27,UTA,0.0,142.0,960,0.148,82,40.3,...,11.8,3.3,1.1,1.0,0.527,0.286,0.77,15.5,0.225,1991


In [21]:
mvps.to_csv("mvps.csv")

Getting Stats for Every Player for the Seasons

In [8]:
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

url = player_stats_url.format(1991)
data = requests.get(url)
with open("player/1991.html", "w+", encoding= "utf-8") as f:
    f.write(data.text)

In [5]:
import requests

Using Selenium to Load Every Player Stats on the HTML Page and Scrape

In [9]:
pip install selenium

Collecting selenium
  Downloading selenium-4.9.0-py3-none-any.whl (6.5 MB)
Collecting trio-websocket~=0.9
  Downloading trio_websocket-0.10.2-py3-none-any.whl (17 kB)
Collecting trio~=0.17
  Downloading trio-0.22.0-py3-none-any.whl (384 kB)
Collecting exceptiongroup>=1.0.0rc9
  Downloading exceptiongroup-1.1.1-py3-none-any.whl (14 kB)
Collecting outcome
  Downloading outcome-1.2.0-py2.py3-none-any.whl (9.7 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
Installing collected packages: outcome, h11, exceptiongroup, wsproto, trio, trio-websocket, selenium
Successfully installed exceptiongroup-1.1.1 h11-0.14.0 outcome-1.2.0 selenium-4.9.0 trio-0.22.0 trio-websocket-0.10.2 wsproto-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
from selenium import webdriver

In [8]:
driver = webdriver.Chrome(executable_path="/Users/dwdw0/Downloads/chromedriver_win32")

  driver = webdriver.Chrome(executable_path="/Users/dwdw0/Downloads/chromedriver_win32")


In [10]:
year = 2023
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"
url = player_stats_url.format(year)

driver.get(url)
driver.execute_script("window.scrollTo(1,10000)")
time.sleep(3)

html = driver.page_source

with open("2023/{}.html".format(year), "w+", encoding= "utf-8") as f:
    f.write(html)

In [9]:
import time

In [17]:
for year in years:
    
    url = player_stats_url.format(year)

    driver.get(url)
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(5)

    html = driver.page_source

    with open("player/{}.html".format(year), "w+", encoding= "utf-8") as f:
        f.write(html)

In [7]:
year = 1991
with open("player/{}.html".format(year), encoding= "utf-8") as f:
    page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_="thead").decompose()
    player_table = soup.find(id="per_game_stats")
    player = pd.read_html(str(player_table))[0]
    player["Year"] = year
    
    

In [15]:
from bs4 import BeautifulSoup

In [16]:
import pandas as pd

In [8]:
player.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,3,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,...,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,4,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,...,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,5,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,...,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991


In [10]:
dfs = []
for year in years:
    with open("player/{}.html".format(year), encoding= "utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_="thead").decompose()
    player_table = soup.find(id="per_game_stats")
    player = pd.read_html(str(player_table))[0]
    player["Year"] = year
    
    dfs.append(player)
    

In [11]:
player.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Precious Achiuwa,C,22,TOR,73,28,23.6,3.6,8.3,...,2.0,4.5,6.5,1.1,0.5,0.6,1.2,2.1,9.1,2022
1,2,Steven Adams,C,28,MEM,76,75,26.3,2.8,5.1,...,4.6,5.4,10.0,3.4,0.9,0.8,1.5,2.0,6.9,2022
2,3,Bam Adebayo,C,24,MIA,56,56,32.6,7.3,13.0,...,2.4,7.6,10.1,3.4,1.4,0.8,2.6,3.1,19.1,2022
3,4,Santi Aldama,PF,21,MEM,32,0,11.3,1.7,4.1,...,1.0,1.7,2.7,0.7,0.2,0.3,0.5,1.1,4.1,2022
4,5,LaMarcus Aldridge,C,36,BRK,47,12,22.3,5.4,9.7,...,1.6,3.9,5.5,0.9,0.3,1.0,0.9,1.7,12.9,2022


In [12]:
players = pd.concat(dfs)

In [13]:
players.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,3,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,...,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,4,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,...,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,5,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,...,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991


In [15]:
players

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,3,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,...,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,4,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,...,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,5,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,...,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
836,601,Thaddeus Young,PF,33,TOR,26,0,18.3,2.6,5.5,...,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3,2022
837,602,Trae Young,PG,23,ATL,76,76,34.9,9.4,20.3,...,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2022
838,603,Omer Yurtseven,C,23,MIA,56,12,12.6,2.3,4.4,...,1.5,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3,2022
839,604,Cody Zeller,C,29,POR,27,0,13.1,1.9,3.3,...,1.9,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2,2022


In [16]:
players.to_csv("players.csv")

Team Record Scraping

In [18]:
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [19]:
year = 1991
url = team_stats_url.format(year)
data = requests.get(url)

with open("team/{}.html".format(year), "w+", encoding= "utf-8") as f:
    f.write(data.text)

In [20]:
import time
for year in years:
    url = team_stats_url.format(year)
    data = requests.get(url)
    time.sleep(4)

    with open("team/{}.html".format(year), "w+", encoding= "utf-8") as f:
        f.write(data.text)

In [23]:
dfs = []
for year in years:
    with open("team/{}.html".format(year), encoding= "utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_="thead").decompose()
    team_table = soup.find(id="divs_standings_E")
    team = pd.read_html(str(team_table))[0]
    team["Year"] = year
    team["Team"] = team["Eastern Conference"]
    del team["Eastern Conference"]
    dfs.append(team)

    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_="thead").decompose()
    team_table = soup.find(id="divs_standings_W")
    team = pd.read_html(str(team_table))[0]
    team["Year"] = year
    team["Team"] = team["Western Conference"]
    del team["Western Conference"]
    dfs.append(team)

    

In [24]:
teams = pd.concat(dfs)

In [25]:
teams

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,.683,—,111.5,105.7,5.22,1991,Boston Celtics*
1,44,38,.537,12.0,105.4,105.6,-0.39,1991,Philadelphia 76ers*
2,39,43,.476,17.0,103.1,103.3,-0.43,1991,New York Knicks*
3,30,52,.366,26.0,101.4,106.4,-4.84,1991,Washington Bullets
4,26,56,.317,30.0,102.9,107.5,-4.53,1991,New Jersey Nets
...,...,...,...,...,...,...,...,...,...
13,56,26,.683,—,115.6,109.9,5.37,2022,Memphis Grizzlies*
14,52,30,.634,4.0,108.0,104.7,3.12,2022,Dallas Mavericks*
15,36,46,.439,20.0,109.3,110.3,-0.84,2022,New Orleans Pelicans*
16,34,48,.415,22.0,113.2,113.0,0.02,2022,San Antonio Spurs


In [26]:
teams.to_csv("teams.csv")

Adding Additional Statistics(Advanced) for Better Prediction

In [3]:
advanced_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html"

In [14]:
year = 1991
url = advanced_stats_url.format(year)

driver.get(url)
driver.execute_script("window.scrollTo(1,10000)")
time.sleep(3)

html = driver.page_source

with open("advanced/{}.html".format(year), "w+", encoding= "utf-8") as f:
    f.write(html)

In [7]:
import time

In [10]:
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

In [12]:
years = list(range(1991,2023))

In [13]:
for year in years:
    
    url = advanced_stats_url.format(year)

    driver.get(url)
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(5)

    html = driver.page_source

    with open("advanced/{}.html".format(year), "w+", encoding= "utf-8") as f:
        f.write(html)

In [17]:
year = 1991
with open("advanced/{}.html".format(year), encoding= "utf-8") as f:
    page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_="thead").decompose()
    advanced_table = soup.find(id="advanced_stats")
    advanced = pd.read_html(str(advanced_table))[0]
    advanced["Year"] = year
    
    

In [18]:
advanced.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,290,13.1,0.499,0.0,...,0.0,0.5,0.5,0.079,,-3.4,-1.2,-4.6,-0.2,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,1505,12.2,0.448,0.099,...,-0.7,-0.3,-1.0,-0.031,,-2.0,-3.0,-5.0,-1.1,1991
2,3,Mark Acres,C,28,ORL,68,1313,9.2,0.551,0.014,...,1.4,1.1,2.5,0.09,,-2.8,-0.2,-3.0,-0.3,1991
3,4,Michael Adams,PG,28,DEN,66,2346,22.3,0.53,0.397,...,5.8,0.4,6.3,0.128,,6.0,-0.7,5.3,4.3,1991
4,5,Mark Aguirre,SF,31,DET,78,2006,16.7,0.526,0.086,...,2.8,2.7,5.5,0.132,,1.2,0.2,1.4,1.7,1991


In [19]:
advanced.dtypes

Rk              object
Player          object
Pos             object
Age             object
Tm              object
G               object
MP              object
PER             object
TS%             object
3PAr            object
FTr             object
ORB%            object
DRB%            object
TRB%            object
AST%            object
STL%            object
BLK%            object
TOV%            object
USG%            object
Unnamed: 19    float64
OWS             object
DWS             object
WS              object
WS/48           object
Unnamed: 24    float64
OBPM            object
DBPM            object
BPM             object
VORP            object
Year             int64
dtype: object

In [20]:
dfs = []
for year in years:
    with open("advanced/{}.html".format(year), encoding= "utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_="thead").decompose()
    advanced_table = soup.find(id="advanced_stats")
    advanced = pd.read_html(str(advanced_table))[0]
    advanced["Year"] = year
    
    dfs.append(advanced)
    

In [21]:
advanced_stats = pd.concat(dfs)

In [22]:
advanced_stats

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,290,13.1,.499,.000,...,0.0,0.5,0.5,.079,,-3.4,-1.2,-4.6,-0.2,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,1505,12.2,.448,.099,...,-0.7,-0.3,-1.0,-0.031,,-2.0,-3.0,-5.0,-1.1,1991
2,3,Mark Acres,C,28,ORL,68,1313,9.2,.551,.014,...,1.4,1.1,2.5,.090,,-2.8,-0.2,-3.0,-0.3,1991
3,4,Michael Adams,PG,28,DEN,66,2346,22.3,.530,.397,...,5.8,0.4,6.3,.128,,6.0,-0.7,5.3,4.3,1991
4,5,Mark Aguirre,SF,31,DET,78,2006,16.7,.526,.086,...,2.8,2.7,5.5,.132,,1.2,0.2,1.4,1.7,1991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
836,601,Thaddeus Young,PF,33,TOR,26,475,15.8,.526,.299,...,0.5,0.8,1.3,.127,,-0.2,2.2,2.0,0.5,2022
837,602,Trae Young,PG,23,ATL,76,2652,25.4,.603,.395,...,9.0,1.0,10.0,.181,,7.1,-2.0,5.2,4.8,2022
838,603,Omer Yurtseven,C,23,MIA,56,706,17.4,.546,.045,...,0.8,1.4,2.1,.145,,-1.4,0.4,-1.0,0.2,2022
839,604,Cody Zeller,C,29,POR,27,355,17.2,.627,.044,...,0.9,0.2,1.1,.143,,-1.2,-1.0,-2.1,0.0,2022


In [23]:
advanced_stats.to_csv("advanced_stats.csv")