In [1]:
import os
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup



In [2]:
# Seasons to scrape: 1991 through 2021, both inclusive.
FIRST_SEASON = 1991
LAST_SEASON = 2021
# range() stops one short of its upper bound, hence the + 1.
years = [season for season in range(FIRST_SEASON, LAST_SEASON + 1)]

In [3]:
# URL template for one season's awards page; the "{}" placeholder is filled
# with the year through str.format.
url_edit = "https://www.basketball-reference.com/awards/awards_{}.html"

We should limit the number of GET requests we send to the website, so each page is downloaded once and saved locally.

In [4]:
# Download each season's awards page once and keep a local copy in "mvp/",
# so the parsing cells can work from disk instead of hitting the website.
MVP_DIR = "mvp"
REQUEST_TIMEOUT_SECONDS = 30  # give up on a hung connection instead of blocking forever
REQUEST_DELAY_SECONDS = 4     # pause between requests to keep the request rate low

# open(..., "w+") does not create missing folders, so make sure it exists.
os.makedirs(MVP_DIR, exist_ok=True)

for year in years:
    path = "{}/{}.html".format(MVP_DIR, year)
    if os.path.exists(path):
        # Already downloaded on an earlier run; delete the file to force a refresh.
        continue

    url = url_edit.format(year)
    data = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on a 4xx/5xx answer (e.g. rate limiting) rather than saving
    # an error page to disk as if it were real data.
    data.raise_for_status()

    with open(path, "w+", encoding="utf-8") as f:
        f.write(data.text)

    time.sleep(REQUEST_DELAY_SECONDS)

In [5]:
# Load the saved 1991 awards page from disk into a single string.
with open("mvp/1991.html", mode="r", encoding="utf-8") as saved_page:
    page = saved_page.read()

In [6]:
# Parse the raw HTML string into a BeautifulSoup tree using the "html.parser" backend.
soup = BeautifulSoup(page, "html.parser")

In [7]:
# Remove the first <tr class="over_header"> row from the parsed tree, in place.
# NOTE(review): presumably this is the grouping row above the real column
# names, dropped so the table parses with a single header row — confirm.
# find() returns None when no such row exists, so this line raises
# AttributeError on a page without it.
soup.find("tr", class_="over_header").decompose()

In [8]:
# Select the element whose id is "mvp"; this is what gets handed to pandas.
mvp_table = soup.find(id="mvp")

Inspect one of the saved files to see what the table looks like.

In [9]:
# Turn the table's HTML into a DataFrame. read_html returns a list of
# DataFrames (one per table it finds), so take the first.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
mvp_1991 = pd.read_html(StringIO(str(mvp_table)))[0]
mvp_1991

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,31.5,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,19.4,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251
2,3,David Robinson,25,SAS,6.0,476.0,960,0.496,82,37.7,25.6,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264
3,4,Charles Barkley,27,PHI,2.0,222.0,960,0.231,67,37.3,27.6,10.1,4.2,1.6,0.5,0.57,0.284,0.722,13.4,0.258
4,5,Karl Malone,27,UTA,0.0,142.0,960,0.148,82,40.3,29.0,11.8,3.3,1.1,1.0,0.527,0.286,0.77,15.5,0.225
5,6,Clyde Drexler,28,POR,1.0,75.0,960,0.078,82,34.8,21.5,6.7,6.0,1.8,0.7,0.482,0.319,0.794,12.4,0.209
6,7,Kevin Johnson,24,PHO,0.0,32.0,960,0.033,77,36.0,22.2,3.5,10.1,2.1,0.1,0.516,0.205,0.843,12.7,0.22
7,8,Dominique Wilkins,31,ATL,0.0,29.0,960,0.03,81,38.0,25.9,9.0,3.3,1.5,0.8,0.47,0.341,0.829,11.4,0.177
8,9T,Larry Bird,34,BOS,0.0,25.0,960,0.026,60,38.0,19.4,8.5,7.2,1.8,1.0,0.454,0.389,0.891,6.6,0.14
9,9T,Terry Porter,27,POR,0.0,25.0,960,0.026,81,32.9,17.0,3.5,8.0,2.0,0.1,0.515,0.415,0.823,13.0,0.235


In [11]:
# Parse every saved awards page and collect one MVP table per season.
dfs = []
for year in years:
    with open("mvp/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Drop the first "over_header" row before handing the table to pandas.
    # find() returns None when the row is absent, so guard the call.
    over_header = soup.find("tr", class_="over_header")
    if over_header is not None:
        over_header.decompose()

    # The id of the table element we are parsing is "mvp" from the site
    mvp_table = soup.find(id="mvp")
    if mvp_table is None:
        # Typically means the saved file is not a real awards page.
        raise ValueError("No element with id 'mvp' in mvp/{}.html".format(year))

    # StringIO wrapper: literal HTML strings are deprecated as read_html input.
    mvp = pd.read_html(StringIO(str(mvp_table)))[0]
    # Adding a year column so that the tables can be distinguished
    mvp["year"] = year

    dfs.append(mvp)

In [12]:
# Stack the per-season tables into one DataFrame. Each table keeps its own
# row labels (ignore_index is not set), so index values repeat across seasons.
mvps = pd.concat(dfs)

In [13]:
# Preview the first rows of the combined table.
mvps.head()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,year
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,...,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,...,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251,1991
2,3,David Robinson,25,SAS,6.0,476.0,960,0.496,82,37.7,...,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264,1991
3,4,Charles Barkley,27,PHI,2.0,222.0,960,0.231,67,37.3,...,10.1,4.2,1.6,0.5,0.57,0.284,0.722,13.4,0.258,1991
4,5,Karl Malone,27,UTA,0.0,142.0,960,0.148,82,40.3,...,11.8,3.3,1.1,1.0,0.527,0.286,0.77,15.5,0.225,1991


In [14]:
# Write the combined MVP table to mvps.csv. The row index is written too,
# because to_csv is called without index=False.
mvps.to_csv("mvps.csv")

Before we can predict future MVPs, we need the per-game stats for each player.

In [15]:
# URL template for one season's per-game player stats page; "{}" is filled
# with the year through str.format.
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

Check out the 1991 stats page first.

In [16]:
# First attempt: fetch the 1991 per-game stats page with a plain HTTP request
# and save the response body locally.
url_1991 = player_stats_url.format("1991")
data = requests.get(url_1991, timeout=30)  # timeout so a hung connection cannot block forever
# Stop on a 4xx/5xx answer rather than saving an error page as if it were data.
data.raise_for_status()

# open(..., "w+") does not create missing folders, so make sure it exists.
os.makedirs("player_stats", exist_ok=True)
with open("player_stats/1991.html", "w+", encoding="utf-8") as f:
    f.write(data.text)

The plain `requests` approach above does not capture all of the data in the table element, because the page uses JavaScript to load the table as you need it. We therefore use Selenium, which drives a real browser.

In [17]:
from selenium import webdriver

In [18]:
from selenium.webdriver.chrome.service import Service

# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run this on another machine; the fallback is the original
# hard-coded location.
CHROMEDRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\Daniel Tomiwa\Downloads\Setups\chromedriver",
)

# Selenium 4 deprecated (and later removed) the executable_path keyword on
# webdriver.Chrome; the driver location is passed through a Service object.
driver = webdriver.Chrome(service=Service(executable_path=CHROMEDRIVER_PATH))

  """Entry point for launching an IPython kernel.


In [39]:
import time
# Load the page in the Selenium-controlled browser (the counterpart of requests.get).
driver.get(url_1991)
driver.execute_script("window.scrollTo(1,10000)") # Scroll the window with JavaScript, which a plain GET request cannot do
time.sleep(2) # Wait two seconds before reading the page (presumably so scroll-triggered content can load — confirm)

htmml = driver.page_source # page_source is the browser's current HTML, analogous to .text on a requests response

In [42]:
# Save the HTML captured from the browser as the local 1991 per-game stats page.
with open("player_stats/1991.html", mode="w+", encoding="utf-8") as out_file:
    out_file.write(htmml)

In [44]:
# Capture every season's per-game stats page through the browser and save
# each one under player_stats/<year>.html.
for year in years:
    url = player_stats_url.format(year)
    driver.get(url)

    # Scroll with JavaScript, then wait two seconds before grabbing the HTML.
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(2)
    html = driver.page_source

    output_path = "player_stats/{}.html".format(year)
    with open(output_path, "w+", encoding="utf-8") as f:
        f.write(html)

In [22]:
# Parse every saved per-game stats page and collect one DataFrame per season.
dfs = []

for year in years:
    with open("player_stats/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Remove the first <tr class="thead"> row; guarded because find() returns
    # None when there is no such row.
    # NOTE(review): find() only matches the first such row. If the table
    # contains further "thead" rows they stay in and come through as ordinary
    # data rows — confirm the downstream cleaning accounts for that.
    first_thead = soup.find("tr", class_="thead")
    if first_thead is not None:
        first_thead.decompose()

    stat_table = soup.find(id="per_game_stats")
    if stat_table is None:
        # Typically means the saved file is not a real stats page.
        raise ValueError(
            "No element with id 'per_game_stats' in player_stats/{}.html".format(year)
        )

    # StringIO wrapper: literal HTML strings are deprecated as read_html input.
    stats = pd.read_html(StringIO(str(stat_table)))[0]
    # Tag each row with its season so the tables can be told apart once combined.
    stats["year"] = year

    dfs.append(stats)

In [23]:
# Stack the per-season stat tables into one DataFrame. Each table keeps its
# own row labels (ignore_index is not set), so index values repeat across seasons.
player_stats = pd.concat(dfs)

In [24]:
# Display the combined player stats table as the cell output.
player_stats

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,3,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,...,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,4,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,...,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,5,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,...,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,536,Delon Wright,PG,28,SAC,27,8,25.8,3.9,8.3,...,1.0,2.9,3.9,3.6,1.6,0.4,1.3,1.1,10.0,2021
726,537,Thaddeus Young,PF,32,CHI,68,23,24.3,5.4,9.7,...,2.5,3.8,6.2,4.3,1.1,0.6,2.0,2.2,12.1,2021
727,538,Trae Young,PG,22,ATL,63,63,33.7,7.7,17.7,...,0.6,3.3,3.9,9.4,0.8,0.2,4.1,1.8,25.3,2021
728,539,Cody Zeller,C,28,CHO,48,21,20.9,3.8,6.8,...,2.5,4.4,6.8,1.8,0.6,0.4,1.1,2.5,9.4,2021


In [25]:
# Write the combined player stats to player_stats.csv. The row index is
# written too, because to_csv is called without index=False.
player_stats.to_csv("player_stats.csv")

In order to build a model that can identify the next MVP, we also need the record of the team each player was on.

In [28]:
# URL template for one season's standings page; "{}" is filled with the year
# through str.format.
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

Look at the team stats for the 1991 season first.

In [30]:
# Fetch the standings page for a single season (1991) and save it locally.
year = 1991

url = team_stats_url.format(year)
data = requests.get(url, timeout=30)  # timeout so a hung connection cannot block forever
# Stop on a 4xx/5xx answer rather than saving an error page as if it were data.
data.raise_for_status()

# open(..., "w+") does not create missing folders, so make sure it exists.
os.makedirs("team_stats", exist_ok=True)
with open("team_stats/{}.html".format(year), "w+", encoding="utf-8") as f:
    f.write(data.text)

In [31]:
# Download each season's standings page once and keep a local copy in
# "team_stats/", so the parsing cell can work from disk.
TEAM_STATS_DIR = "team_stats"
REQUEST_TIMEOUT_SECONDS = 30  # give up on a hung connection instead of blocking forever
REQUEST_DELAY_SECONDS = 4     # pause between requests to keep the request rate low

# open(..., "w+") does not create missing folders, so make sure it exists.
os.makedirs(TEAM_STATS_DIR, exist_ok=True)

for year in years:
    path = "{}/{}.html".format(TEAM_STATS_DIR, year)
    if os.path.exists(path):
        # Already downloaded on an earlier run; delete the file to force a refresh.
        continue

    url = team_stats_url.format(year)
    data = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on a 4xx/5xx answer (e.g. rate limiting) rather than saving
    # an error page to disk as if it were real data.
    data.raise_for_status()

    with open(path, "w+", encoding="utf-8") as f:
        f.write(data.text)

    time.sleep(REQUEST_DELAY_SECONDS)

In [33]:
# Parse every saved standings page and collect two DataFrames per season:
# the Eastern table first, then the Western one.
dfs = []
for year in years:

    # Read and parse the file once; both conference tables come from this tree.
    with open("team_stats/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # Remove the first <tr class="thead"> row of the document, exactly once.
    # find() only matches the first such row, so any later "thead" rows stay
    # in the tables and come through as ordinary rows.
    first_thead = soup.find("tr", class_="thead")
    if first_thead is not None:
        first_thead.decompose()

    # (table element id, header of the column that holds the team names)
    for table_id, conference_col in (
        ("divs_standings_E", "Eastern Conference"),
        ("divs_standings_W", "Western Conference"),
    ):
        team_stats = soup.find(id=table_id)
        if team_stats is None:
            # Typically means the saved file is not a real standings page.
            raise ValueError(
                "No element with id '{}' in team_stats/{}.html".format(table_id, year)
            )

        # StringIO wrapper: literal HTML strings are deprecated as read_html input.
        team = pd.read_html(StringIO(str(team_stats)))[0]
        team["year"] = year
        # The team-name column is headed by the conference name; copy it into
        # a shared "Team" column so both conferences line up when concatenated.
        team["Team"] = team[conference_col]
        del team[conference_col]
        dfs.append(team)

In [34]:
# Stack all conference/season tables into one DataFrame. Each table keeps its
# own row labels (ignore_index is not set), so index values repeat.
teams = pd.concat(dfs)

In [35]:
# Display the combined team standings table as the cell output.
teams

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,year,Team
0,56,26,.683,—,111.5,105.7,5.22,1991,Boston Celtics*
1,44,38,.537,12.0,105.4,105.6,-0.39,1991,Philadelphia 76ers*
2,39,43,.476,17.0,103.1,103.3,-0.43,1991,New York Knicks*
3,30,52,.366,26.0,101.4,106.4,-4.84,1991,Washington Bullets
4,26,56,.317,30.0,102.9,107.5,-4.53,1991,New Jersey Nets
...,...,...,...,...,...,...,...,...,...
13,42,30,.583,—,112.4,110.2,2.26,2021,Dallas Mavericks*
14,38,34,.528,4.0,113.3,112.3,1.07,2021,Memphis Grizzlies*
15,33,39,.458,9.0,111.1,112.8,-1.58,2021,San Antonio Spurs
16,31,41,.431,11.0,114.6,114.9,-0.20,2021,New Orleans Pelicans


In [36]:
# Write the combined standings to team_stats.csv. The row index is written
# too, because to_csv is called without index=False.
teams.to_csv("team_stats.csv")