We will be using machine learning to predict NBA's defensive player of the year award

In [1]:
import os
import time
import requests as req
from bs4 import BeautifulSoup ,Comment
import html5lib
import pandas as pd
pd.options.display.max_columns = 50
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [2]:
#We will be scraping data from years 1991-2021 and saving the HTML files for later
url_DPOY = "https://www.basketball-reference.com/awards/awards_{}.html"

years = list(range(1991,2023))

for year in years:
    url = url_DPOY.format(year)
    data = req.get(url)

    with open("DPOYs/{}.html".format(year), "w+") as f:
        f.write(data.text)


In [3]:
#We will be using BeautifulSoup to extract comments from the HTML files

def get_comments(year):
    with open("DPOYs/{}.html".format(year), "r") as f:
        soup = BeautifulSoup(f, "html.parser")
        comments = soup.find_all(string=lambda text:isinstance(text, Comment))
        return comments

In [4]:
#Now we will overwrite the HTML files with the comments
for year in years:
    comments = get_comments(year)
    with open("DPOYs/{}.html".format(year), "w+") as f:
        for comment in comments:
            f.write(comment)

In [5]:
#We will be using pandas to read the comments and extract the data
dfs = []
for year in years:
    with open("DPOYs/{}.html".format(year)) as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_="over_header").decompose()
    dpoy_table = soup.find(id="dpoy")
    dpoy = pd.read_html(str(dpoy_table))[0]
    dpoy["Year"] = year

    dfs.append(dpoy)


In [6]:
#Now we will concatenate the dataframes into one dataframe for use in the analysis
dpoys = pd.concat(dfs)

In [8]:
#We will be looking at dpoys years 1991-2000
dpoys[dpoys["Year"] == 2022]


Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Marcus Smart,27,BOS,37.0,257.0,500,0.514,71,32.3,12.1,3.8,5.9,1.7,0.3,0.418,0.331,0.793,5.6,0.116,2022
1,2,Mikal Bridges,25,PHO,22.0,202.0,500,0.404,82,34.8,14.2,4.2,2.3,1.2,0.4,0.534,0.369,0.834,8.9,0.15,2022
2,3,Rudy Gobert,29,UTA,12.0,136.0,500,0.272,66,32.1,15.6,14.7,1.1,0.7,2.1,0.713,0.0,0.69,11.7,0.264,2022
3,4,Bam Adebayo,24,MIA,13.0,128.0,500,0.256,56,32.6,19.1,10.1,3.4,1.4,0.8,0.557,0.0,0.753,7.2,0.188,2022
4,5,Jaren Jackson Jr.,22,MEM,10.0,99.0,500,0.198,78,27.3,16.3,5.8,1.1,0.9,2.3,0.415,0.319,0.823,5.4,0.121,2022
5,6,Giannis Antetokounmpo,27,MIL,5.0,58.0,500,0.116,67,32.9,29.9,11.6,5.8,1.1,1.4,0.553,0.293,0.722,12.9,0.281,2022
6,7,Robert Williams,24,BOS,1.0,8.0,500,0.016,61,29.6,10.0,9.6,2.0,0.9,2.2,0.736,0.0,0.722,9.9,0.262,2022
7,8,Jrue Holiday,31,MIL,0.0,6.0,500,0.012,67,32.9,18.3,4.5,6.8,1.6,0.4,0.501,0.411,0.761,6.9,0.15,2022
8,9,Al Horford,35,BOS,0.0,3.0,500,0.006,69,29.1,10.2,7.7,3.4,0.7,1.3,0.467,0.336,0.842,7.6,0.181,2022
9,10,Draymond Green,31,GSW,0.0,2.0,500,0.004,46,28.9,7.5,7.3,7.0,1.3,1.1,0.525,0.296,0.659,3.6,0.131,2022


In [9]:
#Let's save the dataframe to a csv file under the DPOYs folder
dpoys.to_csv("DPOYs/dpoys.csv")

In [23]:
#The next page we are going to extract data from requires javascript to be enabled, therefore we need to initialize selenium
driver = webdriver.Chrome(executable_path="/Users/david/Desktop/chromedriver")

  driver = webdriver.Chrome(executable_path="/Users/david/Desktop/chromedriver")


In [24]:
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

for year in years:
    url = player_stats_url.format(year)

    driver.get(url)
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(2)

    html = driver.page_source
    with open("Players/{}.html".format(year), "w+") as f:
        f.write(html)

In [25]:
#Now it's time to extract the table and remove an unwanted headers like we did with the DPOYs
dfs = []
for year in years:
    with open("Players/{}.html".format(year)) as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr',class_="thead").decompose()
    player_table = soup.find_all(id="per_game_stats")[0]
    player_df = pd.read_html(str(player_table))[0]
    player_df["Year"] = year
    dfs.append(player_df)

In [27]:
# Let's concatenate the dataframes into one dataframe
players = pd.concat(dfs)

players.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,0.474,0.0,0.0,,1.3,2.7,0.474,0.474,0.6,1.0,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,0.413,0.4,1.5,0.24,5.9,13.6,0.432,0.425,1.3,1.5,0.857,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,3,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,0.509,0.0,0.0,0.333,1.6,3.1,0.512,0.512,1.0,1.5,0.653,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,4,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,0.394,2.5,8.5,0.296,6.0,13.0,0.459,0.453,7.0,8.0,0.879,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,5,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,0.462,0.3,1.0,0.308,5.1,10.7,0.477,0.475,3.1,4.1,0.757,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991


In [28]:
#Let's save the dataframe to a csv file under the Players folder
players.to_csv("Players/players.csv")

In [31]:
#Now we will be extracting each team's record for each year
team_records_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

for year in years:
    url = team_records_url.format(year)

    data = req.get(url)

    with open("Team_Records/{}.html".format(year), "w+") as f:
        f.write(data.text)

In [32]:
#Now we will be extracting the table and removing the unwanted headers while adding columns
dfs = []

for year in years:
    with open("Team_Records/{}.html".format(year)) as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="thead").decompose()
    east_table = soup.find_all(id="divs_standings_E")[0]
    east_df = pd.read_html(str(east_table))[0]
    east_df["Year"] = year
    east_df["Team"] = east_df["Eastern Conference"]
    del east_df["Eastern Conference"]
    dfs.append(east_df)
    
    west_table = soup.find_all(id="divs_standings_W")[0]
    west_df = pd.read_html(str(west_table))[0]
    west_df["Year"] = year
    west_df["Team"] = west_df["Western Conference"]
    del west_df["Western Conference"]
    dfs.append(west_df)

In [33]:
teams = pd.concat(dfs)

In [35]:
teams.tail()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
13,56,26,0.683,—,115.6,109.9,5.37,2022,Memphis Grizzlies*
14,52,30,0.634,4.0,108.0,104.7,3.12,2022,Dallas Mavericks*
15,36,46,0.439,20.0,109.3,110.3,-0.84,2022,New Orleans Pelicans*
16,34,48,0.415,22.0,113.2,113.0,0.02,2022,San Antonio Spurs
17,20,62,0.244,36.0,109.7,118.2,-8.26,2022,Houston Rockets


In [36]:
teams.to_csv('Team_Records/teams.csv')