In [1]:
import os
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Web Scraping Basketball Reference

## Webscrape 1981

In [8]:
# Download the 1981 awards page and cache the raw HTML on disk so the parsing
# cells below can be re-run without requesting the page again.
mvp_1981_url = 'https://www.basketball-reference.com/awards/awards_1981.html'
# A timeout prevents the cell from hanging forever on a stalled connection.
data_1981 = requests.get(mvp_1981_url, timeout=30)
# Fail loudly on 4xx/5xx responses instead of caching an error page.
data_1981.raise_for_status()
# Create the cache directory on a fresh checkout.
os.makedirs('mvp', exist_ok=True)
# Explicit encoding keeps the written file independent of the platform locale.
with open('mvp/1981.html', 'w', encoding='utf-8') as f:
    f.write(data_1981.text)

## Convert the HTML file to a pandas DataFrame

In [11]:
# Load the cached 1981 page; the explicit encoding avoids decoding the file
# with whatever the platform's default locale happens to be.
with open('mvp/1981.html', encoding='utf-8') as f:
    page_1981 = f.read()
soup = BeautifulSoup(page_1981, 'html.parser')
# Remove the first <tr class="over_header"> row from the parsed document
# (presumably the grouping row above the real column names -- confirm against
# the page markup).
soup.find('tr', class_='over_header').decompose()

In [15]:
# Collect every element whose id is "mvp" and keep the first match.
mvp_matches = soup.find_all(id='mvp')
mvp_1981_table = mvp_matches[0]

In [21]:
# Parse the <table> markup into a DataFrame. read_html returns a list of
# frames, one per table found; the markup holds a single table so take [0].
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
mvp_1981 = pd.read_html(StringIO(str(mvp_1981_table)))[0]

In [22]:
# Display the parsed 1981 table as the cell output.
mvp_1981

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Julius Erving,30,PHI,28.0,454.0,690,0.658,82,35.0,24.6,8.0,4.4,2.1,1.8,0.521,0.222,0.787,13.8,0.231
1,2,Larry Bird,24,BOS,20.0,423.0,690,0.613,82,39.5,21.2,10.9,5.5,2.0,0.8,0.478,0.27,0.863,10.8,0.16
2,3,Kareem Abdul-Jabbar,33,LAL,8.0,286.0,690,0.414,80,37.2,26.2,10.3,3.4,0.7,2.9,0.574,0.0,0.766,14.3,0.23
3,4,Moses Malone,25,HOU,8.0,180.0,690,0.261,80,40.6,27.8,14.8,1.8,1.0,1.9,0.522,0.333,0.757,13.7,0.202
4,5,George Gervin,28,SAS,1.0,83.0,690,0.12,82,33.7,27.1,5.1,3.2,1.1,0.7,0.492,0.257,0.826,10.5,0.182
5,6,Marques Johnson,24,MIL,1.0,73.0,690,0.106,76,33.4,20.3,6.8,4.6,1.5,0.5,0.552,0.0,0.706,11.2,0.211
6,7,Robert Parish,27,BOS,0.0,53.0,690,0.077,82,28.0,18.9,9.5,1.8,1.0,2.6,0.545,0.0,0.71,10.9,0.228
7,8,Dennis Johnson,26,PHO,0.0,50.0,690,0.072,79,33.1,18.8,4.6,3.7,1.7,0.8,0.436,0.216,0.82,8.4,0.154
8,9,Tiny Archibald,32,BOS,0.0,32.0,690,0.046,80,35.3,13.8,2.2,7.7,0.9,0.2,0.499,0.0,0.816,6.9,0.118
9,10,Jamaal Wilkes,27,LAL,0.0,19.0,690,0.028,81,37.4,22.6,5.4,2.9,1.5,0.4,0.526,0.077,0.758,8.5,0.135


In [32]:
# Tag every row with the season it came from.
mvp_1981 = mvp_1981.assign(Year=1981)

In [33]:
# Show the first two rows to check that the Year column was added.
mvp_1981.head(2)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Julius Erving,30,PHI,28.0,454.0,690,0.658,82,35.0,...,8.0,4.4,2.1,1.8,0.521,0.222,0.787,13.8,0.231,1981
1,2,Larry Bird,24,BOS,20.0,423.0,690,0.613,82,39.5,...,10.9,5.5,2.0,0.8,0.478,0.27,0.863,10.8,0.16,1981


## Webscrape 1982-2022

In [31]:
# Seasons still to download: 1982 through 2022 inclusive (1981 was fetched above).
years = [*range(1982, 2023)]

In [30]:
# Download the awards page for every season in `years` and cache each one as
# mvp/<year>.html.
url_year_start = 'https://www.basketball-reference.com/awards/awards_{}.html'

# Create the cache directory on a fresh checkout.
os.makedirs("mvp", exist_ok=True)

for y in years:
    url = url_year_start.format(y)

    # A timeout prevents one stalled request from hanging the whole loop.
    data = requests.get(url, timeout=30)
    # Stop immediately on 4xx/5xx instead of silently caching an error page
    # that would only fail later in the parsing cell.
    data.raise_for_status()

    # Explicit encoding keeps the cached file independent of the platform locale.
    with open(f"mvp/{y}.html", "w", encoding="utf-8") as f:
        f.write(data.text)

    # Pause between requests so the loop does not hit the site back-to-back.
    # NOTE(review): the site is understood to throttle automated traffic;
    # confirm the current limit and adjust the delay if needed.
    time.sleep(4)

## Convert multiple HTML files to DataFrames

In [34]:
# All seasons with a cached page: 1981 through 2022 inclusive.
years = [*range(1981, 2023)]

In [36]:
# Parse each cached awards page into a DataFrame tagged with its season.
data_frames = []
for y in years:
    # Explicit encoding avoids decoding the cache with the platform default.
    with open(f'mvp/{y}.html', encoding='utf-8') as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    # Remove the first <tr class="over_header"> row before handing the table
    # to pandas.
    soup.find('tr', class_='over_header').decompose()
    mvp_table = soup.find_all(id='mvp')[0]
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in recent pandas releases.
    mvp_df = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp_df["Year"] = y
    data_frames.append(mvp_df)

In [37]:
# Stack the per-season frames into one table. Each frame keeps its own row
# labels, so labels repeat across seasons.
mvp_df = pd.concat(objs=data_frames)

# Peek at the last three rows of the combined table.
mvp_df.tail(n=3)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
9,10T,DeMar DeRozan,32,CHI,0.0,1.0,1000,0.001,76,36.1,...,5.2,4.9,0.9,0.3,0.504,0.352,0.877,8.8,0.154,2022
10,10T,Kevin Durant,33,BRK,0.0,1.0,1000,0.001,55,37.2,...,7.4,6.4,0.9,0.9,0.518,0.383,0.91,8.4,0.198,2022
11,10T,LeBron James,37,LAL,0.0,1.0,1000,0.001,56,37.2,...,8.2,6.2,1.3,1.1,0.524,0.359,0.756,7.5,0.172,2022


In [39]:
# Persist the combined table without the row index.
# Create the output directory first so a fresh checkout does not fail here.
os.makedirs('data', exist_ok=True)
mvp_df.to_csv('data/mvp_df.csv', index=False)

In [41]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

In [44]:
# Start a Chrome session driven by a local chromedriver binary.
# The `executable_path=` keyword on webdriver.Chrome is deprecated (it emits
# the warning recorded in this cell's output); the driver location is passed
# through a Service object instead.
from selenium.webdriver.chrome.service import Service

# Allow the driver location to be overridden via the CHROMEDRIVER_PATH
# environment variable; the original hard-coded path remains the default.
chromedriver_path = os.environ.get(
    'CHROMEDRIVER_PATH', '/Users/dcast822/chromedriver')
driver = webdriver.Chrome(
    service=Service(executable_path=chromedriver_path))

  driver = webdriver.Chrome(


## Get Player Data

In [45]:
# Load each season's per-game stats page in the browser and cache the rendered
# HTML as player/<year>.html.
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

# Create the cache directory on a fresh checkout.
os.makedirs('player', exist_ok=True)

for year in years:
    url = player_stats_url.format(year)
    driver.get(url)
    # Scroll down the page, then wait two seconds before capturing the source
    # (presumably to let the full table render -- confirm).
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(2)

    # Explicit encoding: the page source may contain non-ASCII characters that
    # a non-UTF-8 platform default cannot encode.
    with open(f'player/{year}.html', "w", encoding="utf-8") as f:
        f.write(driver.page_source)

# NOTE(review): the browser session is left open here; call driver.quit() once
# no later cell needs it.

In [46]:
# Parse each cached per-game stats page into a DataFrame tagged with its season.
players = []
for year in years:
    # Explicit encoding avoids decoding the cache with the platform default.
    with open(f'player/{year}.html', encoding='utf-8') as f:
        page = f.read()
    soup = BeautifulSoup(page, 'html.parser')
    # NOTE(review): find() returns only the first <tr class="thead"> row, so
    # any further rows with that class stay in the table and end up in the
    # frame. Left as-is because downstream cleaning may depend on them.
    soup.find('tr', class_='thead').decompose()
    players_table = soup.find_all(id="per_game_stats")[0]
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in recent pandas releases.
    players_df = pd.read_html(StringIO(str(players_table)))[0]
    players_df["Year"] = year
    players.append(players_df)
                                 

In [47]:
# Stack the per-season player frames into a single table.
player_df = pd.concat(objs=players)

In [51]:
# Show the first five rows of the combined player table.
player_df.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Kareem Abdul-Jabbar*,C,33,LAL,80,,37.2,10.5,18.2,...,2.5,7.8,10.3,3.4,0.7,2.9,3.1,3.1,26.2,1981
1,2,Tom Abernethy,SF,26,TOT,39,,7.6,0.6,1.5,...,0.5,0.7,1.2,0.5,0.2,0.1,0.2,0.9,1.6,1981
2,2,Tom Abernethy,SF,26,GSW,10,,3.9,0.1,0.3,...,0.1,0.7,0.8,0.1,0.1,0.0,0.2,0.5,0.4,1981
3,2,Tom Abernethy,SF,26,IND,29,,8.9,0.8,1.9,...,0.7,0.7,1.4,0.6,0.2,0.1,0.2,1.0,2.0,1981
4,3,Alvan Adams,C,26,PHO,75,,27.4,6.1,11.6,...,2.1,5.2,7.3,4.6,1.4,0.9,3.0,3.0,14.9,1981


In [62]:
# Persist the combined player table without the row index.
# Create the output directory first so a fresh checkout does not fail here.
os.makedirs("data", exist_ok=True)
player_df.to_csv("data/players.csv", index = False)

## Get Team Data

In [54]:
# Download each season's standings page and cache it as team/<year>.html.
teams_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

# Create the cache directory on a fresh checkout.
os.makedirs("team", exist_ok=True)

for year in years:
    url = teams_url.format(year)
    # A timeout prevents one stalled request from hanging the whole loop.
    data = requests.get(url, timeout=30)
    # Stop immediately on 4xx/5xx instead of caching an error page.
    data.raise_for_status()
    # Explicit encoding keeps the cached file independent of the platform locale.
    with open(f"team/{year}.html", "w", encoding="utf-8") as f:
        f.write(data.text)
    # Pause between requests so the loop does not hit the site back-to-back.
    # NOTE(review): confirm the site's current limit for automated traffic and
    # adjust the delay if needed.
    time.sleep(4)

In [56]:
# Parse the East and West standings tables from every cached page into
# DataFrames tagged with their season.
team_dfs = []

# (table element id, name of the column holding the team names) per conference.
# East is listed first to keep the original append order.
conferences = [
    ('divs_standings_E', "Eastern Conference"),
    ('divs_standings_W', "Western Conference"),
]

for year in years:
    # Explicit encoding avoids decoding the cache with the platform default.
    with open(f'team/{year}.html', encoding='utf-8') as f:
        page = f.read()
    soup = BeautifulSoup(page, 'html.parser')
    # NOTE(review): find() returns only the first <tr class="thead"> row in the
    # whole document, so any later rows with that class stay in the tables.
    # Left as-is because downstream cleaning may depend on them.
    soup.find('tr', class_='thead').decompose()

    for table_id, conference_col in conferences:
        conf_table = soup.find_all(id=table_id)[0]
        # Wrap the markup in StringIO: passing a literal HTML string to
        # read_html is deprecated in recent pandas releases.
        conf_df = pd.read_html(StringIO(str(conf_table)))[0]
        conf_df["Year"] = year
        # Move the conference-named column to a common "Team" column so both
        # conferences share one schema when concatenated.
        conf_df["Team"] = conf_df.pop(conference_col)
        team_dfs.append(conf_df)
                           

In [57]:
# Stack every per-conference, per-season frame into a single table.
teams = pd.concat(objs=team_dfs)

In [58]:
# Show the first five rows of the combined standings table.
teams.head()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,62,20,0.756,—,109.9,104.0,6.05,1981,Boston Celtics*
1,62,20,0.756,—,111.7,103.8,7.76,1981,Philadelphia 76ers*
2,50,32,0.61,12.0,107.9,106.3,2.0,1981,New York Knicks*
3,39,43,0.476,23.0,105.6,105.6,0.42,1981,Washington Bullets
4,24,58,0.293,38.0,106.9,113.0,-5.15,1981,New Jersey Nets


In [59]:
# Show the last five rows of the combined standings table.
teams.tail()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
13,56,26,0.683,—,115.6,109.9,5.37,2022,Memphis Grizzlies*
14,52,30,0.634,4.0,108.0,104.7,3.12,2022,Dallas Mavericks*
15,36,46,0.439,20.0,109.3,110.3,-0.84,2022,New Orleans Pelicans*
16,34,48,0.415,22.0,113.2,113.0,0.02,2022,San Antonio Spurs
17,20,62,0.244,36.0,109.7,118.2,-8.26,2022,Houston Rockets


In [61]:
# Persist the combined standings table without the row index.
# Create the output directory first so a fresh checkout does not fail here.
os.makedirs('data', exist_ok=True)
teams.to_csv('data/teams.csv', index=False)