# Data scraping

In [4]:
import time
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

pd.set_option('display.max_columns', None) # display all columns in a wide DataFrame

In [2]:
# seasons covered by every loop below: 1992 through 2023, both inclusive
FIRST_YEAR, LAST_YEAR = 1992, 2023
years = range(FIRST_YEAR, LAST_YEAR + 1)

In [3]:
# Use selenium to grab the player season-totals page for every year in `years`
# and save the rendered HTML to player_stats/<year>.html.
# The pages are saved to minimize requests on the site, so a year whose file is
# already on disk is skipped: re-running this cell only downloads what is missing.
# Delete a file to force that year to be fetched again.
start_time = time.time()
player_pages_dir = Path('player_stats')
player_pages_dir.mkdir(parents=True, exist_ok=True)  # open() below fails if the folder is missing

missing_years = [year for year in years if not (player_pages_dir / f'{year}.html').exists()]
if missing_years:
    # one browser session for all pages instead of launching Firefox once per year
    with webdriver.Firefox() as driver:
        for year in missing_years:
            url = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
            driver.get(url)
            driver.execute_script('window.scrollTo(1,10000)')
            time.sleep(3)  # pause after scrolling, before the page source is captured

            html = driver.page_source
            with open(player_pages_dir / f'{year}.html', 'w', encoding='utf-8') as f:
                f.write(html)

print(f'Total run time : {(time.time()-start_time)/60} minutes')

In [5]:
# Open each saved page and use BeautifulSoup to extract the stats table
# (the <table> with id 'totals_stats'). Each year's table is parsed into a
# DataFrame, tagged with its year, and collected in `data` to be combined later.
data = []
for year in years:
    page_path = f'player_stats/{year}.html'
    with open(page_path, encoding='utf-8') as f:
        page = f.read()
    soup = BeautifulSoup(page, 'html.parser')
    stats_table = soup.find('table', id='totals_stats')
    if stats_table is None:
        # fail with the file name instead of letting read_html choke on the string 'None'
        raise ValueError(f"no table with id 'totals_stats' found in {page_path}")
    # read_html expects a file-like object; passing literal HTML text is deprecated in pandas >= 2.1
    # NOTE(review): any header rows repeated inside the table body would be kept as data rows — confirm and clean downstream
    season_stats = pd.read_html(StringIO(str(stats_table)))[0]
    season_stats['Year'] = year

    data.append(season_stats)

In [6]:
# stack the per-year tables into one DataFrame; every table keeps its own
# row labels, so the same label shows up once per year in the result
player_stats = pd.concat(data, ignore_index=False)

In [7]:
# spot-check five randomly chosen rows of the combined player table
player_stats.sample(n=5)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
341,248,Jim McIlvaine,C,24,SEA,82,79,1477,130,276,0.471,1,7,0.143,129,269,0.48,0.473,53,107,0.495,132,198,330,23,39,164,62,247,314,1997
169,137,PJ Dozier,PG,21,OKC,2,0,3,1,2,0.5,0,0,,1,2,0.5,0.5,0,0,,0,1,1,0,0,0,1,1,2,2018
68,54,Jud Buechler,SF,23,NJN,2,0,29,4,8,0.5,0,0,,4,8,0.5,0.5,0,0,,2,0,2,2,2,1,1,2,8,1992
227,155,Sundiata Gaines,PG,24,TOT,24,0,301,42,105,0.4,9,38,0.237,33,67,0.493,0.443,18,36,0.5,13,25,38,42,16,1,27,25,111,2011
289,231,James Johnson,PF,28,TOR,57,32,926,114,240,0.475,20,66,0.303,94,174,0.54,0.517,39,68,0.574,28,98,126,67,29,33,54,84,287,2016


In [8]:
# persist the combined player table; the row labels are written as the first column
player_stats.to_csv('player_stats.csv', index=True)

In [20]:
# Using the same years as above, download each season's team ratings page and
# save it to team_ratings/<year>.html. Plain requests is used this time.
# A year whose file is already on disk is skipped, so re-running this cell only
# fetches what is missing; delete a file to force that year to be fetched again.
start_time = time.time()
team_pages_dir = Path('team_ratings')
team_pages_dir.mkdir(parents=True, exist_ok=True)  # open() below fails if the folder is missing

for year in years:
    page_path = team_pages_dir / f'{year}.html'
    if page_path.exists():
        continue  # already downloaded: no request, no wait

    res = requests.get(
        f'https://www.basketball-reference.com/leagues/NBA_{year}_ratings.html',
        timeout=30,  # without a timeout a stalled connection would hang the cell
    )
    # stop on an HTTP error instead of saving the error page as if it were data
    res.raise_for_status()

    with open(page_path, 'w', encoding='utf-8') as f:
        f.write(res.text)

    time.sleep(15)  # wait between requests to go easy on the site

print(f'Total run time : {(time.time()-start_time)/60} minutes')

Total run time : 8.145839556058247 minutes


In [21]:
# Open each saved ratings page, pull out the <table> with id 'ratings', parse it
# into a DataFrame, tag it with its year, and collect it in `team_data`.
team_data = []
for year in years:
    page_path = f'team_ratings/{year}.html'
    with open(page_path, encoding='utf-8') as f:
        page = f.read()
    soup = BeautifulSoup(page, 'html.parser')
    ratings_table = soup.find('table', id='ratings')
    if ratings_table is None:
        # fail with the file name instead of an opaque parser error
        raise ValueError(f"no table with id 'ratings' found in {page_path}")
    # Drop the extra 'over_header' row of this table before parsing. The lookup is
    # limited to the ratings table and tolerates the row being absent.
    # presumably read_html would otherwise return two-level column labels — TODO confirm
    over_header = ratings_table.find('tr', class_='over_header')
    if over_header is not None:
        over_header.decompose()
    # read_html expects a file-like object; passing literal HTML text is deprecated in pandas >= 2.1
    season_ratings = pd.read_html(StringIO(str(ratings_table)))[0]
    season_ratings['Year'] = year

    team_data.append(season_ratings)

In [22]:
# stack the per-year rating tables into one DataFrame; every table keeps its
# own row labels, so the same label shows up once per year in the result
team_ratings = pd.concat(team_data, ignore_index=False)

In [23]:
# spot-check five randomly chosen rows of the combined team ratings table
team_ratings.sample(n=5)

Unnamed: 0,Rk,Team,Conf,Div,W,L,W/L%,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A,Year
16,17,Milwaukee Bucks,E,C,42,40,0.512,-0.18,109.92,110.16,-0.23,-0.44,109.96,110.46,-0.5,2017
16,17,Miami Heat,E,A,32,50,0.39,-1.63,108.91,110.59,-1.68,-1.85,109.01,110.91,-1.9,1995
1,2,Utah Jazz,W,M,37,13,0.74,6.52,106.47,99.06,7.41,5.54,105.27,98.96,6.31,1999
19,20,Golden State Warriors,W,P,34,48,0.415,-2.16,105.24,107.71,-2.46,-1.74,105.22,107.24,-2.01,2005
9,10,Portland Trail Blazers,W,NW,50,32,0.61,3.3,111.91,108.02,3.88,3.18,111.78,108.04,3.74,2010


In [24]:
# persist the combined team ratings table; the row labels are written as the first column
team_ratings.to_csv('team_ratings.csv', index=True)