# Get data

Extract each team's per game stats, miscellaneous stats, and shooting stats.

Also get the number of shots attempted from each location on the court.

Extract individual player stats too. 

Extract starting lineup.

In [7]:
import requests
import pandas as pd

from bs4 import BeautifulSoup

# Test: scrape data for 2018 - 2019 season

In [8]:
# League summary page for the season ending in 2019. The '#...' fragment is
# kept on its own line so the page address and the in-page anchor are easy to
# tell apart; the resulting string is a single URL.
sample_url = (
    "https://www.basketball-reference.com/leagues/NBA_2019.html"
    "#all_team-stats-base"
)

## Try using Pandas

In [9]:
# First attempt: let pandas parse whatever <table> elements it can find at the URL
tables = pd.read_html(sample_url)

# Bind the first two parsed tables to the conference names
# (presumably the Eastern and Western standings, in that order — confirm).
eastern_conference, western_conference = tables[0], tables[1]

This doesn't extract all the tables I want: `pd.read_html` only finds some of the tables on the page.

## Read with Requests and BeautifulSoup?

In [10]:
# Abandoned attempt, kept commented out for reference: fetch the page with
# requests, parse it with BeautifulSoup, and look up the per-game stats <div>.
# response = requests.get(sample_url)
# page = response.text
# soup = BeautifulSoup(page, 'lxml')

# # try to get team stats
# tables = soup.findAll('div', {'id':'div_team-stats-per_game'})

This also doesn't work: the tables are missing from the HTML that `requests` returns, probably because they are rendered by JavaScript. On to Selenium.

## Use Selenium to render tables

https://www.freecodecamp.org/news/better-web-scraping-in-python-with-selenium-beautiful-soup-and-pandas-d6390592e251/

In [11]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

# Start a Chrome session under Selenium's control and load the sample page in
# it, so the page is rendered by a real browser before its HTML is parsed.
# NOTE(review): no driver.quit() appears in the cells visible here — confirm
# the browser session is closed somewhere once scraping is done.
driver = webdriver.Chrome()
driver.get(sample_url)

In [12]:
# Look up the per-game stats container on the rendered page.
# The find_element_by_* helpers were removed in Selenium 4.3, so use the
# generic find_element(by, value) form, which also exists in Selenium 3.
# "id" is the value of By.ID, so no additional import is required.
# NOTE(review): the result is not used in the cells visible here; the call
# appears to serve only as a check that the element is present.
inputElement = driver.find_element("id", 'all_team-stats-per_game')

## Then pass the HTML off to BeautifulSoup

In [13]:
# Parse the browser-rendered page source with the lxml parser
soup = BeautifulSoup(driver.page_source, features='lxml')

#### Scrape game scores

https://www.basketball-reference.com/leagues/NBA_2019_games.html

#### Get per game stats

In [14]:
# Locate the <div> whose id is 'all_team-stats-per_game'; the header and body
# lookups in the following cells search inside this element.
per_game_table = soup.find(name='div', attrs={'id': 'all_team-stats-per_game'})

In [15]:
# get columns: one label per <th> cell found under the table's <thead>,
# in document order
per_game_cols = []

header_items = per_game_table.find('thead').find_all('th')
per_game_cols.extend(cell.text for cell in header_items)

In [16]:
# get data
rows = per_game_table.find('tbody').find_all('tr')

# Each record is the text of the row's leading <th> followed by the text of
# every <td>. Rows whose <th> reads 'Rk' are skipped (these look like header
# rows repeated inside the body).
per_game_data = [
    [row.find('th').text] + [cell.text for cell in row.find_all('td')]
    for row in rows
    if row.find('th').text != 'Rk'
]

per_game_df = pd.DataFrame(data=per_game_data, columns=per_game_cols)
per_game_df.head()

Unnamed: 0,Rk,Team,G,MP,FG,FGA,FG%,3P,3PA,3P%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Milwaukee Bucks*,82,241.2,43.4,91.1,0.476,13.5,38.2,0.353,...,0.773,9.3,40.4,49.7,26.0,7.5,5.9,13.9,19.6,118.1
1,2,Golden State Warriors*,82,241.5,44.0,89.8,0.491,13.3,34.4,0.385,...,0.801,9.7,36.5,46.2,29.4,7.6,6.4,14.3,21.4,117.7
2,3,New Orleans Pelicans,82,240.9,43.7,92.2,0.473,10.3,29.9,0.344,...,0.761,11.1,36.2,47.3,27.0,7.4,5.4,14.8,21.1,115.4
3,4,Philadelphia 76ers*,82,241.5,41.5,88.2,0.471,10.8,30.2,0.359,...,0.771,10.9,36.9,47.8,26.9,7.4,5.3,14.9,21.3,115.2
4,5,Los Angeles Clippers*,82,241.8,41.3,87.5,0.471,10.0,25.8,0.388,...,0.792,9.7,35.8,45.5,24.0,6.8,4.7,14.5,23.3,115.1


#### Get misc stats

In [17]:
# Locate the <table> whose id is 'misc_stats'
misc_table = soup.find(name='table', attrs={'id': 'misc_stats'})

In [18]:
# get cols
# The <thead> holds more than one <tr>; labels are read from the second row.
# NOTE(review): row 0 is presumably a grouping over-header — confirm.
header_rows = misc_table.find('thead').find_all('tr')
header_items = header_rows[1].find_all('th')

# This header row repeats some labels (e.g. 'TOV%' and 'FT/FGA' occur twice).
# Duplicate DataFrame column labels make a lookup such as misc_df['TOV%']
# return several columns at once, so every repeat gets a pandas-style
# '.1', '.2', ... suffix while first occurrences keep their original label.
misc_cols = []
label_counts = {}
for header_cell in header_items:
    label = header_cell.text
    repeat = label_counts.get(label, 0)
    label_counts[label] = repeat + 1
    misc_cols.append(label if repeat == 0 else '{}.{}'.format(label, repeat))

In [19]:
# get data
rows = misc_table.find('tbody').find_all('tr')

# One record per <tr>: the leading <th> text, then the text of each <td>
misc_data = [
    [row.find('th').text] + [cell.text for cell in row.find_all('td')]
    for row in rows
]

misc_df = pd.DataFrame(data=misc_data, columns=misc_cols)
misc_df.head()

Unnamed: 0,Rk,Team,Age,W,L,PW,PL,MOV,SOS,SRS,...,TOV%,ORB%,FT/FGA,eFG%,TOV%.1,DRB%,FT/FGA.1,Arena,Attend.,Attend./G
0,1,Milwaukee Bucks*,26.9,60,22,61,21,8.87,-0.82,8.04,...,12.0,20.8,0.197,0.503,11.5,80.3,0.162,Fiserv Forum,721692,17602
1,2,Golden State Warriors*,28.4,57,25,56,26,6.46,-0.04,6.42,...,12.6,22.5,0.182,0.508,11.7,77.1,0.205,Oracle Arena,803436,19596
2,3,Toronto Raptors*,27.3,58,24,56,26,6.09,-0.6,5.49,...,12.4,21.9,0.198,0.509,13.1,77.1,0.19,Scotiabank Arena,812822,19825
3,4,Utah Jazz*,27.3,50,32,54,28,5.26,0.03,5.28,...,13.4,22.9,0.217,0.507,12.4,80.3,0.189,Vivint Smart Home Arena,750546,18306
4,5,Houston Rockets*,29.2,53,29,53,29,4.77,0.19,4.96,...,12.0,22.8,0.221,0.525,13.4,74.4,0.21,Toyota Center,740392,18058


#### Get shooting stats

In [20]:
# Locate the <table> whose id is 'team_shooting'
shooting_table = soup.find(name='table', attrs={'id': 'team_shooting'})

In [21]:
# The <thead> holds several <tr> rows; labels are read from the third row.
# NOTE(review): rows 0 and 1 are presumably grouping over-headers — confirm.
header_rows = shooting_table.find('thead').find_all('tr')
header_items = header_rows[2].find_all('th')

# This header row repeats some labels (e.g. '%FGA', 'Md.' and "%Ast'd").
# Duplicate DataFrame column labels make a lookup such as shooting_df['Md.']
# return several columns at once, so every repeat gets a pandas-style
# '.1', '.2', ... suffix while first occurrences keep their original label.
shooting_cols = []
label_counts = {}
for header_cell in header_items:
    label = header_cell.text
    repeat = label_counts.get(label, 0)
    label_counts[label] = repeat + 1
    shooting_cols.append(label if repeat == 0 else '{}.{}'.format(label, repeat))

In [22]:
rows = shooting_table.find('tbody').find_all('tr')

# One record per <tr>: the leading <th> text, then the text of each <td>
shooting_data = [
    [row.find('th').text] + [cell.text for cell in row.find_all('td')]
    for row in rows
]

In [23]:
# Assemble the scraped rows into a frame labelled with the scraped header,
# then preview the first five rows.
shooting_df = pd.DataFrame(data=shooting_data, columns=shooting_cols)
shooting_df.head(5)

Unnamed: 0,Rk,Team,G,MP,FG%,Dist.,2P,0-3,3-10,10-16,...,%Ast'd,%FGA,Md.,%FGA.1,Md..1,%Ast'd.1,%3PA,3P%,Att.,Md..2
0,1,Atlanta Hawks,82,19855,0.451,13.6,0.597,0.332,0.134,0.083,...,0.533,0.081,528,0.306,1175,0.824,0.245,0.393,21,0
1,2,Boston Celtics*,82,19780,0.465,14.5,0.619,0.248,0.147,0.114,...,0.527,0.051,337,0.247,1028,0.854,0.196,0.355,7,0
2,3,Brooklyn Nets*,82,19980,0.449,13.7,0.597,0.299,0.166,0.084,...,0.508,0.051,326,0.309,1207,0.774,0.185,0.377,14,1
3,4,Chicago Bulls,82,19905,0.453,12.4,0.705,0.31,0.191,0.104,...,0.465,0.052,340,0.295,1112,0.838,0.218,0.4,14,0
4,5,Charlotte Hornets,82,19830,0.448,13.9,0.622,0.289,0.165,0.083,...,0.494,0.041,264,0.305,1181,0.777,0.194,0.415,37,2


#### Get salaries

In [25]:
# Player salaries page for 2018-2019, written as site address + path
salary = (
    'https://hoopshype.com'
    '/salaries/players/2018-2019/'
)

#### Salary caps

In [24]:
# Salary cap history page, written as site address + path
salary_url = (
    'https://www.basketball-reference.com'
    '/contracts/salary-cap-history.html'
)

# Turn code into functions

# Run this for all seasons

In [2]:
# Extract lots of information from each season
# The URLs are generated from the season-end years instead of being listed by
# hand: the hand-written list was missing commas after the 2015 and 2018
# entries, so Python's implicit string-literal concatenation silently fused
# each of them with the following URL (4 entries, two malformed, instead of 6).
SEASON_URL_TEMPLATE = "https://www.basketball-reference.com/leagues/NBA_{}.html#all_team-stats-base"
SEASON_END_YEARS = range(2014, 2020)  # 2014 through 2019 inclusive

season_urls = [SEASON_URL_TEMPLATE.format(year) for year in SEASON_END_YEARS]