# Web Data Scraping

In [51]:
import pandas as pd
import numpy as np
import requests
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup

# Install a global filter that ignores every warning raised after this point.
# NOTE(review): a blanket 'ignore' also hides deprecation notices — consider
# narrowing it to the specific warning category that needs silencing.
warnings.filterwarnings('ignore')

# Web Scrape NBA Standings from CBS Sports

In [52]:
# Standings page to scrape
url = 'https://www.cbssports.com/nba/standings/'

# Request headers: a desktop Chrome User-Agent string is sent with the request
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36'}

# Fetch the page.
# - timeout: without it requests waits indefinitely on a stalled connection,
#   which would hang the whole notebook run.
# - raise_for_status(): turns an HTTP 4xx/5xx reply into an exception here,
#   instead of letting an error page be parsed as if it were the standings.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Create a bs4 object to parse the HTML content (raw bytes, built-in html.parser)
soup = BeautifulSoup(response.content, 'html.parser')


# Eastern Conference

In [53]:
# Collect every <tbody> on the page; the first one (index 0) holds the
# Eastern Conference teams.
# NOTE(review): relies on the page listing the Eastern table first — verify
# if the site layout changes.
tbody_sections = soup.select('tbody')
east_C = tbody_sections[0]

In [54]:
# Team names (Eastern Conference): the name sits in the second <td> of each
# row; get_text().strip() extracts the cell text and trims surrounding spaces.
team = [
    tr.find_all('td')[1].get_text().strip()
    for tr in east_C.select('tr')
]

In [55]:
# Data for Eastern Conference teams: one list per table row, holding the
# stripped text of the <td> cells at positions 2 through 17 (slice 2:18).
data = [
    [cell.get_text().strip() for cell in tr.find_all('td')[2:18]]
    for tr in east_C.select('tr')
]

In [56]:
# Single-column DataFrame ('Team') with the Eastern Conference team names
team_df = pd.DataFrame({'Team': team})

team_df

Unnamed: 0,Team
0,Cleveland - y
1,Boston
2,New York
3,Milwaukee
4,Indiana
5,Detroit
6,Atlanta
7,Orlando
8,Miami
9,Chicago


In [57]:
# Construct the stats DataFrame for the Eastern Conference teams.
# One label per scraped stat cell (16 labels for the 16 cells kept per row).
# NOTE(review): 'W' and 'DIV' each appear twice, so the frame carries duplicate
# column labels — confirm what the trailing W/DIV/POST columns represent and
# consider giving them distinct names (in both conference cells together).
stat_columns = [
    'W', 'L', 'PCT', 'GB',
    'PPG', 'OPPG', 'DIFF',
    'HOME', 'ROAD', 'DIV', 'CONF',
    'STRK', 'L10',
    'W', 'DIV', 'POST',
]
team_data = pd.DataFrame(data, columns=stat_columns)
team_data

Unnamed: 0,W,L,PCT,GB,PPG,OPPG,DIFF,HOME,ROAD,DIV,CONF,STRK,L10,W.1,DIV.1,POST
0,55,10,0.846,—,122.7,111.4,11.3,30-4,25-6,11-1,38-7,W15,10-0,66.5,100.0%,100.0%
1,47,19,0.712,8.5,116.8,108.3,8.5,23-12,24-7,11-2,32-11,L1,7-3,59.4,100.0%,100.0%
2,42,23,0.646,13.0,117.1,112.7,4.4,21-11,21-12,10-3,28-13,W2,5-5,51.9,0.0%,100.0%
3,37,28,0.569,18.0,114.8,112.4,2.4,23-11,14-17,6-7,26-19,W1,6-4,45.4,0.0%,91.7%
4,36,28,0.563,18.5,116.4,115.4,1.0,20-10,16-18,8-5,21-20,W1,5-5,46.5,0.0%,97.7%
5,37,30,0.552,19.0,115.0,113.2,1.8,18-15,19-15,4-9,25-21,L1,6-4,44.4,0.0%,89.1%
6,32,34,0.485,23.5,117.4,119.6,-2.2,17-16,15-18,8-4,24-18,W4,6-4,39.8,75.5%,91.1%
7,31,36,0.463,25.0,104.4,105.9,-1.5,18-16,13-20,8-3,24-20,W1,3-7,37.1,16.0%,62.3%
8,29,36,0.446,26.0,109.9,110.6,-0.7,15-16,14-20,7-5,18-22,L5,3-7,36.6,8.5%,40.4%
9,28,38,0.424,27.5,116.6,120.2,-3.6,13-22,15-16,4-11,23-23,W4,6-4,34.7,0.0%,25.6%


In [58]:
# Combine both DataFrames (Eastern Conference): team names and stats are
# placed side by side, aligned on the row index.
east_parts = [team_df, team_data]
df_merge = pd.concat(east_parts, axis='columns')
df_merge

Unnamed: 0,Team,W,L,PCT,GB,PPG,OPPG,DIFF,HOME,ROAD,DIV,CONF,STRK,L10,W.1,DIV.1,POST
0,Cleveland - y,55,10,0.846,—,122.7,111.4,11.3,30-4,25-6,11-1,38-7,W15,10-0,66.5,100.0%,100.0%
1,Boston,47,19,0.712,8.5,116.8,108.3,8.5,23-12,24-7,11-2,32-11,L1,7-3,59.4,100.0%,100.0%
2,New York,42,23,0.646,13.0,117.1,112.7,4.4,21-11,21-12,10-3,28-13,W2,5-5,51.9,0.0%,100.0%
3,Milwaukee,37,28,0.569,18.0,114.8,112.4,2.4,23-11,14-17,6-7,26-19,W1,6-4,45.4,0.0%,91.7%
4,Indiana,36,28,0.563,18.5,116.4,115.4,1.0,20-10,16-18,8-5,21-20,W1,5-5,46.5,0.0%,97.7%
5,Detroit,37,30,0.552,19.0,115.0,113.2,1.8,18-15,19-15,4-9,25-21,L1,6-4,44.4,0.0%,89.1%
6,Atlanta,32,34,0.485,23.5,117.4,119.6,-2.2,17-16,15-18,8-4,24-18,W4,6-4,39.8,75.5%,91.1%
7,Orlando,31,36,0.463,25.0,104.4,105.9,-1.5,18-16,13-20,8-3,24-20,W1,3-7,37.1,16.0%,62.3%
8,Miami,29,36,0.446,26.0,109.9,110.6,-0.7,15-16,14-20,7-5,18-22,L5,3-7,36.6,8.5%,40.4%
9,Chicago,28,38,0.424,27.5,116.6,120.2,-3.6,13-22,15-16,4-11,23-23,W4,6-4,34.7,0.0%,25.6%


# Western Conference

In [59]:
# Collect every <tbody> on the page; the second one (index 1) holds the
# Western Conference teams.
# NOTE(review): relies on the page listing the Western table second — verify
# if the site layout changes.
tbody_sections = soup.select('tbody')
west_c = tbody_sections[1]

In [60]:
# Team names (Western Conference): the name sits in the second <td> of each
# row; get_text().strip() extracts the cell text and trims surrounding spaces.
team = [
    tr.find_all('td')[1].get_text().strip()
    for tr in west_c.select('tr')
]

In [61]:
# Data for Western Conference teams: one list per table row, holding the
# stripped text of the <td> cells at positions 2 through 17 (slice 2:18).
data = [
    [cell.get_text().strip() for cell in tr.find_all('td')[2:18]]
    for tr in west_c.select('tr')
]

In [62]:
# Single-column DataFrame ('Team') with the Western Conference team names
team_df = pd.DataFrame({'Team': team})

team_df

Unnamed: 0,Team
0,Oklahoma City - x
1,Memphis
2,Denver
3,L.A. Lakers
4,Houston
5,Golden St.
6,Minnesota
7,L.A. Clippers
8,Sacramento
9,Dallas


In [63]:
# Stats DataFrame for the Western Conference teams.
# One label per scraped stat cell (16 labels for the 16 cells kept per row).
# NOTE(review): 'W' and 'DIV' each appear twice, so the frame carries duplicate
# column labels — confirm what the trailing W/DIV/POST columns represent and
# consider giving them distinct names (in both conference cells together).
stat_columns = [
    'W', 'L', 'PCT', 'GB',
    'PPG', 'OPPG', 'DIFF',
    'HOME', 'ROAD', 'DIV', 'CONF',
    'STRK', 'L10',
    'W', 'DIV', 'POST',
]
team_data = pd.DataFrame(data, columns=stat_columns)

team_data

Unnamed: 0,W,L,PCT,GB,PPG,OPPG,DIFF,HOME,ROAD,DIV,CONF,STRK,L10,W.1,DIV.1,POST
0,54,12,0.818,—,119.7,107.2,12.5,29-5,25-7,11-4,32-11,W1,8-2,67.0,100.0%,100.0%
1,42,24,0.636,12.0,122.6,116.7,5.9,24-10,18-14,10-5,25-16,W4,5-5,50.6,57.2%,98.7%
2,42,24,0.636,12.0,121.1,116.6,4.5,22-10,20-14,7-6,25-15,L1,5-5,52.3,0.0%,100.0%
3,40,24,0.625,13.0,112.8,111.4,1.4,25-7,15-17,11-3,27-12,L3,7-3,48.4,61.4%,94.4%
4,41,25,0.621,13.0,112.9,108.8,4.1,23-10,18-15,12-3,25-16,W4,6-4,49.7,42.8%,95.5%
5,38,28,0.576,16.0,113.8,111.2,2.6,20-13,18-15,3-10,22-19,W6,9-1,48.0,35.1%,91.8%
6,38,29,0.567,16.5,112.9,109.0,3.9,18-14,20-15,8-5,27-18,W6,7-3,48.2,0.0%,93.3%
7,36,30,0.545,18.0,111.1,108.9,2.2,22-10,14-20,7-7,21-22,W1,5-5,44.8,3.1%,67.4%
8,33,32,0.508,20.5,116.4,115.4,1.0,16-15,17-17,4-9,23-22,L3,5-5,42.2,0.0%,38.7%
9,33,34,0.493,21.5,114.8,114.6,0.2,19-15,14-19,8-6,23-24,L1,2-8,39.0,0.0%,13.1%


In [64]:
# Combine both DataFrames (Western Conference): team names and stats are
# placed side by side, aligned on the row index.
west_parts = [team_df, team_data]
df_merge_west = pd.concat(west_parts, axis='columns')

df_merge_west

Unnamed: 0,Team,W,L,PCT,GB,PPG,OPPG,DIFF,HOME,ROAD,DIV,CONF,STRK,L10,W.1,DIV.1,POST
0,Oklahoma City - x,54,12,0.818,—,119.7,107.2,12.5,29-5,25-7,11-4,32-11,W1,8-2,67.0,100.0%,100.0%
1,Memphis,42,24,0.636,12.0,122.6,116.7,5.9,24-10,18-14,10-5,25-16,W4,5-5,50.6,57.2%,98.7%
2,Denver,42,24,0.636,12.0,121.1,116.6,4.5,22-10,20-14,7-6,25-15,L1,5-5,52.3,0.0%,100.0%
3,L.A. Lakers,40,24,0.625,13.0,112.8,111.4,1.4,25-7,15-17,11-3,27-12,L3,7-3,48.4,61.4%,94.4%
4,Houston,41,25,0.621,13.0,112.9,108.8,4.1,23-10,18-15,12-3,25-16,W4,6-4,49.7,42.8%,95.5%
5,Golden St.,38,28,0.576,16.0,113.8,111.2,2.6,20-13,18-15,3-10,22-19,W6,9-1,48.0,35.1%,91.8%
6,Minnesota,38,29,0.567,16.5,112.9,109.0,3.9,18-14,20-15,8-5,27-18,W6,7-3,48.2,0.0%,93.3%
7,L.A. Clippers,36,30,0.545,18.0,111.1,108.9,2.2,22-10,14-20,7-7,21-22,W1,5-5,44.8,3.1%,67.4%
8,Sacramento,33,32,0.508,20.5,116.4,115.4,1.0,16-15,17-17,4-9,23-22,L3,5-5,42.2,0.0%,38.7%
9,Dallas,33,34,0.493,21.5,114.8,114.6,0.2,19-15,14-19,8-6,23-24,L1,2-8,39.0,0.0%,13.1%


# Combine Eastern Conference and Western Conference Data


In [65]:
# Stack the Eastern and Western Conference frames vertically with pd.concat.
# Each frame keeps its own row labels, so index values repeat in the result.
conference_frames = [df_merge, df_merge_west]
df_combined = pd.concat(conference_frames, axis='index')

df_combined

Unnamed: 0,Team,W,L,PCT,GB,PPG,OPPG,DIFF,HOME,ROAD,DIV,CONF,STRK,L10,W.1,DIV.1,POST
0,Cleveland - y,55,10,0.846,—,122.7,111.4,11.3,30-4,25-6,11-1,38-7,W15,10-0,66.5,100.0%,100.0%
1,Boston,47,19,0.712,8.5,116.8,108.3,8.5,23-12,24-7,11-2,32-11,L1,7-3,59.4,100.0%,100.0%
2,New York,42,23,0.646,13.0,117.1,112.7,4.4,21-11,21-12,10-3,28-13,W2,5-5,51.9,0.0%,100.0%
3,Milwaukee,37,28,0.569,18.0,114.8,112.4,2.4,23-11,14-17,6-7,26-19,W1,6-4,45.4,0.0%,91.7%
4,Indiana,36,28,0.563,18.5,116.4,115.4,1.0,20-10,16-18,8-5,21-20,W1,5-5,46.5,0.0%,97.7%
5,Detroit,37,30,0.552,19.0,115.0,113.2,1.8,18-15,19-15,4-9,25-21,L1,6-4,44.4,0.0%,89.1%
6,Atlanta,32,34,0.485,23.5,117.4,119.6,-2.2,17-16,15-18,8-4,24-18,W4,6-4,39.8,75.5%,91.1%
7,Orlando,31,36,0.463,25.0,104.4,105.9,-1.5,18-16,13-20,8-3,24-20,W1,3-7,37.1,16.0%,62.3%
8,Miami,29,36,0.446,26.0,109.9,110.6,-0.7,15-16,14-20,7-5,18-22,L5,3-7,36.6,8.5%,40.4%
9,Chicago,28,38,0.424,27.5,116.6,120.2,-3.6,13-22,15-16,4-11,23-23,W4,6-4,34.7,0.0%,25.6%


In [66]:
# Sort by PCT in descending order.
# The PCT values were scraped with get_text(), so they are text; a plain sort
# would compare them character by character. The key converts them to numbers
# for the comparison only — the column's stored values are left unchanged.
# Values that cannot be parsed become NaN and are placed last.
df_sort = df_combined.sort_values(
    by='PCT',
    ascending=False,
    key=lambda col: pd.to_numeric(col, errors='coerce'),
)

df_sort

Unnamed: 0,Team,W,L,PCT,GB,PPG,OPPG,DIFF,HOME,ROAD,DIV,CONF,STRK,L10,W.1,DIV.1,POST
0,Cleveland - y,55,10,0.846,—,122.7,111.4,11.3,30-4,25-6,11-1,38-7,W15,10-0,66.5,100.0%,100.0%
0,Oklahoma City - x,54,12,0.818,—,119.7,107.2,12.5,29-5,25-7,11-4,32-11,W1,8-2,67.0,100.0%,100.0%
1,Boston,47,19,0.712,8.5,116.8,108.3,8.5,23-12,24-7,11-2,32-11,L1,7-3,59.4,100.0%,100.0%
2,New York,42,23,0.646,13.0,117.1,112.7,4.4,21-11,21-12,10-3,28-13,W2,5-5,51.9,0.0%,100.0%
2,Denver,42,24,0.636,12.0,121.1,116.6,4.5,22-10,20-14,7-6,25-15,L1,5-5,52.3,0.0%,100.0%
1,Memphis,42,24,0.636,12.0,122.6,116.7,5.9,24-10,18-14,10-5,25-16,W4,5-5,50.6,57.2%,98.7%
3,L.A. Lakers,40,24,0.625,13.0,112.8,111.4,1.4,25-7,15-17,11-3,27-12,L3,7-3,48.4,61.4%,94.4%
4,Houston,41,25,0.621,13.0,112.9,108.8,4.1,23-10,18-15,12-3,25-16,W4,6-4,49.7,42.8%,95.5%
5,Golden St.,38,28,0.576,16.0,113.8,111.2,2.6,20-13,18-15,3-10,22-19,W6,9-1,48.0,35.1%,91.8%
3,Milwaukee,37,28,0.569,18.0,114.8,112.4,2.4,23-11,14-17,6-7,26-19,W1,6-4,45.4,0.0%,91.7%


In [67]:
# Reset the index so rows are labelled 0..n-1 in sorted order; drop=True
# discards the old (repeating) labels instead of keeping them as a column.
# The result is rebound rather than using inplace=True, so the frame object
# displayed by the previous cell is not mutated behind the reader's back.
df_sort = df_sort.reset_index(drop=True)

df_sort

Unnamed: 0,Team,W,L,PCT,GB,PPG,OPPG,DIFF,HOME,ROAD,DIV,CONF,STRK,L10,W.1,DIV.1,POST
0,Cleveland - y,55,10,0.846,—,122.7,111.4,11.3,30-4,25-6,11-1,38-7,W15,10-0,66.5,100.0%,100.0%
1,Oklahoma City - x,54,12,0.818,—,119.7,107.2,12.5,29-5,25-7,11-4,32-11,W1,8-2,67.0,100.0%,100.0%
2,Boston,47,19,0.712,8.5,116.8,108.3,8.5,23-12,24-7,11-2,32-11,L1,7-3,59.4,100.0%,100.0%
3,New York,42,23,0.646,13.0,117.1,112.7,4.4,21-11,21-12,10-3,28-13,W2,5-5,51.9,0.0%,100.0%
4,Denver,42,24,0.636,12.0,121.1,116.6,4.5,22-10,20-14,7-6,25-15,L1,5-5,52.3,0.0%,100.0%
5,Memphis,42,24,0.636,12.0,122.6,116.7,5.9,24-10,18-14,10-5,25-16,W4,5-5,50.6,57.2%,98.7%
6,L.A. Lakers,40,24,0.625,13.0,112.8,111.4,1.4,25-7,15-17,11-3,27-12,L3,7-3,48.4,61.4%,94.4%
7,Houston,41,25,0.621,13.0,112.9,108.8,4.1,23-10,18-15,12-3,25-16,W4,6-4,49.7,42.8%,95.5%
8,Golden St.,38,28,0.576,16.0,113.8,111.2,2.6,20-13,18-15,3-10,22-19,W6,9-1,48.0,35.1%,91.8%
9,Milwaukee,37,28,0.569,18.0,114.8,112.4,2.4,23-11,14-17,6-7,26-19,W1,6-4,45.4,0.0%,91.7%
