In [10]:
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from datetime import datetime
import time
import os
import glob
import re

# Basketball Reference Shot Chart Data

In [11]:
# Download the shot-chart page for a single game.
url = "https://www.basketball-reference.com/boxscores/shot-chart/202503010CHO.html"

# Send a browser-like User-Agent header with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

# Without a timeout, requests waits indefinitely on an unresponsive server
# and the cell never finishes.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
response.encoding = 'utf-8'  # decode response.text as UTF-8

In [12]:
soup = BeautifulSoup(response.text, "html.parser")

# Every <div class="tooltip"> is treated as one shot marker.
shot_divs = soup.find_all("div", class_="tooltip")

# Collect the raw attributes of each marker:
#   style  -> inline CSS holding the marker position
#   tip    -> tooltip text describing the shot
#   result -> the marker's visible text
# get_text's first positional argument is the *separator*, so the previous
# get_text("style") only worked by accident; strip=True returns the trimmed
# marker text. Divs missing either attribute cannot be parsed downstream and
# are skipped.
shot_data = [
    {
        "style": shot.get("style"),
        "tip": shot.get("tip"),
        "result": shot.get_text(strip=True),
    }
    for shot in shot_divs
    if shot.get("style") and shot.get("tip")
]

In [13]:
# Coordinate system for the shot-chart image: pixel offsets are divided by
# these values to obtain relative coordinates. The Y divisor is negative, so
# relative Y values come out negative.
relative_x_divisor, relative_y_divisor = 500, -472

In [88]:
# Build the cleaned shot-chart frame from the scraped tooltip attributes.
shots_raw = pd.DataFrame(shot_data)

# The inline style has the form "top:<n>px;left:<n>px;". Pull each offset out
# by property name so the parse does not depend on the order of the properties
# or on stray whitespace.
style = pd.DataFrame({
    'pixel_Y': pd.to_numeric(
        shots_raw['style'].str.extract(r'top:\s*(-?\d+(?:\.\d+)?)\s*px', expand=False)
    ),
    'pixel_X': pd.to_numeric(
        shots_raw['style'].str.extract(r'left:\s*(-?\d+(?:\.\d+)?)\s*px', expand=False)
    ),
})

# Both frames share the same index, so a plain join lines the rows up without
# the temporary "key_0" column a merge on index arrays creates.
df = shots_raw.drop(columns='style').join(style)

# Marker glyph -> made ('y') / missed ('n'). Strip first so surrounding
# whitespace cannot defeat the exact-match replacement.
df['result'] = df['result'].str.strip().replace(['●', '×'], ['y', 'n'])

# Pixel offsets scaled by the image divisors.
df['relative_X'] = (df['pixel_X'] / relative_x_divisor).round(3)
df['relative_Y'] = (df['pixel_Y'] / relative_y_divisor).round(3)

# The tooltip is "<period>, <clock> remaining<br><shot details><br><score>".
# n=2 plus reindex guarantees exactly three columns whatever the row content.
tip_parts = (
    df['tip'].str.strip()
    .str.split('<br>', n=2, expand=True)
    .reindex(columns=range(3))
)

# regex=False removes the literal " Jr." on every pandas version; older
# versions default to regex matching, where "." matches any character.
df['shot_details'] = tip_parts[1].str.strip().str.replace(' Jr.', '', regex=False)

# Split "<period>, <clock> remaining" into its two halves.
period_clock = (
    tip_parts[0].str.strip()
    .str.split(',', n=1, expand=True)
    .reindex(columns=range(2))
)
period_text = period_clock[0].str.strip()
period_number = period_text.str.replace(r'[^\d]', '', regex=True)

# Regulation periods become "Q<n>". Overtime periods get their own "OT<n>"
# label so they cannot collide with Q1..Q4.
# NOTE(review): assumes overtime tooltips contain "overtime" or "OT" - confirm
# against an overtime game.
is_overtime = period_text.str.contains(r'\bot\b|overtime', case=False, na=False)
df['quarter'] = is_overtime.map({True: 'OT', False: 'Q'}) + period_number

# Keep only the clock value, dropping the trailing " remaining".
df['time'] = period_clock[1].str.strip().str.split(' ').str[0]

# Last two words of the details, joined without a space ("5" + "ft" -> "5ft").
detail_words = df['shot_details'].str.split(' ')
df['shot_distance'] = detail_words.str[-2] + detail_words.str[-1].str.strip()






In [75]:
# Display the shot-chart DataFrame built in the previous cell.
df

Unnamed: 0,tip,result,pixel_Y,pixel_X,relative_X,relative_Y,shot_details,score,quarter,time
0,"1st quarter, 11:30.0 remaining<br>Alex Sarr ma...",y,83,228,0.456,-0.176,Alex Sarr made 2-pointer from 5 ft,Washington now tied 2-2,Q1,11:30.0
1,"1st quarter, 10:40.0 remaining<br>Khris Middle...",y,159,366,0.732,-0.337,Khris Middleton made 2-pointer from 18 ft,Washington now trails 4-5,Q1,10:40.0
2,"1st quarter, 10:21.0 remaining<br>Kyshawn Geor...",n,228,51,0.102,-0.483,Kyshawn George missed 3-pointer from 27 ft,Washington trails 4-5,Q1,10:21.0
3,"1st quarter, 9:37.0 remaining<br>Khris Middlet...",n,259,102,0.204,-0.549,Khris Middleton missed 3-pointer from 27 ft,Washington leads 6-5,Q1,9:37.0
4,"1st quarter, 9:33.0 remaining<br>Alex Sarr mis...",n,59,263,0.526,-0.125,Alex Sarr missed 2-pointer from 4 ft,Washington leads 6-5,Q1,9:33.0
...,...,...,...,...,...,...,...,...,...,...
176,"4th quarter, 3:20.0 remaining<br>Mark Williams...",y,37,241,0.482,-0.078,Mark Williams made 2-pointer from 1 ft,Charlotte now trails 93-106,Q4,3:20.0
177,"4th quarter, 2:31.0 remaining<br>LaMelo Ball m...",y,279,186,0.372,-0.591,LaMelo Ball made 3-pointer from 26 ft,Charlotte now trails 96-108,Q4,2:31.0
178,"4th quarter, 1:46.0 remaining<br>Nick Smith Jr...",n,47,233,0.466,-0.100,Nick Smith Jr. missed 2-pointer from 2 ft,Charlotte trails 96-111,Q4,1:46.0
179,"4th quarter, 0:47.0 remaining<br>DaQuan Jeffri...",y,143,358,0.716,-0.303,DaQuan Jeffries made 2-pointer from 16 ft,Charlotte now trails 98-111,Q4,0:47.0


In [68]:
# Inspect the raw tooltip strings that the period, clock and shot details are parsed from.
df['tip']

0      1st quarter, 11:30.0 remaining<br>Alex Sarr ma...
1      1st quarter, 10:40.0 remaining<br>Khris Middle...
2      1st quarter, 10:21.0 remaining<br>Kyshawn Geor...
3      1st quarter, 9:37.0 remaining<br>Khris Middlet...
4      1st quarter, 9:33.0 remaining<br>Alex Sarr mis...
                             ...                        
176    4th quarter, 3:20.0 remaining<br>Mark Williams...
177    4th quarter, 2:31.0 remaining<br>LaMelo Ball m...
178    4th quarter, 1:46.0 remaining<br>Nick Smith Jr...
179    4th quarter, 0:47.0 remaining<br>DaQuan Jeffri...
180    4th quarter, 0:18.0 remaining<br>Mark Williams...
Name: tip, Length: 181, dtype: object

# Basketball Reference Play By Play Game Data

In [15]:
# Download the play-by-play page for the same game and locate its event table.
pbp_url = "https://www.basketball-reference.com/boxscores/pbp/202503010CHO.html"

# Send a browser-like User-Agent header with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(pbp_url, headers=headers, timeout=30)
response.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
response.encoding = 'utf-8'

soup = BeautifulSoup(response.text, 'html.parser')

table = soup.find("table", {"id": "pbp"})
if table is None:
    # soup.find returns None when nothing matches; report that directly rather
    # than letting the next line fail with an AttributeError on None.
    raise ValueError(f"No table with id 'pbp' found at {pbp_url}")

# One <tr> per header row or play-by-play event.
rows = table.find_all("tr")

**TODO:** Team rebounds still need to be cleaned to make sure they are actual rebounds.

In [55]:
# Turn the play-by-play table rows into one record per player-level event.

# Keywords looked for in each event description; every keyword that occurs in
# a row triggers its handler below.
key_word_list = ['makes', 'misses', 'rebound', 'turnover', 'foul', 'enters', 'timeout', 'violation']

# Score-change cell text -> shot type of a made shot.
score_event_type_dict = {"+2": "two-pointer",
                         "+3": "three-pointer",
                         "+1": "free throw"}

# Description fragment -> shot type of a missed shot. The labels use the same
# casing as score_event_type_dict so makes and misses can be compared directly.
misses_event_type_dict = {"2-pt": "two-pointer",
                          "3-pt": "three-pointer",
                          "free throw": "free throw"}


def _link_id(player_tags, position=0):
    """Return the id of the link at `position`, or "team" when there is none.

    The id is the last path segment of the link's href without its extension,
    e.g. an href ending in 'sarral01.html' yields 'sarral01'.
    """
    if len(player_tags) <= position:
        return "team"
    href = player_tags[position].get('href', '')
    return href.rstrip('/').rsplit('/', 1)[-1].split('.')[0] or "team"


def _parenthetical(text):
    """Return the text after the first '(' up to the first ';', without ')'; None if there is no '('."""
    pieces = text.split('(')
    if len(pieces) < 2:
        return None
    return pieces[1].split(';')[0].replace(')', '').strip()


def _period_label(header_text):
    """Return 'Q<n>' for a period header, or 'OT<n>' when the header mentions overtime.

    NOTE(review): assumes overtime headers contain "OT" or "overtime" - confirm
    against an overtime game.
    """
    number = re.sub(r'[^\d]+', '', header_text)
    is_overtime = re.search(r'\bot\b|overtime', header_text, flags=re.IGNORECASE)
    return ("OT" if is_overtime else "Q") + number


events = []

quarter = None
# Running score, carried forward if a row's score cell cannot be parsed.
away_score = home_score = "0"

for row in rows:

    header_cells = row.find_all('th')
    cells = row.find_all('td')

    # A row with a single header cell announces a new period.
    if len(header_cells) == 1:
        quarter = _period_label(header_cells[0].text)

    # Event rows have six cells: clock, visitor event, visitor score change,
    # score, home score change, home event.
    if len(cells) != 6:
        continue

    # Named game_clock (not `time`) so the imported `time` module is not shadowed.
    game_clock = cells[0].text.strip()

    away_event = cells[1]
    home_event = cells[5]

    if away_event.get_text(strip=True):
        pbp_event, team = away_event, "visitor"
    elif home_event.get_text(strip=True):
        pbp_event, team = home_event, "home"
    else:
        # Neither side has text: skip instead of re-processing the previous
        # row's event.
        continue

    # Read the score from the row itself so events before the first made shot
    # have a defined score too.
    score_parts = cells[3].get_text(strip=True).split("-")
    if len(score_parts) == 2 and all(score_parts):
        away_score, home_score = score_parts

    player_tags = pbp_event.find_all('a')
    pbp_event_text = pbp_event.get_text(strip=True).lower()
    first_word = pbp_event_text.split(sep=' ')[0]

    # Leading fields shared by every record produced from this row.
    context = [quarter, game_clock, team, away_score, home_score]

    for word in key_word_list:

        if word not in pbp_event_text:
            continue

        if word == 'makes':
            # Exactly one of the two score-change cells carries "+1"/"+2"/"+3".
            points_text = cells[2].get_text(strip=True) or cells[4].get_text(strip=True)
            events.append(context + [_link_id(player_tags), "shot_make",
                                     score_event_type_dict[points_text], int(points_text[1])])

            # A second link on a made shot is the assisting player.
            if len(player_tags) > 1:
                events.append(context + [_link_id(player_tags, 1), "assist", None, 1])

        elif word == 'misses':
            # Reset per row so an unmatched description cannot inherit the
            # previous event's type.
            event_type = None
            for keyword, action in misses_event_type_dict.items():
                if keyword in pbp_event_text:
                    event_type = action

            events.append(context + [_link_id(player_tags), "shot_miss", event_type, 1])

            # A second link on a missed shot is the blocking player.
            if len(player_tags) > 1:
                events.append(context + [_link_id(player_tags, 1), "block", event_type, 1])

        elif word == 'rebound':
            # Rebounds without a player link are recorded under "team".
            # event_count is always 1; it used to be inherited from whatever
            # event was processed before.
            events.append(context + [_link_id(player_tags), "rebound", first_word, 1])

        elif word == 'turnover':
            # Turnovers without a player link are recorded under "team".
            events.append(context + [_link_id(player_tags), "turnover",
                                     _parenthetical(pbp_event_text), 1])

            # A second link on a turnover is the stealing player.
            if len(player_tags) > 1:
                events.append(context + [_link_id(player_tags, 1), "steal", None, 1])

        elif word == 'foul':
            # Rows that start with "turnover" are already recorded by the
            # turnover handler.
            if first_word == 'turnover':
                continue

            events.append(context + [_link_id(player_tags), "foul", first_word, 1])

            # A second link on a foul is the player who drew it.
            if len(player_tags) > 1:
                events.append(context + [_link_id(player_tags, 1), "foul_drawn", first_word, 1])

        elif word == "enters":
            events.append(context + [_link_id(player_tags), "substitution", "sub_in", 1])

            # The second link is the player leaving the game.
            if len(player_tags) > 1:
                events.append(context + [_link_id(player_tags, 1), "substitution", "sub_out", 1])

        elif word == "timeout":
            # Use the word right before "timeout" as the type; a fixed word
            # index breaks for team names made of more than one word.
            words = pbp_event_text.split()
            if 'timeout' in words and words.index('timeout') > 0:
                event_type = words[words.index('timeout') - 1]
            else:
                event_type = None

            events.append(context + ["team", "timeout", event_type, 1])

        elif word == "violation":
            # Same guard as for fouls: a turnover row that mentions a
            # violation is not a separate violation event.
            if first_word == 'turnover':
                continue

            # The violation kind is the text inside the parentheses.
            events.append(context + ["team", "violation",
                                     _parenthetical(pbp_event_text), 1])

columns = ["quarter", "time", "team",  "away_score", "home_score", "player_id", "event", "event_type", "event_count"]
test = pd.DataFrame(events, columns=columns)


In [52]:
# Show only the made-shot events.
test.query('event == "shot_make"')

Unnamed: 0,quarter,time,team,away_score,home_score,player_id,event,event_type,event_count
0,Q1,11:47.0,home,0,2,bridgmi02,shot_make,two-pointer,2
2,Q1,11:30.0,visitor,2,2,sarral01,shot_make,two-pointer,2
4,Q1,11:18.0,home,2,5,smithni01,shot_make,three-pointer,3
11,Q1,10:40.0,visitor,4,5,middlkh01,shot_make,two-pointer,2
20,Q1,10:01.0,visitor,5,5,middlkh01,shot_make,free throw,1
...,...,...,...,...,...,...,...,...,...
564,Q4,2:01.0,visitor,111,96,coulibi01,shot_make,three-pointer,3
582,Q4,0:47.0,home,111,98,jeffrda01,shot_make,two-pointer,2
585,Q4,0:34.0,visitor,112,98,coulibi01,shot_make,free throw,1
588,Q4,0:34.0,visitor,113,98,coulibi01,shot_make,free throw,1


In [65]:
# Sanity check: points from the visiting team's made shots.
away_makes = (test['team'] == 'visitor') & (test['event'] == "shot_make")
away_points = test.loc[away_makes, "event_count"].sum()
print(f"""Total Away Team Points: {away_points}""")

Total Away Team Points: 113


In [66]:
# Sanity check: points from the home team's made shots.
home_makes = (test['team'] == 'home') & (test['event'] == "shot_make")
home_points = test.loc[home_makes, "event_count"].sum()
print(f"""Total Home Team Points: {home_points}""")

Total Home Team Points: 100
