In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# URL of the page to scrape
url = "https://fbref.com/en/matches/46d9048f/"

# Get the game ID from the URL
game_id = url.split("/")[-2]

# Send a GET request to the URL
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Find the Shots table on the page
shots_table = soup.find("div", {"id": "switcher_shots"})

# Create empty lists to store data and create the columns
data = []
headers = ['Game_ID', 'Minute', 'Player', 'Team', 'xG', 'PSxG', 'Result', 'Distance', 'Body Part', 'Notes', 'SCA 1 Player', 'Event 1', 'SCA 2 Player', 'Event 2']

# Extract table rows
rows = shots_table.find("tbody").find_all("tr")
for row in rows:
    row_data = [game_id] + [cell.get_text().strip() for cell in row.find_all(["th", "td"])]
    data.append(row_data)

# Create a DataFrame from the scraped data
shots_df = pd.DataFrame(data, columns=headers)

# Find the home and away teams and formations
team_headers = soup.find_all("th", {"colspan": "2"})
teams = [header.text.strip().split(" (")[0] for header in team_headers if "(" in header.text.strip()]
formations = [header.text.strip().split(" (")[1][:-1] for header in team_headers if "(" in header.text.strip()]

# Extracting the home and away teams and formations
home_team = teams[0]
away_team = teams[1]
home_formation = formations[0]
away_formation = formations[1]

# Find the competition -- only using .find because there are multiple in the HTML and we only need one, then take only the text
competition = soup.find("a", {"href": "/en/comps/10/2022-2023/2022-2023-Championship-Stats"}).text

# Extract the match date and add to df, this one is a little tricky as it's hidden within the span tag
span_venuetime = soup.find("span", class_="venuetime")
match_date = span_venuetime.get("data-venue-date") if span_venuetime else None

#Get location, 4th element in that list
location = soup.find("div", {"class": "scorebox_meta"}).find_all("small")[3].text

# Create a DataFrame from the scraped data
shots_df = pd.DataFrame(data, columns=headers)

# Add the team, away team, formation, and opponent formation information to the DataFrame
shots_df['Home_Team'] = home_team
shots_df['Away_Team'] = away_team
shots_df['Home Formation'] = home_formation
shots_df['Away Formation'] = away_formation
shots_df['is_home_shot'] = np.where(shots_df['Team'] == home_team, 1, 0)
shots_df['is_away_shot'] = np.where(shots_df['Team'] ==  away_team, 1, 0)
shots_df['location'] = location
shots_df['match_date'] = match_date

In [4]:
shots_df

Unnamed: 0,Game_ID,Minute,Player,Team,xG,PSxG,Result,Distance,Body Part,Notes,...,SCA 2 Player,Event 2,Home_Team,Away_Team,Home Formation,Away Formation,is_home_shot,is_away_shot,location,match_date
0,46d9048f,11,Josh Cullen,Burnley,0.07,0.02,Saved,21.0,Left Foot,,...,Josh Cullen,Pass (Live),Huddersfield,Burnley,4-1-4-1,4-3-3,0,1,"The John Smith's Stadium, Huddersfield",2022-07-29
1,46d9048f,14,Ashley Barnes,Burnley,0.06,0.34,Saved,14.0,Right Foot,,...,Josh Brownhill,Pass (Live),Huddersfield,Burnley,4-1-4-1,4-3-3,0,1,"The John Smith's Stadium, Huddersfield",2022-07-29
2,46d9048f,18,Ian Maatsen,Burnley,0.05,0.28,Goal,15.0,Right Foot,Deflected,...,Ian Maatsen,Pass (Live),Huddersfield,Burnley,4-1-4-1,4-3-3,0,1,"The John Smith's Stadium, Huddersfield",2022-07-29
3,46d9048f,20,Connor Roberts,Burnley,0.21,0.88,Saved,6.0,Right Foot,Volley,...,Connor Roberts,Pass (Live),Huddersfield,Burnley,4-1-4-1,4-3-3,0,1,"The John Smith's Stadium, Huddersfield",2022-07-29
4,46d9048f,24,Josh Brownhill,Burnley,0.03,0.2,Saved,18.0,Right Foot,Volley,...,,,Huddersfield,Burnley,4-1-4-1,4-3-3,0,1,"The John Smith's Stadium, Huddersfield",2022-07-29
5,46d9048f,38,Josh Koroma,Huddersfield,0.04,,Blocked,18.0,Right Foot,,...,Jonathan Hogg,Pass (Live),Huddersfield,Burnley,4-1-4-1,4-3-3,1,0,"The John Smith's Stadium, Huddersfield",2022-07-29
6,46d9048f,40,Dara Costelloe,Burnley,0.23,,Off Target,12.0,Right Foot,Volley,...,Josh Brownhill,Pass (Live),Huddersfield,Burnley,4-1-4-1,4-3-3,0,1,"The John Smith's Stadium, Huddersfield",2022-07-29
7,46d9048f,44,Samuel Bastien,Burnley,0.06,,Blocked,21.0,Left Foot,,...,Samuel Bastien,Pass (Live),Huddersfield,Burnley,4-1-4-1,4-3-3,0,1,"The John Smith's Stadium, Huddersfield",2022-07-29
8,46d9048f,45,Dara Costelloe,Burnley,0.06,,Off Target,18.0,Left Foot,,...,Samuel Bastien,Shot,Huddersfield,Burnley,4-1-4-1,4-3-3,0,1,"The John Smith's Stadium, Huddersfield",2022-07-29
9,46d9048f,,,,,,,,,,...,,,Huddersfield,Burnley,4-1-4-1,4-3-3,0,0,"The John Smith's Stadium, Huddersfield",2022-07-29
