# Get League Table and Stats

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the website
url = "https://fbref.com/en/comps/10/2022-2023/2022-2023-Championship-Stats"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
    # Parse the HTML content of the page
soup = BeautifulSoup(response.text, "html.parser")

    # Find the table using its HTML class
table = soup.find("table", {"class": "stats_table"})

    # Extract table data into a list of lists
table_data = []
for row in table.find_all("tr"):
    row_data = [cell.text.strip() for cell in row.find_all(["th", "td"])]
    table_data.append(row_data)

    # Create a DataFrame from the list of lists
columns = table_data[0]  # Assuming the first row contains column headers
data = table_data[1:]
league_table_df = pd.DataFrame(data, columns=columns)

# Get Fixtures 

In [3]:
#get main table

# Getting Fixtures and Game IDs

In [4]:
import requests
from bs4 import BeautifulSoup

url = "https://fbref.com/en/comps/10/2022-2023/schedule/2022-2023-Championship-Scores-and-Fixtures"

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# Find all <td> elements with data-stat="score"
score_cells = soup.find_all('td', {'data-stat': 'score'})

# Extract game IDs from the <a> tags within the score_cells, the third element is the game ID
game_ids = [cell.find('a')['href'].split('/')[3] for cell in score_cells if cell.find('a')]
# Then strip the white space in the IDs
game_ids = [x.strip(' ') for x in game_ids]

In [5]:
game_ids = game_ids[0:3]

In [6]:
game_ids

['46d9048f', '03cea9be', '78f52c96']

In [7]:
# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table containing scores and fixtures
    table = soup.find("table", {"class": "stats_table"})

    # Extract data into a list of dictionaries
    fixtures_data = []
    for row in table.find_all("tr")[1:]:  # Skip the header row
        columns = row.find_all(["th", "td"])
        fixture_date = columns[0].text.strip()
        home_team = columns[1].text.strip()
        away_team = columns[2].text.strip()
        score = columns[3].text.strip()

        fixture_info = {
            "Date": fixture_date,
            "Home Team": home_team,
            "Away Team": away_team,
            "Score": score
        }

        fixtures_data.append(fixture_info)

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(fixtures_data)

    # Now, you have a DataFrame called 'df' with the scores and fixtures data
    print(df)

else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


               Date Home Team Away Team       Score
0    Regular season         1       Fri  2022-07-29
1    Regular season         1       Sat  2022-07-30
2    Regular season         1       Sat  2022-07-30
3    Regular season         1       Sat  2022-07-30
4    Regular season         1       Sat  2022-07-30
..              ...       ...       ...         ...
612     Semi-finals                 Sat  2023-05-13
613     Semi-finals                 Sun  2023-05-14
614     Semi-finals                 Tue  2023-05-16
615     Semi-finals                 Wed  2023-05-17
616           Final                 Sat  2023-05-27

[617 rows x 4 columns]


# Scraping Shot Data

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

all_data = []
for unique_id in game_ids:
    # URL of the page to scrape
    url = f'https://fbref.com/en/matches/{unique_id}/'

    # Get the game ID from the URL
    game_id = url.split("/")[-2]

    # Send a GET request to the URL
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the Shots table on the page
    shots_table = soup.find("div", {"id": "switcher_shots"})

    # Create empty lists to store data and create the columns
    data = []
    headers = ['Game_ID', 'Minute', 'Player', 'Team', 'xG', 'PSxG', 'Result', 'Distance', 'Body Part', 'Notes', 'SCA 1 Player', 'Event 1', 'SCA 2 Player', 'Event 2']

    # Extract table rows
    rows = shots_table.find("tbody").find_all("tr")
    for row in rows:
        row_data = [game_id] + [cell.get_text().strip() for cell in row.find_all(["th", "td"])]
        data.append(row_data)

    # Create a DataFrame from the scraped data
    shots_df = pd.DataFrame(data, columns=headers)

    # Find the home and away teams and formations
    team_headers = soup.find_all("th", {"colspan": "2"})
    teams = [header.text.strip().split(" (")[0] for header in team_headers if "(" in header.text.strip()]
    formations = [header.text.strip().split(" (")[1][:-1] for header in team_headers if "(" in header.text.strip()]

    # Extracting the home and away teams and formations
    home_team = teams[0]
    away_team = teams[1]
    home_formation = formations[0]
    away_formation = formations[1]

    # Find the competition -- only using .find because there are multiple in the HTML and we only need one, then take only the text
    competition = soup.find("a", {"href": "/en/comps/10/2022-2023/2022-2023-Championship-Stats"}).text

    # Extract the match date and add to df, this one is a little tricky as it's hidden within the span tag
    span_venuetime = soup.find("span", class_="venuetime")
    match_date = span_venuetime.get("data-venue-date") if span_venuetime else None

    #Get location, 4th element in that list
    location = soup.find("div", {"class": "scorebox_meta"}).find_all("small")[3].text
    
    all_data.append(data)

In [8]:
#Flatten list one level to load into df
all_data = sum(all_data, [])

In [9]:
# Create a DataFrame from the scraped data
shots_df = pd.DataFrame(all_data, columns=headers)

# Add the team, away team, formation, and opponent formation information to the DataFrame
shots_df['Home_Team'] = home_team
shots_df['Away_Team'] = away_team
shots_df['Home Formation'] = home_formation
shots_df['Away Formation'] = away_formation
shots_df['is_home_shot'] = np.where(shots_df['Team'] == home_team, 1, 0)
shots_df['is_away_shot'] = np.where(shots_df['Team'] ==  away_team, 1, 0)
shots_df['location'] = location
shots_df['match_date'] = match_date

# To Do

1. Get fixtures
2. Get overview game data
3. EDA
4. Start Modeling