# Get League Table and Stats

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import numpy as np
import time
import random
#imports

In [2]:
# URL of the website
url = "https://fbref.com/en/comps/10/Championship-Stats"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
    # Parse the HTML content of the page
soup = BeautifulSoup(response.text, "html.parser")

    # Find the table using its HTML class
table = soup.find("table", {"class": "stats_table"})

    # Extract table data into a list of lists
table_data = []
for row in table.find_all("tr"):
    row_data = [cell.text.strip() for cell in row.find_all(["th", "td"])]
    table_data.append(row_data)

    # Create a DataFrame from the list of lists
columns = table_data[0]  # Assuming the first row contains column headers
data = table_data[1:]
league_table_df = pd.DataFrame(data, columns=columns)

In [3]:
response

<Response [200]>

# Get Fixtures 

In [4]:
# URL of the website
url = "https://fbref.com/en/comps/10/schedule/Championship-Scores-and-Fixtures"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table containing fixtures
    table = soup.find("table", {"class": "stats_table"})

    # Extract data into a list of dictionaries
    fixtures_data = []
    for row in table.find_all("tr")[1:]:  # Skip the header row
        columns = row.find_all(["th", "td"])
        
         # Extracting specific columns based on the structure of the table
        fixture_type = columns[0].text.strip()
        gameweek = columns[1].text.strip()
        day_of_week = columns[2].text.strip()
        date = columns[3].text.strip()
        match_time = columns[4].text.strip()
        home_team = columns[5].text.strip()
        home_xG = columns[6].text.strip()
        result = columns[7].text.strip()
        away_xG = columns[8].text.strip()
        away_team = columns[9].text.strip()
        attendance = columns[10].text.strip()
        stadium = columns[11].text.strip()
        referee = columns[12].text.strip()

        fixture_info = {
            "fixture_type": fixture_type,
            "gameweek": gameweek,
            "day_of_week": day_of_week,
            "date": date,
            "match_time": match_time,
            "home_team": home_team,
            "home_xG": home_xG,
            "result": result,
            "away_xG": away_xG,
            "away_team": away_team,
            "attendance": attendance,
            "stadium": stadium,
            "referee": referee
        }

        fixtures_data.append(fixture_info)

    # Create a DataFrame from the list of dictionaries
    fixtures_table = pd.DataFrame(fixtures_data)

In [5]:
fixtures_table['is_game_complete'] = np.where(len(fixtures_table['result']) > 0, 1, 0)
#Checking to see if game is complete... if the result column is populated at all, it's been played 

# Getting Game IDs

In [6]:
# URL of the website
url = "https://fbref.com/en/comps/10/schedule/Championship-Scores-and-Fixtures"

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# Find all <td> elements with data-stat="score"
score_cells = soup.find_all('td', {'data-stat': 'score'})

# Extract game IDs from the <a> tags within the score_cells, the third element is the game ID
game_ids = [cell.find('a')['href'].split('/')[3].strip() for cell in score_cells if cell.find('a')]

# Use a set to ensure uniqueness and convert back to a list
game_ids = list(set(game_ids))

In [7]:
# game_ids = game_ids[0:50]
#just use five to test

# Scraping Shot Data

In [8]:
all_data = []

for unique_id in game_ids:
    # URL of the page to scrape
    url = f'https://fbref.com/en/matches/{unique_id}/'

    # Get the game ID from the URL
    game_id = url.split("/")[-2]

    # Send a GET request to the URL
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the Shots table on the page
    shots_table = soup.find("div", {"id": "switcher_shots"})

    # Create empty lists to store data and create the columns
    data = []
    headers = ['game_id', 'minute', 'player', 'team', 'xG', 'psxG', 'result', 'distance', 'body_part', 'notes', 'sca_1_player', 'event_1', 'sca_2_player', 'event_2']

    # Extract table rows
    rows = shots_table.find("tbody").find_all("tr")
    for row in rows:
        row_data = [game_id] + [cell.get_text().strip() for cell in row.find_all(["th", "td"])]
        data.append(row_data)

    # Find the home and away teams and formations
    team_headers = soup.find_all("th", {"colspan": "2"})
    teams = [header.text.strip().split(" (")[0] for header in team_headers if "(" in header.text.strip()]
    formations = [header.text.strip().split(" (")[1][:-1] for header in team_headers if "(" in header.text.strip()]

    # Extracting the home and away teams and formations
    home_team = teams[0]
    away_team = teams[1]
    home_formation = formations[0]
    away_formation = formations[1]

    # Find the location and match date
    location = soup.find("div", {"class": "scorebox_meta"}).find_all("small")[3].text
    match_date = soup.find("span", class_="venuetime").get("data-venue-date", None)

    # Append additional columns to headers
    headers.extend(['home_team', 'away_team', 'home_formation', 'away_formation', 'location', 'match_date'])

    # Append data to all_data with additional columns
    for row in data:
        row.extend([home_team, away_team, home_formation, away_formation, location, match_date])
        all_data.append(row)
        
        
    time.sleep(15)
        
# Create a DataFrame from the scraped data
shots_df = pd.DataFrame(all_data, columns=headers)

In [9]:
shots_df['game_id'].nunique()

557

In [10]:
# Add Home and away shots, location, hard-code competition
shots_df['is_home_shot'] = np.where(shots_df['team'] == shots_df['home_team'], 1, 0)
shots_df['is_away_shot'] = np.where(shots_df['team'] ==  shots_df['away_team'], 1, 0)
shots_df['competition'] = 'English Championship'

In [11]:
# Define a function to calculate total score for each game
def calculate_total_score(group, shot):
    total_score = 0
    total_scores = []

    # Flag to indicate whether the current row is after a 'Goal'
    after_goal = False

    for idx, (result, is_shot) in enumerate(zip(group, shot)):
        # Check if the current row is after a 'Goal'
        if after_goal:
            # Increment the total score
            total_score += 1
            # Reset the flag
            after_goal = False

        # Check if the current result is 'Goal' and the corresponding shot is 1
        if result == 'Goal' and is_shot == 1:
            # Set the flag to True for the next iteration
            after_goal = True
        
        # Append the current total score
        total_scores.append(total_score)

    return total_scores

# Group by game_id and apply the function to calculate total score for home team
shots_df['home_score'] = shots_df.groupby('game_id')['result'].transform(lambda x: calculate_total_score(x, shots_df['is_home_shot']))

# Group by game_id and apply the function to calculate total score for away team
shots_df['away_score'] = shots_df.groupby('game_id')['result'].transform(lambda x: calculate_total_score(x, shots_df['is_away_shot']))


In [12]:
#Creating a nil nil column using numpy where and logical_and operators
shots_df['is_nil_nil'] = np.where(np.logical_and(shots_df['home_score'] == 0, shots_df['away_score'] == 0), 1, 0)

In [13]:
#Adding opposing team -- Whenever it's a home shot we want the away team, otherwise (away shot) we want the home team
shots_df['opposing_team'] = np.where(shots_df['is_home_shot'] == 1, shots_df['away_team'], shots_df['home_team'])

In [14]:
#Replace blank values with nulls
shots_df['xG'] = shots_df['xG'].replace('', np.nan)

#Then drop them
shots_df = shots_df.dropna(subset=['xG'])

In [15]:
shots_df['xG'] = shots_df['xG'].astype('float64') 

# Adding xG and xGA while nil-nil to league_table_df

In [16]:
# Filter shots_df where is_nil_nil is 0
filtered_shots_df = shots_df[shots_df['is_nil_nil'] == 1]

# Calculate the sum of 'xG' for each 'team'
team_xG_sum = filtered_shots_df.groupby('team')['xG'].sum().reset_index()

# Create a dictionary to map team names to their respective summed 'xG' values
team_xG_dict = dict(zip(team_xG_sum['team'], team_xG_sum['xG']))

# Add 'nil_nil_xG' column to league_table_df by mapping team names to their summed 'xG' values
league_table_df['nil_nil_xG'] = league_table_df['Squad'].map(team_xG_dict)

# Fill NaN values with 0 if any team didn't have any 'xG' when is_nil_nil is 0
league_table_df['nil_nil_xG'].fillna(0, inplace=True)

# xGA

# Calculate the sum of 'xGA' for each 'team'
team_xGA_sum = filtered_shots_df.groupby('opposing_team')['xG'].sum().reset_index()

# Create a dictionary to map team names to their respective summed 'xGA' values -- AKA xG of opposing_team
team_xGA_dict = dict(zip(team_xGA_sum['opposing_team'], team_xGA_sum['xG']))

# Add 'nil_nil_xGA' column to league_table_df by mapping team names to their summed 'xGA' values
league_table_df['nil_nil_xGA'] = league_table_df['Squad'].map(team_xGA_dict)

# Fill NaN values with 0 if any team didn't have any 'xG' when is_nil_nil is 0
league_table_df['nil_nil_xGA'].fillna(0, inplace=True)

In [17]:
league_table_df

Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,...,xG,xGA,xGD,xGD/90,Attendance,Top Team Scorer,Goalkeeper,Notes,nil_nil_xG,nil_nil_xGA
0,1,Leicester City,46,31,4,11,89,41,48,97,...,84.9,42.7,42.2,0.92,31238,Jamie Vardy - 18,Mads Hermansen,Promoted,29.79,15.02
1,2,Ipswich Town,46,28,12,6,92,57,35,96,...,74.0,46.7,27.4,0.59,28845,"Nathan Broadhead, Conor Chaplin - 13",Václav Hladký,Promoted,18.58,14.24
2,3,Leeds United,46,27,9,10,81,43,38,90,...,79.5,38.0,41.4,0.9,35989,Crysencio Summerville - 19,Illan Meslier,→ Semi-finals,26.33,17.63
3,4,Southampton,46,26,9,11,87,63,24,87,...,79.8,49.0,30.8,0.67,29373,Adam Armstrong - 21,Gavin Bazunu,"Promoted, → Semi-finals",24.07,13.53
4,5,West Brom,46,21,12,13,70,47,23,75,...,60.1,50.3,9.8,0.21,24019,Brandon Thomas-Asante - 11,Alex Palmer,→ Semi-finals,23.89,17.8
5,6,Norwich City,46,21,10,15,79,64,15,73,...,63.0,63.4,-0.4,-0.01,26077,Josh Sargent - 16,Angus Gunn,→ Semi-finals,23.26,18.38
6,7,Hull City,46,19,13,14,68,60,8,70,...,57.2,51.9,5.3,0.12,21980,Jaden Philogene Bidace - 12,Ryan Allsop,,18.73,16.54
7,8,Middlesbrough,46,20,9,17,71,62,9,69,...,68.3,53.8,14.6,0.32,26905,Emmanuel Latte Lath - 16,Seny Dieng,,24.8,19.73
8,9,Coventry City,46,17,13,16,70,59,11,64,...,63.8,55.9,7.9,0.17,25468,Haji Wright - 16,Bradley Collins,,25.12,20.81
9,10,Preston,46,18,9,19,56,67,-11,63,...,40.7,60.2,-19.5,-0.42,16720,Will Keane - 13,Freddie Woodman,,18.18,22.03


# To Do

1. Add average xG and xGA per match while nil nil