In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import numpy as np
import os
import warnings
import time
from scipy.stats import poisson
from datetime import datetime

In [None]:
## We are only interested in scraping data for leagues that have xG data available.

In [6]:
## Start with a single league and use it as the template for the others.
## Target table: league standings split into HOME and AWAY records.

## FBref competition page for the Premier League
url = 'https://fbref.com/en/comps/9/Premier-League-Stats'


def scrape_fbref_xG(url, timeout=30):
    """Scrape the home/away standings table of an FBref competition page.

    The second ``<table>`` element of the page is read, its cells are
    converted to floats, per-game xG ratios are added for each side, and the
    home and away halves are merged back into one row per team.

    Parameters
    ----------
    url : str
        Address of the FBref competition page.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per team. ``Team`` comes first, followed by the home stats
        (suffix ``_x``) and then the away stats (suffix ``_y``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page has fewer than two tables, if the table does not have the
        expected 27 data columns, or if a stat cell is not numeric.
    """
    # Stats present once for the home side and once for the away side, in page order.
    stat_names = ['MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP',
                  'xG', 'xGA', 'xGD', 'xGD/90']
    home_cols = ['H_' + stat for stat in stat_names]
    away_cols = ['A_' + stat for stat in stat_names]

    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors (e.g. rate limiting) instead of parsing an error page.
    response.raise_for_status()
    data = BeautifulSoup(response.text, 'html.parser')

    tables = data.find_all('table')
    if len(tables) < 2:
        raise ValueError(f"Expected at least 2 tables at {url}, found {len(tables)}.")

    table_data = []
    for row in tables[1].find_all('tr'):
        cells = [col.get_text(strip=True) for col in row.find_all('td')]
        # Rows without any <td> (header rows) carry no team data.
        if cells:
            table_data.append(cells)

    df = pd.DataFrame(table_data)
    df.columns = ['Squad'] + home_cols + away_cols

    # Rows shorter than the widest one are padded with None; discard them.
    df = df.dropna()

    # change the dtypes to float
    numeric_cols = home_cols + away_cols
    df[numeric_cols] = df[numeric_cols].astype(float)

    # Split into HOME and AWAY datasets. Explicit copies so the derived
    # columns below are written to independent frames, not to slices of df.
    df_home = df[['Squad'] + home_cols].copy()
    df_away = df[['Squad'] + away_cols].copy()

    # ADDING THE RATIOS (same formulas for both sides)
    for side, frame in (('H', df_home), ('A', df_away)):
        played = frame[f'{side}_MP']
        frame[f'{side}_xG per Game'] = frame[f'{side}_xG'] / played
        frame[f'{side}_xGA per Game'] = frame[f'{side}_xGA'] / played
        # Goals scored per game minus xG per game.
        frame[f'{side}_xG per Game Diff'] = frame[f'{side}_GF'] / played - frame[f'{side}_xG per Game']
        # xGA per game minus goals conceded per game.
        frame[f'{side}_xGA per Game Diff'] = frame[f'{side}_xGA per Game'] - frame[f'{side}_GA'] / played

    ###### Merging the DataFrames #########
    df_merged = pd.merge(df_home, df_away, on='Squad')

    def _final_name(col):
        # 'Squad' -> 'Team', 'H_<stat>' -> '<stat>_x', 'A_<stat>' -> '<stat>_y'
        if col == 'Squad':
            return 'Team'
        side, stat = col.split('_', 1)
        return stat + ('_x' if side == 'H' else '_y')

    df_merged.columns = [_final_name(col) for col in df_merged.columns]

    return df_merged


# Run the scraper on the Premier League page; the returned DataFrame is the cell output.
scrape_fbref_xG(url)

Unnamed: 0,Team,MP_x,W_x,D_x,L_x,GF_x,GA_x,GD_x,Pts_x,Pts/MP_x,...,Pts_y,Pts/MP_y,xG_y,xGA_y,xGD_y,xGD/90_y,xG per Game_y,xGA per Game_y,xG per Game Diff_y,xGA per Game Diff_y
0,Manchester City,1.0,1.0,0.0,0.0,4.0,1.0,3.0,3.0,3.0,...,6.0,3.0,3.8,1.7,2.1,1.04,1.9,0.85,0.6,0.35
1,Brighton,1.0,1.0,0.0,0.0,2.0,1.0,1.0,3.0,3.0,...,4.0,2.0,3.2,2.5,0.7,0.35,1.6,1.25,0.4,0.75
2,Arsenal,2.0,1.0,1.0,0.0,3.0,1.0,2.0,4.0,2.0,...,3.0,3.0,0.9,1.2,-0.4,-0.38,0.9,1.2,1.1,1.2
3,Liverpool,1.0,1.0,0.0,0.0,2.0,0.0,2.0,3.0,3.0,...,6.0,3.0,4.3,1.8,2.5,1.24,2.15,0.9,0.35,0.9
4,Brentford,2.0,2.0,0.0,0.0,5.0,2.0,3.0,6.0,3.0,...,0.0,0.0,0.5,2.5,-2.0,-2.0,0.5,2.5,-0.5,0.5
5,Aston Villa,1.0,0.0,0.0,1.0,0.0,2.0,-2.0,0.0,0.0,...,6.0,3.0,3.4,2.7,0.6,0.31,1.7,1.35,0.3,0.35
6,Bournemouth,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,...,4.0,2.0,3.4,3.1,0.3,0.16,1.7,1.55,0.3,0.05
7,Nott'ham Forest,2.0,0.0,2.0,0.0,2.0,2.0,0.0,2.0,1.0,...,3.0,3.0,2.2,0.1,2.1,2.11,2.2,0.1,-1.2,0.1
8,Tottenham,1.0,1.0,0.0,0.0,4.0,0.0,4.0,3.0,3.0,...,1.0,0.5,2.4,2.6,-0.2,-0.09,1.2,1.3,-0.2,-0.2
9,Newcastle Utd,2.0,2.0,0.0,0.0,3.0,1.0,2.0,6.0,3.0,...,1.0,1.0,1.6,2.2,-0.6,-0.6,1.6,2.2,-0.6,1.2


## Documentation

Table indexes:                                                | Naming Convention
    - [2] -> Squad Standard Stats, Squad Stats                | '_Standard'
    - [4] -> Squad Goalkeeping, Squad Stats                   | '_GK'
    - [6] -> Squad Advanced Goalkeeping, Squad Stats          | '_AdvGK'
    - [8] -> Squad Shooting, Squad Stats                      | '_Shooting'
    - [10] -> Squad Passing, Squad Stats                      | '_Passing'
    - [12] -> Squad Pass Types, Squad Stats                   | '_PassTypes'
    - [14] -> Squad Goal and Shot Creation, Squad Stats       | '_G&SCreation'
    - [16] -> Squad Defensive Actions, Squad Stats            | '_DefActions'
    - [18] -> Squad Possession, Squad Stats                   | '_Possession'
    - [20] -> Squad Playing Time, Squad Stats                 | '_PlayTime'
    - [22] -> Squad Misc Stats, Squad Stats                   | '_Misc'

In [58]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

def scrape_fbref_xG(url, timeout=30):
    """Scrape the squad-level stat tables of an FBref competition page.

    Each table listed in ``dict_index`` is parsed into a DataFrame whose
    columns (except the first) get the table's suffix appended, and all
    tables are outer-merged on ``'Squad'``.

    Parameters
    ----------
    url : str
        Address of the FBref competition page.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per squad with the suffixed columns of every table plus an
        ``'Extraction Date'`` column holding the time of the scrape.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    IndexError
        If the page has fewer tables (or a table fewer rows) than expected.
    """
    # Position of each table in the page -> suffix appended to its column names.
    dict_index = {
        2: '_Standard',
        4: '_GK',
        6: '_AdvGK',
        8: '_Shooting',
        10: '_Passing',
        12: '_PassTypes',
        14: '_G&SCreation',  # leading underscore restored to match the other suffixes
        16: '_DefActions',
        18: '_Possession',
        20: '_PlayTime',
        22: '_Misc'
    }

    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors (e.g. rate limiting) instead of parsing an error page.
    response.raise_for_status()
    data = BeautifulSoup(response.text, 'html.parser')

    # The list of tables is the same for every iteration, so look it up once.
    tables = data.find_all('table')

    # Accumulator for the merged result; stays None until the first table is parsed.
    merged_df = None

    for index, label in dict_index.items():
        rows = tables[index].find_all('tr')

        # rows[1] is used as the header row
        # (rows[0] is presumably a grouped over-header -- TODO confirm).
        headers = rows[1].find_all('th')
        column_titles = [header.get_text(strip=True) for header in headers]

        # Keep the first column name as the merge key; suffix all the others.
        column_titles = [column_titles[0]] + [title + label for title in column_titles[1:]]

        # Check the number of columns expected
        print(f"Expected number of columns: {len(column_titles)}")

        # Extract the data rows. The header row itself is part of rows[1:];
        # it has no <a>/<td> cells and is dropped by the length check below.
        table_data = []
        for row in rows[1:]:
            cols_a = [col.get_text(strip=True) for col in row.find_all('a')]
            cols_b = [col.get_text(strip=True) for col in row.find_all('td')]
            combined_cols = cols_a + cols_b

            # Append only if the number of columns matches the headers
            if len(combined_cols) == len(column_titles):
                table_data.append(combined_cols)
            else:
                print(f"Skipping row with {len(combined_cols)} columns, expected {len(column_titles)}.")

        # Create DataFrame and set column titles
        df = pd.DataFrame(table_data, columns=column_titles)

        # Merge dataframe with the existing dataframe
        if merged_df is None:
            merged_df = df
        else:
            merged_df = pd.merge(merged_df, df, how='outer', on='Squad')

    # Adding an extraction date column
    today = datetime.today()
    merged_df['Extraction Date'] = today

    return merged_df

# Example usage: scrape the Premier League page and keep the result in `df`.
url = 'https://fbref.com/en/comps/9/Premier-League-Stats'
df = scrape_fbref_xG(url)

# Bare expression so the notebook renders the DataFrame as the cell output.
df

Expected number of columns: 32
Skipping row with 0 columns, expected 32.
Expected number of columns: 21
Skipping row with 0 columns, expected 21.
Expected number of columns: 28
Skipping row with 0 columns, expected 28.
Expected number of columns: 20
Skipping row with 0 columns, expected 20.
Expected number of columns: 26
Skipping row with 0 columns, expected 26.
Expected number of columns: 18
Skipping row with 0 columns, expected 18.
Expected number of columns: 19
Skipping row with 0 columns, expected 19.
Expected number of columns: 19
Skipping row with 0 columns, expected 19.
Expected number of columns: 26
Skipping row with 0 columns, expected 26.
Expected number of columns: 23
Skipping row with 0 columns, expected 23.
Expected number of columns: 19
Skipping row with 0 columns, expected 19.


Unnamed: 0,Squad,# Pl_Standard,Age_Standard,Poss_Standard,MP_Standard,Starts_Standard,Min_Standard,90s_Standard,Gls_Standard,Ast_Standard,...,Int_Misc,TklW_Misc,PKwon_Misc,PKcon_Misc,OG_Misc,Recov_Misc,Won_Misc,Lost_Misc,Won%_Misc,Extraction Date
0,Arsenal,16,26.1,49.7,3,33,270,3.0,5,4,...,28,33,0,0,0,100,38,40,48.7,2024-09-02 14:42:25.626769
1,Aston Villa,18,26.6,43.7,3,33,270,3.0,4,4,...,18,35,0,1,0,109,31,30,50.8,2024-09-02 14:42:25.626769
2,Bournemouth,19,25.4,46.3,3,33,270,3.0,5,4,...,27,33,0,0,0,168,45,57,44.1,2024-09-02 14:42:25.626769
3,Brentford,17,27.0,40.3,3,33,270,3.0,5,2,...,27,29,0,0,1,129,36,23,61.0,2024-09-02 14:42:25.626769
4,Brighton,21,27.0,57.3,3,33,270,3.0,6,5,...,20,32,0,0,0,123,39,38,50.6,2024-09-02 14:42:25.626769
5,Chelsea,18,23.5,56.7,2,33,270,3.0,7,6,...,26,33,0,0,0,128,18,19,48.6,2024-09-02 14:42:25.626769
6,Crystal Palace,19,26.2,50.0,2,33,270,3.0,1,0,...,19,35,0,0,0,127,33,52,38.8,2024-09-02 14:42:25.626769
7,Everton,18,28.5,39.0,3,33,270,3.0,2,2,...,40,41,0,0,0,144,54,41,56.8,2024-09-02 14:42:25.626769
8,Fulham,17,27.7,50.3,3,33,270,3.0,3,3,...,23,34,0,0,0,127,44,43,50.6,2024-09-02 14:42:25.626769
9,Ipswich Town,22,26.2,37.0,3,33,270,3.0,2,2,...,23,27,0,1,0,99,29,35,45.3,2024-09-02 14:42:25.626769


## Final Script with Error Handling and Logging

In [69]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import logging

#logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def scrape_fbref_xG(url, timeout=30):
    """Scrape the squad-level stat tables of an FBref competition page.

    Each table listed in ``dict_index`` is parsed into a DataFrame whose
    columns (except the first) get the table's suffix appended, and all
    tables are outer-merged on ``'Squad'``. A table that cannot be processed
    is logged and skipped so the remaining tables are still returned.

    Parameters
    ----------
    url : str
        Address of the FBref competition page.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per squad with the suffixed columns of every parsed table
        plus an ``'Extraction Date'`` column holding the time of the scrape.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If none of the expected tables could be parsed.
    """
    # Position of each table in the page -> suffix appended to its column names.
    dict_index = {
        2: '_Standard',
        4: '_GK',
        6: '_AdvGK',
        8: '_Shooting',
        10: '_Passing',
        12: '_PassTypes',
        14: '_G&SCreation',  # leading underscore restored to match the other suffixes
        16: '_DefActions',
        18: '_Possession',
        20: '_PlayTime',
        22: '_Misc'
    }

    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors (e.g. rate limiting) instead of parsing an error page.
    response.raise_for_status()
    data = BeautifulSoup(response.text, 'html.parser')

    # The list of tables is the same for every iteration, so look it up once.
    tables = data.find_all('table')

    merged_df = None

    for index, label in dict_index.items():
        # Best effort: one broken table must not discard the others.
        try:
            rows = tables[index].find_all('tr')

            # rows[1] is used as the header row
            # (rows[0] is presumably a grouped over-header -- TODO confirm).
            headers = rows[1].find_all('th')
            column_titles = [header.get_text(strip=True) for header in headers]

            # Keep the first column name as the merge key; suffix all the others.
            column_titles = [column_titles[0]] + [title + label for title in column_titles[1:]]

            table_data = []
            for row in rows[1:]:
                cols_a = [col.get_text(strip=True) for col in row.find_all('a')]
                cols_b = [col.get_text(strip=True) for col in row.find_all('td')]
                combined_cols = cols_a + cols_b

                # The header row is part of rows[1:] and has no <a>/<td> cells;
                # skip such empty rows silently instead of warning once per table.
                if not combined_cols:
                    continue

                # Append only if the number of columns matches the headers
                if len(combined_cols) == len(column_titles):
                    table_data.append(combined_cols)
                else:
                    logging.warning(f"Skipping row with {len(combined_cols)} columns, expected {len(column_titles)}.")

            df = pd.DataFrame(table_data, columns=column_titles)

            if 'Squad' not in df.columns:
                logging.warning(f"Table at index {index} does not contain 'Squad' column. Skipping this table.")
                continue

            if merged_df is None:
                merged_df = df
            else:
                merged_df = pd.merge(merged_df, df, how='outer', on='Squad')
        except Exception as e:
            logging.error(f"Failed to process table at index {index}: {e}")

    # Without this guard the assignment below would fail with an opaque
    # "'NoneType' object does not support item assignment".
    if merged_df is None:
        raise ValueError(f"No squad tables could be parsed from {url}")

    # Adding extraction date
    today = datetime.today()
    merged_df['Extraction Date'] = today

    return merged_df

In [70]:
# League name -> FBref competition page, for the leagues with xG data available
fbref_urls = {
    'EPL': 'https://fbref.com/en/comps/9/Premier-League-Stats',
    'La Liga': 'https://fbref.com/en/comps/12/La-Liga-Stats',
    'Bundesliga': 'https://fbref.com/en/comps/20/Bundesliga-Stats',
    'Serie A': 'https://fbref.com/en/comps/11/Serie-A-Stats',
    'Ligue 1': 'https://fbref.com/en/comps/13/Ligue-1-Stats',
    # NOTE(review): unlike the other entries this URL is pinned to the
    # 2023-2024 season -- confirm this is intentional.
    'Eredivisie': 'https://fbref.com/en/comps/23/2023-2024/2023-2024-Eredivisie-Stats',
    'Bundesliga_2': 'https://fbref.com/en/comps/33/2-Bundesliga-Stats',
    'Jupiler': 'https://fbref.com/en/comps/37/Belgian-Pro-League-Stats',
    'Liga MX': 'https://fbref.com/en/comps/31/Liga-MX-Stats',
    'Primeira Liga': 'https://fbref.com/en/comps/32/Primeira-Liga-Stats',
    'Liga Argentina': 'https://fbref.com/en/comps/21/Primera-Division-Stats',
    'Brasileirao': 'https://fbref.com/en/comps/24/Serie-A-Stats',
    'MLS': 'https://fbref.com/en/comps/22/Major-League-Soccer-Stats' }

# Pause between consecutive requests so the site does not rate-limit us.
# NOTE(review): FBref's bot policy reportedly allows about 10 requests per
# minute -- verify the current limit and adjust if needed.
REQUEST_DELAY_SECONDS = 7

# Scraped data for each league, keyed by league name
standings = {}

for position, (league, url) in enumerate(fbref_urls.items()):

    # No need to wait before the very first request.
    if position:
        time.sleep(REQUEST_DELAY_SECONDS)

    try:
        standings[league] = scrape_fbref_xG(url)

    except Exception as e:
        # Keep going with the other leagues, but report why this one failed.
        print(f"Failed to scrape standings data for {league}: {e}")
        
    
# 1: Merge the standings data from home and away into the overall table

# 2: save the output as JSON files with dates to differentiate. Add some variables such as league, round, etc.




In [68]:
# Display the scraped results: a dict mapping each league name to its DataFrame.
standings

{'EPL':               Squad # Pl_Standard Age_Standard Poss_Standard MP_Standard  \
 0           Arsenal            16         26.1          49.7           3   
 1       Aston Villa            18         26.6          43.7           3   
 2       Bournemouth            19         25.4          46.3           3   
 3         Brentford            17         27.0          40.3           3   
 4          Brighton            21         27.0          57.3           3   
 5           Chelsea            18         23.5          56.7           3   
 6    Crystal Palace            19         26.2          50.0           3   
 7           Everton            18         28.5          39.0           3   
 8            Fulham            17         27.7          50.3           3   
 9      Ipswich Town            22         26.2          37.0           3   
 10   Leicester City            18         26.7          44.3           3   
 11        Liverpool            18         27.6          57.0        

In [None]:
# Next: Scrape the selected leagues into a dictionary keyed by league (as in the Poisson model)

In [None]:
# Next: Feature engineering (can wait)