Imports and Constants

In [1]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Site root; prepended to the relative hrefs scraped from fbref pages to build absolute URLs
domain = 'https://fbref.com'
# Path of the CSV file that the combined match data is written to
matches_file = 'matches.csv'

Download HTML of country groups

In [2]:
# Page listing every qualifying group table
groups_url = 'https://fbref.com/en/comps/678/UEFA-Euro-Qualifying-Stats'
# timeout: never hang the kernel on an unresponsive server
qualifying_data = requests.get(groups_url, timeout=30)
# fail loudly on 4xx/5xx (e.g. 429 rate limiting) instead of parsing an error page later
qualifying_data.raise_for_status()

In [3]:
# Name the parser explicitly so the result does not depend on which parsers happen
# to be installed (and to avoid bs4's GuessedAtParserWarning).
soup = BeautifulSoup(qualifying_data.text, 'html.parser')
# one <table class="stats_table"> per selected table on the groups page
group_tables = soup.select('table.stats_table')

Get URL of each participating country

In [4]:
# Map each team name (lower case, words separated by spaces) to its fbref squad URL.
team_urls = {}
for table in group_tables:
    for anchor in table.find_all('a'):
        href = anchor.get('href')
        # anchors without an href give None; skip those and every non-squad link
        if not href or '/squads/' not in href:
            continue
        squad_url = domain + href
        # last path segment, e.g. 'Germany-Men-Stats' -> 'germany'
        slug = squad_url.split('/')[-1].replace('-Men-Stats', '')
        # lower case, and hyphens become spaces
        team_name = slug.lower().replace('-', ' ')
        team_urls[team_name] = squad_url
# Germany automatically qualifies, so it is not included in the group tables
team_urls['germany'] = 'https://fbref.com/en/squads/c1e40422/Germany-Men-Stats'

Scrape the matches played by every team, together with their stats, and save everything to the matches.csv file.  
(This cell takes a while to run: it pauses between requests because the website bans clients that send too many requests too quickly.)

In [5]:
REQUEST_TIMEOUT = 30  # seconds to wait on a request before giving up
REQUEST_DELAY = 4     # seconds to pause after each step; scraping faster risks a ban


def _fetch_html(url):
    """Return the HTML text at `url`; raises `requests.HTTPError` on a 4xx/5xx response."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def _merge_stats(matches, hrefs, url_fragment, table_match, stat_columns):
    """Left-merge per-match stats from a linked stats page into `matches` on 'Date'.

    Best effort: if the link is missing, or the page cannot be downloaded or
    parsed, a message is printed and `matches` is returned unchanged.

    Parameters:
        matches: DataFrame of a team's matches; must have a 'Date' column.
        hrefs: href values found on the team page (entries may be None).
        url_fragment: substring identifying the wanted stats link.
        table_match: text passed to `pd.read_html(match=...)` to pick the table.
        stat_columns: list of columns to copy over from the stats table.

    Returns:
        `matches` with `stat_columns` added, or `matches` itself on failure.
    """
    stats_url = next(
        (domain + href for href in hrefs
         if href and '/en/' in href and url_fragment in href),
        None,
    )
    if stats_url is None:
        print(f'No link containing {url_fragment!r} found; skipping {stat_columns}')
        return matches
    try:
        stats = pd.read_html(StringIO(_fetch_html(stats_url)), match=table_match)[0]
        # the stats tables carry a two-level header; drop the outer grouping level
        stats.columns = stats.columns.droplevel()
        return matches.merge(stats[['Date'] + stat_columns], on='Date', how='left')
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C still interrupts the long scrape
        print(f'Could not add {stat_columns} from {stats_url}: {exc!r}')
        return matches


team_matches = {}
for team_name, team_url in team_urls.items():
    # Scores & Fixtures table
    team_html = _fetch_html(team_url)
    # StringIO: passing literal HTML strings to read_html is deprecated in pandas >= 2.1
    matches_data = pd.read_html(StringIO(team_html), match='Scores & Fixtures')[0]
    time.sleep(REQUEST_DELAY)
    # find links for other stats
    team_soup = BeautifulSoup(team_html, 'html.parser')
    hrefs = [anchor.get('href') for anchor in team_soup.find_all('a')]
    # add shooting stats
    matches_data = _merge_stats(
        matches_data, hrefs, 'all_comps/shooting/', 'Shooting', ['Sh', 'SoT'])
    time.sleep(REQUEST_DELAY)
    # add misc stats
    matches_data = _merge_stats(
        matches_data, hrefs, 'all_comps/misc/', 'Miscellaneous Stats',
        ['CrdY', 'Fls', 'Fld', 'Crs', 'Int', 'TklW'])
    time.sleep(REQUEST_DELAY)
    # all stats together
    matches_data['Team'] = team_name
    team_matches[team_name] = matches_data
    # To keep only Euro matches, filter here with:
    # matches_data[matches_data['Comp'].isin(['UEFA Euro Qualifying', 'UEFA Euro'])]

# join all team matches in one data frame
team_stats = pd.concat(list(team_matches.values()))
team_stats.columns = [column.lower() for column in team_stats.columns]
# write to csv file
team_stats.to_csv(matches_file)

In [6]:
# copy csv file to drive
!cp matches.csv drive/MyDrive/euro2024predictor/matches.csv