# Premier League Scores & Fixtures Scraper

This notebook scrapes match results and fixture information from FBRef for Premier League teams.

## Imports and Setup

In [1]:
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from urllib.parse import urljoin, urlparse

# Headers to appear more like a regular browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def get_page(url):
    """Fetch page with error handling and rate limiting"""
    time.sleep(random.uniform(2, 4))  # Be respectful - random delay
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

  from pandas.core import (


In [2]:
def load_teams_from_json(json_filename):
    """
    Load teams data from JSON file
    
    Args:
        json_filename (str): Path to JSON file
    
    Returns:
        dict: Teams data dictionary
    """
    with open(json_filename, 'r', encoding='utf-8') as f:
        teams_data = json.load(f)

    return teams_data

# Load the teams data
all_teams = load_teams_from_json('../../data/raw/all_teams.json')

# Now you can use all_teams for wage extraction
print(f"Loaded {len(all_teams)} teams from JSON file")

Loaded 27 teams from JSON file


In [3]:
def extract_scores_fixtures(team_id, season, team_name):
    """
    Extract scores and fixtures information for a team in a specific season
    
    Args:
        team_id (str): FBRef team ID (e.g., 'b8fd03ef')
        season (str): Season in format '2023-2024'
        team_name (str): Team name for URL construction
    
    Returns:
        dict: Dictionary containing match data
    """
    # Construct the team fixtures URL
    team_name_url = team_name.replace(' ', '-').replace("'", "")
    url = f"https://fbref.com/en/squads/{team_id}/{season}/all_comps/{team_name_url}-Stats-All-Competitions"

    print(f"Fetching fixtures for {team_name} ({season})...")
    print(f"URL: {url}")
    
    soup = get_page(url)
    if not soup:
        return {}
    
    # Look for fixtures table - uses 'matchlogs_for' table ID
    fixtures_table = soup.find('table', {'id': 'matchlogs_for'})
    
    if not fixtures_table:
        print(f"No fixtures table found for {team_name} ({season})")
        return {}
    
    print(f"Found fixtures table")
    
    # Initialize fixtures data structure
    fixtures_data = {
        'team_id': team_id,
        'team_name': team_name,
        'season': season,
        'matches': []
    }
    
    # Process fixtures table
    tbody = fixtures_table.find('tbody')
    if tbody:
        rows = tbody.find_all('tr')
    else:
        rows = fixtures_table.find_all('tr')
        # Filter out header rows
        rows = [row for row in rows if row.find('td')]
    
    print(f"Found {len(rows)} fixture rows")
    
    for row in rows:
        match_data = {}
        
        # Extract all available data columns
        cells = row.find_all(['td', 'th'])
        for cell in cells:
            data_stat = cell.get('data-stat')
            if data_stat:
                cell_text = cell.text.strip()
                if cell_text and cell_text != '':
                    match_data[data_stat] = cell_text
                    
                # Special handling for links (opponent, competition, etc.)
                cell_link = cell.find('a')
                if cell_link and data_stat:
                    href = cell_link.get('href')
                    if href:
                        match_data[f"{data_stat}_href"] = href
        
        # Only add match if we have meaningful data
        if match_data.get('date') or match_data.get('opponent'):
            fixtures_data['matches'].append(match_data)
    
    print(f"Total extracted: {len(fixtures_data['matches'])} matches")
    return fixtures_data

In [4]:
def extract_all_team_fixtures(all_teams_dict):
    """
    Extract fixtures data for all teams across all seasons they played
    
    Args:
        all_teams_dict (dict): Dictionary containing team information with seasons
    
    Returns:
        dict: Complete fixtures dataset organized by team_id and season
    """
    all_fixtures_data = {}
    total_extractions = sum(len(team_info['seasons']) for team_info in all_teams_dict.values())
    current_extraction = 0
    
    print(f"Starting fixtures extraction for {len(all_teams_dict)} teams across {total_extractions} team-season combinations...")
    print("=" * 80)
    
    for team_id, team_info in all_teams_dict.items():
        team_name = team_info['team_name']
        seasons = team_info['seasons']
        
        print(f"\n🏟️  Processing {team_name} (ID: {team_id})")
        print(f"   Seasons to extract: {seasons}")
        
        # Initialize team entry in results
        if team_id not in all_fixtures_data:
            all_fixtures_data[team_id] = {
                'team_name': team_name,
                'team_id': team_id,
                'seasons_data': {}
            }
        
        # Extract fixtures for each season this team played
        for season in seasons:
            current_extraction += 1
            print(f"\n   📊 [{current_extraction}/{total_extractions}] Extracting {season}...")
            
            try:
                season_fixtures = extract_scores_fixtures(team_id, season, team_name)
                
                if season_fixtures and season_fixtures.get('matches'):
                    all_fixtures_data[team_id]['seasons_data'][season] = season_fixtures
                    match_count = len(season_fixtures['matches'])
                    print(f"   ✅ Success: {match_count} matches")
                else:
                    print(f"   ⚠️  No fixture data found for {team_name} in {season}")
                    all_fixtures_data[team_id]['seasons_data'][season] = None
                    
            except Exception as e:
                print(f"   ❌ Error extracting {team_name} {season}: {str(e)}")
                all_fixtures_data[team_id]['seasons_data'][season] = None
            
            # Small delay to be respectful to the server
            time.sleep(1)
    
    # Summary statistics
    successful_extractions = 0
    total_matches = 0
    
    for team_data in all_fixtures_data.values():
        for season_data in team_data['seasons_data'].values():
            if season_data and season_data.get('matches'):
                successful_extractions += 1
                total_matches += len(season_data['matches'])
    
    print("\n" + "=" * 80)
    print("📈 EXTRACTION SUMMARY:")
    print(f"   Total teams processed: {len(all_fixtures_data)}")
    print(f"   Successful extractions: {successful_extractions}/{total_extractions}")
    print(f"   Total match records: {total_matches}")
    print("=" * 80)
    
    return all_fixtures_data

In [5]:
def save_fixtures_data(fixtures_data, filename):
    """
    Save fixtures data to JSON file with proper serialization
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(fixtures_data, f, indent=2, ensure_ascii=False)
    
    print(f"Fixtures data saved to {filename}")

In [6]:
def fixtures_data_to_dataframe(fixtures_data):
    """
    Convert fixtures data dictionary to a pandas DataFrame
    
    Args:
        fixtures_data (dict): Fixtures data from extract_all_team_fixtures()
    
    Returns:
        pd.DataFrame: Flattened DataFrame with one row per match
    """
    all_records = []
    
    for team_id, team_data in fixtures_data.items():
        team_name = team_data['team_name']
        
        for season, season_data in team_data['seasons_data'].items():
            if season_data and season_data.get('matches'):
                
                for match in season_data['matches']:
                    # Create a record for each match
                    record = {
                        'team_id': team_id,
                        'team_name': team_name,
                        'season': season
                    }
                    
                    # Add all match data
                    record.update(match)
                    all_records.append(record)
    
    # Create DataFrame
    df = pd.DataFrame(all_records)
    
    # Reorder columns to match FBRef table structure
    if len(df) > 0:
        # First the team identification columns
        team_columns = ['team_name', 'season', 'team_id']
        
        # Then the actual FBRef fixtures table columns in order
        fixtures_columns = [
            'date', 'time', 'comp','round', 'day', 'venue', 'result', 
            'gf', 'ga', 'opponent', 'xg', 'xga', 'poss', 
            'attendance', 'captain', 'formation', 'formation_opp', 
            'referee', 'match_report', 'notes'
        ]
        
        # Include any additional columns (like href links)
        available_team_cols = [col for col in team_columns if col in df.columns]
        available_fixtures_cols = [col for col in fixtures_columns if col in df.columns]
        other_columns = [col for col in df.columns if col not in team_columns + fixtures_columns]
        
        # Final column order
        final_columns = available_team_cols + available_fixtures_cols + other_columns
        df = df[final_columns]
    
    return df

In [7]:
def load_fixtures_json_to_dataframe(json_filename):
    """
    Load fixtures data from JSON file and convert to DataFrame
    
    Args:
        json_filename (str): Path to JSON file
    
    Returns:
        pd.DataFrame: Fixtures data as DataFrame
    """
    with open(json_filename, 'r', encoding='utf-8') as f:
        fixtures_data = json.load(f)
    
    return fixtures_data_to_dataframe(fixtures_data)

## Execute Fixtures Extraction

**WARNING**: This will take a significant amount of time as it needs to scrape fixture data for all teams across all seasons.

In [8]:
# Uncomment to run full extraction for all teams:
complete_fixtures_dataset = extract_all_team_fixtures(all_teams)

# Save to JSON
save_fixtures_data(complete_fixtures_dataset, '../../data/raw/all_competitions_fixtures_2019_2025.json')

# Convert to DataFrame
fixtures_df = fixtures_data_to_dataframe(complete_fixtures_dataset)
print(f"DataFrame shape: {fixtures_df.shape}")
print(f"Columns: {list(fixtures_df.columns)}")
fixtures_df.head()

Starting fixtures extraction for 27 teams across 120 team-season combinations...

🏟️  Processing Arsenal (ID: 18bb7c10)
   Seasons to extract: ['2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024', '2024-2025']

   📊 [1/120] Extracting 2019-2020...
Fetching fixtures for Arsenal (2019-2020)...
URL: https://fbref.com/en/squads/18bb7c10/2019-2020/all_comps/Arsenal-Stats-All-Competitions
Found fixtures table
Found 54 fixture rows
Total extracted: 54 matches
   ✅ Success: 54 matches

   📊 [2/120] Extracting 2020-2021...
Fetching fixtures for Arsenal (2020-2021)...
URL: https://fbref.com/en/squads/18bb7c10/2020-2021/all_comps/Arsenal-Stats-All-Competitions
Found fixtures table
Found 58 fixture rows
Total extracted: 58 matches
   ✅ Success: 58 matches

   📊 [3/120] Extracting 2021-2022...
Fetching fixtures for Arsenal (2021-2022)...
URL: https://fbref.com/en/squads/18bb7c10/2021-2022/all_comps/Arsenal-Stats-All-Competitions
Found fixtures table
Found 45 fixture rows
Total extracted

Unnamed: 0,team_name,season,team_id,date,comp,round,venue,result,opponent,attendance,...,dayofweek,goals_for,goals_against,opponent_href,xg_for,xg_against,possession,captain_href,opp_formation,match_report_href
0,Arsenal,2019-2020,18bb7c10,2019-08-11,Premier League,Matchweek 1,Away,W,Newcastle Utd,47635,...,Sun,1,0,/en/squads/b2b47a98/2019-2020/Newcastle-United...,1.1,0.4,62,/en/players/e61b8aee/Granit-Xhaka,3-5-2,/en/matches/1405a610/Newcastle-United-Arsenal-...
1,Arsenal,2019-2020,18bb7c10,2019-08-17,Premier League,Matchweek 2,Home,W,Burnley,60214,...,Sat,2,1,/en/squads/943e8050/2019-2020/Burnley-Stats,0.8,1.5,67,/en/players/d4cb83cc/Nacho-Monreal,4-4-2,/en/matches/ff7eda21/Arsenal-Burnley-August-17...
2,Arsenal,2019-2020,18bb7c10,2019-08-24,Premier League,Matchweek 3,Away,L,Liverpool,53298,...,Sat,1,3,/en/squads/822bd0ba/2019-2020/Liverpool-Stats,1.0,2.5,48,/en/players/e61b8aee/Granit-Xhaka,4-3-3,/en/matches/102b241e/Liverpool-Arsenal-August-...
3,Arsenal,2019-2020,18bb7c10,2019-09-01,Premier League,Matchweek 4,Home,D,Tottenham,60333,...,Sun,2,2,/en/squads/361ca564/2019-2020/Tottenham-Hotspu...,2.4,2.0,55,/en/players/e61b8aee/Granit-Xhaka,4-2-2-2,/en/matches/0b6b8aaf/North-London-Derby-Arsena...
4,Arsenal,2019-2020,18bb7c10,2019-09-15,Premier League,Matchweek 5,Away,D,Watford,21360,...,Sun,2,2,/en/squads/2abfe087/2019-2020/Watford-Stats,0.8,2.7,48,/en/players/e61b8aee/Granit-Xhaka,4-2-3-1,/en/matches/8257eda8/Watford-Arsenal-September...


In [9]:
all_fixtures = load_teams_from_json('../../data/raw/all_competitions_fixtures_2019_2024.json')

In [11]:
fixtures_df = fixtures_data_to_dataframe(all_fixtures)
print(f"DataFrame shape: {fixtures_df.shape}")
print(f"Columns: {list(fixtures_df.columns)}")
fixtures_df.head()

DataFrame shape: (4776, 29)
Columns: ['team_name', 'season', 'team_id', 'date', 'comp', 'round', 'venue', 'result', 'opponent', 'attendance', 'captain', 'formation', 'referee', 'match_report', 'notes', 'date_href', 'start_time', 'comp_href', 'round_href', 'dayofweek', 'goals_for', 'goals_against', 'opponent_href', 'xg_for', 'xg_against', 'possession', 'captain_href', 'opp_formation', 'match_report_href']


Unnamed: 0,team_name,season,team_id,date,comp,round,venue,result,opponent,attendance,...,dayofweek,goals_for,goals_against,opponent_href,xg_for,xg_against,possession,captain_href,opp_formation,match_report_href
0,Arsenal,2019-2020,18bb7c10,2019-08-11,Premier League,Matchweek 1,Away,W,Newcastle Utd,47635,...,Sun,1,0,/en/squads/b2b47a98/2019-2020/Newcastle-United...,1.1,0.4,62,/en/players/e61b8aee/Granit-Xhaka,3-5-2,/en/matches/1405a610/Newcastle-United-Arsenal-...
1,Arsenal,2019-2020,18bb7c10,2019-08-17,Premier League,Matchweek 2,Home,W,Burnley,60214,...,Sat,2,1,/en/squads/943e8050/2019-2020/Burnley-Stats,0.8,1.5,67,/en/players/d4cb83cc/Nacho-Monreal,4-4-2,/en/matches/ff7eda21/Arsenal-Burnley-August-17...
2,Arsenal,2019-2020,18bb7c10,2019-08-24,Premier League,Matchweek 3,Away,L,Liverpool,53298,...,Sat,1,3,/en/squads/822bd0ba/2019-2020/Liverpool-Stats,1.0,2.5,48,/en/players/e61b8aee/Granit-Xhaka,4-3-3,/en/matches/102b241e/Liverpool-Arsenal-August-...
3,Arsenal,2019-2020,18bb7c10,2019-09-01,Premier League,Matchweek 4,Home,D,Tottenham,60333,...,Sun,2,2,/en/squads/361ca564/2019-2020/Tottenham-Hotspu...,2.4,2.0,55,/en/players/e61b8aee/Granit-Xhaka,4-2-2-2,/en/matches/0b6b8aaf/North-London-Derby-Arsena...
4,Arsenal,2019-2020,18bb7c10,2019-09-15,Premier League,Matchweek 5,Away,D,Watford,21360,...,Sun,2,2,/en/squads/2abfe087/2019-2020/Watford-Stats,0.8,2.7,48,/en/players/e61b8aee/Granit-Xhaka,4-2-3-1,/en/matches/8257eda8/Watford-Arsenal-September...
