# Scores & Fixtures Scraper for Inference - Season 2025-2026

This notebook scrapes each team's match fixtures and results for the 2025-2026 season from FBref.
The resulting current-season fixture list is the input needed for making predictions (inference).

In [2]:
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from urllib.parse import urljoin, urlparse
from datetime import datetime

In [3]:
# Browser-like request headers sent with every page fetch.
# The User-Agent value is split across lines purely for readability;
# adjacent string literals are concatenated into one string.
request_headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def get_page(url, timeout=30):
    """
    Fetch a page and parse it, with rate limiting and error handling.

    Sleeps for a random 2-4 seconds before every request, then issues a GET
    with the module-level ``request_headers``.

    Args:
        url (str): Address of the page to download.
        timeout (float | tuple): Seconds to wait for the server before giving
            up (passed straight to ``requests.get``). Without a timeout a
            stalled connection would block the whole crawl indefinitely.

    Returns:
        BeautifulSoup | None: Parsed document, or None when the request fails
        (connection error, timeout, or an HTTP error status). The failure is
        printed rather than raised.
    """
    time.sleep(random.uniform(2, 4))  # Be respectful - random delay

    try:
        response = requests.get(url, headers=request_headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching {url}: {e}")
        return None

    return BeautifulSoup(response.content, 'html.parser')

In [4]:
def load_teams_for_inference(teams_path='../../../data/prod/inference/raw/all_teams.json'):
    """
    Load teams data for 2025-2026 season inference.

    Expected structure of the file: team_name -> {team_name, team_id, reference}

    Args:
        teams_path (str): Location of the teams JSON file. Defaults to the
            inference ``all_teams.json`` path used by this notebook, so
            existing zero-argument calls behave as before.

    Returns:
        The parsed JSON content, or an empty dict when the file does not
        exist (a hint to run the team-mapping step is printed in that case).
        Any other error (e.g. malformed JSON) propagates to the caller.
    """
    try:
        with open(teams_path, 'r', encoding='utf-8') as f:
            teams_data = json.load(f)
    except FileNotFoundError:
        print("Inference teams file not found. Please run team_id_mapping_inference first.")
        return {}

    print(f"Loaded {len(teams_data)} teams from inference file")
    return teams_data

# Read the team registry that the scraping cells iterate over
inference_teams = load_teams_for_inference()

# Preview at most five teams (name and team id) as a quick sanity check
if inference_teams:
    print("\nTeams available for fixtures scraping:")
    for team_name, team_info in list(inference_teams.items())[:5]:
        print("  {}: {}".format(team_name, team_info['team_id']))
    hidden_count = len(inference_teams) - 5
    if hidden_count > 0:
        print(f"  ... and {hidden_count} more teams")

Loaded 20 teams from inference file

Teams available for fixtures scraping:
  Arsenal: 18bb7c10
  Aston Villa: 8602292d
  Bournemouth: 4ba7cbea
  Brentford: cd051869
  Brighton: d07537b9
  ... and 15 more teams


In [5]:
def _parse_fixture_row(row):
    """Return {data-stat: text} for one table row, plus '<data-stat>_href' for linked cells."""
    match_data = {}

    for cell in row.find_all(['td', 'th']):
        data_stat = cell.get('data-stat')
        if not data_stat:
            continue

        # Empty cells are skipped so that missing values stay absent from the record.
        cell_text = cell.text.strip()
        if cell_text:
            match_data[data_stat] = cell_text

        # Keep the link target as well (opponent, competition, etc.)
        cell_link = cell.find('a')
        if cell_link:
            href = cell_link.get('href')
            if href:
                match_data[f"{data_stat}_href"] = href

    return match_data


def extract_team_fixtures_2025_2026(team_id, team_name):
    """
    Extract fixtures and results for a team in the current season (2025-2026)
    Uses the current season URL structure: https://fbref.com/en/squads/{team_id}/{team_name}-Stats

    The page is fetched with ``get_page`` and the table with id
    ``matchlogs_for`` is read row by row. Rows that contain no ``<td>`` cell
    are treated as header rows and ignored, whether or not the table has a
    ``<tbody>``; otherwise a header-only row would be stored as a match whose
    date is the column title.

    Args:
        team_id (str): FBRef team ID (e.g., 'b8fd03ef')
        team_name (str): Team name for URL construction (spaces become
            hyphens, apostrophes are dropped)

    Returns:
        dict: ``{'team_id', 'team_name', 'season', 'matches'}`` where
        ``matches`` is a list of per-row dicts keyed by each cell's
        ``data-stat`` attribute. An empty dict is returned when the page
        cannot be fetched or the fixtures table is not present.
    """
    season = "2025-2026"

    # Construct the team fixtures URL for current season
    team_name_url = team_name.replace(' ', '-').replace("'", "")
    url = f"https://fbref.com/en/squads/{team_id}/{team_name_url}-Stats"

    print(f"Fetching current season fixtures for {team_name}...")
    print(f"URL: {url}")

    soup = get_page(url)
    if not soup:
        return {}

    # Look for fixtures table - uses 'matchlogs_for' table ID
    fixtures_table = soup.find('table', {'id': 'matchlogs_for'})

    if not fixtures_table:
        print(f"No fixtures table found for {team_name}")
        # List the table ids that do exist to help diagnose a layout change
        table_ids = [table.get('id') for table in soup.find_all('table') if table.get('id')]
        print(f"Available tables: {table_ids}")
        return {}

    print(f"✅ Found fixtures table for {team_name}")

    # Prefer the table body; fall back to the whole table when there is none.
    tbody = fixtures_table.find('tbody')
    row_source = tbody if tbody else fixtures_table

    # Filter out header rows (rows made only of <th> cells) on both paths.
    rows = [row for row in row_source.find_all('tr') if row.find('td')]

    print(f"Found {len(rows)} fixture rows")

    matches = []
    for row in rows:
        match_data = _parse_fixture_row(row)
        # Only add match if we have meaningful data
        if match_data.get('date') or match_data.get('opponent'):
            matches.append(match_data)

    fixtures_data = {
        'team_id': team_id,
        'team_name': team_name,
        'season': season,
        'matches': matches,
    }

    print(f"✅ Total extracted: {len(fixtures_data['matches'])} matches")
    return fixtures_data

In [6]:
# Smoke-test the extractor on a single team (Arsenal) before the full crawl
if not inference_teams:
    print("No teams data available")
else:
    arsenal_info = inference_teams.get('Arsenal', {})
    if not arsenal_info:
        print("Arsenal not found in teams data")
    else:
        team_id, team_name = arsenal_info['team_id'], arsenal_info['team_name']

        print(f"Testing fixtures extraction with {team_name} (ID: {team_id})")
        arsenal_fixtures = extract_team_fixtures_2025_2026(team_id, team_name)

        if not (arsenal_fixtures and arsenal_fixtures.get('matches')):
            print(f"\n⚠️ No fixtures found for {team_name}")
        else:
            print(f"\n✅ Success! Found {len(arsenal_fixtures['matches'])} matches for {team_name}")

            # Print the first three matches; missing fields fall back to placeholders
            print("\n📋 Sample matches:")
            for i, match in enumerate(arsenal_fixtures['matches'][:3]):
                print(
                    f"  {i+1}. {match.get('date', 'TBD')} vs {match.get('opponent', 'Unknown')} "
                    f"({match.get('venue', 'Unknown')}) - {match.get('result', 'Fixture')}"
                )

Testing fixtures extraction with Arsenal (ID: 18bb7c10)
Fetching current season fixtures for Arsenal...
URL: https://fbref.com/en/squads/18bb7c10/Arsenal-Stats
✅ Found fixtures table for Arsenal
Found 38 fixture rows
✅ Total extracted: 38 matches

✅ Success! Found 38 matches for Arsenal

📋 Sample matches:
  1. 2025-08-17 vs Manchester Utd (Away) - Fixture
  2. 2025-08-23 vs Leeds United (Home) - Fixture
  3. 2025-08-31 vs Liverpool (Away) - Fixture


In [None]:
def get_all_teams_fixtures(teams_dict, season="2025-2026"):
    """
    Run the per-team fixtures extractor over every team and collect the results.

    Each team always gets an entry in the result, keyed by its team id. The
    entry's 'season_data' holds the extractor's output when matches were
    found, and None when nothing was found or the extractor raised. Progress
    and a final summary are printed along the way.

    Args:
        teams_dict (dict): Mapping {team_name: {team_name, team_id, reference}}
        season (str): Season label shown in the progress banner
            (default: '2025-2026'). It is not forwarded to the extractor.

    Returns:
        dict: {team_id: {'team_name', 'team_id', 'season_data'}} for all teams
    """
    collected = {}
    team_total = len(teams_dict)
    ok_count = 0
    failures = []

    print(f"Starting fixtures extraction for {team_total} teams in {season}...")
    print("=" * 80)

    for position, team_info in enumerate(teams_dict.values(), start=1):
        team_name = team_info['team_name']
        team_id = team_info['team_id']

        print(f"\n🏟️  [{position}/{team_total}] Processing {team_name} (ID: {team_id})")

        try:
            fetched = extract_team_fixtures_2025_2026(team_id, team_name)

            if fetched and fetched.get('matches'):
                found = len(fetched['matches'])
                ok_count += 1
                print(f"   ✅ Success: {found} matches")
                season_data = fetched
            else:
                failures.append(team_name)
                print(f"   ⚠️  No fixture data found for {team_name}")
                season_data = None

        except Exception as exc:
            # Best effort: one failing team must not abort the whole crawl.
            failures.append(team_name)
            print(f"   ❌ Error extracting {team_name}: {exc}")
            season_data = None

        # Single place where the per-team record is built, whatever the outcome
        collected[team_id] = {
            'team_name': team_name,
            'team_id': team_id,
            'season_data': season_data,
        }

        # Small delay between requests (skipped after the last team)
        if position < team_total:
            print("   ⏳ Waiting before next request...")
            time.sleep(1)

    # Count match records across the teams that produced data
    match_total = 0
    for entry in collected.values():
        data = entry['season_data']
        if data and data.get('matches'):
            match_total += len(data['matches'])

    print("\n" + "=" * 80)
    print("📈 EXTRACTION SUMMARY:")
    print(f"   Teams processed: {team_total}")
    print(f"   Successful extractions: {ok_count}")
    print(f"   Failed extractions: {len(failures)}")
    print(f"   Total match records: {match_total}")

    if failures:
        print(f"\n❌ Teams that failed: {', '.join(failures)}")

    print("=" * 80)

    return collected

In [8]:
def fixtures_to_dataframe(fixtures_data):
    """
    Flatten the nested per-team fixtures structure into one row per match.

    Every row carries 'team_id', 'team_name' and 'season' followed by all the
    keys of the match dict (a match key with the same name overrides them).
    Teams whose 'season_data' is empty or has no matches contribute no rows.

    Args:
        fixtures_data (dict): Fixtures data from get_all_teams_fixtures()

    Returns:
        pd.DataFrame: One row per team-match. When non-empty, the columns
        team_name, season, date, time, opponent, venue, result come first
        (those that exist), followed by the remaining columns in their
        original order.
    """
    rows = []
    for team_id, entry in fixtures_data.items():
        team_name = entry['team_name']
        season_block = entry['season_data']
        if not season_block:
            continue

        rows.extend(
            {
                'team_id': team_id,
                'team_name': team_name,
                'season': season_block['season'],
                **match,
            }
            for match in (season_block.get('matches') or [])
        )

    frame = pd.DataFrame(rows)
    if len(frame) == 0:
        return frame

    # Move the most useful columns to the front for readability
    preferred = ['team_name', 'season', 'date', 'time', 'opponent', 'venue', 'result']
    leading = [name for name in preferred if name in frame.columns]
    trailing = [name for name in frame.columns if name not in preferred]
    return frame[leading + trailing]

def save_inference_fixtures_data(fixtures_data, filename):
    """
    Write the fixtures structure to ``filename`` as JSON.

    The file is UTF-8 encoded, indented by two spaces, and non-ASCII
    characters are written as-is rather than escaped. A confirmation line is
    printed once the file has been written.

    Args:
        fixtures_data: JSON-serializable fixtures structure.
        filename (str): Destination path; an existing file is overwritten.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(
            fixtures_data,
            out_file,
            ensure_ascii=False,
            indent=2,
        )

    print(f"Inference fixtures data saved to {filename}")

In [9]:
# Crawl every team for the 2025-2026 season, then persist and preview the result
if not inference_teams:
    print("❌ No teams data available. Please load teams first.")
else:
    print("Starting fixtures extraction for all teams...")
    all_fixtures_data = get_all_teams_fixtures(inference_teams)

    # Write the raw nested structure before flattening it
    save_inference_fixtures_data(all_fixtures_data, '../../../data/prod/inference/raw/fixtures_2025_2026.json')

    # Flatten to one row per team-match
    fixtures_df = fixtures_to_dataframe(all_fixtures_data)

    if fixtures_df.empty:
        print("\n❌ No fixtures data was extracted")
    else:
        print("\n✅ Success! Combined fixtures dataset created")
        print(f"Shape: {fixtures_df.shape}")
        print(f"Columns: {list(fixtures_df.columns)}")

        # Save the flat table as both CSV and Parquet
        fixtures_df.to_csv('../../../data/prod/inference/raw/fixtures_2025_2026.csv', index=False)
        fixtures_df.to_parquet('../../../data/prod/inference/raw/fixtures_2025_2026.parquet', index=False)

        print("\n💾 Data saved to inference/raw/ directory in multiple formats")

        # Preview only the key columns that are actually present
        print("\n📋 Sample fixtures:")
        display_cols = ['team_name', 'date', 'opponent', 'venue', 'result']
        available_cols = [col for col in display_cols if col in fixtures_df.columns]
        print(fixtures_df[available_cols].head(10))

        # A fixture counts as upcoming when its result is missing or blank
        if 'result' in fixtures_df.columns:
            result_col = fixtures_df['result']
            upcoming = fixtures_df[result_col.isna() | result_col.eq('')]
            if not upcoming.empty:
                print(f"\n🔮 Upcoming fixtures: {len(upcoming)} matches")
                print(upcoming[available_cols].head(5))

Starting fixtures extraction for all teams...
Starting fixtures extraction for 20 teams in 2025-2026...

🏟️  [1/20] Processing Arsenal (ID: 18bb7c10)
Fetching current season fixtures for Arsenal...
URL: https://fbref.com/en/squads/18bb7c10/Arsenal-Stats
✅ Found fixtures table for Arsenal
Found 38 fixture rows
✅ Total extracted: 38 matches
   ✅ Success: 38 matches
   ⏳ Waiting before next request...

🏟️  [2/20] Processing Aston Villa (ID: 8602292d)
Fetching current season fixtures for Aston Villa...
URL: https://fbref.com/en/squads/8602292d/Aston-Villa-Stats
✅ Found fixtures table for Aston Villa
Found 38 fixture rows
✅ Total extracted: 38 matches
   ✅ Success: 38 matches
   ⏳ Waiting before next request...

🏟️  [3/20] Processing Bournemouth (ID: 4ba7cbea)
Fetching current season fixtures for Bournemouth...
URL: https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats
✅ Found fixtures table for Bournemouth
Found 39 fixture rows
✅ Total extracted: 39 matches
   ✅ Success: 39 matches
   ⏳ W

In [11]:
# Print a summary of the scraped fixtures table (skipped when it is missing or empty)
if 'fixtures_df' not in locals() or fixtures_df.empty:
    print("No fixtures data available for analysis.")
else:
    if 'season' in fixtures_df.columns:
        season_label = fixtures_df['season'].iloc[0]
    else:
        season_label = 'N/A'

    print("\n📊 FIXTURES DATA ANALYSIS:")
    print(f"Total matches: {len(fixtures_df)}")
    print(f"Teams covered: {fixtures_df['team_name'].nunique()}")
    print(f"Season: {season_label}")

    # Ten teams with the most rows
    if 'team_name' in fixtures_df.columns:
        print("\n🏟️ Matches per team:")
        team_counts = fixtures_df['team_name'].value_counts().sort_values(ascending=False)
        for team, count in team_counts.head(10).items():
            print(f"  {team}: {count} matches")

    # Count of each recorded result, then rows with no result yet
    if 'result' in fixtures_df.columns:
        result_col = fixtures_df['result']
        results = result_col.value_counts()
        print("\n📈 Results breakdown:")
        for result, count in results.items():
            if pd.isna(result) or result == '':
                continue
            print(f"  {result}: {count} matches")

        # Missing and blank results both count as upcoming
        upcoming_count = result_col.isna().sum() + result_col.eq('').sum()
        print(f"  Upcoming: {upcoming_count} matches")

    # Earliest and latest date values present
    print("\n📅 Date range:")
    if 'date' in fixtures_df.columns:
        dates = fixtures_df['date'].dropna()
        if not dates.empty:
            print(f"  From: {dates.min()}")
            print(f"  To: {dates.max()}")


📊 FIXTURES DATA ANALYSIS:
Total matches: 774
Teams covered: 20
Season: 2025-2026

🏟️ Matches per team:
  Leeds United: 39 matches
  Wolves: 39 matches
  Bournemouth: 39 matches
  Brentford: 39 matches
  Brighton: 39 matches
  Everton: 39 matches
  Crystal Palace: 39 matches
  Burnley: 39 matches
  Fulham: 39 matches
  Liverpool: 39 matches

📈 Results breakdown:
  D: 3 matches
  Upcoming: 771 matches

📅 Date range:
  From: 2025-08-10
  To: 2026-05-24
