# Match Stats Scraper - Inference (2024-2025 Season)

This notebook scrapes match statistics from FBRef specifically for inference purposes:
- **Season**: 2024-2025 only
- **Teams**: All teams from inference/processed/raw/all_teams.json
- **Purpose**: Collect recent match data for model inference and predictions

Based on the production-ready scraper from `match_stats_scraper_scaled.ipynb`

In [2]:
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from urllib.parse import urljoin, urlparse
from typing import Optional, List, Dict, Tuple
import os
from datetime import datetime

# Headers to appear more like a regular browser
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

print("✅ Libraries imported successfully")

✅ Libraries imported successfully


## Load Inference Data

Load the teams we need for inference and the comprehensive fixtures data.

In [11]:
# Load teams for inference
with open('../../../data/prod/inference/raw/all_teams.json', 'r', encoding='utf-8') as f:
    inference_teams = json.load(f)

# Load comprehensive fixtures data
with open('../../../data/prod/inference/raw/fixtures_2024_2025.json', 'r', encoding='utf-8') as f:
    all_fixtures_data = json.load(f)

print(f"📊 Inference teams loaded: {len(inference_teams)} teams")
print(f"📊 Comprehensive fixtures loaded for teams: {len(all_fixtures_data)}")

# Show teams for inference
inference_team_names = list(inference_teams.keys())
print(f"\n🎯 Teams for inference: {inference_team_names}")

📊 Inference teams loaded: 20 teams
📊 Comprehensive fixtures loaded for teams: 20

🎯 Teams for inference: ['Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford', 'Brighton', 'Burnley', 'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Leeds United', 'Liverpool', 'Manchester City', 'Manchester Utd', 'Newcastle Utd', "Nott'ham Forest", 'Sunderland', 'Tottenham', 'West Ham', 'Wolves']


## Core Scraping Functions

Import and adapt the core scraping functions from the scaled scraper.

In [5]:
def get_page(url: str, delay_range: Tuple[float, float] = (3, 8)) -> Optional[BeautifulSoup]:
    """
    Fetch page with error handling and rate limiting
    
    Args:
        url: URL to fetch
        delay_range: Tuple of (min_delay, max_delay) in seconds
    
    Returns:
        BeautifulSoup object or None if failed
    """
    time.sleep(random.uniform(*delay_range))  # Be respectful - random delay
    
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def extract_percentage_or_value(text: str) -> str:
    """
    Extract percentage first, if not found then extract first number
    Prioritizes percentage values over other numbers
    """
    # First try to find percentage
    percentage_match = re.search(r'(\d+(?:\.\d+)?%)', text)
    if percentage_match:
        return percentage_match.group(1)
    
    # If no percentage, try to find any number
    number_match = re.search(r'(\d+(?:\.\d+)?)', text)
    if number_match:
        return number_match.group(1)
    
    # If nothing found, return original text
    return text

print("✅ Core scraping functions loaded")

✅ Core scraping functions loaded


In [6]:
def extract_team_names_fallback(soup, match_id=None):
    """
    Fallback method to extract team names from other parts of the page
    if header extraction fails
    """
    import re
    
    # Try to extract from page title
    title = soup.find('title')
    if title:
        title_text = title.get_text()
        # Title format is usually "Team1 vs Team2, Date, Competition"
        match = re.search(r'(.+?)\s+vs\s+(.+?),', title_text)
        if match:
            return match.group(1).strip(), match.group(2).strip()
    
    # Try to extract from H1 heading
    h1 = soup.find('h1')
    if h1:
        h1_text = h1.get_text()
        match = re.search(r'(.+?)\s+vs\.?\s+(.+?)(?:\s+\(|$)', h1_text)
        if match:
            return match.group(1).strip(), match.group(2).strip()
    
    return None, None

def scrape_team_stats(soup: BeautifulSoup, match_id: str = None) -> Optional[Tuple[pd.DataFrame, str, str]]:
    """
    Scrape main team statistics from match page in long format
    
    Args:
        soup: BeautifulSoup object of the match page
        match_id: Match identifier (URL)
    
    Returns:
        Tuple of (DataFrame with stats in long format, team1_name, team2_name) or None
    """
    team_stats_div = soup.find('div', {'id': 'team_stats'})
    
    if not team_stats_div:
        print("No team_stats div found")
        return None
    
    table = team_stats_div.find('table')
    if not table:
        print("No table found in team_stats div")
        return None
    
    # Extract team names from header
    header_row = table.find('tr')
    team_cells = header_row.find_all('th')
    
    # Improved team name extraction - get full names, remove formation info
    import re
    team1_text = team_cells[0].get_text(strip=True)
    team2_text = team_cells[1].get_text(strip=True)
    
    # Remove formation info (text in parentheses) if present
    team1_name = re.sub(r'\s*\([^)]*\)', '', team1_text).strip()
    team2_name = re.sub(r'\s*\([^)]*\)', '', team2_text).strip()
    
    # Fallback: if names are empty or too short, try alternative extraction
    if not team1_name or not team2_name or len(team1_name) < 2 or len(team2_name) < 2:
        print("Using fallback team name extraction...")
        fallback_names = extract_team_names_fallback(soup, match_id)
        if fallback_names[0] and fallback_names[1]:
            team1_name, team2_name = fallback_names
    
    print(f"Teams found: '{team1_name}' vs '{team2_name}'")
    
    # Parse stats in long format
    stats_data = []
    rows = table.find_all('tr')[1:]  # Skip header
    
    i = 0
    while i < len(rows):
        # Each stat has a header row followed by a data row
        if i + 1 < len(rows):
            header_row = rows[i]
            data_row = rows[i + 1]
            
            # Get stat name
            stat_name = header_row.get_text(strip=True)
            
            if stat_name and stat_name != "Cards":
                # Get values for both teams
                data_cells = data_row.find_all('td')
                if len(data_cells) == 2:
                    team1_value = data_cells[0].get_text(strip=True)
                    team2_value = data_cells[1].get_text(strip=True)
                    
                    # Use improved extraction function
                    team1_clean = extract_percentage_or_value(team1_value)
                    team2_clean = extract_percentage_or_value(team2_value)
                    
                    # Add two rows: one for each team (long format)
                    stats_data.append({
                        'match_id': match_id,
                        'team_name': team1_name,
                        'stat_name': stat_name,
                        'stat_value': team1_clean
                    })
                    stats_data.append({
                        'match_id': match_id,
                        'team_name': team2_name,
                        'stat_name': stat_name,
                        'stat_value': team2_clean
                    })
        
        i += 2  # Skip to next stat (header + data)
    
    return pd.DataFrame(stats_data), team1_name, team2_name

print("✅ Team stats scraping functions loaded")

✅ Team stats scraping functions loaded


In [7]:
def scrape_team_stats_extra(soup: BeautifulSoup, team1_name: str, team2_name: str, match_id: str = None) -> Optional[pd.DataFrame]:
    """
    Scrape extra team statistics from match page in long format
    
    Args:
        soup: BeautifulSoup object of the match page
        team1_name: Name of first team
        team2_name: Name of second team
        match_id: Match identifier (URL)
    
    Returns:
        DataFrame with extra stats in long format or None
    """
    team_stats_extra_div = soup.find('div', {'id': 'team_stats_extra'})
    
    if not team_stats_extra_div:
        print("No team_stats_extra div found")
        return None
    
    stats_data = []
    
    # Find all stat containers
    stat_containers = team_stats_extra_div.find_all('div', recursive=False)
    
    for container in stat_containers:
        divs = container.find_all('div')
        if len(divs) >= 3:
            # Each row has: team1_value, stat_name, team2_value pattern
            for i in range(0, len(divs), 3):
                if i + 2 < len(divs):
                    team1_value = divs[i].get_text(strip=True)
                    stat_name = divs[i + 1].get_text(strip=True)
                    team2_value = divs[i + 2].get_text(strip=True)
                    
                    # Skip headers and invalid data
                    if team1_value.isdigit() and team2_value.isdigit():
                        # Add two rows: one for each team (long format)
                        stats_data.append({
                            'match_id': match_id,
                            'team_name': team1_name,
                            'stat_name': stat_name,
                            'stat_value': team1_value
                        })
                        stats_data.append({
                            'match_id': match_id,
                            'team_name': team2_name,
                            'stat_name': stat_name,
                            'stat_value': team2_value
                        })
    
    return pd.DataFrame(stats_data) if stats_data else None

def scrape_match_stats(match_url: str) -> Optional[pd.DataFrame]:
    """
    Scrape all team stats (main + extra) from a single match URL in long format
    
    Args:
        match_url: URL of the match page
    
    Returns:
        Combined DataFrame with all team stats in long format or None
        Format: match_id | team_name | stat_name | stat_value
    """
    print(f"Scraping: {match_url}")
    
    # Fetch the page
    soup = get_page(match_url)
    if not soup:
        print(f"Failed to fetch page: {match_url}")
        return None
    
    # Scrape team stats
    team_stats_result = scrape_team_stats(soup, match_url)
    if team_stats_result is None:
        print("Failed to scrape team stats")
        return None
    
    team_stats_df, team1_name, team2_name = team_stats_result
    
    # Scrape team stats extra
    team_stats_extra_df = scrape_team_stats_extra(soup, team1_name, team2_name, match_url)
    
    # Concatenate the dataframes
    dfs_to_concat = []
    if team_stats_df is not None and len(team_stats_df) > 0:
        dfs_to_concat.append(team_stats_df)
        print(f"Found {len(team_stats_df)} main stats rows")
    
    if team_stats_extra_df is not None and len(team_stats_extra_df) > 0:
        dfs_to_concat.append(team_stats_extra_df)
        print(f"Found {len(team_stats_extra_df)} extra stats rows")
    
    if dfs_to_concat:
        combined_df = pd.concat(dfs_to_concat, ignore_index=True)
        print(f"Total stats collected: {len(combined_df)} rows")
        return combined_df
    else:
        print("No stats available")
        return None

print("✅ Complete scraping functions loaded")

✅ Complete scraping functions loaded


## Data Processing Functions

Functions to filter and process fixture data for inference.

In [8]:
def fixtures_data_to_dataframe_filtered(fixtures_data: Dict, target_season: str = "2024-2025", target_teams: List[str] = None) -> pd.DataFrame:
    """
    Convert fixtures data dictionary to a pandas DataFrame with filtering
    
    Args:
        fixtures_data: Fixtures data from extract_all_team_fixtures()
        target_season: Season to filter for (e.g., "2024-2025")
        target_teams: List of team names to include (None for all)
    
    Returns:
        Filtered DataFrame with one row per match
    """
    all_records = []
    
    for team_id, team_data in fixtures_data.items():
        team_name = team_data['team_name']
        
        # Filter by team if specified
        if target_teams and team_name not in target_teams:
            continue
        
        # Check if target season exists
        if target_season not in team_data.get('seasons_data', {}):
            print(f"⚠️  Season {target_season} not found for team {team_name}")
            continue
            
        season_data = team_data['seasons_data'][target_season]
        
        if season_data and season_data.get('matches'):
            for match in season_data['matches']:
                # Create a record for each match
                record = {
                    'team_id': team_id,
                    'team_name': team_name,
                    'season': target_season
                }
                
                # Add all match data
                record.update(match)
                all_records.append(record)
    
    # Create DataFrame
    df = pd.DataFrame(all_records)
    
    if len(df) > 0:
        # Add full match report URL
        df['full_match_report_url'] = 'https://fbref.com' + df['match_report_href']
        print(f"✅ Filtered to {len(df)} matches for {target_season}")
    else:
        print(f"❌ No matches found for {target_season}")
    
    return df

def get_inference_fixtures(fixtures_data: Dict, inference_teams: Dict, target_season: str = "2024-2025") -> pd.DataFrame:
    """
    Get fixtures specifically for inference teams and season
    
    Args:
        fixtures_data: Complete fixtures data
        inference_teams: Teams dictionary from inference data
        target_season: Season to filter for
    
    Returns:
        DataFrame with inference fixtures
    """
    inference_team_names = list(inference_teams.keys())
    
    print(f"🎯 Filtering for {len(inference_team_names)} inference teams")
    print(f"📅 Target season: {target_season}")
    
    filtered_df = fixtures_data_to_dataframe_filtered(
        fixtures_data, 
        target_season=target_season, 
        target_teams=inference_team_names
    )
    
    return filtered_df

def get_match_urls_from_fixtures(fixtures_df: pd.DataFrame) -> List[str]:
    """
    Extract unique match URLs from fixtures DataFrame
    
    Args:
        fixtures_df: DataFrame with fixtures data
    
    Returns:
        List of unique match URLs
    """
    return fixtures_df['full_match_report_url'].unique().tolist()

print("✅ Data processing functions loaded")

✅ Data processing functions loaded


## Filter Data for Inference

Get the 2024-2025 season matches for our inference teams.

In [12]:
# Filter fixtures for inference
print("🔍 Filtering fixtures for inference...")
inference_fixtures_df = get_inference_fixtures(
    all_fixtures_data, 
    inference_teams, 
    target_season="2024-2025"
)

if not inference_fixtures_df.empty:
    print(f"\n📊 Inference dataset summary:")
    print(f"   • Total matches: {len(inference_fixtures_df)}")
    print(f"   • Unique teams: {inference_fixtures_df['team_name'].nunique()}")
    print(f"   • Competitions: {list(inference_fixtures_df['comp'].unique())}")
    print(f"   • Date range: {inference_fixtures_df['date'].min()} to {inference_fixtures_df['date'].max()}")
    
    # Get unique match URLs
    inference_match_urls = get_match_urls_from_fixtures(inference_fixtures_df)
    print(f"   • Unique matches to scrape: {len(inference_match_urls)}")
    
    # Preview data
    print(f"\n📋 Sample data:")
    print(inference_fixtures_df[['team_name', 'date', 'comp', 'opponent', 'venue']].head())
else:
    print("❌ No fixtures found for inference teams in 2024-2025 season")

🔍 Filtering fixtures for inference...
🎯 Filtering for 20 inference teams
📅 Target season: 2024-2025
✅ Filtered to 996 matches for 2024-2025

📊 Inference dataset summary:
   • Total matches: 996
   • Unique teams: 20
   • Competitions: ['Premier League', 'Champions Lg', 'EFL Cup', 'FA Cup', 'Championship', 'Conf Lg', 'FA Community Shield', 'Europa Lg']
   • Date range: 2024-08-10 to 2025-05-28
   • Unique matches to scrape: 685

📋 Sample data:
  team_name        date            comp     opponent venue
0   Arsenal  2024-08-17  Premier League       Wolves  Home
1   Arsenal  2024-08-24  Premier League  Aston Villa  Away
2   Arsenal  2024-08-31  Premier League     Brighton  Home
3   Arsenal  2024-09-15  Premier League    Tottenham  Away
4   Arsenal  2024-09-19    Champions Lg  it Atalanta  Away


## Enhanced Scraper for Inference

Set up the enhanced scraper optimized for inference workload.

In [19]:
# Multiple User-Agents for rotation
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
]

class InferenceScraper:
    """Enhanced scraper optimized for inference workload"""
    
    def __init__(self, 
                 min_delay: float = 5.0,
                 max_delay: float = 12.0,
                 chunk_size: int = 25,
                 chunk_break: float = 60.0):  # 1 minute break between chunks
        """
        Initialize inference scraper with conservative settings
        
        Args:
            min_delay: Minimum delay between requests (seconds)
            max_delay: Maximum delay between requests (seconds) 
            chunk_size: Number of requests per chunk before break
            chunk_break: Break time between chunks (seconds)
        """
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.chunk_size = chunk_size
        self.chunk_break = chunk_break
        
        # Request counter for chunking
        self.request_count = 0
        
    def _get_random_headers(self) -> Dict[str, str]:
        """Get random headers for request"""
        return {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
    
    def _wait_between_requests(self):
        """Handle delays and chunking"""
        # Random delay between requests
        delay = random.uniform(self.min_delay, self.max_delay)
        print(f"⏱️  Waiting {delay:.1f} seconds...")
        time.sleep(delay)
        
        # Increment request counter
        self.request_count += 1
        
        # Check if we need a chunk break
        if self.request_count % self.chunk_size == 0:
            print(f"\n🛑 Chunk break after {self.request_count} requests")
            print(f"⏰ Waiting {self.chunk_break/60:.1f} minutes before continuing...")
            time.sleep(self.chunk_break)
            print("🚀 Resuming scraping...\n")
    
    def get_page_enhanced(self, url: str) -> Optional[BeautifulSoup]:
        """Enhanced page fetching with anti-blocking measures"""
        self._wait_between_requests()
        headers = self._get_random_headers()
        
        try:
            response = requests.get(url, headers=headers, timeout=30)
            
            # Check for rate limiting
            if response.status_code == 429:
                print(f"⚠️  Rate limited. Waiting longer...")
                time.sleep(120)  # Wait 2 minutes for rate limit
                return None
            
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
            
        except requests.RequestException as e:
            print(f"❌ Error fetching {url}: {e}")
            return None
    
    def scrape_match_stats_enhanced(self, match_url: str) -> Optional[pd.DataFrame]:
        """Enhanced version of scrape_match_stats for inference"""
        print(f"🔍 Scraping: {match_url}")
        
        # Fetch the page with enhanced measures
        soup = self.get_page_enhanced(match_url)
        if not soup:
            print(f"❌ Failed to fetch page: {match_url}")
            return None
        
        # Use existing scraping logic
        team_stats_result = scrape_team_stats(soup, match_url)
        if team_stats_result is None:
            print("❌ Failed to scrape team stats")
            return None
        
        team_stats_df, team1_name, team2_name = team_stats_result
        
        # Scrape team stats extra
        team_stats_extra_df = scrape_team_stats_extra(soup, team1_name, team2_name, match_url)
        
        # Combine results
        dfs_to_concat = []
        if team_stats_df is not None and len(team_stats_df) > 0:
            dfs_to_concat.append(team_stats_df)
            print(f"✅ Found {len(team_stats_df)} main stats rows")
        
        if team_stats_extra_df is not None and len(team_stats_extra_df) > 0:
            dfs_to_concat.append(team_stats_extra_df)
            print(f"✅ Found {len(team_stats_extra_df)} extra stats rows")
        
        if dfs_to_concat:
            combined_df = pd.concat(dfs_to_concat, ignore_index=True)
            print(f"📊 Total stats collected: {len(combined_df)} rows")
            return combined_df
        else:
            print("⚠️  No stats available")
            return None

# Initialize inference scraper
inference_scraper = InferenceScraper(
    min_delay=5.0,       # 5-12 second delays
    max_delay=12.0,
    chunk_size=250,       # Break every 250 requests
    chunk_break=500.0     # 5 minute breaks
)

print("✅ Inference scraper initialized")

✅ Inference scraper initialized


## Batch Processing for Inference

Function to scrape multiple matches with progress tracking.

In [14]:
def scrape_inference_matches(match_urls: List[str], 
                           max_matches: Optional[int] = None,
                           save_progress: bool = True,
                           output_dir: str = '../../../data/prod/inference/raw/match_stats/') -> pd.DataFrame:
    """
    Scrape match stats for inference with progress tracking
    
    Args:
        match_urls: List of match URLs to scrape
        max_matches: Maximum number of matches to process
        save_progress: Whether to save progress periodically
        output_dir: Directory to save progress files
    
    Returns:
        Combined DataFrame with all match stats for inference
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    all_stats = []
    failed_urls = []
    
    # Limit matches if specified
    urls_to_process = match_urls[:max_matches] if max_matches else match_urls
    
    print(f"🚀 Starting inference scraping for {len(urls_to_process)} matches")
    print(f"⚙️  Settings: {inference_scraper.min_delay}-{inference_scraper.max_delay}s delays, {inference_scraper.chunk_size} requests per chunk")
    
    start_time = time.time()
    
    for i, match_url in enumerate(urls_to_process, 1):
        print(f"\n[{i}/{len(urls_to_process)}] Processing match...")
        
        try:
            stats_df = inference_scraper.scrape_match_stats_enhanced(match_url)
            
            if stats_df is not None:
                all_stats.append(stats_df)
                print(f"✅ Successfully scraped {len(stats_df)} stats")
            else:
                failed_urls.append(match_url)
                print(f"❌ Failed to scrape stats")
        
        except Exception as e:
            print(f"❌ Error processing {match_url}: {e}")
            failed_urls.append(match_url)
        
        # Save progress every 10 matches
        if save_progress and i % 10 == 0 and all_stats:
            progress_df = pd.concat(all_stats, ignore_index=True)
            progress_file = f"{output_dir}progress_inference_{i}.json"
            progress_df.to_json(progress_file, orient='records', indent=2)
            
            elapsed = time.time() - start_time
            rate = i / elapsed * 3600  # matches per hour
            print(f"💾 Progress saved: {progress_file}")
            print(f"📈 Rate: {rate:.1f} matches/hour")
    
    # Final results
    if all_stats:
        final_df = pd.concat(all_stats, ignore_index=True)
        
        elapsed = time.time() - start_time
        total_time = elapsed / 3600  # hours
        
        print(f"\n🎉 INFERENCE SCRAPING COMPLETE!")
        print(f"✅ Successfully processed: {len(all_stats)} matches")
        print(f"❌ Failed to process: {len(failed_urls)} matches")
        print(f"📊 Total stats collected: {len(final_df)} rows")
        print(f"⏰ Total time: {total_time:.2f} hours")
        print(f"📈 Average rate: {len(all_stats)/total_time:.1f} matches/hour")
        
        # Save failed URLs
        if failed_urls:
            failed_file = f"{output_dir}failed_urls_inference_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            with open(failed_file, 'w') as f:
                json.dump(failed_urls, f, indent=2)
            print(f"📝 Failed URLs saved: {failed_file}")
        
        return final_df
    else:
        print("❌ No matches were successfully processed")
        return pd.DataFrame()

def save_inference_results(df: pd.DataFrame, output_dir: str = '../../../data/prod/inference/raw/match_stats/') -> None:
    """
    Save inference results in multiple formats
    
    Args:
        df: DataFrame with inference match stats
        output_dir: Directory to save files
    """
    os.makedirs(output_dir, exist_ok=True)
    
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    base_filename = f"inference_match_stats_2024_2025_{timestamp}"
    
    # Save as JSON
    json_file = f"{output_dir}{base_filename}.json"
    df.to_json(json_file, orient='records', indent=2)
    print(f"📁 JSON saved: {json_file}")
    
    # Save as CSV
    csv_file = f"{output_dir}{base_filename}.csv"
    df.to_csv(csv_file, index=False)
    print(f"📁 CSV saved: {csv_file}")
    
    # Save as Parquet (efficient for large datasets)
    parquet_file = f"{output_dir}{base_filename}.parquet"
    df.to_parquet(parquet_file, index=False)
    print(f"📁 Parquet saved: {parquet_file}")

print("✅ Batch processing functions loaded")

✅ Batch processing functions loaded


## Run Inference Scraping

Execute the scraping process for 2024-2025 season matches.

In [15]:
# Test with a small sample first (RECOMMENDED)
print("🧪 TESTING INFERENCE SCRAPER WITH SMALL SAMPLE")
print("=" * 50)

if 'inference_match_urls' in locals() and len(inference_match_urls) > 0:
    # Test with just 3 matches first
    test_urls = inference_match_urls[:3]
    print(f"Testing with {len(test_urls)} matches...")
    
    # Run test
    test_stats_df = scrape_inference_matches(
        test_urls, 
        max_matches=3,
        output_dir='../../../data/prod/inference/raw/match_stats/test/'
    )
    
    if not test_stats_df.empty:
        print(f"\n✅ TEST SUCCESSFUL!")
        print(f"Collected {len(test_stats_df)} stats rows")
        print(f"\nSample data:")
        print(test_stats_df[['match_id', 'team_name', 'stat_name', 'stat_value']].head(10))
        
        # Save test results
        save_inference_results(test_stats_df, '../../../data/prod/inference/raw/match_stats/test/')
    else:
        print("\n❌ TEST FAILED - Check the output above for errors")
else:
    print("❌ No match URLs available for testing")

🧪 TESTING INFERENCE SCRAPER WITH SMALL SAMPLE
Testing with 3 matches...
🚀 Starting inference scraping for 3 matches
⚙️  Settings: 5.0-12.0s delays, 25 requests per chunk

[1/3] Processing match...
🔍 Scraping: https://fbref.com/en/matches/c0e3342a/Arsenal-Wolverhampton-Wanderers-August-17-2024-Premier-League
⏱️  Waiting 10.9 seconds...
Teams found: 'Arsenal' vs 'Wolves'
✅ Found 8 main stats rows
✅ Found 24 extra stats rows
📊 Total stats collected: 32 rows
✅ Successfully scraped 32 stats

[2/3] Processing match...
🔍 Scraping: https://fbref.com/en/matches/4692171a/Aston-Villa-Arsenal-August-24-2024-Premier-League
⏱️  Waiting 5.8 seconds...
Teams found: 'Aston Villa' vs 'Arsenal'
✅ Found 8 main stats rows
✅ Found 24 extra stats rows
📊 Total stats collected: 32 rows
✅ Successfully scraped 32 stats

[3/3] Processing match...
🔍 Scraping: https://fbref.com/en/matches/a843d023/Arsenal-Brighton-and-Hove-Albion-August-31-2024-Premier-League
⏱️  Waiting 9.1 seconds...
Teams found: 'Arsenal' vs 'Br

In [20]:
# Production inference scraping (uncomment to run)
print("🚀 PRODUCTION INFERENCE SCRAPING")
print("=" * 40)

if 'inference_match_urls' in locals() and len(inference_match_urls) > 0:
    print(f"📊 Total matches to scrape: {len(inference_match_urls)}")
    print(f"⏰ Estimated time: ~{len(inference_match_urls) * 8.5 / 3600:.1f} hours")
    print(f"⚙️  Settings: {inference_scraper.min_delay}-{inference_scraper.max_delay}s delays")
    print(f"🛑 Breaks: {inference_scraper.chunk_break/60:.0f} min every {inference_scraper.chunk_size} requests")
    
    # UNCOMMENT THE LINES BELOW TO START PRODUCTION SCRAPING
    
    inference_stats_df = scrape_inference_matches(
        inference_match_urls,
        max_matches=None,  # Process all matches
        output_dir='../../../data/prod/inference/raw/match_stats/'
    )
    
    # Save final results
    if not inference_stats_df.empty:
        save_inference_results(inference_stats_df)
        print(f"\n🎉 INFERENCE COMPLETE! Final dataset: {len(inference_stats_df)} rows")
    
#    print("\n⚠️  PRODUCTION SCRAPING IS COMMENTED OUT")
#    print("Uncomment the lines above to start the full inference scraping process")
else:
    print("❌ No match URLs available for inference")

🚀 PRODUCTION INFERENCE SCRAPING
📊 Total matches to scrape: 685
⏰ Estimated time: ~1.6 hours
⚙️  Settings: 5.0-12.0s delays
🛑 Breaks: 8 min every 250 requests
🚀 Starting inference scraping for 685 matches
⚙️  Settings: 5.0-12.0s delays, 250 requests per chunk

[1/685] Processing match...
🔍 Scraping: https://fbref.com/en/matches/c0e3342a/Arsenal-Wolverhampton-Wanderers-August-17-2024-Premier-League
⏱️  Waiting 11.4 seconds...
Teams found: 'Arsenal' vs 'Wolves'
✅ Found 8 main stats rows
✅ Found 24 extra stats rows
📊 Total stats collected: 32 rows
✅ Successfully scraped 32 stats

[2/685] Processing match...
🔍 Scraping: https://fbref.com/en/matches/4692171a/Aston-Villa-Arsenal-August-24-2024-Premier-League
⏱️  Waiting 9.2 seconds...
Teams found: 'Aston Villa' vs 'Arsenal'
✅ Found 8 main stats rows
✅ Found 24 extra stats rows
📊 Total stats collected: 32 rows
✅ Successfully scraped 32 stats

[3/685] Processing match...
🔍 Scraping: https://fbref.com/en/matches/a843d023/Arsenal-Brighton-and-Hov

## Results Summary

Summary of the inference scraping results and next steps.

In [21]:
# Check existing inference results
results_dir = '../../../data/prod/inference/raw/match_stats/'

if os.path.exists(results_dir):
    result_files = [f for f in os.listdir(results_dir) if f.endswith(('.json', '.csv', '.parquet'))]
    
    if result_files:
        print("📁 Existing inference results:")
        for file in sorted(result_files):
            file_path = os.path.join(results_dir, file)
            size = os.path.getsize(file_path) / 1024  # KB
            print(f"   • {file} ({size:.1f} KB)")
        
        # Load latest results if available
        latest_json = [f for f in result_files if f.endswith('.json') and 'inference_match_stats' in f]
        if latest_json:
            latest_file = sorted(latest_json)[-1]
            latest_path = os.path.join(results_dir, latest_file)
            
            print(f"\n📊 Loading latest results: {latest_file}")
            latest_df = pd.read_json(latest_path)
            
            print(f"\n📈 Results summary:")
            print(f"   • Total rows: {len(latest_df):,}")
            print(f"   • Unique matches: {latest_df['match_id'].nunique():,}")
            print(f"   • Teams covered: {latest_df['team_name'].nunique()}")
            print(f"   • Stat types: {latest_df['stat_name'].nunique()}")
            
            # Show team distribution
            team_counts = latest_df['team_name'].value_counts()
            print(f"\n🏆 Team coverage:")
            print(team_counts.head(10).to_string())
    else:
        print("📂 No inference results found yet")
else:
    print("📂 Results directory does not exist yet")

print("\n✅ Inference scraper ready for use!")
print("\n📋 Next steps:")
print("   1. Test with small sample (already set up above)")
print("   2. Uncomment production scraping code to run full inference")
print("   3. Results will be saved in multiple formats for downstream use")
print("   4. Use the scraped data for model inference and predictions")

📁 Existing inference results:
   • failed_urls_inference_20250814_232352.json (3.6 KB)
   • inference_match_stats_2024_2025_20250814_232352.csv (2313.3 KB)
   • inference_match_stats_2024_2025_20250814_232352.json (4020.0 KB)
   • inference_match_stats_2024_2025_20250814_232352.parquet (55.9 KB)
   • progress_inference_10.json (62.1 KB)
   • progress_inference_100.json (605.0 KB)
   • progress_inference_110.json (664.1 KB)
   • progress_inference_120.json (725.9 KB)
   • progress_inference_130.json (792.8 KB)
   • progress_inference_140.json (848.0 KB)
   • progress_inference_150.json (906.7 KB)
   • progress_inference_160.json (965.2 KB)
   • progress_inference_170.json (1023.4 KB)
   • progress_inference_180.json (1082.5 KB)
   • progress_inference_190.json (1146.8 KB)
   • progress_inference_20.json (123.5 KB)
   • progress_inference_200.json (1206.1 KB)
   • progress_inference_210.json (1268.5 KB)
   • progress_inference_220.json (1326.4 KB)
   • progress_inference_230.json (1389.5

In [26]:
# 1. Load progress file (example)
progress_file = '../../../data/prod/inference/raw/match_stats/inference_match_stats_2024_2025.json'
if os.path.exists(progress_file):
    completed_df = pd.read_json(progress_file)
    completed_urls = completed_df['match_id'].unique()
    remaining_urls = [url for url in inference_match_urls if url not in completed_urls]
    print(f"Found {len(completed_urls)} completed matches")
    print(f"Remaining: {len(remaining_urls)} matches")
else:
    remaining_urls = inference_match_urls
    print("No progress file found, starting from beginning")

# 2. Continue with remaining URLs
enhanced_stats_df = scrape_inference_matches(
    remaining_urls,
    output_dir='../../../data/prod/inference/raw/match_stats/rescraped_matches/'
)


print("Resume functionality available - uncomment code above if needed")

Found 647 completed matches
Remaining: 38 matches
🚀 Starting inference scraping for 38 matches
⚙️  Settings: 5.0-12.0s delays, 250 requests per chunk

[1/38] Processing match...
🔍 Scraping: https://fbref.com/en/matches/8323ca40/Arsenal-Manchester-United-January-12-2025-FA-Cup
⏱️  Waiting 5.2 seconds...
Teams found: 'Arsenal' vs 'Manchester Utd'
No team_stats_extra div found
⚠️  No stats available
❌ Failed to scrape stats

[2/38] Processing match...
🔍 Scraping: https://fbref.com/en/matches/027fb10f/Aston-Villa-West-Ham-United-January-10-2025-FA-Cup
⏱️  Waiting 8.0 seconds...
Teams found: 'Aston Villa' vs 'West Ham'
No team_stats_extra div found
⚠️  No stats available
❌ Failed to scrape stats

[3/38] Processing match...
🔍 Scraping: https://fbref.com/en/matches/955c44a4/Bournemouth-West-Bromwich-Albion-January-11-2025-FA-Cup
⏱️  Waiting 5.2 seconds...
Teams found: 'Bournemouth' vs 'West Brom'
No team_stats_extra div found
⚠️  No stats available
❌ Failed to scrape stats

[4/38] Processing 