# Match Stats Scraper - Production Scale

This notebook contains clean, scalable functions to scrape match statistics from FBRef for multiple matches efficiently.

In [20]:
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from urllib.parse import urljoin, urlparse
from typing import Optional, List, Dict, Tuple
import os
from datetime import datetime

# Headers to appear more like a regular browser
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

## Core Web Scraping Functions

In [21]:
def get_page(url: str, delay_range: Tuple[float, float] = (2, 4)) -> Optional[BeautifulSoup]:
    """
    Fetch page with error handling and rate limiting
    
    Args:
        url: URL to fetch
        delay_range: Tuple of (min_delay, max_delay) in seconds
    
    Returns:
        BeautifulSoup object or None if failed
    """
    time.sleep(random.uniform(*delay_range))  # Be respectful - random delay
    
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def extract_percentage_or_value(text: str) -> str:
    """
    Extract percentage first, if not found then extract first number
    Prioritizes percentage values over other numbers
    """
    # First try to find percentage
    percentage_match = re.search(r'(\d+(?:\.\d+)?%)', text)
    if percentage_match:
        return percentage_match.group(1)
    
    # If no percentage, try to find any number
    number_match = re.search(r'(\d+(?:\.\d+)?)', text)
    if number_match:
        return number_match.group(1)
    
    # If nothing found, return original text
    return text

## Data Loading Functions

In [22]:
def load_fixtures_from_json(json_filename: str) -> pd.DataFrame:
    """
    Load fixtures data from JSON file and convert to DataFrame
    
    Args:
        json_filename: Path to JSON file
    
    Returns:
        DataFrame with fixtures data
    """
    with open(json_filename, 'r', encoding='utf-8') as f:
        fixtures_data = json.load(f)
    
    return fixtures_data_to_dataframe(fixtures_data)

def fixtures_data_to_dataframe(fixtures_data: Dict) -> pd.DataFrame:
    """
    Convert fixtures data dictionary to a pandas DataFrame
    
    Args:
        fixtures_data: Fixtures data from extract_all_team_fixtures()
    
    Returns:
        Flattened DataFrame with one row per match
    """
    all_records = []
    
    for team_id, team_data in fixtures_data.items():
        team_name = team_data['team_name']
        
        for season, season_data in team_data['seasons_data'].items():
            if season_data and season_data.get('matches'):
                
                for match in season_data['matches']:
                    # Create a record for each match
                    record = {
                        'team_id': team_id,
                        'team_name': team_name,
                        'season': season
                    }
                    
                    # Add all match data
                    record.update(match)
                    all_records.append(record)
    
    # Create DataFrame
    df = pd.DataFrame(all_records)
    
    if len(df) > 0:
        # Add full match report URL
        df['full_match_report_url'] = 'https://fbref.com' + df['match_report_href']
    
    return df

## Match Statistics Scraping Functions

In [23]:
def scrape_team_stats(soup: BeautifulSoup, match_id: str = None) -> Optional[Tuple[pd.DataFrame, str, str]]:
    """
    Scrape main team statistics from match page in long format
    
    Args:
        soup: BeautifulSoup object of the match page
        match_id: Match identifier (URL)
    
    Returns:
        Tuple of (DataFrame with stats in long format, team1_name, team2_name) or None
    """
    team_stats_div = soup.find('div', {'id': 'team_stats'})
    
    if not team_stats_div:
        print("No team_stats div found")
        return None
    
    table = team_stats_div.find('table')
    if not table:
        print("No table found in team_stats div")
        return None
    
    # Extract team names from header
    header_row = table.find('tr')
    team_cells = header_row.find_all('th')
    team1_name = team_cells[0].get_text(strip=True).split()[0]
    team2_name = team_cells[1].get_text(strip=True).split()[-1]
    
    # Parse stats in long format
    stats_data = []
    rows = table.find_all('tr')[1:]  # Skip header
    
    i = 0
    while i < len(rows):
        # Each stat has a header row followed by a data row
        if i + 1 < len(rows):
            header_row = rows[i]
            data_row = rows[i + 1]
            
            # Get stat name
            stat_name = header_row.get_text(strip=True)
            
            if stat_name and stat_name != "Cards":
                # Get values for both teams
                data_cells = data_row.find_all('td')
                if len(data_cells) == 2:
                    team1_value = data_cells[0].get_text(strip=True)
                    team2_value = data_cells[1].get_text(strip=True)
                    
                    # Use improved extraction function
                    team1_clean = extract_percentage_or_value(team1_value)
                    team2_clean = extract_percentage_or_value(team2_value)
                    
                    # Add two rows: one for each team (long format)
                    stats_data.append({
                        'match_id': match_id,
                        'team_name': team1_name,
                        'stat_name': stat_name,
                        'stat_value': team1_clean
                    })
                    stats_data.append({
                        'match_id': match_id,
                        'team_name': team2_name,
                        'stat_name': stat_name,
                        'stat_value': team2_clean
                    })
        
        i += 2  # Skip to next stat (header + data)
    
    return pd.DataFrame(stats_data), team1_name, team2_name

def scrape_team_stats_extra(soup: BeautifulSoup, team1_name: str, team2_name: str, match_id: str = None) -> Optional[pd.DataFrame]:
    """
    Scrape extra team statistics from match page in long format
    
    Args:
        soup: BeautifulSoup object of the match page
        team1_name: Name of first team
        team2_name: Name of second team
        match_id: Match identifier (URL)
    
    Returns:
        DataFrame with extra stats in long format or None
    """
    team_stats_extra_div = soup.find('div', {'id': 'team_stats_extra'})
    
    if not team_stats_extra_div:
        print("No team_stats_extra div found")
        return None
    
    stats_data = []
    
    # Find all stat containers
    stat_containers = team_stats_extra_div.find_all('div', recursive=False)
    
    for container in stat_containers:
        divs = container.find_all('div')
        if len(divs) >= 3:
            # Each row has: team1_value, stat_name, team2_value pattern
            for i in range(0, len(divs), 3):
                if i + 2 < len(divs):
                    team1_value = divs[i].get_text(strip=True)
                    stat_name = divs[i + 1].get_text(strip=True)
                    team2_value = divs[i + 2].get_text(strip=True)
                    
                    # Skip headers and invalid data
                    if team1_value.isdigit() and team2_value.isdigit():
                        # Add two rows: one for each team (long format)
                        stats_data.append({
                            'match_id': match_id,
                            'team_name': team1_name,
                            'stat_name': stat_name,
                            'stat_value': team1_value
                        })
                        stats_data.append({
                            'match_id': match_id,
                            'team_name': team2_name,
                            'stat_name': stat_name,
                            'stat_value': team2_value
                        })
    
    return pd.DataFrame(stats_data) if stats_data else None

## Combined Scraping Functions

In [24]:
def scrape_match_stats(match_url: str) -> Optional[pd.DataFrame]:
    """
    Scrape all team stats (main + extra) from a single match URL in long format
    
    Args:
        match_url: URL of the match page
    
    Returns:
        Combined DataFrame with all team stats in long format or None
        Format: match_id | team_name | stat_name | stat_value
    """
    print(f"Scraping: {match_url}")
    
    # Fetch the page
    soup = get_page(match_url)
    if not soup:
        print(f"Failed to fetch page: {match_url}")
        return None
    
    # Scrape team stats
    team_stats_result = scrape_team_stats(soup, match_url)
    if team_stats_result is None:
        print("Failed to scrape team stats")
        return None
    
    team_stats_df, team1_name, team2_name = team_stats_result
    
    # Scrape team stats extra
    team_stats_extra_df = scrape_team_stats_extra(soup, team1_name, team2_name, match_url)
    
    # Concatenate the dataframes
    dfs_to_concat = []
    if team_stats_df is not None and len(team_stats_df) > 0:
        dfs_to_concat.append(team_stats_df)
        print(f"Found {len(team_stats_df)} main stats rows")
    
    if team_stats_extra_df is not None and len(team_stats_extra_df) > 0:
        dfs_to_concat.append(team_stats_extra_df)
        print(f"Found {len(team_stats_extra_df)} extra stats rows")
    
    if dfs_to_concat:
        combined_df = pd.concat(dfs_to_concat, ignore_index=True)
        print(f"Total stats collected: {len(combined_df)} rows")
        return combined_df
    else:
        print("No stats available")
        return None

## Batch Processing Functions

In [25]:
def scrape_multiple_matches(match_urls: List[str], 
                          max_matches: Optional[int] = None,
                          save_progress: bool = True,
                          output_dir: str = '../../data/raw/match_stats/') -> pd.DataFrame:
    """
    Scrape stats from multiple matches with progress saving
    
    Args:
        match_urls: List of match URLs to scrape
        max_matches: Maximum number of matches to process (None for all)
        save_progress: Whether to save progress periodically
        output_dir: Directory to save progress files
    
    Returns:
        Combined DataFrame with all match stats
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    all_stats = []
    failed_urls = []
    
    # Limit matches if specified
    urls_to_process = match_urls[:max_matches] if max_matches else match_urls
    
    print(f"Processing {len(urls_to_process)} matches...")
    
    for i, match_url in enumerate(urls_to_process, 1):
        print(f"\n[{i}/{len(urls_to_process)}] Processing match...")
        
        try:
            stats_df = scrape_match_stats(match_url)
            
            if stats_df is not None:
                all_stats.append(stats_df)
                print(f"✅ Successfully scraped {len(stats_df)} stats")
            else:
                failed_urls.append(match_url)
                print(f"❌ Failed to scrape stats")
        
        except Exception as e:
            print(f"❌ Error processing {match_url}: {e}")
            failed_urls.append(match_url)
        
        # Save progress every 10 matches
        if save_progress and i % 10 == 0 and all_stats:
            progress_df = pd.concat(all_stats, ignore_index=True)
            progress_file = f"{output_dir}progress_match_stats_{i}.json"
            progress_df.to_json(progress_file, orient='records', indent=2)
            print(f"💾 Progress saved: {progress_file}")
    
    # Combine all results
    if all_stats:
        final_df = pd.concat(all_stats, ignore_index=True)
        print(f"\n✅ Successfully processed {len(all_stats)} matches")
        print(f"❌ Failed to process {len(failed_urls)} matches")
        print(f"📊 Total stats collected: {len(final_df)}")
        
        # Save failed URLs for retry
        if failed_urls:
            failed_file = f"{output_dir}failed_urls_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            with open(failed_file, 'w') as f:
                json.dump(failed_urls, f, indent=2)
            print(f"📝 Failed URLs saved: {failed_file}")
        
        return final_df
    else:
        print("❌ No matches were successfully processed")
        return pd.DataFrame()

## Utility Functions

In [26]:
def save_match_stats(df: pd.DataFrame, filename: str, output_dir: str = '../../data/raw/match_stats/') -> None:
    """
    Save match stats DataFrame to JSON file
    
    Args:
        df: DataFrame to save
        filename: Name of output file (without extension)
        output_dir: Directory to save file
    """
    os.makedirs(output_dir, exist_ok=True)
    
    filepath = f"{output_dir}{filename}.json"
    df.to_json(filepath, orient='records', indent=2)
    print(f"📁 Match stats saved: {filepath}")

def filter_fixtures_by_criteria(fixtures_df: pd.DataFrame, 
                               teams: Optional[List[str]] = None,
                               seasons: Optional[List[str]] = None,
                               competitions: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Filter fixtures DataFrame by various criteria
    
    Args:
        fixtures_df: DataFrame with fixtures data
        teams: List of team names to include
        seasons: List of seasons to include
        competitions: List of competitions to include
    
    Returns:
        Filtered DataFrame
    """
    filtered_df = fixtures_df.copy()
    
    if teams:
        filtered_df = filtered_df[filtered_df['team_name'].isin(teams)]
    
    if seasons:
        filtered_df = filtered_df[filtered_df['season'].isin(seasons)]
    
    if competitions:
        filtered_df = filtered_df[filtered_df['comp'].isin(competitions)]
    
    print(f"Filtered to {len(filtered_df)} matches")
    return filtered_df

def get_match_urls_from_fixtures(fixtures_df: pd.DataFrame) -> List[str]:
    """
    Extract match URLs from fixtures DataFrame
    
    Args:
        fixtures_df: DataFrame with fixtures data
    
    Returns:
        List of unique match URLs
    """
    return fixtures_df['full_match_report_url'].unique().tolist()

## Improved Anti-Blocking Measures

Enhanced scraping functions with better rate limiting, user-agent rotation, session management, and retry mechanisms to handle large-scale scraping (3000+ requests).

In [27]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import random
import time
from typing import List, Optional, Dict
import json

# Multiple User-Agents for rotation
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
]

class EnhancedScraper:
    """Enhanced scraper with anti-blocking measures"""
    
    def __init__(self, 
                 min_delay: float = 15.0,
                 max_delay: float = 30.0,
                 max_retries: int = 3,
                 backoff_factor: float = 2.0,
                 chunk_size: int = 50,
                 chunk_break: float = 300.0):  # 5 minute break between chunks
        """
        Initialize enhanced scraper
        
        Args:
            min_delay: Minimum delay between requests (seconds)
            max_delay: Maximum delay between requests (seconds) 
            max_retries: Maximum retry attempts for failed requests
            backoff_factor: Exponential backoff multiplier
            chunk_size: Number of requests per chunk before long break
            chunk_break: Break time between chunks (seconds)
        """
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        self.chunk_size = chunk_size
        self.chunk_break = chunk_break
        
        # Request counter for chunking
        self.request_count = 0
        
        # Create session with retry strategy
        self.session = self._create_session()
        
    def _create_session(self) -> requests.Session:
        """Create requests session with retry strategy"""
        session = requests.Session()
        
        # Retry strategy
        retry_strategy = Retry(
            total=self.max_retries,
            status_forcelist=[429, 500, 502, 503, 504],
            backoff_factor=self.backoff_factor,
            respect_retry_after_header=True
        )
        
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        
        return session
    
    def _get_random_headers(self) -> Dict[str, str]:
        """Get random headers for request"""
        return {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
    
    def _wait_between_requests(self):
        """Handle delays and chunking"""
        # Random delay between requests
        delay = random.uniform(self.min_delay, self.max_delay)
        print(f"⏱️  Waiting {delay:.1f} seconds...")
        time.sleep(delay)
        
        # Increment request counter
        self.request_count += 1
        
        # Check if we need a chunk break
        if self.request_count % self.chunk_size == 0:
            print(f"\n🛑 Chunk break after {self.request_count} requests")
            print(f"⏰ Waiting {self.chunk_break/60:.1f} minutes before continuing...")
            time.sleep(self.chunk_break)
            print("🚀 Resuming scraping...\n")
    
    def get_page_enhanced(self, url: str) -> Optional[BeautifulSoup]:
        """
        Enhanced page fetching with anti-blocking measures
        
        Args:
            url: URL to fetch
            
        Returns:
            BeautifulSoup object or None if failed
        """
        self._wait_between_requests()
        
        headers = self._get_random_headers()
        
        try:
            response = self.session.get(url, headers=headers, timeout=30)
            
            # Check for rate limiting
            if response.status_code == 429:
                print(f"⚠️  Rate limited. Waiting longer...")
                time.sleep(60)  # Wait 1 minute for rate limit
                return None
            
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
            
        except requests.RequestException as e:
            print(f"❌ Error fetching {url}: {e}")
            
            # Exponential backoff for retries
            if "429" in str(e) or "rate" in str(e).lower():
                wait_time = 60 * (2 ** (self.request_count % 3))  # Exponential backoff
                print(f"⏰ Rate limited. Waiting {wait_time/60:.1f} minutes...")
                time.sleep(wait_time)
            
            return None
    
    def scrape_match_stats_enhanced(self, match_url: str) -> Optional[pd.DataFrame]:
        """
        Enhanced version of scrape_match_stats with anti-blocking measures
        """
        print(f"🔍 Scraping: {match_url}")
        
        # Fetch the page with enhanced measures
        soup = self.get_page_enhanced(match_url)
        if not soup:
            print(f"❌ Failed to fetch page: {match_url}")
            return None
        
        # Use existing scraping logic
        team_stats_result = scrape_team_stats(soup, match_url)
        if team_stats_result is None:
            print("❌ Failed to scrape team stats")
            return None
        
        team_stats_df, team1_name, team2_name = team_stats_result
        
        # Scrape team stats extra
        team_stats_extra_df = scrape_team_stats_extra(soup, team1_name, team2_name, match_url)
        
        # Combine results
        dfs_to_concat = []
        if team_stats_df is not None and len(team_stats_df) > 0:
            dfs_to_concat.append(team_stats_df)
            print(f"✅ Found {len(team_stats_df)} main stats rows")
        
        if team_stats_extra_df is not None and len(team_stats_extra_df) > 0:
            dfs_to_concat.append(team_stats_extra_df)
            print(f"✅ Found {len(team_stats_extra_df)} extra stats rows")
        
        if dfs_to_concat:
            combined_df = pd.concat(dfs_to_concat, ignore_index=True)
            print(f"📊 Total stats collected: {len(combined_df)} rows")
            return combined_df
        else:
            print("⚠️  No stats available")
            return None

# Initialize enhanced scraper
enhanced_scraper = EnhancedScraper(
    min_delay=15.0,      # 15-30 second delays
    max_delay=30.0,
    chunk_size=50,       # Break every 50 requests
    chunk_break=300.0    # 5 minute breaks
)

print("✅ Enhanced scraper initialized with anti-blocking measures")

✅ Enhanced scraper initialized with anti-blocking measures


In [28]:
def scrape_multiple_matches_enhanced(match_urls: List[str], 
                                   max_matches: Optional[int] = None,
                                   save_progress: bool = True,
                                   output_dir: str = '../../data/raw/match_stats/',
                                   scraper: EnhancedScraper = None) -> pd.DataFrame:
    """
    Enhanced batch scraping with anti-blocking measures
    
    Args:
        match_urls: List of match URLs to scrape
        max_matches: Maximum number of matches to process
        save_progress: Whether to save progress periodically
        output_dir: Directory to save progress files
        scraper: EnhancedScraper instance to use
    
    Returns:
        Combined DataFrame with all match stats
    """
    if scraper is None:
        scraper = enhanced_scraper
    
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    all_stats = []
    failed_urls = []
    
    # Limit matches if specified
    urls_to_process = match_urls[:max_matches] if max_matches else match_urls
    
    print(f"🚀 Starting enhanced scraping for {len(urls_to_process)} matches")
    print(f"⚙️  Settings: {scraper.min_delay}-{scraper.max_delay}s delays, {scraper.chunk_size} requests per chunk")
    
    start_time = time.time()
    
    for i, match_url in enumerate(urls_to_process, 1):
        print(f"\n[{i}/{len(urls_to_process)}] Processing match...")
        
        try:
            stats_df = scraper.scrape_match_stats_enhanced(match_url)
            
            if stats_df is not None:
                all_stats.append(stats_df)
                print(f"✅ Successfully scraped {len(stats_df)} stats")
            else:
                failed_urls.append(match_url)
                print(f"❌ Failed to scrape stats")
        
        except Exception as e:
            print(f"❌ Error processing {match_url}: {e}")
            failed_urls.append(match_url)
        
        # Save progress every 20 matches (more frequent saves)
        if save_progress and i % 20 == 0 and all_stats:
            progress_df = pd.concat(all_stats, ignore_index=True)
            progress_file = f"{output_dir}progress_enhanced_{i}.json"
            progress_df.to_json(progress_file, orient='records', indent=2)
            
            elapsed = time.time() - start_time
            rate = i / elapsed * 3600  # matches per hour
            print(f"💾 Progress saved: {progress_file}")
            print(f"📈 Rate: {rate:.1f} matches/hour")
    
    # Final results
    if all_stats:
        final_df = pd.concat(all_stats, ignore_index=True)
        
        elapsed = time.time() - start_time
        total_time = elapsed / 3600  # hours
        
        print(f"\n🎉 SCRAPING COMPLETE!")
        print(f"✅ Successfully processed: {len(all_stats)} matches")
        print(f"❌ Failed to process: {len(failed_urls)} matches")
        print(f"📊 Total stats collected: {len(final_df)} rows")
        print(f"⏰ Total time: {total_time:.2f} hours")
        print(f"📈 Average rate: {len(all_stats)/total_time:.1f} matches/hour")
        
        # Save failed URLs
        if failed_urls:
            failed_file = f"{output_dir}failed_urls_enhanced_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            with open(failed_file, 'w') as f:
                json.dump(failed_urls, f, indent=2)
            print(f"📝 Failed URLs saved: {failed_file}")
        
        return final_df
    else:
        print("❌ No matches were successfully processed")
        return pd.DataFrame()

# Test the enhanced scraper with a small sample
print("Ready to test enhanced scraper!")

Ready to test enhanced scraper!


## Usage Example - Enhanced Scraper

Example of how to use the enhanced scraper with anti-blocking measures for large-scale scraping.

In [29]:
# Load fixtures data (if not already loaded)
if 'fixtures_df' not in locals():
    print("Loading fixtures data...")
    fixtures_df = load_fixtures_from_json('../../data/raw/all_competitions_fixtures_2019_2025.json')
    print(f"Loaded {len(fixtures_df)} total fixtures")

# Get all match URLs
match_urls = get_match_urls_from_fixtures(fixtures_df)
print(f"Found {len(match_urls)} unique match URLs")

# Preview the data
print("\nFixtures preview:")
print(fixtures_df[['team_name', 'season', 'comp', 'match_report_href']].head())

Found 3272 unique match URLs

Fixtures preview:
  team_name     season            comp  \
0   Arsenal  2019-2020  Premier League   
1   Arsenal  2019-2020  Premier League   
2   Arsenal  2019-2020  Premier League   
3   Arsenal  2019-2020  Premier League   
4   Arsenal  2019-2020  Premier League   

                                   match_report_href  
0  /en/matches/1405a610/Newcastle-United-Arsenal-...  
1  /en/matches/ff7eda21/Arsenal-Burnley-August-17...  
2  /en/matches/102b241e/Liverpool-Arsenal-August-...  
3  /en/matches/0b6b8aaf/North-London-Derby-Arsena...  
4  /en/matches/8257eda8/Watford-Arsenal-September...  


In [31]:
# STEP 1: Test with a small sample first (RECOMMENDED)
print("🧪 TESTING ENHANCED SCRAPER WITH SMALL SAMPLE")
print("=" * 50)

# Test with just 3 matches first
test_urls = match_urls[:3]
print(f"Testing with {len(test_urls)} matches...")

# Create a test scraper with faster settings for testing
test_scraper = EnhancedScraper(
    min_delay=5.0,       # Faster for testing
    max_delay=10.0,
    chunk_size=10,       # Smaller chunks for testing
    chunk_break=30.0     # Shorter breaks for testing
)

# Run test
test_stats_df = scrape_multiple_matches_enhanced(
    test_urls, 
    max_matches=3,
    scraper=test_scraper,
    output_dir='../../data/raw/match_stats/test/'
)

if not test_stats_df.empty:
    print(f"\n✅ TEST SUCCESSFUL!")
    print(f"Collected {len(test_stats_df)} stats rows")
    print(f"Sample data:")
    print(test_stats_df.head())
else:
    print("\n❌ TEST FAILED - Check the output above for errors")

🧪 TESTING ENHANCED SCRAPER WITH SMALL SAMPLE
Testing with 3 matches...
🚀 Starting enhanced scraping for 3 matches
⚙️  Settings: 5.0-10.0s delays, 10 requests per chunk

[1/3] Processing match...
🔍 Scraping: https://fbref.com/en/matches/1405a610/Newcastle-United-Arsenal-August-11-2019-Premier-League
⏱️  Waiting 7.1 seconds...
✅ Found 8 main stats rows
✅ Found 24 extra stats rows
📊 Total stats collected: 32 rows
✅ Successfully scraped 32 stats

[2/3] Processing match...
🔍 Scraping: https://fbref.com/en/matches/ff7eda21/Arsenal-Burnley-August-17-2019-Premier-League
⏱️  Waiting 9.2 seconds...
✅ Found 8 main stats rows
✅ Found 24 extra stats rows
📊 Total stats collected: 32 rows
✅ Successfully scraped 32 stats

[3/3] Processing match...
🔍 Scraping: https://fbref.com/en/matches/102b241e/Liverpool-Arsenal-August-24-2019-Premier-League
⏱️  Waiting 5.8 seconds...
✅ Found 8 main stats rows
✅ Found 24 extra stats rows
📊 Total stats collected: 32 rows
✅ Successfully scraped 32 stats

🎉 SCRAPING CO

In [36]:
# STEP 2: Production scraping with enhanced measures
print("🚀 PRODUCTION SCRAPING WITH ENHANCED MEASURES")
print("=" * 50)

# Configure production scraper (uncomment to run)
production_scraper = EnhancedScraper(
    min_delay=3.0,      # 15-30 second delays for safety
    max_delay=12.0,
    chunk_size=250,       # Break every 50 requests  
    chunk_break=500.0    # 5 minute breaks between chunks
)

print(f"📊 Total matches to scrape: {len(match_urls)}")
print(f"⏰ Estimated time: ~{len(match_urls) * 22.5 / 3600:.1f} hours")
print(f"⚙️  Settings: {production_scraper.min_delay}-{production_scraper.max_delay}s delays")
print(f"🛑 Breaks: {production_scraper.chunk_break/60:.0f} min every {production_scraper.chunk_size} requests")

# UNCOMMENT THE LINES BELOW TO START PRODUCTION SCRAPING
# WARNING: This will take ~27-30 hours to complete all 3272 matches

enhanced_stats_df = scrape_multiple_matches_enhanced(
    match_urls,
    max_matches=None,  # Process all matches
    scraper=production_scraper,
    output_dir='../../data/raw/match_stats/enhanced/'
)

# Save final results
if not enhanced_stats_df.empty:
    save_match_stats(
        enhanced_stats_df, 
        'all_competitions_enhanced_match_stats_2019_2025',
        output_dir='../../data/raw/match_stats/enhanced/'
    )
    print(f"\n🎉 COMPLETE! Final dataset: {len(enhanced_stats_df)} rows")

print("\n⚠️  PRODUCTION SCRAPING IS COMMENTED OUT")
print("Uncomment the lines above to start the full scraping process")

🚀 PRODUCTION SCRAPING WITH ENHANCED MEASURES
📊 Total matches to scrape: 3272
⏰ Estimated time: ~20.4 hours
⚙️  Settings: 3.0-12.0s delays
🛑 Breaks: 8 min every 250 requests
🚀 Starting enhanced scraping for 3272 matches
⚙️  Settings: 3.0-12.0s delays, 250 requests per chunk

[1/3272] Processing match...
🔍 Scraping: https://fbref.com/en/matches/1405a610/Newcastle-United-Arsenal-August-11-2019-Premier-League
⏱️  Waiting 4.7 seconds...
✅ Found 8 main stats rows
✅ Found 24 extra stats rows
📊 Total stats collected: 32 rows
✅ Successfully scraped 32 stats

[2/3272] Processing match...
🔍 Scraping: https://fbref.com/en/matches/ff7eda21/Arsenal-Burnley-August-17-2019-Premier-League
⏱️  Waiting 11.5 seconds...
✅ Found 8 main stats rows
✅ Found 24 extra stats rows
📊 Total stats collected: 32 rows
✅ Successfully scraped 32 stats

[3/3272] Processing match...
🔍 Scraping: https://fbref.com/en/matches/102b241e/Liverpool-Arsenal-August-24-2019-Premier-League
⏱️  Waiting 9.4 seconds...
✅ Found 8 main st

In [41]:
# STEP 3: Optional - Resume from failed/partial run
print("🔄 RESUME FROM PARTIAL RUN (if needed)")
print("=" * 40)

# If you need to resume scraping from where you left off:

# 1. Load progress file (example)
progress_file = '../../data/raw/match_stats/enhanced/all_competitions_enhanced_match_stats_2019_2025.json'
if os.path.exists(progress_file):
    completed_df = pd.read_json(progress_file)
    completed_urls = completed_df['match_id'].unique()
    remaining_urls = [url for url in match_urls if url not in completed_urls]
    print(f"Found {len(completed_urls)} completed matches")
    print(f"Remaining: {len(remaining_urls)} matches")
else:
    remaining_urls = match_urls
    print("No progress file found, starting from beginning")

# 2. Continue with remaining URLs
enhanced_stats_df = scrape_multiple_matches_enhanced(
    remaining_urls,
    scraper=production_scraper,
    output_dir='../../data/raw/match_stats/enhanced/remaining_urls'
)

print("Resume functionality available - uncomment code above if needed")

🔄 RESUME FROM PARTIAL RUN (if needed)
Found 3236 completed matches
Remaining: 36 matches
🚀 Starting enhanced scraping for 36 matches
⚙️  Settings: 3.0-12.0s delays, 250 requests per chunk

[1/36] Processing match...
🔍 Scraping: https://fbref.com/en/matches/8323ca40/Arsenal-Manchester-United-January-12-2025-FA-Cup
⏱️  Waiting 6.2 seconds...
No team_stats_extra div found
⚠️  No stats available
❌ Failed to scrape stats

[2/36] Processing match...
🔍 Scraping: https://fbref.com/en/matches/f36a1b2f/Hibernian-Aston-Villa-August-23-2023-Europa-Conference-League
⏱️  Waiting 7.3 seconds...
No team_stats_extra div found
⚠️  No stats available
❌ Failed to scrape stats

[3/36] Processing match...
🔍 Scraping: https://fbref.com/en/matches/5cc4d456/Aston-Villa-Hibernian-August-31-2023-Europa-Conference-League
⏱️  Waiting 3.7 seconds...
No team_stats_extra div found
⚠️  No stats available
❌ Failed to scrape stats

[4/36] Processing match...
🔍 Scraping: https://fbref.com/en/matches/027fb10f/Aston-Villa-

In [48]:
columns_select = [
    'match_id', 
    'team_name_x', 
    'stat_name', 
    'stat_value',
    'season', 
    'date', 
    'start_time',
    'comp',
    'round',
    'dayofweek'
]


completed_df.merge(
    fixtures_df,
    left_on = 'match_id',
    right_on = 'full_match_report_url',
    how = 'left'
)[columns_select].drop_duplicates()

Unnamed: 0,match_id,team_name_x,stat_name,stat_value,season,date,start_time,comp,round,dayofweek
0,https://fbref.com/en/matches/1405a610/Newcastl...,Newcastle,Possession,38%,2019-2020,2019-08-11,14:00,Premier League,Matchweek 1,Sun
2,https://fbref.com/en/matches/1405a610/Newcastl...,Arsenal,Possession,62%,2019-2020,2019-08-11,14:00,Premier League,Matchweek 1,Sun
4,https://fbref.com/en/matches/1405a610/Newcastl...,Newcastle,Passing Accuracy,75%,2019-2020,2019-08-11,14:00,Premier League,Matchweek 1,Sun
6,https://fbref.com/en/matches/1405a610/Newcastl...,Arsenal,Passing Accuracy,84%,2019-2020,2019-08-11,14:00,Premier League,Matchweek 1,Sun
8,https://fbref.com/en/matches/1405a610/Newcastl...,Newcastle,Shots on Target,22%,2019-2020,2019-08-11,14:00,Premier League,Matchweek 1,Sun
...,...,...,...,...,...,...,...,...,...,...
175137,https://fbref.com/en/matches/be42686a/Coventry...,Town,Crosses,9,2024-2025,2025-02-08,15:00,FA Cup,Fourth round proper,Sat
175138,https://fbref.com/en/matches/be42686a/Coventry...,Coventry,Interceptions,13,2024-2025,2025-02-08,15:00,FA Cup,Fourth round proper,Sat
175139,https://fbref.com/en/matches/be42686a/Coventry...,Town,Interceptions,14,2024-2025,2025-02-08,15:00,FA Cup,Fourth round proper,Sat
175140,https://fbref.com/en/matches/be42686a/Coventry...,Coventry,Offsides,3,2024-2025,2025-02-08,15:00,FA Cup,Fourth round proper,Sat
