# Team Wages Scraper for Inference - Season 2025-2026

This notebook scrapes player wage data from FBRef for every team in the 2025-2026 season.
The combined dataset is saved under `data/prod/inference/raw/` and provides the current team wages needed for making predictions at inference time.

In [1]:
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from urllib.parse import urljoin, urlparse

  from pandas.core import (


In [2]:
# Browser-like request headers sent with every page fetch, so the scraper
# presents itself as a regular desktop Chrome client.
request_headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def get_page(url, timeout=30):
    """
    Fetch a page and parse it, with rate limiting and error handling.

    Args:
        url (str): Absolute URL to request.
        timeout (float | tuple): Seconds to wait for the server before giving
            up (passed straight to ``requests.get``). Without a timeout a
            stalled connection would block the whole scrape indefinitely.

    Returns:
        BeautifulSoup | None: Parsed document, or None when the request fails
        (connection error, timeout, or non-2xx status).
    """
    # Be respectful - random delay before every request
    time.sleep(random.uniform(2, 4))

    try:
        response = requests.get(url, headers=request_headers, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too
        print(f"Error fetching {url}: {e}")
        return None

In [3]:
def load_teams_for_inference(teams_path='../../../data/prod/inference/raw/all_teams.json'):
    """
    Load teams data for 2025-2026 season inference.

    Structure: team_name -> {team_name, team_id, reference}

    Args:
        teams_path (str): Path to the teams JSON file. Defaults to the
            inference teams file used by this notebook.

    Returns:
        dict: Parsed teams mapping, or an empty dict when the file does not
        exist. Malformed JSON is not caught and propagates to the caller.
    """
    try:
        # Only the file read can raise FileNotFoundError, so keep the try tight
        with open(teams_path, 'r', encoding='utf-8') as f:
            teams_data = json.load(f)
    except FileNotFoundError:
        print("Inference teams file not found. Please run team_id_mapping_inference first.")
        return {}

    print(f"Loaded {len(teams_data)} teams from inference file")

    # Show sample structure
    if teams_data:
        first_team = next(iter(teams_data.values()))
        print(f"Team structure: {first_team}")

    return teams_data

# Read the team registry that drives the wage scraping below
inference_teams = load_teams_for_inference()

# Preview the first few teams, then summarise how many were left out
if inference_teams:
    print(f"\nTeams available for wage scraping:")
    preview_items = list(inference_teams.items())[:5]
    for team_name, team_info in preview_items:
        print(f"  {team_name}: {team_info['team_id']}")
    hidden_count = len(inference_teams) - 5
    if hidden_count > 0:
        print(f"  ... and {hidden_count} more teams")

Loaded 20 teams from inference file
Team structure: {'team_name': 'Arsenal', 'team_id': '18bb7c10', 'reference': '/en/squads/18bb7c10/Arsenal-Stats'}

Teams available for wage scraping:
  Arsenal: 18bb7c10
  Aston Villa: 8602292d
  Bournemouth: 4ba7cbea
  Brentford: cd051869
  Brighton: d07537b9
  ... and 15 more teams


In [4]:
def get_team_wages_df(team_id, team_name, season="2025-2026"):
    """
    Fetch wages data for a specific team and return as DataFrame

    Args:
        team_id (str): FBRef team ID (e.g., '18bb7c10')
        team_name (str): Team name (e.g., 'Arsenal')
        season (str): Season (default: '2025-2026')

    Returns:
        pd.DataFrame: One row per table row of the team's wages table, with
        'team_name', 'team_id' and 'season' columns appended. Empty DataFrame
        if the page could not be fetched, the table is missing, or no data
        rows were found.
    """
    # Construct URL. Spaces are replaced with hyphens so the URL contains no
    # raw spaces (e.g. 'Aston Villa' -> 'Aston-Villa'), matching the
    # hyphenated form of the 'reference' paths in the teams file.
    url_slug = team_name.replace(' ', '-')
    url = f"https://fbref.com/en/squads/{team_id}/{season}/wages/{url_slug}-Wage-Details"
    print(f"Fetching wages for {team_name}...")
    print(f"URL: {url}")

    # Make request
    soup = get_page(url)
    if soup is None:
        print(f"❌ Failed to fetch page for {team_name}")
        return pd.DataFrame()

    # Look for wages table
    wages_table = soup.find('table', {'id': 'wages'})
    if wages_table is None:
        print(f"❌ No wages table found for {team_name}")
        return pd.DataFrame()

    print(f"✅ Found wages table for {team_name}")

    # Extract table headers: first row of <thead>, else first row of the table
    thead = wages_table.find('thead')
    header_row = thead.find('tr') if thead is not None else wages_table.find('tr')

    column_headers = []
    if header_row is not None:
        for th in header_row.find_all(['th', 'td']):
            # Use data-stat if available, otherwise use the visible text
            column_headers.append(th.get('data-stat', '') or th.text.strip())

    # Extract table data
    tbody = wages_table.find('tbody')
    row_container = tbody if tbody is not None else wages_table

    table_data = []
    for row in row_container.find_all('tr'):
        # Without a <tbody> the header row is among the rows; it is not data
        if row is header_row:
            continue
        # Repeated in-body header rows are marked with class "thead"
        # (NOTE(review): FBRef convention - confirm against live markup)
        if 'thead' in (row.get('class') or []):
            continue
        # Keep only rows that carry actual cells
        if not (row.find('td') or row.find('th', {'data-stat': True})):
            continue

        row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]

        # Only add row if it has data
        if any(row_data):
            table_data.append(row_data)

    if not table_data or not column_headers:
        print(f"❌ No data to create DataFrame for {team_name}")
        return pd.DataFrame()

    # Reconcile widths in BOTH directions. Previously, a header wider than the
    # widest row padded rows to the header width but sliced the columns to the
    # row width, which made pd.DataFrame raise a ValueError.
    width = max(len(column_headers), max(len(row) for row in table_data))

    # Pad headers if needed
    column_headers.extend(f'col_{i}' for i in range(len(column_headers), width))

    # Pad rows if needed
    for row in table_data:
        row.extend([''] * (width - len(row)))

    # Create DataFrame
    df = pd.DataFrame(table_data, columns=column_headers)

    # Add team info columns
    df['team_name'] = team_name
    df['team_id'] = team_id
    df['season'] = season

    print(f"✅ DataFrame created with {len(df)} players")
    return df

Fetching wages for Arsenal...
URL: https://fbref.com/en/squads/18bb7c10/2025-2026/wages/Arsenal-Wage-Details
✅ Found wages table for Arsenal
✅ DataFrame created with 27 players


Unnamed: 0,player,nationality,position,age,weekly_wages,annual_wages,notes,team_name,team_id,season
0,Kai Havertz,de GER,CF,26,"£ 280,000 (€ 321,368, $371,011)","£ 14,560,000 (€ 16,711,124, $19,292,582)",,Arsenal,18bb7c10,2025-2026
1,Gabriel Jesus,br BRA,CF,28,"£ 265,000 (€ 304,152, $351,136)","£ 13,780,000 (€ 15,815,885, $18,259,051)",,Arsenal,18bb7c10,2025-2026
2,Declan Rice,eng ENG,CM,26,"£ 240,000 (€ 275,458, $318,010)","£ 12,480,000 (€ 14,323,820, $16,536,499)",,Arsenal,18bb7c10,2025-2026
3,Martin Ødegaard,no NOR,AM,26,"£ 240,000 (€ 275,458, $318,010)","£ 12,480,000 (€ 14,323,820, $16,536,499)",,Arsenal,18bb7c10,2025-2026
4,Viktor Gyökeres,se SWE,CF,27,"£ 200,000 (€ 229,548, $265,008)","£ 10,400,000 (€ 11,936,517, $13,780,416)",Unverified estimation,Arsenal,18bb7c10,2025-2026
5,Bukayo Saka,eng ENG,RW,23,"£ 195,000 (€ 223,810, $258,383)","£ 10,140,000 (€ 11,638,104, $13,435,906)",,Arsenal,18bb7c10,2025-2026
6,William Saliba,fr FRA,CB,24,"£ 190,000 (€ 218,071, $251,758)","£ 9,880,000 (€ 11,339,691, $13,091,395)",,Arsenal,18bb7c10,2025-2026
7,Gabriel Martinelli,br BRA,LW,24,"£ 180,000 (€ 206,594, $238,507)","£ 9,360,000 (€ 10,742,865, $12,402,374)",,Arsenal,18bb7c10,2025-2026
8,Oleksandr Zinchenko,ua UKR,LB,28,"£ 150,000 (€ 172,161, $198,756)","£ 7,800,000 (€ 8,952,388, $10,335,312)",,Arsenal,18bb7c10,2025-2026
9,Ben White,eng ENG,RB,27,"£ 150,000 (€ 172,161, $198,756)","£ 7,800,000 (€ 8,952,388, $10,335,312)",Unverified estimation,Arsenal,18bb7c10,2025-2026


In [6]:
def get_all_teams_wages(teams_dict, season="2025-2026"):
    """
    Fetch wages data for all teams and combine into a single DataFrame

    Teams whose extraction raises or yields no rows are recorded as failed and
    skipped; they never abort the run.

    Args:
        teams_dict (dict): Dictionary with structure {team_name: {team_name, team_id, reference}}
        season (str): Season (default: '2025-2026')

    Returns:
        pd.DataFrame: Combined wages data for all teams, or an empty DataFrame
        if no team produced any data
    """
    all_wages_data = []
    total_teams = len(teams_dict)
    successful_extractions = 0
    failed_teams = []

    print(f"Starting wages extraction for {total_teams} teams...")
    print("=" * 80)

    for i, team_info in enumerate(teams_dict.values(), 1):
        team_name = team_info['team_name']
        team_id = team_info['team_id']

        print(f"\n🏟️  [{i}/{total_teams}] Processing {team_name} (ID: {team_id})")

        try:
            # Get wages DataFrame for this team
            team_wages_df = get_team_wages_df(team_id, team_name, season)

            if not team_wages_df.empty:
                all_wages_data.append(team_wages_df)
                successful_extractions += 1
                print(f"   ✅ Success: {len(team_wages_df)} players added")
            else:
                failed_teams.append(team_name)
                print(f"   ⚠️  No data found for {team_name}")

        except Exception as e:
            # Deliberate best-effort: one broken team page must not stop the batch
            failed_teams.append(team_name)
            print(f"   ❌ Error processing {team_name}: {str(e)}")

        # Small delay between requests to be respectful
        if i < total_teams:  # Don't delay after the last team
            print(f"   ⏳ Waiting before next request...")
            time.sleep(1)

    if not all_wages_data:
        # "\n" (not "\\n"): emit a real blank line, not a literal backslash-n
        print("\n❌ No data was successfully extracted from any team")
        return pd.DataFrame()

    # Combine all DataFrames
    print(f"\n📊 Combining data from {len(all_wages_data)} successful extractions...")
    combined_df = pd.concat(all_wages_data, ignore_index=True)

    print("\n" + "=" * 80)
    print("📈 EXTRACTION SUMMARY:")
    print(f"   Total teams processed: {total_teams}")
    print(f"   Successful extractions: {successful_extractions}")
    print(f"   Failed extractions: {len(failed_teams)}")
    print(f"   Total players in dataset: {len(combined_df)}")
    print(f"   Columns in dataset: {len(combined_df.columns)}")

    if failed_teams:
        print(f"\n❌ Teams that failed: {', '.join(failed_teams)}")

    print("=" * 80)

    return combined_df

In [7]:
# Execute wages extraction for all teams and persist the combined dataset.
# Requires `inference_teams` to be loaded and non-empty.
if inference_teams:
    print("Starting extraction for all teams...")
    all_wages_df = get_all_teams_wages(inference_teams)

    if not all_wages_df.empty:
        # "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
        print(f"\n✅ Success! Combined wages dataset created")
        print(f"Shape: {all_wages_df.shape}")
        print(f"Columns: {list(all_wages_df.columns)}")

        # Save to multiple formats in inference directory
        all_wages_df.to_csv('../../../data/prod/inference/raw/all_teams_wages_2025_2026.csv', index=False)
        all_wages_df.to_parquet('../../../data/prod/inference/raw/all_teams_wages_2025_2026.parquet', index=False)
        all_wages_df.to_json('../../../data/prod/inference/raw/all_teams_wages_2025_2026.json', orient='records', indent=2)

        print(f"\n💾 Data saved to inference/raw/ directory in multiple formats")

        # Show sample data (only the columns that are actually present)
        print(f"\n📋 Sample data:")
        display_cols = ['team_name', 'player', 'age', 'weekly_wages', 'annual_wages']
        available_cols = [col for col in display_cols if col in all_wages_df.columns]
        print(all_wages_df[available_cols].head(10))

    else:
        print(f"\n❌ No data was extracted")
else:
    print("❌ No teams data available. Please load teams first.")

Starting extraction for all teams...
Starting wages extraction for 20 teams...

🏟️  [1/20] Processing Arsenal (ID: 18bb7c10)
Fetching wages for Arsenal...
URL: https://fbref.com/en/squads/18bb7c10/2025-2026/wages/Arsenal-Wage-Details
✅ Found wages table for Arsenal
✅ DataFrame created with 27 players
   ✅ Success: 27 players added
   ⏳ Waiting before next request...

🏟️  [2/20] Processing Aston Villa (ID: 8602292d)
Fetching wages for Aston Villa...
URL: https://fbref.com/en/squads/8602292d/2025-2026/wages/Aston Villa-Wage-Details
✅ Found wages table for Aston Villa
✅ DataFrame created with 30 players
   ✅ Success: 30 players added
   ⏳ Waiting before next request...

🏟️  [3/20] Processing Bournemouth (ID: 4ba7cbea)
Fetching wages for Bournemouth...
URL: https://fbref.com/en/squads/4ba7cbea/2025-2026/wages/Bournemouth-Wage-Details
✅ Found wages table for Bournemouth
✅ DataFrame created with 31 players
   ✅ Success: 31 players added
   ⏳ Waiting before next request...

🏟️  [4/20] Process

Unnamed: 0,player,nationality,position,age,weekly_wages,annual_wages,notes,team_name,team_id,season
0,Kai Havertz,de GER,CF,26,"£ 280,000 (€ 321,368, $371,011)","£ 14,560,000 (€ 16,711,124, $19,292,582)",,Arsenal,18bb7c10,2025-2026
1,Gabriel Jesus,br BRA,CF,28,"£ 265,000 (€ 304,152, $351,136)","£ 13,780,000 (€ 15,815,885, $18,259,051)",,Arsenal,18bb7c10,2025-2026
2,Declan Rice,eng ENG,CM,26,"£ 240,000 (€ 275,458, $318,010)","£ 12,480,000 (€ 14,323,820, $16,536,499)",,Arsenal,18bb7c10,2025-2026
3,Martin Ødegaard,no NOR,AM,26,"£ 240,000 (€ 275,458, $318,010)","£ 12,480,000 (€ 14,323,820, $16,536,499)",,Arsenal,18bb7c10,2025-2026
4,Viktor Gyökeres,se SWE,CF,27,"£ 200,000 (€ 229,548, $265,008)","£ 10,400,000 (€ 11,936,517, $13,780,416)",Unverified estimation,Arsenal,18bb7c10,2025-2026
...,...,...,...,...,...,...,...,...,...,...
617,Bastien Meupiyou,,CB,19,"£ 8,000 (€ 9,182, $10,600)","£ 416,000 (€ 477,461, $551,217)",Unverified estimation,Wolves,8cec06e1,2025-2026
618,Nathan Fraser,,CF,20,"£ 2,500 (€ 2,869, $3,313)","£ 130,000 (€ 149,206, $172,255)",Unverified estimation,Wolves,8cec06e1,2025-2026
619,Tawanda Chirewa,,AM,21,"£ 2,000 (€ 2,295, $2,650)","£ 104,000 (€ 119,365, $137,804)",Unverified estimation,Wolves,8cec06e1,2025-2026
620,Jhon Arias,co COL,RW,27,,,,Wolves,8cec06e1,2025-2026
