In [1]:
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from urllib.parse import urljoin, urlparse

  from pandas.core import (


In [2]:
def load_teams_from_json(json_filename):
    """
    Read the teams mapping stored in a JSON file.

    Args:
        json_filename (str): Location of the JSON file on disk

    Returns:
        dict: The decoded JSON document (expected to be the teams dictionary)
    """
    # UTF-8 is forced so the result does not depend on the platform default encoding
    with open(json_filename, mode='r', encoding='utf-8') as json_file:
        return json.load(json_file)

# Read the team registry from the raw-data folder; `all_teams` feeds the wage extraction
teams_json_path = '../../data/raw/all_teams.json'
all_teams = load_teams_from_json(teams_json_path)

# Report how many team entries were loaded
team_count = len(all_teams)
print(f"Loaded {team_count} teams from JSON file")

Loaded 27 teams from JSON file


# Premier League Team Wages Scraper

This notebook contains the wage scraping functionality extracted from the team_id_mapping notebook.
It focuses specifically on extracting wage data from FBRef for Premier League teams.

In [4]:
# Request headers carrying a browser-style User-Agent string
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def get_page(url, timeout=30):
    """
    Fetch a page and parse it, pausing first to rate-limit requests.

    Args:
        url (str): Address of the page to download
        timeout (float or tuple): Passed to requests.get as its ``timeout``
            argument so a stalled connection cannot block the scrape forever.
            Defaults to 30.

    Returns:
        BeautifulSoup or None: The parsed document, or None when the request
        raised a requests.RequestException (the error is printed, not raised).
    """
    # Random 2-4 second pause before every request to be respectful to the server
    time.sleep(random.uniform(2, 4))

    try:
        # Without an explicit timeout requests waits indefinitely for a response
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into an exception handled below
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    return BeautifulSoup(response.content, 'html.parser')

In [5]:
# Helper: convert a single table row into a player record
def _parse_wage_row(row, table_name):
    """
    Build a player record from one table row.

    Args:
        row: Parsed <tr> element
        table_name (str): Id of the table the row came from (stored as 'table_source')

    Returns:
        dict or None: Record keyed by each cell's 'data-stat' attribute plus
        'table_source' and 'player_name'; None when no player name was found.
    """
    record = {'table_source': table_name}

    # The player cell may be a <th> or a <td>; prefer the link text when a link exists
    player_cell = row.find('th', {'data-stat': 'player'}) or row.find('td', {'data-stat': 'player'})
    if player_cell:
        player_link = player_cell.find('a')
        record['player_name'] = player_link.text.strip() if player_link else player_cell.text.strip()

    # Every other cell is stored under its data-stat name; blank cells are omitted
    for cell in row.find_all(['td', 'th']):
        data_stat = cell.get('data-stat')
        if not data_stat or data_stat == 'player':
            continue
        cell_text = cell.text.strip()
        if cell_text:
            record[data_stat] = cell_text

    # Only keep the record if we have at least the name
    return record if record.get('player_name') else None


# Updated function to extract from both wages tables
def extract_team_wages_complete(team_id, season, team_name):
    """
    Extract wages information from the 'wages' and 'div_wages' tables

    Args:
        team_id (str): FBRef team ID (e.g., 'b8fd03ef')
        season (str): Season in format '2023-2024'
        team_name (str): Team name for URL construction

    Returns:
        dict: Dictionary with 'team_id', 'team_name', 'season', 'players'
        (list of per-player dicts) and 'tables_found'. An empty dict is
        returned when the page could not be fetched or no table was found.
    """
    # Construct the wages page URL: spaces become hyphens, apostrophes are dropped
    team_name_url = team_name.replace(' ', '-').replace("'", "")
    url = f"https://fbref.com/en/squads/{team_id}/{season}/wages/{team_name_url}-Wage-Details"

    print(f"Fetching wages for {team_name} ({season})...")
    print(f"URL: {url}")

    soup = get_page(url)
    if not soup:
        return {}

    # Look for both wages tables
    # NOTE(review): 'div_wages' reads like a wrapper-div id rather than a <table> id;
    # confirm that it can ever match a <table> element.
    tables_to_find = ['wages', 'div_wages']
    found_tables = {}

    for table_id in tables_to_find:
        table = soup.find('table', {'id': table_id})
        if table:
            found_tables[table_id] = table
            print(f"Found table: {table_id}")

    if not found_tables:
        print(f"No wages tables found for {team_name} ({season})")
        return {}

    # Initialize wages data structure
    wages_data = {
        'team_id': team_id,
        'team_name': team_name,
        'season': season,
        'players': [],
        'tables_found': list(found_tables.keys())
    }

    # Process each table
    for table_name, table in found_tables.items():
        print(f"\nProcessing table: {table_name}")

        tbody = table.find('tbody')
        row_container = tbody if tbody else table

        # Keep only rows holding at least one <td>. Header-only rows (all <th>) are
        # dropped on both paths; previously a header row inside <tbody> carrying a
        # th[data-stat=player] cell would have been stored as a player.
        rows = [row for row in row_container.find_all('tr') if row.find('td')]

        print(f"Found {len(rows)} rows in {table_name}")

        for row in rows:
            player_data = _parse_wage_row(row, table_name)
            if player_data is not None:
                wages_data['players'].append(player_data)

    print(f"\nTotal extracted: {len(wages_data['players'])} player records")
    return wages_data

In [6]:
# Extract wages for all teams across all seasons
def extract_all_team_wages(all_teams_dict):
    """
    Scrape wages for every team-season pair listed in the teams dictionary.

    Args:
        all_teams_dict (dict): Maps team_id to a dict holding at least
            'team_name' and 'seasons' (the seasons to scrape for that team)

    Returns:
        dict: Maps team_id to {'team_name', 'team_id', 'seasons_data'}, where
        'seasons_data' maps each season to the scraped wages dict, or to None
        when nothing was found or the extraction raised.
    """
    total_extractions = sum(len(info['seasons']) for info in all_teams_dict.values())
    results = {}
    attempt_no = 0

    print(f"Starting wages extraction for {len(all_teams_dict)} teams across {total_extractions} team-season combinations...")
    print("=" * 80)

    for team_id, team_info in all_teams_dict.items():
        team_name = team_info['team_name']
        seasons = team_info['seasons']

        print(f"\n🏟️  Processing {team_name} (ID: {team_id})")
        print(f"   Seasons to extract: {seasons}")

        # Create the team's entry on first sight and keep a handle on its season map
        team_entry = results.setdefault(
            team_id,
            {'team_name': team_name, 'team_id': team_id, 'seasons_data': {}},
        )
        per_season = team_entry['seasons_data']

        for season in seasons:
            attempt_no += 1
            print(f"\n   📊 [{attempt_no}/{total_extractions}] Extracting {season}...")

            # Stays None unless the scrape returns a non-empty player list
            outcome = None
            try:
                scraped = extract_team_wages_complete(team_id, season, team_name)

                if scraped and scraped.get('players'):
                    outcome = scraped
                    n_players = len(scraped['players'])
                    tables = scraped.get('tables_found', [])
                    print(f"   ✅ Success: {n_players} players, tables: {tables}")
                else:
                    print(f"   ⚠️  No wage data found for {team_name} in {season}")

            except Exception as e:
                # Any failure is reported and recorded as a missing season
                print(f"   ❌ Error extracting {team_name} {season}: {str(e)}")
                outcome = None

            per_season[season] = outcome

            # Small extra pause between seasons to be respectful to the server
            time.sleep(1)

    # Summary statistics: seasons that produced at least one player record
    populated_seasons = [
        season_data
        for team_data in results.values()
        for season_data in team_data['seasons_data'].values()
        if season_data and season_data.get('players')
    ]
    successful_extractions = len(populated_seasons)
    total_players = sum(len(season_data['players']) for season_data in populated_seasons)

    print("\n" + "=" * 80)
    print("📈 EXTRACTION SUMMARY:")
    print(f"   Total teams processed: {len(results)}")
    print(f"   Successful extractions: {successful_extractions}/{total_extractions}")
    print(f"   Total player records: {total_players}")
    print("=" * 80)

    return results

In [7]:
# Function to save wages data to JSON file
def save_wages_data(wages_data, filename):
    """
    Write the wages dataset to disk as indented UTF-8 JSON.

    Args:
        wages_data (dict): Maps team_id to a dict containing 'team_name',
            'team_id' and 'seasons_data'
        filename (str): Destination path of the JSON file

    Only those three fields of each team entry are written; any other keys
    on a team entry are left out of the file.
    """
    kept_fields = ('team_name', 'team_id', 'seasons_data')
    serializable_data = {
        team_id: {field: team_data[field] for field in kept_fields}
        for team_id, team_data in wages_data.items()
    }

    # ensure_ascii=False keeps accented player names readable in the file
    with open(filename, 'w', encoding='utf-8') as out_file:
        json.dump(serializable_data, out_file, indent=2, ensure_ascii=False)

    print(f"Wages data saved to {filename}")

In [8]:
def wages_data_to_dataframe(wages_data):
    """
    Convert wages data dictionary to a pandas DataFrame

    Args:
        wages_data (dict): Wages data from extract_all_team_wages()

    Returns:
        pd.DataFrame: Flattened DataFrame with one row per player per season.
        Seasons stored as None or without players contribute no rows. An
        empty DataFrame is returned when there are no player records.
    """
    all_records = []

    for team_id, team_data in wages_data.items():
        team_name = team_data['team_name']

        for season, season_data in team_data['seasons_data'].items():
            # Skip seasons whose extraction failed or returned no players
            if not season_data or not season_data.get('players'):
                continue

            tables_found = ', '.join(season_data.get('tables_found', []))

            for player in season_data['players']:
                # Team/season context first, then the scraped player fields
                record = {
                    'team_id': team_id,
                    'team_name': team_name,
                    'season': season,
                    'tables_found': tables_found,
                }
                record.update(player)
                all_records.append(record)

    # Create DataFrame
    df = pd.DataFrame(all_records)

    # Reorder columns for better readability
    if len(df) > 0:
        priority_columns = ['team_name', 'season', 'player_name', 'age', 'annual_wages', 'weekly_wages']
        # Only lead with priority columns that exist: selecting a missing label
        # would raise KeyError when no scraped record carried that field.
        leading_columns = [col for col in priority_columns if col in df.columns]
        other_columns = [col for col in df.columns if col not in priority_columns]
        df = df[leading_columns + other_columns]

    return df

In [9]:
# Alternative: Load from JSON file and convert to DataFrame
def load_wages_json_to_dataframe(json_filename):
    """
    Read a saved wages JSON file and flatten it into a DataFrame.

    Args:
        json_filename (str): Location of the wages JSON file

    Returns:
        pd.DataFrame: Result of wages_data_to_dataframe() applied to the
        decoded file contents
    """
    with open(json_filename, mode='r', encoding='utf-8') as json_file:
        loaded_wages = json.load(json_file)

    dataframe = wages_data_to_dataframe(loaded_wages)
    return dataframe

## Usage Example

To use this scraper, you need a dictionary keyed by FBref team ID, where each value holds the team's `team_name` and the list of `seasons` to scrape. Here's an example of how to use it:

```python
# Example team data structure
all_teams = {
    '18bb7c10': {
        'team_name': 'Arsenal',
        'team_id': '18bb7c10',
        'seasons': ['2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024']
    },
    # ... more teams
}

# Extract wages for all teams
complete_wages_dataset = extract_all_team_wages(all_teams)

# Save to JSON
save_wages_data(complete_wages_dataset, '../../data/raw/premier_league_wages_2019_2024.json')

# Convert to DataFrame
wages_df = wages_data_to_dataframe(complete_wages_dataset)
```

In [10]:
# Execute the complete wages extraction
# WARNING: This will take a significant amount of time (estimated 20-30 minutes)
# as it needs to scrape wages data for ~26 teams across ~5 seasons each

# NOTE: both lines below are live (not commented out) - running this cell starts
# the extraction immediately and then writes the result to the JSON file.
complete_wages_dataset = extract_all_team_wages(all_teams)
save_wages_data(complete_wages_dataset,'../../data/raw/premier_league_wages_2019_2025_v2.json')

Starting wages extraction for 27 teams across 120 team-season combinations...

🏟️  Processing Arsenal (ID: 18bb7c10)
   Seasons to extract: ['2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024', '2024-2025']

   📊 [1/120] Extracting 2019-2020...
Fetching wages for Arsenal (2019-2020)...
URL: https://fbref.com/en/squads/18bb7c10/2019-2020/wages/Arsenal-Wage-Details
Found table: wages

Processing table: wages
Found 27 rows in wages

Total extracted: 27 player records
   ✅ Success: 27 players, tables: ['wages']

   📊 [2/120] Extracting 2020-2021...
Fetching wages for Arsenal (2020-2021)...
URL: https://fbref.com/en/squads/18bb7c10/2020-2021/wages/Arsenal-Wage-Details
Found table: wages

Processing table: wages
Found 30 rows in wages

Total extracted: 30 player records
   ✅ Success: 30 players, tables: ['wages']

   📊 [3/120] Extracting 2021-2022...
Fetching wages for Arsenal (2021-2022)...
URL: https://fbref.com/en/squads/18bb7c10/2021-2022/wages/Arsenal-Wage-Details
Found tabl

In [12]:
# Write the dataset to disk (same call and output path as at the end of the extraction cell)
save_wages_data(complete_wages_dataset,'../../data/raw/premier_league_wages_2019_2025_v2.json')

Wages data saved to ../../data/raw/premier_league_wages_2019_2025_v2.json


In [13]:
# Flatten the nested team -> season -> players structure into one row per player per season
wages_df = wages_data_to_dataframe(complete_wages_dataset)

In [14]:
# Bare last expression: the notebook renders the DataFrame as the cell output
wages_df

Unnamed: 0,team_name,season,player_name,age,annual_wages,weekly_wages,team_id,tables_found,table_source,nationality,position,notes
0,Arsenal,2019-2020,Mesut Özil,30,"£ 18,200,000 (€ 21,704,678, $22,117,026)","£ 350,000 (€ 417,398, $425,327)",18bb7c10,wages,wages,de GER,MF,
1,Arsenal,2019-2020,Pierre-Emerick Aubameyang,30,"£ 13,000,000 (€ 15,503,341, $15,797,876)","£ 250,000 (€ 298,141, $303,805)",18bb7c10,wages,wages,ga GAB,FW,
2,Arsenal,2019-2020,Alexandre Lacazette,28,"£ 9,470,000 (€ 11,293,588, $11,508,145)","£ 182,115 (€ 217,184, $221,310)",18bb7c10,wages,wages,fr FRA,FW,
3,Arsenal,2019-2020,Héctor Bellerín,24,"£ 5,720,000 (€ 6,821,470, $6,951,065)","£ 110,000 (€ 131,182, $133,674)",18bb7c10,wages,wages,es ESP,"DF,MF",
4,Arsenal,2019-2020,David Luiz,32,"£ 5,250,000 (€ 6,260,965, $6,379,911)","£ 100,962 (€ 120,403, $122,691)",18bb7c10,wages,wages,br BRA,DF,
...,...,...,...,...,...,...,...,...,...,...,...,...
3336,Ipswich Town,2024-2025,Nathan Broadhead,26,"£ 312,000 (€ 369,641, $381,031)","£ 6,000 (€ 7,108, $7,328)",b74092de,wages,wages,wls WAL,"MF,FW",Unverified estimation
3337,Ipswich Town,2024-2025,Omari Hutchinson,20,"£ 312,000 (€ 369,641, $381,031)","£ 6,000 (€ 7,108, $7,328)",b74092de,wages,wages,,RW,Unverified estimation
3338,Ipswich Town,2024-2025,Cieran Slicker,22,"£ 156,000 (€ 184,821, $190,515)","£ 3,000 (€ 3,554, $3,664)",b74092de,wages,wages,,GK,Unverified estimation
3339,Ipswich Town,2024-2025,Elkan Baggott,22,"£ 78,000 (€ 91,585, $102,108)","£ 1,500 (€ 1,761, $1,964)",b74092de,wages,wages,,CB,Unverified estimation
