# Premier League Team ID Mapping (2019-2024)

This notebook extracts FBRef team IDs for every team that played in the Premier League from the 2019-2020 season through the 2024-2025 season.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from urllib.parse import urljoin, urlparse
import json

# Request headers sent with every call made by get_page: a desktop-Chrome
# User-Agent so the scraper looks like a regular browser.
# NOTE(review): presumably the site rejects the default python-requests agent — confirm
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def get_page(url, timeout=30):
    """Fetch a page and parse it, with rate limiting and error handling.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A BeautifulSoup tree built from the response body, or None when the
        request fails (connection error, timeout, or HTTP error status). The
        failure is printed rather than raised.
    """
    time.sleep(random.uniform(2, 4))  # Be respectful - random delay

    try:
        # requests has no default timeout; without one a stalled connection
        # would block the notebook forever. A timeout raises
        # requests.Timeout, which is a RequestException and is handled below.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

  from pandas.core import (


In [2]:
# Scraper that extracts team names and IDs for a single season
def extract_team_ids_from_season_working(season):
    """
    Scrape the squad-stats page of one Premier League season for team IDs.

    Args:
        season: Season label as it appears in the page URL, e.g. '2019-2020'.

    Returns:
        Dict keyed by the team name shown in the table. Each value is a dict
        with 'team_id' (hex ID taken from the squad link), 'season' and
        'href'. An empty dict is returned when the page cannot be fetched or
        the squads table is not present.
    """
    url = f"https://fbref.com/en/comps/9/{season}/stats/{season}-Premier-League-Stats"
    print(f"Fetching teams for {season}...")

    page = get_page(url)
    if not page:
        return {}

    squads_table = page.find('table', {'id': 'stats_squads_standard_for'})
    if not squads_table:
        print(f"Squads table not found for {season}")
        return {}

    print(f"Found table: {squads_table.get('id', 'Unknown ID')}")

    # Prefer the rows inside <tbody>; otherwise take every <tr> in the table
    # and keep only those that contain data cells.
    body = squads_table.find('tbody')
    if not body:
        candidate_rows = squads_table.find_all('tr')
        print(f"No tbody found, found {len(candidate_rows)} rows directly in table")
        candidate_rows = [tr for tr in candidate_rows if tr.find('td')]
        print(f"After filtering header rows: {len(candidate_rows)} data rows")
    else:
        candidate_rows = body.find_all('tr')
        print(f"Found tbody with {len(candidate_rows)} rows")

    print(f"Processing {len(candidate_rows)} rows...")

    squad_id_pattern = re.compile(r'/squads/([a-f0-9]+)/')
    team_mapping = {}

    for tr in candidate_rows:
        # The team name sits in the row's header cell (th, data-stat='team')
        name_cell = tr.find('th', {'data-stat': 'team'})
        anchor = name_cell.find('a') if name_cell else None
        if not anchor:
            continue

        team_name = anchor.text.strip()
        team_href = anchor.get('href')
        if not team_href or '/squads/' not in team_href:
            continue

        # The ID is the hex path segment right after /squads/
        id_match = squad_id_pattern.search(team_href)
        if not id_match:
            continue

        team_id = id_match.group(1)
        team_mapping[team_name] = {
            'team_id': team_id,
            'season': season,
            'href': team_href
        }
        print(f"  ✓ {team_name}: {team_id}")

    return team_mapping

In [3]:
# Seasons to scrape, oldest first (2019-2020 through 2024-2025)
seasons = ['2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024', '2024-2025']

# team_id -> record accumulated across all seasons
all_teams = {}

for season in seasons:
    season_teams = extract_team_ids_from_season_working(season)

    for team_name, team_data in season_teams.items():
        team_id = team_data['team_id']

        # The first name seen for an ID becomes its team_name; setdefault
        # leaves an already-existing record untouched.
        team_record = all_teams.setdefault(team_id, {
            'team_name': team_name,
            'team_id': team_id,
            'seasons': [],
            'aliases': {team_name},  # Track different name variations
        })

        team_record['seasons'].append(season)
        team_record['aliases'].add(team_name)

    print(f"Completed {season}: {len(season_teams)} teams\n")

Fetching teams for 2019-2020...
Found table: stats_squads_standard_for
Found tbody with 20 rows
Processing 20 rows...
  ✓ Arsenal: 18bb7c10
  ✓ Aston Villa: 8602292d
  ✓ Bournemouth: 4ba7cbea
  ✓ Brighton: d07537b9
  ✓ Burnley: 943e8050
  ✓ Chelsea: cff3d9bb
  ✓ Crystal Palace: 47c64c55
  ✓ Everton: d3fd31cc
  ✓ Leicester City: a2d435b3
  ✓ Liverpool: 822bd0ba
  ✓ Manchester City: b8fd03ef
  ✓ Manchester Utd: 19538871
  ✓ Newcastle Utd: b2b47a98
  ✓ Norwich City: 1c781004
  ✓ Sheffield Utd: 1df6b87e
  ✓ Southampton: 33c895d4
  ✓ Tottenham: 361ca564
  ✓ Watford: 2abfe087
  ✓ West Ham: 7c21e445
  ✓ Wolves: 8cec06e1
Completed 2019-2020: 20 teams

Fetching teams for 2020-2021...
Found table: stats_squads_standard_for
Found tbody with 20 rows
Processing 20 rows...
  ✓ Arsenal: 18bb7c10
  ✓ Aston Villa: 8602292d
  ✓ Brighton: d07537b9
  ✓ Burnley: 943e8050
  ✓ Chelsea: cff3d9bb
  ✓ Crystal Palace: 47c64c55
  ✓ Everton: d3fd31cc
  ✓ Fulham: fd962109
  ✓ Leeds United: 5bfb9659
  ✓ Leicester Ci

In [5]:
import json
from pathlib import Path

# Destination of the combined team mapping, relative to the notebook's
# working directory
output_path = Path('../../data/raw/all_teams.json')

# Sets are not JSON serializable, so aliases are converted to lists.
# They are sorted because set iteration order can differ between kernel
# sessions, which would otherwise make the saved file change from run to run.
serializable_teams = {}
for team_id, team_data in all_teams.items():
    serializable_teams[team_id] = {
        'team_name': team_data['team_name'],
        'team_id': team_data['team_id'],
        'seasons': team_data['seasons'],
        'aliases': sorted(team_data['aliases'])
    }

# Create the target directory if needed so the write does not fail on a
# fresh checkout
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save to JSON
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(serializable_teams, f, indent=2, ensure_ascii=False)