# Team ID Mapping for Inference

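This notebook scrapes the current Premier League season table on FBRef and builds a mapping from team name to FBRef team ID for use at inference time. It covers only the current season's active teams, which are the ones needed for making predictions.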

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from urllib.parse import urljoin, urlparse
import json

# Request headers sent with every fetch so the scraper presents itself
# as an ordinary desktop browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def get_page(url, timeout=30):
    """Fetch a page with rate limiting and return it parsed as HTML.

    Sleeps for a random 2-4 seconds before each request so repeated calls
    do not hammer the remote site, then downloads ``url`` using the
    module-level ``headers``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        A ``BeautifulSoup`` tree built with ``'html.parser'``, or ``None``
        when the request fails (connection error, timeout, or an HTTP error
        status). Failures are reported with ``print`` rather than raised.
    """
    time.sleep(random.uniform(2, 4))  # Be respectful - random delay

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they share the error path.
        response.raise_for_status()
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching {url}: {e}")
        return None
    else:
        return BeautifulSoup(response.content, 'html.parser')

  from pandas.core import (


In [16]:
# Extract teams from 2025-2026 season - CURRENT SEASON TABLE
def extract_teams_2025_2026():
    """
    Build a team-name -> team-info mapping from the 2025-2026 Premier League table.

    Downloads https://fbref.com/en/comps/9/Premier-League-Stats via get_page,
    locates the table whose id is 'results2025-202691_overall', and reads the
    link inside each row's td[data-stat='team'] cell. Progress is printed
    along the way.

    Returns:
        dict keyed by the team link text. Each value is a dict with
        'team_id' (the hex segment after '/squads/' in the link),
        'season' ("2025-2026") and 'href' (the raw link target).
        An empty dict is returned when the page cannot be fetched or the
        table is missing.
    """
    season = "2025-2026"
    url = "https://fbref.com/en/comps/9/Premier-League-Stats"
    print(f"Fetching teams for {season}...")

    page = get_page(url)
    if not page:
        return {}

    # Current-season standings table
    standings = page.find('table', {'id': 'results2025-202691_overall'})
    if not standings:
        print(f"Table 'results2025-202691_overall' not found for {season}")
        return {}

    print(f"Found table: {standings.get('id', 'Unknown ID')}")

    body = standings.find('tbody')
    if not body:
        candidate_rows = standings.find_all('tr')
        print(f"No tbody found, found {len(candidate_rows)} rows directly in table")
        # Without a tbody the header rows are mixed in; keep rows that have a td
        candidate_rows = [tr for tr in candidate_rows if tr.find('td')]
        print(f"After filtering header rows: {len(candidate_rows)} data rows")
    else:
        candidate_rows = body.find_all('tr')
        print(f"Found tbody with {len(candidate_rows)} rows")

    print(f"Processing {len(candidate_rows)} rows...")

    squad_id_pattern = re.compile(r'/squads/([a-f0-9]+)/')
    teams = {}

    for tr in candidate_rows:
        # The team cell is the td carrying data-stat='team'
        cell = tr.find('td', {'data-stat': 'team'})
        anchor = cell.find('a') if cell else None
        if not anchor:
            continue

        name = anchor.text.strip()
        href = anchor.get('href')

        # Only squad links carry a team ID
        if not href or '/squads/' not in href:
            continue

        found = squad_id_pattern.search(href)
        if found is None:
            continue

        squad_id = found.group(1)
        teams[name] = {
            'team_id': squad_id,
            'season': season,
            'href': href
        }
        print(f"  {name}: {squad_id}")

    return teams

In [None]:
# Run the scraper and report how many teams were collected
teams_2025_2026 = extract_teams_2025_2026()
print("\nTotal teams extracted for 2025-2026:", len(teams_2025_2026))

In [None]:
# import json

# # Reshape each entry for JSON serialization (all values are already plain strings)
# serializable_teams = {}
# for team_id, team_data in teams_2025_2026.items():
#     serializable_teams[team_id] = {
#         'team_name': team_id,
#         'team_id': team_data['team_id'],
#         'reference': team_data['href']  # href string as scraped from the team link
#     }

# #Save to JSON
# with open('../../../data/prod/inference/raw/all_teams.json', 'w',       
# encoding='utf-8') as f:
#     json.dump(serializable_teams, f, indent=2,        
# ensure_ascii=False)