# Phase 3: Live Season Ingestion & Model Staging

**Goal:** Ingest the 2025/2026 season, merge with historical data, and partition into 3 ML-ready CSVs.

| Step | Cell | Description |
|------|------|-------------|
| Setup | Cell 1 | Load historical data, Elo ratings, and team mappings |
| Canary | Cell 2 | Verify 25/26 data is accessible |
| MH Fetch | Cell 3 | Download 25/26 MatchHistory results for all 5 leagues |
| Fixtures | Cell 4 | Download real future fixtures with scheduled dates |
| Understat | Cell 5 | Fetch 25/26 xG data and merge with MatchHistory |
| Elo + Split | Cell 6 | Lookup Elo ratings and partition into 3 CSVs |
| Sanity | Cell 7 | Validate row counts and data quality |

### Output Files
| File | Content |
|------|---------|
| `model_training.csv` | 10 seasons of historical data for model training |
| `current_season_banked.csv` | 25/26 completed matches (live table) |
| `future_schedule_features.csv` | 25/26 upcoming matches to simulate |

### Data Sources
| Source | Used For |
|--------|---------|
| football-data.co.uk | Match results + betting odds (played games) |
| fixturedownload.com | Future fixture schedule with real dates |
| Understat | Expected goals (xG) |
| ClubElo | Team strength ratings |

In [1]:
# Make the project root the working directory regardless of where the
# notebook kernel was started (IDE, terminal, or the notebooks/ folder).
import os
import sys


def find_project_root(cwd):
    """Return the project root implied by the working directory *cwd*.

    When *cwd* is a folder named exactly 'notebooks', its parent is the
    project root; otherwise *cwd* itself is returned. Comparing the final
    path component (rather than searching the whole path for the substring
    'notebooks') keeps paths such as '/home/me/notebooks_old/project' intact.
    """
    cwd = os.path.abspath(cwd)
    if os.path.basename(cwd) == 'notebooks':
        return os.path.dirname(cwd)
    return cwd


try:
    project_root = find_project_root(os.getcwd())
    os.chdir(project_root)
except OSError as exc:
    # Best-effort: keep running from the current directory, but say so
    # instead of failing silently.
    print(f'[WARN] Could not switch to the project root: {exc}')
else:
    if project_root not in sys.path:
        sys.path.append(project_root)

# =============================================================================
# Cell 1: Setup & Inputs
# =============================================================================

import pandas as pd
import numpy as np
import os
import time
import requests
import warnings
from io import StringIO

try:
    import soccerdata as sd
except ImportError:
    %pip install soccerdata
    import soccerdata as sd

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Data folders relative to the working directory; only the processed folder
# is created on demand.
raw_dir = os.path.join('data', 'raw')
processed_dir = os.path.join('data', 'processed')
os.makedirs(processed_dir, exist_ok=True)

# --- Historical Master Training Set (read-only input) ---
print('Loading historical data...')
master_path = os.path.join(processed_dir, 'Master_Training_Set.csv')
master_df = pd.read_csv(master_path)
# The on-disk size is recorded so Cell 7 can compare it after the outputs
# have been written.
master_size = os.path.getsize(master_path)
season_total = master_df['season'].nunique()
print(f'  Master_Training_Set.csv: {len(master_df):,} rows, {season_total} seasons')

# --- ClubElo rating history ---
print('Loading ClubElo data...')
elo_path = os.path.join(raw_dir, 'ClubElo_Master.csv')
elo_raw = pd.read_csv(elo_path)
print(f'  ClubElo_Master.csv: {len(elo_raw):,} rows')

# -------------------------------------------------------------------------
# Team-name mappings (carried over from Phase 2).
# Every mapping translates a source-specific spelling INTO the MatchHistory
# spelling, which is the canonical team name used throughout this notebook.
# -------------------------------------------------------------------------

# Understat spelling -> MatchHistory spelling, grouped by league prefix.
understat_to_mh = {
    # ENG
    'Manchester City': 'Man City',
    'Manchester United': 'Man United',
    'Newcastle United': 'Newcastle',
    'Nottingham Forest': "Nott'm Forest",
    'West Bromwich Albion': 'West Brom',
    'Wolverhampton Wanderers': 'Wolves',
    'Queens Park Rangers': 'QPR',
    # ESP
    'Athletic Club': 'Ath Bilbao',
    'Atletico Madrid': 'Ath Madrid',
    'Real Betis': 'Betis',
    'Celta Vigo': 'Celta',
    'Espanyol': 'Espanol',
    'SD Huesca': 'Huesca',
    'Deportivo La Coruna': 'La Coruna',
    'Real Sociedad': 'Sociedad',
    'Sporting Gijon': 'Sp Gijon',
    'Real Valladolid': 'Valladolid',
    'Rayo Vallecano': 'Vallecano',
    # GER
    'Arminia Bielefeld': 'Bielefeld',
    'Borussia Dortmund': 'Dortmund',
    'Eintracht Frankfurt': 'Ein Frankfurt',
    'FC Cologne': 'FC Koln',
    'Fortuna Duesseldorf': 'Fortuna Dusseldorf',
    'Hamburger SV': 'Hamburg',
    'Hannover 96': 'Hannover',
    'FC Heidenheim': 'Heidenheim',
    'Hertha Berlin': 'Hertha',
    'Bayer Leverkusen': 'Leverkusen',
    'Borussia M.Gladbach': "M'gladbach",
    'Mainz 05': 'Mainz',
    'Nuernberg': 'Nurnberg',
    'RasenBallsport Leipzig': 'RB Leipzig',
    'St. Pauli': 'St Pauli',
    'VfB Stuttgart': 'Stuttgart',
    # ITA
    'AC Milan': 'Milan',
    'SPAL 2013': 'Spal',
    'Parma Calcio 1913': 'Parma',
    # FRA
    'GFC Ajaccio': 'Ajaccio GFCO',
    'SC Bastia': 'Bastia',
    'Clermont Foot': 'Clermont',
    'Paris Saint Germain': 'Paris SG',
    'Saint-Etienne': 'St Etienne',
}

# ClubElo spelling -> MatchHistory spelling. Applied to the ClubElo 'team'
# column further down in this cell.
clubelo_to_mh = {
    'Atletico': 'Ath Madrid',
    'Bilbao': 'Ath Bilbao',
    'Bayern': 'Bayern Munich',
    'Frankfurt': 'Ein Frankfurt',
    'Espanyol': 'Espanol',
    'Gladbach': "M'gladbach",
    'Forest': "Nott'm Forest",
    'Koeln': 'FC Koln',
    'Duesseldorf': 'Fortuna Dusseldorf',
    'Holstein': 'Holstein Kiel',
    'Depor': 'La Coruna',
    'Nuernberg': 'Nurnberg',
    'Gijon': 'Sp Gijon',
    'Rayo Vallecano': 'Vallecano',
    'Werder': 'Werder Bremen',
    'Evian TG': 'Evian Thonon Gaillard',
    'Schalke': 'Schalke 04',
}

# --- fixturedownload.com spelling -> MatchHistory spelling ---
# Several clubs are listed twice, once with the accented spelling (written
# as a \u escape) and once with a plain-ASCII spelling, so either form found
# in the download maps to the same MatchHistory name.
fixture_to_mh = {
    # ENG
    'Man Utd': 'Man United',
    'Spurs': 'Tottenham',
    # ESP
    'Athletic Club': 'Ath Bilbao',
    'Atletico de Madrid': 'Ath Madrid',
    'Atl\u00e9tico de Madrid': 'Ath Madrid',
    'CA Osasuna': 'Osasuna',
    'Deportivo Alaves': 'Alaves',
    'Deportivo Alav\u00e9s': 'Alaves',
    'Elche CF': 'Elche',
    'FC Barcelona': 'Barcelona',
    'Getafe CF': 'Getafe',
    'Girona FC': 'Girona',
    'Levante UD': 'Levante',
    'RCD Espanyol de Barcelona': 'Espanol',
    'RCD Mallorca': 'Mallorca',
    'Rayo Vallecano': 'Vallecano',
    'Real Betis': 'Betis',
    'Real Oviedo': 'Oviedo',
    'Real Sociedad': 'Sociedad',
    'Sevilla FC': 'Sevilla',
    'Valencia CF': 'Valencia',
    'Villarreal CF': 'Villarreal',
    # GER
    '1. FC Heidenheim 1846': 'Heidenheim',
    '1. FC K\u00f6ln': 'FC Koln',
    '1. FC Koln': 'FC Koln',
    '1. FC Union Berlin': 'Union Berlin',
    '1. FSV Mainz 05': 'Mainz',
    'Bayer 04 Leverkusen': 'Leverkusen',
    'Borussia Dortmund': 'Dortmund',
    'Borussia M\u00f6nchengladbach': "M'gladbach",
    'Borussia Monchengladbach': "M'gladbach",
    'Eintracht Frankfurt': 'Ein Frankfurt',
    'FC Augsburg': 'Augsburg',
    'FC Bayern M\u00fcnchen': 'Bayern Munich',
    'FC Bayern Munchen': 'Bayern Munich',
    'FC St. Pauli': 'St Pauli',
    'Hamburger SV': 'Hamburg',
    'SV Werder Bremen': 'Werder Bremen',
    'Sport-Club Freiburg': 'Freiburg',
    'TSG Hoffenheim': 'Hoffenheim',
    'VfB Stuttgart': 'Stuttgart',
    'VfL Wolfsburg': 'Wolfsburg',
    # ITA
    'Hellas Verona': 'Verona',
    # FRA
    'AJ Auxerre': 'Auxerre',
    'AS Monaco': 'Monaco',
    'Angers SCO': 'Angers',
    'FC Lorient': 'Lorient',
    'FC Metz': 'Metz',
    'FC Nantes': 'Nantes',
    'Havre Athletic Club': 'Le Havre',
    'LOSC Lille': 'Lille',
    'OGC Nice': 'Nice',
    'Olympique Lyonnais': 'Lyon',
    'Olympique de Marseille': 'Marseille',
    'Paris Saint-Germain': 'Paris SG',
    'RC Lens': 'Lens',
    'RC Strasbourg Alsace': 'Strasbourg',
    'Stade Brestois 29': 'Brest',
    'Stade Rennais FC': 'Rennes',
    'Toulouse FC': 'Toulouse',
}

# Rewrite the ClubElo team names to the MatchHistory spelling straight away.
elo_raw['team'] = elo_raw['team'].replace(clubelo_to_mh)

# --- Request headers sent to football-data.co.uk ---
browser_headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.football-data.co.uk/',
}

# --- League configuration ---
# League name -> file code used in the football-data.co.uk download URL.
league_codes = {
    'ENG-Premier League': 'E0',
    'ESP-La Liga': 'SP1',
    'GER-Bundesliga': 'D1',
    'ITA-Serie A': 'I1',
    'FRA-Ligue 1': 'F1',
}

# League name -> fixturedownload.com schedule URL; only the league slug
# differs between the five URLs.
fixture_slugs = {
    'ENG-Premier League': 'epl',
    'ESP-La Liga': 'la-liga',
    'GER-Bundesliga': 'bundesliga',
    'ITA-Serie A': 'serie-a',
    'FRA-Ligue 1': 'ligue-1',
}
fixture_urls = {
    league: f'https://fixturedownload.com/download/{slug}-2025-GMTStandardTime.csv'
    for league, slug in fixture_slugs.items()
}

# Season folder and base URL for the football-data.co.uk downloads.
season_code = '2526'
base_url = 'https://www.football-data.co.uk/mmz4281'

print('\nMappings loaded:')
for source_label, name_map in (
    ('Understat', understat_to_mh),
    ('ClubElo', clubelo_to_mh),
    ('Fixture', fixture_to_mh),
):
    print(f'  {len(name_map)} {source_label} -> MH')
print('Setup complete.')

Loading historical data...
  Master_Training_Set.csv: 19,837 rows, 10 seasons
Loading ClubElo data...
  ClubElo_Master.csv: 706,284 rows

Mappings loaded:
  42 Understat -> MH
  17 ClubElo -> MH
  60 Fixture -> MH
Setup complete.


In [2]:
# =============================================================================
# Cell 2: Canary Probe -- Verify 25/26 Data Exists
# =============================================================================
# Downloads the E0 file for the configured season before the full
# multi-league fetch. Any failure (HTTP error, unparsable or empty CSV,
# missing columns) stops the notebook with SystemExit.

print('Running canary probe for 2025/2026 season...')

canary_url = f'{base_url}/{season_code}/E0.csv'
try:
    resp = requests.get(canary_url, headers=browser_headers, timeout=30)
    resp.raise_for_status()
    canary_df = pd.read_csv(StringIO(resp.text))
    # Explicit check rather than `assert`, which is removed when Python runs
    # with -O and is meant for internal invariants, not data validation.
    if len(canary_df) == 0:
        raise ValueError('Canary CSV is empty')
    print(f'  [OK] EPL 25/26 data available: {len(canary_df)} matches found')
    print(f'  Sample: {canary_df["HomeTeam"].iloc[0]} vs {canary_df["AwayTeam"].iloc[0]} on {canary_df["Date"].iloc[0]}')
except Exception as e:
    print(f'  [FAIL] API Data Unavailable: {e}')
    print('  Cannot proceed without current season data.')
    # Chain the original error so the root cause stays attached to the exit.
    raise SystemExit('Canary probe failed -- 25/26 data not available.') from e

print('\nCanary probe passed. Proceeding with full download.')

Running canary probe for 2025/2026 season...
  [OK] EPL 25/26 data available: 260 matches found
  Sample: Liverpool vs Bournemouth on 15/08/2025

Canary probe passed. Proceeding with full download.


In [3]:
# =============================================================================
# Cell 3: Live MatchHistory Fetch (25/26 played results)
# =============================================================================
# football-data.co.uk only includes completed matches with results and odds.
# Future fixtures come from fixturedownload.com in Cell 4.
#
# Produces `mh_played`: one row per completed match across all leagues whose
# download succeeded, with standardized team columns plus `league`, `season`,
# `date` and `date_norm`.

print('Downloading 25/26 MatchHistory results...')
print('=' * 50)

mh_frames = []
failed_leagues = []  # leagues whose download or parsing raised

for league_name, code in league_codes.items():
    url = f'{base_url}/{season_code}/{code}.csv'
    print(f'\n  Fetching {league_name} ({code})...')
    try:
        resp = requests.get(url, headers=browser_headers, timeout=30)
        resp.raise_for_status()
        df = pd.read_csv(StringIO(resp.text))

        # Clean trailing blank rows
        df = df.dropna(how='all')
        df = df.dropna(subset=['HomeTeam', 'AwayTeam'])

        # Rename to standard format
        df = df.rename(columns={'HomeTeam': 'home_team', 'AwayTeam': 'away_team'})
        df['league'] = league_name
        df['season'] = 2526

        # Parse date (day-first); date_norm is the calendar date used as a
        # merge key in later cells.
        df['date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True)
        df['date_norm'] = df['date'].dt.date

        print(f'    [OK] {len(df)} completed matches')
        mh_frames.append(df)
        time.sleep(1)  # pause between successive requests
    except Exception as e:
        print(f'    [FAIL] {e}')
        failed_leagues.append(league_name)

# pd.concat([]) raises an uninformative "No objects to concatenate"; fail
# with a message that names the real problem instead.
if not mh_frames:
    raise ValueError(
        'No 25/26 MatchHistory data could be downloaded for any league; '
        'see the [FAIL] messages above.'
    )
if failed_leagues:
    print(f'\n  [WARN] Leagues missing from this run: {", ".join(failed_leagues)}')

mh_played = pd.concat(mh_frames, ignore_index=True)
print(f'\nTotal played matches: {len(mh_played):,}')

Downloading 25/26 MatchHistory results...

  Fetching ENG-Premier League (E0)...
    [OK] 260 completed matches

  Fetching ESP-La Liga (SP1)...
    [OK] 228 completed matches

  Fetching GER-Bundesliga (D1)...
    [OK] 188 completed matches

  Fetching ITA-Serie A (I1)...
    [OK] 239 completed matches

  Fetching FRA-Ligue 1 (F1)...
    [OK] 189 completed matches

Total played matches: 1,104


In [4]:
# =============================================================================
# Cell 4: Future Fixtures (real scheduled dates)
# =============================================================================
# fixturedownload.com provides full season schedules with real dates.
# We download the full schedule, apply name mapping, filter to only future
# fixtures (rows without a Result), then deduplicate against played matches.
#
# Produces `future_fixtures` (unplayed rows only) and `mh_2025` (played rows
# from Cell 3 plus the unplayed fixtures, one row per league/home/away).

print('Downloading future fixtures from fixturedownload.com...')
print('=' * 50)

# Expected total matches per league (teams * (teams-1))
expected_total = {
    'ENG-Premier League': 380,  # 20 teams
    'ESP-La Liga':        380,  # 20 teams
    'GER-Bundesliga':     306,  # 18 teams
    'ITA-Serie A':        380,  # 20 teams
    'FRA-Ligue 1':        306,  # 18 teams
}

# Column layout of every future-fixture frame; also used to build an empty
# frame when no future fixtures are available.
fixture_columns = [
    'home_team', 'away_team', 'date', 'date_norm',
    'league', 'season', 'FTHG', 'FTAG', 'FTR',
]

fixture_frames = []

for league_name, url in fixture_urls.items():
    print(f'\n  Fetching {league_name}...')
    try:
        resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        resp.raise_for_status()
        df = pd.read_csv(StringIO(resp.text))

        # Apply name mapping
        df['Home Team'] = df['Home Team'].replace(fixture_to_mh)
        df['Away Team'] = df['Away Team'].replace(fixture_to_mh)

        # Parse date. dayfirst=True matches the Cell 3 parsing; without it,
        # format='mixed' reads ambiguous values such as 03/04/2026 month-first
        # while unambiguous ones such as 16/08/2025 are read day-first.
        # NOTE(review): fixturedownload.com CSVs use DD/MM/YYYY HH:MM --
        # confirm against a downloaded file.
        df['date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True)
        df['date_norm'] = df['date'].dt.date

        # Separate played vs future based on Result column
        future = df[df['Result'].isna()].copy()
        played_fixture = df[df['Result'].notna()].copy()

        expected = expected_total.get(league_name, '?')
        print(f'    Source total: {len(df)} (expected: {expected})')
        print(f'    Played: {len(played_fixture)} | Future: {len(future)}')

        if len(future) > 0:
            future_clean = pd.DataFrame({
                'home_team': future['Home Team'].values,
                'away_team': future['Away Team'].values,
                'date': future['date'].values,
                'date_norm': future['date_norm'].values,
                'league': league_name,
                'season': 2526,
                'FTHG': np.nan,
                'FTAG': np.nan,
                'FTR': np.nan,
            })
            fixture_frames.append(future_clean)

        time.sleep(0.5)
    except Exception as e:
        print(f'    [FAIL] {e}')

# fixture_frames is empty when every download failed or when no league has
# unplayed fixtures left; pd.concat([]) would raise in that case.
if fixture_frames:
    future_fixtures = pd.concat(fixture_frames, ignore_index=True)
else:
    print('\n  [WARN] No future fixtures found (season complete or all downloads failed).')
    future_fixtures = pd.DataFrame(columns=fixture_columns)
print(f'\nRaw future fixtures: {len(future_fixtures):,}')

# --- Cross-check: verify fixture team names match MH team names ---
# Promoted teams (Oviedo, Paris FC, Pisa) may only appear in fixtures,
# not in MH played data if they haven't played yet in the MH source.
played_teams = set(mh_played['home_team'].unique()) | set(mh_played['away_team'].unique())
future_teams = set(future_fixtures['home_team'].unique()) | set(future_fixtures['away_team'].unique())
orphans = sorted(future_teams - played_teams)

if orphans:
    print(f'\n  [INFO] {len(orphans)} teams only in fixtures (promoted/new):')
    for t in orphans:
        print(f'    - {t}')
else:
    print(f'\n  [OK] All fixture team names match MH format ({len(future_teams)} teams verified)')

# --- Combine played + future into one 25/26 dataset ---
# An empty fixture frame is left out of the concat so it cannot influence the
# column dtypes of the played rows.
season_parts = [mh_played]
if len(future_fixtures) > 0:
    season_parts.append(future_fixtures)
mh_2025 = pd.concat(season_parts, ignore_index=True)
before_dedup = len(mh_2025)

# --- Deduplicate: if a fixture appears in both MH (played) and fixture list, ---
# --- keep the MH version which has results, odds, and stats.                ---
mh_2025 = mh_2025.drop_duplicates(
    subset=['league', 'home_team', 'away_team'],
    keep='first'  # played rows from mh_played come first in the concat
)
dupes_removed = before_dedup - len(mh_2025)

print(f'\nDeduplication: removed {dupes_removed} overlapping fixtures')

# --- Per-league validation ---
print(f'\nPer-league totals after dedup:')
for league in sorted(league_codes.keys()):
    mask = mh_2025['league'] == league
    total = mask.sum()
    played_n = (mask & mh_2025['FTHG'].notna()).sum()
    future_n = (mask & mh_2025['FTHG'].isna()).sum()
    expected = expected_total.get(league, '?')
    status = '[OK]' if total == expected else f'[WARN: expected {expected}]'
    print(f'  {league:25s} {total:>4} total ({played_n} played + {future_n} future) {status}')

print(f'\nCombined 25/26 dataset: {len(mh_2025):,} rows')
print(f'  Played:  {mh_2025["FTHG"].notna().sum():,}')
print(f'  Future:  {mh_2025["FTHG"].isna().sum():,}')

Downloading future fixtures from fixturedownload.com...

  Fetching ENG-Premier League...
    Source total: 380 (expected: 380)
    Played: 260 | Future: 120

  Fetching ESP-La Liga...
    Source total: 380 (expected: 380)
    Played: 219 | Future: 161

  Fetching GER-Bundesliga...
    Source total: 306 (expected: 306)
    Played: 188 | Future: 118

  Fetching ITA-Serie A...
    Source total: 380 (expected: 380)
    Played: 230 | Future: 150

  Fetching FRA-Ligue 1...
    Source total: 306 (expected: 306)
    Played: 180 | Future: 126

Raw future fixtures: 675

  [OK] All fixture team names match MH format (96 teams verified)

Deduplication: removed 27 overlapping fixtures

Per-league totals after dedup:
  ENG-Premier League         380 total (260 played + 120 future) [OK]
  ESP-La Liga                380 total (228 played + 152 future) [OK]
  FRA-Ligue 1                306 total (189 played + 117 future) [OK]
  GER-Bundesliga             306 total (188 played + 118 future) [OK]
  ITA-

In [5]:
# =============================================================================
# Cell 5: Understat Fetch + Merge with MatchHistory
# =============================================================================
# Adds `home_xg` / `away_xg` to `mh_2025` by joining the Understat schedule on
# (date_norm, home_team, away_team). When nothing can be fetched, both
# columns are created as NaN so later cells still find them.

print('Fetching 25/26 Understat data...')
print('=' * 50)

us_frames = []
for league_name in league_codes.keys():
    print(f'\n  Fetching {league_name}...')
    try:
        us = sd.Understat(leagues=league_name, seasons='2025')
        df = us.read_schedule()
        if df is not None and len(df) > 0:
            us_frames.append(df)
            print(f'    [OK] {len(df)} matches')
        time.sleep(1)
    except Exception as e:
        # A league that fails is skipped; the merge below simply leaves its
        # xG values empty.
        print(f'    [WARN] {e}')

if us_frames:
    us_2025 = pd.concat(us_frames, ignore_index=True)
    print(f'\nTotal Understat: {len(us_2025):,} matches')

    # Apply name mapping
    us_2025['home_team'] = us_2025['home_team'].replace(understat_to_mh)
    us_2025['away_team'] = us_2025['away_team'].replace(understat_to_mh)

    # Normalize dates
    us_2025['date_norm'] = pd.to_datetime(us_2025['date'], format='mixed').dt.date

    # Select merge columns
    us_merge = us_2025[['date_norm', 'home_team', 'away_team', 'home_xg', 'away_xg']].copy()
    us_merge = us_merge.drop_duplicates(subset=['date_norm', 'home_team', 'away_team'])

    # Remove xG columns left behind by an earlier run of this cell; otherwise
    # the merge would append suffixed duplicates ('home_xg_us', 'away_xg_us')
    # and keep the stale values under the unsuffixed names.
    mh_2025 = mh_2025.drop(
        columns=['home_xg', 'away_xg', 'home_xg_us', 'away_xg_us'],
        errors='ignore',
    )

    # Left-merge xG onto every 25/26 row (played and future); rows without an
    # Understat match on the three keys keep NaN.
    print(f'\nMerging xG data on [date, home_team, away_team]...')
    mh_2025 = mh_2025.merge(
        us_merge,
        on=['date_norm', 'home_team', 'away_team'],
        how='left',
        suffixes=('', '_us')
    )

    # Coverage is reported for played matches only (rows with a home score).
    played_mask = mh_2025['FTHG'].notna()
    xg_matched = mh_2025.loc[played_mask, 'home_xg'].notna().sum()
    played_count = played_mask.sum()
    print(f'  xG matched: {xg_matched} / {played_count} played matches ({xg_matched/max(played_count,1)*100:.1f}%)')
else:
    print('\n  [WARN] No Understat data retrieved. Continuing without xG.')
    mh_2025['home_xg'] = np.nan
    mh_2025['away_xg'] = np.nan

print('\nUnderstat merge complete.')

Fetching 25/26 Understat data...

  Fetching ENG-Premier League...


[2026-02-13 15:25:10] INFO     TLSLibrary:_load_library:397 - Successfully loaded TLS library: C:\Users\ljega\AppData\Local\Programs\Python\Python312\Lib\site-packages\tls_requests\bin\tls-client-xgo-1.13.1-windows-amd64.dll


    [OK] 380 matches

  Fetching ESP-La Liga...


    [OK] 380 matches

  Fetching GER-Bundesliga...


    [OK] 306 matches

  Fetching ITA-Serie A...


    [OK] 380 matches

  Fetching FRA-Ligue 1...


    [OK] 306 matches

Total Understat: 1,752 matches

Merging xG data on [date, home_team, away_team]...
  xG matched: 1078 / 1104 played matches (97.6%)

Understat merge complete.


In [6]:
# =============================================================================
# Cell 6: ClubElo Lookup + Grand Split
# =============================================================================
# 1. Attaches `home_elo`, `away_elo` and `elo_diff` to every 25/26 row.
# 2. Concatenates the historical master set with the 25/26 rows and writes
#    three CSVs: historical training rows, played 25/26 rows, and unplayed
#    25/26 rows.


def _lookup_elo(matches, team_col, ratings):
    """Return, for each match row, the latest rating dated on or before it.

    Args:
        matches: DataFrame with a 'match_date' column and the column
            named by `team_col`.
        team_col: Name of the column holding the team to rate.
        ratings: DataFrame with 'team', 'elo_date' and 'elo' columns,
            sorted by 'elo_date'.

    Returns:
        Series of ratings carrying the same index labels as `matches`;
        NaN where the team has no rating dated on or before the match.
    """
    keys = (
        matches[['match_date', team_col]]
        .rename(columns={team_col: 'team'})
        .sort_values('match_date')   # merge_asof needs the left key sorted
        .rename_axis('_row')         # fixed label for the preserved row index
        .reset_index()
    )
    merged = pd.merge_asof(
        keys,
        ratings,
        left_on='match_date',
        right_on='elo_date',
        by='team',
        direction='backward'
    )
    # Restore the original row labels so assignment aligns with `matches`.
    return merged.set_index('_row')['elo']


print('Looking up Elo ratings for 25/26 matches...')
print('=' * 50)

# --- Prepare Elo data ---
# Each rating row is dated by its 'to' column; rows with an unparsable date
# or a missing team/rating are dropped.
elo_raw['elo_date'] = pd.to_datetime(elo_raw['to'], format='mixed', errors='coerce')
elo_clean = elo_raw.dropna(subset=['elo_date', 'team', 'elo']).copy()
elo_clean = elo_clean[['team', 'elo_date', 'elo']].copy()
elo_clean = elo_clean.sort_values('elo_date').reset_index(drop=True)

# --- Ensure match_date is datetime ---
mh_2025['match_date'] = pd.to_datetime(mh_2025['date_norm'])

# --- Home Elo ---
print('  Looking up Home Elo...')
mh_2025['home_elo'] = _lookup_elo(mh_2025, 'home_team', elo_clean)

# --- Away Elo ---
print('  Looking up Away Elo...')
mh_2025['away_elo'] = _lookup_elo(mh_2025, 'away_team', elo_clean)

# --- Fallback: use latest known Elo for any remaining NaN ---
latest_elo = elo_clean.sort_values('elo_date').groupby('team').last()['elo']

missing_home = mh_2025['home_elo'].isna()
missing_away = mh_2025['away_elo'].isna()

for side_label, elo_col, team_col, missing in (
    ('Home', 'home_elo', 'home_team', missing_home),
    ('Away', 'away_elo', 'away_team', missing_away),
):
    if missing.any():
        mh_2025.loc[missing, elo_col] = mh_2025.loc[missing, team_col].map(latest_elo)
        # Report how many gaps were actually filled; teams absent from the
        # Elo table stay NaN.
        filled = int(mh_2025.loc[missing, elo_col].notna().sum())
        print(f'  Filled {filled} of {int(missing.sum())} missing {side_label} Elo with latest known ratings')

mh_2025['elo_diff'] = mh_2025['home_elo'] - mh_2025['away_elo']

# A row counts as covered only when both sides have a rating.
elo_matched = (mh_2025['home_elo'].notna() & mh_2025['away_elo'].notna()).sum()
print(f'\n  Elo coverage: {elo_matched}/{len(mh_2025)} ({elo_matched/max(len(mh_2025), 1)*100:.1f}%)')

# Drop temp columns
mh_2025 = mh_2025.drop(columns=['match_date'], errors='ignore')

# =========================================================================
# GRAND SPLIT
# =========================================================================
print('\nPerforming Grand Split...')
print('=' * 50)

# Combine historical + current; all three outputs are slices of this frame,
# so they share one column layout.
combined = pd.concat([master_df, mh_2025], ignore_index=True)

# --- File A: model_training.csv (historical only) ---
training = combined[combined['season'] != 2526].copy()
training_path = os.path.join(processed_dir, 'model_training.csv')
training.to_csv(training_path, index=False)
print(f'  [A] model_training.csv:           {len(training):>6,} rows -> {training_path}')

# --- File B: current_season_banked.csv (played 25/26 games) ---
banked = combined[(combined['season'] == 2526) & (combined['FTHG'].notna())].copy()
banked_path = os.path.join(processed_dir, 'current_season_banked.csv')
banked.to_csv(banked_path, index=False)
print(f'  [B] current_season_banked.csv:    {len(banked):>6,} rows -> {banked_path}')

# --- File C: future_schedule_features.csv (upcoming 25/26 games) ---
future = combined[(combined['season'] == 2526) & (combined['FTHG'].isna())].copy()
future_path = os.path.join(processed_dir, 'future_schedule_features.csv')
future.to_csv(future_path, index=False)
print(f'  [C] future_schedule_features.csv: {len(future):>6,} rows -> {future_path}')

# --- Critical check: Elo completeness for future games ---
if len(future) > 0:
    elo_null = future[['home_elo', 'away_elo']].isna().any(axis=1).sum()
    if elo_null > 0:
        print(f'\n  [WARN] {elo_null} future games missing Elo ratings!')
        missing_teams = set()
        missing_teams.update(future.loc[future['home_elo'].isna(), 'home_team'].unique())
        missing_teams.update(future.loc[future['away_elo'].isna(), 'away_team'].unique())
        for t in sorted(missing_teams):
            print(f'      - {t}')
    else:
        print(f'\n  [OK] All {len(future)} future games have Elo ratings.')

print('\nGrand Split complete.')

Looking up Elo ratings for 25/26 matches...
  Looking up Home Elo...
  Looking up Away Elo...

  Elo coverage: 1752/1752 (100.0%)

Performing Grand Split...
  [A] model_training.csv:           19,837 rows -> data\processed\model_training.csv
  [B] current_season_banked.csv:     1,104 rows -> data\processed\current_season_banked.csv
  [C] future_schedule_features.csv:    648 rows -> data\processed\future_schedule_features.csv

  [OK] All 648 future games have Elo ratings.

Grand Split complete.


In [7]:
# =============================================================================
# Cell 7: Sanity Check
# =============================================================================
# Prints row counts and pass/fail flags for the three frames written in
# Cell 6, checks that the master input file still exists with its original
# size, and lists per-league counts and output file sizes.

print('PHASE 3 SANITY CHECK')
print('=' * 60)

# --- Row counts ---
print(f'\n  Training:  {len(training):>6,} rows  (expected: 18,000+)')
print(f'  Banked:    {len(banked):>6,} rows  (expected: 600-1200)')
print(f'  Future:    {len(future):>6,} rows  (expected: 100-500)')
print(f'  --------------------------------')
print(f'  Combined:  {len(training) + len(banked) + len(future):>6,} rows')

# --- Validate training data ---
training_ok = len(training) >= 18000
print(f'\n  Training 18k+ rows:  {"[PASS]" if training_ok else "[FAIL]"}')

# --- Validate banked data ---
banked_ok = len(banked) >= 100
print(f'  Banked has results:  {"[PASS]" if banked_ok else "[WARN]"} ({len(banked)} rows)')

# --- Validate future data ---
future_ok = len(future) > 0
print(f'  Future has matches:  {"[PASS]" if future_ok else "[FAIL]"} ({len(future)} rows)')

# --- Elo quality for future ---
if len(future) > 0:
    elo_complete = future['home_elo'].notna().all() and future['away_elo'].notna().all()
    print(f'  Future Elo complete: {"[PASS]" if elo_complete else "[FAIL]"}')

# --- Safety: Master not overwritten ---
master_still_exists = os.path.exists(master_path)
# Only stat the file when it exists: os.path.getsize raises on a missing
# path, which would abort the cell instead of reporting [FAIL].
master_same_size = master_still_exists and os.path.getsize(master_path) == master_size
print(f'  Master untouched:    {"[PASS]" if (master_still_exists and master_same_size) else "[FAIL]"}')

# --- League breakdown ---
print(f'\n  Banked league breakdown:')
for league in sorted(banked['league'].unique()):
    count = (banked['league'] == league).sum()
    print(f'    {league:25s} {count:>5} matches')

if len(future) > 0:
    print(f'\n  Future league breakdown:')
    for league in sorted(future['league'].unique()):
        count = (future['league'] == league).sum()
        print(f'    {league:25s} {count:>5} matches')

# --- Sample future fixture ---
if len(future) > 0:
    print(f'\n  Sample future fixtures:')
    sample = future[['league', 'date_norm', 'home_team', 'away_team', 'home_elo', 'away_elo']].head(5)
    print(sample.to_string(index=False))

# --- Output file sizes ---
print(f'\n  Output files:')
for name, path in [('model_training', training_path), ('current_season_banked', banked_path), ('future_schedule_features', future_path)]:
    size_mb = os.path.getsize(path) / (1024 * 1024)
    print(f'    {name:35s} {size_mb:>6.1f} MB')

print(f'\n{"=" * 60}')
print('Phase 3 complete. Data is staged for model training.')

PHASE 3 SANITY CHECK

  Training:  19,837 rows  (expected: 18,000+)
  Banked:     1,104 rows  (expected: 600-1200)
  Future:       648 rows  (expected: 100-500)
  --------------------------------
  Combined:  21,589 rows

  Training 18k+ rows:  [PASS]
  Banked has results:  [PASS] (1104 rows)
  Future has matches:  [PASS] (648 rows)
  Future Elo complete: [PASS]
  Master untouched:    [PASS]

  Banked league breakdown:
    ENG-Premier League          260 matches
    ESP-La Liga                 228 matches
    FRA-Ligue 1                 189 matches
    GER-Bundesliga              188 matches
    ITA-Serie A                 239 matches

  Future league breakdown:
    ENG-Premier League          120 matches
    ESP-La Liga                 152 matches
    FRA-Ligue 1                 117 matches
    GER-Bundesliga              118 matches
    ITA-Serie A                 141 matches

  Sample future fixtures:
            league  date_norm   home_team   away_team    home_elo    away_elo
ENG-