# Data Extraction Pipeline - Big 5 European Leagues

**Phase 1: Data Foundation**

This notebook extracts match history and advanced statistics for the Big 5 European football leagues using the `soccerdata` library. Each league is processed in its own isolated cell, allowing independent execution, debugging, and data saving.

### Data Sources
| Source | Method | Description |
|--------|--------|-------------|
| **MatchHistory** | `read_games()` | Historic match results & betting odds from football-data.co.uk |
| **Understat** | `read_schedule()` | Match schedule with xG / advanced stats from understat.com |
| **ClubElo** | `read_team_history()` | Full Elo rating history per team from clubelo.com |

### Output
All CSVs are saved to `data/raw/` with a strict naming convention:
- `{League_Code}_MatchHistory.csv`
- `{League_Code}_Understat.csv`
- `ClubElo_Master.csv`

In [1]:
# =============================================================================
# Cell 1: Setup & Initialization
# =============================================================================

try:
    import soccerdata as sd
except ImportError:
    %pip install soccerdata
    import soccerdata as sd

import pandas as pd
import os
import time
import warnings
import glob

warnings.filterwarnings('ignore')

# --- Create output directory ---
OUTPUT_DIR = os.path.join('data', 'raw')
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f'Output directory ready: {os.path.abspath(OUTPUT_DIR)}')

# --- Define seasons (14/15 through 24/25 = 11 complete seasons) ---
SEASONS = [str(i) for i in range(2014, 2025)]
print(f'Seasons configured: {SEASONS[0]}/{int(SEASONS[0])+1} through {SEASONS[-1]}/{int(SEASONS[-1])+1}')

# --- League configuration ---
LEAGUES = {
    'ENG-Premier League': 'ENG_Premier_League',
    'ESP-La Liga':        'ESP_La_Liga',
    'GER-Bundesliga':     'GER_Bundesliga',
    'ITA-Serie A':        'ITA_Serie_A',
    'FRA-Ligue 1':        'FRA_Ligue_1',
}

def extract_league_data(league_id, league_code, seasons):
    """
    Extract MatchHistory and Understat data for a single league.
    Handles errors per-season so partial data is always saved.
    """
    mh_path = os.path.join(OUTPUT_DIR, f'{league_code}_MatchHistory.csv')
    us_path = os.path.join(OUTPUT_DIR, f'{league_code}_Understat.csv')
    
    # --- MatchHistory ---
    print(f'  Fetching MatchHistory for {league_id}...')
    mh_frames = []
    for season in seasons:
        try:
            mh = sd.MatchHistory(leagues=league_id, seasons=season)
            df = mh.read_games()
            if df is not None and len(df) > 0:
                mh_frames.append(df)
        except Exception as e:
            print(f'    [WARN] MatchHistory failed for {league_id} season {season}: {e}')
    
    if mh_frames:
        mh_all = pd.concat(mh_frames, ignore_index=False)
        mh_all.to_csv(mh_path)
        print(f'    [OK] Saved {len(mh_all)} rows -> {mh_path}')
    else:
        print(f'    [FAIL] No MatchHistory data retrieved for {league_id}')
    
    # --- Understat ---
    print(f'  Fetching Understat schedule for {league_id}...')
    us_frames = []
    for season in seasons:
        try:
            us = sd.Understat(leagues=league_id, seasons=season)
            df = us.read_schedule()
            if df is not None and len(df) > 0:
                us_frames.append(df)
        except Exception as e:
            print(f'    [WARN] Understat failed for {league_id} season {season}: {e}')
    
    if us_frames:
        us_all = pd.concat(us_frames, ignore_index=False)
        us_all.to_csv(us_path)
        print(f'    [OK] Saved {len(us_all)} rows -> {us_path}')
    else:
        print(f'    [FAIL] No Understat data retrieved for {league_id}')

print('\nSetup complete. Run the cells below in order.')

Output directory ready: c:\Users\ljega\Downloads\MSBA\Sports Analytics Project\data\raw
Seasons configured: 2014/2015 through 2024/2025

Setup complete. Run the cells below in order.


In [None]:
# =============================================================================
# Cell 1.5: Pre-download MatchHistory CSVs (bypass bot detection)
# =============================================================================
# football-data.co.uk blocks automated requests with 503 errors.
# This cell downloads CSVs using browser-like headers and saves them
# to soccerdata's cache so the library uses cached files instead.

import requests
from pathlib import Path

CACHE_DIR = Path.home() / 'soccerdata' / 'data' / 'MatchHistory'
CACHE_DIR.mkdir(parents=True, exist_ok=True)

BROWSER_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.football-data.co.uk/',
}

MH_LEAGUE_CODES = {
    'ENG-Premier League': 'E0',
    'ESP-La Liga':        'SP1',
    'GER-Bundesliga':     'D1',
    'ITA-Serie A':        'I1',
    'FRA-Ligue 1':        'F1',
}

def season_code(year):
    return f'{int(year) % 100:02d}{(int(year)+1) % 100:02d}'

BASE_URL = 'https://www.football-data.co.uk/mmz4281'

downloaded = 0
skipped = 0
failed = 0

for league_name, code in MH_LEAGUE_CODES.items():
    print(f'\nPre-downloading MatchHistory for {league_name} ({code})...')
    for season in SEASONS:
        sc = season_code(season)
        filename = f'{code}_{sc}.csv'
        filepath = CACHE_DIR / filename
        
        if filepath.exists() and filepath.stat().st_size > 100:
            skipped += 1
            continue
        
        url = f'{BASE_URL}/{sc}/{code}.csv'
        try:
            resp = requests.get(url, headers=BROWSER_HEADERS, timeout=30)
            resp.raise_for_status()
            filepath.write_bytes(resp.content)
            downloaded += 1
            print(f'    [OK] {filename} ({len(resp.content):,} bytes)')
            time.sleep(1)
        except Exception as e:
            failed += 1
            print(f'    [WARN] {filename}: {e}')

print(f'\nSummary: {downloaded} downloaded, {skipped} already cached, {failed} failed')
print(f'Cache location: {CACHE_DIR}')
print('\nMatchHistory cache is ready. Run the league cells below.')

In [None]:
# =============================================================================
# Cell 2: Premier League (ENG-Premier League)
# =============================================================================
time.sleep(2)  # Rate-limit guard

print('Extracting Premier League data...')
print('=' * 50)

extract_league_data(
    league_id='ENG-Premier League',
    league_code='ENG_Premier_League',
    seasons=SEASONS
)

print('\nPremier League extraction complete.')

In [None]:
# =============================================================================
# Cell 3: La Liga (ESP-La Liga)
# =============================================================================
time.sleep(2)  # Rate-limit guard

print('Extracting La Liga data...')
print('=' * 50)

extract_league_data(
    league_id='ESP-La Liga',
    league_code='ESP_La_Liga',
    seasons=SEASONS
)

print('\nLa Liga extraction complete.')

In [None]:
# =============================================================================
# Cell 4: Bundesliga (GER-Bundesliga)
# =============================================================================
time.sleep(2)  # Rate-limit guard

print('Extracting Bundesliga data...')
print('=' * 50)

extract_league_data(
    league_id='GER-Bundesliga',
    league_code='GER_Bundesliga',
    seasons=SEASONS
)

print('\nBundesliga extraction complete.')

In [None]:
# =============================================================================
# Cell 5: Serie A (ITA-Serie A)
# =============================================================================
time.sleep(2)  # Rate-limit guard

print('Extracting Serie A data...')
print('=' * 50)

extract_league_data(
    league_id='ITA-Serie A',
    league_code='ITA_Serie_A',
    seasons=SEASONS
)

print('\nSerie A extraction complete.')

In [None]:
# =============================================================================
# Cell 6: Ligue 1 (FRA-Ligue 1)
# =============================================================================
time.sleep(2)  # Rate-limit guard

print('Extracting Ligue 1 data...')
print('=' * 50)

extract_league_data(
    league_id='FRA-Ligue 1',
    league_code='FRA_Ligue_1',
    seasons=SEASONS
)

print('\nLigue 1 extraction complete.')

In [14]:
# =============================================================================
# Cell 7: ClubElo Aggregation
# =============================================================================
print('Fetching ClubElo ratings for all Big 5 leagues...')
print('=' * 50)

elo_path = os.path.join(OUTPUT_DIR, 'ClubElo_Master.csv')

# --- Name mapping: football-data.co.uk -> ClubElo ---
# These teams use abbreviated names in MatchHistory that don't match ClubElo.
CLUBELO_NAME_MAP = {
    'Ajaccio GFCO':          'Ajaccio',
    'Ath Bilbao':            'Bilbao',
    'Ath Madrid':            'Atletico',
    'Bayern Munich':         'Bayern',
    'Ein Frankfurt':         'Frankfurt',
    'Espanol':               'Espanyol',
    'Evian Thonon Gaillard': 'Evian TG',
    'FC Koln':               'Koeln',
    'Fortuna Dusseldorf':    'Duesseldorf',
    'Holstein Kiel':         'Holstein',
    'La Coruna':             'Depor',
    "M'gladbach":            'Gladbach',
    "Nott'm Forest":         'Forest',
    'Nurnberg':              'Nuernberg',
    'Sp Gijon':              'Gijon',
    'St Etienne':            'Saint-Etienne',
    'Vallecano':             'Rayo Vallecano',
    'Werder Bremen':         'Werder',
}

# Collect all unique team names from saved MatchHistory files
mh_files = glob.glob(os.path.join(OUTPUT_DIR, '*_MatchHistory.csv'))
raw_teams = set()

for f in mh_files:
    try:
        df = pd.read_csv(f)
        home_col = [c for c in df.columns if 'home' in c.lower() and 'team' in c.lower()]
        away_col = [c for c in df.columns if 'away' in c.lower() and 'team' in c.lower()]
        if home_col:
            raw_teams.update(df[home_col[0]].dropna().unique())
        if away_col:
            raw_teams.update(df[away_col[0]].dropna().unique())
    except Exception as e:
        print(f'  [WARN] Could not read {f}: {e}')

# Apply name mapping
all_teams = {CLUBELO_NAME_MAP.get(t, t) for t in raw_teams}
mapped_count = sum(1 for t in raw_teams if t in CLUBELO_NAME_MAP)
print(f'  Found {len(raw_teams)} unique teams ({mapped_count} names remapped for ClubElo)')

# Fetch ClubElo data per team
elo_frames = []
failed_teams = []

for i, team in enumerate(sorted(all_teams)):
    try:
        elo = sd.ClubElo()
        df = elo.read_team_history(team)
        if df is not None and len(df) > 0:
            elo_frames.append(df)
            if (i + 1) % 20 == 0:
                print(f'  Processed {i+1}/{len(all_teams)} teams...')
        time.sleep(0.5)
    except Exception as e:
        failed_teams.append(team)

if elo_frames:
    elo_all = pd.concat(elo_frames, ignore_index=True)
    elo_all.to_csv(elo_path, index=False)
    print(f'\n  [OK] Saved {len(elo_all)} Elo records -> {elo_path}')
else:
    print('  [FAIL] No ClubElo data retrieved')

if failed_teams:
    print(f'  [WARN] {len(failed_teams)} teams could not be matched in ClubElo:')
    for t in sorted(failed_teams):
        print(f'      - {t}')
else:
    print('  All teams matched successfully.')

print('\nClubElo aggregation complete.')

Fetching ClubElo ratings for all Big 5 leagues...
  Found 163 unique teams (18 names remapped for ClubElo)


  Processed 20/162 teams...


  Processed 40/162 teams...


  Processed 60/162 teams...


  Processed 80/162 teams...


  Processed 100/162 teams...


  Processed 120/162 teams...


  Processed 140/162 teams...


  Processed 160/162 teams...



  [OK] Saved 699162 Elo records -> data\raw\ClubElo_Master.csv
  [WARN] 1 teams could not be matched in ClubElo:
      - Saint-Etienne

ClubElo aggregation complete.


In [None]:
# =============================================================================
# Cell 8: Verification Summary
# =============================================================================
print('DATA EXTRACTION VERIFICATION')
print('=' * 60)

all_files = glob.glob(os.path.join(OUTPUT_DIR, '*.csv'))
total_rows = 0

for f in sorted(all_files):
    try:
        df = pd.read_csv(f)
        size_kb = os.path.getsize(f) / 1024
        total_rows += len(df)
        status = '[OK]  ' if len(df) > 0 else '[FAIL]'
        print(f'  {status} {os.path.basename(f):45s} {len(df):>7,} rows  ({size_kb:>8,.1f} KB)')
    except Exception as e:
        print(f'  [FAIL] {os.path.basename(f):45s} ERROR: {e}')

print(f'\n{"=" * 60}')
print(f'  Total files: {len(all_files)}')
print(f'  Total rows:  {total_rows:,}')
print(f'  Location:    {os.path.abspath(OUTPUT_DIR)}')
print(f'\nData extraction pipeline complete.')