In [4]:
%pip install requests_html fake-useragent pandas openpyxl

Note: you may need to restart the kernel to use updated packages.


In [5]:
from requests_html import HTMLSession
from fake_useragent import UserAgent
import re
import json
import pandas as pd
import os

# Sanity check for the import cell: confirm it ran and report the pandas version.
print("Libraries imported correctly")
# The f prefix is required here; without it the braces are printed literally
# instead of being replaced by the version string.
print(f"Pandas version: {pd.__version__}")

Libraries imported correctly
Pandas version: {pd.__version__}


In [6]:
%pip install lxml_html_clean

Note: you may need to restart the kernel to use updated packages.


In [None]:
# requests_html: provides HTMLSession for HTTP requests with optional JavaScript rendering via Pyppeteer
# fake_useragent: generates realistic browser User-Agent strings from a live database to reduce bot-detection risk
# re: Python's built-in regex module used here for pattern matching within raw HTML text
# json: built-in module for deserializing JSON strings into Python dicts/lists
# pandas: data manipulation library; used to structure extracted stats into tabular form
# os: standard file system interface (available for path operations if needed)
from requests_html import HTMLSession
from fake_useragent import UserAgent
import re
import json
import pandas as pd
import os

# Sanity check: report that the import cell ran and which pandas version is loaded.
status_lines = [
    "Libraries imported correctly",
    f"Pandas version: {pd.__version__}",
]
print("\n".join(status_lines))

In [None]:
# Build the objects shared by the request cell: a User-Agent generator, an HTTP
# session, the target URL and the request headers.

# ua.random returns a randomly selected browser User-Agent string.
# NOTE: it is evaluated once, when `headers` is built below, so every request that
# reuses `headers` sends the same User-Agent. To rotate per request, assign a fresh
# ua.random to headers["User-Agent"] before each call.
ua = UserAgent()

# HTMLSession is used here as a plain HTTP session: .render() is never called in this
# notebook, because the data of interest is read straight from the returned HTML.
session = HTMLSession()

url = "https://www.fotmob.com/matches/galatasaray-vs-juventus/2u4xwc#5161868"

# Browser-like Accept / Accept-Language headers sent along with the User-Agent.
# Accept-Language asks for Spanish content.
headers = {
    "User-Agent": ua.random,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "es-ES,es;q=0.9",
}

print(f"URL objetivo: {url}")
# Print only the User-Agent value so the output matches its label (the whole
# headers dict used to be printed here).
print(f"User agent: {headers['User-Agent']}")

In [None]:
# Download the match page and keep its HTML in `data` for the parsing cells below.
#
# - session.get() sends a GET request with the headers built above; timeout=15 is
#   forwarded to the underlying request so a stalled connection does not hang the cell.
# - On HTTP 200 the body is stored in `data` and written to `filename` as UTF-8.
# - On any other status, or if the request raises, the most recent saved copy in
#   `filename` is used instead (it may belong to an earlier run, so a notice is printed).
# - If no HTML is available at all the cell raises, so later cells never run against
#   an undefined or stale `data` variable.
filename = "fotmob_html.txt"
data = None

try:
    response = session.get(url, headers=headers, timeout=15)
    if response.status_code == 200:
        data = response.text
        # Explicit UTF-8 so non-ASCII characters in the page are written consistently
        # regardless of the platform default encoding.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(data)
        print(f"Saved on {filename}")
    else:
        # Previously a non-200 response was ignored silently.
        print(f"Error: unexpected HTTP status {response.status_code}")
except Exception as e:
    print(f"Error: {e}")

if data is None and os.path.exists(filename):
    # Fall back to the copy saved by a previous successful run.
    with open(filename, "r", encoding="utf-8") as f:
        data = f.read()
    print(f"Loaded cached HTML from {filename}")

if data is None:
    raise RuntimeError(f"No HTML available: request failed and {filename} does not exist")

# Report the size only; printing the whole page floods the notebook output.
print(f"HTML: {len(data)} characters")

In [None]:
# Extract the JSON payload embedded in the page's <script id="__NEXT_DATA__"> tag and
# parse it into `json_data`.
#
# The pattern accepts any attributes before or after id="__NEXT_DATA__" (for example a
# different attribute order or an extra attribute), so it still matches the exact tag
# form the original, stricter pattern expected.
# re.DOTALL lets '.' match newlines so a payload spanning several lines is captured;
# re.IGNORECASE tolerates casing differences in the tag text.
pattern = r'<script\b[^>]*\sid="__NEXT_DATA__"[^>]*>(.*?)</script>'

match = re.search(pattern, data, re.DOTALL | re.IGNORECASE)

# Fail loudly: continuing without `json_data` would only surface later as a NameError
# (or silently reuse a value left over from an earlier kernel run).
if match is None:
    raise ValueError("Error: Block not found")

# group(1) is the text between the opening and closing script tags.
json_str = match.group(1).strip()
print("__NEXT_DATA__ found")

try:
    json_data = json.loads(json_str)
except json.JSONDecodeError as e:
    raise ValueError(f"Error: {e}") from e

# Show only the top-level keys; printing the whole payload floods the output.
print(f'{list(json_data.keys())}')

In [None]:
# Walk the parsed payload down to the match statistics, one level at a time.
# `base` (the page content object) is kept as its own variable because later cells
# read other keys from it.
page_props = json_data["props"]["pageProps"]
base = page_props["content"]

# Full path: props > pageProps > content > stats > Periods > All > stats.
# NOTE(review): "Periods" presumably also holds per-half entries next to "All" —
# confirm against the payload before relying on them.
periods = base["stats"]["Periods"]
stats_all = periods["All"]["stats"]

print(stats_all)

In [None]:
# Flatten `stats_all` into one dict per stat, collected in `rows`.
# Each section has a "title" and a "stats" list; each item in that list has its own
# "title" and a "stats" value that is expected to be [home_value, away_value].
# Items whose value is exactly [None, None] carry no data and are skipped.
# Each emitted row has the keys:
#   'section' - title of the enclosing section
#   'stat'    - title of the item
#   'home'    - first value, or None when unavailable
#   'away'    - second value, or None when unavailable
rows = []
for section in stats_all:
    section_name = section.get("title")
    for item in section.get("stats", []):
        values = item.get("stats")
        # Skip header-style entries that hold no values.
        if values == [None, None]:
            continue
        is_list = isinstance(values, list)
        rows.append({
            "section": section_name,
            "stat": item.get("title"),
            "home": values[0] if is_list and len(values) > 0 else None,
            # Needs at least two elements; the previous `> 0` check raised
            # IndexError on a one-element list.
            "away": values[1] if is_list and len(values) > 1 else None,
        })

# Print once after both loops finish; this used to sit inside the outer loop and
# re-printed the growing list for every section.
print(rows)

In [None]:
# Turn the list of row dicts into a DataFrame with one row per stat:
# section | stat | home | away.
# The columns are named explicitly so the frame has the same schema even when `rows`
# is empty (otherwise an empty list yields a DataFrame with no columns at all).
# NOTE(review): 'home' and 'away' can hold mixed types (numbers and formatted
# strings), so they presumably need parsing before numerical analysis — confirm
# against the printed rows.
# pandas is already imported in the import cell at the top; this repeat is harmless.
import pandas as pd

stats_df = pd.DataFrame(rows, columns=["section", "stat", "home", "away"])
stats_df.head(30)

In [None]:
# Build a one-row-per-shot DataFrame from the shot list stored under
# content > shotmap > shots.
# pd.json_normalize() flattens nested dicts inside each record into top-level columns.
# With the default separator, nested keys are joined with a dot, so a nested
# {"player": {"name": ...}} would become a column called "player.name"
# (pass sep="_" to get "player_name" instead).
# NOTE(review): the exact fields of each shot record are not visible here — inspect
# shots_df.columns to confirm what is available.
shotmap = base["shotmap"]
shots = shotmap["shots"]

shots_df = pd.json_normalize(data=shots)
shots_df.head(n=50)

## Summary: FotMob Match Statistics Scraper

### What This Notebook Does

This notebook extracts structured match statistics from FotMob using direct HTML scraping. FotMob is built on Next.js, which embeds its full server-side rendered state as a JSON blob inside a `<script id="__NEXT_DATA__">` tag. This tag is present in the initial HTML response — no JavaScript execution or API reverse-engineering is required to access the data.

The pipeline is: HTTP GET with spoofed headers → regex extraction of `__NEXT_DATA__` → JSON deserialization → tree traversal → tabular flattening.

### Key Techniques

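- **User-Agent rotation** (`fake_useragent`): Generates realistic browser strings to reduce automated-request detection. In this notebook a single random string is chosen when the headers are built; rotating per request means drawing a new `ua.random` value before each call. This is a surface-level measure; robust scraping at scale requires IP rotation, request throttling, and session management.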
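- **`__NEXT_DATA__` extraction** (`re.search` with `DOTALL`): A scraping pattern that carries over to other Next.js sites that embed their page state in a `__NEXT_DATA__` script tag (typically Pages Router builds; sites using the newer App Router deliver their data differently and may not include this tag). The captured JSON contains the data model used to render the page.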
- **Nested JSON traversal**: The match stats are located at `props > pageProps > content > stats > Periods > All > stats`. The `Periods` object also exposes `FirstHalf` and `SecondHalf` keys for within-match breakdowns.
- **`pd.json_normalize()`**: Flattens deeply nested shot event dicts into a flat DataFrame — critical when individual events contain nested player, team, and coordinate sub-objects.

### Data Available

| DataFrame | Content |
|-----------|---------|
| `stats_df` | Match-level stats across 8 categories (including xG, shots, passes, physical metrics, duels, and discipline) |
| `shots_df` | Individual shot events with coordinates, xG, outcome, and player metadata |

### Ideas to Extract More Value

- **Multi-match aggregation**: Loop over multiple FotMob match URLs to build a season-level dataset. Persist to a database (SQLite, PostgreSQL) rather than in-memory DataFrames to handle scale.
- **Half-level breakdown**: `Periods.FirstHalf` and `Periods.SecondHalf` expose per-half stats. Comparing xG and physical metrics between halves can reveal fatigue-driven performance drops.
- **xG vs actual goals**: Accumulate xG across a season and compare to actual goals scored to quantify finishing efficiency (overperformance) or wasteful attacking play (underperformance).
- **Physical metric tracking over a fixture schedule**: Sprint counts and total distances from `physical_metrics` can indicate squad fatigue during congested periods, useful for load management analysis.
- **Shot location clustering**: Combine `shots_df` coordinates with xG to cluster shot origins and identify the zones each team most frequently attacks from and their quality distribution.
- **Resilience at scale**: User-Agent spoofing is fragile under high request volume. For production pipelines, prefer an official data provider API or implement rate limiting, proxy rotation, and exponential backoff.