In [14]:
import os
import re
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from curl_cffi import requests
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter

# Fixture page to scrape.
url = "https://fulltime.thefa.com/displayFixture.html?id=29432341"

session = requests.Session()

# `impersonate` asks curl_cffi to mimic a Chrome 120 client for this request.
response = session.get(url, impersonate="chrome120")

# Fail fast on 4xx/5xx responses instead of silently parsing an error page.
response.raise_for_status()

html = response.text

print("Status:", response.status_code)
print("HTML length:", len(html))

# Parsed document used by every cell below.
soup = BeautifulSoup(html, "html.parser")

Status: 200
HTML length: 157467


## Identify the divs that actually contain the info we want
Goal: obtain the team names, the score, and the match title/competition.

In [None]:
# Select the block that the following cells query for the team names and the
# score. The bare name on the last line displays the element for inspection.
container = soup.select_one("div.fixture-teams-and-score.played")
container

In [19]:
# Home team name: the <h2> under .team-name inside the .home-team block.
home_team = container.select_one(".home-team .team-name h2")
home_team.get_text(strip=True)

'Billericay Town'

In [20]:
# Away team name: same structure as the home side, but the page uses the
# class "road-team" for it.
away_team = container.select_one(".road-team .team-name h2")
away_team.get_text(strip=True)

"Bishop's Stortford"

In [21]:
# Score box text. The " " separator keeps the individual text fragments
# apart instead of gluing them together.
score_box = container.select_one(".score.played")
score_box.get_text(" ", strip=True)

'FT 1 2'

In [22]:
# Extract every run of digits from the score box text. The values are still
# strings here; presumably home score first, then away — confirm on the page.
text = score_box.get_text(" ", strip=True)
numbers = re.findall(r"\d+", text)
numbers

['1', '2']

In [23]:
# Empty dict created here; nothing is stored in it within this cell.
match = {}

# Game title: text of the first <h1> found in the document.
title = soup.find("h1").get_text(strip=True)
print (title)

National League U19 Alliance


In [24]:
def first_text(soup, selectors):
    """Return the stripped text of the first selector that yields a match.

    Selectors are tried in order through ``soup.select_one``; as soon as one
    returns a truthy element its text is returned. ``None`` is returned when
    no selector matches (or ``selectors`` is empty).
    """
    # Generators keep the lookup lazy: nothing past the first hit is queried.
    candidates = (soup.select_one(css) for css in selectors)
    return next(
        (node.get_text(strip=True) for node in candidates if node),
        None,
    )

In [5]:
# Sanity check: count the elements matching the lineup/statistics section
# class and the elements matching the player class.
print(len(soup.select(".fixture-lineup-statistics")))
print(len(soup.select(".player")))

1
31


In [None]:
# Print the text of every <p> inside the lineup/statistics grid to see what
# it contains.
for p in soup.select(".lineup-statistics-grid p"):
    print(p.get_text(strip=True))

In [2]:

grid = soup.select_one(".lineup-statistics-grid")

# (side label, CSS selector) pairs; the page uses "road-team" for the away side.
team_selectors = (
    ("home", ".home-team"),
    ("away", ".road-team"),
)

for team_side, selector in team_selectors:
    team_block = grid.select_one(selector)
    # Only report the sides whose block is present in the grid.
    if team_block:
        print(team_side)

home
away


Now that we have confirmed which tags we need, it makes sense to move the extraction into helper functions. The cells below use them.

In [None]:
# scrape_match comes from a separate module that is not shown in this notebook.
from scrape_match import scrape_match
data = scrape_match(url)

# Entries of `data` used by the cells below (uncomment one to inspect it):
#data["match"]
#data["lineups"]
#data["events"]
#data["additional_stats"]

Create a DataFrame for each part of the result returned by the match-scraping function.


In [5]:
# data["match"] is wrapped in a list before being handed to DataFrame; the
# other three entries are passed through unchanged.
df_match = pd.DataFrame([data["match"]])
df_lineups = pd.DataFrame(data["lineups"])
df_events = pd.DataFrame(data["events"])
df_additional_stats = pd.DataFrame(data["additional_stats"])

In [6]:
#Write outputs to CSV
def write_csv(df, path):
    """Append ``df`` to the CSV file at ``path``, creating it when needed.

    The header row is written only when the file is missing or empty, so
    repeated calls accumulate rows under a single header.

    Args:
        df: DataFrame to write (the index is not written).
        path: Destination file, as ``str`` or ``Path``. Missing parent
            directories are created.

    Note:
        Rows are appended as-is. Calling this twice with the same data
        duplicates the rows, and the columns are not checked against the
        header already present in the file.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Evaluated once so `mode` and `header` cannot disagree; a zero-byte file
    # needs a header just like a missing one.
    has_content = path.exists() and path.stat().st_size > 0

    df.to_csv(
        path,
        mode="a" if has_content else "w",
        header=not has_content,
        index=False,
    )

In [7]:
RAW_DIR = "../output"
os.makedirs(RAW_DIR, exist_ok=True)

# One CSV per scraped table, written in this order.
csv_outputs = (
    (df_match, "matches.csv"),
    (df_lineups, "lineups.csv"),
    (df_events, "events.csv"),
    (df_additional_stats, "player_stats.csv"),
)

for frame, filename in csv_outputs:
    write_csv(frame, f"{RAW_DIR}/{filename}")

Map the scraped records to the Excel column layout and append them to the final output workbook.

In [None]:
def map_lineups_for_excel(lineups, match):
    """Turn lineup records into row dicts for the Excel output.

    Args:
        lineups: Iterable of mappings with the keys ``player_name``,
            ``team``, ``name`` and ``last_name``.
        match: Mapping read with ``.get("competition")``; the value is
            copied into every row as the tournament id.

    Returns:
        A list with one dict per lineup entry. ``player id`` and ``team id``
        are always ``None``.
    """
    tournament = match.get("competition")

    def to_row(entry):
        full_name = entry["player_name"]
        team = entry["team"]
        return {
            "player id": None,
            "player & team id": f"{full_name}_{team}",
            "first name": entry["name"],
            "last name": entry["last_name"],
            "full name": full_name,
            "team id": None,
            "tournament id": tournament,
            "Team": team,
        }

    return [to_row(entry) for entry in lineups]

def map_match_for_excel(match):
    """Build the single "Game" row for the Excel output.

    Args:
        match: Mapping with the keys ``match_id``, ``date``, ``home_team``,
            ``away_team``, ``home_goals`` and ``away_goals``.

    Returns:
        A one-element list holding the row dict. ``winner`` is the winning
        team's name, ``"Draw"`` on equal scores, or ``None`` when either
        score is missing.
    """
    home_goals = match["home_goals"]
    away_goals = match["away_goals"]

    winner = None
    # Both scores are required: comparing a number with None raises TypeError.
    # NOTE(review): assumes the goals are numbers. If the scraper returns
    # strings, the comparison is lexicographic ("10" < "9") — confirm.
    if home_goals is not None and away_goals is not None:
        if home_goals > away_goals:
            winner = match["home_team"]
        elif home_goals < away_goals:
            winner = match["away_team"]
        else:
            winner = "Draw"

    return [{
        "game id": match["match_id"],
        "date": match["date"],
        "home team id": match["home_team"],
        "away team id": match["away_team"],
        "winner": winner,
        "ft home score": home_goals,
        "ft away score": away_goals,
    }]

def map_events_for_excel(events):
    """Turn event records into row dicts for the Excel output.

    Each event must provide ``match_id``, ``minute``, ``player_name`` and
    ``event_type``; they are renamed to the Excel column names.
    """
    return [
        {
            "game id": event["match_id"],
            "minute": event["minute"],
            "player": event["player_name"],
            "event": event["event_type"],
        }
        for event in events
    ]

def map_stats_for_excel(stats):
    """Turn player-stat records into row dicts for the Excel output.

    Each record must provide ``match_id``, ``player_name``, ``stat`` and
    ``value``; the first two are renamed to the Excel column names.
    """
    rows = []
    for record in stats:
        rows.append({
            "game id": record["match_id"],
            "player": record["player_name"],
            "stat": record["stat"],
            "value": record["value"],
        })
    return rows

def append_excel(sheet_name, new_rows, path):
    """Append row dicts to a sheet of an existing Excel workbook.

    The workbook at ``path`` is loaded, modified and saved back in place.
    Column order follows the sheet's first row. A sheet that does not exist
    yet is created, and a sheet whose first row is blank gets its header
    from the keys of the first row dict.

    Args:
        sheet_name: Name of the target worksheet.
        new_rows: List of dicts, one per row. Nothing is written (and the
            workbook is not opened) when the list is empty.
        path: Path of the ``.xlsx`` file; it must already exist.

    Raises:
        Whatever ``load_workbook`` raises when ``path`` cannot be opened.
    """
    if not new_rows:
        print(f"[SKIP] No rows to write for sheet: {sheet_name}")
        return

    wb = load_workbook(path)

    if sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        headers = [cell.value for cell in ws[1]]
    else:
        ws = wb.create_sheet(sheet_name)
        headers = []

    if all(h is None for h in headers):
        # New or blank sheet: without this, a blank first row gives
        # headers == [None] and every row would be written as an empty cell.
        # Writing through ws.cell pins the header to row 1.
        headers = list(new_rows[0].keys())
        for col, header in enumerate(headers, start=1):
            ws.cell(row=1, column=col, value=header)

    # Report row keys that have no column in the sheet instead of dropping
    # them silently.
    known = set(headers)
    unmatched = sorted({str(key) for row in new_rows for key in row if key not in known})
    if unmatched:
        print(f"[WARN] Keys without a column in sheet {sheet_name}: {unmatched}")

    for row in new_rows:
        ws.append([row.get(h) for h in headers])

    wb.save(path)
    print(f"[OK] Written {len(new_rows)} rows → {sheet_name}")

In [None]:
# Workbook that receives all four sheets.
EXCEL_PATH = "TrialScrapingDavid.xlsx"

append_excel(
    sheet_name="Players",
    new_rows=map_lineups_for_excel(data["lineups"], data["match"]),
    path=EXCEL_PATH,
)

append_excel(
    sheet_name="Game",
    new_rows=map_match_for_excel(data["match"]),
    path=EXCEL_PATH,
)

append_excel(
    sheet_name="Game Events",
    new_rows=map_events_for_excel(data["events"]),
    path=EXCEL_PATH,
)

append_excel(
    sheet_name="Player Stats",
    new_rows=map_stats_for_excel(data["additional_stats"]),
    path=EXCEL_PATH,
)