**1/Import & configuration de Selenium**

In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import os, time


**2/Lancer Chrome**

In [37]:
# Command-line switches handed to Chrome, in the order they are applied.
chrome_flags = (
    "--headless=new",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
)

options = Options()
for flag in chrome_flags:
    options.add_argument(flag)

# ChromeDriverManager().install() returns the path of the driver binary
# that the Service object wraps.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)


**3/Page principale**

In [42]:
# Open the 2024-2025 Premier League overview page in the Selenium-driven browser.
league_url = "https://fbref.com/en/comps/9/2024-2025/2024-2025-Premier-League-Stats"
driver.get(league_url)
# Fixed 3-second pause after navigation; presumably gives the page time to
# finish rendering before `driver.page_source` is read in the next cell.
time.sleep(3)

**4/Extraire les liens d’équipes**

In [43]:
# Collect team links ONLY from the main league table.
soup = BeautifulSoup(driver.page_source, "lxml")

# The main table is the first <table> whose id contains "results".
main_table = soup.find("table", {"id": lambda x: x and "results" in x})

if not main_table:
    teams = []
    print("⚠️ Tableau principal non trouvé sur la page.")
else:
    # (link text, href) for every anchor in the table pointing at a squad page.
    anchor_pairs = (
        (anchor.get_text(strip=True), anchor["href"])
        for anchor in main_table.select("a[href*='/squads/']")
    )
    # dict.fromkeys drops repeated (name, url) pairs while keeping first-seen order.
    teams = list(dict.fromkeys(
        (label, "https://fbref.com" + href)
        for label, href in anchor_pairs
        if label and href.startswith("/en/squads/")
    ))

print(f"{len(teams)} équipes trouvées")
teams


20 équipes trouvées


[('Liverpool',
  'https://fbref.com/en/squads/822bd0ba/2024-2025/Liverpool-Stats'),
 ('Arsenal', 'https://fbref.com/en/squads/18bb7c10/2024-2025/Arsenal-Stats'),
 ('Manchester City',
  'https://fbref.com/en/squads/b8fd03ef/2024-2025/Manchester-City-Stats'),
 ('Chelsea', 'https://fbref.com/en/squads/cff3d9bb/2024-2025/Chelsea-Stats'),
 ('Newcastle Utd',
  'https://fbref.com/en/squads/b2b47a98/2024-2025/Newcastle-United-Stats'),
 ('Aston Villa',
  'https://fbref.com/en/squads/8602292d/2024-2025/Aston-Villa-Stats'),
 ("Nott'ham Forest",
  'https://fbref.com/en/squads/e4a775cb/2024-2025/Nottingham-Forest-Stats'),
 ('Brighton',
  'https://fbref.com/en/squads/d07537b9/2024-2025/Brighton-and-Hove-Albion-Stats'),
 ('Bournemouth',
  'https://fbref.com/en/squads/4ba7cbea/2024-2025/Bournemouth-Stats'),
 ('Brentford',
  'https://fbref.com/en/squads/cd051869/2024-2025/Brentford-Stats'),
 ('Fulham', 'https://fbref.com/en/squads/fd962109/2024-2025/Fulham-Stats'),
 ('Crystal Palace',
  'https://fbref.

In [28]:
# --- Output directory ---
# Root folder under which one sub-directory per team is created by the
# scraping loop. The path is relative, so it is resolved against the
# kernel's current working directory.
base_dir = "../Data/Bronze"
# exist_ok=True makes this cell safe to re-run when the folder already exists.
os.makedirs(base_dir, exist_ok=True)

**5/Scraping des joueurs (jusqu’à Performance) et des matchs (jusqu’à Referee)**

In [44]:
# --- Main loop over every team ---

def _table_to_frame(table, last_column, header_row=-1):
    """Convert an HTML stats table (a BeautifulSoup <table> tag) into a DataFrame.

    Parameters
    ----------
    table : BeautifulSoup tag for the <table> element.
    last_column : header label of the last column to keep (inclusive).
        When the label is absent from the header row, every column is kept.
    header_row : index of the <thead> row holding the column names.
        The index is clamped to the last available row, so ``1`` means
        "second row if there is one, otherwise the first", and ``-1``
        means "last row".

    Returns
    -------
    pandas.DataFrame with one row per <tbody> row, all values as text.

    Raises
    ------
    AttributeError / IndexError when the table has no <thead>, no <tbody>
    or no header row; ValueError (from pandas) when the rows cannot be
    aligned with the headers.
    """
    header_rows = table.find("thead").find_all("tr")
    header_tr = header_rows[min(header_row, len(header_rows) - 1)]
    headers = [th.get_text(strip=True) for th in header_tr.find_all("th")]

    # Number of leading columns to keep: up to and including `last_column`.
    keep = headers.index(last_column) + 1 if last_column in headers else len(headers)
    headers = headers[:keep]

    rows = [
        [cell.get_text(strip=True) for cell in tr.find_all(["th", "td"])][:keep]
        for tr in table.find("tbody").find_all("tr")
        # Skip header rows repeated in the middle of the body (class "thead").
        if "thead" not in (tr.get("class") or [])
    ]
    return pd.DataFrame(rows, columns=headers)


def _export_table(page, id_fragment, last_column, csv_path, header_row=-1):
    """Find the first table whose id contains `id_fragment`, save it as CSV.

    Returns the exported DataFrame, or None when no matching table exists
    on the page (nothing is written in that case). Parsing or writing
    errors propagate to the caller.
    """
    table = page.find("table", id=lambda x: x and id_fragment in x)
    if not table:
        return None

    frame = _table_to_frame(table, last_column, header_row)
    frame.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f"✅ {os.path.basename(csv_path)} sauvegardé ({len(frame)} lignes, {len(frame.columns)} colonnes)")
    return frame


for team_name, team_url in teams:
    print(f"\n=== Scraping {team_name} ===")
    team_dir = os.path.join(base_dir, team_name)
    os.makedirs(team_dir, exist_ok=True)

    # Load the team page; a failure here means there is nothing to parse.
    try:
        driver.get(team_url)
        time.sleep(3)
        page = BeautifulSoup(driver.page_source, "lxml")
    except Exception as e:
        print(f"❌ Erreur sur {team_name}: {e}")
        page = None

    if page is not None:
        # ======================
        #  PLAYER TABLE (columns up to "CrdR")
        # ======================
        # Each table is guarded separately so that a failure on one of them
        # does not prevent the other from being exported.
        try:
            df_players = _export_table(
                page, "stats_standard", "CrdR",
                os.path.join(team_dir, "players.csv"),
                header_row=1,  # second <thead> row holds the real column names
            )
            if df_players is None:
                print("⚠️ Table joueurs non trouvée")
        except Exception as e:
            print(f"❌ Erreur sur {team_name}: {e}")

        # ======================
        #  MATCH TABLE (columns up to "Referee")
        # ======================
        try:
            df_matches = _export_table(
                page, "matchlogs_for", "Referee",
                os.path.join(team_dir, "matches.csv"),
                header_row=-1,  # last <thead> row
            )
            if df_matches is None:
                print("⚠️ Table matchs non trouvée")
        except Exception as e:
            print(f"❌ Erreur sur {team_name}: {e}")

    # Pause between teams before requesting the next page.
    time.sleep(2)



=== Scraping Liverpool ===
✅ players.csv sauvegardé (29 lignes, 16 colonnes)
✅ matches.csv sauvegardé (56 lignes, 18 colonnes)

=== Scraping Arsenal ===
✅ players.csv sauvegardé (38 lignes, 16 colonnes)
✅ matches.csv sauvegardé (58 lignes, 18 colonnes)

=== Scraping Manchester City ===
✅ players.csv sauvegardé (36 lignes, 16 colonnes)
✅ matches.csv sauvegardé (57 lignes, 18 colonnes)

=== Scraping Chelsea ===
✅ players.csv sauvegardé (35 lignes, 16 colonnes)
✅ matches.csv sauvegardé (57 lignes, 18 colonnes)

=== Scraping Newcastle Utd ===
✅ players.csv sauvegardé (30 lignes, 16 colonnes)
✅ matches.csv sauvegardé (48 lignes, 18 colonnes)

=== Scraping Aston Villa ===
✅ players.csv sauvegardé (34 lignes, 16 colonnes)
✅ matches.csv sauvegardé (57 lignes, 18 colonnes)

=== Scraping Nott'ham Forest ===
✅ players.csv sauvegardé (27 lignes, 16 colonnes)
✅ matches.csv sauvegardé (44 lignes, 18 colonnes)

=== Scraping Brighton ===
✅ players.csv sauvegardé (42 lignes, 16 colonnes)
✅ matches.csv

In [15]:
# Shut down the Selenium-driven browser session opened earlier in the notebook.
driver.quit()
# Final confirmation message for the notebook run.
print("\n Scraping terminé !")


 Scraping terminé !
