1/Import & configuration Selenium

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import os, time


**2/Lancer Chrome**

In [2]:
# Command-line switches handed to Chrome, applied in the order listed.
chrome_flags = (
    "--headless=new",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
)

options = Options()
for flag in chrome_flags:
    options.add_argument(flag)

# The value returned by ChromeDriverManager().install() is passed to Service,
# and the resulting service plus the options above configure the Chrome driver.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)


**3/Page principale**

In [3]:
# FBref page for the 2024-2025 Premier League season (competition id 9 in the URL).
league_url = "https://fbref.com/en/comps/9/2024-2025/2024-2025-Premier-League-Stats"
# Point the Selenium driver at the league page.
driver.get(league_url)
# Fixed 3-second pause after navigation; the following cell reads driver.page_source.
time.sleep(3)

**4/Extraire les liens d’équipes**

In [5]:
# Build `teams`: a list of unique (team name, absolute URL) pairs, taken ONLY
# from links inside the main league table of the currently loaded page.
soup = BeautifulSoup(driver.page_source, "lxml")

# The main table is the first <table> whose id attribute contains "results".
main_table = soup.find("table", id=lambda table_id: table_id and "results" in table_id)

teams = []
if main_table is None:
    print("‚ö†Ô∏è Tableau principal non trouv√© sur la page.")
else:
    # Every anchor whose href mentions "/squads/", as (visible text, href).
    squad_links = (
        (anchor.get_text(strip=True), anchor["href"])
        for anchor in main_table.select("a[href*='/squads/']")
    )
    # Keep named links under /en/squads/ only; dict.fromkeys drops repeated
    # pairs while preserving first-seen order.
    teams = list(dict.fromkeys(
        (label, "https://fbref.com" + href)
        for label, href in squad_links
        if label and href.startswith("/en/squads/")
    ))

print(f"{len(teams)} √©quipes trouv√©es")
teams


20 √©quipes trouv√©es


[('Liverpool',
  'https://fbref.com/en/squads/822bd0ba/2024-2025/Liverpool-Stats'),
 ('Arsenal', 'https://fbref.com/en/squads/18bb7c10/2024-2025/Arsenal-Stats'),
 ('Manchester City',
  'https://fbref.com/en/squads/b8fd03ef/2024-2025/Manchester-City-Stats'),
 ('Chelsea', 'https://fbref.com/en/squads/cff3d9bb/2024-2025/Chelsea-Stats'),
 ('Newcastle Utd',
  'https://fbref.com/en/squads/b2b47a98/2024-2025/Newcastle-United-Stats'),
 ('Aston Villa',
  'https://fbref.com/en/squads/8602292d/2024-2025/Aston-Villa-Stats'),
 ("Nott'ham Forest",
  'https://fbref.com/en/squads/e4a775cb/2024-2025/Nottingham-Forest-Stats'),
 ('Brighton',
  'https://fbref.com/en/squads/d07537b9/2024-2025/Brighton-and-Hove-Albion-Stats'),
 ('Bournemouth',
  'https://fbref.com/en/squads/4ba7cbea/2024-2025/Bournemouth-Stats'),
 ('Brentford',
  'https://fbref.com/en/squads/cd051869/2024-2025/Brentford-Stats'),
 ('Fulham', 'https://fbref.com/en/squads/fd962109/2024-2025/Fulham-Stats'),
 ('Crystal Palace',
  'https://fbref.

In [7]:
# --- Output folder ---
# Root directory for the scraped CSV files, relative to the working directory.
# The scraping loop creates one sub-folder per team underneath it.
base_dir = "../Data/Bronze"
# exist_ok=True makes this cell safe to re-run when the folder already exists.
os.makedirs(base_dir, exist_ok=True)

**5/Scraping des joueurs (jusqu’à Performance) et des matchs (jusqu’à Referee)**

In [None]:
# --- Helpers shared by the players and matches exports ---
def _find_table(page, id_fragment):
    """Return the first <table> in ``page`` whose id contains ``id_fragment``, or None."""
    return page.find("table", id=lambda table_id: table_id and id_fragment in table_id)


def _table_to_frame(table, header_row=None, keep_through=None, extra_columns=0):
    """Convert a parsed HTML <table> into a DataFrame of stripped cell texts.

    Parameters
    ----------
    table : bs4 Tag for the <table> element.
    header_row : int or None
        None -> every <th> found in <thead> becomes a column label.
        int  -> only that <tr> of <thead> is used, falling back to the first
                <tr> when <thead> has too few rows.
    keep_through : str or None
        When this label is present, columns are cut after it (plus
        ``extra_columns`` more). When absent, all columns are kept.
    extra_columns : int
        Number of columns kept after ``keep_through``.

    Returns
    -------
    pandas.DataFrame with one row per <tbody> <tr>, excluding rows whose class
    list contains "thead" (repeated in-body header rows).

    Raises
    ------
    ValueError if the table has no <thead>, no <tbody>, or no header row.
    """
    thead = table.find("thead")
    tbody = table.find("tbody")
    if thead is None or tbody is None:
        raise ValueError("table is missing <thead> or <tbody>")

    if header_row is None:
        header_cells = thead.find_all("th")
    else:
        header_rows = thead.find_all("tr")
        if not header_rows:
            raise ValueError("table <thead> has no rows")
        chosen = header_rows[header_row] if len(header_rows) > header_row else header_rows[0]
        header_cells = chosen.find_all("th")
    headers = [th.get_text(strip=True) for th in header_cells]

    if keep_through is not None and keep_through in headers:
        headers = headers[: headers.index(keep_through) + 1 + extra_columns]

    width = len(headers)
    rows = []
    for tr in tbody.find_all("tr"):
        # Skip header rows repeated inside the body (class="thead").
        if "thead" in tr.get("class", []):
            continue
        cells = [cell.get_text(strip=True) for cell in tr.find_all(["th", "td"])]
        # Pad short rows with "" and truncate long ones so each row has `width` cells.
        rows.append((cells + [""] * width)[:width])

    frame = pd.DataFrame(rows, columns=headers)
    # NOTE(review): cells are strings ("" when blank), never NaN, so this does
    # not remove columns that merely contain empty text. It is kept unchanged so
    # the CSV columns stay exactly as before; confirm whether blank-text columns
    # were meant to be dropped.
    return frame.dropna(axis=1, how="all")


def _save_frame(frame, team_dir, filename):
    """Write ``frame`` to ``team_dir/filename`` as UTF-8 (with BOM) CSV and report its size."""
    frame.to_csv(os.path.join(team_dir, filename), index=False, encoding="utf-8-sig")
    print(f" {filename} sauvegard√© ({len(frame)} lignes, {len(frame.columns)} colonnes)")


# --- Main loop over every team ---
# (team name, stage, error text) for every step that raised, so problems remain
# inspectable after the run instead of existing only in the printed log.
failed_exports = []

for team_name, team_url in teams:
    print(f"\n=== Scraping {team_name} ===")
    team_dir = os.path.join(base_dir, team_name)
    os.makedirs(team_dir, exist_ok=True)

    # Load and parse the team page; without it neither table can be exported.
    page = None
    try:
        driver.get(team_url)
        time.sleep(3)
        page = BeautifulSoup(driver.page_source, "lxml")
    except Exception as e:
        print(f" Erreur sur {team_name}: {e}")
        failed_exports.append((team_name, "page", str(e)))

    if page is not None:
        # PLAYERS TABLE - column labels come from the second header row when present.
        # Each export has its own try so one failing table does not skip the other.
        try:
            player_table = _find_table(page, "stats_standard")
            if player_table:
                df_players = _table_to_frame(player_table, header_row=1)
                _save_frame(df_players, team_dir, "players.csv")
            else:
                print(" Table joueurs non trouv√©e")
        except Exception as e:
            print(f" Erreur sur {team_name}: {e}")
            failed_exports.append((team_name, "players.csv", str(e)))

        # MATCHES TABLE - keep every column through "Referee" plus the two that
        # follow it (Match Report & Notes); keep all columns if "Referee" is absent.
        try:
            match_table = _find_table(page, "matchlogs_for")
            if match_table:
                df_matches = _table_to_frame(match_table, keep_through="Referee", extra_columns=2)
                _save_frame(df_matches, team_dir, "matches.csv")
            else:
                print(" Table matchs non trouv√©e")
        except Exception as e:
            print(f" Erreur sur {team_name}: {e}")
            failed_exports.append((team_name, "matches.csv", str(e)))

    # Pause between teams, whether or not the team was scraped successfully.
    time.sleep(2)

if failed_exports:
    print(f"\n{len(failed_exports)} export(s) en erreur: {failed_exports}")


=== Scraping Liverpool ===
‚úÖ players.csv sauvegard√© (29 lignes, 34 colonnes)
‚úÖ matches.csv sauvegard√© (56 lignes, 20 colonnes)

=== Scraping Arsenal ===
‚úÖ players.csv sauvegard√© (38 lignes, 34 colonnes)
‚úÖ matches.csv sauvegard√© (58 lignes, 20 colonnes)

=== Scraping Manchester City ===
‚úÖ players.csv sauvegard√© (36 lignes, 34 colonnes)
‚úÖ matches.csv sauvegard√© (57 lignes, 20 colonnes)

=== Scraping Chelsea ===
‚úÖ players.csv sauvegard√© (35 lignes, 34 colonnes)
‚úÖ matches.csv sauvegard√© (57 lignes, 20 colonnes)

=== Scraping Newcastle Utd ===
‚úÖ players.csv sauvegard√© (30 lignes, 34 colonnes)
‚úÖ matches.csv sauvegard√© (48 lignes, 20 colonnes)

=== Scraping Aston Villa ===
‚úÖ players.csv sauvegard√© (34 lignes, 34 colonnes)
‚úÖ matches.csv sauvegard√© (57 lignes, 20 colonnes)

=== Scraping Nott'ham Forest ===
‚úÖ players.csv sauvegard√© (27 lignes, 34 colonnes)
‚úÖ matches.csv sauvegard√© (44 lignes, 20 colonnes)

=== Scraping Brighton ===
‚úÖ players.csv sauve

In [None]:
# End the Selenium browser session, then print a completion message.
driver.quit()
print("\nüéâ Scraping termin√© !")