##  Installation des dépendances

In [None]:
%pip install --quiet selenium webdriver-manager pandas


Note: you may need to restart the kernel to use updated packages.


## Imports, constantes (XPaths) et configuration

In [6]:
import time
import re
import sqlite3
from datetime import datetime
from typing import List, Dict, Optional

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    ElementClickInterceptedException,
    WebDriverException,
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# ====== XPATHS (d'après votre projet) ======
XPATH_COOKIES_ACCEPT = "/html/body/div/div/div/div/div/div/div[3]/button[2]"
XPATH_MENU_BUTTON = "/html/body/app-root/ng-sidebar-container/div/div/app-navbar/div[1]/nav/app-navbar-menu-button/div"
XPATH_BONS_PLANS_BUTTON = "/html/body/app-root/ng-sidebar-container/ng-sidebar/aside/app-sidenav/div/div[1]/div[2]/div/div/div/div/div[2]/app-sidenav-sections/ul[1]/li[2]/a"
XPATH_ALL_PRODUCT_LIST = "/html/body/app-root/ng-sidebar-container/div/div/div[2]/app-template-details/div[2]/div[4]/div/div[2]/app-template-result-list/ul"
XPATH_ALL_PRODUCT_CARDS = XPATH_ALL_PRODUCT_LIST + "/li"
XPATH_PRODUCT_NAME_IN_CARD = ".//app-product-card-label/div/a"
XPATH_SOLD_BY_IN_CARD = ".//app-product-card-seller/p/span"
XPATH_SOLD_BY_BLOCK_IN_CARD = ".//app-product-card-seller"
XPATH_PROMO_BLOCK_IN_CARD = ".//app-product-promo/div/div"
XPATH_PRICE_INTEGER_PART = ".//app-product-price//div[@id='price']//div[contains(@class,'price-unit')]"
XPATH_PRICE_CENTS_PART   = ".//app-product-price//div[@id='price']//span[contains(@class,'price-cents')]"
XPATH_IMAGE_IN_CARD = ".//app-lazy-image/img"
XPATH_PAGE_LINK_IN_CARD = ".//a[@href][1]"
XPATH_NEXT_LI = "//li[contains(@class,'pagination-next')]"
XPATH_NEXT_BUTTON = XPATH_NEXT_LI + "/a"
XPATH_PRODUCT_DESCRIPTION = "/html/body/main/div/div/div[3]/section[1]/div"

# ====== Options Chrome ======
def build_driver(headless=False):
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--start-maximized")
    opts.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/128.0.0.0 Safari/537.36")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=opts)
    wait = WebDriverWait(driver, 10)
    return driver, wait


## Ouverture du navigateur et acceptation des cookies

In [7]:
driver, wait = build_driver(headless=False)
print("[STEP] ouverture https://www.e.leclerc/")
driver.get("https://www.e.leclerc/")

# Cookies
try:
    btn = wait.until(EC.element_to_be_clickable((By.XPATH, XPATH_COOKIES_ACCEPT)))
    btn.click()
    time.sleep(0.5)
    print("[STEP] cookies acceptés")
except Exception:
    print("[STEP] pas de bannière cookies à accepter")


[STEP] ouverture https://www.e.leclerc/
[STEP] cookies acceptés


## Aller sur la page « Bons plans »

In [None]:
wait.until(EC.presence_of_element_located((By.XPATH, XPATH_MENU_BUTTON)))
driver.find_element(By.XPATH, XPATH_MENU_BUTTON).click()
wait.until(EC.element_to_be_clickable((By.XPATH, XPATH_BONS_PLANS_BUTTON))).click()

# Attendre le listing
wait.until(EC.presence_of_element_located((By.XPATH, XPATH_ALL_PRODUCT_LIST)))
wait.until(EC.presence_of_all_elements_located((By.XPATH, XPATH_ALL_PRODUCT_CARDS)))
print("[STEP] page Bons plans OK")


[STEP] page Bons plans OK


## Récupérer tous les articles de la page 1 et afficher le nombre

In [None]:
cards = driver.find_elements(By.XPATH, XPATH_ALL_PRODUCT_CARDS)
print(f"[INFO] Nombre d'articles sur la page 1: {len(cards)}")

[INFO] Nombre d'articles sur la page 1: 34


## Fonctions utilitaires d’extraction (carte produit)

In [None]:
def clean_sold_by(txt: str) -> Optional[str]:
    if not txt: return None
    line = txt.strip().splitlines()[0].strip()
    line = re.sub(r"^\s*vendu(\s+et\s+expédié)?\s+par\s*:?\s*", "", line, flags=re.IGNORECASE)
    line = line.strip(" :-\u00A0").strip()
    return line or None

def extract_sold_by(card) -> Optional[str]:
    try:
        txt = card.find_element(By.XPATH, XPATH_SOLD_BY_IN_CARD).text
        cleaned = clean_sold_by(txt)
        if cleaned: return cleaned
    except NoSuchElementException:
        pass
    try:
        txt2 = card.find_element(By.XPATH, XPATH_SOLD_BY_BLOCK_IN_CARD).text
        return clean_sold_by(txt2)
    except NoSuchElementException:
        return None

def extract_promo(card) -> Optional[str]:
    try:
        raw = card.find_element(By.XPATH, XPATH_PROMO_BLOCK_IN_CARD).text.strip()
        cleaned = " ".join(raw.split())
        return cleaned or None
    except NoSuchElementException:
        return None

def extract_price(card) -> Optional[float]:
    try:
        euros_txt = card.find_element(By.XPATH, XPATH_PRICE_INTEGER_PART).text.strip()
    except NoSuchElementException:
        euros_txt = ""
    try:
        cents_txt = card.find_element(By.XPATH, XPATH_PRICE_CENTS_PART).text.strip()
    except NoSuchElementException:
        cents_txt = ""
    euros_txt = euros_txt.replace("€","").replace(" ","")
    cents_txt = cents_txt.replace("€","").replace(" ","")
    if cents_txt.startswith(","): cents_txt = cents_txt[1:]
    if cents_txt == "": cents_txt = "00"
    if euros_txt == "": return None
    try:
        return float(f"{euros_txt}.{cents_txt}")
    except ValueError:
        return None

def extract_card_data(card) -> Dict:
    try:
        product_name = card.find_element(By.XPATH, XPATH_PRODUCT_NAME_IN_CARD).text.strip()
    except NoSuchElementException:
        product_name = None

    sold_by = extract_sold_by(card)
    discount_text = extract_promo(card)
    price_eur = extract_price(card)

    # image
    try:
        img = card.find_element(By.XPATH, XPATH_IMAGE_IN_CARD)
        image_url = img.get_attribute("data-src") or img.get_attribute("src")
    except NoSuchElementException:
        image_url = None

    # page_url (premier lien)
    page_url = None
    try:
        a = card.find_element(By.XPATH, XPATH_PAGE_LINK_IN_CARD)
        href = a.get_attribute("href")
        if href and not href.strip().lower().startswith("javascript"):
            page_url = href
    except NoSuchElementException:
        page_url = None

    return {
        "sold_by": sold_by,
        "product_name": product_name,
        "discount_text": discount_text,
        "price_eur": price_eur,
        "page_url": page_url,
        "image_url": image_url,
    }


## Extraire les données de la page 1  et afficher les articles sur cette page 

In [None]:
page1_rows = [extract_card_data(c) for c in cards]
df_page1 = pd.DataFrame(page1_rows)
display(df_page1.head())
print(f"[INFO] {len(df_page1)} lignes extraites (page 1, sans détails)")


Unnamed: 0,sold_by,product_name,discount_text,price_eur,page_url,image_url
0,Emma Matelas,EMMA | Matelas Original II Plus | Ressorts + M...,30 %,399.0,https://www.e.leclerc/fp/emma-matelas-original...,https://media.e.leclerc/4255762726835_1?op_sha...
1,E.Leclerc,LEGO® CITY 60475,,26.9,https://www.e.leclerc/fp/lego-city-60475-57020...,https://media.e.leclerc/5702017812687_1?op_sha...
2,E.Leclerc,BARBIE JOYEUX NOEL BLONDE,5 €,37.9,https://www.e.leclerc/fp/barbie-joyeux-noel-bl...,https://media.e.leclerc/0194735260966_1?op_sha...
3,E.Leclerc,LICORNE SONS ET LUMIERES,3 €,22.9,https://www.e.leclerc/fp/licorne-sons-et-lumie...,https://media.e.leclerc/0194735274727_1?op_sha...
4,E.Leclerc,Table d'appoint - enceinte et chargeur à induc...,,38.0,https://www.e.leclerc/fp/table-d-appoint-encei...,https://media.e.leclerc/3603313451902_1?op_sha...


[INFO] 34 lignes extraites (page 1, sans détails)


## Fonctions pour récupérer la description & caractéristiques depuis la fiche produit

In [None]:
def split_description_features(raw: str) -> (Optional[str], Optional[str]):
    if not raw: return None, None
    txt = re.sub(r"\r\n|\r", "\n", raw).strip()
    m = re.search(r"Caractéristiques\s*:?", txt, flags=re.IGNORECASE)
    if not m:
        return txt, None
    before = txt[:m.start()].strip()
    after = txt[m.end():].strip()
    lines = [ln.strip(" -•\u2022\t").strip() for ln in after.split("\n") if ln.strip()]
    feats = " | ".join(lines) if lines else None
    desc = before if before else None
    return desc, feats

def fetch_details(page_url: Optional[str], driver=None, wait=None) -> Dict[str, Optional[str]]:
    if not page_url: return {"description": None, "features": None}
    main = driver.current_window_handle
    try:
        driver.switch_to.new_window('tab')
        driver.get(page_url)
        try:
            wait.until(EC.presence_of_element_located((By.XPATH, XPATH_PRODUCT_DESCRIPTION)))
            block = driver.find_element(By.XPATH, XPATH_PRODUCT_DESCRIPTION)
            raw = block.text.strip()
        except TimeoutException:
            raw = ""
        desc, feats = split_description_features(raw)
        return {"description": desc, "features": feats}
    except WebDriverException:
        return {"description": None, "features": None}
    finally:
        try:
            driver.close()
            driver.switch_to.window(main)
        except Exception:
            pass


## Enrichir la page 1 avec description & caractéristiques

In [None]:
page1_full = []
for row in page1_rows:
    details = fetch_details(row.get("page_url"), driver=driver, wait=wait)
    r = dict(row)
    r.update(details)
    r["scraped_at"] = datetime.utcnow().isoformat()
    page1_full.append(r)

df_page1_full = pd.DataFrame(page1_full)
display(df_page1_full.head(3))
print(f"[INFO] Page 1 enrichie: {len(df_page1_full)} lignes")


  r["scraped_at"] = datetime.utcnow().isoformat()


Unnamed: 0,sold_by,product_name,discount_text,price_eur,page_url,image_url,description,features,scraped_at
0,Emma Matelas,EMMA | Matelas Original II Plus | Ressorts + M...,30 %,399.0,https://www.e.leclerc/fp/emma-matelas-original...,https://media.e.leclerc/4255762726835_1?op_sha...,Profitez de la version Plus dotée de la même s...,,2025-10-30T10:11:29.738432
1,E.Leclerc,LEGO® CITY 60475,,26.9,https://www.e.leclerc/fp/lego-city-60475-57020...,https://media.e.leclerc/5702017812687_1?op_sha...,Démarrez le compte à rebours des fêtes avec Le...,,2025-10-30T10:11:30.488226
2,E.Leclerc,BARBIE JOYEUX NOEL BLONDE,5 €,37.9,https://www.e.leclerc/fp/barbie-joyeux-noel-bl...,https://media.e.leclerc/0194735260966_1?op_sha...,Célébrez la saison avec la Barbie Joyeux Noel ...,,2025-10-30T10:11:31.274749


[INFO] Page 1 enrichie: 34 lignes


## Pagination robuste + collecte multi-pages 

In [None]:
def go_next_page(driver, wait) -> bool:
    # mémoriser le 1er item de la page actuelle
    try:
        first_before = driver.find_element(By.XPATH, XPATH_ALL_PRODUCT_CARDS + "[1]").text[:60]
    except NoSuchElementException:
        first_before = ""

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)

    try:
        next_li = driver.find_element(By.XPATH, XPATH_NEXT_LI)
    except NoSuchElementException:
        return False

    li_class = (next_li.get_attribute("class") or "").lower()
    if "disabled" in li_class:
        return False

    try:
        next_a = next_li.find_element(By.XPATH, "./a")
    except NoSuchElementException:
        return False

    driver.execute_script("arguments[0].scrollIntoView()", next_a)
    time.sleep(0.2)
    driver.execute_script("arguments[0].click()", next_a)

    try:
        WebDriverWait(driver, 12).until(
            lambda d: d.find_element(By.XPATH, XPATH_ALL_PRODUCT_CARDS + "[1]").text[:60] != first_before
        )
    except TimeoutException:
        return False

    try:
        wait.until(EC.presence_of_all_elements_located((By.XPATH, XPATH_ALL_PRODUCT_CARDS)))
    except TimeoutException:
        return False

    return True

# --------- collecte multi-pages (2 pages max) ---------
ALL_ROWS = []
MAX_PAGES = 2

# On a déjà la page 1 en mémoire (page1_full). On l'ajoute d'abord.
ALL_ROWS.extend(page1_full)

current_page = 1
while current_page < MAX_PAGES:
    moved = go_next_page(driver, wait)
    if not moved:
        print(f"[STOP] Pas de page suivante à la page {current_page}")
        break
    current_page += 1
    print(f"[STEP] Page {current_page} chargée")

    # extraire cartes
    cards = driver.find_elements(By.XPATH, XPATH_ALL_PRODUCT_CARDS)
    tmp_rows = [extract_card_data(c) for c in cards]
    # enrichir avec détails
    for row in tmp_rows:
        details = fetch_details(row.get("page_url"), driver=driver, wait=wait)
        r = dict(row); r.update(details); r["scraped_at"] = datetime.utcnow().isoformat()
        ALL_ROWS.append(r)

print(f"[INFO] Total collecté: {len(ALL_ROWS)} produits (sur {current_page} page(s))")

df_all = pd.DataFrame(ALL_ROWS)
display(df_all.head(2))


[STEP] Page 2 chargée


  r = dict(row); r.update(details); r["scraped_at"] = datetime.utcnow().isoformat()


[STEP] Page 3 chargée
[STEP] Page 4 chargée
[STEP] Page 5 chargée
[INFO] Total collecté: 162 produits (sur 5 page(s))


Unnamed: 0,sold_by,product_name,discount_text,price_eur,page_url,image_url,description,features,scraped_at
0,Emma Matelas,EMMA | Matelas Original II Plus | Ressorts + M...,30 %,399.0,https://www.e.leclerc/fp/emma-matelas-original...,https://media.e.leclerc/4255762726835_1?op_sha...,Profitez de la version Plus dotée de la même s...,,2025-10-30T10:11:29.738432
1,E.Leclerc,LEGO® CITY 60475,,26.9,https://www.e.leclerc/fp/lego-city-60475-57020...,https://media.e.leclerc/5702017812687_1?op_sha...,Démarrez le compte à rebours des fêtes avec Le...,,2025-10-30T10:11:30.488226
2,E.Leclerc,BARBIE JOYEUX NOEL BLONDE,5 €,37.9,https://www.e.leclerc/fp/barbie-joyeux-noel-bl...,https://media.e.leclerc/0194735260966_1?op_sha...,Célébrez la saison avec la Barbie Joyeux Noel ...,,2025-10-30T10:11:31.274749
3,E.Leclerc,LICORNE SONS ET LUMIERES,3 €,22.9,https://www.e.leclerc/fp/licorne-sons-et-lumie...,https://media.e.leclerc/0194735274727_1?op_sha...,La Licorne Sons et Lumières Barbie brille de m...,,2025-10-30T10:11:32.046133
4,E.Leclerc,Table d'appoint - enceinte et chargeur à induc...,,38.0,https://www.e.leclerc/fp/table-d-appoint-encei...,https://media.e.leclerc/3603313451902_1?op_sha...,La table d'appoint HOMESIDE possède deux fonct...,"Commandes : lecture des pistes audio, réglage ...",2025-10-30T10:11:32.748467


## Enregistrer en SQLite et aperçu

In [15]:

DB_PATH = "leclerc_deals.db"
con = sqlite3.connect(DB_PATH)
cur = con.cursor()
cur.execute("""
CREATE TABLE IF NOT EXISTS leclerc_deals (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    sold_by TEXT,
    product_name TEXT,
    discount_text TEXT,
    price_eur REAL,
    page_url TEXT,
    image_url TEXT,
    description TEXT,
    features TEXT,
    scraped_at TEXT
);
""")
con.commit()

rows_to_insert = [
    (r.get("sold_by"), r.get("product_name"), r.get("discount_text"), r.get("price_eur"),
     r.get("page_url"), r.get("image_url"), r.get("description"), r.get("features"), r.get("scraped_at"))
    for r in ALL_ROWS
]
cur.executemany("""
INSERT INTO leclerc_deals
(sold_by, product_name, discount_text, price_eur, page_url, image_url, description, features, scraped_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""", rows_to_insert)
con.commit()

df_preview = pd.read_sql_query("SELECT sold_by, product_name, discount_text, price_eur, substr(scraped_at,1,19) as scraped_at FROM leclerc_deals ORDER BY id DESC LIMIT 10;", con)
con.close()

df_preview


Unnamed: 0,sold_by,product_name,discount_text,price_eur,scraped_at
0,E.Leclerc,Rasoir Philips Series 1000 Philips X3003/02,33 %,49.99,2025-10-30T10:15:19
1,E.Leclerc,RUBSON Recharge SENSATION 3en1 Neutre Pure Lot...,25 %,7.9,2025-10-30T10:15:18
2,Mon mobilier design,VEGA Siège auto réglable i-Size 100-150 cm jus...,19 %,49.9,2025-10-30T10:15:17
3,Fuxtec,FUXTEC Kit souffleur/aspirateur de feuilles à ...,17 %,124.0,2025-10-30T10:15:16
4,IDMarket,IDMARKET Enclos poulailler dôme parc grillagé ...,17 %,149.99,2025-10-30T10:15:16
5,E.Leclerc,Kidisecrets Selfie Mauve,2 €,44.9,2025-10-30T10:15:15
6,E.Leclerc,ABRI DE JARDIN METAL 108WGY 7.18M²,,598.9,2025-10-30T10:15:14
7,E.Leclerc,"OPPO 13 FS 5G 16,9 cm (6.67"") Double SIM Andro...",100 €,299.9,2025-10-30T10:15:13
8,E.Leclerc,"Débroussailleuse thermique 42,7cc - Sélection ...",,115.0,2025-10-30T10:15:12
9,E.Leclerc,VELO D'EQUILIBRE TOVE (BEIGE),,44.9,2025-10-30T10:15:11


## Fermer le navigateur

In [None]:
try:
    driver.quit()
    print("[DONE] Chrome fermé")
except Exception:
    pass


[DONE] Chrome fermé
