In [1]:
# 📚 Corpus Extraction, Nettoyage et Équilibrage

import requests, re, time, os, fitz
import pandas as pd
import spacy
from bs4 import BeautifulSoup


In [None]:
# 📌 Configuration shared by the cells below
TAILLE_CIBLE = 450  # cap on the number of sentences sampled per register when balancing
DATE = pd.Timestamp.now().strftime("%Y-%m-%d")  # today's date (YYYY-MM-DD), stamped on every record
HEADERS = {"User-Agent": "Mozilla/5.0"}  # User-Agent header sent with each HTTP request
nlp = spacy.load("fr_core_news_sm")  # French spaCy pipeline used for sentence segmentation

In [None]:

# 🧽 Nettoyage & segmentation
def nettoyer_et_segmenter(texte, seuil_mots=5):
    """Strip noise from raw text and split it into sentences.

    Bracketed spans, parenthesised spans, URLs and a set of stray symbols
    are removed, whitespace runs are collapsed to single spaces, and the
    result is segmented with the module-level spaCy pipeline ``nlp``.

    Args:
        texte: Raw text; coerced with ``str()`` before cleaning.
        seuil_mots: Minimum number of whitespace-separated words a sentence
            must contain to be kept.

    Returns:
        list[str]: The stripped sentences that reach ``seuil_mots`` words,
        in document order.
    """
    motif_bruit = r"\[.*?\]|\([^)]+\)|http\S+|[\"*\[\]\{\}<>\|\\/~^=]"
    sans_bruit = re.sub(motif_bruit, "", str(texte)).strip()
    normalise = re.sub(r"\s+", " ", sans_bruit)

    phrases = []
    for sent in nlp(normalise).sents:
        phrase = sent.text.strip()
        # Short fragments (headings, leftovers) are dropped.
        if len(phrase.split()) >= seuil_mots:
            phrases.append(phrase)
    return phrases


In [None]:
# 🔴 Reddit
def scraper_reddit(url, timeout=10):
    """Fetch a Reddit page and return its cleaned sentences as records.

    The text of every ``<p>`` that is a direct child of a ``div.md`` element
    is joined and passed to ``nettoyer_et_segmenter``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        list[dict]: One ``{"texte", "registre", "date"}`` record per sentence,
        with ``registre`` set to ``"familier"``. An empty list is returned
        (and the error printed) if anything goes wrong.
    """
    print("🔴 Reddit...")
    try:
        r = requests.get(url, headers=HEADERS, timeout=timeout)
        # Surface HTTP errors (403, 429, ...) instead of parsing an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        texte = " ".join(p.get_text() for p in soup.select("div.md > p"))
        return [{"texte": t, "registre": "familier", "date": DATE} for t in nettoyer_et_segmenter(texte)]
    except Exception as e:
        # Best-effort scraping: report the failure and let the notebook continue.
        print("❌ Reddit:", e)
        return []

In [None]:

# 🟡 Wikipédia
def scraper_wikipedia(urls, timeout=10):
    """Download pages and return their cleaned sentences as records.

    For each URL, the text of every ``<p>`` longer than 40 characters is
    joined and passed to ``nettoyer_et_segmenter``.

    Args:
        urls: Iterable of page addresses to download.
        timeout: Seconds to wait for each response before giving up. Without
            a timeout ``requests`` may block forever on a stalled connection.

    Returns:
        list[dict]: One ``{"texte", "registre", "date"}`` record per sentence,
        with ``registre`` set to ``"courant"``. URLs that fail are reported
        with a printed message and skipped.
    """
    print("🟡 Wikipédia...")
    corpus = []
    for url in urls:
        try:
            r = requests.get(url, headers=HEADERS, timeout=timeout)
            # Surface HTTP errors instead of parsing an error page as content.
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
            texte = " ".join(p.get_text() for p in soup.find_all("p") if len(p.get_text()) > 40)
            phrases = nettoyer_et_segmenter(texte)
            corpus += [{"texte": t, "registre": "courant", "date": DATE} for t in phrases]
            print(f"  ✅ {len(phrases)} phrases de {url}")
            # Pause between successful requests.
            time.sleep(1)
        except Exception as e:
            # Best-effort: one failing page must not abort the others.
            print(f"❌ Wikipédia : {url} → {e}")
    return corpus


In [None]:
# 🔵 PDF
def extraire_phrases_pdfs(chemins):
    """Extract cleaned sentences from PDF files as records.

    The text of all pages of each document is concatenated and passed to
    ``nettoyer_et_segmenter`` with a 6-word minimum.

    Args:
        chemins: Iterable of paths to PDF files.

    Returns:
        list[dict]: One ``{"texte", "registre", "date"}`` record per sentence,
        with ``registre`` set to ``"academique"``. Files that cannot be
        processed are reported with a printed message and skipped.
    """
    print("🔵 Extraction PDF...")
    corpus = []
    for path in chemins:
        try:
            # The context manager closes the document (and its file handle)
            # even when text extraction raises.
            with fitz.open(path) as document:
                texte = "".join(page.get_text() for page in document)
            phrases = nettoyer_et_segmenter(texte, seuil_mots=6)
            corpus += [{"texte": t, "registre": "academique", "date": DATE} for t in phrases]
            print(f"  ✅ {len(phrases)} phrases depuis : {path}")
        except Exception as e:
            # Best-effort: one unreadable file must not abort the others.
            print(f"❌ PDF : {path} → {e}")
    return corpus

In [None]:
# 📥 Extraction: input sources for each register
# NOTE(review): the Reddit URL ends with a literal "..." — looks truncated; confirm.
reddit_url = "https://old.reddit.com/r/france/comments/1kws0vy/..."
wiki_urls = [
    "https://fr.wikipedia.org/wiki/Technologie",
    "https://fr.wikipedia.org/wiki/Intelligence_artificielle",
]
pdf_paths = ["../data/academique.pdf", "../data/climat.pdf"]

# Register name → list of sentence records returned by the matching extractor.
data_all = dict(
    familier=scraper_reddit(reddit_url),
    courant=scraper_wikipedia(wiki_urls),
    academique=extraire_phrases_pdfs(pdf_paths),
)

In [None]:
# Per-register cleaning: normalise, filter, de-duplicate, then save one CSV each.
corpus_total = []
for registre, data in data_all.items():
    df = pd.DataFrame(data)
    if df.empty or "texte" not in df.columns:
        # Make the skip visible: an extractor that failed returns an empty list.
        print(f"⚠️ {registre} : aucune donnée, registre ignoré")
        continue
    # Built as a single chain so every step yields a new frame; assigning a
    # column on a boolean-filtered frame can raise SettingWithCopyWarning.
    df = (
        df.assign(
            # Lower-cased, stripped, whitespace-collapsed copy of the sentence.
            texte_nettoye=df["texte"].str.lower().str.strip().str.replace(r"\s+", " ", regex=True)
        )
        .loc[lambda d: d["texte_nettoye"].str.len() >= 20]  # drop very short sentences
        .drop_duplicates(subset="texte_nettoye")
        .dropna()
        .assign(registre=registre)
    )
    corpus_total.append(df)
    df.to_csv(f"corpus_{registre}.csv", index=False, encoding="utf-8")
    print(f"💾 corpus_{registre}.csv → {len(df)} lignes")

# ⚖️ Merge & balance
# Every extractor returns [] on failure, so the list can be empty; fail with an
# explicit message instead of pandas' "No objects to concatenate".
if not corpus_total:
    raise ValueError("Aucun registre n'a produit de données : corpus équilibré impossible à construire.")
df_complet = pd.concat(corpus_total, ignore_index=True)

# Keep at most TAILLE_CIBLE sentences per register (smaller registers are kept
# whole). Groups are sampled explicitly rather than through groupby.apply:
# pandas >= 2.2 deprecates apply operating on the grouping column, and later
# versions exclude it, which would drop "registre" from the result.
echantillons = [
    groupe.sample(n=min(len(groupe), TAILLE_CIBLE), random_state=42)
    for _, groupe in df_complet.groupby("registre")
]
# No group at all means every row was filtered out: keep the empty frame.
df_equilibre = pd.concat(echantillons, ignore_index=True) if echantillons else df_complet.copy()

df_equilibre.to_csv("corpus_equilibre.csv", index=False, encoding="utf-8")
print("\n✅ corpus_equilibre.csv généré avec succès !")
print(df_equilibre["registre"].value_counts())

In [None]:

# 👀 Preview: first three cleaned sentences of each register
for nom_registre in df_equilibre["registre"].unique():
    masque = df_equilibre["registre"] == nom_registre
    extrait = df_equilibre.loc[masque, "texte_nettoye"].head(3)
    print(f"\n🔹 {nom_registre.upper()} :")
    print(extrait.to_string(index=False))
