In [19]:

import requests, re, time, os, fitz
import pandas as pd
import spacy
from bs4 import BeautifulSoup


In [20]:
# Configuration shared by the scraping, PDF-extraction and export cells.
nlp = spacy.load("fr_core_news_sm")  # spaCy pipeline; nettoyer_text() uses it to split text into sentences
HEADERS = {"User-Agent": "Mozilla/5.0"}  # HTTP headers sent with every requests.get() call
DATE = pd.Timestamp.now().strftime("%Y-%m-%d")  # YYYY-MM-DD at the time this cell runs; stored in each record's "date" field
TAILLE_CIBLE = 450  # upper bound on the number of sentences sampled per register when balancing

In [1]:

# Cleaning
def nettoyer_text(texte, seuil_mots=5):
    """Strip noise from raw text and return the sentences that are long enough.

    The input is converted with ``str()``, then bracketed spans (``[...]``),
    parenthesised spans (``(...)``), tokens starting with ``http`` and a fixed
    set of symbols are deleted. Runs of whitespace are collapsed to a single
    space before the text is handed to the module-level ``nlp`` pipeline for
    sentence splitting.

    Args:
        texte: Raw text (any object; it is stringified first).
        seuil_mots: Minimum number of whitespace-separated words a sentence
            must contain to be kept.

    Returns:
        list[str]: The stripped sentences, in document order.
    """
    motif_bruit = r"\[.*?\]|\([^)]+\)|http\S+|[\"*\[\]\{\}<>\|\\/~^=]"
    sans_bruit = re.sub(motif_bruit, "", str(texte)).strip()
    compact = re.sub(r"\s+", " ", sans_bruit)

    phrases = []
    for sent in nlp(compact).sents:
        phrase = sent.text.strip()
        if len(phrase.split()) >= seuil_mots:
            phrases.append(phrase)
    return phrases


In [22]:
# Reddit
def scraper_reddit(url, timeout=10):
    """Download a Reddit thread page and return its sentences as "familier" records.

    The text of every ``<p>`` that is a direct child of a ``div.md`` element is
    joined with spaces and passed through ``nettoyer_text``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; without it a
            stalled connection would block the notebook indefinitely.

    Returns:
        list[dict]: One ``{"texte", "registre", "date"}`` record per sentence.
        Any failure (network error, HTTP error status, parsing error) is
        printed and an empty list is returned instead of raising.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=timeout)
        # Treat 4xx/5xx answers (rate limiting, blocking...) as failures rather
        # than parsing the error page as if it were the thread.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        texte = " ".join(p.get_text() for p in soup.select("div.md > p"))
        return [{"texte": t, "registre": "familier", "date": DATE} for t in nettoyer_text(texte)]
    except Exception as e:
        print(" Reddit:", e)
        return []

In [23]:

# Wikipedia
def scraper_wikipedia(urls, timeout=10, pause=1):
    """Download pages and return their sentences as "courant" records.

    For each page, the text of every ``<p>`` longer than 40 characters is
    joined with spaces and passed through ``nettoyer_text``.

    Args:
        urls: Iterable of page addresses.
        timeout: Seconds to wait for each server answer before giving up.
        pause: Seconds to sleep between two consecutive requests. The pause is
            also observed after a failed request, and no time is wasted after
            the last one.

    Returns:
        list[dict]: One ``{"texte", "registre", "date"}`` record per sentence,
        in page order. A page that fails (network error, HTTP error status,
        parsing error) is reported with ``print`` and skipped.
    """
    corpus = []
    for position, url in enumerate(urls):
        if position:
            time.sleep(pause)
        try:
            r = requests.get(url, headers=HEADERS, timeout=timeout)
            # Do not harvest sentences from 4xx/5xx error pages.
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
            texte = " ".join(p.get_text() for p in soup.find_all("p") if len(p.get_text()) > 40)
            phrases = nettoyer_text(texte)
            corpus += [{"texte": t, "registre": "courant", "date": DATE} for t in phrases]
        except Exception as e:
            print(f"Wikipédia : {url} → {e}")
    return corpus


In [24]:
# PDF
def extraire_phrases_pdfs(chemins):
    """Extract the text of each PDF and return its sentences as "academique" records.

    Args:
        chemins: Iterable of PDF file paths.

    Returns:
        list[dict]: One ``{"texte", "registre", "date"}`` record per sentence
        returned by ``nettoyer_text``, in file order. A file that cannot be
        opened or read is reported with ``print`` and skipped.
    """
    corpus = []
    for pathpdf in chemins:
        try:
            # The context manager closes the document (and its file handle)
            # even when text extraction fails part-way through.
            with fitz.open(pathpdf) as document:
                texte = "".join(page.get_text() for page in document)
            phrases = nettoyer_text(texte, seuil_mots=5)
            corpus += [{"texte": t, "registre": "academique", "date": DATE} for t in phrases]
        except Exception as e:
            print(f"ERR  : {pathpdf} → {e}")
    return corpus

In [25]:
# Source locations for the three registers
reddit_url = "https://old.reddit.com/r/france/comments/1kws0vy/..."
wiki_urls = [
    "https://fr.wikipedia.org/wiki/Technologie",
    "https://fr.wikipedia.org/wiki/Intelligence_artificielle",
]
pdf_paths = [
    "../databrut/academique.pdf",
    "../databrut/climat.pdf",
]

# Raw records per register, collected in this order: Reddit, Wikipedia, PDFs.
data_all = dict(
    familier=scraper_reddit(reddit_url),
    courant=scraper_wikipedia(wiki_urls),
    academique=extraire_phrases_pdfs(pdf_paths),
)

🔴 Reddit...
🟡 Wikipédia...
  ✅ 111 phrases de https://fr.wikipedia.org/wiki/Technologie
  ✅ 527 phrases de https://fr.wikipedia.org/wiki/Intelligence_artificielle
🔵 Extraction PDF...
  ✅ 137 phrases depuis : ../databrut/academique.pdf
  ✅ 4872 phrases depuis : ../databrut/climat.pdf


In [26]:
# Clean and de-duplicate each register, export it, then build a balanced corpus.
corpus_total = []
for registre, data in data_all.items():
    df = pd.DataFrame(data)
    # A source that produced no records gives an empty frame: nothing to export.
    if df.empty or "texte" not in df.columns:
        continue
    df = (
        df.assign(
            # Normalised form (lower-case, trimmed, single spaces) used as the
            # de-duplication key; the original sentence stays in "texte".
            texte_nettoye=df["texte"].str.lower().str.strip().str.replace(r"\s+", " ", regex=True)
        )
        # Drop fragments whose normalised form is shorter than 20 characters.
        .loc[lambda d: d["texte_nettoye"].str.len() >= 20]
        .drop_duplicates(subset="texte_nettoye")
        .dropna()
        .assign(registre=registre)
    )
    corpus_total.append(df)
    df.to_csv(f"corpus_{registre}.csv", index=False, encoding="utf-8")

# pd.concat([]) fails with an unhelpful "No objects to concatenate"; say what
# actually went wrong when every source came back empty.
if not corpus_total:
    raise ValueError("Aucune phrase collectée : impossible de générer corpus_equilibre.csv")

df_complet = pd.concat(corpus_total, ignore_index=True)

# At most TAILLE_CIBLE sentences per register, sampled with a fixed seed.
# Concatenating the per-group samples picks the same rows as
# groupby(...).apply(...) did, without the pandas deprecation warning raised
# when apply() operates on the grouping column.
df_equilibre = pd.concat(
    [
        groupe.sample(n=min(len(groupe), TAILLE_CIBLE), random_state=42)
        for _, groupe in df_complet.groupby("registre")
    ],
    ignore_index=True,
)

df_equilibre.to_csv("corpus_equilibre.csv", index=False, encoding="utf-8")
print("\n✅ corpus_equilibre.csv généré avec succès !")

💾 corpus_familier.csv → 437 lignes
💾 corpus_courant.csv → 638 lignes
💾 corpus_academique.csv → 4824 lignes

✅ corpus_equilibre.csv généré avec succès !
registre
academique    450
courant       450
familier      437
Name: count, dtype: int64


  df_equilibre = df_complet.groupby("registre", group_keys=False).apply(
