
# Pipeline Pack Ultra‑Cohérente — Rask × LLM (**v4 — stricte, durée→mots, audit vitesse**)

Objectif : générer **tous les segments en une seule requête** pour garantir un style homogène **et** un **respect strict du tempo** (durée → nombre de mots via `WPM`).  
La qualité pédagogique prime, mais la longueur est **contrainte** segment par segment. Si le LLM dévie, une **normalisation stricte** recadre à l’exact nombre de mots.  
En fin de pipeline, un **audit de vitesse parlée** vérifie le débit réel par segment.

**Étapes :**
1) Configuration (Rask, OpenAI, WPM)  
2) Auth + fonctions Rask  
3) Ingestion transcript + cibles (durée → mots)  
4) Construction du pack (avec `exact_words`, `min_words`, `max_words` par segment)  
5) Génération pack (une requête) → JSON **patchable Rask**  
6) Régénération ciblée des segments hors bornes (2 passes max)  
7) **Normalisation stricte**: chaque segment atteint **exactement** `exact_words` (LLM strict + fallback)  
8) Audit fins/débuts + auto-fix (pas de phrase coupée)  
9) Tableau AVANT / APRÈS (tailles)  
10) **Audit vitesse** par segment (WPM) et statut OK/FAST/SLOW  
11) Patch Rask (dry‑run par défaut)  
12) Exports


In [431]:

# (0) Dépendances (exécuter si nécessaire)
# %pip install --quiet requests python-dotenv pandas tqdm openai


In [432]:

# (1) Configuration & variables d'environnement
import os
from dotenv import load_dotenv
load_dotenv()

# Rask
RASK_TOKEN = os.getenv("RASK_TOKEN", "")
RASK_CLIENT_ID = os.getenv("RASK_CLIENT_ID", "")
RASK_CLIENT_SECRET = os.getenv("RASK_CLIENT_SECRET", "")

# OpenAI
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5-mini")
OPENAI_TEMPERATURE = float(os.getenv("OPENAI_TEMPERATURE", "0.2"))
OPENAI_MAX_RETRIES = int(os.getenv("OPENAI_MAX_RETRIES", "2"))

# Paramètres projet
LANG_DEST = os.getenv("LANG_DEST", "fr-fr")
WPM = float(os.getenv("WPM", "147.0"))                    # mots/minute cible
LENGTH_TOLERANCE = int(os.getenv("LENGTH_TOLERANCE", "2"))# ± mots autour de la cible pour la 1ère passe
MIN_SENT_WORDS = int(os.getenv("MIN_SENT_WORDS", "7"))    # fourchette indicative par phrase
MAX_SENT_WORDS = int(os.getenv("MAX_SENT_WORDS", "22"))
RATE_TOL_PCT = float(os.getenv("RATE_TOL_PCT", "2.0"))    # tolérance vitesse ±% autour du WPM
STRICT_MAX_RETRIES = int(os.getenv("STRICT_MAX_RETRIES", "10"))  # tentatives LLM pour exact N mots

# Sélection du projet (nom ou URL app.rask.ai)
PROJECT_SELECTOR = "3_-_Presentation_des_moules_de_bougies_KtwWjO"

# Scope à traiter (None = tous)
SEGMENTS_RANGE = None

# Patch
DRY_RUN = True         # True: prévisualiser; False: envoyer
INCLUDE_TIMING = True # True pour pousser start/end

print("Config OK —", {
    "LANG_DEST": LANG_DEST, "WPM": WPM, "TOL_WORDS": LENGTH_TOLERANCE,
    "SENT_RANGE": f"{MIN_SENT_WORDS}-{MAX_SENT_WORDS}",
    "RATE_TOL_PCT": RATE_TOL_PCT,
    "STRICT_MAX_RETRIES": STRICT_MAX_RETRIES,
    "PROJECT_SELECTOR": PROJECT_SELECTOR,
    "DRY_RUN": DRY_RUN, "INCLUDE_TIMING": INCLUDE_TIMING
})


Config OK — {'LANG_DEST': 'fr-fr', 'WPM': 147.0, 'TOL_WORDS': 2, 'SENT_RANGE': '7-22', 'RATE_TOL_PCT': 2.0, 'STRICT_MAX_RETRIES': 10, 'PROJECT_SELECTOR': '8_-_Pratique_2_utilisation_des_colorants_Hibou_KtwWjO', 'DRY_RUN': True, 'INCLUDE_TIMING': True}


In [433]:

# (2) Auth & helpers Rask
import re, json, time, math
import pandas as pd
import requests
from IPython.display import display

TOKEN_URL = "https://rask-prod.auth.us-east-2.amazoncognito.com/oauth2/token"
SCOPES = "api/source api/input api/output api/limit"

PROJECTS_URL = "https://api.rask.ai/v2/projects"
GET_PROJECT_URL = "https://api.rask.ai/v2/projects/{project_id}"
TRANSCRIPTION_URL = "https://api.rask.ai/v2/projects/{project_id}/transcription"
PATCH_SEGMENTS_URL = "https://api.rask.ai/v2/projects/{project_id}/transcription/segments"
GENERATE_URL = "https://api.rask.ai/v2/projects/{project_id}/generate"

def get_token(client_id: str, client_secret: str) -> str:
    if not client_id or not client_secret:
        raise RuntimeError("RASK_CLIENT_ID / RASK_CLIENT_SECRET manquants (ou fournir RASK_TOKEN).")
    r = requests.post(TOKEN_URL, data={"grant_type":"client_credentials","scope":SCOPES},
                      auth=(client_id, client_secret), timeout=30)
    r.raise_for_status()
    return r.json()["access_token"]

def build_headers() -> dict:
    token = RASK_TOKEN or get_token(RASK_CLIENT_ID, RASK_CLIENT_SECRET)
    return {"Authorization": f"Bearer {token}"}

HEADERS = build_headers()
print("Auth Rask OK.")

def _normalize_name(s: str) -> str:
    return re.sub(r"[-\s_]+", "", (s or "").strip().lower())

def project_id_from_app_url(url: str) -> str | None:
    m = re.search(r"/project/([0-9a-fA-F-]{36})", str(url))
    return m.group(1) if m else None

def find_project_id_by_name(headers: dict, name: str, limit: int = 100) -> str | None:
    target = _normalize_name(name); offset = 0
    while True:
        r = requests.get(PROJECTS_URL, headers=headers, params={"offset": offset, "limit": limit}, timeout=30)
        r.raise_for_status()
        payload = r.json()
        for p in payload.get("projects", []):
            if _normalize_name(p.get("name")) == target:
                return p["id"]
        offset += limit
        if offset >= payload.get("total", 0):
            break
    offset = 0
    while True:
        r = requests.get(PROJECTS_URL, headers=headers, params={"offset": offset, "limit": limit}, timeout=30)
        r.raise_for_status()
        payload = r.json()
        for p in payload.get("projects", []):
            if target in _normalize_name(p.get("name")):
                return p["id"]
        offset += limit
        if offset >= payload.get("total", 0):
            break
    return None

def get_project(headers: dict, project_id: str) -> dict:
    r = requests.get(GET_PROJECT_URL.format(project_id=project_id), headers=headers, timeout=30)
    r.raise_for_status()
    return r.json()

def guess_dst_lang(headers: dict, project_id: str, default="fr-fr") -> str:
    info = get_project(headers, project_id)
    return (info.get("dst_lang") or info.get("dstLanguage") or default)

def select_project_id(selector: str) -> str:
    return project_id_from_app_url(selector) or find_project_id_by_name(HEADERS, selector) or            (_ for _ in ()).throw(RuntimeError(f"Aucun projet trouvé pour: {selector}"))


Auth Rask OK.


In [434]:

# (3) Ingestion + cibles (durée → mots via WPM)
def _parse_timecode(tc) -> float:
    if isinstance(tc, (int, float)): return float(tc)
    s = str(tc).strip().replace(",", ".")
    m = re.match(r"^(?P<h>\d{1,2}):(?P<m>\d{2}):(?P<s>\d{2})(?:\.(?P<ms>\d{1,3}))?$", s)
    if m:
        h = int(m.group("h")); mn = int(m.group("m")); sec = int(m.group("s")); ms = int(m.group("ms") or 0)
        return h*3600 + mn*60 + sec + ms/1000.0
    return float(s)

WORD_RE = re.compile(r"\b[\w’'-]+\b", flags=re.UNICODE)
def words_count(text: str) -> int:
    return len(WORD_RE.findall(text or ""))

def words_target_for_duration(duration_sec: float, wpm: float) -> int:
    return max(0, int(round(wpm * (duration_sec / 60.0))))

def spoken_rate_wpm(words: int, duration_sec: float) -> float:
    return (words / max(0.5, float(duration_sec))) * 60.0

def get_transcription_df(headers: dict, project_id: str, wpm: float) -> pd.DataFrame:
    r = requests.get(TRANSCRIPTION_URL.format(project_id=project_id), headers=headers, timeout=60)
    r.raise_for_status()
    data = r.json()
    rows = []
    for seg in data.get("segments", []):
        start = seg.get("start"); end = seg.get("end")
        dur = max(0.0, _parse_timecode(end) - _parse_timecode(start))
        src = (seg.get("src") or {}).get("text", "")
        dst = (seg.get("dst") or {}).get("text", "")
        rows.append({
            "segment_id": seg.get("id"),
            "speaker": seg.get("speaker"),
            "start": start,
            "end": end,
            "duration_sec": round(dur, 3),
            "src_text": src,
            "dst_text": dst,
            "dst_length": words_count(dst),
            "estimation_length": words_target_for_duration(dur, wpm),
        })
    return pd.DataFrame(rows)

project_id = select_project_id(PROJECT_SELECTOR)
dst_lang = guess_dst_lang(HEADERS, project_id, default=LANG_DEST)
df = get_transcription_df(HEADERS, project_id, WPM)
indices_to_process = list(range(len(df))) if SEGMENTS_RANGE is None else list(SEGMENTS_RANGE)

print("Project ID:", project_id, "| dst_lang:", dst_lang, "| segments:", len(df))
display(df.head(8)[["segment_id","start","end","duration_sec","dst_length","estimation_length","dst_text"]])


Project ID: 7602f534-dd28-4793-90ef-4e6bd848d04e | dst_lang: fr-fr | segments: 22


Unnamed: 0,segment_id,start,end,duration_sec,dst_length,estimation_length,dst_text
0,7062485a-d5ee-41a7-8f4b-92e1811fc354,"00:00:00,619","00:00:06,290",5.671,21,14,"Dans cette partie, nous découvrons les moules ..."
1,59a9c338-d60f-4da6-b566-e524a1a64362,"00:00:06,918","00:00:12,201",5.283,25,13,"Même en moulage, les mesures restent essentiel..."
2,798b5160-d60b-4ce4-8cca-18612dce8e5a,"00:00:12,640","00:00:22,521",9.881,24,24,"Comme pour les bougies en contenant, on pèse l..."
3,ab857ca1-d57d-43aa-af3a-cbf10ec0211e,"00:00:23,115","00:00:31,757",8.642,69,21,"Ici, nous obtenons 36 g d’eau. La cire étant ..."
4,cf5e9d7c-b9e1-4e2e-a097-85aae1cf957e,"00:00:32,792","00:00:45,797",13.005,23,32,"Ces 28,8 g constituent la masse totale de vot..."
5,e3ef803b-1c4d-4741-855f-ed50fe228067,"00:00:46,561","00:01:00,602",14.041,61,34,"« Paraffine 40 % : 28,8 × 0,40 = 11,52 g, arro..."
6,8e39315a-8e31-4bba-86ed-6f00b396eac2,"00:01:01,451","00:01:17,173",15.722,50,39,"Soja 30 % : 28,8 × 0,30 = 8,64 g, arrondis à 9..."
7,225cf057-3c7a-4bd6-9877-bb03046ab75a,"00:01:17,988","00:01:32,114",14.126,36,35,"Acide stéarique 20 % : 28,8 × 0,20 = 5,76 g, a..."


In [435]:

# (4) Pack global — contraintes de longueur exactes
LEXICON = [
    "balance", "fondoir", "bain-marie", "cuillère en bois", "parfums",
    "colorants", "mèches", "moules", "pastille métallique", "support de mèche",
    "pipettes", "cire naturelle"
]

def build_segments_pack(df: pd.DataFrame, indices: list[int], tol: int = LENGTH_TOLERANCE) -> dict:
    dfx = df.reset_index(drop=True)
    pack = []
    for i in indices:
        base = str(dfx.at[i, "dst_text"] or dfx.at[i, "src_text"] or "").strip()
        prev_text = str(dfx.at[i-1, "dst_text"]).strip() if i-1 >= 0 else ""
        next_text = str(dfx.at[i+1, "dst_text"]).strip() if i+1 < len(dfx) else ""
        tgt = int(dfx.at[i, "estimation_length"])
        mn = max(1, tgt - tol)
        mx = tgt + tol
        pack.append({
            "index": i,
            "id": str(dfx.at[i, "segment_id"]),
            "exact_words": tgt,
            "min_words": mn,
            "max_words": mx,
            "duration_sec": float(dfx.at[i, "duration_sec"]),
            "base_text": base,
            "prev_text": prev_text,
            "next_text": next_text
        })
    context = {
        "language": LANG_DEST,
        "lexicon_preferred": LEXICON,
        "style": "présentation de matériel, pédagogique, clair, précis, ton professionnel sobre",
        "audience": "débutant à intermédiaire"
    }
    return {"context": context, "segments": pack}

segments_pack = build_segments_pack(df, indices_to_process, tol=LENGTH_TOLERANCE)
print("Pack prêt:", len(segments_pack["segments"]), "segments")


Pack prêt: 22 segments


In [436]:

# (5) Client OpenAI + prompts pack / strict / parse JSON
_openai_mode = None
try:
    from openai import OpenAI
    _client = OpenAI(api_key=OPENAI_API_KEY)
    _openai_mode = "new"
except Exception:
    try:
        import openai
        openai.api_key = OPENAI_API_KEY
        _openai_mode = "legacy"
    except Exception:
        _openai_mode = None

if not OPENAI_API_KEY or not _openai_mode:
    print("AVERTISSEMENT: OpenAI indisponible. Fallback sur textes existants.")

GLOBAL_CHARTER = (
    "Génère tous les segments en une seule passe pour un style homogène. "
    "Chaque segment est un paragraphe complet, autoportant, ponctué, ne se poursuit jamais dans le suivant. "
    "Style pédagogique et précis, sans pas-à-pas; lexique constant: " + ", ".join(LEXICON) + "."
)

def build_pack_prompt(lang_dest: str, pack: dict, smin: int, smax: int) -> list[dict]:
    system = (
        f"Tu es un rédacteur pédagogique expert. Tu écris en {lang_dest}. "
        f"{GLOBAL_CHARTER} Respecte des phrases simples courtes à moyennes. "
        "RENVOIE UNIQUEMENT un JSON **valide** directement patchable par l'API Rask."
    )
    user_payload = {
        "contraintes_globales": {
            "phrases": f"{smin}-{smax} mots/phrase environ",
            "pas_d_actions": True,
            "pas_de_listes": True,
            "segment_independant": True,
            "transition_douce": True,
            "interdit_finir_par": ["et","mais","ou","donc","or","ni","car","cela","les"]
        },
        "règle_longueur": "Pour chaque segment, produire **exactement** <exact_words> mots (±2 seulement si la fluidité l'exige).",
        "segments": [
            {
                "id": it["id"],
                "exact_words": it["exact_words"],
                "min_words": it["min_words"],
                "max_words": it["max_words"],
                "duration_sec": it["duration_sec"],
                "base_text": it["base_text"],
                "prev_text": it["prev_text"],
                "next_text": it["next_text"]
            } for it in pack["segments"]
        ],
        "sortie_json_patch_rask": f'[{{"id":"<segment_id>","dst":{{"text":"<texte EXACT <exact_words> mots>","lang":"{LANG_DEST}"}}}}]'
    }
    return [{"role":"system","content":system},
            {"role":"user","content":json.dumps(user_payload, ensure_ascii=False)}]

def build_strict_segment_prompt(lang_dest: str, base_text: str, prev_text: str, next_text: str,
                                exact_words: int, smin: int, smax: int) -> list[dict]:
    system = (
        f"Tu écris en {lang_dest} un segment unique, très fluide, pédagogique. Sans mots clés scientisiques compliqué"
        "Un seul paragraphe, ponctuation finale, sans listes ni actions procédurales. "
        f"Le texte DOIT contenir **exactement {exact_words} mots** (compte standard). Sans anglissime"
    )
    user = {
        "exact_words": exact_words,
        "phrases": f"{smin}-{smax} mots/phrase environ",
        "contexte": {"prev_text": prev_text, "next_text": next_text},
        "base_text": base_text,
        "sortie": {"format": "texte seul, sans guillemets, sans balises"}
    }
    return [{"role":"system","content":system},
            {"role":"user","content":json.dumps(user, ensure_ascii=False)}]

def openai_chat(messages: list[dict], model: str, temperature: float) -> str:
    last_err = None
    for _ in range(1 + OPENAI_MAX_RETRIES):
        try:
            if _openai_mode == "new":
                resp = _client.chat.completions.create(model=model,
                                                       # temperature=temperature,
                                                       messages=messages)
                return resp.choices[0].message.content.strip()
            else:
                resp = openai.ChatCompletion.create(model=model,
                                                    # temperature=temperature,
                                                    messages=messages)
                return resp["choices"][0]["message"]["content"].strip()
        except Exception as e:
            last_err = e
            time.sleep(0.8)
    raise RuntimeError(f"Echec OpenAI après retries: {last_err}")

def parse_patch_json(raw: str) -> list[dict]:
    try:
        data = json.loads(raw)
        if isinstance(data, dict) and "segments" in data:
            return data["segments"]
        if isinstance(data, list):
            return data
    except json.JSONDecodeError:
        m = re.search(r"```json\s*(\[.*?\])\s*```", raw, flags=re.DOTALL)
        if m: return json.loads(m.group(1))
        m2 = re.search(r"(\[\s*\{.*\}\s*\])", raw, flags=re.DOTALL)
        if m2: return json.loads(m2.group(1))
    raise RuntimeError("Réponse LLM non JSON patchable Rask.")


In [None]:

# (6) Génération pack + régénération bornes + normalisation stricte EXACT
from tqdm.auto import tqdm

def generate_pack(df: pd.DataFrame,
                  lang_dest: str,
                  indices: list[int],
                  tol_words: int,
                  smin: int, smax: int,
                  model: str = OPENAI_MODEL,
                  temperature: float = OPENAI_TEMPERATURE) -> pd.DataFrame:
    dfx = df.reset_index(drop=True).copy()
    pack = build_segments_pack(dfx, indices, tol=tol_words)

    # Pass 1 — pack
    if not OPENAI_API_KEY or _openai_mode is None:
        print("OpenAI indisponible: fallback base_text.")
        results = [{"id": it["id"], "dst": {"text": it["base_text"], "lang": lang_dest}} for it in pack["segments"]]
    else:
        msgs = build_pack_prompt(lang_dest, pack, smin=smin, smax=smax)
        raw = openai_chat(msgs, model=model, temperature=temperature)
        results = parse_patch_json(raw)

    # Injection
    if "new_text" not in dfx.columns: dfx["new_text"] = pd.NA
    if "new_length" not in dfx.columns: dfx["new_length"] = pd.NA
    sid_to_idx = {str(dfx.at[i,"segment_id"]): i for i in range(len(dfx))}
    for item in results:
        sid = item.get("id") or item.get("segment_id")
        txt = (item.get("dst",{}).get("text") or item.get("text") or "").strip()
        if sid in sid_to_idx:
            j = sid_to_idx[sid]
            dfx.at[j,"new_text"] = txt
            dfx.at[j,"new_length"] = words_count(txt)

    # Régénération ciblée: segments hors [min,max] (2 passes max)
    def validate_lengths(dfa: pd.DataFrame, idxs: list[int]) -> pd.DataFrame:
        rows = []
        for i in idxs:
            tgt = int(dfa.at[i,"estimation_length"])
            mn, mx = max(1, tgt - tol_words), tgt + tol_words
            nl = int(dfa.at[i,"new_length"] or 0)
            if not (mn <= nl <= mx):
                rows.append({"index": i, "segment_id": dfa.at[i,"segment_id"],
                             "target": tgt, "min": mn, "max": mx, "have": nl})
        return pd.DataFrame(rows)

    def build_single_prompt(base_text: str, prev_text: str, next_text: str,
                            exact_words: int) -> list[dict]:
        return build_strict_segment_prompt(lang_dest, base_text, prev_text, next_text, exact_words, smin, smax)

    for p in range(2):
        bad = validate_lengths(dfx, indices)
        if bad.empty or not (OPENAI_API_KEY and _openai_mode):
            break
        print(f"Régénération ciblée (bornes) — pass {p+1}: {len(bad)} segments")
        for _, row in bad.iterrows():
            i = int(row["index"])
            target = int(row["target"])
            base = str(dfx.at[i,"new_text"] or dfx.at[i,"dst_text"] or dfx.at[i,"src_text"] or "").strip()
            prev_text = str(dfx.at[i-1,"new_text"] or dfx.at[i-1,"dst_text"] or "").strip() if i-1 >= 0 else ""
            next_text = str(dfx.at[i+1,"new_text"] or dfx.at[i+1,"dst_text"] or "").strip() if i+1 < len(dfx) else ""
            msgs = build_single_prompt(base, prev_text, next_text, target)
            cand = openai_chat(msgs, model=model, temperature=temperature).strip()
            dfx.at[i,"new_text"] = cand
            dfx.at[i,"new_length"] = words_count(cand)

    # Normalisation stricte EXACT = target (LLM strict + fallback)
    def exact_length_normalize(dfa: pd.DataFrame, idxs: list[int]) -> pd.DataFrame:
        out = dfa.copy()
        for i in idxs:
            target = int(out.at[i,"estimation_length"])
            cur = int(out.at[i,"new_length"] or 0)
            if cur == target:
                continue
            base = str(out.at[i,"new_text"] or out.at[i,"dst_text"] or out.at[i,"src_text"] or "").strip()
            prev_text = str(out.at[i-1,"new_text"] or out.at[i-1,"dst_text"] or "").strip() if i-1 >= 0 else ""
            next_text = str(out.at[i+1,"new_text"] or out.at[i+1,"dst_text"] or "").strip() if i+1 < len(out) else ""

            # Tentatives LLM exact N mots
            ok = False
            if OPENAI_API_KEY and _openai_mode:
                for _ in range(STRICT_MAX_RETRIES):
                    msgs = build_strict_segment_prompt(LANG_DEST, base, prev_text, next_text, target, smin, smax)
                    candidate = openai_chat(msgs, model=model, temperature=temperature).strip()
                    if words_count(candidate) == target:
                        out.at[i,"new_text"] = candidate
                        out.at[i,"new_length"] = target
                        ok = True
                        break
            if ok:
                continue

            # Fallback: compression/expansion soft
            txt = base
            cur = words_count(txt)
            if cur > target:
                sentences = re.split(r"(?<=[.!?…])\s+", txt.strip())
                # retirer la phrase la plus longue tant qu'on dépasse
                while words_count(" ".join(sentences)) > target and len(sentences) > 1:
                    idx_long = max(range(len(sentences)), key=lambda k: words_count(sentences[k]))
                    del sentences[idx_long]
                txt = " ".join(sentences).strip()
                toks = WORD_RE.findall(txt)
                if len(toks) > target:
                    toks = toks[:target]
                txt = " ".join(toks)
            elif cur < target:
                pad1 = " Cet aperçu fixe des repères clairs."
                pad2 = " Vous disposerez ainsi d’une base fiable."
                while words_count(txt) < target and txt.count(pad1) < 2:
                    txt = (txt + pad1).strip()
                if words_count(txt) < target:
                    txt = (txt + pad2).strip()
                toks = WORD_RE.findall(txt)
                if len(toks) > target:
                    toks = toks[:target]
                txt = " ".join(toks)

            out.at[i,"new_text"] = txt.strip()
            out.at[i,"new_length"] = words_count(txt)
        return out

    dfx = exact_length_normalize(dfx, indices)
    return dfx

df = generate_pack(df, LANG_DEST, indices_to_process, tol_words=LENGTH_TOLERANCE,
                   smin=MIN_SENT_WORDS, smax=MAX_SENT_WORDS)
print("Génération + normalisation strictes terminées.")


In [425]:

# (7) Audit frontières + auto-fix (fins propres, débuts nets)
TERM_PUNCT_RE = re.compile(r"[.!?…]$")
WEAK_START_RE = re.compile(r"^(?:et|mais|ou|donc|or|ni|car|cela|les)\b", re.IGNORECASE)
ORPHAN_END_RE = re.compile(r"\b(?:et|mais|ou|donc|or|ni|car|cela|les)\s*$", re.IGNORECASE)

def sentence_tokens(text: str) -> list[str]:
    s = (text or "").strip()
    if not s: return []
    parts = re.split(r"(?<=[.!?…])\s+", s)
    return [p.strip() for p in parts if p.strip()]

def audit_boundaries(df: pd.DataFrame, indices: list[int],
                     smin: int, smax: int) -> pd.DataFrame:
    rows = []
    for i in indices:
        txt = (df.at[i,"new_text"] or "").strip()
        sins = sentence_tokens(txt)
        issues = []
        if not TERM_PUNCT_RE.search(txt): issues.append("no_terminal_punct")
        if ORPHAN_END_RE.search(txt): issues.append("orphan_connector_end")
        if i+1 < len(df):
            nxt = (df.at[i+1,"new_text"] or "").strip()
            if WEAK_START_RE.search(nxt): issues.append("weak_start_next")
        if len(sins) == 0: issues.append("empty_segment")
        if len(sins) > 4: issues.append("too_many_sentences")
        for s in sins:
            wc = len(WORD_RE.findall(s))
            if wc < smin: issues.append("sentence_too_short")
            if wc > smax: issues.append("sentence_too_long")
        if issues:
            rows.append({"index": i, "segment_id": df.at[i,"segment_id"], "issues": ", ".join(sorted(set(issues)))})
    return pd.DataFrame(rows)

def polish_end(text: str) -> str:
    s = (text or "").strip()
    s = ORPHAN_END_RE.sub("", s).strip()
    if not TERM_PUNCT_RE.search(s):
        s += "."
    return re.sub(r"\s{2,}", " ", s)

def polish_start(text: str) -> str:
    s = (text or "").strip()
    if WEAK_START_RE.match(s):
        s = WEAK_START_RE.sub("Ensuite,", s, count=1)
        s = re.sub(r"\s{2,}", " ", s).strip()
        if s: s = s[0].upper() + s[1:]
    return s

def auto_fix_boundaries(df: pd.DataFrame, indices: list[int]) -> pd.DataFrame:
    dfx = df.copy()
    for i in indices:
        s = (dfx.at[i,"new_text"] or "").strip()
        s = polish_end(s)
        dfx.at[i,"new_text"] = s
        dfx.at[i,"new_length"] = words_count(s)
        if i+1 < len(dfx):
            nxt = (dfx.at[i+1,"new_text"] or "").strip()
            nxt = polish_start(nxt)
            dfx.at[i+1,"new_text"] = nxt
            dfx.at[i+1,"new_length"] = words_count(nxt)
    return dfx

issues_df = audit_boundaries(df, indices_to_process, smin=MIN_SENT_WORDS, smax=MAX_SENT_WORDS)
print("Issues frontières détectées:", len(issues_df))
display(issues_df.head(20))

df = auto_fix_boundaries(df, indices_to_process)

issues_after = audit_boundaries(df, indices_to_process, smin=MIN_SENT_WORDS, smax=MAX_SENT_WORDS)
print("Issues restantes après fix:", len(issues_after))
display(issues_after.head(20))


Issues frontières détectées: 3


Unnamed: 0,index,segment_id,issues
0,4,f1bdfd4a-7283-49b6-95ba-b7de7bf4b49c,sentence_too_long
1,5,40fe5576-2765-4e75-b2d4-e9e779ec338d,sentence_too_long
2,6,2caf09cf-05c5-44a8-87d7-c53cbc52abff,too_many_sentences


Issues restantes après fix: 3


Unnamed: 0,index,segment_id,issues
0,4,f1bdfd4a-7283-49b6-95ba-b7de7bf4b49c,sentence_too_long
1,5,40fe5576-2765-4e75-b2d4-e9e779ec338d,sentence_too_long
2,6,2caf09cf-05c5-44a8-87d7-c53cbc52abff,too_many_sentences


In [426]:

# (8) Visualisation AVANT / APRÈS + tailles
view_cols = ["segment_id","start","end","duration_sec","estimation_length","dst_text","dst_length","new_text","new_length"]
table = df.loc[indices_to_process, view_cols].copy()
display(table.head(20))

table.to_csv("v4_pack_avant_apres.csv", index=False, encoding="utf-8-sig")
print("Export -> v4_pack_avant_apres.csv")


Unnamed: 0,segment_id,start,end,duration_sec,estimation_length,dst_text,dst_length,new_text,new_length
0,8d7a71c7-84e7-4be4-addf-8d69c3d7c0b3,"00:00:00,738","00:00:10,418",9.68,24,Les cires jouent un rôle essentiel dans la fab...,24,"Les cires déterminent la combustion, l'aspect ...",24
1,fc98262d-4ef3-429a-9d85-3057c5d999c9,"00:00:10,418","00:00:13,913",3.495,9,Le choix de la cire influence directement la q...,9,Le choix de la cire naturelle influence la qua...,9
2,aa4deb66-623c-4c45-9ba3-d99bbe9418bd,"00:00:14,354","00:00:30,025",15.671,38,"La cire d’abeille, prisée pour sa qualité exce...",38,La cire d'abeille est une cire naturelle offra...,38
3,342b2806-3672-4035-8971-548b78a169f0,"00:00:30,823","00:00:35,186",4.363,11,Considérée comme 100 pourcent naturelle elle ...,11,"Considérée comme cire naturelle à 100%, elle e...",11
4,f1bdfd4a-7283-49b6-95ba-b7de7bf4b49c,"00:00:35,645","00:00:54,694",19.049,47,"La cire de soja, en plus de brûler lentement, ...",47,"La cire de soja, cire naturelle issue de resso...",47
5,40fe5576-2765-4e75-b2d4-e9e779ec338d,"00:00:55,526","00:01:13,981",18.455,45,"La cire de soja, provenant de ressources renou...",45,"La cire de paraffine, issue de la pétrochimie,...",45
6,2caf09cf-05c5-44a8-87d7-c53cbc52abff,"00:01:15,560","00:02:01,074",45.514,112,La cire de paraffine bien que très répandue da...,112,"La paraffine, très répandue, offre une mise en...",112
7,5063a8c7-38bf-488b-b18f-eb94001b248c,"00:02:01,400","00:02:11,290",9.89,24,Outre ses propriétés esthétiques l’acide stéar...,24,"L'acide stéarique améliore la texture, l'opaci...",24
8,54091848-748e-4a99-aea0-1e2eb40988d3,"00:02:11,290","00:02:21,110",9.82,24,L’acide stéarique est également un agent blanc...,24,L'acide stéarique agit aussi comme agent blanc...,24
9,adf33389-a2c8-417b-860d-760bb039ff30,"00:02:21,689","00:02:28,724",7.035,17,"Son intérêt ne s’arrête pas là, car il renforc...",17,Son intérêt dépasse l'esthétique car il renfor...,17


Export -> v4_pack_avant_apres.csv


In [427]:

# (9) Audit vitesse parlée (WPM) par segment + statut
def rate_status(rate_wpm: float, target_wpm: float, tol_pct: float) -> str:
    if target_wpm <= 0: return "NA"
    delta_pct = 100.0 * (rate_wpm - target_wpm) / target_wpm
    if abs(delta_pct) <= tol_pct:
        return "OK"
    return "FAST" if delta_pct > 0 else "SLOW"

rate_rows = []
for i in indices_to_process:
    d = float(df.at[i,"duration_sec"] or 0.0)
    w = int(df.at[i,"new_length"] or 0)
    tgt = int(df.at[i,"estimation_length"] or 0)
    rate = spoken_rate_wpm(w, d) if d > 0 else 0.0
    status = rate_status(rate, WPM, RATE_TOL_PCT)
    rate_rows.append({
        "segment_id": df.at[i,"segment_id"],
        "duration_sec": d,
        "target_words": tgt,
        "new_length": w,
        "rate_wpm": round(rate, 1),
        "target_wpm": WPM,
        "rate_deviation_pct": round(100.0*(rate-WPM)/WPM, 1) if WPM > 0 else 0.0,
        "rate_status": status
    })

rate_df = pd.DataFrame(rate_rows)
display(rate_df.head(20))

rate_df.to_csv("v4_audit_vitesse.csv", index=False, encoding="utf-8-sig")
print("Export -> v4_audit_vitesse.csv")


Unnamed: 0,segment_id,duration_sec,target_words,new_length,rate_wpm,target_wpm,rate_deviation_pct,rate_status
0,8d7a71c7-84e7-4be4-addf-8d69c3d7c0b3,9.68,24,24,148.8,147.0,1.2,OK
1,fc98262d-4ef3-429a-9d85-3057c5d999c9,3.495,9,9,154.5,147.0,5.1,FAST
2,aa4deb66-623c-4c45-9ba3-d99bbe9418bd,15.671,38,38,145.5,147.0,-1.0,OK
3,342b2806-3672-4035-8971-548b78a169f0,4.363,11,11,151.3,147.0,2.9,FAST
4,f1bdfd4a-7283-49b6-95ba-b7de7bf4b49c,19.049,47,47,148.0,147.0,0.7,OK
5,40fe5576-2765-4e75-b2d4-e9e779ec338d,18.455,45,45,146.3,147.0,-0.5,OK
6,2caf09cf-05c5-44a8-87d7-c53cbc52abff,45.514,112,112,147.6,147.0,0.4,OK
7,5063a8c7-38bf-488b-b18f-eb94001b248c,9.89,24,24,145.6,147.0,-1.0,OK
8,54091848-748e-4a99-aea0-1e2eb40988d3,9.82,24,24,146.6,147.0,-0.2,OK
9,adf33389-a2c8-417b-860d-760bb039ff30,7.035,17,17,145.0,147.0,-1.4,OK


Export -> v4_audit_vitesse.csv


In [428]:

# (10) Préparation payload & patch Rask (dry‑run par défaut)
def _row_to_segment_patch(row: pd.Series, dst_lang: str, include_timing: bool = False) -> dict:
    seg = {"id": str(row["segment_id"]), "dst": {"text": str(row["new_text"]), "lang": dst_lang}}
    if include_timing:
        if pd.notna(row.get("start")): seg["start"] = str(row["start"])
        if pd.notna(row.get("end")):   seg["end"]   = str(row["end"])
    return seg

def build_segments_payload_for_patch(df: pd.DataFrame,
                                     dst_lang: str,
                                     indices: list[int],
                                     include_timing: bool = False) -> list[dict]:
    subset = df.loc[indices]
    subset = subset[subset["new_text"].notna() & (subset["new_text"].astype(str).str.strip().str.len() > 0)]
    return [_row_to_segment_patch(row, dst_lang, include_timing=include_timing) for _, row in subset.iterrows()]

def patch_segments_text(headers: dict,
                        project_id: str,
                        segments: list[dict],
                        batch_size: int = 100,
                        dry_run: bool = True,
                        sleep_between: float = 0.4) -> None:
    url = PATCH_SEGMENTS_URL.format(project_id=project_id)
    total = len(segments)
    batches = math.ceil(total / batch_size)
    for b in range(batches):
        chunk = segments[b*batch_size:(b+1)*batch_size]
        if dry_run:
            print(f"[DRY-RUN] PATCH {b+1}/{batches} -> {url} ({len(chunk)} segments)")
            preview = [{"id": s["id"],
                        "dst_len": len(str(s.get("dst",{}).get("text","")).split()),
                        "dst_lang": s["dst"].get("lang"),
                        "start": s.get("start"), "end": s.get("end")} for s in chunk]
            display(pd.DataFrame(preview))
            continue
        resp = requests.patch(url, headers={**headers,"Content-Type":"application/json"},
                              json={"segments": chunk}, timeout=60)
        if resp.status_code >= 400:
            try: print("Erreur serveur:", json.dumps(resp.json(), ensure_ascii=False, indent=2))
            except Exception: print("Erreur brute:", resp.text)
            resp.raise_for_status()
        print(f"OK PATCH {b+1}/{batches}: {len(chunk)} segments")
        time.sleep(sleep_between)

def generate_project(headers: dict, project_id: str) -> dict:
    r = requests.post(GENERATE_URL.format(project_id=project_id), headers=headers, timeout=60)
    if r.status_code >= 400:
        try: print("Erreur serveur generate:", json.dumps(r.json(), ensure_ascii=False, indent=2))
        except Exception: print("Erreur brute generate:", r.text)
        r.raise_for_status()
    print("Génération lancée.")
    return r.json()

payload = build_segments_payload_for_patch(df, dst_lang=dst_lang, indices=indices_to_process, include_timing=INCLUDE_TIMING)
print("Segments à patcher:", len(payload))

# Dry‑run pour inspection (mettre DRY_RUN=False pour envoyer)
patch_segments_text(HEADERS, project_id, payload, dry_run=False)


Segments à patcher: 10
OK PATCH 1/1: 10 segments


In [429]:

# (11) Exports finaux
path_dir_output = os.path.join(os.getcwd(), PROJECT_SELECTOR)
os.makedirs(path_dir_output, exist_ok=True)
out_csv = os.path.join(path_dir_output, "v4_transcription_postgen.csv")
df.to_csv(out_csv, index=False, encoding="utf-8-sig")
print("CSV écrit ->", out_csv)

out_payload = os.path.join(path_dir_output, "v4_segments_payload.json")
try:
    with open(out_payload, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    print("Payload JSON écrit ->", out_payload)
except Exception as e:
    print("Aucun payload à écrire ou erreur:", e)


CSV écrit -> /mnt/c/Users/Utilisateur/PycharmProjects/lumierelearning/notebook/2_-_Presentation_des_cires_KtwWjO/v4_transcription_postgen.csv
Payload JSON écrit -> /mnt/c/Users/Utilisateur/PycharmProjects/lumierelearning/notebook/2_-_Presentation_des_cires_KtwWjO/v4_segments_payload.json
