# Pipeline Pack Ultra‑Cohérente — Rask × LLM (v5)

Objectif: produire des **segments cohérents** en **une seule requête** LLM avec:

- Respect **durée → nombre de mots** via `WPM` (cible par segment)
- **Fins propres**, **débuts nets**, **pas de phrase coupée**
- **Zéro pad foireux** du type `Cet.` ou `Cela.`
- **Régénération ciblée** + **normalisation EXACTE** si nécessaire
- **Audit vitesse** (WPM réel vs cible) et **audit frontières**

Le notebook inclut aussi un correcteur pour **sanitiser un payload JSON existant**, afin de réparer les phrases inachevées et enlever les pads inutiles.

## (0) Dépendances

Si nécessaire, installez:

```bash
%pip install --quiet requests python-dotenv pandas tqdm openai
```

Le notebook n'utilise **pas** d'`ipywidgets` et reste compatible noyaux simples.

In [27]:
# (1) Configuration & variables d'environnement
import os
from dotenv import load_dotenv
load_dotenv()

# Rask
RASK_TOKEN = os.getenv("RASK_TOKEN", "")
RASK_CLIENT_ID = os.getenv("RASK_CLIENT_ID", "")
RASK_CLIENT_SECRET = os.getenv("RASK_CLIENT_SECRET", "")

# OpenAI
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
OPENAI_TEMPERATURE = float(os.getenv("OPENAI_TEMPERATURE", "0.2"))
OPENAI_MAX_RETRIES = int(os.getenv("OPENAI_MAX_RETRIES", "2"))

# Paramètres projet
LANG_DEST = os.getenv("LANG_DEST", "fr-fr")
WPM = float(os.getenv("WPM", "147.0"))                   # mots/minute cible
LENGTH_TOLERANCE = int(os.getenv("LENGTH_TOLERANCE", "2"))# ± mots pour première passe
MIN_SENT_WORDS = int(os.getenv("MIN_SENT_WORDS", "7"))    # bornes indicatives par phrase
MAX_SENT_WORDS = int(os.getenv("MAX_SENT_WORDS", "22"))
RATE_TOL_PCT = float(os.getenv("RATE_TOL_PCT", "2.0"))    # tolérance vitesse ±% autour du WPM
STRICT_MAX_RETRIES = int(os.getenv("STRICT_MAX_RETRIES", "7"))

# Sélection du projet (nom ou URL app.rask.ai)
PROJECT_SELECTOR = "4_-_Presentation_des_meches_KtwWjO"

# Scope à traiter (None = tous)
SEGMENTS_RANGE = None

# Patch
DRY_RUN = False         # True: prévisualiser; False: envoyer
INCLUDE_TIMING = False # True pour pousser start/end

print("Config OK —", {
    "LANG_DEST": LANG_DEST, "WPM": WPM, "TOL_WORDS": LENGTH_TOLERANCE,
    "SENT_RANGE": f"{MIN_SENT_WORDS}-{MAX_SENT_WORDS}",
    "RATE_TOL_PCT": RATE_TOL_PCT,
    "STRICT_MAX_RETRIES": STRICT_MAX_RETRIES,
    "PROJECT_SELECTOR": PROJECT_SELECTOR,
    "DRY_RUN": DRY_RUN, "INCLUDE_TIMING": INCLUDE_TIMING
})

Config OK — {'LANG_DEST': 'fr-fr', 'WPM': 147.0, 'TOL_WORDS': 2, 'SENT_RANGE': '7-22', 'RATE_TOL_PCT': 2.0, 'STRICT_MAX_RETRIES': 7, 'PROJECT_SELECTOR': '4_-_Presentation_des_meches_KtwWjO', 'DRY_RUN': False, 'INCLUDE_TIMING': False}


In [28]:
# (2) Auth & helpers Rask
import re, json, time, math
import pandas as pd
import requests
from IPython.display import display

TOKEN_URL = "https://rask-prod.auth.us-east-2.amazoncognito.com/oauth2/token"
SCOPES = "api/source api/input api/output api/limit"

PROJECTS_URL = "https://api.rask.ai/v2/projects"
GET_PROJECT_URL = "https://api.rask.ai/v2/projects/{project_id}"
TRANSCRIPTION_URL = "https://api.rask.ai/v2/projects/{project_id}/transcription"
PATCH_SEGMENTS_URL = "https://api.rask.ai/v2/projects/{project_id}/transcription/segments"
GENERATE_URL = "https://api.rask.ai/v2/projects/{project_id}/generate"

def get_token(client_id: str, client_secret: str) -> str:
    if not client_id or not client_secret:
        raise RuntimeError("RASK_CLIENT_ID / RASK_CLIENT_SECRET manquants (ou fournir RASK_TOKEN).")
    r = requests.post(TOKEN_URL, data={"grant_type":"client_credentials","scope":SCOPES},
                      auth=(client_id, client_secret), timeout=30)
    r.raise_for_status()
    return r.json()["access_token"]

def build_headers() -> dict:
    token = RASK_TOKEN or get_token(RASK_CLIENT_ID, RASK_CLIENT_SECRET)
    return {"Authorization": f"Bearer {token}"}

HEADERS = build_headers()
print("Auth Rask OK.")

def _normalize_name(s: str) -> str:
    return re.sub(r"[-\s_]+", "", (s or "").strip().lower())

def project_id_from_app_url(url: str) -> str | None:
    m = re.search(r"/project/([0-9a-fA-F-]{36})", str(url))
    return m.group(1) if m else None

def find_project_id_by_name(headers: dict, name: str, limit: int = 100) -> str | None:
    target = _normalize_name(name); offset = 0
    while True:
        r = requests.get(PROJECTS_URL, headers=headers, params={"offset": offset, "limit": limit}, timeout=30)
        r.raise_for_status(); payload = r.json()
        for p in payload.get("projects", []):
            if _normalize_name(p.get("name")) == target:
                return p["id"]
        offset += limit
        if offset >= payload.get("total", 0): break
    offset = 0
    while True:
        r = requests.get(PROJECTS_URL, headers=headers, params={"offset": offset, "limit": limit}, timeout=30)
        r.raise_for_status(); payload = r.json()
        for p in payload.get("projects", []):
            if target in _normalize_name(p.get("name")):
                return p["id"]
        offset += limit
        if offset >= payload.get("total", 0): break
    return None

def get_project(headers: dict, project_id: str) -> dict:
    r = requests.get(GET_PROJECT_URL.format(project_id=project_id), headers=headers, timeout=30)
    r.raise_for_status(); return r.json()

def guess_dst_lang(headers: dict, project_id: str, default="fr-fr") -> str:
    info = get_project(headers, project_id)
    return (info.get("dst_lang") or info.get("dstLanguage") or default)

def select_project_id(selector: str) -> str:
    return project_id_from_app_url(selector) or find_project_id_by_name(HEADERS, selector) or \
           (_ for _ in ()).throw(RuntimeError(f"Aucun projet trouvé pour: {selector}"))

Auth Rask OK.


In [29]:
# (3) Ingestion + cibles (durée → mots via WPM)
def _parse_timecode(tc) -> float:
    if isinstance(tc, (int, float)): return float(tc)
    s = str(tc).strip().replace(",", ".")
    m = re.match(r"^(?P<h>\d{1,2}):(?P<m>\d{2}):(?P<s>\d{2})(?:\.(?P<ms>\d{1,3}))?$", s)
    if m:
        h = int(m.group("h")); mn = int(m.group("m")); sec = int(m.group("s")); ms = int(m.group("ms") or 0)
        return h*3600 + mn*60 + sec + ms/1000.0
    return float(s)

WORD_RE = re.compile(r"\b[\w’'-]+\b", flags=re.UNICODE)
def words_count(text: str) -> int:
    return len(WORD_RE.findall(text or ""))

def words_target_for_duration(duration_sec: float, wpm: float) -> int:
    return max(0, int(round(wpm * (duration_sec / 60.0))))

def spoken_rate_wpm(words: int, duration_sec: float) -> float:
    return (words / max(0.5, float(duration_sec))) * 60.0

def get_transcription_df(headers: dict, project_id: str, wpm: float) -> pd.DataFrame:
    r = requests.get(TRANSCRIPTION_URL.format(project_id=project_id), headers=headers, timeout=60)
    r.raise_for_status(); data = r.json()
    rows = []
    for seg in data.get("segments", []):
        start = seg.get("start"); end = seg.get("end")
        dur = max(0.0, _parse_timecode(end) - _parse_timecode(start))
        src = (seg.get("src") or {}).get("text", "")
        dst = (seg.get("dst") or {}).get("text", "")
        rows.append({
            "segment_id": seg.get("id"),
            "speaker": seg.get("speaker"),
            "start": start,
            "end": end,
            "duration_sec": round(dur, 3),
            "src_text": src,
            "dst_text": dst,
            "dst_length": words_count(dst),
            "estimation_length": words_target_for_duration(dur, wpm),
        })
    return pd.DataFrame(rows)

project_id = select_project_id(PROJECT_SELECTOR)
dst_lang = guess_dst_lang(HEADERS, project_id, default=LANG_DEST)
df = get_transcription_df(HEADERS, project_id, WPM)
indices_to_process = list(range(len(df))) if SEGMENTS_RANGE is None else list(SEGMENTS_RANGE)

print("Project ID:", project_id, "| dst_lang:", dst_lang, "| segments:", len(df))
display(df.head(8)[["segment_id","start","end","duration_sec","dst_length","estimation_length","dst_text"]])

Project ID: bc8e6b7f-2ec4-4517-842d-fabf72ab9991 | dst_lang: fr-fr | segments: 7


Unnamed: 0,segment_id,start,end,duration_sec,dst_length,estimation_length,dst_text
0,81f4de3f-c894-4472-a934-5a51d11a9f8d,"00:00:00,857","00:00:07,597",6.74,19,17,Les mèches jouent un rôle crucial dans la comb...
1,6dd4b4fb-f4df-4092-be49-a874dfd76e1f,"00:00:08,259","00:00:33,488",25.229,62,62,La mèche en coton est un choix privilégié pour...
2,b1b46f6e-1e68-4c7a-a6e1-5eeaf9bc491d,"00:00:34,847","00:01:09,457",34.61,58,85,La mèche en bois fabriquée à partir de fines l...
3,dda51233-7321-4c5b-8a69-a10c187b2052,"00:01:10,212","00:01:59,482",49.27,80,121,Pour installer une mèche en coton commencez pa...
4,968ab399-20af-4cec-bf27-4b304be1bf53,"00:02:02,640","00:02:47,936",45.296,111,111,"Pour la mèche en bois, il est essentiel de bie..."
5,68c0ca8b-fe05-4d5b-b365-7ed7c2ab86a8,"00:02:49,074","00:03:34,779",45.705,110,112,"Ensuite, les mèches en coton sont souvent choi..."
6,803bb373-a69e-40e5-9c1c-78a833dbd000,"00:03:35,305","00:03:38,640",3.335,11,8,Choisir la bonne mèche est essentiel pour obte...


In [30]:
# (4) Pack global — contraintes exactes + topic hints
LEXICON = [
    "balance", "fondoir", "bain-marie", "cuillère en bois", "parfums",
    "colorants", "mèches", "moules", "pastille métallique", "support de mèche",
    "pipettes", "cire naturelle"
]

TOPIC_HINTS = {
    r"abeille|abeilles": "cire d’abeille",
    r"soja": "cire de soja",
    r"paraffin": "cire de paraffine",
    r"stéarique|stearique": "acide stéarique",
    r"mèche|meche": "mèches",
    r"moule|conten": "moules et contenants",
}

def infer_topic(text: str) -> str:
    s = (text or "").lower()
    for pat, label in TOPIC_HINTS.items():
        if re.search(pat, s):
            return label
    return "présentation du matériel"

def build_segments_pack(df: pd.DataFrame, indices: list[int], tol: int = LENGTH_TOLERANCE) -> dict:
    dfx = df.reset_index(drop=True)
    pack = []
    for i in indices:
        base = str(dfx.at[i, "dst_text"] or dfx.at[i, "src_text"] or "").strip()
        prev_text = str(dfx.at[i-1, "dst_text"]).strip() if i-1 >= 0 else ""
        next_text = str(dfx.at[i+1, "dst_text"]).strip() if i+1 < len(dfx) else ""
        tgt = int(dfx.at[i, "estimation_length"])
        mn = max(1, tgt - tol); mx = tgt + tol
        pack.append({
            "index": i,
            "id": str(dfx.at[i, "segment_id"]),
            "exact_words": tgt,
            "min_words": mn,
            "max_words": mx,
            "duration_sec": float(dfx.at[i, "duration_sec"]),
            "base_text": base,
            "prev_text": prev_text,
            "next_text": next_text,
            "topic": infer_topic(base)
        })
    context = {
        "language": LANG_DEST,
        "lexicon_preferred": LEXICON,
        "style": "présentation de matériel, pédagogique, clair, précis, ton professionnel sobre",
        "audience": "débutant à intermédiaire"
    }
    return {"context": context, "segments": pack}

segments_pack = build_segments_pack(df, indices_to_process, tol=LENGTH_TOLERANCE)
print("Pack prêt:", len(segments_pack["segments"]), "segments")

Pack prêt: 7 segments


In [31]:
# (5) Client OpenAI + prompts pack/strict + parse JSON
_openai_mode = None
try:
    from openai import OpenAI
    _client = OpenAI(api_key=OPENAI_API_KEY)
    _openai_mode = "new"
except Exception:
    try:
        import openai
        openai.api_key = OPENAI_API_KEY
        _openai_mode = "legacy"
    except Exception:
        _openai_mode = None

if not OPENAI_API_KEY or not _openai_mode:
    print("AVERTISSEMENT: OpenAI indisponible. Fallback sur textes existants.")

GLOBAL_CHARTER = (
    "Génère tous les segments en une seule passe pour un style homogène. "
    "Chaque segment est un paragraphe complet, indépendant, ponctué, ne se poursuit jamais dans le suivant. "
    "Style pédagogique et précis, sans pas-à-pas; lexique constant: " + ", ".join(LEXICON) + "."
)

BANNED_TAILS = {"cet","cette","ceci","cela","ça","ce","les","et","mais","ou","donc","ni","car","ainsi"}

def build_pack_prompt(lang_dest: str, pack: dict, smin: int, smax: int) -> list[dict]:
    system = (
        f"Tu es un rédacteur pédagogique expert. Tu écris en {lang_dest}. "
        f"{GLOBAL_CHARTER} Respecte des phrases courtes à moyennes. "
        "Interdiction absolue de finir un segment par un démonstratif isolé (\"Cet\", \"Cela\", \"Ce\", etc.) ou par un connecteur. "
        "RENVOIE UNIQUEMENT un JSON **valide** directement patchable par l'API Rask."
    )
    user_payload = {
        "contraintes_globales": {
            "phrases": f"{smin}-{smax} mots/phrase environ",
            "pas_d_actions": True,
            "pas_de_listes": True,
            "segment_independant": True,
            "transition_douce": True,
            "interdit_finir_par": sorted(list(BANNED_TAILS))
        },
        "règle_longueur": (
            "Pour chaque segment, produire **exactement** <exact_words> mots lorsque possible; "
            "tolérance de ±2 si la fluidité l'exige. Finir par une ponctuation terminale."
        ),
        "segments": [
            {
                "id": it["id"],
                "exact_words": it["exact_words"],
                "min_words": it["min_words"],
                "max_words": it["max_words"],
                "duration_sec": it["duration_sec"],
                "base_text": it["base_text"],
                "prev_text": it["prev_text"],
                "next_text": it["next_text"],
                "topic": it["topic"]
            } for it in pack["segments"]
        ],
        "sortie_json_patch_rask": f'[["id":"<segment_id>", "dst":{{"text":"<texte exact>", "lang":"{LANG_DEST}"}}]]'
    }
    return [{"role":"system","content":system},
            {"role":"user","content":json.dumps(user_payload, ensure_ascii=False)}]

def build_strict_segment_prompt(lang_dest: str, base_text: str, prev_text: str, next_text: str,
                                exact_words: int, smin: int, smax: int) -> list[dict]:
    system = (
        f"Tu écris en {lang_dest} un segment unique, très fluide et pédagogique. "
        f"Un seul paragraphe. **EXACTEMENT {exact_words} mots** (compte standard). "
        "Ponctuation finale obligatoire. Pas de listes ni d'actions procédurales. "
        "Interdiction de finir par un démonstratif isolé ou un connecteur."
    )
    user = {
        "exact_words": exact_words,
        "phrases": f"{smin}-{smax} mots/phrase environ",
        "contexte": {"prev_text": prev_text, "next_text": next_text},
        "base_text": base_text,
        "sortie": {"format": "texte seul, sans guillemets, sans balises"}
    }
    return [{"role":"system","content":system},
            {"role":"user","content":json.dumps(user, ensure_ascii=False)}]

def openai_chat(messages: list[dict], model: str, temperature: float) -> str:
    last_err = None
    for _ in range(1 + OPENAI_MAX_RETRIES):
        try:
            if _openai_mode == "new":
                resp = _client.chat.completions.create(model=model, temperature=temperature, messages=messages)
                return resp.choices[0].message.content.strip()
            else:
                resp = openai.ChatCompletion.create(model=model, temperature=temperature, messages=messages)
                return resp["choices"][0]["message"]["content"].strip()
        except Exception as e:
            last_err = e; time.sleep(0.8)
    raise RuntimeError(f"Echec OpenAI après retries: {last_err}")

def parse_patch_json(raw: str) -> list[dict]:
    try:
        data = json.loads(raw)
        if isinstance(data, dict) and "segments" in data: return data["segments"]
        if isinstance(data, list): return data
    except json.JSONDecodeError:
        m = re.search(r"```json\s*(\[.*?\])\s*```", raw, flags=re.DOTALL)
        if m: return json.loads(m.group(1))
        m2 = re.search(r"(\[\s*\{.*\}\s*\])", raw, flags=re.DOTALL)
        if m2: return json.loads(m2.group(1))
    raise RuntimeError("Réponse LLM non JSON patchable Rask.")

In [32]:
# (6) Post-traitements robustes — anti 'Cet.' + fins/débuts propres
TERM_PUNCT_RE = re.compile(r"[.!?…]$")
WEAK_START_RE = re.compile(r"^(?:et|mais|ou|donc|or|ni|car|cela|les|ce|cet|cette|ceci|ça)\b", re.IGNORECASE)
ORPHAN_END_RE = re.compile(r"\b(?:et|mais|ou|donc|or|ni|car|cela|les|ce|cet|cette|ceci|ça)\s*$", re.IGNORECASE)

def fix_common_content_glitches(text: str) -> str:
    s = (text or "").strip()
    s = re.sub(r"\b100\s*naturelle\b", "100 % naturelle", s)
    s = re.sub(r"\s{2,}", " ", s)
    return s

def ensure_terminal_punct(text: str) -> str:
    s = (text or "").strip()
    s = ORPHAN_END_RE.sub("", s).strip()
    if not TERM_PUNCT_RE.search(s):
        s += "."
    return s

def sanitize_end_tail(text: str) -> str:
    s = (text or "").strip()
    # Supprime un éventuel dernier mot interdit
    toks = WORD_RE.findall(s)
    if not toks: return s
    last = toks[-1].lower()
    if last in BANNED_TAILS:
        # retire le dernier mot et toute fin non ponctuée
        s = re.sub(r"\b" + re.escape(toks[-1]) + r"[\W]*$", "", s).strip()
    s = ensure_terminal_punct(s)
    return s

def sanitize_start(text: str) -> str:
    s = (text or "").strip()
    if WEAK_START_RE.match(s):
        s = WEAK_START_RE.sub("Ensuite,", s, count=1)
        s = re.sub(r"\s{2,}", " ", s).strip()
        if s: s = s[0].upper() + s[1:]
    return s

# Banque de micro-phrases complètes (pas de 'Cet.') pour extension douce
FILLER_BANK = [
    "Ce point servira de repère.",                   # 5 mots
    "Ce repère facilite vos choix.",                # 5
    "Ces bases guideront la suite.",               # 5
    "La suite détaillera ces critères.",           # 5
    "Ce résumé fixe l'essentiel.",                 # 5 ("l'essentiel" = 1 mot)
    "Vous disposez maintenant d'un cadre clair.",  # 7
    "Ces repères soutiennent votre progression.",  # 5
    "Retenez surtout la logique de sélection.",    # 6
    "Ceci clarifie l'objectif pédagogique.",       # 5
    "Le prochain point prolonge cette idée.",      # 6
]

def choose_fillers_to_reach(diff: int) -> list[str]:
    # Petite DP gloutonne + amélioration simple pour couvrir diff
    # Pré-calcul des longueurs
    lengths = [words_count(s) for s in FILLER_BANK]
    items = list(zip(FILLER_BANK, lengths))
    # Glouton décroissant
    items.sort(key=lambda x: x[1], reverse=True)
    out = []
    remain = diff
    for sent, ln in items:
        while remain - ln >= 0 and remain > 0:
            out.append(sent); remain -= ln
            if remain == 0: return out
    # Si on n'est pas à zéro, on tente une seconde passe fine
    if remain > 0:
        # essayer toutes petites combinaisons jusqu'à 2 phrases
        for i, (s1,l1) in enumerate(items):
            if l1 == remain: return [s1]
            for s2,l2 in items[i:]:
                if l1 + l2 == remain: return [s1,s2]
    return out  # peut être vide; on gèrera plus tard

def trim_to_exact_words(text: str, target: int) -> str:
    # Coupe avec respect phrase → mots; évite de finir sur un démonstratif
    s = fix_common_content_glitches(text)
    toks = WORD_RE.findall(s)
    if len(toks) <= target:
        return s
    # Tente de couper au dernier séparateur de phrase
    parts = re.split(r"(?<=[.!?…])\s+", s)
    acc = []
    for p in parts:
        if words_count(" ".join(acc + [p])) <= target:
            acc.append(p)
        else:
            break
    s2 = " ".join(acc).strip()
    if words_count(s2) > target:
        # fallback: couper au mot
        sub = toks[:target]
        s2 = " ".join(sub)
    s2 = sanitize_end_tail(s2)
    return s2

def expand_to_exact_words(text: str, target: int) -> str:
    s = fix_common_content_glitches(text)
    cur = words_count(s)
    if cur >= target:
        return s
    need = target - cur
    fillers = choose_fillers_to_reach(need)
    if fillers:
        s2 = (s + " " + " ".join(fillers)).strip()
        if words_count(s2) == target:
            return ensure_terminal_punct(sanitize_end_tail(s2))
    # Dernier recours: répéter une phrase neutre pour approcher, puis rogner proprement
    pad = " Ce résumé fixe l'essentiel."
    while words_count(s) + words_count(pad) <= target:
        s += pad
    if words_count(s) < target:
        s += " Cette précision complète l'ensemble."  # 5 mots environ
    s = trim_to_exact_words(s, target)
    return ensure_terminal_punct(sanitize_end_tail(s))

def postprocess_segment(text: str, target_words: int, fix_start: bool = True) -> str:
    s = (text or "").strip()
    s = fix_common_content_glitches(s)
    # Ajuster à EXACT
    wc = words_count(s)
    if wc > target_words:
        s = trim_to_exact_words(s, target_words)
    elif wc < target_words:
        s = expand_to_exact_words(s, target_words)
    # Fins et débuts propres
    s = sanitize_end_tail(ensure_terminal_punct(s))
    if fix_start:
        s = sanitize_start(s)
    # Double-check exact
    if words_count(s) != target_words:
        # ajustement final minimal: rogner/étendre proprement
        if words_count(s) > target_words:
            s = trim_to_exact_words(s, target_words)
        else:
            s = expand_to_exact_words(s, target_words)
    return s

In [33]:
# (7) Génération pack + régénération + normalisation EXACT (LLM d'abord, algo sinon)
from tqdm.auto import tqdm

def generate_pack(df: pd.DataFrame,
                  lang_dest: str,
                  indices: list[int],
                  tol_words: int,
                  smin: int, smax: int,
                  model: str = OPENAI_MODEL,
                  temperature: float = OPENAI_TEMPERATURE) -> pd.DataFrame:
    dfx = df.reset_index(drop=True).copy()
    pack = build_segments_pack(dfx, indices, tol=tol_words)

    # Pass 1 — pack
    if not OPENAI_API_KEY or _openai_mode is None:
        print("OpenAI indisponible: fallback base_text.")
        results = [{"id": it["id"], "dst": {"text": it["base_text"], "lang": lang_dest}} for it in pack["segments"]]
    else:
        msgs = build_pack_prompt(lang_dest, pack, smin=smin, smax=smax)
        raw = openai_chat(msgs, model=model, temperature=temperature)
        results = parse_patch_json(raw)

    # Injection brute
    if "new_text" not in dfx.columns: dfx["new_text"] = pd.NA
    if "new_length" not in dfx.columns: dfx["new_length"] = pd.NA
    sid_to_idx = {str(dfx.at[i,"segment_id"]): i for i in range(len(dfx))}
    for item in results:
        sid = item.get("id") or item.get("segment_id")
        txt = (item.get("dst",{}).get("text") or item.get("text") or "").strip()
        if sid in sid_to_idx:
            j = sid_to_idx[sid]
            dfx.at[j,"new_text"] = txt
            dfx.at[j,"new_length"] = words_count(txt)

    # Normalisation EXACT via LLM strict, sinon fallback algo
    for i in indices:
        target = int(dfx.at[i,"estimation_length"])
        base = str(dfx.at[i,"new_text"] or dfx.at[i,"dst_text"] or dfx.at[i,"src_text"] or "").strip()
        prev_text = str(dfx.at[i-1,"new_text"] or dfx.at[i-1,"dst_text"] or "").strip() if i-1 >= 0 else ""
        next_text = str(dfx.at[i+1,"new_text"] or dfx.at[i+1,"dst_text"] or "").strip() if i+1 < len(dfx) else ""
        # LLM strict
        ok = False
        if OPENAI_API_KEY and _openai_mode:
            for _ in range(STRICT_MAX_RETRIES):
                msgs = build_strict_segment_prompt(LANG_DEST, base, prev_text, next_text, target, smin, smax)
                cand = openai_chat(msgs, model=model, temperature=temperature).strip()
                cand = sanitize_start(sanitize_end_tail(ensure_terminal_punct(cand)))
                if words_count(cand) == target and not ORPHAN_END_RE.search(cand):
                    dfx.at[i,"new_text"] = cand
                    dfx.at[i,"new_length"] = target
                    ok = True
                    break
        if not ok:
            # Fallback algo exact
            repaired = postprocess_segment(base, target_words=target, fix_start=True)
            dfx.at[i,"new_text"] = repaired
            dfx.at[i,"new_length"] = words_count(repaired)

    return dfx

df = generate_pack(df, LANG_DEST, indices_to_process, tol_words=LENGTH_TOLERANCE,
                   smin=MIN_SENT_WORDS, smax=MAX_SENT_WORDS)
print("Génération + normalisation terminées.")

Génération + normalisation terminées.


In [34]:
# (8) Audit frontières + auto-fix final
def sentence_tokens(text: str) -> list[str]:
    s = (text or "").strip()
    if not s: return []
    parts = re.split(r"(?<=[.!?…])\s+", s)
    return [p.strip() for p in parts if p.strip()]

def audit_boundaries(df: pd.DataFrame, indices: list[int], smin: int, smax: int) -> pd.DataFrame:
    rows = []
    for i in indices:
        txt = (df.at[i,"new_text"] or "").strip()
        sins = sentence_tokens(txt)
        issues = []
        if not TERM_PUNCT_RE.search(txt): issues.append("no_terminal_punct")
        if ORPHAN_END_RE.search(txt): issues.append("orphan_connector_end")
        if i+1 < len(df):
            nxt = (df.at[i+1,"new_text"] or "").strip()
            if WEAK_START_RE.search(nxt): issues.append("weak_start_next")
        if len(sins) == 0: issues.append("empty_segment")
        if len(sins) > 4: issues.append("too_many_sentences")
        for s in sins:
            wc = words_count(s)
            if wc < smin: issues.append("sentence_too_short")
            if wc > smax: issues.append("sentence_too_long")
        if issues:
            rows.append({"index": i, "segment_id": df.at[i,"segment_id"], "issues": ", ".join(sorted(set(issues)))})
    return pd.DataFrame(rows)

def auto_fix_boundaries(df: pd.DataFrame, indices: list[int]) -> pd.DataFrame:
    dfx = df.copy()
    for i in indices:
        s = (dfx.at[i,"new_text"] or "").strip()
        s = sanitize_end_tail(ensure_terminal_punct(s))
        dfx.at[i,"new_text"] = s
        dfx.at[i,"new_length"] = words_count(s)
        if i+1 < len(dfx):
            nxt = (dfx.at[i+1,"new_text"] or "").strip()
            nxt = sanitize_start(nxt)
            dfx.at[i+1,"new_text"] = nxt
            dfx.at[i+1,"new_length"] = words_count(nxt)
    return dfx

issues_df = audit_boundaries(df, indices_to_process, smin=MIN_SENT_WORDS, smax=MAX_SENT_WORDS)
print("Issues frontières détectées:", len(issues_df))
display(issues_df.head(20))

df = auto_fix_boundaries(df, indices_to_process)

issues_after = audit_boundaries(df, indices_to_process, smin=MIN_SENT_WORDS, smax=MAX_SENT_WORDS)
print("Issues restantes après fix:", len(issues_after))
display(issues_after.head(20))

Issues frontières détectées: 5


Unnamed: 0,index,segment_id,issues
0,1,6dd4b4fb-f4df-4092-be49-a874dfd76e1f,sentence_too_long
1,2,b1b46f6e-1e68-4c7a-a6e1-5eeaf9bc491d,"sentence_too_long, sentence_too_short, too_man..."
2,3,dda51233-7321-4c5b-8a69-a10c187b2052,"sentence_too_long, sentence_too_short, too_man..."
3,4,968ab399-20af-4cec-bf27-4b304be1bf53,"sentence_too_long, too_many_sentences"
4,5,68c0ca8b-fe05-4d5b-b365-7ed7c2ab86a8,sentence_too_long


Issues restantes après fix: 5


Unnamed: 0,index,segment_id,issues
0,1,6dd4b4fb-f4df-4092-be49-a874dfd76e1f,sentence_too_long
1,2,b1b46f6e-1e68-4c7a-a6e1-5eeaf9bc491d,"sentence_too_long, sentence_too_short, too_man..."
2,3,dda51233-7321-4c5b-8a69-a10c187b2052,"sentence_too_long, sentence_too_short, too_man..."
3,4,968ab399-20af-4cec-bf27-4b304be1bf53,"sentence_too_long, too_many_sentences"
4,5,68c0ca8b-fe05-4d5b-b365-7ed7c2ab86a8,sentence_too_long


In [35]:
# (9) Visualisation AVANT / APRÈS + tailles
view_cols = [
    "segment_id","start","end","duration_sec","estimation_length",
    "dst_text","dst_length","new_text","new_length"
]
table = df.loc[indices_to_process, view_cols].copy()
display(table.head(20))
table.to_csv("v5_pack_avant_apres.csv", index=False, encoding="utf-8-sig")
print("Export -> v5_pack_avant_apres.csv")

Unnamed: 0,segment_id,start,end,duration_sec,estimation_length,dst_text,dst_length,new_text,new_length
0,81f4de3f-c894-4472-a934-5a51d11a9f8d,"00:00:00,857","00:00:07,597",6.74,17,Les mèches jouent un rôle crucial dans la comb...,19,"Ensuite, mèches influencent significativement ...",17
1,6dd4b4fb-f4df-4092-be49-a874dfd76e1f,"00:00:08,259","00:00:33,488",25.229,62,La mèche en coton est un choix privilégié pour...,62,La mèche en coton est souvent choisie pour sa ...,62
2,b1b46f6e-1e68-4c7a-a6e1-5eeaf9bc491d,"00:00:34,847","00:01:09,457",34.61,85,La mèche en bois fabriquée à partir de fines l...,58,La mèche en bois fabriquée à partir de fines l...,82
3,dda51233-7321-4c5b-8a69-a10c187b2052,"00:01:10,212","00:01:59,482",49.27,121,Pour installer une mèche en coton commencez pa...,80,Pour installer une mèche en coton commencez pa...,121
4,968ab399-20af-4cec-bf27-4b304be1bf53,"00:02:02,640","00:02:47,936",45.296,111,"Pour la mèche en bois, il est essentiel de bie...",111,"Pour la mèche en bois, il est essentiel de bie...",111
5,68c0ca8b-fe05-4d5b-b365-7ed7c2ab86a8,"00:02:49,074","00:03:34,779",45.705,112,"Ensuite, les mèches en coton sont souvent choi...",110,"Ensuite, les mèches en coton sont souvent choi...",110
6,803bb373-a69e-40e5-9c1c-78a833dbd000,"00:03:35,305","00:03:38,640",3.335,8,Choisir la bonne mèche est essentiel pour obte...,11,Choisir la bonne mèche garantit une bougie réu...,8


Export -> v5_pack_avant_apres.csv


In [36]:
# (10) Audit vitesse parlée (WPM) par segment + statut
def rate_status(rate_wpm: float, target_wpm: float, tol_pct: float) -> str:
    if target_wpm <= 0: return "NA"
    delta_pct = 100.0 * (rate_wpm - target_wpm) / target_wpm
    if abs(delta_pct) <= tol_pct:
        return "OK"
    return "FAST" if delta_pct > 0 else "SLOW"

rate_rows = []
for i in indices_to_process:
    d = float(df.at[i,"duration_sec"] or 0.0)
    w = int(df.at[i,"new_length"] or 0)
    tgt = int(df.at[i,"estimation_length"] or 0)
    rate = spoken_rate_wpm(w, d) if d > 0 else 0.0
    status = rate_status(rate, WPM, RATE_TOL_PCT)
    rate_rows.append({
        "segment_id": df.at[i,"segment_id"],
        "duration_sec": d,
        "target_words": tgt,
        "new_length": w,
        "rate_wpm": round(rate, 1),
        "target_wpm": WPM,
        "rate_deviation_pct": round(100.0*(rate-WPM)/WPM, 1) if WPM > 0 else 0.0,
        "rate_status": status
    })
rate_df = pd.DataFrame(rate_rows)
display(rate_df.head(20))
rate_df.to_csv("v5_audit_vitesse.csv", index=False, encoding="utf-8-sig")
print("Export -> v5_audit_vitesse.csv")

Unnamed: 0,segment_id,duration_sec,target_words,new_length,rate_wpm,target_wpm,rate_deviation_pct,rate_status
0,81f4de3f-c894-4472-a934-5a51d11a9f8d,6.74,17,17,151.3,147.0,2.9,FAST
1,6dd4b4fb-f4df-4092-be49-a874dfd76e1f,25.229,62,62,147.4,147.0,0.3,OK
2,b1b46f6e-1e68-4c7a-a6e1-5eeaf9bc491d,34.61,85,82,142.2,147.0,-3.3,SLOW
3,dda51233-7321-4c5b-8a69-a10c187b2052,49.27,121,121,147.4,147.0,0.2,OK
4,968ab399-20af-4cec-bf27-4b304be1bf53,45.296,111,111,147.0,147.0,0.0,OK
5,68c0ca8b-fe05-4d5b-b365-7ed7c2ab86a8,45.705,112,110,144.4,147.0,-1.8,OK
6,803bb373-a69e-40e5-9c1c-78a833dbd000,3.335,8,8,143.9,147.0,-2.1,SLOW


Export -> v5_audit_vitesse.csv


In [37]:
# (11) Préparation payload & patch Rask (dry‑run par défaut)
def _row_to_segment_patch(row: pd.Series, dst_lang: str, include_timing: bool = False) -> dict:
    seg = {"id": str(row["segment_id"]), "dst": {"text": str(row["new_text"]), "lang": dst_lang}}
    if include_timing:
        if pd.notna(row.get("start")): seg["start"] = str(row["start"])
        if pd.notna(row.get("end")):   seg["end"]   = str(row["end"])
    return seg

def build_segments_payload_for_patch(df: pd.DataFrame,
                                     dst_lang: str,
                                     indices: list[int],
                                     include_timing: bool = False) -> list[dict]:
    subset = df.loc[indices]
    subset = subset[subset["new_text"].notna() & (subset["new_text"].astype(str).str.strip().str.len() > 0)]
    return [_row_to_segment_patch(row, dst_lang, include_timing=include_timing) for _, row in subset.iterrows()]

def patch_segments_text(headers: dict,
                        project_id: str,
                        segments: list[dict],
                        batch_size: int = 100,
                        dry_run: bool = True,
                        sleep_between: float = 0.4) -> None:
    url = PATCH_SEGMENTS_URL.format(project_id=project_id)
    total = len(segments); batches = math.ceil(total / batch_size)
    for b in range(batches):
        chunk = segments[b*batch_size:(b+1)*batch_size]
        if dry_run:
            print(f"[DRY-RUN] PATCH {b+1}/{batches} -> {url} ({len(chunk)} segments)")
            preview = [{"id": s["id"],
                        "dst_len": len(str(s.get("dst",{}).get("text",""))).__int__(),
                        "dst_lang": s["dst"].get("lang"),
                        "start": s.get("start"), "end": s.get("end")} for s in chunk]
            display(pd.DataFrame(preview))
            continue
    
        resp = requests.patch(url, headers={**headers,"Content-Type":"application/json"},
                              json={"segments": chunk}, timeout=60)
        if resp.status_code >= 400:
            try: print("Erreur serveur:", json.dumps(resp.json(), ensure_ascii=False, indent=2))
            except Exception: print("Erreur brute:", resp.text)
            resp.raise_for_status()
        print(f"OK PATCH {b+1}/{batches}: {len(chunk)} segments")
        time.sleep(sleep_between)

payload = build_segments_payload_for_patch(df, dst_lang=dst_lang, indices=indices_to_process, include_timing=INCLUDE_TIMING)
print("Segments à patcher:", len(payload))
patch_segments_text(HEADERS, project_id, payload, dry_run=DRY_RUN)

Segments à patcher: 7
OK PATCH 1/1: 7 segments


In [38]:
# (12) Sanitiser un payload JSON EXISTANT (répare 'Cet.' & phrases inachevées)
def sanitize_existing_payload(payload: list[dict], wpm: float = WPM) -> list[dict]:
    out = []
    for item in payload:
        seg_id = item.get("id") or item.get("segment_id")
        start = item.get("start"); end = item.get("end")
        txt = (item.get("dst",{}).get("text") or item.get("text") or "").strip()
        dur = 0.0
        try:
            if start and end:
                dur = max(0.0, _parse_timecode(end) - _parse_timecode(start))
        except Exception:
            dur = 0.0
        target = words_target_for_duration(dur, wpm) if dur > 0 else words_count(txt)
        fixed = postprocess_segment(txt, target_words=target, fix_start=True)
        out.append({
            "id": seg_id,
            "dst": {"text": fixed, "lang": item.get("dst",{}).get("lang","fr-fr")},
            "start": start, "end": end
        })
    return out

# Exemple d'utilisation:
example_payload = [
  {
    "id": "342b2806-3672-4035-8971-548b78a169f0",
    "dst": {"text": "Considérée comme 100 naturelle elle est sans danger pour tous Cet.", "lang": "fr-fr"},
    "start": "00:00:30,823", "end": "00:00:35,186"
  }
]
sanitized = sanitize_existing_payload(example_payload, wpm=WPM)
print("Payload réparé (extrait):\n", json.dumps(sanitized, ensure_ascii=False, indent=2))

Payload réparé (extrait):
 [
  {
    "id": "342b2806-3672-4035-8971-548b78a169f0",
    "dst": {
      "text": "Considérée comme 100 % naturelle elle est sans danger pour tous.",
      "lang": "fr-fr"
    },
    "start": "00:00:30,823",
    "end": "00:00:35,186"
  }
]


In [39]:
# (13) Exports finaux
path_dir_output = os.path.join(os.getcwd(), PROJECT_SELECTOR)
os.makedirs(path_dir_output, exist_ok=True)
out_csv = os.path.join(path_dir_output, "v5_transcription_postgen.csv");
df.to_csv(out_csv, index=False, encoding="utf-8-sig")
print("CSV écrit ->", out_csv)

out_payload = os.path.join(path_dir_output, "v5_segments_payload.json");
try:
    with open(out_payload, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    print("Payload JSON écrit ->", out_payload)
except Exception as e:
    print("Aucun payload à écrire ou erreur:", e)

CSV écrit -> /mnt/c/Users/Utilisateur/PycharmProjects/lumierelearning/notebook/4_-_Presentation_des_meches_KtwWjO/v5_transcription_postgen.csv
Payload JSON écrit -> /mnt/c/Users/Utilisateur/PycharmProjects/lumierelearning/notebook/4_-_Presentation_des_meches_KtwWjO/v5_segments_payload.json
