<a href="https://colab.research.google.com/github/elijahManPerson/Flappy-Bird/blob/master/Copy_of_Data_prepocessing_20251104.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
# AES Step 1→9: Clean, run-ready Colab block (v2025-11-03.cleaned)
#
# This script wraps the entire AES text processing pipeline into a single file.
# It removes duplicated cells, removes leaked keys, keeps the deterministic
# dialogue alignment, and keeps the sentence boundary flags.

# ---------- 0) Drive + imports ----------
from google.colab import drive
drive.mount('/content/drive')

import os, time, re, io, csv, json, math, logging, zipfile
import numpy as np
import pandas as pd

def status(msg, ok=True):
    """Print ``msg`` behind a pass marker when ``ok`` is truthy, else a fail marker."""
    if ok:
        marker = "‚úÖ "
    else:
        marker = "‚ùå "
    print(marker + msg)

# Mount check: report whether the Drive mount point and the MyDrive folder exist.
root_mount = '/content/drive'
root_mydrive = '/content/drive/MyDrive'
status("Drive mount detected at /content/drive", os.path.ismount(root_mount))
status("MyDrive folder present", os.path.isdir(root_mydrive))

# Probe read/write
# Read probe: listing the folder is enough; a failure is reported, not raised.
try:
    sample = os.listdir(root_mydrive)[:5]
    status("Read test passed (listed MyDrive)")
except Exception as e:
    status(f"Read test failed: {e}", False)

# Write probe: create a throwaway file, then delete it again.
probe = os.path.join(root_mydrive, "_colab_mount_check.txt")
try:
    with open(probe, "w", encoding="utf-8") as f:
        f.write(f"colab mount check {time.time()}\n")
    status("Write test passed (created file)")
    os.remove(probe)
    status("Cleanup passed (deleted file)")
except Exception as e:
    # NOTE(review): a failure in os.remove is also reported under the "Write test failed" label.
    status(f"Write test failed: {e}", False)


Mounted at /content/drive
‚úÖ Drive mount detected at /content/drive
‚úÖ MyDrive folder present
‚úÖ Read test passed (listed MyDrive)
‚úÖ Write test passed (created file)
‚úÖ Cleanup passed (deleted file)


In [None]:


# ---------- 1) Data in ----------
# Location of the input CSV on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/JM/Sandbox/1.Training Data/Data for Testing avg short.csv"
# Lower-cased header spellings accepted as the raw-text column.
RAW_TEXT_ALIASES = {"raw text","raw_text","rawtext"}

# Stop early, with a visible status line, when the input file is absent.
if not os.path.exists(DATA_PATH):
    status(f"File not found: {DATA_PATH}", False)
    raise FileNotFoundError(DATA_PATH)

def try_read(path, sep, engine=None):
    """Best-effort CSV read that returns ``None`` instead of raising.

    Args:
        path: File to read.
        sep: Field separator. ``None`` asks pandas to sniff the delimiter,
            which forces the ``python`` engine.
        engine: Optional pandas parser engine, used only when ``sep`` is given.

    Returns:
        The parsed ``DataFrame``, or ``None`` when pandas raised for any reason
        (callers treat ``None`` as "this attempt did not work").
    """
    kw = dict(encoding="utf-8-sig", on_bad_lines="skip")
    if sep is None:
        kw["sep"] = None
        kw["engine"] = "python"
    else:
        kw["sep"] = sep
        if engine:
            kw["engine"] = engine
    # ``low_memory`` is a C-parser option; pandas raises ValueError when it is
    # combined with another engine, which previously made every sniffing
    # attempt fail silently.
    if kw.get("engine") in (None, "c"):
        kw["low_memory"] = False
    try:
        return pd.read_csv(path, **kw)
    except Exception:
        # Deliberate best-effort: any parse or I/O failure means "try the next candidate".
        return None

# Parse attempts as (label, separator, engine); a None separator means delimiter sniffing.
cands = [
    ("auto", None, "python"),
    ("comma", "," , None),
    ("semicolon", ";", None),
    ("tab", "\t", None),
    ("pipe", "|", None)
]
# Running best parse: the DataFrame, its score() tuple, and the label of the attempt that produced it.
best, best_score, parsed_by = None, (-1, -1), None

def score(df):
    """Rank a parse attempt as ``(has_raw_text, column_count)``.

    A missing or empty frame scores ``(-1, -1)``. ``has_raw_text`` is 1 when a
    header matches ``RAW_TEXT_ALIASES`` or when both a ``raw`` and a ``text``
    header are present (headers compared trimmed and lower-cased), else 0.
    """
    if df is None or df.empty:
        return (-1, -1)
    names = [col.strip().lower() for col in df.columns]
    alias_hit = any(name in RAW_TEXT_ALIASES for name in names)
    split_hit = "raw" in names and "text" in names
    return (int(alias_hit or split_hit), len(names))

# Try every candidate and keep the highest-scoring parse.
# Scores are tuples, so they compare element by element; the strict ">" keeps
# the earlier candidate on a tie.
for name, sep, eng in cands:
    df_ = try_read(DATA_PATH, sep, eng)
    s = score(df_)
    if s > best_score:
        best, best_score, parsed_by = df_, s, name

# No candidate produced a non-empty frame.
if best is None or best.empty:
    status("Failed to read CSV", False)
    raise ValueError("Could not parse CSV")

status(f"Parsed using: {parsed_by}. Columns: {len(best.columns)}")
# Work on a copy so the winning parse itself is left untouched.
df_pre = best.copy()

# Ensure 'Raw text'
# Map each original header to its trimmed, lower-cased form for alias matching.
cols_norm = {c: c.strip().lower() for c in df_pre.columns}
raw_col = None
for c, n in cols_norm.items():
    if n in RAW_TEXT_ALIASES:
        raw_col = c
        break

if raw_col is None and "raw" in cols_norm.values() and "text" in cols_norm.values():
    # No single raw-text column, but separate "raw" and "text" columns exist: join them.
    raw_name  = next(k for k,v in cols_norm.items() if v=="raw")
    text_name = next(k for k,v in cols_norm.items() if v=="text")
    # fillna must run BEFORE astype(str); in the reverse order NaN has already
    # become the literal string "nan" and ends up inside the merged text.
    df_pre["Raw text"] = (
        df_pre[raw_name].fillna("").astype(str) + " " +
        df_pre[text_name].fillna("").astype(str)
    ).str.strip()
    status(f"Merged '{raw_name}' + '{text_name}' into 'Raw text'")
else:
    if raw_col is None:
        status("Raw text column not found after parsing", False)
        print("Columns present:", list(df_pre.columns))
        raise KeyError("'Raw text' column is missing")
    if raw_col != "Raw text":
        # Rebind instead of mutating in place so re-running the cell stays predictable.
        df_pre = df_pre.rename(columns={raw_col: "Raw text"})
        status(f"Renamed '{raw_col}' to 'Raw text'")

# Guarantee plain strings: missing values become "".
df_pre["Raw text"] = df_pre["Raw text"].fillna("").astype(str)

# Canonical ID
def _normalize_id_series(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.replace(r"\.0$", "", regex=True)
    def _fix(x):
        if any(c.isalpha() for c in x):
            return x
        try:
            if "." in x or "e" in x.lower():
                f = float(x)
                if f.is_integer():
                    return str(int(f))
        except Exception:
            pass
        return x
    return s.map(_fix)

def find_id_column(cols):
    """Choose the column to use as the record ID.

    Returns the first name, in order, that matches the case-insensitive pattern
    ``id|identifier`` anywhere in the name; falls back to the first column when
    nothing matches, and to ``None`` when there are no columns.
    """
    names = list(cols)
    if not names:
        return None
    for candidate in names:
        if re.search(r"id|identifier", candidate, flags=re.I):
            return candidate
    return names[0]

# Pick the source ID column and store its normalised values under "ID".
CANON_ID = find_id_column(df_pre.columns)
df_pre["ID"] = _normalize_id_series(df_pre[CANON_ID])

# Report how many rows carry non-blank text, then preview the first three.
total = len(df_pre)
usable = total - df_pre["Raw text"].str.strip().eq("").sum()
status(f"Non-empty 'Raw text' rows: {usable} of {total}")
print(df_pre[["ID","Raw text"]].head(3))



‚úÖ Parsed using: comma. Columns: 15
‚úÖ Non-empty 'Raw text' rows: 21 of 21
         ID                                           Raw text
0  BBCMHJPT  There once was a girl called lilly she had pet...
1  BBKBYNDW  wrire a narrative story abouta search for some...
2  BBRWTLYV  The Failed Submarine I had always wanted go on...


In [None]:

# ---------- 2) Deps ----------
!pip -q install --upgrade "openai==1.*" tqdm nltk tiktoken spacy pandas==2.2.2 ipywidgets openpyxl jsonschema backoff
!python -m spacy download en_core_web_sm -q

import importlib, spacy, nltk, tiktoken, backoff
from tqdm import tqdm

def v(name):
    """Return module ``name``'s ``__version__``.

    Gives ``"unknown"`` when the module has no ``__version__`` attribute, or an
    ``"import failed: ..."`` note when importing it raises.
    """
    try:
        return getattr(importlib.import_module(name), "__version__", "unknown")
    except Exception as exc:
        return f"import failed: {exc}"

# Print the installed version of each listed module.
mods = ["openai","pandas","spacy","nltk","tiktoken","tqdm","backoff"]
print("Versions:", {m: v(m) for m in mods})
# Load the small English spaCy pipeline and tokenise one sentence as a smoke test.
nlp = spacy.load("en_core_web_sm")
print("spaCy ok:", [t.text for t in nlp("A tiny sanity check.")])

# Keep NLTK data under /content and make that directory the first search location.
os.makedirs("/content/nltk_data", exist_ok=True)
if "/content/nltk_data" not in nltk.data.path:
    nltk.data.path.insert(0, "/content/nltk_data")
# Download the punkt tokenizer only when it is not already available.
try:
    nltk.data.find("tokenizers/punkt")
    print("NLTK punkt ok")
except LookupError:
    nltk.download("punkt", download_dir="/content/nltk_data", quiet=False)
    nltk.data.find("tokenizers/punkt")
    print("NLTK punkt downloaded")

# Prefer the o200k_base encoding; fall back to cl100k_base when it cannot be loaded.
try:
    enc = tiktoken.get_encoding("o200k_base")
except Exception:
    enc = tiktoken.get_encoding("cl100k_base")
print("tiktoken ok, sample tokens:", len(enc.encode("Tokenization sanity check.")))


# ---------- 3) OpenAI config ----------
import os
from getpass import getpass
from openai import OpenAI, BadRequestError

# Model name, overridable through the AES_MODEL_ID environment variable.
MODEL_ID = os.environ.get("AES_MODEL_ID", "gpt-4o")
# Mock mode is switched on when AES_USE_MOCK is "1", "true" or "yes" (case-insensitive).
USE_MOCK = os.environ.get("AES_USE_MOCK", "0").lower() in {"1","true","yes"}

def ensure_api_key():
    """Return the OpenAI API key, prompting for it when the environment has none.

    A key typed at the hidden prompt is stripped and stored in
    ``OPENAI_API_KEY``. Returns ``None`` when the prompt is left blank.
    """
    existing = os.environ.get("OPENAI_API_KEY")
    if existing:
        return existing
    print("Enter your OpenAI API key (hidden). Leave blank for mock mode.")
    entered = getpass("API key: ").strip()
    if not entered:
        return None
    os.environ["OPENAI_API_KEY"] = entered
    return entered

def verify_openai(model_id: str = MODEL_ID):
    """Build an OpenAI client and confirm the key works.

    Returns the client when listing models succeeds, and ``None`` when no key
    was supplied or the listing call raises. Failing to retrieve ``model_id``
    only prints a notice; the client is still returned.
    """
    api_key = ensure_api_key()
    if not api_key:
        print("No key supplied. Using mock corrections.")
        return None

    api = OpenAI()
    try:
        api.models.list().data[:1]
    except Exception as err:
        print("Verification failed:", type(err).__name__, str(err))
        return None

    # The model lookup is advisory only.
    try:
        api.models.retrieve(model_id)
    except Exception:
        print(f"Model '{model_id}' not retrieved. Will still try.")
    print("API key verified.")
    return api

# Skip verification entirely in mock mode; otherwise try to obtain a working client.
client = None if USE_MOCK else verify_openai(MODEL_ID)
# Without a client (mock requested, no key, or verification failed) force mock mode.
if client is None:
    USE_MOCK = True
    print("Running in MOCK mode.")


# ---------- 4) Text utils ----------
# (pattern, replacement) pairs applied in order by normalize_mojibake.
# Patterns are passed to re.sub, i.e. they are regular expressions.
_MOJIBAKE_FIXES = [
    (r"√¢‚Ç¨‚Äù", "‚Äî"),
    (r"√¢‚Ç¨‚Äú", "‚Äì"),
    (r"√¢‚Ç¨Àú", "‚Äò"),
    (r"√¢‚Ç¨‚Ñ¢", "‚Äô"),
    (r"√¢‚Ç¨≈ì", "‚Äú"),
    (r"√¢‚Ç¨¬ù", "‚Äù"),
    (r"√¢‚Ç¨¬¶", "‚Ä¶"),
    (r"√Ç ", " "),
]


def normalize_mojibake(s: str) -> str:
    """Apply every ``_MOJIBAKE_FIXES`` substitution, in table order.

    Falsy input (``None``, ``""``, ``0``) yields ``""``; anything else is
    stringified first.
    """
    cleaned = str(s) if s else ""
    for pattern, replacement in _MOJIBAKE_FIXES:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def normalise_punct(s: str) -> str:
    """Run ``normalize_mojibake`` and then a fixed list of literal replacements.

    The replacements are applied one after another in the order listed.
    NOTE(review): the non-ASCII literals appear to be mis-decoded typographic
    apostrophe/dash characters — confirm against the file's intended encoding.
    """
    text = normalize_mojibake(str(s))
    swaps = (
        ("‚Äô", "'"),
        ("‚Äì‚Äì", "‚Äî"),
        ("‚Äì", "‚Äî"),
        ("--", "‚Äî"),
        ("‚Äï", "‚Äî"),
    )
    for old, new in swaps:
        text = text.replace(old, new)
    return text

def collapse_ellipsis(text: str) -> str:
    """Replace each run of three or more consecutive dots with one fixed replacement string."""
    dots_run = r"\.{3,}"
    as_text = str(text)
    return re.sub(dots_run, "‚Ä¶", as_text)

def _extract_first_json_object(txt: str):
    if not txt:
        return None
    start = txt.find("{")
    if start < 0:
        return None
    depth, in_str, esc = 0, False, False
    for i in range(start, len(txt)):
        ch = txt[i]
        if in_str:
            if esc:
                esc = False
            elif ch == "\\":
                esc = True
            elif ch == '"':
                in_str = False
        else:
            if ch == '"':
                in_str = True
            elif ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    frag = txt[start:i+1]
                    try:
                        return json.loads(frag)
                    except Exception:
                        return None
    return None

# Dialogue helpers
QUOTE_OPENERS = {'"', '‚Äú'}
QUOTE_CLOSERS = {'"', '‚Äù'}
SQUOTE_OPENERS = {'‚Äò'}
SQUOTE_CLOSERS = {'‚Äô'}
_WORD_CHAR = re.compile(r"\w", flags=re.UNICODE)

def _looks_like_apostrophe(text, i):
    if i <= 0 or i >= len(text) - 1:
        return False
    return (_WORD_CHAR.match(text[i-1] or "") and _WORD_CHAR.match(text[i+1] or ""))

def extract_dialogue_spans_deterministic(text: str):
    """Scan ``text`` left to right and return quoted spans as ``{"start", "end"}`` dicts.

    A span opens at a double- or single-quote opener that does not sit between
    two word characters, and closes at the next closer of the same family
    (double or single); ``end`` is one past the closer. Quotes are not nested:
    while a span is open only its closer is looked for. A span still open at
    the end of the text runs to the end. Offsets are clamped to the text and
    every span is at least one character long.
    """
    content = str(text or "")
    length = len(content)
    double_open_chars = QUOTE_OPENERS | {'"'}
    double_close_chars = QUOTE_CLOSERS | {'"'}

    found = []
    opener = None
    opened_at = None
    for pos, ch in enumerate(content):
        if opener is None:
            is_opener = ch in double_open_chars or ch in SQUOTE_OPENERS
            if is_opener and not _looks_like_apostrophe(content, pos):
                opener, opened_at = ch, pos
            continue
        if opener in double_open_chars:
            closes = ch in double_close_chars
        elif opener in SQUOTE_OPENERS:
            closes = ch in SQUOTE_CLOSERS
        else:
            closes = False
        if closes:
            found.append({"start": opened_at, "end": pos + 1})
            opener, opened_at = None, None

    # Unterminated quote: extend to the end of the text.
    if opened_at is not None:
        found.append({"start": opened_at, "end": length})

    clamped = []
    for span in found:
        lo = max(0, min(int(span["start"]), length))
        hi = max(lo + 1, min(int(span["end"]), length))
        clamped.append({"start": lo, "end": hi})
    return clamped

def _safe_load_json_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        s = x.strip()
        if not s:
            return []
        try:
            val = json.loads(s)
            return val if isinstance(val, list) else []
        except Exception:
            return []
    return []

def _spans_align_with_quotes(text, spans):
    """Check that every span starts on a quote opener and ends on a quote closer.

    Args:
        text: The text the offsets refer to (``None`` is treated as ``""``).
        spans: Iterable of mappings with integer ``start`` / ``end`` offsets.

    Returns:
        ``True`` when every span satisfies ``0 <= start < end <= len(text)``,
        ``text[start]`` is an opener and ``text[end - 1]`` is a closer
        (``True`` as well for an empty ``spans``). Malformed entries — not a
        mapping, or offsets that are missing-as-``None`` or not usable as
        indices — give ``False`` rather than raising, so the caller can fall
        back to deterministic extraction.
    """
    t = str(text or "")
    # Build the accepted character sets once rather than per span.
    openers = QUOTE_OPENERS | {'"'} | SQUOTE_OPENERS
    closers = QUOTE_CLOSERS | {'"'} | SQUOTE_CLOSERS
    for sp in spans:
        try:
            st = sp.get("start", -1)
            en = sp.get("end", -1)
            if not (0 <= st < en <= len(t)):
                return False
            if t[st] not in openers or t[en - 1] not in closers:
                return False
        except (AttributeError, TypeError):
            # Entry is not a mapping, or an offset cannot be compared / used as an index.
            return False
    return True

def ensure_dialogue_json(df_texts, corrected_col="Corrected text (8)"):
    """Return a copy of ``df_texts`` with a validated ``DialogueSpansJSON`` column.

    For each row the stored spans are kept only when they are non-empty and
    align with quote characters in the corrected text; otherwise they are
    recomputed with ``extract_dialogue_spans_deterministic``. Spans whose
    ``end`` is not greater than ``start`` are dropped and offsets are clamped
    to the corrected text's length. Missing ``DialogueSpansJSON`` /
    ``NarrativeTagsJSON`` columns are created holding ``"[]"``.
    """
    out = df_texts.copy()
    if "DialogueSpansJSON" not in out.columns:
        out["DialogueSpansJSON"] = "[]"

    blank = pd.Series([""] * len(out), dtype=object)
    corrected_texts = out.get(corrected_col, blank).astype(str)

    def _normalised(stored, corr):
        spans = _safe_load_json_list(stored)
        if not (spans and _spans_align_with_quotes(corr, spans)):
            spans = extract_dialogue_spans_deterministic(corr)
        limit = len(corr)
        kept = []
        for sp in spans:
            if sp.get("end", 0) > sp.get("start", 0):
                kept.append({
                    "start": int(max(0, min(sp["start"], limit))),
                    "end": int(max(0, min(sp["end"], limit))),
                })
        return json.dumps(kept, ensure_ascii=False)

    out["DialogueSpansJSON"] = [
        _normalised(stored, corr)
        for stored, corr in zip(out["DialogueSpansJSON"], corrected_texts)
    ]
    if "NarrativeTagsJSON" not in out.columns:
        out["NarrativeTagsJSON"] = "[]"
    return out


[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[90m‚ï∫[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.7/1.5 MB[0m [31m21.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.5/1.5 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m139.8/139.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.2/2.2 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m


[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


NLTK punkt downloaded
tiktoken ok, sample tokens: 5
Enter your OpenAI API key (hidden). Leave blank for mock mode.
API key: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
API key verified.


[REDACTED — an OpenAI API key was printed here in the saved cell output. Revoke that key and clear the notebook outputs before sharing or committing.]


In [None]:


# ---------- 5) Step 8 correction ----------
_DEFAULT_MAX_TOKENS = 1500

# Retry transient failures with exponential backoff, but give up immediately on
# BadRequestError: a 400 that survived parameter negotiation will not succeed
# on a retry.
@backoff.on_exception(
    backoff.expo, (Exception,), max_tries=5, factor=2,
    giveup=lambda exc: isinstance(exc, BadRequestError),
)
def _openai_chat_complete(client: OpenAI, model: str, prompt: str, max_tokens: int = _DEFAULT_MAX_TOKENS):
    """Send ``prompt`` as a single user message and return an object with a ``.text`` attribute.

    Chat Completions is tried first; the Responses API is used as a fallback
    when the chat call fails with anything other than ``BadRequestError``.
    For each API the call starts with ``temperature=0.0`` and a token limit,
    then adapts when the server rejects a parameter: temperature is dropped,
    and for chat the limit is moved from ``max_tokens`` to
    ``max_completion_tokens``.

    Args:
        client: OpenAI client.
        model: Model name.
        prompt: User message content.
        max_tokens: Output token limit.

    Returns:
        An object whose ``text`` is the stripped completion (``""`` when the
        API returned no text).

    Raises:
        BadRequestError: when a request is rejected for a reason the
            negotiation does not handle.
        Exception: whatever the Responses fallback raised, chained to the
            original chat error.
    """
    def _compat_obj(text):
        return type("Compat", (), {"text": (text or "").strip()})

    def _rejects(message, param):
        # The API names the offending parameter in quotes, e.g. "Unsupported parameter: 'max_tokens' ...".
        return "Unsupported" in message and f"'{param}'" in message

    def _negotiate(create, base_params, token_keys):
        """Call ``create``, dropping temperature / switching token key as the server demands."""
        use_temperature = True
        key_idx = 0
        # Terminates: each retry either disables temperature (once) or advances key_idx.
        while True:
            params = dict(base_params)
            params[token_keys[key_idx]] = max_tokens
            if use_temperature:
                params["temperature"] = 0.0
            try:
                return create(**params)
            except BadRequestError as err:
                message = str(err)
                if use_temperature and _rejects(message, "temperature"):
                    use_temperature = False
                    continue
                if key_idx + 1 < len(token_keys) and _rejects(message, token_keys[key_idx]):
                    key_idx += 1
                    continue
                raise

    def _responses_text(resp):
        """Collect the text from a Responses API result."""
        text = getattr(resp, "output_text", None)
        if text:
            return text
        pieces = []
        for item in getattr(resp, "output", None) or []:
            if getattr(item, "type", "") == "output_text":
                pieces.append(getattr(item, "text", "") or "")
                continue
            # Message items carry their text parts in a nested content list.
            for part in getattr(item, "content", None) or []:
                if getattr(part, "type", "") == "output_text":
                    pieces.append(getattr(part, "text", "") or "")
        return "".join(pieces)

    messages = [{"role": "user", "content": prompt}]

    try:
        resp = _negotiate(
            client.chat.completions.create,
            dict(model=model, messages=messages),
            ("max_tokens", "max_completion_tokens"),
        )
        return _compat_obj(resp.choices[0].message.content)
    except BadRequestError:
        raise
    except Exception as err:
        # Any other chat failure: remember it and try the Responses API.
        chat_error = err

    try:
        # The Responses API names its output limit ``max_output_tokens``.
        resp = _negotiate(
            client.responses.create,
            dict(model=model, input=messages),
            ("max_output_tokens",),
        )
    except Exception as err:
        # Keep the chat failure visible in the traceback.
        raise err from chat_error
    return _compat_obj(_responses_text(resp))

def _model_complete_text(client: OpenAI, model: str, prompt: str, max_tokens: int = _DEFAULT_MAX_TOKENS) -> str:
    """Return just the ``text`` of the completion produced by ``_openai_chat_complete``."""
    return _openai_chat_complete(client, model, prompt, max_tokens=max_tokens).text

def correct_with_tags(raw: str, client=None, model=MODEL_ID, use_mock=USE_MOCK, max_tokens=1500):
    """Correct one text and return it with narrative tags and dialogue spans.

    Args:
        raw: Source text (``None`` is treated as empty).
        client: OpenAI client; ``None`` selects the mock path.
        model: Model name passed to the completion helper.
        use_mock: When true, skip the API and apply simple local fixes.
        max_tokens: Output token limit for the model call.

    Returns:
        ``(corrected_text, narrative_tags, dialogue_spans, source)`` where
        ``source`` is ``"mock"``, ``"fallback"`` or the model name. Tags are
        dicts with ``type``/``text``/``start``/``end``; spans are dicts with
        ``start``/``end``. Offsets are clipped to the corrected text.

        The ``"fallback"`` result (locally normalised input, no tags or spans)
        is used when the model output contains no JSON object, when
        ``corrected_text`` is not a string, or when it is empty although the
        input was not blank.
    """
    s = normalize_mojibake(str(raw or ""))
    if use_mock or client is None:
        t = s.strip()
        # Capitalise the first alphanumeric character when it is a letter.
        m = re.search(r"[A-Za-z0-9]", t)
        if m:
            i = m.start()
            ch = t[i]
            if ch.isalpha():
                t = t[:i] + ch.upper() + t[i+1:]
        # Add a final full stop when the text does not already end in a terminal mark.
        if t and not re.search(r"[.!?‚Ä¶]\s*$", t):
            t += "."
        t = collapse_ellipsis(normalise_punct(t))
        return t, [], [], "mock"

    prompt = f"""
You are a meticulous copy editor. Fix punctuation, grammar, and spelling.
Keep meaning and paragraphing. Use standard English punctuation.

Then return character-offset spans for:
1) narrative_tags: list of objects with fields:
   - type: one of "title", "temporal", "closure"
   - text: exact substring from corrected_text
   - start: integer (0-based, inclusive)
   - end: integer (0-based, exclusive)
2) dialogue_spans: list of objects with fields:
   - start: integer (0-based, inclusive)
   - end: integer (0-based, exclusive)

CRITICAL RULES:
- All offsets refer to corrected_text AFTER your edits.
- 0 <= start < end <= len(corrected_text)
- narrative_tags[i].text MUST equal corrected_text[start:end] exactly.
- If there are no tags or dialogue, return empty lists.

DIALOGUE SPAN RULES:
- Each span must begin at an opening quote character and end just after the matching closing quote character.
- Include all characters between the quotes.
- Treat " ‚Äú ‚Äù ‚Äò ‚Äô as quotes. Do not treat apostrophes in words as quotes.
- If a quote is unclosed, span ends at the first terminal clause or the end.

Output JSON only in this schema:
{{
  "corrected_text": "‚Ä¶",
  "narrative_tags": [],
  "dialogue_spans": []
}}

Text:
<<<BEGIN>>>
{s}
<<<END>>>
""".strip()

    def _fallback():
        return collapse_ellipsis(normalise_punct(s)), [], [], "fallback"

    out = _model_complete_text(client, model, prompt, max_tokens=max_tokens)
    js = _extract_first_json_object(out)
    if not isinstance(js, dict):
        return _fallback()

    corrected_value = js.get("corrected_text") or ""
    if not isinstance(corrected_value, str):
        # A non-string payload cannot be used as text; do not crash on .rstrip().
        return _fallback()
    corrected = corrected_value.rstrip()
    if not corrected and s.strip():
        # The model returned nothing for real input: keep the (normalised) original
        # rather than recording an empty correction as a success.
        return _fallback()

    tags = js.get("narrative_tags") or []
    spans = js.get("dialogue_spans") or []
    # Anything other than a list (e.g. a number) would break the loops below.
    if not isinstance(tags, list):
        tags = []
    if not isinstance(spans, list):
        spans = []

    N = len(corrected)
    def _clip(a, b):
        a = max(0, int(a))
        b = max(a, int(b))
        return (a if a <= N else N, b if b <= N else N)

    clean_tags, clean_spans = [], []
    for t in tags:
        try:
            st, en = _clip(t.get("start", 0), t.get("end", 0))
            if en > st and t.get("type") in {"title", "temporal", "closure"}:
                clean_tags.append({
                    "type": t["type"],
                    # The text is re-sliced from the corrected string, not copied from the model.
                    "text": corrected[st:en],
                    "start": st,
                    "end": en
                })
        except Exception:
            # Malformed entries are skipped.
            pass
    for d in spans:
        try:
            st, en = _clip(d.get("start", 0), d.get("end", 0))
            if en > st:
                clean_spans.append({"start": st, "end": en})
        except Exception:
            # Malformed entries are skipped.
            pass

    return corrected, clean_tags, clean_spans, model

def run_correct_only(df_in, text_col="Raw text", id_col="ID", client=None, model=MODEL_ID, use_mock=USE_MOCK, out_col="Corrected text (8)"):
    """Correct every row's text and return a copy of ``df_in`` with the results added.

    Adds ``out_col``, ``NarrativeTagsJSON``, ``DialogueSpansJSON`` (both
    JSON-encoded) and ``CorrectedBy``. Identical texts are corrected once and
    reused. A missing ``id_col`` is filled with the row position as a string;
    an existing one is stringified with a trailing ``.0`` removed.

    Raises:
        KeyError: when ``text_col`` is absent.
    """
    if text_col not in df_in.columns:
        raise KeyError(f"Missing required column: {text_col}")

    result = df_in.copy()
    if id_col in result.columns:
        result[id_col] = result[id_col].astype(str).str.replace(r"\.0$","",regex=True)
    else:
        result[id_col] = pd.RangeIndex(len(result)).astype(str)

    memo = {}
    outcomes = []
    for raw in tqdm(result[text_col].astype(str).tolist(), desc="Step 8: correcting"):
        outcome = memo.get(raw)
        if outcome is None:
            text, tags, spans, source = correct_with_tags(raw, client=client, model=model, use_mock=use_mock)
            outcome = (text, tags, spans, source)
            memo[raw] = outcome
        outcomes.append(outcome)

    result[out_col] = [text for text, _, _, _ in outcomes]
    result["NarrativeTagsJSON"] = [json.dumps(tags, ensure_ascii=False) for _, tags, _, _ in outcomes]
    result["DialogueSpansJSON"] = [json.dumps(spans, ensure_ascii=False) for _, _, spans, _ in outcomes]
    result["CorrectedBy"] = [source for _, _, _, source in outcomes]
    return result



In [None]:

# ---------- 6) Token map (runner) ----------
from difflib import SequenceMatcher
# Matches a single word character; used with .search() to test whether a token contains one.
_WORD_RX = re.compile(r"\w", flags=re.UNICODE)

def _split_merged_word(tok: str):
    if not tok or not tok.isalpha():
        return [tok]
    m = re.match(r"^([A-Z]{2,})([a-z].*)$", tok)
    return [m.group(1), m.group(2)] if m else [tok]

def _tokenize_with_split(s: str):
    base = re.findall(r"\w+|[^\w\s]", s or "", flags=re.UNICODE)
    out = []
    for t in base:
        if re.fullmatch(r"\w+", t):
            out.extend(_split_merged_word(t))
        else:
            out.append(t)
    return out

def _rebuild_offsets_with_splitting(text, tokens):
    spans, i, n = [], 0, len(text or "")
    text = text or ""
    for tok in tokens:
        if not tok:
            spans.append((i, i))
            continue
        pos = text.find(tok, i)
        if pos >= 0:
            start, end = pos, pos + len(tok)
        else:
            j = i
            while j < n and text[j].isspace():
                j += 1
            start = j
            end = min(n, start + len(tok))
        spans.append((start, end))
        i = end
    return spans

def _is_word(tok: str) -> bool:
    return bool(tok) and bool(_WORD_RX.search(tok))

def _canon(tok: str) -> str:
    if tok is None:
        return ""
    u = str(tok).upper()
    return re.sub(r"(.)\1+", r"\1", u)

def build_word_map(raw_text, corr_text):
    """Align the tokens of a raw text with those of its corrected version.

    Both texts are tokenised, character offsets are rebuilt for every token,
    and the two token lists are aligned with ``SequenceMatcher`` on their
    ``_canon`` forms.

    Returns:
        A list of dicts, one per alignment step, with keys ``raw_index``,
        ``raw_token``, ``raw_start``, ``raw_end``, ``corr_index``,
        ``corr_token``, ``corr_start``, ``corr_end``, ``op`` (``"equal"``,
        ``"replace"``, ``"delete"`` or ``"insert"``), ``equal_ci`` and
        ``error_type``. The raw-side fields are ``None`` on inserts and the
        corrected-side fields are ``None`` on deletes.
    """
    raw_text  = str(raw_text or "")
    corr_text = str(corr_text or "")
    raw_tokens  = _tokenize_with_split(raw_text)
    corr_tokens = _tokenize_with_split(corr_text)
    raw_spans  = _rebuild_offsets_with_splitting(raw_text,  raw_tokens)
    corr_spans = _rebuild_offsets_with_splitting(corr_text, corr_tokens)
    # Matching is done on canonical forms, so tokens that differ only in case
    # or in repeated characters fall into "equal" blocks.
    sm = SequenceMatcher(
        a=[_canon(t) for t in raw_tokens],
        b=[_canon(t) for t in corr_tokens],
        autojunk=False
    )
    rows = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "equal":
            # Pair tokens one-to-one; equal_ci here records exact string equality.
            for k in range(i2 - i1):
                ri, ci = i1 + k, j1 + k
                r_tok, c_tok = raw_tokens[ri], corr_tokens[ci]
                r_start, r_end = raw_spans[ri]; c_start, c_end = corr_spans[ci]
                rows.append({
                    "raw_index": ri, "raw_token": r_tok, "raw_start": r_start, "raw_end": r_end,
                    "corr_index": ci, "corr_token": c_tok, "corr_start": c_start, "corr_end": c_end,
                    "op": "equal", "equal_ci": (r_tok == c_tok), "error_type": "Equal"
                })
        elif tag == "replace":
            # Walk both sides of the replaced block in parallel; leftovers on
            # either side become deletes or inserts.
            ii, jj = i1, j1
            while ii < i2 or jj < j2:
                if ii < i2 and jj < j2:
                    r_tok = raw_tokens[ii]; c_tok = corr_tokens[jj]
                    r_start, r_end = raw_spans[ii]; c_start, c_end = corr_spans[jj]
                    # A raw word facing a non-word corrected token that is itself
                    # followed by a word: record the non-word token as an inserted
                    # punctuation mark, then pair the raw word with the following word.
                    if _is_word(r_tok) and not _is_word(c_tok) and (jj + 1) < j2 and _is_word(corr_tokens[jj + 1]):
                        rows.append({
                            "raw_index": None, "raw_token": None, "raw_start": None, "raw_end": None,
                            "corr_index": jj, "corr_token": c_tok, "corr_start": c_start, "corr_end": c_end,
                            "op": "insert", "equal_ci": False, "error_type": "PunctuationInsertion"
                        })
                        jj += 1
                        c2_tok = corr_tokens[jj]; c2_start, c2_end = corr_spans[jj]
                        # "Spelling" requires two alphabetic tokens with the same canonical form.
                        err = "Spelling" if (str(r_tok).isalpha() and str(c2_tok).isalpha() and _canon(r_tok) == _canon(c2_tok)) else "Replacement"
                        rows.append({
                            "raw_index": ii, "raw_token": r_tok, "raw_start": r_start, "raw_end": r_end,
                            "corr_index": jj, "corr_token": c2_tok, "corr_start": c2_start, "corr_end": c2_end,
                            "op": "replace", "equal_ci": (_canon(r_tok) == _canon(c2_tok)), "error_type": err
                        })
                        ii += 1; jj += 1
                        continue
                    # Plain positional pairing of one raw token with one corrected token.
                    err = "Spelling" if (str(r_tok).isalpha() and str(c_tok).isalpha() and _canon(r_tok) == _canon(c_tok)) else "Replacement"
                    rows.append({
                        "raw_index": ii, "raw_token": r_tok, "raw_start": r_start, "raw_end": r_end,
                        "corr_index": jj, "corr_token": c_tok, "corr_start": c_start, "corr_end": c_end,
                        "op": "replace", "equal_ci": (_canon(r_tok) == _canon(c_tok)), "error_type": err
                    })
                    ii += 1; jj += 1
                elif ii < i2:
                    # Corrected side exhausted: remaining raw tokens are deletions.
                    r_tok = raw_tokens[ii]; r_start, r_end = raw_spans[ii]
                    rows.append({
                        "raw_index": ii, "raw_token": r_tok, "raw_start": r_start, "raw_end": r_end,
                        "corr_index": None, "corr_token": None, "corr_start": None, "corr_end": None,
                        "op": "delete", "equal_ci": False,
                        "error_type": "PunctuationDeletion" if not _is_word(r_tok) else "Deletion"
                    })
                    ii += 1
                else:
                    # Raw side exhausted: remaining corrected tokens are insertions.
                    c_tok = corr_tokens[jj]; c_start, c_end = corr_spans[jj]
                    rows.append({
                        "raw_index": None, "raw_token": None, "raw_start": None, "raw_end": None,
                        "corr_index": jj, "corr_token": c_tok, "corr_start": c_start, "corr_end": c_end,
                        "op": "insert", "equal_ci": False,
                        "error_type": "PunctuationInsertion" if not _is_word(c_tok) else "Insertion"
                    })
                    jj += 1
        elif tag == "delete":
            # Raw tokens with no counterpart in the corrected text.
            for ri in range(i1, i2):
                r_tok = raw_tokens[ri]; r_start, r_end = raw_spans[ri]
                rows.append({
                    "raw_index": ri, "raw_token": r_tok, "raw_start": r_start, "raw_end": r_end,
                    "corr_index": None, "corr_token": None, "corr_start": None, "corr_end": None,
                    "op": "delete", "equal_ci": False,
                    "error_type": "PunctuationDeletion" if not _is_word(r_tok) else "Deletion"
                })
        elif tag == "insert":
            # Corrected tokens with no counterpart in the raw text.
            for ci in range(j1, j2):
                c_tok = corr_tokens[ci]; c_start, c_end = corr_spans[ci]
                rows.append({
                    "raw_index": None, "raw_token": None, "raw_start": None, "raw_end": None,
                    "corr_index": ci, "corr_token": c_tok, "corr_start": c_start, "corr_end": c_end,
                    "op": "insert", "equal_ci": False,
                    "error_type": "PunctuationInsertion" if not _is_word(c_tok) else "Insertion"
                })
    return rows

def run_mapping_only(df_corr: pd.DataFrame, *, id_col="ID", raw_col="Raw text", corr_col="Corrected text (8)"):
    """Build the token-level map and the per-text table from corrected data.

    Args:
        df_corr: Frame holding the ID, raw-text and corrected-text columns.
        id_col: Name of the ID column.
        raw_col: Name of the raw-text column.
        corr_col: Name of the corrected-text column.

    Returns:
        ``(df_map, df_texts)``. ``df_map`` has one row per ``build_word_map``
        record plus the ``ID`` and the placeholder columns ``TITLE``,
        ``DIALOGUE``, ``DialogueSpanID``, ``Sentence Boundaries`` and
        ``BoundaryCheck``; its index/offset columns are numeric. ``df_texts``
        has one row per input row with the texts and the two JSON columns.
        Both frames carry their full column set even when ``df_corr`` has no
        rows.

    Raises:
        TypeError: when ``df_corr`` is not a DataFrame.
        KeyError: when a required column is missing.
    """
    if not isinstance(df_corr, pd.DataFrame):
        raise TypeError("run_mapping_only: df_corr")
    need = {id_col, raw_col, corr_col}
    missing = need - set(df_corr.columns)
    if missing:
        raise KeyError(f"run_mapping_only missing columns: {missing}")

    record_keys = [
        "raw_index", "raw_token", "raw_start", "raw_end",
        "corr_index", "corr_token", "corr_start", "corr_end",
        "op", "equal_ci", "error_type",
    ]
    map_cols = ["ID"] + record_keys
    # dict.fromkeys de-duplicates while keeping order, in case a caller-supplied
    # column name coincides with a fixed one.
    text_cols = list(dict.fromkeys(
        ["ID", raw_col, corr_col, "NarrativeTagsJSON", "DialogueSpansJSON"]
    ))

    rows = []
    texts_rows = []
    for _, r in df_corr.iterrows():
        ID = str(r[id_col])
        raw = str(r[raw_col]) if not pd.isna(r[raw_col]) else ""
        corr = str(r[corr_col]) if not pd.isna(r[corr_col]) else ""
        for rec in build_word_map(raw, corr):
            row = {"ID": ID}
            row.update((key, rec.get(key)) for key in record_keys)
            rows.append(row)
        texts_rows.append({
            "ID": ID,
            raw_col: raw,
            corr_col: corr,
            "NarrativeTagsJSON": r.get("NarrativeTagsJSON", "[]"),
            "DialogueSpansJSON": r.get("DialogueSpansJSON", "[]"),
        })

    # Naming the columns keeps the schema intact when there are no rows at all.
    df_map = pd.DataFrame(rows, columns=map_cols)
    for c in ["corr_index","corr_start","corr_end","raw_index","raw_start","raw_end"]:
        df_map[c] = pd.to_numeric(df_map[c], errors="coerce")
    for c, default in [
        ("TITLE", False),
        ("DIALOGUE", False),
        ("DialogueSpanID", pd.NA),
        ("Sentence Boundaries",""),
        ("BoundaryCheck","")
    ]:
        if c not in df_map.columns:
            df_map[c] = default

    df_texts = pd.DataFrame(texts_rows, columns=text_cols)
    return df_map, df_texts


# ---------- 8) Sentence IDs and flags ----------
# Lower-cased tokens, each ending in a period, that are looked up as abbreviations.
ABBREV = { "mr.","mrs.","ms.","dr.","prof.","sr.","jr.","st.","vs.","etc.","e.g.","i.e.","cf.","fig.","ex.","no.",
    "approx.","circa.","ca.","dept.","est.","misc.","rev.","jan.","feb.","mar.","apr.","jun.","jul.",
    "aug.","sep.","sept.","oct.","nov.","dec." }
# Sentence-terminal punctuation tokens.
TERMINALS = {".","!","?","‚Ä¶","...","?!","!?"}
# Closing and opening bracket / quote tokens.
# NOTE(review): the non-ASCII literals in these sets look mis-decoded — confirm the file's encoding.
CLOSERS   = {")","]","}","‚Äù","‚Äô","¬ª"}
OPENERS   = {"(","[","{","‚Äú","‚Äò","¬´"}

def _require_df(df, name):
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"{name} is None or not a pandas.DataFrame")
    return df

def assign_corr_sentence_ids(df_map: pd.DataFrame) -> pd.DataFrame:
    """Number the sentences of every text and store the result in ``CorrSentenceID``.

    Tokens are walked per ``ID`` in ``corr_index`` order (rows without a numeric
    ``corr_index`` go last, ties keep their original row order). A counter
    starting at 0 is incremented each time a new sentence begins, i.e. at the
    first token that follows a sentence terminal and is not a trailing closer.

    Fix over the previous version: a spaced ellipsis (three or more consecutive
    ``"."`` tokens) is now consumed as one unit. Previously only the first dot
    was consumed, so the second dot opened a new sentence and the ellipsis was
    split across two sentence ids.

    Args:
        df_map: Token-level map. ``RowID`` (preferred) or ``ID`` identifies the
            text; if neither exists the index is used. Tokens are read from
            ``corr_token`` when present, else from ``raw_token``.

    Returns:
        A copy of ``df_map`` (original row order) with a string ``ID`` column,
        a ``corr_index`` column (added as NaN when missing) and
        ``CorrSentenceID``.

    Raises:
        TypeError: If ``df_map`` is not a DataFrame.
        KeyError: If neither ``corr_token`` nor ``raw_token`` exists.

    NOTE(review): the abbreviation / initial / "1." checks inspect the token
    *before* a standalone "." and expect that token to already end in a dot
    (e.g. "mr."). Whether the tokenizer ever emits such a pair is not visible
    here -- confirm against the tokenizer.
    NOTE(review): a straight '"' directly after a terminal is always treated as
    an opening quote, so a closing straight quote after "." starts the next
    sentence. Resolving this needs quote-parity tracking and is left unchanged.
    """
    df = _require_df(df_map, "assign_corr_sentence_ids: df_map").copy()
    if "RowID" in df.columns:
        df["ID"] = df["RowID"].astype(str)
    elif "ID" in df.columns:
        df["ID"] = df["ID"].astype(str)
    else:
        df["ID"] = df.index.astype(str)
    if "corr_index" not in df.columns:
        df["corr_index"] = np.nan
    # Sort key: numeric corr_index (missing -> very large), with a tiny
    # row-position offset as a tie-breaker.
    df["_rowpos"] = np.arange(len(df))
    df["_sort"] = pd.to_numeric(df["corr_index"], errors="coerce").fillna(1e12) + df["_rowpos"]*1e-9

    RE_INITIAL = re.compile(r"^[A-Z]\.$")
    RE_INITIAL_PAIR = re.compile(r"^[A-Z]\.[A-Z]\.$")
    RE_NUM_DOT = re.compile(r"^\d+\.$")
    RE_SECTION = re.compile(r"^\d+(?:\.\d+){1,3}$")
    RE_DOT_TAIL = re.compile(r"^\.\d+$")

    def _is_terminal(tok, prev_tok, next_tok):
        """Return True when ``tok`` ends a sentence given its neighbours."""
        t = tok.strip()
        if not t:
            return False
        if t in {"‚Ä¶","...","?!","!?"}:
            return True
        if t in {"!","?"}:
            return True
        if t == ".":
            p = prev_tok.strip().lower()
            n = next_tok.strip()
            if p in ABBREV:
                return False
            if RE_INITIAL.fullmatch(prev_tok) or RE_INITIAL_PAIR.fullmatch(prev_tok):
                return False
            if RE_SECTION.fullmatch(prev_tok):
                return False
            if RE_NUM_DOT.fullmatch(prev_tok) and (n and re.match(r"[A-Za-z(‚Äú\"'\[]", n)):
                return False
            if RE_DOT_TAIL.fullmatch(n):
                return False
            if n.isdigit():
                return False
            return True
        return False

    df["CorrSentenceID"] = pd.NA
    for ID, g in df.sort_values(["ID","_sort"], kind="mergesort").groupby("ID", sort=False):
        toks = (g["corr_token"] if "corr_token" in g.columns else g["raw_token"]).astype(str).tolist()
        sids, sid, pending = [], 0, False
        i = 0
        while i < len(toks):
            tok = toks[i].strip()
            prev_tok = toks[i-1].strip() if i > 0 else ""
            next_tok = toks[i+1].strip() if i+1 < len(toks) else ""

            # Spaced ellipsis: three or more consecutive "." tokens belong to
            # the current sentence as a whole and arm the sentence break.
            if toks[i] == ".":
                run = 1
                while i + run < len(toks) and toks[i + run] == ".":
                    run += 1
                if run >= 3:
                    sids.extend([sid] * run)
                    i += run
                    pending = True
                    continue

            if pending:
                quote_opens = tok == '"' and (prev_tok == "" or prev_tok in TERMINALS or prev_tok in OPENERS)
                if tok in CLOSERS or (tok == '"' and not quote_opens):
                    # Trailing closer: still part of the sentence that just ended.
                    sids.append(sid)
                    i += 1
                    continue
                # An opener or any other token starts the next sentence.
                sid += 1
                pending = False
                sids.append(sid)
                i += 1
                continue

            sids.append(sid)
            i += 1
            if _is_terminal(tok, prev_tok, next_tok):
                pending = True
        df.loc[g.index, "CorrSentenceID"] = pd.Series(sids, index=g.index).astype("Int64")

    df.drop(columns=["_rowpos","_sort"], inplace=True, errors="ignore")
    return df


def ensure_dialogue_and_tags(df_texts):
    """Guarantee the two JSON tag columns exist, then delegate to ``ensure_dialogue_json``.

    Works on a copy of ``df_texts``. ``DialogueSpansJSON`` and
    ``NarrativeTagsJSON`` are created with the value ``"[]"`` when absent, and
    the frame is passed to ``ensure_dialogue_json`` with
    ``corrected_col="Corrected text (8)"``; that call's result is returned.
    """
    frame = df_texts.copy()
    for json_col in ("DialogueSpansJSON", "NarrativeTagsJSON"):
        if json_col not in frame.columns:
            frame[json_col] = "[]"
    return ensure_dialogue_json(frame, corrected_col="Corrected text (8)")

def mark_title_and_dialogue(df_map: pd.DataFrame, df_texts: pd.DataFrame) -> pd.DataFrame:
    """Flag title and dialogue tokens in the token map and add ``SentenceRef``.

    For every ``ID`` the character spans stored in ``NarrativeTagsJSON``
    (entries with ``type == "title"``) and ``DialogueSpansJSON`` of
    ``df_texts`` are compared with each token's ``corr_start``/``corr_end``.
    Overlapping tokens get ``TITLE = True`` or ``DIALOGUE = True`` plus the
    1-based position of the span in ``DialogueSpanID``. Title tokens whose
    ``Sentence Boundaries`` cell is empty receive the label ``"Title"``.

    Changes over the previous version:
      * Groups are processed with an explicit loop instead of
        ``groupby(...).apply``. The callback reads the grouping column ``ID``,
        which pandas has deprecated inside ``apply`` (it already warned at
        runtime). The result is always ordered by ``ID`` and then by
        ``CorrSentenceID``/``corr_index`` within each text.
      * Malformed tag payloads (non-list JSON, non-dict entries, non-numeric
        bounds) are skipped instead of raising; the title branch already
        skipped non-dict entries, dialogue spans now do the same.
      * An empty map no longer depends on ``apply`` behaviour for empty input.

    Args:
        df_map: Token map containing ``CorrSentenceID``, ``corr_index``,
            ``corr_start``, ``corr_end`` and ``ID`` (or ``RowID``).
        df_texts: One row per text with ``ID`` and the JSON tag columns.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the additional
        ``SentenceRef`` column (``"<ID>_s<3-digit sentence id>"``; ``000`` when
        the sentence id is missing).
    """
    df = df_map.copy()
    for c, default in [
        ("TITLE", False),
        ("DIALOGUE", False),
        ("DialogueSpanID", pd.NA),
        ("Sentence Boundaries", ""),
        ("BoundaryCheck", ""),
    ]:
        if c not in df.columns:
            df[c] = default

    df_texts = ensure_dialogue_json(df_texts, corrected_col="Corrected text (8)")
    tags_by_id = dict(zip(df_texts["ID"].astype(str), df_texts["NarrativeTagsJSON"]))
    dlg_by_id  = dict(zip(df_texts["ID"].astype(str), df_texts["DialogueSpansJSON"]))

    def _loads(x):
        """Decode a JSON payload; anything that is not a list becomes []."""
        try:
            value = json.loads(x) if isinstance(x, str) else x
        except Exception:
            return []
        return value if isinstance(value, list) else []

    def _bounds(entry):
        """Return ``(start, end)`` as ints, or None when the entry is unusable."""
        if not isinstance(entry, dict):
            return None
        try:
            return int(entry.get("start", 0) or 0), int(entry.get("end", 0) or 0)
        except (TypeError, ValueError):
            return None

    if "ID" not in df.columns and "RowID" in df.columns:
        df["ID"] = df["RowID"].astype(str)

    def per_id(block: pd.DataFrame) -> pd.DataFrame:
        g  = block.sort_values(["CorrSentenceID", "corr_index"], kind="mergesort")
        ID = str(g["ID"].iat[0])
        # Tokens without a corrected position get (0, 0) and can never overlap
        # a span with a non-negative start.
        starts = pd.to_numeric(g["corr_start"], errors="coerce").fillna(0).astype(int)
        ends   = pd.to_numeric(g["corr_end"],   errors="coerce").fillna(0).astype(int)
        for t in _loads(tags_by_id.get(ID, "[]")):
            if isinstance(t, dict) and t.get("type") == "title":
                span = _bounds(t)
                if span is None:
                    continue
                st, en = span
                g.loc[(starts < en) & (ends > st), "TITLE"] = True
        is_title = g["TITLE"].astype(bool)
        g.loc[is_title, "Sentence Boundaries"] = g.loc[is_title, "Sentence Boundaries"].replace({"": "Title"})
        # span_idx is the 1-based position in the stored list, so skipping a
        # malformed entry does not renumber the remaining spans.
        for span_idx, sp in enumerate(_loads(dlg_by_id.get(ID, "[]")), start=1):
            span = _bounds(sp)
            if span is None:
                continue
            st, en = span
            mask = (starts < en) & (ends > st)
            g.loc[mask, "DIALOGUE"] = True
            g.loc[mask, "DialogueSpanID"] = span_idx
        return g

    blocks = [per_id(block) for _, block in df.groupby("ID", sort=True)]
    if blocks:
        df = pd.concat(blocks).reset_index(drop=True)
    else:
        df = df.reset_index(drop=True)

    df["SentenceRef"] = (
        df["ID"].astype(str) + "_s" +
        df["CorrSentenceID"].apply(lambda x: f"{int(x):03d}" if pd.notna(x) else "000")
    )
    return df

def add_dialogue_boundary_flags(df_map_in: pd.DataFrame) -> pd.DataFrame:
    """Label the first and last non-quote token of every dialogue stretch.

    Rows with ``DIALOGUE`` set and a ``DialogueSpanID`` are grouped by
    ``(ID, CorrSentenceID, DialogueSpanID)``. In each group the first token
    that is not an opening quote gets ``"Dialogue Beginning"`` and the last
    token that is not a closing quote gets ``"Dialogue Ending"`` in
    ``Dialogue Boundaries``; ``DialogueBoundaryCheck`` receives the matching
    ``"Correct Dialogue ..."`` label. New labels are appended to existing cell
    content with ``" | "``.

    Fix over the previous version: missing cells are blanked *before* the cast
    to ``str``. The old ``.astype(str).fillna("")`` turned NaN into the text
    ``"nan"`` first, so such cells ended up as ``"nan | Dialogue Beginning"``.

    Args:
        df_map_in: Token map with ``ID``, ``CorrSentenceID`` and ``corr_token``.

    Returns:
        A copy with both label columns. If ``DialogueSpanID`` or ``DIALOGUE``
        is absent the copy is returned right after the columns are added;
        otherwise it is sorted by ID / sentence / ``corr_index`` and re-indexed
        from 0.
    """
    df = df_map_in.copy()
    for c in ["Dialogue Boundaries", "DialogueBoundaryCheck"]:
        if c not in df.columns:
            df[c] = ""
    if "DialogueSpanID" not in df.columns or "DIALOGUE" not in df.columns:
        return df

    def _is_open_quote(tok):
        t = str(tok or "").strip()
        return t in QUOTE_OPENERS or t == '"'
    def _is_close_quote(tok):
        t = str(tok or "").strip()
        return t in QUOTE_CLOSERS or t == '"'

    def _append_labels(frame, column, updates):
        """Append the de-duplicated, sorted labels of ``updates`` to ``column``."""
        col = frame[column].astype(object)
        # Blank out missing values first so they cannot become "nan" / "<NA>".
        col = col.where(col.notna(), "").astype(str)
        for idx, labs in updates.items():
            joined = " | ".join(sorted(set(labs)))
            existing = col.loc[idx]
            col.loc[idx] = (existing + " | " if existing else "") + joined
        frame[column] = col

    sort_cols = ["ID","CorrSentenceID"]
    if "corr_index" in df.columns:
        sort_cols.append("corr_index")
    df = df.sort_values(sort_cols, kind="mergesort")

    updates_boundaries = {}
    updates_checks = {}

    q = df[df["DIALOGUE"].astype(bool) & df["DialogueSpanID"].notna()]
    for (ID, SID, DID), g in q.groupby(["ID","CorrSentenceID","DialogueSpanID"], sort=False):
        labels = g.index.tolist()
        tokens = g["corr_token"].astype(str).tolist()
        # First token that is not an opening quote / last that is not a closing quote.
        begin_idx = next((idx for idx, tok in zip(labels, tokens) if not _is_open_quote(tok)), None)
        end_idx = next((idx for idx, tok in zip(reversed(labels), reversed(tokens)) if not _is_close_quote(tok)), None)
        if begin_idx is not None:
            updates_boundaries.setdefault(begin_idx, []).append("Dialogue Beginning")
            updates_checks.setdefault(begin_idx, []).append("Correct Dialogue Beginning")
        if end_idx is not None:
            updates_boundaries.setdefault(end_idx, []).append("Dialogue Ending")
            updates_checks.setdefault(end_idx, []).append("Correct Dialogue Ending")

    if updates_boundaries:
        _append_labels(df, "Dialogue Boundaries", updates_boundaries)

    if updates_checks:
        _append_labels(df, "DialogueBoundaryCheck", updates_checks)

    return df.reset_index(drop=True)

# sentence boundary flags
# Tokens accepted as the closing punctuation of a sentence when the
# "Sentence Ending" row is searched.
TERMINALS_HARD = {".","!","?","‚Ä¶","...","?!","!?"}
# Tokens skipped at the front of a sentence when the "Sentence Beginning" row is searched.
OPENING_PUNCT  = {'"', "‚Äú", "‚Äò", "¬´", "(", "[", "{"}
# Closing counterparts of OPENING_PUNCT (not referenced by the helpers directly below).
CLOSING_PUNCT  = {'"', "‚Äù", "‚Äô", "¬ª", ")", "]", "}"}

def _first_cap_or_digit(s: str) -> bool:
    if not isinstance(s, str):
        return False
    m = re.search(r"[A-Za-z0-9]", s)
    if not m:
        return False
    ch = s[m.start()]
    return ch.isdigit() or (ch.isalpha() and ch.isupper())

def _first_begin_row(g: pd.DataFrame):
    """Pick the index label of the row that represents the sentence beginning.

    Title rows are never candidates. The first pass returns the first token
    that is not in ``OPENING_PUNCT`` and for which ``_first_cap_or_digit`` is
    true. If no such token exists, the second pass returns the first token
    containing any word character. ``None`` is returned when both passes fail.

    NOTE(review): the first pass does not stop at a lower-case word, so a
    capitalised token later in the sentence is preferred over a lower-case
    first word -- confirm this is intended.
    """
    candidates = [
        (label, token)
        for label, token, in_title in zip(
            g.index,
            g["corr_token"].astype(str).tolist(),
            g["TITLE"].astype(bool).tolist(),
        )
        if not in_title
    ]

    # Pass 1: first capitalised / numeric token, ignoring opening punctuation.
    for label, token in candidates:
        stripped = token.strip()
        if stripped not in OPENING_PUNCT and _first_cap_or_digit(stripped):
            return label

    # Pass 2: first token that contains a word character at all.
    for label, token in candidates:
        if re.search(r"\w", token or ""):
            return label

    return None

def _last_terminal_row(g: pd.DataFrame):
    """Return the index label of the last token found in ``TERMINALS_HARD``.

    Tokens are compared after stripping surrounding whitespace. ``None`` is
    returned when the group contains no terminal token.
    """
    labels = list(g.index)
    tokens = g["corr_token"].astype(str).tolist()
    # Scan from the end so the first hit is the last terminal of the group.
    for pos in reversed(range(len(tokens))):
        if tokens[pos].strip() in TERMINALS_HARD:
            return labels[pos]
    return None

def add_sentence_boundary_flags(df_map_in: pd.DataFrame) -> pd.DataFrame:
    """Mark the beginning and ending row of every sentence and grade them.

    Rows are grouped by ``(ID, CorrSentenceID)``. Per sentence:
      * the row chosen by ``_first_begin_row`` gets ``"Sentence Beginning"``
        and is graded ``"Correct Beginning"`` when its token passes
        ``_first_cap_or_digit``, else ``"Incorrect Beginning"``;
      * the row chosen by ``_last_terminal_row`` gets ``"Sentence Ending"`` and
        is graded ``"Correct Ending"`` unless another terminal token follows
        it within the sentence;
      * if the sentence has no terminal token, its last row gets
        ``"Sentence Ending"`` / ``"Incorrect Ending"``.
    Labels are appended to existing cell content with ``" | "``.

    Fix over the previous version: missing cells are blanked *before* the cast
    to ``str``. The old ``.astype(str).fillna("")`` turned NaN into the text
    ``"nan"`` first, producing cells such as ``"nan | Sentence Beginning"``.

    Args:
        df_map_in: Token map with ``ID``, ``CorrSentenceID``, ``corr_token``
            and ``TITLE``.

    Returns:
        A copy sorted by ID / sentence / ``corr_index`` with a fresh 0..n-1
        index and updated ``Sentence Boundaries`` / ``BoundaryCheck`` columns.

    Raises:
        TypeError: If ``df_map_in`` is not a DataFrame.
        KeyError: If ``CorrSentenceID`` is missing.
    """
    df = _require_df(df_map_in, "add_sentence_boundary_flags: df_map_in").copy()
    for c, default in [("Sentence Boundaries",""), ("BoundaryCheck","")]:
        if c not in df.columns:
            df[c] = default
    if "CorrSentenceID" not in df.columns:
        raise KeyError("add_sentence_boundary_flags requires CorrSentenceID. Run assign_corr_sentence_ids first.")

    def _append_labels(frame, column, updates):
        """Append the de-duplicated, sorted labels of ``updates`` to ``column``."""
        col = frame[column].astype(object)
        # Blank out missing values first so they cannot become "nan" / "<NA>".
        col = col.where(col.notna(), "").astype(str)
        for idx, lab_list in updates.items():
            joined = " | ".join(sorted(set(lab_list)))
            existing = col.loc[idx]
            col.loc[idx] = (existing + " | " if existing else "") + joined
        frame[column] = col

    sort_cols = ["ID","CorrSentenceID"]
    if "corr_index" in df.columns:
        sort_cols.append("corr_index")
    df = df.sort_values(sort_cols, kind="mergesort")
    updates_boundaries = {}
    updates_checks = {}
    for (ID, SID), g in df.groupby(["ID","CorrSentenceID"], sort=False):
        if g.empty:
            continue
        begin_idx = _first_begin_row(g)
        end_idx   = _last_terminal_row(g)
        if begin_idx is not None:
            updates_boundaries.setdefault(begin_idx, []).append("Sentence Beginning")
            tok = str(df.loc[begin_idx, "corr_token"])
            verdict = "Correct Beginning" if _first_cap_or_digit(tok) else "Incorrect Beginning"
            updates_checks.setdefault(begin_idx, []).append(verdict)
        if end_idx is not None:
            updates_boundaries.setdefault(end_idx, []).append("Sentence Ending")
            toks = g["corr_token"].astype(str).tolist()
            idxs = list(g.index)
            chosen_pos = idxs.index(end_idx)
            # Safety check: any terminal token after the chosen row means the
            # chosen row was not the real end of the sentence.
            later_term = any(t.strip() in TERMINALS_HARD for t in toks[chosen_pos + 1:])
            verdict = "Incorrect Ending" if later_term else "Correct Ending"
            updates_checks.setdefault(end_idx, []).append(verdict)
        else:
            # No terminal punctuation at all: flag the last row as a bad ending.
            last_idx = g.index[-1]
            updates_boundaries.setdefault(last_idx, []).append("Sentence Ending")
            updates_checks.setdefault(last_idx, []).append("Incorrect Ending")

    if updates_boundaries:
        _append_labels(df, "Sentence Boundaries", updates_boundaries)

    if updates_checks:
        _append_labels(df, "BoundaryCheck", updates_checks)

    return df.reset_index(drop=True)


In [None]:


# ---------- 11) Step 9 summariser ----------
NO_SPACE_BEFORE = set(list(".,;:!?)]}\"'¬ª‚Äù‚Äô‚Ä¶"))
NO_SPACE_AFTER  = set(list("([{{\"'¬´‚Äú‚Äò"))

def _detok(tokens):
    out = []
    for t in tokens:
        if t is None or (isinstance(t, float) and math.isnan(t)):
            continue
        t = str(t)
        if not out:
            out.append(t)
            continue
        prev = out[-1]
        if t in NO_SPACE_BEFORE or re.fullmatch(r"[.]{3}", t):
            out[-1] = prev + t
        elif prev in NO_SPACE_AFTER:
            out[-1] = prev + t
        else:
            out.append(" " + t)
    s = "".join(out)
    s = re.sub(r"\.\s*\.\s*\.", "...", s)
    return s.strip()

def _summarize_sentence(g: pd.DataFrame, tags_row: pd.Series) -> pd.Series:
    """Collapse the token rows of one sentence into a single summary record.

    Args:
        g: Token rows of one sentence. ``corr_token``, ``Sentence Boundaries``
            and ``SentenceRef`` are required; ``raw_token``, ``op`` and
            ``BoundaryCheck`` are used when present.
        tags_row: Mapping-like row supplying ``NarrativeTagsJSON`` and
            ``DialogueSpansJSON`` (``"[]"`` is used for a missing key).

    Returns:
        A Series with the sentence reference, detokenised corrected and raw
        text, the token count, per-operation counts (NaN when there is no
        ``op`` data), the beginning/ending verdicts (1, 0 or NaN) and the two
        tag payloads.
    """
    nothing = pd.Series([], dtype=object)

    def _resolve(check_series: pd.Series, good: str, bad: str):
        # 1 when the "good" label occurs, 0 when only the "bad" one does, NaN otherwise.
        if check_series.empty:
            return np.nan
        joined = " | ".join(check_series.dropna().astype(str))
        if good in joined:
            return 1
        if bad in joined:
            return 0
        return np.nan

    # Texts
    corrected_sentence = _detok(g["corr_token"].tolist())
    raw_kept = [tok for tok in g.get("raw_token", nothing).tolist() if not pd.isna(tok)]
    raw_sentence = _detok(raw_kept) if raw_kept else ""

    # Boundary verdicts come from the rows carrying the boundary labels.
    boundaries = g["Sentence Boundaries"]
    begin_rows = g[boundaries.str.contains("Sentence Beginning", na=False)]
    end_rows   = g[boundaries.str.contains("Sentence Ending",   na=False)]
    begin_ok = _resolve(begin_rows.get("BoundaryCheck", nothing), "Correct Beginning", "Incorrect Beginning")
    end_ok   = _resolve(end_rows.get("BoundaryCheck", nothing), "Correct Ending", "Incorrect Ending")

    # Edit-operation tallies
    ops = g.get("op", nothing)
    have_ops = not ops.empty

    def _tally(label, negate=False):
        if not have_ops:
            return np.nan
        hits = (ops != label) if negate else (ops == label)
        return int(hits.sum())

    rec = {}
    rec["SentenceRef"] = g["SentenceRef"].iloc[0]
    rec["CorrectedSentence"] = corrected_sentence
    rec["RawSentence"] = raw_sentence
    rec["TokensInSentence"] = int(len(g))
    rec["EditsInSentence"] = _tally("equal", negate=True)
    rec["EqualsInSentence"] = _tally("equal")
    rec["Insertions"] = _tally("insert")
    rec["Deletions"] = _tally("delete")
    rec["Replacements"] = _tally("replace")
    rec["CorrectBeginning"] = begin_ok
    rec["CorrectEnding"] = end_ok
    rec["NarrativeTagsJSON"] = tags_row.get("NarrativeTagsJSON","[]")
    rec["DialogueSpansJSON"] = tags_row.get("DialogueSpansJSON","[]")
    return pd.Series(rec)


In [None]:


# ---------- 12) Step 8 runner ----------
def run_step8(df_preprocessed, raw_col="Raw text", id_col="ID", client=None, model=MODEL_ID, use_mock=USE_MOCK):
    """Run the step-8 chain: correct, map, then flag titles, dialogue and boundaries.

    Args:
        df_preprocessed: Input frame handed to ``run_correct_only``.
        raw_col: Name of the raw-text column.
        id_col: Name of the identifier column.
        client: Client object forwarded to ``run_correct_only``; replaced by
            ``None`` when ``use_mock`` is true.
        model: Model identifier forwarded to ``run_correct_only``. The default
            is read from ``MODEL_ID`` once, when this function is defined.
        use_mock: Forwarded to ``run_correct_only``. The default is read from
            ``USE_MOCK`` once, when this function is defined.

    Returns:
        ``(df_texts, df_map)`` -- note the order: texts first, token map second.
    """
    # Corrected text is written to the "Corrected text (8)" column.
    # NOTE(review): presumably this is the model call -- confirm in run_correct_only.
    df_corr = run_correct_only(
        df_preprocessed, text_col=raw_col, id_col=id_col,
        client=None if use_mock else client, model=model, use_mock=use_mock,
        out_col="Corrected text (8)"
    )
    # Build the token-level map and the per-text table from raw vs corrected text.
    df_map, df_texts = run_mapping_only(df_corr, id_col=id_col, raw_col=raw_col, corr_col="Corrected text (8)")
    # mark_title_and_dialogue applies ensure_dialogue_json again internally;
    # this call is what puts the result into the returned df_texts.
    df_texts = ensure_dialogue_json(df_texts, corrected_col="Corrected text (8)")
    # Order matters: sentence ids are needed by every later step.
    df_map = assign_corr_sentence_ids(df_map)
    df_map = mark_title_and_dialogue(df_map, df_texts)
    df_map = add_sentence_boundary_flags(df_map)
    df_map = add_dialogue_boundary_flags(df_map)
    return df_texts, df_map


# ---------- 13) Step 9 runner ----------
def run_step9(df_map, df_texts_with_tags):
    """Build the sentence-level summary table from the checked token map.

    Title rows are dropped, the remaining rows are grouped by
    ``(ID, CorrSentenceID)`` and each group is condensed by
    ``_summarize_sentence``. The result is sorted by ``SentenceRef``.

    Fixes over the previous version:
      * No sentences to summarise (empty map, or only title rows) used to
        crash with ``KeyError: 'SentenceRef'`` while sorting an empty,
        column-less frame; an empty table with the expected columns is
        returned instead.
      * A map without a ``TITLE`` column is treated as having no title rows
        rather than failing with a bare ``KeyError``.
      * Tag lookup compares IDs as strings, matching
        ``mark_title_and_dialogue``. Sentence groups carry string IDs, so an
        integer ``ID`` column in the texts table previously never matched and
        every sentence silently received ``"[]"`` tags.
      * Duplicate IDs in the texts table no longer hand a whole DataFrame to
        the summariser; the last row per ID wins, as in
        ``mark_title_and_dialogue``.

    Args:
        df_map: Token map with at least ``ID``, ``CorrSentenceID``,
            ``corr_token``, ``Sentence Boundaries``, ``BoundaryCheck`` and
            ``SentenceRef``.
        df_texts_with_tags: One row per text with ``ID``,
            ``NarrativeTagsJSON`` and ``DialogueSpansJSON``.

    Returns:
        DataFrame with one row per non-title sentence.

    Raises:
        KeyError: If a required ``df_map`` column is missing.
    """
    need = {"ID","CorrSentenceID","corr_token","Sentence Boundaries","BoundaryCheck","SentenceRef"}
    missing = need - set(df_map.columns)
    if missing:
        raise KeyError(f"df_map missing: {missing}")
    sort_cols = ["ID","CorrSentenceID"]
    if "corr_index" in df_map.columns:
        sort_cols.append("corr_index")
    wm = df_map.sort_values(sort_cols, kind="mergesort").copy()
    if "TITLE" in wm.columns:
        wm_nontitle = wm[~wm["TITLE"].astype(bool)].copy()
    else:
        wm_nontitle = wm

    tags_by_id = df_texts_with_tags.set_index("ID")[["NarrativeTagsJSON","DialogueSpansJSON"]].copy()
    tags_by_id.index = tags_by_id.index.astype(str)
    tags_by_id = tags_by_id[~tags_by_id.index.duplicated(keep="last")]
    no_tags = pd.Series({}, dtype=object)

    out = []
    for (ID, SID), g in wm_nontitle.groupby(["ID","CorrSentenceID"], sort=False):
        key = str(ID)
        tags_row = tags_by_id.loc[key] if key in tags_by_id.index else no_tags
        out.append(_summarize_sentence(g, tags_row))

    if not out:
        # Same columns, in the same order, as the records built by _summarize_sentence.
        return pd.DataFrame(columns=[
            "SentenceRef", "CorrectedSentence", "RawSentence", "TokensInSentence",
            "EditsInSentence", "EqualsInSentence", "Insertions", "Deletions",
            "Replacements", "CorrectBeginning", "CorrectEnding",
            "NarrativeTagsJSON", "DialogueSpansJSON",
        ])
    sent_df = pd.DataFrame(out).sort_values(["SentenceRef"], kind="mergesort").reset_index(drop=True)
    return sent_df


In [None]:


# ---------- 14) Save and download ----------
from google.colab import files

def _ensure_dir(path):
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)
    return path


# ---------- 14) Save and download ----------
from google.colab import files

def _ensure_dir(path): os.makedirs(path, exist_ok=True); return path

def save_and_download_step8_9(df_preprocessed, *, raw_col="Raw text", id_col="ID", client=None, model=MODEL_ID, use_mock=USE_MOCK, out_dir="/content"):
    """Run steps 8 and 9, write the three result tables plus a zip, and offer downloads.

    Args:
        df_preprocessed: Input frame passed to ``run_step8``.
        raw_col, id_col: Column names forwarded to ``run_step8``.
        client: Forwarded to ``run_step8``; replaced by ``None`` when
            ``use_mock`` is true.
        model, use_mock: Forwarded to ``run_step8``.
        out_dir: Directory for the timestamped output files (created if needed).

    Returns:
        dict with the four output paths (``step8_texts_path``,
        ``step8_map_path``, ``step9_sentences_path``, ``zip_path``) and the
        three frames (``df_texts_8``, ``df_map_8``, ``sent_df``).
    """
    ts = time.strftime("%Y%m%d_%H%M%S")
    _ensure_dir(out_dir)

    effective_client = None if use_mock else client
    df_texts_8, df_map_8 = run_step8(
        df_preprocessed,
        raw_col=raw_col,
        id_col=id_col,
        client=effective_client,
        model=model,
        use_mock=use_mock,
    )
    sent_df = run_step9(df_map_8, df_texts_8)

    p_texts = os.path.join(out_dir, f"step8_texts_{ts}.csv")
    p_map   = os.path.join(out_dir, f"step8_wordmap_checked_{ts}.csv")
    p_sent  = os.path.join(out_dir, f"step9_sentence_mapping_with_boundaries_{ts}.csv")
    p_zip   = os.path.join(out_dir, f"step8_9_outputs_{ts}.zip")

    # Write the three CSVs, then bundle the same files into one archive.
    csv_outputs = [(df_texts_8, p_texts), (df_map_8, p_map), (sent_df, p_sent)]
    for frame, csv_path in csv_outputs:
        frame.to_csv(csv_path, index=False, encoding="utf-8")
    with zipfile.ZipFile(p_zip, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for _, csv_path in csv_outputs:
            zf.write(csv_path, arcname=os.path.basename(csv_path))

    print("üóÇÔ∏è  Outputs:")
    print("   step8 texts:  ", p_texts)
    print("   step8 map:    ", p_map)
    print("   step9 table:  ", p_sent)
    print("   zip bundle:   ", p_zip)

    # Browser downloads are best-effort: the first failure stops the remaining
    # downloads and is only reported as a hint.
    try:
        for target in (p_texts, p_map, p_sent, p_zip):
            files.download(target)
    except Exception as e:
        print("Download hint:", e)

    return {
        "step8_texts_path": p_texts,
        "step8_map_path": p_map,
        "step9_sentences_path": p_sent,
        "zip_path": p_zip,
        "df_texts_8": df_texts_8,
        "df_map_8": df_map_8,
        "sent_df": sent_df,
    }

In [None]:

# ---------- 15) Run ----------
if __name__ == '__main__':
    # End-to-end run of steps 8 and 9. df_pre, client, MODEL_ID and USE_MOCK
    # are not defined in this cell; they must already exist in the kernel.
    results = save_and_download_step8_9(
        df_pre,
        raw_col="Raw text",
        id_col="ID",
        client=client,
        model=MODEL_ID,
        use_mock=USE_MOCK,
        out_dir="/content"
    )
    # Quick sanity check: (rows, columns) of the texts table, the token map
    # and the sentence table.
    print("Shapes ‚Äî step8:", results["df_texts_8"].shape, results["df_map_8"].shape)
    print("Shape ‚Äî step9 sentences:", results["sent_df"].shape)


Step 8: correcting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21/21 [02:11<00:00,  6.24s/it]
  .apply(per_id)


üóÇÔ∏è  Outputs:
   step8 texts:   /content/step8_texts_20251106_130040.csv
   step8 map:     /content/step8_wordmap_checked_20251106_130040.csv
   step9 table:   /content/step9_sentence_mapping_with_boundaries_20251106_130040.csv
   zip bundle:    /content/step8_9_outputs_20251106_130040.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Shapes ‚Äî step8: (21, 5) (7379, 21)
Shape ‚Äî step9 sentences: (561, 13)


[REDACTED — an API secret key was pasted here. It has been removed from the notebook; revoke and rotate the credential, and load it from an environment variable or secret store instead.]
