<a href="https://colab.research.google.com/github/elijahManPerson/Flappy-Bird/blob/master/_Mechanical_Criteria_Pipeline_GPT_2410__Corrected_only.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data access and library setup

Mounting the Google Drive to access files and save them.

# Step 1: Mount Google Drive
## Purpose:
To access and manipulate files stored in your Google Drive from the Colab environment.

## What each part does





*  drive.mount('/content/drive') starts the Google auth flow so Colab can access your Drive.
* root_mount and root_mydrive are the two paths that get checked; edit them if your Drive is mounted somewhere else.
*  The helper status() prints clear pass or fail messages.
*  Read test lists a few entries to confirm you can read.
*  Write test creates and deletes a tiny file to confirm you can write.

## Actions:

**Import and Mount:** Uses google.colab.drive to mount the drive.
Verification: Checks if the drive is successfully mounted by verifying the existence of the /content/drive/MyDrive directory.

## Outcome:
Access to files within Google Drive is established, allowing the script to read from and write to specific directories.

In [None]:
# ===============================================
# Step 1: Mount Google Drive
# ===============================================
# ==== Drive mount + verification ====
from google.colab import drive
drive.mount('/content/drive')  # force_remount is not passed here; add force_remount=True if you need to force a fresh mount

import os, time

def status(msg, ok=True):
    """Print ``msg`` prefixed with a pass (✅) or fail (❌) marker.

    Args:
        msg: Text shown after the marker.
        ok: ``True`` selects the pass marker, ``False`` the fail marker.
            Defaults to ``True`` so this helper has the same signature as
            the ``status`` helpers defined in the later cells.
    """
    print(("✅ " if ok else "❌ ") + msg)

root_mount = '/content/drive'
root_mydrive = '/content/drive/MyDrive'

# 1) Basic mount checks
status("Drive mount detected at /content/drive", os.path.ismount(root_mount))
status("MyDrive folder present", os.path.isdir(root_mydrive))

# 2) Read test: list a few entries in MyDrive
try:
    entries = os.listdir(root_mydrive)[:5]
    status("Read test passed (listed MyDrive)", True)
    print("   • Sample entries:", entries if entries else "(empty)")
except Exception as e:
    status(f"Read test failed: {e}", False)

# 3) Write test: create and remove a tiny file
probe_path = os.path.join(root_mydrive, "_colab_mount_check.txt")
try:
    with open(probe_path, "w", encoding="utf-8") as f:
        f.write(f"colab mount check {time.time()}\n")
except Exception as e:
    # Kept broad on purpose: this cell is a diagnostic and should report, not crash.
    status(f"Write test failed: {e}", False)
else:
    status("Write test passed (created file)", True)
    # Cleanup is checked separately so that a failed delete is not
    # misreported as a failed write.
    try:
        os.remove(probe_path)
    except Exception as e:
        status(f"Cleanup failed (probe file left at {probe_path}): {e}", False)
    else:
        status("Cleanup passed (deleted file)", True)


Mounted at /content/drive
✅ Drive mount detected at /content/drive
✅ MyDrive folder present
✅ Read test passed (listed MyDrive)
   • Sample entries: ['Colab Notebooks', 'Untitled.gdoc', 'Copy of Untitled.gdoc', 'Russelline Doxology.gdoc', 'Application Letter to St. John Bosco.gdoc']
✅ Write test passed (created file)
✅ Cleanup passed (deleted file)


# Step 2.0 Data upload guide (optional)
###Strict loader for files where:
- first column = ID
- last column  = Raw text
- everything else optional

###Instructions for preparing your CSV

Put your unique identifier in the first column. Name it whatever you like, but “ID” is unambiguous.

Put the original writing text in the last column. Name it “Raw text” for readability.

Any other fields can live between first and last. They’ll be preserved, but not required.

Save as CSV with UTF-8 encoding. If you’re unsure, Excel and Google Sheets default exports are fine; our loader tolerates BOM too.


In [None]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
from google.colab import files

# Build blank DataFrames
df_min = pd.DataFrame(columns=["ID", "Raw text"])
df_ext = pd.DataFrame(columns=[
    "ID", "AU", "TS", "CS/PD", "Voc", "Coh", "Pa", "SS", "Pun", "Spell",
    "Total", "WordCount", "yrlev", "Prompt ID", "Prompt Name", "Raw text"
])

# Save to local workspace
df_min.to_csv("/content/texts_template_min.csv", index=False, encoding="utf-8")
df_ext.to_csv("/content/texts_template_extended.csv", index=False, encoding="utf-8")

# Button callbacks
def on_download_min(_):
    """Button callback: download the minimal CSV template.

    The single positional argument supplied by the click handler is ignored.
    """
    files.download("/content/texts_template_min.csv")

def on_download_ext(_):
    """Button callback: download the extended CSV template.

    The single positional argument supplied by the click handler is ignored.
    """
    files.download("/content/texts_template_extended.csv")

# UI
display(widgets.HTML("<h4>Download a CSV Template</h4>"))
display(widgets.HTML(
    "Use the first column as <b>ID</b> and the last as <b>Raw text</b>.<br>"
    "Everything in between is optional."
))
btn_min = widgets.Button(description="⬇️ Download Minimal Template (ID + Raw text)", button_style="primary")
btn_ext = widgets.Button(description="⬇️ Download Extended Template (Full NAPLAN-style)", button_style="info")

btn_min.on_click(on_download_min)
btn_ext.on_click(on_download_ext)
display(widgets.HBox([btn_min, btn_ext]))



HTML(value='<h4>Download a CSV Template</h4>')

HTML(value='Use the first column as <b>ID</b> and the last as <b>Raw text</b>.<br>Everything in between is opt…

HBox(children=(Button(button_style='primary', description='⬇️ Download Minimal Template (ID + Raw text)', styl…

# Step 2 — Load and Preprocess Data

###Purpose
- Bring a CSV file from Google Drive into Python as a Pandas DataFrame, then ensure the key text column is present, consistently named, and safe to use.
- The rest of the pipeline expects a column called "Raw text". This step prepares that column so later steps do not break.

###What you may need to change
- DATA_PATH: set this to the exact path of your CSV inside Google Drive. If you moved the file or renamed folders, update it here.

###Inputs
- A CSV file located at DATA_PATH. The file should contain one column that holds the original student text or source text. It might be called "Raw text", "Raw Text", "raw_text", or something similar.

###Outputs
- df_preprocessed: a Pandas DataFrame that definitely has a column named "Raw text". If your file used a variant name, the code renames it to "Raw text" so the rest of the notebook can rely on one standard.
- Printed checks that confirm: the path is valid, the file loaded, how many rows are present, how many are non-empty in "Raw text", and a small sample of the first few rows.

###Key ideas
- CSVs created on different systems sometimes include a special marker at the start of the file called a BOM. Using UTF-8 with BOM support avoids header glitches.
- Real-world files often vary in how they label the same concept. We accept common header variants for the text column, then rename to "Raw text" so every downstream function can assume one consistent name.
- It is better to stop early if the text column is missing or empty rather than let subtle errors propagate. This step fails fast with a clear message if something essential is wrong.

###Actions performed in this code
1) Validate your CSV path.  
2) Load the CSV with safe defaults and BOM handling.  
3) Find and standardise the "Raw text" column (case and spacing tolerant).  
4) Fill missing values and report how many rows are usable.  
5) Preview the first few rows.  

###Verification prints
- "CSV path exists" confirms the notebook can see the file. If you get a cross here, fix DATA_PATH.
- "Data loaded. Rows: X, Columns: Y" confirms the file parsed as a table.
- "Non-empty 'Raw text' rows: A of B" shows how many rows actually contain usable text after trimming blanks.
- A preview of the first five rows lets you eyeball whether the data looks right before moving on.

###Common pitfalls
- Wrong path: the file was moved, renamed, or the folder hierarchy changed. Update DATA_PATH.
- Unusual delimiter or BOM: some CSVs use semicolons or tabs, or include a BOM. The loader handles most cases, but if columns look fused together, specify a delimiter explicitly.
- Header named slightly differently: if your text column label is unexpected, the auto-detection usually finds it. If not, either rename the column in the CSV or add your variant to the accepted names in the code.


### NOTE:
####DATA_PATH = "/content/drive/MyDrive/JM/Sandbox/1.Training Data/Data for Testing avg short.csv"
#### RAW_TEXT_ALIASES = {"raw text", "raw_text", "rawtext"}


In [None]:
# ===============================================
# Step 2: Load and Preprocess Data  + optional download
# ===============================================


import os, io, csv
import pandas as pd

def status(msg, ok=True):
    """Write one check result to stdout: ✅ when ``ok`` is truthy, ❌ otherwise."""
    marker = "✅ " if ok else "❌ "
    print(marker + msg)

# ======= EDIT HERE IF NEEDED =======
DATA_PATH = "/content/drive/MyDrive/JM/Sandbox/1.Training Data/Data for Testing avg short.csv"
RAW_TEXT_ALIASES = {"raw text", "raw_text", "rawtext"}
# ===================================

if not os.path.exists(DATA_PATH):
    status(f"File not found: {DATA_PATH}", ok=False)
    raise FileNotFoundError(DATA_PATH)
status("CSV path exists")

def try_read(path, sep, engine=None):
    """Attempt to parse ``path`` as delimited text; return ``None`` on failure.

    Args:
        path: Location of the CSV file.
        sep: Delimiter to use. ``None`` asks pandas to sniff the delimiter,
            which requires the ``python`` engine.
        engine: Optional pandas parser engine, used only when ``sep`` is
            given explicitly.

    Returns:
        The parsed ``DataFrame``, or ``None`` if pandas raised any error
        (best-effort: the caller tries several strategies and keeps the best).
    """
    kwargs = dict(encoding="utf-8-sig", on_bad_lines="skip")
    if sep is None:
        kwargs["sep"] = None
        kwargs["engine"] = "python"  # auto-sniff
    else:
        kwargs["sep"] = sep
        if engine:
            kwargs["engine"] = engine

    # low_memory is a C-parser option. Passing it with engine="python" makes
    # pandas raise ValueError, which previously turned the auto-sniff strategy
    # into a silent, guaranteed failure. Only set it when the C engine is used.
    if kwargs.get("engine", "c") == "c":
        kwargs["low_memory"] = False

    try:
        return pd.read_csv(path, **kwargs)
    except Exception:
        return None

# Try several parsers; keep the best
candidates = [
    ("auto-sniff", None, "python"),
    ("comma", ",", None),
    ("semicolon", ";", None),
    ("tab", "\t", None),
    ("pipe", "|", None),
]

best = None
best_score = (-1, -1)  # (has_raw_text_like, n_cols)

def score_df(df):
    """Rank a parse attempt as ``(has_raw_text_like, n_cols)``.

    A missing or empty frame scores ``(-1, -1)``. Otherwise the first element
    is 1 when a header matches ``RAW_TEXT_ALIASES`` (after trim and lower-case)
    or when both "raw" and "text" appear as separate headers, else 0; the
    second element is the number of columns.
    """
    if df is None or df.empty:
        return (-1, -1)

    normalised = [name.strip().lower() for name in df.columns]

    alias_hit = False
    for name in normalised:
        if name in RAW_TEXT_ALIASES:
            alias_hit = True
            break

    split_header = "raw" in normalised and "text" in normalised
    return (int(alias_hit or split_header), len(normalised))

parsed_by = None
for name, sep, eng in candidates:
    df = try_read(DATA_PATH, sep, eng)
    s = score_df(df)
    if s > best_score:
        best_score, best, parsed_by = s, df, name

if best is None or best.empty:
    status("Failed to read CSV with all strategies", ok=False)
    raise ValueError("Could not parse CSV")

status(f"Parsed using: {parsed_by}. Columns: {len(best.columns)}")

df_preprocessed = best.copy()

# --- Standardise/repair the Raw text column ---
# Map every original header to its trimmed, lower-cased form for matching.
# str() guards against non-string column labels.
cols_norm = {c: str(c).strip().lower() for c in df_preprocessed.columns}

# First header (left to right) whose normalised name is an accepted alias.
raw_col = next((c for c, norm in cols_norm.items() if norm in RAW_TEXT_ALIASES), None)

if raw_col is not None:
    if raw_col != "Raw text":
        df_preprocessed.rename(columns={raw_col: "Raw text"}, inplace=True)
        status(f"Renamed '{raw_col}' to 'Raw text'")
elif "raw" in cols_norm.values() and "text" in cols_norm.values():
    # Split-header case: "Raw" and "text" arrived as two separate columns.
    raw_name = next(k for k, v in cols_norm.items() if v == "raw")
    text_name = next(k for k, v in cols_norm.items() if v == "text")

    # fillna must run BEFORE astype(str); otherwise missing cells become the
    # literal string "nan". Joining the two trimmed parts with one space and
    # stripping again keeps whichever side has content without duplicating it.
    raw_part = df_preprocessed[raw_name].fillna("").astype(str).str.strip()
    text_part = df_preprocessed[text_name].fillna("").astype(str).str.strip()
    df_preprocessed["Raw text"] = (raw_part + " " + text_part).str.strip()

    status(f"Merged split columns '{raw_name}' + '{text_name}' into 'Raw text'")
else:
    status("Raw text column not found after parsing", ok=False)
    print("Columns present:", list(df_preprocessed.columns))
    raise KeyError("'Raw text' column is missing")

# Clean and verify
df_preprocessed["Raw text"] = df_preprocessed["Raw text"].fillna("").astype(str)
total = len(df_preprocessed)
empty = df_preprocessed["Raw text"].str.strip().eq("").sum()
usable = total - empty
status(f"Non-empty 'Raw text' rows: {usable} of {total}")

if usable == 0:
    status("All 'Raw text' entries are empty after cleaning", ok=False)
    raise ValueError("No usable text in 'Raw text'")

# Peek and a quick stat
print("\nFirst 5 rows of 'Raw text':")
print(df_preprocessed[["Raw text"]].head(5))

avg_len = df_preprocessed["Raw text"].str.len().mean()
if pd.isna(avg_len):
    avg_len = 0.0
status(f"Average character length across 'Raw text': {avg_len:.1f}")


✅ CSV path exists
✅ Parsed using: comma. Columns: 15
✅ Non-empty 'Raw text' rows: 21 of 21

First 5 rows of 'Raw text':
                                            Raw text
0  There once was a girl called lilly she had pet...
1  wrire a narrative story abouta search for some...
2  The Failed Submarine I had always wanted go on...
3  The diamond ring Emmy was just having breakfas...
4  If you are locking for a dimiens go to most di...
✅ Average character length across 'Raw text': 1504.6


In [None]:

# ===============================================
# 2A — Detect and standardise ID
# (Start with leftmost column; look rightward for any ID-like name)
# ===============================================


import re
import pandas as pd

def _normalize_id_series(s: pd.Series) -> pd.Series:
    """Return all IDs as clean strings (no .0, no float drift)."""
    s = s.astype(str).str.replace(r"\.0$", "", regex=True)
    def _fix(x):
        if any(c.isalpha() for c in x):
            return x
        try:
            if "." in x or "e" in x.lower():
                f = float(x)
                if f.is_integer():
                    return str(int(f))
        except Exception:
            pass
        return x
    return s.map(_fix)

# "id" / "identifier" as a separate word: "ID", "Research ID", "student_id".
_ID_TOKEN_RE = re.compile(r"(?<![A-Za-z])(?:id|identifier)(?![A-Za-z])", re.IGNORECASE)
# camelCase suffix: "studentId", "rowID", "userIdentifier".
_ID_CAMEL_RE = re.compile(r"(?<=[a-z])(?:ID|Id|Identifier)(?![a-z])")


def find_id_column(cols):
    """Pick the column most likely to hold the row identifier.

    Columns are scanned left to right in three passes:

    1. a name containing "id"/"identifier" as its own word or as a
       camelCase suffix;
    2. a name merely containing the letters "id" (the original, looser rule,
       kept as a fallback so names such as "uid" still match);
    3. the leftmost column.

    The strict pass runs first so that names like "Width" or "Video" no
    longer win over a real "... ID" column further right.

    Args:
        cols: Column labels, as a pandas ``Index`` or any iterable.

    Returns:
        The chosen label (the original object, not its string form), or
        ``None`` when there are no columns.
    """
    cols = cols.tolist() if isinstance(cols, pd.Index) else list(cols)
    if not cols:
        return None

    # Labels may be non-strings (e.g. integers); match on their text form.
    names = [str(c) for c in cols]

    for col, name in zip(cols, names):
        if _ID_TOKEN_RE.search(name) or _ID_CAMEL_RE.search(name):
            return col

    for col, name in zip(cols, names):
        if "id" in name.lower():
            return col

    return cols[0]

# --- run detection ---
CANON_ID = find_id_column(df_preprocessed.columns)
print(f"✅ Canonical ID column selected: {CANON_ID}")

# --- assign canonical ID ---
df_preprocessed["ID"] = _normalize_id_series(df_preprocessed[CANON_ID])

# quick preview (printed once; the source column is skipped when it already is "ID")
preview_cols = ["ID"] if CANON_ID == "ID" else [CANON_ID, "ID"]
print("Preview of canonical ID values:")
print(df_preprocessed[preview_cols].head(10).to_string(index=False))


✅ Canonical ID column selected: Research ID
Preview of canonical ID values:
Research ID       ID
   BBCMHJPT BBCMHJPT
   BBKBYNDW BBKBYNDW
   BBRWTLYV BBRWTLYV
   BCDVWQDF BCDVWQDF
   BCXSFTWC BCXSFTWC
   BGRRHYPQ BGRRHYPQ
   BGZZHTXS BGZZHTXS
   BHLQHBRW BHLQHBRW
   BPHVBHZV BPHVBHZV
   BQTNFJFX BQTNFJFX
Research ID       ID
   BBCMHJPT BBCMHJPT
   BBKBYNDW BBKBYNDW
   BBRWTLYV BBRWTLYV
   BCDVWQDF BCDVWQDF
   BCXSFTWC BCXSFTWC
   BGRRHYPQ BGRRHYPQ
   BGZZHTXS BGZZHTXS
   BHLQHBRW BHLQHBRW
   BPHVBHZV BPHVBHZV
   BQTNFJFX BQTNFJFX


#NOTATION: Step 2.1 — Word and Token Stats

###Purpose
- Add quick length metrics to each row of text so you can sanity check size and plan token budgets.
- Two columns are added to df_preprocessed: WordCount and TokenCount.
- Estimate how much it would cost to send this dataset to the API, using your per-row TokenCount
  and a configurable assumption for output size.

###What you may need to change
- Nothing for most cases. If you target a specific OpenAI model later, we can switch to its matching tokenizer.
- You may need to update the cost of the API call (or could this be updated automatically?).

###Inputs
- df_preprocessed['Raw text'] produced in Step 2.

###Outputs
- df_preprocessed with two new numeric columns:
    - WordCount  count of whitespace separated words per row
    - TokenCount approximate token count per row using tiktoken
- Printed checks that show which tokenizer is used, averages, a small distribution summary, and a short preview.
- Printed summary: total input tokens, estimated output tokens, and costs for 5 models.
- Optional: a small widget to download the DataFrame now as CSV or Excel for manual checks and token and word estimates.

###Key ideas
- Word counts are simple readability and length signals.
- Token counts are model dependent. We try o200k_base first and fall back to cl100k_base, which keeps estimates close to how current OpenAI chat models tokenize text.
- API pricing bills both input and output tokens.
- We use your TokenCount as “input tokens” and estimate “output tokens” with a single ratio so you can quickly forecast spend.
- Keep stats light and fast so they scale to larger datasets.

###Verification prints
- "Using tokenizer: ..." confirms the encoding choice.
- "Average words per Raw text: ..." and "Average tokens per Raw text (approx.): ..." confirm central tendencies.
- A min, median, max snapshot for both metrics.
- A short preview of the DataFrame with counts.
- Shows total rows, total/avg tokens, the output ratio used, and a cost table for:
  gpt-4o, gpt-4o-mini, gpt-4.1, gpt-4.1-mini, gpt-3.5-turbo (Standard tier).

###Common pitfalls
- Running this before Step 2 or without a 'Raw text' column.
- Non-string entries in 'Raw text'. Step 2 already coerces them to strings, so you should be safe.

###Actions performed in this code
1) Choose a tokenizer and report which one is used.
2) Compute WordCount and TokenCount for each row.
3) Print summary statistics and show a small preview.
4) Offer an optional Download DataFrame button with a format picker




In [None]:
# ===============================================
# Step 2.1: Word and token stats
# ===============================================

import math
import tiktoken
import pandas as pd
from IPython.display import display, HTML

# Optional download widget support
try:
    import ipywidgets as widgets
    WIDGETS_OK = True
except Exception:
    WIDGETS_OK = False

from google.colab import files

def status(msg, ok=True):
    """Emit ``msg`` on stdout behind a ✅ (truthy ``ok``) or ❌ (falsy ``ok``) marker."""
    prefix = ("❌ ", "✅ ")[bool(ok)]
    print(prefix + msg)

# 1) Choose tokenizer
try:
    _ENC = tiktoken.get_encoding("o200k_base")
    enc_name = "o200k_base"
except Exception:
    _ENC = tiktoken.get_encoding("cl100k_base")
    enc_name = "cl100k_base"
status(f"Using tokenizer: {enc_name}")

def count_tokens(text: str) -> int:
    """Return the number of tokens ``_ENC`` produces for ``text``.

    Non-string or empty input counts as 0, matching ``count_words``.

    Text that happens to contain a special-token marker (for example
    ``<|endoftext|>``) is encoded as ordinary text. By default tiktoken
    raises on such input, which the broad ``except`` below used to turn into
    a silent count of 0.
    """
    if not isinstance(text, str) or not text:
        return 0
    try:
        return len(_ENC.encode(text, disallowed_special=()))
    except Exception:
        # Best-effort statistic: an unencodable row must not abort the run.
        return 0

def count_words(text: str) -> int:
    """Count whitespace-separated words in ``text``; non-string input counts as 0."""
    # str.split() with no arguments never yields empty or blank pieces.
    return len(text.split()) if isinstance(text, str) else 0

# 2) Compute counts
if "Raw text" not in df_preprocessed.columns:
    status("Missing 'Raw text' column. Run Step 2 first.", ok=False)
    raise KeyError("'Raw text' column is missing")

df_preprocessed["WordCount"] = df_preprocessed["Raw text"].apply(count_words)
df_preprocessed["TokenCount"] = df_preprocessed["Raw text"].apply(count_tokens)

# 3) Summary statistics
avg_words = df_preprocessed["WordCount"].mean()
avg_tokens = df_preprocessed["TokenCount"].mean()
median_words = df_preprocessed["WordCount"].median()
median_tokens = df_preprocessed["TokenCount"].median()
min_words = df_preprocessed["WordCount"].min()
max_words = df_preprocessed["WordCount"].max()
min_tokens = df_preprocessed["TokenCount"].min()
max_tokens = df_preprocessed["TokenCount"].max()

status(f"Average words per Raw text: {avg_words:.1f}")
status(f"Average tokens per Raw text (approx.): {avg_tokens:.1f}")

print("\nQuick distribution check:")
print(f"  Words  → min {min_words}, median {median_words:.0f}, max {max_words}")
print(f"  Tokens → min {min_tokens}, median {median_tokens:.0f}, max {max_tokens}")

print("\nPreview with counts:")
display(df_preprocessed[["Raw text", "WordCount", "TokenCount"]].head(5))

# 4) Optional: Download DataFrame for checks
def _download_df(df, filename="df_preprocessed_counts.csv", file_format="csv"):
    local_path = f"/content/{filename}"
    if file_format.lower() == "csv":
        df.to_csv(local_path, index=False, encoding="utf-8")
    elif file_format.lower() in {"xlsx", "excel"}:
        df.to_excel(local_path, index=False)
    else:
        raise ValueError("Use 'csv' or 'xlsx'")
    files.download(local_path)

if WIDGETS_OK:
    fmt_dd = widgets.Dropdown(
        options=[("CSV", "csv"), ("Excel (.xlsx)", "xlsx")],
        value="csv",
        description="Format:",
        layout=widgets.Layout(width="240px")
    )
    dl_btn = widgets.Button(
        description="Download DataFrame now",
        button_style="primary",
        tooltip="Click to download the DataFrame with WordCount and TokenCount"
    )
    out = widgets.Output()

    def on_click_download(_):
        with out:
            out.clear_output()
            try:
                _download_df(df_preprocessed, filename="df_preprocessed_counts." + fmt_dd.value, file_format=fmt_dd.value)
                print(f"Started download as df_preprocessed_counts.{fmt_dd.value}")
            except Exception as e:
                print("Download failed:", e)

    dl_btn.on_click(on_click_download)
    display(HTML("<b>Do you want to download the DataFrame now for checks?</b>"))
    display(widgets.HBox([fmt_dd, dl_btn]), out)
else:
    print("\nWidgets not available. To download manually, run:")
    print("  df_preprocessed.to_csv('/content/df_preprocessed_counts.csv', index=False, encoding='utf-8')")
    print("  from google.colab import files; files.download('/content/df_preprocessed_counts.csv')")

✅ Using tokenizer: o200k_base
✅ Average words per Raw text: 284.1
✅ Average tokens per Raw text (approx.): 347.1

Quick distribution check:
  Words  → min 5, median 243, max 706
  Tokens → min 6, median 275, max 822

Preview with counts:


Unnamed: 0,Raw text,WordCount,TokenCount
0,There once was a girl called lilly she had pet...,52,61
1,wrire a narrative story abouta search for some...,43,53
2,The Failed Submarine I had always wanted go on...,494,578
3,The diamond ring Emmy was just having breakfas...,243,275
4,If you are locking for a dimiens go to most di...,57,71


HBox(children=(Dropdown(description='Format:', layout=Layout(width='240px'), options=(('CSV', 'csv'), ('Excel …

Output()

# Step 3: Install and Import Required Libraries

###Purpose
- Ensure the exact Python libraries and language models you need are installed and working.
- Verify imports and show clear version numbers so you can troubleshoot quickly.

###What you may need to change
- Library versions, if you want to pin different ones for compatibility.
- You can remove the openai uninstall if you are sure the environment is clean.

###Inputs
- Internet access in the Colab runtime to download packages and the spaCy model.

###Outputs
- Installed packages: openai, tqdm, nltk, tiktoken, spacy, pandas.
- Downloaded spaCy model: en_core_web_sm.
- Verified imports with printed version numbers.
- NLTK tokenizers available: punkt and punkt_tab.

###Key ideas
- Pin versions when you want reproducibility. The defaults below are stable and pair well with Colab.
- spaCy needs a separate model download. We fetch en_core_web_sm and then test a tiny parse.
- NLTK recently split tokenizers, so we check both punkt and punkt_tab.

###Verification prints
- Package versions for openai, pandas, spacy, nltk, tiktoken, tqdm.
- Confirmation that spaCy model loads and can process a sentence.
- Confirmation that NLTK tokenizers are present.
- A tiny tiktoken encode test to confirm the tokenizer is usable.

###Common pitfalls
- Conflicting preinstalled openai versions. We uninstall first to avoid API mismatch.
- Missing spaCy model. Installing the library is not enough, you must download a model.
- NLTK data not present. We fetch tokenizers to avoid runtime errors later.

###Actions performed in this code
1) Uninstall any preinstalled openai to avoid conflicts.
2) Install required libraries with pinned versions.
3) Download the spaCy English model en_core_web_sm.
4) Import libraries and print versions.
5) Verify spaCy model load.
6) Verify NLTK tokenizers punkt and punkt_tab.
7) Verify tiktoken by encoding a short string.


In [None]:
# -----------------------
#3.1 — Install / Upgrade (run once)
# -----------------------
# Uninstall preinstalled openai versions to avoid conflicts, then install v1 + dependencies
!pip -q uninstall -y openai
!pip -q install --upgrade "openai==1.*" tqdm nltk tiktoken spacy pandas==2.2.2 ipywidgets openpyxl jsonschema

# Download spaCy small model
!python -m spacy download en_core_web_sm -q



print("⬇️ Install step finished. IMPORTANT: Restart the runtime (Runtime > Restart runtime) and then run the verification cell.")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/948.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m948.6/948.6 kB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/139.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m87.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
# -----------------------
#3.2 — Imports, version checks, and tokenizer / model verification
# -----------------------
import importlib, sys, os, logging
from getpass import getpass

# Helper to report versions safely
def v(name):
    """Return the version string of module ``name`` without raising.

    The module is imported first, so an unusable install is still reported
    as ``"import failed: ..."``. The version is then read from the installed
    package metadata. The module's ``__version__`` attribute is only a
    fallback, because some packages emit a deprecation warning when it is
    accessed (one appears in this cell's captured output).

    Returns:
        The version string, ``"unknown"`` when none can be determined, or
        ``"import failed: <error>"`` when the import itself fails.
    """
    from importlib import metadata

    try:
        m = importlib.import_module(name)
    except Exception as e:
        return f"import failed: {e}"

    try:
        return metadata.version(name)
    except Exception:
        # No distribution with this exact name (stdlib module, or the import
        # name differs from the package name): fall back to the attribute.
        pass

    try:
        return getattr(m, "__version__", "unknown")
    except Exception:
        return "unknown"

modules = ["openai","pandas","spacy","nltk","tiktoken","tqdm","ipywidgets","openpyxl","jsonschema"]
print("Versions:")
for name in modules:
    print(f"  {name}: {v(name)}")

# spaCy load check
import spacy
try:
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("A tiny sanity check.")
    print("✅ spaCy loaded and tokenized sample:", [t.text for t in doc])
except Exception as e:
    print("❌ spaCy failed to load:", e)
    raise

# NLTK tokenizers
import nltk
NLTK_DIR = "/content/nltk_data"
os.makedirs(NLTK_DIR, exist_ok=True)
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.insert(0, NLTK_DIR)

# The notebook text for this step says both 'punkt' and 'punkt_tab' are
# verified, but only 'punkt' was. Check (and, if missing, download) each one.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
        print(f"✅ NLTK '{_resource}' tokenizer is present.")
    except LookupError:
        print(f"⬇️ Downloading NLTK '{_resource}' tokenizer...")
        nltk.download(_resource, download_dir=NLTK_DIR, quiet=False)
        try:
            nltk.data.find(f"tokenizers/{_resource}")
            print(f"✅ '{_resource}' downloaded.")
        except LookupError:
            print(f"❌ Failed to download '{_resource}'.")

# tiktoken check
import tiktoken
try:
    enc = None
    try:
        enc = tiktoken.get_encoding("o200k_base")
        enc_name = "o200k_base"
    except Exception:
        enc = tiktoken.get_encoding("cl100k_base")
        enc_name = "cl100k_base"
    n_tokens = len(enc.encode("Tokenization sanity check."))
    print(f"✅ tiktoken ready with {enc_name}. Sample tokens: {n_tokens}")
except Exception as e:
    print("❌ tiktoken failed:", e)
    raise




Versions:
  openai: 1.109.1
  pandas: 2.2.2
  spacy: 3.8.7
  nltk: 3.9.2
  tiktoken: 0.12.0
  tqdm: 4.67.1
  ipywidgets: 7.7.1
  openpyxl: 3.1.5


  return getattr(m, "__version__", "unknown")


  jsonschema: 4.25.1
✅ spaCy loaded and tokenized sample: ['A', 'tiny', 'sanity', 'check', '.']
⬇️ Downloading NLTK 'punkt' tokenizer...


[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


✅ 'punkt' downloaded.
✅ tiktoken ready with o200k_base. Sample tokens: 5


#Step 4.1: Optional Preempt — install common extras up front
###Purpose
- Preemptively install/verify common libraries and language data you’re likely to need later
  so downstream steps don’t break mid-run.

### What you may need to change
- Toggle WANT_SPACY_MD to True if you want the larger 'en_core_web_md' spaCy model.
- Adjust VERS pins if you prefer different versions.

### Installs/Verifies
- Libraries: ipywidgets, openpyxl (Excel export), matplotlib (plots), chardet (encoding detect),
             pyarrow (faster IO). xlsxwriter optional as an alternate Excel engine.
- NLTK data: punkt, punkt_tab (if available), stopwords, wordnet, omw-1.4.
- spaCy model: ensures 'en_core_web_sm' is present; optionally downloads 'en_core_web_md'.

### Outputs
- Clear prints of what was installed or already present, plus quick sanity checks.

In [None]:


# ----------------- toggles -----------------
WANT_SPACY_MD = False   # True to also download/load en_core_web_md
# ------------------------------------------

import sys, os, shutil, subprocess, importlib

# ---------- helpers ----------
def _pip_install(spec):
    """Install the requirement ``spec`` with pip, using the running interpreter.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    print(f"⬇️  Installing {spec} ...")
    command = [sys.executable, "-m", "pip", "install", "-q", spec]
    subprocess.check_call(command)

def ensure_pkg(mod_name, spec=None):
    """Import ``mod_name``, installing it with pip first if the import fails.

    Args:
        mod_name: Importable module name.
        spec: pip requirement to install when the import fails; defaults to
            ``mod_name``. It is NOT applied when the module already imports,
            so a version pin here does not upgrade an existing install.

    Returns:
        The imported module object.
    """
    try:
        m = importlib.import_module(mod_name)
    except Exception:
        # Broad on purpose: any import-time failure triggers a (re)install.
        _pip_install(spec or mod_name)
        # Packages installed after interpreter start may be invisible to the
        # import system until its finder caches are refreshed.
        importlib.invalidate_caches()
        m = importlib.import_module(mod_name)
        print(f"✅ {mod_name} installed")
    else:
        print(f"✅ {mod_name} already available")
    return m

def print_version(mod_name):
    """Print one bullet line with the ``__version__`` of ``mod_name``.

    Prints "unknown" when the module has no ``__version__`` attribute, and a
    "version check failed" line when the module cannot be imported.
    """
    try:
        version = getattr(importlib.import_module(mod_name), "__version__", "unknown")
        line = f"   • {mod_name} {version}"
    except Exception as e:
        line = f"   • {mod_name} version check failed: {e}"
    print(line)

# ---------- core convenience libraries ----------
ipywidgets = ensure_pkg("ipywidgets", "ipywidgets==8.1.1")
openpyxl   = ensure_pkg("openpyxl",   "openpyxl>=3.1.2")
matplotlib = ensure_pkg("matplotlib", "matplotlib>=3.8.0")
chardet    = ensure_pkg("chardet",    "chardet>=5.2.0")
pyarrow    = ensure_pkg("pyarrow",    "pyarrow>=16.1.0")
# Optional alternative Excel writer:
# xlsxwriter = ensure_pkg("xlsxwriter", "XlsxWriter>=3.2.0")

print("\nVersions:")
for name in ["ipywidgets", "openpyxl", "matplotlib", "chardet", "pyarrow"]:
    print_version(name)

# ---------- NLTK: one directory, robust downloads ----------
nltk = ensure_pkg("nltk", "nltk>=3.8.1")

NLTK_DIR = "/content/nltk_data"
os.makedirs(NLTK_DIR, exist_ok=True)
os.environ["NLTK_DATA"] = NLTK_DIR
# put our folder at the front of the search path
if NLTK_DIR in nltk.data.path:
    nltk.data.path.remove(NLTK_DIR)
nltk.data.path.insert(0, NLTK_DIR)

print("\nNLTK search paths (in order):")
for p in nltk.data.path:
    print("  -", p)

def ensure_nltk_resource(resource, kind="corpora", retries=2, clean_if_stuck=True):
    """
    Ensure an NLTK resource (e.g. 'wordnet') is loadable, downloading it if needed.

    Args:
        resource: NLTK package name, e.g. ``"punkt"``.
        kind: Data category the resource lives under (``"corpora"``,
            ``"tokenizers"``, ...). Looked up as ``f"{kind}/{resource}"``.
        retries: Maximum number of download attempts.
        clean_if_stuck: When True, delete ``NLTK_DIR/kind/resource`` whenever
            it exists but the resource cannot be verified, so that the next
            download starts from a clean state.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        LookupError: if the resource still cannot be found after all attempts.
    """
    resource_path = f"{kind}/{resource}"
    target_folder = os.path.join(NLTK_DIR, kind, resource)

    def _verified():
        # nltk.data.find raises LookupError when the resource is not on the search path.
        try:
            nltk.data.find(resource_path)
            return True
        except LookupError:
            return False

    def _clean_partial():
        # Only called while the resource is unverified, so a folder found here is unusable.
        if clean_if_stuck and os.path.isdir(target_folder):
            print(f"🧹 Removing partial folder: {target_folder}")
            shutil.rmtree(target_folder, ignore_errors=True)

    if _verified():
        print(f"✅ NLTK {kind} '{resource}' available")
        return

    _clean_partial()

    for attempt in range(1, retries + 1):
        print(f"⬇️  Downloading NLTK {kind} '{resource}' (attempt {attempt}/{retries}) ...")
        ok = nltk.download(resource, download_dir=NLTK_DIR, quiet=False)
        exists_flag = os.path.isdir(target_folder)
        verified = _verified()
        print(f"   ↳ folder exists: {exists_flag}; verified: {verified}; downloader_returned: {ok}")
        if verified:
            # Being findable by NLTK is the real success criterion. The unzipped
            # folder is reported for diagnostics but is not required.
            if exists_flag:
                print(f"✅ NLTK {kind} '{resource}' ready at {target_folder}")
            else:
                print(f"✅ NLTK {kind} '{resource}' ready (found on the NLTK search path)")
            return
        # Do not let a half-written folder from this attempt poison the next one.
        _clean_partial()

    # Last resort: show directory state and fail
    parent = os.path.join(NLTK_DIR, kind)
    print(f"❌ Could not verify NLTK {kind} '{resource}' after {retries} attempts.")
    print("   Contents of", parent, ":", os.listdir(parent) if os.path.isdir(parent) else "(missing)")
    raise LookupError(f"Failed to ensure NLTK {resource_path}")

# Tokenizers
ensure_nltk_resource("punkt", kind="tokenizers")



✅ ipywidgets already available
✅ openpyxl already available
✅ matplotlib already available
✅ chardet already available
✅ pyarrow already available

Versions:
   • ipywidgets 7.7.1
   • openpyxl 3.1.5
   • matplotlib 3.10.0
   • chardet 5.2.0
   • pyarrow 18.1.0
✅ nltk already available

NLTK search paths (in order):
  - /content/nltk_data
  - /root/nltk_data
  - /usr/nltk_data
  - /usr/share/nltk_data
  - /usr/lib/nltk_data
  - /usr/share/nltk_data
  - /usr/local/share/nltk_data
  - /usr/lib/nltk_data
  - /usr/local/lib/nltk_data
✅ NLTK tokenizers 'punkt' available


# Step 4: Import Required Libraries
###Purpose
- Import the libraries used for data processing, NLP, tokenisation, logging, and progress bars.
- Load the spaCy English model prepared in Step 3.
- Run quick sanity checks so you know everything is working before you proceed.

###What you may need to change
- Nothing in most cases. If you installed a different spaCy model name, update MODEL_NAME below.

###Inputs
- Installed packages from Step 3. SpaCy model en_core_web_sm should already be present.

###Outputs
- Imported modules in memory.
- tqdm progress bars enabled for pandas operations.
- A loaded spaCy pipeline in the variable `nlp`.
- Short verification prints from spaCy, NLTK, and tiktoken.

###Key ideas
- Keep imports in one place so the rest of the notebook can assume they exist.
- Fail early with clear messages if a critical import or model is missing.

###Verification prints
- Confirms tqdm was enabled.
- Confirms spaCy loaded and tokenised a tiny sentence.
- Confirms NLTK tokeniser works.
- Confirms tiktoken can encode a short string.

In [None]:
# ===============================================
# Step 4: Import Required Libraries
# ===============================================

# ================================
import os
import re
import json
import time
import string
import random
import threading
import logging
import difflib
from functools import lru_cache

# Set up logging for helpful debug output.
# force=True replaces handlers that the host environment may already have put
# on the root logger. Without it basicConfig() silently does nothing in that
# case and the INFO-level verification messages in this cell never appear.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s", force=True)

# Ensure deterministic-ish behaviour for debugging
random.seed(1)

# Pandas + tqdm
import pandas as pd
try:
    # prefer notebook tqdm if available
    from tqdm.notebook import tqdm
except Exception:
    from tqdm import tqdm
tqdm.pandas()
logging.info("✅ tqdm progress bars enabled for pandas")

# NLTK setup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
NLTK_DIR = "/content/nltk_data" if os.path.exists("/content") else os.path.join(os.getcwd(), "nltk_data")
os.makedirs(NLTK_DIR, exist_ok=True)
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.insert(0, NLTK_DIR)

# Download 'punkt' if missing, but keep output quiet unless it fails
try:
    nltk.data.find("tokenizers/punkt")
    logging.info("✅ NLTK 'punkt' tokenizer present")
except LookupError:
    logging.info("⬇️ Downloading NLTK 'punkt' tokenizer...")
    nltk.download("punkt", download_dir=NLTK_DIR, quiet=True)
    try:
        nltk.data.find("tokenizers/punkt")
        logging.info("✅ NLTK 'punkt' downloaded")
    except LookupError:
        raise RuntimeError("NLTK 'punkt' tokenizer download failed. Check network or permissions.")

# spaCy + model load (single load only)
import spacy
MODEL_NAME = "en_core_web_sm"
try:
    nlp = spacy.load(MODEL_NAME)
    _doc = nlp("Quick spaCy check.")
    logging.info("✅ spaCy model loaded (%s). Tokens: %s", MODEL_NAME, [t.text for t in _doc])
except Exception as e:
    logging.error("❌ Could not load spaCy model '%s': %s", MODEL_NAME, e)
    logging.info("Tip: re-run your install cell to fetch the model, then restart the runtime.")
    raise

# tiktoken robust selection
import tiktoken
def get_token_encoder(preferred=("o200k_base", "cl100k_base")):
    """Return the first tiktoken encoding from ``preferred`` that can be loaded.

    Names missing from ``tiktoken.list_encoding_names()`` are skipped when that
    listing is available. If no preferred name works, "cl100k_base" is tried
    one last time; a failure there is logged and re-raised.
    """
    try:
        available = tiktoken.list_encoding_names()
    except Exception:
        # older tiktoken versions may not expose list_encoding_names
        available = []

    for candidate in preferred:
        try:
            if not available or candidate in available:
                return tiktoken.get_encoding(candidate)
        except Exception:
            continue

    # final fallback to a known encoding name if available
    try:
        return tiktoken.get_encoding("cl100k_base")
    except Exception as e:
        logging.error("❌ tiktoken encoders unavailable: %s", e)
        raise

# Pick the encoder once and run a tiny encode as a smoke test.
ENC = get_token_encoder()
# tiktoken encodings expose their identifier as `.name`; instances have no
# `__name__`, so looking that up always produced the "encoding" placeholder.
_enc_name = getattr(ENC, "name", "encoding")
_sample_len = len(ENC.encode("Tiny tiktoken check."))
logging.info("✅ tiktoken ready (%s). Sample length: %d", _enc_name, _sample_len)

print("🎉 Step 4 imports and model load verified.")



🎉 Step 4 imports and model load verified.


# Step 5 — Configure Logging

###Purpose
- Capture info, warnings, and errors to both the Colab console and a log file for later debugging.
- Adjust logging settings from a small UI: filenames, console/file levels, rotation size/backups.
- Make log levels easy to change for noisy vs quiet runs.

- Apply the config, run a tiny self-test, optionally preview the log tail, and download the log.


###What you can change via UI
- LOG_FILE: filename of the rotating log
- CONSOLE_LEVEL: how noisy the notebook output is
- FILE_LEVEL: how detailed the on-disk log is
- ROTATE_MAX_MB: size per log file before rotation
- ROTATE_BACKUPS: how many rotated files to keep

###Outputs
- Reconfigured root logger with a StreamHandler (console) and RotatingFileHandler (file)
- Self-test entries written to the log
- Optional log tail preview

###Key ideas
- Set the root logger level high enough to allow through what handlers need.
- Handlers have their own levels. Console can be quieter than file.
- Rotating logs prevent a single giant file.

###Verification prints
- A quick self test logs INFO, WARNING, ERROR, and an example exception.


In [None]:


import os, io, logging, traceback
from logging.handlers import RotatingFileHandler
from IPython.display import display, HTML
try:
    import ipywidgets as widgets
    WIDGETS_OK = True
except Exception:
    WIDGETS_OK = False

# ---- helper: make or update logging based on UI values ----
def configure_logging(log_file: str,
                      console_level: int,
                      file_level: int,
                      rotate_max_mb: int,
                      rotate_backups: int) -> logging.Logger:
    """Reconfigure the root logger with a console handler and a rotating file handler.

    Existing root handlers are removed and closed first, so calling this again
    neither duplicates output nor leaks the previous log file handle.

    Args:
        log_file: Path of the log file. Its parent directory is created if missing.
        console_level: Minimum level shown on the console handler.
        file_level: Minimum level written to the file handler.
        rotate_max_mb: Size in megabytes at which the log file is rotated.
        rotate_backups: Number of rotated files to keep.

    Returns:
        The root logger, after a short self-test (INFO, WARNING, and an ERROR
        with a traceback) has been emitted through the new handlers.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)  # always let handlers filter

    # Remove old handlers (avoids duplicates on re-run) and close them, so the
    # file handle of a previous RotatingFileHandler is released.
    for h in list(logger.handlers):
        logger.removeHandler(h)
        try:
            h.close()
        except Exception:
            # A handler that fails to close must not block reconfiguration.
            pass

    fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")

    # Console
    ch = logging.StreamHandler()
    ch.setLevel(console_level)
    ch.setFormatter(fmt)
    logger.addHandler(ch)

    # Rotating file (the handler opens the file immediately, so the directory must exist)
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    fh = RotatingFileHandler(
        log_file,
        maxBytes=rotate_max_mb * 1024 * 1024,
        backupCount=rotate_backups,
        encoding="utf-8"
    )
    fh.setLevel(file_level)
    fh.setFormatter(fmt)
    logger.addHandler(fh)

    # Chattery libs can be quieted if you like
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.getLogger("tqdm").setLevel(logging.WARNING)

    # Self-test
    logger.info("Logging control panel: INFO test")
    logger.warning("Logging control panel: WARNING test")
    try:
        1/0
    except ZeroDivisionError:
        logger.error("Logging control panel: ERROR test with traceback")
        logger.error(traceback.format_exc())

    return logger

# ---- level maps for dropdowns ----
LEVELS = {
    "DEBUG (most verbose)": logging.DEBUG,
    "INFO (standard)": logging.INFO,
    "WARNING (only important)": logging.WARNING,
    "ERROR (failures only)": logging.ERROR,
    "CRITICAL": logging.CRITICAL
}

# ---- defaults (you can change here if you want different initial values) ----
DEFAULT_LOG_FILE = "text_correction.log"
DEFAULT_CONSOLE = "INFO (standard)"
DEFAULT_FILE = "DEBUG (most verbose)"
DEFAULT_ROTATE_MB = 5
DEFAULT_BACKUPS = 3

if not WIDGETS_OK:
    print("ipywidgets not available. Install it first or run the preempt cell. "
          "Meanwhile, you can configure logging programmatically via Step 5.")
else:
    # ---- UI controls ----
    log_file_text = widgets.Text(
        value=DEFAULT_LOG_FILE,
        description="LOG_FILE:",
        layout=widgets.Layout(width="420px")
    )
    console_level_dd = widgets.Dropdown(
        options=list(LEVELS.keys()),
        value=DEFAULT_CONSOLE,
        description="Console:",
        layout=widgets.Layout(width="420px")
    )
    file_level_dd = widgets.Dropdown(
        options=list(LEVELS.keys()),
        value=DEFAULT_FILE,
        description="File:",
        layout=widgets.Layout(width="420px")
    )
    rotate_mb_slider = widgets.IntSlider(
        value=DEFAULT_ROTATE_MB, min=1, max=50, step=1,
        description="Rotate MB:",
        readout=True, continuous_update=False
    )
    backups_slider = widgets.IntSlider(
        value=DEFAULT_BACKUPS, min=0, max=20, step=1,
        description="Backups:",
        readout=True, continuous_update=False
    )

    apply_btn = widgets.Button(
        description="Apply logging settings",
        button_style="primary",
        tooltip="Configure handlers and run a quick self-test"
    )
    show_tail_btn = widgets.Button(
        description="Show log tail",
        tooltip="Display the last lines of the current log file"
    )
    download_btn = widgets.Button(
        description="Download log",
        tooltip="Download the current log file"
    )
    out = widgets.Output()

    # ---- callbacks ----
    def on_apply_clicked(_):
        """Apply the current widget values via configure_logging and echo them in the output area."""
        with out:
            out.clear_output()
            chosen_file = log_file_text.value.strip() or DEFAULT_LOG_FILE
            chosen_console = LEVELS[console_level_dd.value]
            chosen_file_level = LEVELS[file_level_dd.value]
            max_mb = int(rotate_mb_slider.value)
            n_backups = int(backups_slider.value)

            # The returned logger is the root logger; nothing here needs to keep it.
            configure_logging(
                log_file=chosen_file,
                console_level=chosen_console,
                file_level=chosen_file_level,
                rotate_max_mb=max_mb,
                rotate_backups=n_backups,
            )

            summary = (
                "✅ Applied logging settings",
                f"   LOG_FILE: {chosen_file}",
                f"   CONSOLE_LEVEL: {console_level_dd.value}",
                f"   FILE_LEVEL: {file_level_dd.value}",
                f"   ROTATE_MAX_MB: {max_mb}",
                f"   ROTATE_BACKUPS: {n_backups}",
                "\nWrote self-test lines. Use 'Show log tail' to preview.",
            )
            for line in summary:
                print(line)

    def on_show_tail_clicked(_):
        """Show the last (up to) 100 lines of the selected log file in the output area."""
        # Local import: a bounded deque keeps only the tail while streaming the
        # file, instead of loading a multi-megabyte log fully into memory.
        from collections import deque

        with out:
            log_file = log_file_text.value.strip() or DEFAULT_LOG_FILE
            out.clear_output()
            if not os.path.exists(log_file):
                print(f"❌ Log file not found yet: {log_file}")
                print("   Click 'Apply logging settings' first.")
                return
            try:
                # read last ~100 lines
                with open(log_file, "r", encoding="utf-8", errors="replace") as f:
                    lines = deque(f, maxlen=100)
                print(f"--- tail of {log_file} (last {len(lines)} lines) ---")
                for line in lines:
                    print(line.rstrip())
            except Exception as e:
                print("❌ Failed to read log:", e)

    def on_download_clicked(_):
        """Start a browser download of the selected log file through Colab's ``files`` helper."""
        with out:
            out.clear_output()
            log_file = log_file_text.value.strip() or DEFAULT_LOG_FILE
            if not os.path.exists(log_file):
                print(f"❌ Log file not found: {log_file}")
                print("   Click 'Apply logging settings' first.")
                return
            try:
                # Imported inside the try so that running outside Colab is
                # reported in the panel instead of raising out of the callback.
                from google.colab import files
                files.download(log_file)
                print(f"Started download: {log_file}")
            except Exception as e:
                print("❌ Download failed:", e)

    apply_btn.on_click(on_apply_clicked)
    show_tail_btn.on_click(on_show_tail_clicked)
    download_btn.on_click(on_download_clicked)

    # ---- layout ----
    display(HTML("<h4>Logging Control Panel</h4>"))
    display(
        widgets.VBox([
            log_file_text,
            widgets.HBox([console_level_dd, file_level_dd]),
            widgets.HBox([rotate_mb_slider, backups_slider]),
            widgets.HBox([apply_btn, show_tail_btn, download_btn]),
            out
        ])
    )



VBox(children=(Text(value='text_correction.log', description='LOG_FILE:', layout=Layout(width='420px')), HBox(…

# Step 6: Securely Prompt for OpenAI API Key and test OpenAI API key

###Purpose
- Obtain the OpenAI API key securely (without exposing it in code cells).
- Verify that the key works by attempting a harmless API call (listing models).

###What you may need to change
- Nothing. Just run this cell; it will prompt you for the key if it’s not already in memory.

###Inputs
- User-entered API key (via getpass prompt).

###Outputs
- Environment variable OPENAI_API_KEY set for this session.
- Verification message confirming that the key works, or an error if it doesn’t.

###Key ideas
- Never hard-code your API key into notebooks.
- The key is stored only in the temporary runtime environment variable, not in the notebook file.
- Verification uses a lightweight call (model listing).

###Verification prints
- “✅ OpenAI API key is valid.” if the call succeeds.
- A clear “❌ Invalid or failed verification” message if not.

In [None]:
import importlib
try:
    import openai
    print("openai version:", getattr(openai, "__version__", "unknown"))
except Exception as e:
    print("openai import failed:", e)



openai version: 1.109.1


In [None]:
# ===============================================
# Step 6: Securely Prompt for OpenAI API Key
# ===============================================
# Step 6 (fixed for openai v1)
import os
from getpass import getpass
from openai import OpenAI

def ensure_api_key():
    """Return the OpenAI API key, prompting for it when it is not in the environment.

    A key already present in ``OPENAI_API_KEY`` is returned untouched. Otherwise
    the user is prompted with hidden input; a non-blank answer is stored in
    ``OPENAI_API_KEY`` and returned, while a blank answer returns ``None``.
    """
    existing = os.environ.get("OPENAI_API_KEY")
    if existing:
        return existing

    print("Enter your OpenAI API key (input hidden). Leave blank to skip verification.")
    entered = getpass("API key: ").strip()
    if entered:
        os.environ["OPENAI_API_KEY"] = entered
        return entered
    return None

def verify_openai_api_key_v1():
    """Check the API key by listing models and return a ready-to-use client.

    Returns:
        An ``OpenAI`` client when the model listing succeeds, or ``None`` when
        no key was supplied and verification is skipped.

    Raises:
        Whatever the model-listing call raised, after printing troubleshooting hints.
    """
    if not ensure_api_key():
        print("Skipped OpenAI verification. Set OPENAI_API_KEY to run API calls.")
        return None

    api = OpenAI()  # reads OPENAI_API_KEY from environment
    try:
        listing = api.models.list()
        preview = [entry.id for entry in listing.data[:8]]
        print("✅ OpenAI API key verified (v1). Sample models:", preview)
        return api
    except Exception as e:
        print("❌ Verification failed:", type(e).__name__, str(e))
        print("\nCommon fixes:")
        hints = (
            "  • Ensure the key is correct and has required permissions.",
            "  • Check network access from the runtime.",
            "  • If your code still uses old v0 calls (openai.Model, openai.ChatCompletion.create), update them.",
        )
        for hint in hints:
            print(hint)
        raise

# Run it and store client in the variable `client` for later use
client = verify_openai_api_key_v1()








Enter your OpenAI API key (input hidden). Leave blank to skip verification.
API key: ··········
✅ OpenAI API key verified (v1). Sample models: ['gpt-4-0613', 'gpt-4', 'gpt-3.5-turbo', 'gpt-5-search-api-2025-10-14', 'gpt-realtime-mini', 'gpt-realtime-mini-2025-10-06', 'sora-2', 'sora-2-pro']


[REDACTED: an OpenAI API key was pasted here in plain text (four times). Revoke and rotate that key immediately, and never store API keys in the notebook file.]


# Step 6.2: Ensure NLTK Data is Available


###Purpose
- Confirm that the required NLTK datasets, particularly the 'punkt' tokenizer,
  are available for sentence and word tokenization.
- Automatically download them into a local or Colab-safe directory if missing.

###What you may need to change
- nltk_data_path: set this to a writable directory if running outside Colab
  (e.g., './nltk_data' for local use).

###Inputs
- None (downloads handled internally if needed).

###Outputs
- Verified or newly downloaded 'punkt' tokenizer package.

###Key ideas
- NLTK looks for data in a set of known paths; adding a custom directory avoids permission issues.
- Downloading into /content/nltk_data keeps notebooks portable and clean.

In [None]:

# ===============================================
# Step 6.2: Ensure NLTK Data is Available
# ===============================================
import nltk, os

# Specify a safe directory for NLTK data (works in Colab or local)
nltk_data_path = "/content/nltk_data" if os.path.exists("/content") else "./nltk_data"

# Create the directory if it doesn't exist
os.makedirs(nltk_data_path, exist_ok=True)

# Ensure our path is part of NLTK’s search list
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Try locating the punkt tokenizer
try:
    nltk.data.find("tokenizers/punkt")
    print("✅ NLTK 'punkt' tokenizer is already available.")
except LookupError:
    print("⚙️ Downloading NLTK 'punkt' tokenizer...")
    nltk.download("punkt", download_dir=nltk_data_path)
    try:
        nltk.data.find("tokenizers/punkt")
        print("✅ 'punkt' tokenizer downloaded successfully.")
    except LookupError:
        print("❌ Failed to download 'punkt' tokenizer. Check network or permissions.")

# Optional: newer punkt tokenizer for NLTK ≥ 3.8
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    try:
        nltk.download("punkt_tab", download_dir=nltk_data_path)
    except Exception as e:
        # Still best-effort (punkt_tab is optional), but no longer silent.
        print(f"⚠️ 'punkt_tab' download raised {type(e).__name__}: {e}")
    # nltk.download() can report failure through its return value instead of
    # raising, so confirm the resource is really there before claiming success.
    try:
        nltk.data.find("tokenizers/punkt_tab")
        print("✅ 'punkt_tab' (improved tokenizer) also downloaded.")
    except LookupError:
        print("⚠️ 'punkt_tab' is not available; continuing with 'punkt' only.")



✅ NLTK 'punkt' tokenizer is already available.
✅ 'punkt_tab' (improved tokenizer) also downloaded.


[nltk_data] Downloading package punkt_tab to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Step 7: Define Utility Functions for ChatGPT API Interaction
## Purpose:
To create reusable functions that handle interactions with the OpenAI API, including making requests with retry mechanisms to handle potential API errors.

##Input: Function parameters (things you pass in)

-prompt — the text you want the model to read and reply to. Required.

-model="gpt-4o" — which model to call if you don’t give one. Default is gpt-4o. For cost, see: https://platform.openai.com/settings/organization/limits

-max_retries=5 — how many times to try again if the API fails temporarily. Five tries is a common default.

-backoff_factor=2 — controls how long to wait between retries. Bigger numbers make you wait longer each retry.

-temperature=0 — controls randomness. Zero aims for consistent, repeatable answers.

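-max_tokens=None — the maximum number of tokens you allow the model to output. When left as None, it is derived from the prompt length if a token_encoder is supplied (between 256 and 4096); otherwise it falls back to 1024.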

## Output: Local variables inside the function

attempt — which retry number you are on in the loop.

response — the raw object the API returns.

content — the actual text reply you extract from the response.

usage — token usage information (helps with billing and diagnostics).

e — the caught exception when something goes wrong.

wait — how many seconds the code sleeps before retrying.

## Actions:

* Define call_chatgpt Function:
Parameters: Accepts prompt, model, max_retries, backoff_factor, temperature, and max_tokens.
* Functionality: Attempts to call the OpenAI API with exponential backoff in case of failures like rate limits or timeouts.
* Error Handling: Catches specific OpenAI errors and retries the request after waiting for a calculated duration.
* Returns: A (content, usage) tuple with the text of the API response and its token usage, or (None, {}) if all retries fail.

In [None]:
# ===============================================
# Step 7: Define Utility Functions for ChatGPT API Interaction
# ===============================================
# ---- v1-safe, timeout + retries, correct exception classes ----
import time, random, logging
from openai import OpenAI
from openai import (
    APIError, APIConnectionError, RateLimitError, APITimeoutError,
    AuthenticationError
)

logger = logging.getLogger(__name__)

def _token_len(text, encoder):
    try:
        return len(encoder.encode(text or ""))
    except Exception:
        return 0

def call_chatgpt_v1(
    prompt,
    model="gpt-4o",
    max_retries=5,
    backoff_factor=2.0,
    temperature=0.0,
    max_tokens=None,
    client: OpenAI = None,
    token_encoder=None,
    model_context_limit=131072,
    request_timeout=30  # seconds
):
    """Send ``prompt`` as a single user message and return ``(content, usage)``.

    Args:
        prompt: Text sent as the user message.
        model: Chat model name.
        max_retries: Maximum number of attempts for retryable API errors.
        backoff_factor: Base of the exponential wait between attempts; each
            wait is ``backoff_factor ** attempt`` plus up to 1s of jitter,
            capped at 60 seconds.
        temperature: Sampling temperature.
        max_tokens: Output token cap. When None it is derived from the prompt
            length if ``token_encoder`` is given (clamped to 256..4096),
            otherwise 1024.
        client: Existing OpenAI client; a new one is created when omitted.
        token_encoder: Object with an ``encode(text)`` method, used only to
            size ``max_tokens``.
        model_context_limit: Context window assumed when sizing ``max_tokens``.
        request_timeout: Per-request timeout in seconds.

    Returns:
        ``(content, usage)`` on success, or ``(None, {})`` when every attempt
        failed or an unexpected error occurred.

    Raises:
        AuthenticationError: immediately, without retrying.
        KeyboardInterrupt: re-raised after logging.
    """
    client = client or OpenAI()
    # per-request timeout (prevents hanging)
    if hasattr(client, "with_options"):
        client = client.with_options(timeout=request_timeout)

    if max_tokens is None and token_encoder is not None:
        prompt_toks = _token_len(prompt, token_encoder)
        max_tokens = max(256, min(4096, model_context_limit - prompt_toks - 1024))
    elif max_tokens is None:
        max_tokens = 1024

    for attempt in range(1, max_retries + 1):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=max_tokens,
            )
            content = resp.choices[0].message.content
            usage = getattr(resp, "usage", {}) or {}
            return content, usage

        except AuthenticationError as e:
            # Must come before the APIError handler: AuthenticationError is an
            # APIError subclass, so the broader clause would otherwise swallow
            # it and retry a request that can never succeed.
            logger.error("Authentication failed: %s", e)
            raise

        except (RateLimitError, APIConnectionError, APITimeoutError, APIError) as e:
            if attempt >= max_retries:
                # Nothing left to retry: do not waste time sleeping.
                logger.warning(
                    "API transient error (attempt %d/%d): %s. No retries left.",
                    attempt, max_retries, str(e)
                )
                break
            wait = min(60, (backoff_factor ** attempt) + random.uniform(0, 1))
            logger.warning(
                "API transient error (attempt %d/%d): %s. Retrying in %.1fs",
                attempt, max_retries, str(e), wait
            )
            time.sleep(wait)
            continue

        except KeyboardInterrupt:
            logger.error("Interrupted by user. Aborting cleanly.")
            raise

        except Exception as e:
            # Unknown failure: retrying blindly is unlikely to help.
            logger.exception("Unexpected error on attempt %d: %s", attempt, e)
            break

    logger.error("Max retries exceeded. Returning (None, {}).")
    return None, {}



# Step 8: Correct Text by Sentence
## Purpose:
To process raw text by correcting punctuation, grammar, and spelling, then create a new field with the corrected text. It also maps the corrected text back to the raw text.




## Actions:
* Import all helpers and loggers.
* Define correct_and_map Function:
 * Parameters: Accepts text (the raw input text).
 * Prompt Creation: Constructs a detailed prompt with instructions for the AI to perform specific tasks on the text: correcting punctuation, grammar, and spelling. It also provides benchmarks for mapping the corrected text to the word-segmented text.
 * API Call: Uses the previously defined call_chatgpt function to send the prompt to the OpenAI API.
 * Response Handling: Extracts JSON from the API response and parses it into a Python dictionary.
 * Error Handling: Catches JSON decoding errors and returns an empty dictionary if parsing fails.
* Mock Text for Testing:
At the end, a mock version of correct_and_map is created to simulate API behavior for testing purposes.

In [None]:
# ===============================================
# STEP 8 & 9 — Final rebuild (titles, dialogue, alignment, boundaries)
# ===============================================

import re, json, math, time, os, io, zipfile, logging, random
import numpy as np
import pandas as pd
from difflib import SequenceMatcher

# -------------------------
# Config & logger
# -------------------------
MODEL_ID   = "gpt-4o"
MAX_TOKENS = 1200
USE_MOCK   = False  # flip to True for offline tests
logger = logging.getLogger(__name__)

try:
    from google.colab import files
    _COLAB = True
except Exception:
    _COLAB = False


# -------------------------
# Utilities
# -------------------------
def _extract_first_json_object(txt: str):
    """Robustly pull the first {...} as JSON."""
    if not txt:
        return None
    start = txt.find("{")
    if start < 0:
        return None
    depth, in_str, esc = 0, False, False
    for i in range(start, len(txt)):
        ch = txt[i]
        if in_str:
            if esc: esc = False
            elif ch == "\\": esc = True
            elif ch == '"': in_str = False
        else:
            if ch == '"': in_str = True
            elif ch == "{": depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    frag = txt[start:i+1]
                    try:
                        return json.loads(frag)
                    except Exception:
                        return None
    return None


# -------------------------
# Mojibake normalization
# -------------------------
_MOJIBAKE_FIXES = [
    (r"â€”", "—"),   # em dash
    (r"â€“", "–"),   # en dash
    (r"â€˜", "‘"), (r"â€™", "’"),  # single quotes
    (r"â€œ", "“"), (r"â€\u009d", "”"), (r"â€\u009D", "”"), (r"â€�", "”"), # double quotes
    (r"â€¦", "…"),   # ellipsis
    (r"Â ", " "),    # stray non-breaking space
]

def normalize_mojibake(s: str) -> str:
    """Replace the mojibake sequences listed in ``_MOJIBAKE_FIXES`` with the intended characters.

    Each entry is applied in order as a regular-expression substitution.
    Non-string values (e.g. None or NaN) are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    out = s
    for pat, repl in _MOJIBAKE_FIXES:
        # re.sub needs the target string as its third argument; without it
        # every call raised TypeError.
        out = re.sub(pat, repl, out)
    return out

def apply_mojibake_normalization(df_texts: pd.DataFrame,
                                 corrected_col: str = "Corrected text (8)") -> pd.DataFrame:
    """Return a copy of ``df_texts`` with ``corrected_col`` passed through normalize_mojibake.

    When the column is absent the copy is returned unchanged.
    """
    result = df_texts.copy()
    if corrected_col not in result.columns:
        return result
    result[corrected_col] = result[corrected_col].map(normalize_mojibake)
    return result


# -------------------------
# Step 8A: Correction (LLM) with narrative & dialogue extraction
# -------------------------
def _mock_correct_and_tag(s: str):
    """Deterministic mock: simple caps + final punct, no tags."""
    t = (s or "").strip()
    m = re.search(r"[A-Za-z]", t)
    if m:
        i = m.start()
        t = t[:i] + t[i].upper() + t[i+1:]
    if t and not re.search(r"[.!?…]\s*$", t):
        t += "."
    return {
        "corrected_text": t,
        "narrative_tags": [],
        "dialogue_spans": []
    }

def correct_text_and_tags(raw: str, client=None, model=MODEL_ID, use_mock=USE_MOCK):
    """Correct one text and extract narrative tags and dialogue spans.

    Args:
        raw: Input text; falsy values are treated as the empty string.
        client: OpenAI client. When None (or when ``use_mock`` is True) the
            deterministic mock is used instead of the API.
        model: Chat model name used for the API call.
        use_mock: Force the offline mock.

    Returns:
        ``(corrected_text, narrative_tags, dialogue_spans, source)`` where
        ``source`` is ``"mock"`` or the model name. If the reply contains no
        JSON object, or no usable ``corrected_text`` string, the original text
        is returned with empty tag and span lists.
    """
    s = str(raw or "")
    if use_mock or client is None:
        js = _mock_correct_and_tag(s)
        return js["corrected_text"], js["narrative_tags"], js["dialogue_spans"], "mock"

    prompt = f"""
You are a meticulous copy-editor. Fix punctuation, grammar, and spelling.
Keep meaning and paragraphing. Use standard English punctuation.

Also detect and return:
- Title(s): a short heading at the very beginning (if present).
- Temporal transitions (e.g., "The next day", "Two weeks later", explicit dates like "March 19, 2032").
- Closure tags (e.g., "THE END", "To be continued").
- Dialogue spans: character start/end offsets (0-based, end-exclusive) for direct speech (quoted speech).

Return ONLY strict JSON:
{{
  "corrected_text": "...",
  "narrative_tags": [
     {{"type":"title"|"temporal"|"closure","text":"...","start":int,"end":int}}, ...
  ],
  "dialogue_spans": [
     {{"start":int,"end":int}}, ...
  ]
}}

Text:
<<<BEGIN>>>
{s}
<<<END>>>
""".strip()

    # `client` is guaranteed to be set here (the mock branch handled None).
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role":"user","content":prompt}],
        temperature=0.0,
        max_tokens=MAX_TOKENS
    )
    out = (resp.choices[0].message.content or "").strip()
    js = _extract_first_json_object(out)
    if not isinstance(js, dict):
        # fallback minimal
        return s, [], [], model

    raw_corrected = js.get("corrected_text")
    if not isinstance(raw_corrected, str) or not raw_corrected.strip():
        # A missing/null/blank correction must not become "" or the literal
        # "None". Keep the original text; offsets in tags and spans would
        # refer to a text we do not have, so drop them as well.
        return s, [], [], model

    corrected = normalize_mojibake(raw_corrected.strip())
    tags = js.get("narrative_tags") or []
    spans = js.get("dialogue_spans") or []
    # sanity types
    if not isinstance(tags, list): tags = []
    if not isinstance(spans, list): spans = []
    return corrected, tags, spans, model


def run_correct_only(
    df_in: pd.DataFrame,
    text_col="Raw text",
    id_col="ID",
    client=None,
    model=MODEL_ID,
    use_mock=USE_MOCK,
    out_col="Corrected text (8)"
) -> pd.DataFrame:
    """Run the correction step over every row of ``df_in``.

    Args:
        df_in: Input frame; it is not modified.
        text_col: Column holding the raw text. Required.
        id_col: Identifier column. Created from the row position when absent,
            otherwise normalised to strings (e.g. "12.0" becomes "12").
        client: OpenAI client forwarded to ``correct_text_and_tags``.
        model: Model name forwarded to ``correct_text_and_tags``.
        use_mock: Forwarded to ``correct_text_and_tags``.
        out_col: Name of the column receiving the corrected text.

    Returns:
        A copy of ``df_in`` with ``out_col``, ``NarrativeTagsJSON`` and
        ``DialogueSpansJSON`` added. Rows whose text is missing or blank get an
        empty correction and empty JSON lists without calling the corrector.

    Raises:
        KeyError: if ``text_col`` is not a column of ``df_in``.
    """
    if text_col not in df_in.columns:
        raise KeyError(f"Missing required column: {text_col}")
    df = df_in.copy()

    # Normalize ID strings
    def _norm_id_series(s: pd.Series) -> pd.Series:
        s = s.astype(str).str.replace(r"\.0$", "", regex=True)
        def _fix(x):
            # IDs containing letters are kept verbatim
            if any(c.isalpha() for c in x):
                return x
            try:
                if "." in x or "e" in x.lower():
                    f = float(x)
                    if f.is_integer():
                        return str(int(f))
            except Exception:
                pass
            return x
        return s.map(_fix)

    if id_col not in df.columns:
        df[id_col] = pd.RangeIndex(len(df)).astype(str)
    else:
        df[id_col] = _norm_id_series(df[id_col])

    # Missing values must become "" here; a plain astype(str) would turn them
    # into the literal strings "nan"/"None", which would then be "corrected".
    text_series = df[text_col]
    raw_texts = text_series.astype(object).where(text_series.notna(), "").astype(str).tolist()

    corr, tags_json, dial_json = [], [], []
    for raw in raw_texts:
        if not raw.strip():
            # Nothing to correct: skip the (possibly paid) corrector call.
            fixed, tags, spans = "", [], []
        else:
            fixed, tags, spans, _src = correct_text_and_tags(raw, client=client, model=model, use_mock=use_mock)
        corr.append(fixed)
        tags_json.append(json.dumps(tags, ensure_ascii=False))
        dial_json.append(json.dumps(spans, ensure_ascii=False))
    df[out_col] = corr
    df["NarrativeTagsJSON"] = tags_json
    df["DialogueSpansJSON"] = dial_json
    return df


# -------------------------
# Step 8B: tokenize + map (with merged-word split)
# -------------------------
_WORD_RX = re.compile(r"\w", flags=re.UNICODE)

def _split_merged_word(tok: str):
    """Split a token once where an ALL-CAPS run meets lowercase text.

    ``"YAAAYwe"`` becomes ``["YAAAY", "we"]``. Empty, non-alphabetic or
    non-matching tokens come back unchanged as a one-element list.
    """
    if tok and tok.isalpha():
        hit = re.match(r"^([A-Z]{2,})([a-z].*)$", tok)
        if hit is not None:
            return list(hit.groups())
    return [tok]

def _simple_tokenize_with_splitting(s: str):
    """Tokenize into word runs and single punctuation characters.

    Word runs are additionally passed through ``_split_merged_word``;
    punctuation tokens are kept as they are. ``None`` is treated as "".
    """
    pieces = []
    for chunk in re.findall(r"\w+|[^\w\s]", s or "", flags=re.UNICODE):
        if re.fullmatch(r"\w+", chunk) is None:
            pieces.append(chunk)
        else:
            pieces += _split_merged_word(chunk)
    return pieces

def _rebuild_offsets_with_splitting(text, tokens):
    """Return a ``(start, end)`` character span in ``text`` for each token.

    Tokens are located left to right, each search starting where the
    previous token ended. Empty/None tokens get a zero-width span at the
    cursor. A token that cannot be found gets a best-effort span of its own
    length after any whitespace at the cursor, so spans never move backwards.
    """
    offsets = []
    cursor = 0
    text_len = len(text)
    for token in tokens:
        if token is None or token == "":
            offsets.append((cursor, cursor))
            continue
        found = text.find(token, cursor)
        if found < 0:
            # Not present: skip whitespace and claim the next len(token) chars.
            found = cursor
            while found < text_len and text[found].isspace():
                found += 1
            stop = min(text_len, found + len(token))
        else:
            stop = found + len(token)
        offsets.append((found, stop))
        cursor = stop
    return offsets

def _is_word(tok: str) -> bool:
    """Return True when ``tok`` is non-empty and contains a word character."""
    if not tok:
        return False
    return _WORD_RX.search(tok) is not None

def build_word_map(raw_text, corr_text):
    """Align raw and corrected tokens and describe every alignment step.

    Both texts are tokenized, character spans are rebuilt for each token,
    and the lower-cased token sequences are diffed with ``SequenceMatcher``.
    One dict is returned per aligned token with the keys ``raw_index``,
    ``raw_token``, ``raw_start``, ``raw_end``, ``corr_index``,
    ``corr_token``, ``corr_start``, ``corr_end``, ``op``, ``equal_ci`` and
    ``error_type`` (in that order). The side that does not exist for an
    insert or delete is filled with ``None``.
    """
    raw_src = raw_text or ""
    corr_src = corr_text or ""
    raw_tokens = _simple_tokenize_with_splitting(raw_src)
    corr_tokens = _simple_tokenize_with_splitting(corr_src)
    raw_spans = _rebuild_offsets_with_splitting(raw_src, raw_tokens)
    corr_spans = _rebuild_offsets_with_splitting(corr_src, corr_tokens)

    def _record(ri, ci, op, equal_ci, error_type):
        # Single place that fixes the key order of every emitted row.
        if ri is None:
            r_tok = r_start = r_end = None
        else:
            r_tok = raw_tokens[ri]
            r_start, r_end = raw_spans[ri]
        if ci is None:
            c_tok = c_start = c_end = None
        else:
            c_tok = corr_tokens[ci]
            c_start, c_end = corr_spans[ci]
        return {
            "raw_index": ri, "raw_token": r_tok, "raw_start": r_start, "raw_end": r_end,
            "corr_index": ci, "corr_token": c_tok, "corr_start": c_start, "corr_end": c_end,
            "op": op, "equal_ci": equal_ci, "error_type": error_type,
        }

    def _deleted(ri):
        kind = "Deletion" if _is_word(raw_tokens[ri]) else "PunctuationDeletion"
        return _record(ri, None, "delete", False, kind)

    def _inserted(ci):
        kind = "Insertion" if _is_word(corr_tokens[ci]) else "PunctuationInsertion"
        return _record(None, ci, "insert", False, kind)

    matcher = SequenceMatcher(a=[t.lower() for t in raw_tokens],
                              b=[t.lower() for t in corr_tokens],
                              autojunk=False)

    rows = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            # Tokens match ignoring case; equal_ci records the exact comparison.
            for ri, ci in zip(range(i1, i2), range(j1, j2)):
                exact = raw_tokens[ri] == corr_tokens[ci]
                rows.append(_record(ri, ci, "equal", exact, "Equal"))

        elif tag == "replace":
            # Pair tokens one-to-one, then spill the longer side over as
            # deletions (raw surplus) and insertions (corrected surplus).
            paired = min(i2 - i1, j2 - j1)
            for k in range(paired):
                ri, ci = i1 + k, j1 + k
                r_tok, c_tok = raw_tokens[ri], corr_tokens[ci]
                same_ci = r_tok.lower() == c_tok.lower()
                if (not same_ci) and r_tok.isalpha() and c_tok.isalpha():
                    kind = "Spelling"
                else:
                    kind = "Replacement"
                rows.append(_record(ri, ci, "replace", same_ci, kind))
            rows.extend(_deleted(ri) for ri in range(i1 + paired, i2))
            rows.extend(_inserted(ci) for ci in range(j1 + paired, j2))

        elif tag == "delete":
            rows.extend(_deleted(ri) for ri in range(i1, i2))

        elif tag == "insert":
            rows.extend(_inserted(ci) for ci in range(j1, j2))

    return rows

def run_mapping_only(df_with_corr,
                     id_col="ID",
                     raw_col="Raw text",
                     corr_col="Corrected text (8)"):
    """Build the token-level raw/corrected map for every document.

    Args:
        df_with_corr: Frame holding the raw and corrected text columns.
        id_col: Identifier column; created from the row position if absent
            and always cast to ``str``.
        raw_col: Column with the raw text.
        corr_col: Column with the corrected text.

    Returns:
        ``(map_df, texts_out)``. ``map_df`` has one row per aligned token
        (the ``build_word_map`` fields plus ``RowID``, ``DocOrder`` and
        ``Changed``); ``texts_out`` is a copy of the input with the
        normalised ID column. A document that produces no tokens gets a
        single placeholder row with ``op == "empty"``.

    Raises:
        KeyError: If either text column is missing.
    """
    if raw_col not in df_with_corr.columns or corr_col not in df_with_corr.columns:
        raise KeyError(f"Need both {raw_col} and {corr_col} present")

    df = df_with_corr.copy()
    if id_col not in df.columns:
        df[id_col] = pd.RangeIndex(len(df)).astype(str)
    df[id_col] = df[id_col].astype(str)

    def _as_text(series):
        # Missing cells become "" instead of the literal tokens "nan"/"None",
        # so they fall through to the EmptyText placeholder below.
        return series.astype(object).where(series.notna(), "").astype(str)

    ids = df[id_col].tolist()
    raws = _as_text(df[raw_col]).tolist()
    cors = _as_text(df[corr_col]).tolist()

    all_rows = []
    for order, (rid, raw, cor) in enumerate(zip(ids, raws, cors)):
        rows = build_word_map(raw, cor)
        if not rows:
            rows = [{
                "raw_index": np.nan, "raw_token": None, "raw_start": np.nan, "raw_end": np.nan,
                "corr_index": np.nan, "corr_token": None, "corr_start": np.nan, "corr_end": np.nan,
                "op": "empty", "equal_ci": False, "error_type": "EmptyText"
            }]
        for r in rows:
            rec = {"RowID": rid, "DocOrder": order, **r}
            rec["Changed"] = (r.get("op") != "equal")
            all_rows.append(rec)

    map_df = pd.DataFrame(all_rows)
    texts_out = df.copy()
    return map_df, texts_out


# -------------------------
# Alignment repair: pair delete+insert → replace
# -------------------------
def _jw_ratio(a: str, b: str) -> float:
    """Return the difflib similarity ratio of ``a`` and ``b`` (0.0 to 1.0).

    Despite the name, this is ``SequenceMatcher.ratio()``, not Jaro-Winkler.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def _is_alpha_word(s: str) -> bool:
    """Return True for a non-empty string made only of alphabetic characters."""
    return s.isalpha() if s else False

def postprocess_pair_deletions_and_insertions(df_map: pd.DataFrame,
                                              max_window: int = 6,
                                              min_sim: float = 0.55) -> pd.DataFrame:
    """Merge nearby delete/insert rows of similar words into one replace row.

    The differ sometimes reports a misspelt word as a deletion plus a
    separate insertion. Within each document, every alphabetic ``delete``
    row is matched with the most similar unclaimed alphabetic ``insert`` row
    that lies at most ``max_window`` rows away (ties go to the nearer row).
    The delete row becomes ``op == "replace"`` carrying the insert row's
    ``corr_*`` fields, and the insert row is removed.

    Documents are identified by the ``ID`` column, or ``RowID`` when ``ID``
    is absent (the shape produced by ``run_mapping_only``); with neither,
    the whole frame is treated as one document. Distances are measured in
    the incoming row order, which is the alignment order, because deleted
    rows have no ``corr_index`` to sort by.

    Args:
        df_map: Token map with at least ``op``, ``raw_token``, ``corr_token``.
        max_window: Largest row distance between a paired delete and insert.
        min_sim: Minimum ``_jw_ratio`` similarity (case-insensitive).

    Returns:
        A new frame in the incoming row order with merged insert rows
        dropped; index labels of surviving rows are preserved. An empty
        input is returned as is.
    """
    if df_map.empty:
        return df_map

    df = df_map.copy()

    def _text(value):
        # None and NaN both mean "no token"; str(NaN) would look like a word.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    ops = df["op"].tolist()
    raw_toks = [_text(v) for v in df["raw_token"].tolist()]
    corr_toks = [_text(v) for v in df["corr_token"].tolist()]

    doc_col = next((c for c in ("ID", "RowID") if c in df.columns), None)
    doc_keys = df[doc_col].astype(str).tolist() if doc_col else [""] * len(df)
    rows_by_doc = {}
    for pos, key in enumerate(doc_keys):
        rows_by_doc.setdefault(key, []).append(pos)

    pairs = {}  # positional index of delete row -> positional index of insert row
    for positions in rows_by_doc.values():
        deletes = [(rank, pos) for rank, pos in enumerate(positions)
                   if ops[pos] == "delete" and _is_alpha_word(raw_toks[pos])]
        inserts = [(rank, pos) for rank, pos in enumerate(positions)
                   if ops[pos] == "insert" and _is_alpha_word(corr_toks[pos])]
        if not deletes or not inserts:
            continue

        claimed = set()
        for d_rank, d_pos in deletes:
            raw_low = raw_toks[d_pos].lower()
            best_pos, best_sim, best_dist = None, 0.0, 0
            for i_rank, i_pos in inserts:
                if i_pos in claimed:
                    continue
                dist = abs(i_rank - d_rank)
                if dist > max_window:
                    continue
                sim = _jw_ratio(raw_low, corr_toks[i_pos].lower())
                if sim < min_sim:
                    continue
                if (best_pos is None or sim > best_sim
                        or (sim == best_sim and dist < best_dist)):
                    best_pos, best_sim, best_dist = i_pos, sim, dist
            if best_pos is not None:
                claimed.add(best_pos)
                pairs[d_pos] = best_pos

    if not pairs:
        return df

    d_positions = list(pairs)
    i_positions = [pairs[d] for d in d_positions]

    def _write(col, values):
        # Positional writes stay correct even if index labels repeat.
        if col not in df.columns:
            df[col] = None
        df.iloc[d_positions, df.columns.get_loc(col)] = values

    for col in ("corr_index", "corr_token", "corr_start", "corr_end"):
        if col in df.columns:
            _write(col, df.iloc[i_positions, df.columns.get_loc(col)].to_numpy())
    _write("op", "replace")
    _write("equal_ci", [raw_toks[d].lower() == corr_toks[i].lower()
                        for d, i in zip(d_positions, i_positions)])
    _write("error_type", "Replacement")

    keep = np.ones(len(df), dtype=bool)
    keep[i_positions] = False
    return df[keep]


# -------------------------
# Step 8C: CorrSentenceID (robust endings + quotes/ellipsis)
# -------------------------
# Lower-case abbreviations (each written with its trailing dot); a "." whose
# preceding token is one of these is not treated as a sentence end by
# _is_terminal_token.
ABBREV = {
    "mr.","mrs.","ms.","dr.","prof.","sr.","jr.","st.","vs.","etc.",
    "e.g.","i.e.","cf.","fig.","ex.","no.","approx.","circa.","ca.",
    "dept.","est.","misc.","rev.","jan.","feb.","mar.","apr.","jun.",
    "jul.","aug.","sep.","sept.","oct.","nov.","dec."
}
# Sentence-ending punctuation tokens.
TERMINALS = {".", "!", "?", "…", "...", "?!", "!?"}
# Closing brackets/quotes that stay attached to a sentence that just ended.
CLOSERS   = {")", "]", "}", "”", "’", "»"}
# Opening brackets/quotes that start the next sentence.
OPENERS   = {"(", "[", "{", "“", "‘", "«"}

# Token shapes consulted by _is_terminal_token:
RE_INITIAL       = re.compile(r"^[A-Z]\.$")            # single initial, "J."
RE_INITIAL_PAIR  = re.compile(r"^[A-Z]\.[A-Z]\.$")     # two initials, "J.K."
RE_NUM_WITH_DOT  = re.compile(r"^\d+\.$")              # digits then dot, "3."
RE_SECTION_NUM   = re.compile(r"^\d+(?:\.\d+){1,3}$")  # dotted number, "1.2.3"
RE_DOT_TAIL      = re.compile(r"^\.\d+$")              # dot then digits, ".5"
RE_ELLIPSIS      = re.compile(r"^\.\.\.$")             # exactly three dots
RE_ALPHA_PAREN   = re.compile(r"^[A-Za-z]\)$")         # list marker, "a)"

def _tok(x):
    """Return ``x`` as a string, mapping None/NaN/NA to the empty string."""
    if x is None or pd.isna(x):
        return ""
    return str(x)

def _is_ellipsis_triplet(i, toks):
    """Return True when ``toks[i]``, ``toks[i+1]`` and ``toks[i+2]`` are all "."."""
    if i + 2 >= len(toks):
        return False
    return all(toks[i + k] == "." for k in range(3))

def _is_terminal_token(tok: str, prev_tok: str, next_tok: str) -> bool:
    """Decide whether ``tok`` ends a sentence, given its neighbours.

    ``!``, ``?``, ``?!``, ``!?``, ``…`` and ``...`` always end a sentence.
    A lone ``.`` ends one unless the neighbours mark it as part of an
    abbreviation, an initial, a section/list number or a decimal. Every
    other token (closing brackets included) is never terminal.

    ``None`` is accepted for any argument and treated as an empty token.
    """
    # NOTE(review): _simple_tokenize_with_splitting emits "." as its own
    # token, so prev_tok normally arrives without a trailing dot while ABBREV
    # and RE_INITIAL* expect one ("mr.", "J."). Confirm the intended token
    # shape before relying on those two exceptions.
    t = (tok or "").strip()
    if not t:
        return False
    if t in {"…", "...", "?!", "!?", "!", "?"}:
        return True
    if t != ".":
        return False

    p = (prev_tok or "").strip()
    n = (next_tok or "").strip()
    if p.lower() in ABBREV:
        return False
    if RE_INITIAL.fullmatch(p) or RE_INITIAL_PAIR.fullmatch(p):
        return False
    if RE_SECTION_NUM.fullmatch(p):
        return False
    # "3." followed by text or an opening mark is a list number, not an end.
    if RE_NUM_WITH_DOT.fullmatch(p) and (n and re.match(r"[A-Za-z(“\"'\[]", n)):
        return False
    # A dot in front of digits is a decimal point.
    if RE_DOT_TAIL.fullmatch(n):
        return False
    if n.isdigit():
        return False
    return True

def _likely_ascii_opening(prev_tok: str, next_tok: str) -> bool:
    """Guess whether a straight double quote opens (True) or closes a quotation.

    It is taken as opening when nothing, a terminal or an opener precedes
    it, or when it is followed by a token that is neither a terminal nor a
    closer.
    """
    before = (prev_tok or "").strip()
    if before == "" or before in TERMINALS or before in OPENERS:
        return True
    after = (next_tok or "").strip()
    return bool(after) and after not in TERMINALS and after not in CLOSERS

def _sentence_ids_for_tokens(toks):
    """Return a 0-based sentence number for each token of one document."""
    sids = []
    sent_id = 0
    pending_end = False  # True once a terminal was seen; next content token starts a new sentence
    i, n = 0, len(toks)
    while i < n:
        tok = toks[i]
        prev_tok = toks[i - 1] if i > 0 else ""
        next_tok = toks[i + 1] if i + 1 < n else ""

        if _is_ellipsis_triplet(i, toks):
            # Keep all three dots of "..." together in the current sentence.
            sids.extend([sent_id] * 3)
            pending_end = True
            i += 3
            continue

        if pending_end:
            closes = tok in CLOSERS or (tok == '"' and not _likely_ascii_opening(prev_tok, next_tok))
            # Closers and stacked terminals ("?" then "!") belong to the
            # sentence that just ended.
            if closes or tok in TERMINALS:
                sids.append(sent_id)
                i += 1
                continue
            sent_id += 1
            pending_end = False

        sids.append(sent_id)
        i += 1
        if _is_terminal_token(tok, prev_tok, next_tok):
            pending_end = True
    return sids


def assign_corr_sentence_ids(df_map: pd.DataFrame) -> pd.DataFrame:
    """Add ``CorrSentenceID`` (0-based, per document) to the token map.

    ``ID`` is taken from ``RowID`` when present, otherwise from an existing
    ``ID`` column, otherwise from the index. Within each document the
    tokens are walked in ``corr_index`` order (rows without one go last, in
    their existing order) and split into sentences on terminal punctuation,
    using ``corr_token`` when that column exists and ``raw_token`` otherwise.

    Groups are walked explicitly instead of through ``groupby.apply``, so a
    frame holding a single document works the same as one with many.

    Args:
        df_map: Token map; must contain ``corr_token`` or ``raw_token``.

    Returns:
        A copy with ``ID``, ``CorrSentenceID`` (``Int64``) and, when
        ``corr_index`` exists, ``corr_index_orig`` added.
    """
    df = df_map.copy()
    if "RowID" in df.columns:
        df["ID"] = df["RowID"].astype(str)
    elif "ID" not in df.columns:
        df["ID"] = df.index.astype(str)

    has_ci = "corr_index" in df.columns
    if has_ci and "corr_index_orig" not in df.columns:
        df["corr_index_orig"] = df["corr_index"]

    tok_col = "corr_token" if "corr_token" in df.columns else "raw_token"
    all_toks = [_tok(v).strip() for v in df[tok_col].tolist()]

    if has_ci:
        order_key = pd.to_numeric(df["corr_index"], errors="coerce").to_numpy(dtype=float)
        # Rows with no corrected position (deletions) sort after real tokens.
        order_key = np.where(np.isnan(order_key), np.inf, order_key)
    else:
        order_key = np.arange(len(df), dtype=float)

    out = np.full(len(df), np.nan)
    for positions in df.groupby("ID", sort=False).indices.values():
        positions = np.asarray(positions)
        # Stable sort keeps the incoming order among equal keys.
        ordered = positions[np.argsort(order_key[positions], kind="stable")]
        out[ordered] = _sentence_ids_for_tokens([all_toks[p] for p in ordered])

    df["CorrSentenceID"] = pd.Series(out, index=df.index).astype("Int64")
    return df


# -------------------------
# Step 8D: Title isolation using LLM spans (no extra word)
#           + renumber: use s000 only if a title exists
# -------------------------
def _parse_json_list(s):
    """Return ``s`` as a list: parse it if it is a JSON string, else use it as is.

    Anything that fails to parse or is not a list yields ``[]``.
    """
    try:
        if isinstance(s, str):
            parsed = json.loads(s)
        else:
            parsed = s or []
    except Exception:
        return []
    if isinstance(parsed, list):
        return parsed
    return []

def mark_title_tokens(df_map: pd.DataFrame, df_texts_with_tags: pd.DataFrame) -> pd.DataFrame:
    """Flag title tokens from the narrative tags and renumber sentences.

    For every document, narrative tags of type ``"title"`` with integer
    ``start``/``end`` offsets that begin near the start of the corrected
    text are collected. Tokens whose ``[corr_start, corr_end]`` lies inside
    such a span get ``TITLE = True`` and ``Sentence Boundaries = "Title"``.
    No other heuristic is applied: a document without a usable title span
    simply has no title tokens.

    Sentence numbers are then shifted so that ``0`` is reserved for titles:
    title tokens are placed in sentence 0 and every other numbered token is
    bumped by one. ``SentenceRef`` is rebuilt as ``"<ID>_s<NNN>"``.

    Args:
        df_map: Token map with ``ID``, ``corr_start``, ``corr_end`` and
            ``CorrSentenceID``.
        df_texts_with_tags: Per-document frame with ``ID``,
            ``Corrected text (8)`` and ``NarrativeTagsJSON``.

    Returns:
        A copy of ``df_map`` in the incoming row order with a fresh
        0..n-1 index.
    """
    df = df_map.copy().reset_index(drop=True)

    # Collect the usable title spans per document ID.
    title_spans = {}
    for _id, corr_text, tags_s in zip(df_texts_with_tags["ID"].astype(str),
                                      df_texts_with_tags["Corrected text (8)"].astype(str),
                                      df_texts_with_tags["NarrativeTagsJSON"].astype(str)):
        tags = _parse_json_list(tags_s)
        spans = [(t.get("start", -1), t.get("end", -1))
                 for t in tags if isinstance(t, dict) and t.get("type") == "title"
                 and isinstance(t.get("start"), int) and isinstance(t.get("end"), int)]
        if spans:
            # Only consider titles that start near the beginning.
            limit = max(60, len(corr_text) // 3)
            spans = [sp for sp in spans if 0 <= sp[0] < limit]
        title_spans[str(_id)] = spans

    if "Sentence Boundaries" not in df.columns:
        df["Sentence Boundaries"] = ""
    if "TITLE" not in df.columns:
        df["TITLE"] = False

    # Exact span marking on corr_start/corr_end. Plain arrays are used so the
    # document ID stays available for every row.
    doc_ids = df["ID"].astype(str).to_numpy()
    starts = pd.to_numeric(df["corr_start"], errors="coerce").to_numpy(dtype=float)
    ends = pd.to_numeric(df["corr_end"], errors="coerce").to_numpy(dtype=float)
    in_title = np.zeros(len(df), dtype=bool)
    with np.errstate(invalid="ignore"):  # NaN offsets compare as False
        for gid, spans in title_spans.items():
            if not spans:
                continue
            in_doc = doc_ids == gid
            if not in_doc.any():
                continue
            for s0, s1 in spans:
                in_title |= in_doc & (starts >= s0) & (ends <= s1)
    if in_title.any():
        df.loc[in_title, "TITLE"] = True
        df.loc[in_title, "Sentence Boundaries"] = "Title"

    # Renumber: sentence 0 is reserved for titles, everything else moves up.
    title_rows = df["TITLE"].eq(True).to_numpy(dtype=bool)
    sids = df["CorrSentenceID"].astype("Int64") + 1  # missing IDs stay missing
    sids.loc[title_rows] = 0
    df["CorrSentenceID"] = sids

    def _sid3(x):
        try:
            return f"{int(x):03d}"
        except (TypeError, ValueError):
            return "000"

    df["SentenceRef"] = df["ID"].astype(str) + "_s" + df["CorrSentenceID"].map(_sid3)

    return df


# -------------------------
# Step 8E: Boundary flags (case-pattern; skip inserted punct)
# -------------------------
# Tokens add_sentence_boundary_flags accepts as a sentence-ending mark.
TERMINALS_HARD = {".","!","?","…","...","?!","!?"}
# Opening quotes/brackets skipped when looking for a sentence's first word.
OPENING_PUNCT  = {'"', "“", "‘", "«", "(", "[", "{"}

def _first_alpha_case(s: str):
    """Return True/False for an upper/lower-case first ASCII letter, None if none."""
    hit = re.search(r"[A-Za-z]", s or "")
    return hit.group(0).isupper() if hit else None

def _is_wordish(tok: str) -> bool:
    """Return True when ``tok`` is non-empty and contains a word character."""
    if not tok:
        return False
    return re.search(r"\w", tok) is not None

def add_sentence_boundary_flags(df_map: pd.DataFrame) -> pd.DataFrame:
    """Label the first word and the last terminal of every sentence.

    Rows are sorted by ``ID``, ``CorrSentenceID`` and corrected position.
    For each sentence that contains no title token:

    * the first word-like corrected token (opening punctuation skipped,
      non-inserted tokens preferred) gets ``"Sentence Beginning"`` plus a
      ``BoundaryCheck`` of Correct/Incorrect/Unknown Beginning, depending on
      whether raw and corrected tokens agree on the case of their first
      letter;
    * the last token found in ``TERMINALS_HARD`` gets ``"Sentence Ending"``
      plus a ``BoundaryCheck`` ending label.

    Sentences containing a title token are labelled ``"Title"`` on every
    row. ``SentenceRef`` is rebuilt at the end.

    Args:
        df_map: Token map with ``ID``, ``CorrSentenceID`` and ``corr_token``.

    Returns:
        A sorted copy with ``Sentence Boundaries``, ``BoundaryCheck`` and
        ``SentenceRef`` filled in.
    """
    df = df_map.copy()
    for col in ("Sentence Boundaries", "BoundaryCheck"):
        if col not in df.columns:
            df[col] = ""

    if "corr_index" not in df.columns:
        df["corr_index"] = np.nan
    df["_rowpos"] = np.arange(len(df))
    df["_sort_corr"] = pd.to_numeric(df["corr_index"], errors="coerce").fillna(1e12) + (df["_rowpos"]*1e-9)
    df = df.sort_values(["ID","CorrSentenceID","_sort_corr"], kind="mergesort")

    has_op = "op" in df.columns
    has_raw = "raw_token" in df.columns

    def _append_label(row, col, label):
        # Labels accumulate in one cell, separated by " | ".
        current = _tok(df.at[row, col])
        df.at[row, col] = current + (" | " if current else "") + label

    def _first_content_row(g: pd.DataFrame):
        # Deleted rows have no corrected token; _tok maps them to "" so they
        # are skipped rather than read as the word "None".
        first_wordish = None
        for idx in g.index:
            tok = _tok(g.at[idx, "corr_token"])
            if tok in OPENING_PUNCT or not _is_wordish(tok):
                continue
            if not has_op or g.at[idx, "op"] != "insert":
                return idx
            if first_wordish is None:
                first_wordish = idx  # fallback when every word was inserted
        return first_wordish

    def _last_terminal_row(g: pd.DataFrame):
        for idx in reversed(list(g.index)):
            if _tok(g.at[idx, "corr_token"]) in TERMINALS_HARD:
                return idx
        return None

    for (id_, sid), g in df.groupby(["ID","CorrSentenceID"], sort=False):
        if "TITLE" in g.columns and g["TITLE"].any():
            df.loc[g.index, "Sentence Boundaries"] = "Title"
            continue

        g = g.sort_values("_sort_corr", kind="mergesort")
        b = _first_content_row(g)
        e = _last_terminal_row(g)

        if b is not None and _tok(df.at[b, "Sentence Boundaries"]).strip() != "Title":
            _append_label(b, "Sentence Boundaries", "Sentence Beginning")
            raw_tok = _tok(df.at[b, "raw_token"]) if has_raw else ""
            corr_tok = _tok(df.at[b, "corr_token"])
            ra = _first_alpha_case(raw_tok)
            ca = _first_alpha_case(corr_tok)
            if ra is None or ca is None:
                tag = "Unknown Beginning"
            elif ra == ca:
                tag = "Correct Beginning"
            else:
                tag = "Incorrect Beginning"
            _append_label(b, "BoundaryCheck", tag)

        if e is not None and _tok(df.at[e, "Sentence Boundaries"]).strip() != "Title":
            _append_label(e, "Sentence Boundaries", "Sentence Ending")
            ce_tok = _tok(df.at[e, "corr_token"])
            # NOTE(review): `e` is only ever a row whose token is in
            # TERMINALS_HARD, so this always yields "Correct Ending".
            tag = "Correct Ending" if (ce_tok in TERMINALS_HARD) else "Incorrect Ending"
            _append_label(e, "BoundaryCheck", tag)

    def _sid3(x):
        try:
            return f"{int(x):03d}"
        except (TypeError, ValueError):
            return "000"

    df["SentenceRef"] = df["ID"].astype(str) + "_s" + df["CorrSentenceID"].map(_sid3)

    df.drop(columns=["_rowpos","_sort_corr"], inplace=True, errors="ignore")
    return df


# -------------------------
# Detokenizer (for Step 9)
# -------------------------
# Tokens that attach to the text before them (no space in front).
NO_SPACE_BEFORE = set(".,;:!?)]}\"'»”’…")
# Tokens that the following token attaches to (no space behind).
NO_SPACE_AFTER  = set("([{\"'«“‘")

def _detok(tokens):
    """Join tokens back into readable text.

    ``None`` and NaN entries are skipped. A token listed in
    ``NO_SPACE_BEFORE`` (or a literal ``...``) is glued to the text before
    it, and the token after one listed in ``NO_SPACE_AFTER`` is glued to it.
    Straight double quotes are in both tables, so they are resolved by
    alternation: the first one opens (space before, none after), the next
    one closes, and so on. A straight single quote is glued on both sides,
    which rebuilds contractions such as ``don't``.

    Args:
        tokens: Iterable of tokens; each is converted with ``str``.

    Returns:
        The joined, stripped text.
    """
    pieces = []
    glue_next = False   # previous token forbids a space after itself
    dq_open = False     # inside a straight-double-quote pair
    for t in tokens:
        if t is None or (isinstance(t, float) and math.isnan(t)):
            continue
        t = str(t)

        opening_dq = False
        if t == '"':
            opening_dq = not dq_open
            dq_open = opening_dq

        if not pieces or glue_next:
            sep = ""
        elif t == '"':
            sep = " " if opening_dq else ""
        elif t in NO_SPACE_BEFORE or re.fullmatch(r"[.]{3}", t):
            sep = ""
        else:
            sep = " "
        pieces.append(sep + t)

        # Decide from the token itself, never from the accumulated chunk.
        glue_next = opening_dq if t == '"' else (t in NO_SPACE_AFTER)
    return "".join(pieces).strip()


# -------------------------
# Public Step 8
# -------------------------
def run_step8(df_preprocessed: pd.DataFrame,
              raw_col="Raw text",
              id_col="ID",
              client=None,
              model=MODEL_ID,
              use_mock=USE_MOCK):
    """Run Step 8 end to end: correct, map, repair, segment, mark, flag.

    Args:
        df_preprocessed: Input frame containing ``raw_col``.
        raw_col: Column with the raw text.
        id_col: Document identifier column (created when missing).
        client, model, use_mock: Correction backend settings; ``client`` is
            dropped when ``use_mock`` is true.

    Returns:
        ``(df_texts, df_map)``: the per-document frame with corrected text
        and tag JSON, and the token-level map with sentence IDs, title
        marks and boundary flags. ``df_texts`` always carries a string
        ``ID`` column, whatever ``id_col`` is called.
    """
    corr_col = "Corrected text (8)"

    # 8A: correction + tags
    df_corr = run_correct_only(
        df_preprocessed,
        text_col=raw_col,
        id_col=id_col,
        client=None if use_mock else client,
        model=model,
        use_mock=use_mock,
        out_col=corr_col
    )
    # extra safety: normalize mojibake
    df_corr = apply_mojibake_normalization(df_corr, corrected_col=corr_col)

    # 8B: token map
    df_map, df_texts = run_mapping_only(
        df_corr, id_col=id_col, raw_col=raw_col, corr_col=corr_col
    )

    # Repair alignment (YAAAY→Yay, lest→Let)
    df_map = postprocess_pair_deletions_and_insertions(df_map, max_window=6, min_sim=0.55)

    # 8C: sentence IDs
    df_map = assign_corr_sentence_ids(df_map)

    # 8D: title marking using exact spans + renumbering rule for s000.
    # The later steps look documents up by "ID", so derive it from id_col
    # instead of assuming the caller's column already has that name.
    df_texts["ID"] = df_texts[id_col].astype(str)
    df_map = mark_title_tokens(df_map, df_texts_with_tags=df_texts)

    # 8E: boundaries with case-pattern correctness
    df_map = add_sentence_boundary_flags(df_map)

    return df_texts, df_map


# -------------------------
# Step 9: Sentence table (exclude titles) + project dialogue/temporal/closure
# -------------------------
def _overlap(a0, a1, b0, b1):
    """Return True when ``[a0, a1]`` and ``[b0, b1]`` share more than a point.

    Intervals whose end precedes their start (or that hold NaN) never
    overlap anything.
    """
    if not (a1 >= a0 and b1 >= b0):
        return False
    shared = min(a1, b1) - max(a0, b0)
    return max(0, shared) > 0

def run_step9(df_map: pd.DataFrame, df_texts_with_tags: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the token map into one row per sentence.

    Title tokens are excluded. For every remaining (ID, CorrSentenceID)
    group the corrected and raw sentence texts are rebuilt with ``_detok``,
    edit operations are counted, and the boundary labels are turned into
    ``CorrectBeginning`` / ``CorrectEnding`` (1, 0 or NaN). Each sentence is
    then flagged ``IsDialogue`` / ``HasTemporal`` / ``HasClosure`` when a
    dialogue span or a narrative tag of that type overlaps the sentence's
    corrected character range.

    Args:
        df_map: Token map; must contain every column listed in ``need``.
        df_texts_with_tags: Per-document frame with ``ID``,
            ``Corrected text (8)``, ``NarrativeTagsJSON`` and
            ``DialogueSpansJSON``.

    Returns:
        The sentence-level frame.

    Raises:
        KeyError: If ``df_map`` lacks a required column.
    """
    need = {"ID","CorrSentenceID","corr_token","Sentence Boundaries","BoundaryCheck","SentenceRef","TITLE","corr_start","corr_end"}
    miss = need - set(df_map.columns)
    if miss:
        raise KeyError(f"df_map missing columns needed for Step 9: {miss}")

    # sort stable
    sort_cols = ["ID","CorrSentenceID"]
    if "corr_index" in df_map.columns: sort_cols.append("corr_index")
    wm = df_map.sort_values(sort_cols, kind="mergesort").copy()

    # Exclude titles from aggregation
    core = wm[~wm["TITLE"].astype(bool)].copy()

    # build sentence spans (min/max corr char)
    sent_spans = (
        core.groupby(["ID","CorrSentenceID"], as_index=False, sort=False)
            .agg(CorrStartMin=("corr_start","min"),
                 CorrEndMax=("corr_end","max"))
    )

    # sentence text and metrics
    def _summarize_sentence(g: pd.DataFrame) -> pd.Series:
        # One output row for the tokens of a single sentence.
        corr_tokens = g["corr_token"].tolist()
        raw_tokens  = [x for x in g["raw_token"].tolist() if not pd.isna(x)] if "raw_token" in g.columns else []
        corr_text   = _detok(corr_tokens)
        raw_text    = _detok(raw_tokens) if raw_tokens else ""

        b_rows = g[g["Sentence Boundaries"].str.contains("Sentence Beginning", na=False)]
        e_rows = g[g["Sentence Boundaries"].str.contains("Sentence Ending",   na=False)]

        # 1 = correct, 0 = incorrect, NaN = no label found
        begin_ok = np.nan
        end_ok   = np.nan
        if not b_rows.empty:
            chk = " | ".join(b_rows["BoundaryCheck"].dropna().astype(str))
            begin_ok = 1 if "Correct Beginning" in chk else (0 if "Incorrect Beginning" in chk else np.nan)
        if not e_rows.empty:
            chk = " | ".join(e_rows["BoundaryCheck"].dropna().astype(str))
            end_ok = 1 if "Correct Ending" in chk else (0 if "Incorrect Ending" in chk else np.nan)

        ops = g["op"] if "op" in g.columns else pd.Series([], dtype=object)
        return pd.Series({
            "SentenceRef": g["SentenceRef"].iloc[0],
            "CorrectedSentence": corr_text,
            "RawSentence": raw_text,
            "TokensInSentence": int(len(g)),
            "EditsInSentence": int((ops != "equal").sum()) if not ops.empty else np.nan,
            "EqualsInSentence": int((ops == "equal").sum()) if not ops.empty else np.nan,
            "Insertions": int((ops == "insert").sum()) if not ops.empty else np.nan,
            "Deletions": int((ops == "delete").sum()) if not ops.empty else np.nan,
            "Replacements": int((ops == "replace").sum()) if not ops.empty else np.nan,
            "BeginBoundaryRow": (b_rows.index[0] if not b_rows.empty else np.nan),
            "EndBoundaryRow":   (e_rows.index[0] if not e_rows.empty else np.nan),
            "CorrectBeginning": begin_ok,
            "CorrectEnding":    end_ok,
        })

    sent_df = (
        core.groupby(["ID","CorrSentenceID"], as_index=False, sort=False)
            .apply(_summarize_sentence, include_groups=False)
            .reset_index(drop=True)
            .sort_values(["ID","SentenceRef"], kind="mergesort")
    )

    # Project dialogue + temporal + closure flags from df_texts_with_tags (per ID)
    # We'll mark a sentence as dialogue if any dialogue span overlaps its [CorrStartMin, CorrEndMax)
    df_texts = df_texts_with_tags[["ID","Corrected text (8)","NarrativeTagsJSON","DialogueSpansJSON"]].copy()
    df_texts["ID"] = df_texts["ID"].astype(str)

    sent_df = sent_df.merge(sent_spans, on=["ID","CorrSentenceID"], how="left")
    sent_df = sent_df.merge(df_texts, on="ID", how="left")

    def _flags(row):
        # Span entries that are malformed (not dicts, non-numeric offsets)
        # are skipped by the except clauses below.
        # dialogue
        dlg = _parse_json_list(row.get("DialogueSpansJSON","[]"))
        s0, s1 = row["CorrStartMin"], row["CorrEndMax"]
        has_dialogue = False
        for d in dlg:
            try:
                ds, de = int(d.get("start", -1)), int(d.get("end", -1))
                if ds >= 0 and de >= 0 and _overlap(s0, s1, ds, de):
                    has_dialogue = True; break
            except Exception:
                pass

        # temporal + closure
        tags = _parse_json_list(row.get("NarrativeTagsJSON","[]"))
        has_temporal = False
        has_closure  = False
        for t in tags:
            try:
                if t.get("type") == "temporal":
                    ts, te = int(t.get("start",-1)), int(t.get("end",-1))
                    if ts >= 0 and te >= 0 and _overlap(s0, s1, ts, te):
                        has_temporal = True
                elif t.get("type") == "closure":
                    ts, te = int(t.get("start",-1)), int(t.get("end",-1))
                    if ts >= 0 and te >= 0 and _overlap(s0, s1, ts, te):
                        has_closure = True
            except Exception:
                pass

        return pd.Series({
            "IsDialogue": bool(has_dialogue),
            "HasTemporal": bool(has_temporal),
            "HasClosure": bool(has_closure),
        })

    sent_df = pd.concat([sent_df, sent_df.apply(_flags, axis=1)], axis=1)

    # clean columns
    drop_cols = ["CorrStartMin","CorrEndMax","Corrected text (8)","NarrativeTagsJSON","DialogueSpansJSON"]
    for c in drop_cols:
        if c in sent_df.columns:
            sent_df.drop(columns=[c], inplace=True)

    return sent_df


# -------------------------
# Save + Download wrapper
# -------------------------
def _ensure_dir(path):
    """Create ``path`` (and any missing parents) if needed and return it."""
    os.makedirs(name=path, exist_ok=True)
    return path

def save_and_download_step8_9(
    df_preprocessed: pd.DataFrame,
    *,
    raw_col="Raw text",
    id_col="ID",
    client=None,              # from Step 6
    model="gpt-4o",
    use_mock=False,           # True for offline test
    out_dir="/content"
):
    """Run Steps 8 and 9, write the three CSVs plus a zip, and offer downloads.

    Files are written to ``out_dir`` (created when missing) with a shared
    timestamp in their names. When running in Colab the four files are
    also handed to ``files.download``; a failure there is only printed.

    Returns:
        A dict with the four output paths and the three result frames.
    """
    stamp = time.strftime("%Y%m%d_%H%M%S")
    _ensure_dir(out_dir)

    # Step 8, then Step 9 on its outputs
    df_texts_8, df_map_8 = run_step8(
        df_preprocessed,
        raw_col=raw_col,
        id_col=id_col,
        client=None if use_mock else client,
        model=model,
        use_mock=use_mock
    )
    sent_df = run_step9(df_map_8, df_texts_with_tags=df_texts_8)

    # Output locations
    p_texts = os.path.join(out_dir, f"step8_texts_{stamp}.csv")
    p_map   = os.path.join(out_dir, f"step8_wordmap_checked_{stamp}.csv")
    p_sent  = os.path.join(out_dir, f"step9_sentence_mapping_with_boundaries_{stamp}.csv")
    p_zip   = os.path.join(out_dir, f"step8_9_outputs_{stamp}.zip")
    csv_paths = (p_texts, p_map, p_sent)

    # CSVs first, then the bundle that contains them
    for frame, target in zip((df_texts_8, df_map_8, sent_df), csv_paths):
        frame.to_csv(target, index=False, encoding="utf-8")

    with zipfile.ZipFile(p_zip, "w", compression=zipfile.ZIP_DEFLATED) as bundle:
        for target in csv_paths:
            bundle.write(target, arcname=os.path.basename(target))

    produced = csv_paths + (p_zip,)
    print("Saved:")
    for target in produced:
        print("  ", target)

    if _COLAB:
        try:
            for target in produced:
                files.download(target)
        except Exception as e:
            print("Download hint:", e)

    return {
        "step8_texts_path": p_texts,
        "step8_map_path": p_map,
        "step9_sentences_path": p_sent,
        "zip_path": p_zip,
        "df_texts_8": df_texts_8,
        "df_map_8": df_map_8,
        "sent_df": sent_df,
    }


In [None]:
# ===============================================
# STEP 8 & 9 (Final build): correction → mapping → IDs
# Uses LLM for corrected text + narrative tags + dialogue,
# robust alignment, title exact-span marking, better beginnings,
# ellipsis + quotes, and clean downloads.
# ===============================================

import os, re, io, json, time, math, zipfile, logging, random
import numpy as np
import pandas as pd
from difflib import SequenceMatcher

# -----------------------
# Config
# -----------------------
# Default model name for the `model=` parameters in this cell.
MODEL_ID   = "gpt-4o"
# NOTE(review): presumably the completion-token cap for the LLM call; its
# use is not visible in this part of the file.
MAX_TOKENS = 1500
USE_MOCK   = False     # True = offline deterministic correction (no API)
# NOTE(review): presumably the default output folder; confirm against callers.
OUT_DIR    = "/content"

# Module logger; basicConfig sets INFO level with a "LEVEL: message" format.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# Colab download helper
try:
    from google.colab import files
    _COLAB = True
except Exception:
    _COLAB = False

# -----------------------
# Mojibake normalization
# -----------------------
_MOJIBAKE_FIXES = [
    (r"â€”", "—"), (r"â€“", "–"),
    (r"â€˜", "‘"), (r"â€™", "’"),
    (r"â€œ", "“"), (r"â€", "”"),
    (r"â€¦", "…"),
    (r"Â ", " "),
]

def normalize_mojibake(s: str) -> str:
    """Apply every entry of ``_MOJIBAKE_FIXES`` to *s*, in list order.

    ``None`` yields an empty string; any other input is converted with
    ``str()`` first. Each entry's pattern may be a regex string or an already
    compiled pattern object.
    """
    if s is None:
        return ""
    text = str(s)
    for pattern, replacement in _MOJIBAKE_FIXES:
        is_compiled = hasattr(pattern, "sub")
        text = (
            pattern.sub(replacement, text)
            if is_compiled
            else re.sub(pattern, replacement, text)
        )
    return text

# -----------------------
# Utility: extract first JSON object
# -----------------------
def _extract_first_json_object(txt: str):
    """Parse the JSON object that starts at the first ``{`` in *txt*.

    Text before the brace and anything after the object's closing brace
    (code fences, commentary) is ignored.

    Returns:
        The decoded object, or ``None`` when *txt* is empty, contains no
        ``{``, or the text starting at that brace is not valid JSON
        (for example a truncated reply).
    """
    if not txt:
        return None
    start = txt.find("{")
    if start < 0:
        return None
    try:
        # raw_decode parses exactly one JSON value beginning at `start` and
        # tolerates trailing data, so no manual brace matching is needed.
        obj, _end = json.JSONDecoder().raw_decode(txt, start)
    except (ValueError, RecursionError):
        return None
    return obj

# -----------------------
# 8A. LLM correction + tags
# -----------------------
def correct_with_tags(raw: str, client=None, model=MODEL_ID, use_mock=USE_MOCK):
    """Correct one text and collect narrative tags and dialogue spans.

    Args:
        raw: Source text; ``None``/empty is treated as "".
        client: Object exposing ``chat.completions.create(...)``. When it is
            ``None`` (or *use_mock* is true) no request is made.
        model: Model name sent with the request.
        use_mock: Force the offline deterministic correction.

    Returns:
        ``(corrected_text, narrative_tags, dialogue_spans, source)`` where
        *narrative_tags* is a list of ``{"type","text","start","end"}`` dicts,
        *dialogue_spans* a list of ``{"start","end"}`` dicts (both clipped to
        the corrected text) and *source* is one of ``"mock"``, the model name,
        ``"fallback"`` (reply had no usable JSON / corrected text) or
        ``"error"`` (the request or its handling raised).
    """
    s = normalize_mojibake(str(raw or ""))

    if use_mock or client is None:
        # Deterministic light-touch mock: capitalize first alpha, ensure end dot
        t = s.strip()
        m = re.search(r"[A-Za-z]", t)
        if m:
            i = m.start()
            t = t[:i] + t[i].upper() + t[i+1:]
        if t and not re.search(r"[.!?…]\s*$", t):
            t += "."
        return t, [], [], "mock"

    prompt = f"""
You are a meticulous copy-editor. Fix punctuation, grammar, and spelling.
Keep meaning and paragraphing. Use standard English punctuation.

Also identify narrative tags and dialogue:

- NarrativeTags:
  * Titles at the very start (one line/phrase): type="title"
  * Temporal transitions (e.g., "The next day", "2 weeks later", dates like "March 19th, 3202"): type="temporal"
  * Closures (e.g., "THE END", "To be continued", "***" as a scene break): type="closure"
  Each item: {{"type":"title|temporal|closure","text":"...","start":<char_index>,"end":<char_index>}}

- DialogueSpans:
  Contiguous regions of direct quoted speech (opening to closing quotes, include quotes).
  Return as array of objects: {{"start":<char_index>,"end":<char_index>}}

Return ONLY JSON:
{{
  "corrected_text": "...",
  "narrative_tags": [ ... ],
  "dialogue_spans": [ ... ]
}}

Text:
<<<BEGIN>>>
{s}
<<<END>>>
""".strip()

    def _span(item, n):
        """Return the item's (start, end) clipped to [0, n], or None if unusable."""
        try:
            start = max(0, int(item.get("start", 0)))
            end = max(start, int(item.get("end", 0)))
        except (AttributeError, TypeError, ValueError, OverflowError):
            # Not a dict, or indices that are missing / non-numeric.
            return None
        end = min(end, n)
        start = min(start, end)
        return (start, end) if end > start else None

    try:
        # The mock branch above already handled client=None, so the supplied
        # client is used directly.
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role":"user","content":prompt}],
            temperature=0.0,
            max_tokens=MAX_TOKENS
        )
        out = (resp.choices[0].message.content or "").strip()
        js = _extract_first_json_object(out)
        if not isinstance(js, dict):
            return s, [], [], "fallback"

        corrected_raw = js.get("corrected_text")
        if not isinstance(corrected_raw, str) or not corrected_raw.strip():
            # A reply without usable corrected text must not wipe the document.
            return s, [], [], "fallback"
        corrected = normalize_mojibake(corrected_raw).strip()

        tags = js.get("narrative_tags") or []
        spans = js.get("dialogue_spans") or []
        N = len(corrected)

        clean_tags = []
        for t in tags:
            bounds = _span(t, N)
            if bounds is None:
                continue
            tt = str(t.get("type", "")).lower()
            if tt in {"title", "temporal", "closure"}:
                st, en = bounds
                # "text" is re-derived from the corrected string so it always
                # matches the stored offsets.
                clean_tags.append({"type": tt, "text": corrected[st:en], "start": st, "end": en})

        clean_spans = []
        for d in spans:
            bounds = _span(d, N)
            if bounds is not None:
                clean_spans.append({"start": bounds[0], "end": bounds[1]})

        return corrected, clean_tags, clean_spans, model
    except Exception as e:
        # Best-effort boundary: any request/response failure degrades to the
        # normalized input instead of aborting the whole run.
        logger.warning("LLM correction failed, using mojibake-normalized text. %s", e)
        return s, [], [], "error"

def run_correct_only(
    df_in: pd.DataFrame,
    text_col="Raw text",
    id_col="ID",
    client=None,
    model=MODEL_ID,
    use_mock=USE_MOCK,
    out_col="Corrected text (8)"
) -> pd.DataFrame:
    """Run ``correct_with_tags`` on every row and attach the results.

    Works on a copy of *df_in*. Adds *out_col* (corrected text),
    ``NarrativeTagsJSON``, ``DialogueSpansJSON`` (JSON strings) and
    ``CorrectedBy``. *id_col* is created from the row position when absent,
    otherwise normalized to strings.

    Raises:
        KeyError: if *text_col* is not a column of *df_in*.
    """
    if text_col not in df_in.columns:
        raise KeyError(f"Missing required column: {text_col}")
    df = df_in.copy()

    # Normalize ID strings (avoid 123.0 drift)
    def _norm_id_series(s: pd.Series) -> pd.Series:
        s = s.astype(str).str.replace(r"\.0$", "", regex=True)

        def _fix(x):
            if any(c.isalpha() for c in x):  # alphanumeric → keep
                return x
            if "." in x or "e" in x.lower():
                try:
                    f = float(x)
                except ValueError:
                    return x
                if f.is_integer():
                    return str(int(f))
            return x

        return s.map(_fix)

    if id_col not in df.columns:
        df[id_col] = pd.RangeIndex(len(df)).astype(str)
    else:
        df[id_col] = _norm_id_series(df[id_col])

    # Missing texts must become "", not the literal string "nan"/"None" that
    # a plain astype(str) would produce (and that would then get "corrected").
    texts = df[text_col]
    texts = texts.astype(object).where(texts.notna(), "").astype(str)

    corrected, tags_json, dlg_json, sources = [], [], [], []
    for raw in texts.tolist():
        c, tags, spans, src = correct_with_tags(raw, client=client, model=model, use_mock=use_mock)
        corrected.append(c)
        tags_json.append(json.dumps(tags, ensure_ascii=False))
        dlg_json.append(json.dumps(spans, ensure_ascii=False))
        sources.append(src)

    df[out_col] = corrected
    df["NarrativeTagsJSON"] = tags_json
    df["DialogueSpansJSON"] = dlg_json
    df["CorrectedBy"] = sources
    return df

# -----------------------
# 8B. Tokenize + map (with merged-word split + canonical diff)
# -----------------------
# Matches one word character; _is_word searches tokens with it to separate
# word tokens from pure punctuation.
_WORD_RX = re.compile(r"\w", flags=re.UNICODE)

def _split_merged_word(tok: str):
    """Split a token once where an ALL-CAPS run meets lowercase text.

    ``"YAAYwe"`` becomes ``["YAAY", "we"]``. Tokens that are empty, not purely
    alphabetic, or do not start with at least two capitals followed by a
    lowercase letter are returned unchanged as a one-element list.
    """
    if tok and tok.isalpha():
        hit = re.match(r"^([A-Z]{2,})([a-z].*)$", tok)
        if hit is not None:
            return list(hit.groups())
    return [tok]

def _tokenize_with_split(s: str):
    """Tokenize *s* into word runs and single punctuation characters.

    Whitespace is dropped. Every word run is additionally passed through
    ``_split_merged_word``, so it may contribute more than one token.
    """
    pieces = []
    for chunk in re.findall(r"\w+|[^\w\s]", s or "", flags=re.UNICODE):
        if re.fullmatch(r"\w+", chunk) is None:
            pieces.append(chunk)
        else:
            pieces += _split_merged_word(chunk)
    return pieces

def _rebuild_offsets_with_splitting(text, tokens):
    """Return one ``(start, end)`` character span in *text* per token.

    Tokens are located left to right, each search starting where the previous
    token ended. Empty/``None`` tokens get a zero-width span at the cursor.
    A token that cannot be found gets a best-effort span: leading whitespace
    is skipped and the token's length is taken from there (capped at the end
    of *text*).
    """
    offsets = []
    cursor = 0
    text_len = len(text)
    for token in tokens:
        if token == "" or token is None:
            offsets.append((cursor, cursor))
            continue
        found = text.find(token, cursor)
        if found < 0:
            # best-effort slice (keeps map coherent if we introduced a split)
            begin = cursor
            while begin < text_len and text[begin].isspace():
                begin += 1
            finish = min(text_len, begin + len(token))
        else:
            begin, finish = found, found + len(token)
        offsets.append((begin, finish))
        cursor = finish
    return offsets

def _is_word(tok: str) -> bool:
    """Return True when *tok* is non-empty and contains a word character."""
    if not tok:
        return False
    return _WORD_RX.search(tok) is not None

def _canon(tok: str) -> str:
    """Canonical form used for diffing: upper-case with repeats collapsed.

    Any run of the same character is reduced to a single occurrence
    (``YAAY`` → ``YAY``). ``None`` maps to "".
    """
    if tok is None:
        return ""
    repeated_run = re.compile(r"(.)\1+")
    return repeated_run.sub(r"\1", str(tok).upper())

def build_word_map(raw_text, corr_text):
    """Align raw and corrected tokens and describe every token-level edit.

    Both texts are tokenized with ``_tokenize_with_split`` and diffed with
    ``SequenceMatcher`` on their ``_canon`` forms, so tokens that differ only
    by case or repeated letters still line up.

    Returns:
        A list of dicts, one per aligned/unaligned token, with keys
        ``raw_index, raw_token, raw_start, raw_end, corr_index, corr_token,
        corr_start, corr_end, op, equal_ci, error_type``. The side that does
        not exist for an insertion/deletion is filled with ``None``.

        * ``op == "equal"``   – tokens are textually identical.
        * ``op == "replace"`` – paired tokens differ; ``error_type`` is
          ``"Spelling"`` when both are alphabetic and share a canonical form
          (this includes case-only differences), else ``"Replacement"``.
          ``equal_ci`` tells whether the canonical forms match.
        * ``op == "delete"`` / ``"insert"`` – unpaired raw / corrected token,
          prefixed with ``Punctuation`` when the token has no word character.
    """
    raw_text  = str(raw_text or "")
    corr_text = str(corr_text or "")

    raw_tokens  = _tokenize_with_split(raw_text)
    corr_tokens = _tokenize_with_split(corr_text)

    raw_spans  = _rebuild_offsets_with_splitting(raw_text, raw_tokens)
    corr_spans = _rebuild_offsets_with_splitting(corr_text, corr_tokens)

    raw_canon  = [_canon(t) for t in raw_tokens]
    corr_canon = [_canon(t) for t in corr_tokens]

    def _row(ri, ci, op, equal_ci, error_type):
        """Build one output record; a None index blanks that side."""
        if ri is None:
            r_tok = r_start = r_end = None
        else:
            r_tok = raw_tokens[ri]
            r_start, r_end = raw_spans[ri]
        if ci is None:
            c_tok = c_start = c_end = None
        else:
            c_tok = corr_tokens[ci]
            c_start, c_end = corr_spans[ci]
        return {
            "raw_index": ri, "raw_token": r_tok, "raw_start": r_start, "raw_end": r_end,
            "corr_index": ci, "corr_token": c_tok, "corr_start": c_start, "corr_end": c_end,
            "op": op, "equal_ci": equal_ci, "error_type": error_type
        }

    def _paired(ri, ci):
        """Record for a raw token aligned with a corrected token."""
        r_tok, c_tok = raw_tokens[ri], corr_tokens[ci]
        if r_tok == c_tok:
            return _row(ri, ci, "equal", True, "Equal")
        # The diff treats canon-equal tokens as matching; they still differ
        # textually, so they must be reported as an edit, not as "equal".
        same_canon = raw_canon[ri] == corr_canon[ci]
        spelling = same_canon and str(r_tok).isalpha() and str(c_tok).isalpha()
        return _row(ri, ci, "replace", same_canon, "Spelling" if spelling else "Replacement")

    def _deleted(ri):
        kind = "Deletion" if _is_word(raw_tokens[ri]) else "PunctuationDeletion"
        return _row(ri, None, "delete", False, kind)

    def _inserted(ci):
        kind = "Insertion" if _is_word(corr_tokens[ci]) else "PunctuationInsertion"
        return _row(None, ci, "insert", False, kind)

    # Diff on canonical tokens to improve alignment (YAAY↔Yay, etc.)
    sm = SequenceMatcher(a=raw_canon, b=corr_canon, autojunk=False)

    rows = []
    for _tag, i1, i2, j1, j2 in sm.get_opcodes():
        # Every opcode is handled the same way: pair what can be paired
        # position by position, then emit the raw leftovers as deletions and
        # the corrected leftovers as insertions. For "delete"/"insert"
        # opcodes one side is empty, so nothing is paired.
        paired = min(i2 - i1, j2 - j1)
        rows.extend(_paired(i1 + k, j1 + k) for k in range(paired))
        rows.extend(_deleted(ri) for ri in range(i1 + paired, i2))
        rows.extend(_inserted(ci) for ci in range(j1 + paired, j2))
    return rows

def run_mapping_only(df_with_corr, id_col="ID",
                     raw_col="Raw text", corr_col="Corrected text (8)"):
    """Build the token-level word map for every row of *df_with_corr*.

    Returns:
        ``(map_df, texts_out)``: *map_df* has one row per ``build_word_map``
        record plus ``RowID`` (the row's ID as string), ``DocOrder`` (row
        position) and ``Changed`` (``op != "equal"``); a row whose texts
        produce no records gets a single ``op == "empty"`` placeholder.
        *texts_out* is a copy of the input with *id_col* as strings.

    Raises:
        KeyError: if *raw_col* or *corr_col* is missing.
    """
    need = {raw_col, corr_col}
    if not need.issubset(df_with_corr.columns):
        raise KeyError(f"Missing columns: {need - set(df_with_corr.columns)}")
    df = df_with_corr.copy()
    if id_col not in df.columns:
        df[id_col] = pd.RangeIndex(len(df)).astype(str)
    df[id_col] = df[id_col].astype(str)

    def _as_text(series: pd.Series) -> list:
        # Missing cells become "" rather than the literal "nan"/"None", so an
        # absent text yields the EmptyText placeholder instead of fake tokens.
        return series.astype(object).where(series.notna(), "").astype(str).tolist()

    placeholder = {
        "raw_index": np.nan, "raw_token": None, "raw_start": np.nan, "raw_end": np.nan,
        "corr_index": np.nan, "corr_token": None, "corr_start": np.nan, "corr_end": np.nan,
        "op": "empty", "equal_ci": False, "error_type": "EmptyText"
    }

    all_rows = []
    triples = zip(df[id_col].tolist(), _as_text(df[raw_col]), _as_text(df[corr_col]))
    for order, (rid, raw, cor) in enumerate(triples):
        rows = build_word_map(raw, cor) or [dict(placeholder)]
        for r in rows:
            rec = {"RowID": rid, "DocOrder": order, **r}
            rec["Changed"] = (r.get("op") != "equal")
            all_rows.append(rec)
    map_df = pd.DataFrame(all_rows)
    texts_out = df.copy()
    return map_df, texts_out

# -----------------------
# 8C. Sentence IDs (ellipsis + quote aware)
# -----------------------
# Lower-case abbreviations, each written with its trailing dot. _is_terminal_token
# treats a "." as non-terminal when the lower-cased previous token is in this set.
ABBREV = {
    "mr.","mrs.","ms.","dr.","prof.","sr.","jr.","st.","vs.","etc.",
    "e.g.","i.e.","cf.","fig.","ex.","no.","approx.","circa.","ca.",
    "dept.","est.","misc.","rev.","jan.","feb.","mar.","apr.","jun.",
    "jul.","aug.","sep.","sept.","oct.","nov.","dec."
}
# Token sets consulted by _likely_ascii_opening and the sentence-ID walk:
# sentence-ending marks, closing brackets/quotes, opening brackets/quotes.
TERMINALS = {".", "!", "?", "…", "...", "?!", "!?"}
CLOSERS   = {")", "]", "}", "”", "’", "»"}
OPENERS   = {"(", "[", "{", "“", "‘", "«"}
# Whole-token patterns used by _is_terminal_token:
RE_INITIAL       = re.compile(r"^[A-Z]\.$")             # single capital + dot, e.g. "J."
RE_INITIAL_PAIR  = re.compile(r"^[A-Z]\.[A-Z]\.$")      # two dotted capitals, e.g. "U.S."
RE_NUM_WITH_DOT  = re.compile(r"^\d+\.$")               # digits + dot, e.g. "12."
RE_SECTION_NUM   = re.compile(r"^\d+(?:\.\d+){1,3}$")   # dotted number, e.g. "1.2" or "1.2.3.4"
RE_DOT_TAIL      = re.compile(r"^\.\d+$")               # dot + digits, e.g. ".5"
RE_ELLIPSIS      = re.compile(r"^\.\.\.$")              # exactly three dots
RE_ALPHA_PAREN   = re.compile(r"^[A-Za-z]\)$")          # one letter + ")", e.g. "a)"

def _tok(x):
    """Return *x* as a string, mapping missing values (None/NaN/NA) to ""."""
    missing = pd.isna(x) or x is None
    return "" if missing else str(x)

def _is_ellipsis_triplet(i, toks):
    """Return True when ``toks[i]``, ``toks[i+1]`` and ``toks[i+2]`` are all "."."""
    if i + 2 >= len(toks):
        return False
    return all(toks[i + offset] == "." for offset in range(3))

def _is_terminal_token(tok: str, prev_tok: str, next_tok: str) -> bool:
    """Decide whether *tok* ends a sentence, given its neighbouring tokens.

    "…", "...", "?!", "!?", "!" and "?" always terminate. A lone "." terminates
    unless the previous token is a known abbreviation, an initial, a section
    number or a list number followed by text, or the next token is a decimal
    tail / plain digits. Every other token (including ")") is non-terminal.
    ``None`` is accepted for any argument and treated as "".
    """
    t = (tok or "").strip()
    if not t:
        return False
    if t in {"…", "...", "?!", "!?", "!", "?"} or RE_ELLIPSIS.fullmatch(t):
        return True
    if t != ".":
        # Nothing else ends a sentence. (The former ')' special case returned
        # False either way and raised TypeError for prev_tok=None.)
        return False

    p = (prev_tok or "").strip()
    n = (next_tok or "").strip()
    # NOTE(review): the checks on `p` below expect the previous token to carry
    # its own dot ("mr.", "J.", "12."). Tokens produced by _tokenize_with_split
    # appear to never contain a dot next to letters/digits, so these rules may
    # not fire for that input — confirm whether that is intended before
    # loosening them (entries such as "no." would also match ordinary words).
    if p.lower() in ABBREV:
        return False
    if RE_INITIAL.fullmatch(p) or RE_INITIAL_PAIR.fullmatch(p):
        return False
    if RE_SECTION_NUM.fullmatch(p):
        return False
    if RE_NUM_WITH_DOT.fullmatch(p) and (n and re.match(r"[A-Za-z(“\"'\[]", n)):
        return False
    if RE_DOT_TAIL.fullmatch(n):
        return False
    if n.isdigit():
        # "3" "." "14" – a dot between digits is a decimal point.
        return False
    return True

def _likely_ascii_opening(prev_tok: str, next_tok: str) -> bool:
    """Guess whether a straight quote between two tokens opens a quotation.

    It counts as opening when nothing, a terminal or an opener precedes it,
    or when it is followed by a token that is neither a terminal nor a closer.
    """
    before = (prev_tok or "").strip()
    after = (next_tok or "").strip()
    starts_fresh = before == "" or before in TERMINALS or before in OPENERS
    content_follows = bool(after) and after not in TERMINALS and after not in CLOSERS
    return starts_fresh or content_follows

def assign_corr_sentence_ids(df_map: pd.DataFrame) -> pd.DataFrame:
    """Number the sentences of each document and store them as ``CorrSentenceID``.

    Works on a copy. ``ID`` is taken from ``RowID`` (as string) when present,
    else kept, else derived from the index. ``corr_index`` is preserved in
    ``corr_index_orig`` the first time. Within each ID, rows are visited in
    ``corr_index`` order (rows without one come last, in their existing
    order) and tokens from ``corr_token`` (or ``raw_token`` if that column is
    missing) are walked:

    * a terminal token marks the sentence as "ending";
    * closers, closing straight quotes and further terminal punctuation
      (``...``, ``?!``) still belong to the ending sentence;
    * the next other token starts a new sentence.

    The result column has dtype ``Int64``; numbering starts at 0 per ID.
    """
    df = df_map.copy()
    if "RowID" in df.columns:
        df["ID"] = df["RowID"].astype(str)
    elif "ID" not in df.columns:
        df["ID"] = df.index.astype(str)

    has_ci = "corr_index" in df.columns
    if has_ci and "corr_index_orig" not in df.columns:
        df["corr_index_orig"] = df["corr_index"]

    tok_col = "corr_token" if "corr_token" in df.columns else "raw_token"
    all_tokens = df[tok_col].tolist()
    if has_ci:
        corr_pos = pd.to_numeric(df["corr_index"], errors="coerce").to_numpy(
            dtype=float, na_value=np.nan
        )

    # Punctuation that may pile up at a sentence end without opening a new one.
    terminal_marks = {".", "!", "?", "…"} | set(TERMINALS)

    def _walk(toks):
        """Return the sentence number of every token in *toks* (in order)."""
        sids = []
        sent_id = 0
        pending_end = False
        last = len(toks) - 1
        for i, raw_tok in enumerate(toks):
            tok = raw_tok.strip()
            prev_tok = toks[i - 1].strip() if i > 0 else ""
            next_tok = toks[i + 1].strip() if i < last else ""

            if _is_ellipsis_triplet(i, toks):
                pending_end = True
                sids.append(sent_id)
                continue

            if pending_end:
                closing_quote = tok == '"' and not _likely_ascii_opening(prev_tok, next_tok)
                if tok in CLOSERS or closing_quote or tok in terminal_marks:
                    # Still part of the sentence that is ending.
                    sids.append(sent_id)
                    continue
                # Anything else (opener, opening quote, word, ...) starts the
                # next sentence.
                sent_id += 1
                pending_end = False
                sids.append(sent_id)
                continue

            sids.append(sent_id)
            if _is_terminal_token(tok, prev_tok, next_tok):
                pending_end = True
        return sids

    # Explicit positional loop instead of groupby.apply: apply() returning a
    # Series collapses to a wide DataFrame when there is only one group, which
    # cannot be assigned to a single column.
    sent_ids = np.zeros(len(df), dtype="int64")
    for positions in df.groupby("ID", sort=False).indices.values():
        positions = np.sort(np.asarray(positions))
        if has_ci:
            keys = corr_pos[positions]
            keys = np.where(np.isnan(keys), np.inf, keys)  # no corr_index → last
            positions = positions[np.argsort(keys, kind="stable")]
        toks = [_tok(all_tokens[p]) for p in positions]
        sent_ids[positions] = _walk(toks)

    df["CorrSentenceID"] = pd.array(sent_ids, dtype="Int64")
    return df

# -----------------------
# 8D. Mark Title/Dialogue by exact LLM spans
# -----------------------
def _overlap(a0,a1,b0,b1):
    """Return True when spans [a0, a1) and [b0, b1) share at least one position."""
    shared = min(a1, b1) - max(a0, b0)
    return max(0, shared) > 0

def mark_title_and_dialogue(df_map: pd.DataFrame, df_texts: pd.DataFrame) -> pd.DataFrame:
    """Flag word-map rows covered by title tags or dialogue spans.

    For every ID, the spans stored in ``df_texts["NarrativeTagsJSON"]`` (only
    items of type ``"title"``) and ``df_texts["DialogueSpansJSON"]`` are
    compared with each row's ``corr_start``/``corr_end``; overlapping rows get
    ``TITLE`` / ``DIALOGUE`` set to True, and title rows get
    ``"Sentence Boundaries" = "Title"``.

    Returns a new frame, ordered by ID, then ``CorrSentenceID`` and
    ``corr_index``, with a fresh 0..n-1 index and a ``SentenceRef`` column
    (``"<ID>_s<3-digit sentence id>"``).
    """
    df = df_map.copy()

    if "Sentence Boundaries" not in df.columns:
        df["Sentence Boundaries"] = ""
    df["TITLE"] = False
    df["DIALOGUE"] = False
    if "ID" not in df.columns:
        df["ID"] = df["RowID"].astype(str)

    def _load_items(cell):
        """Decode one JSON cell into a list of dicts ([] when unusable)."""
        if isinstance(cell, str):
            text = cell.strip()
            cell = json.loads(text) if text.startswith("[") else []
        if not isinstance(cell, (list, tuple)):
            return []  # NaN / None / unexpected scalar
        return [item for item in cell if isinstance(item, dict)]

    def _bounds(item):
        """Return (start, end) of a span item, or None when not numeric."""
        try:
            return int(item["start"]), int(item["end"])
        except (KeyError, TypeError, ValueError, OverflowError):
            return None

    # Fast lookup per ID
    text_ids = df_texts["ID"].astype(str).tolist()

    def _by_id(column):
        if column not in df_texts.columns:
            return {}
        return {i: _load_items(v) for i, v in zip(text_ids, df_texts[column].tolist())}

    tags_by_id = _by_id("NarrativeTagsJSON")
    dlg_by_id = _by_id("DialogueSpansJSON")

    def _numeric(frame, column):
        if column not in frame.columns:
            return np.zeros(len(frame), dtype=float)
        return pd.to_numeric(frame[column], errors="coerce").to_numpy(dtype=float, na_value=np.nan)

    pieces = []
    # Explicit loop (instead of groupby.apply) so the ID column always stays
    # in the group frames regardless of pandas version.
    for doc_id, g in df.groupby("ID", sort=True):
        g = g.sort_values(["CorrSentenceID", "corr_index"], kind="mergesort").copy()
        starts = _numeric(g, "corr_start")
        ends = _numeric(g, "corr_end")

        def _covered(span):
            # Positive overlap length; rows without offsets (NaN) never match.
            st, en = span
            return (np.minimum(ends, en) - np.maximum(starts, st)) > 0

        # Mark titles using exact corrected spans
        for tag in tags_by_id.get(str(doc_id), []):
            span = _bounds(tag) if tag.get("type") == "title" else None
            if span is not None:
                g.loc[_covered(span), "TITLE"] = True

        # Mark dialogue tokens by span overlap
        for item in dlg_by_id.get(str(doc_id), []):
            span = _bounds(item)
            if span is not None:
                g.loc[_covered(span), "DIALOGUE"] = True

        # Sentence IDs are not moved; title rows are only labelled.
        g.loc[g["TITLE"].astype(bool), "Sentence Boundaries"] = "Title"
        pieces.append(g)

    df = pd.concat(pieces, ignore_index=True) if pieces else df.reset_index(drop=True)

    # SentenceRef (keeps numbering; sentence 0 exists even without a title)
    def _sid3(x):
        try:
            return f"{int(x):03d}"
        except (TypeError, ValueError):
            return "000"
    df["SentenceRef"] = df["ID"].astype(str) + "_s" + df["CorrSentenceID"].map(_sid3)
    return df

# -----------------------
# 8E. Boundary flags (skip titles), capitalization check only
# -----------------------
# Tokens that count as a sentence's closing punctuation when locating the
# "Sentence Ending" row.
TERMINALS_HARD = {".","!","?","…","...","?!","!?"}
# Opening quotes/brackets skipped when locating a sentence's first content token.
OPENING_PUNCT  = {'"', "“", "‘", "«", "(", "[", "{"}

def _first_alpha_capitalized(s: str):
    """Report whether the first letter of *s* is upper-case.

    Leading non-letters (quotes, digits, punctuation) are skipped. Letters are
    detected with ``str.isalpha`` so accented and other non-ASCII letters
    count as well.

    Returns:
        True/False for the first letter found, or ``None`` when *s* is not a
        string or contains no letter.
    """
    if not isinstance(s, str):
        return None
    for ch in s:
        if ch.isalpha():
            return ch.isupper()
    return None

def add_sentence_boundary_flags(df_map: pd.DataFrame) -> pd.DataFrame:
    """Label each sentence's first content token and last terminal token.

    Works on a copy sorted by ``ID``, ``CorrSentenceID`` and ``corr_index``
    (rows without ``corr_index`` last). For every (ID, sentence) group:

    * groups consisting only of ``TITLE`` rows are labelled ``"Title"``;
    * the first non-title token containing a word character gets
      ``"Sentence Beginning"`` plus a ``BoundaryCheck`` of
      ``Correct/Incorrect/Unknown Beginning`` (raw vs. corrected
      capitalization of the first letter);
    * the last non-title token in ``TERMINALS_HARD`` gets
      ``"Sentence Ending"`` plus ``Correct/Incorrect Ending`` (raw token equal
      to corrected token or not).

    Labels are appended to existing cell text with ``" | "``. ``SentenceRef``
    is (re)computed. Expects columns ``ID``, ``CorrSentenceID``,
    ``corr_token``, ``raw_token`` and ``TITLE``, and a unique index.
    """
    df = df_map.copy()
    for col in ("Sentence Boundaries", "BoundaryCheck"):
        if col not in df.columns:
            df[col] = ""

    if "corr_index" not in df.columns:
        df["corr_index"] = np.nan
    df["_rowpos"] = np.arange(len(df))
    df["_sort_corr"] = pd.to_numeric(df["corr_index"], errors="coerce").fillna(1e12) + (df["_rowpos"]*1e-9)
    df = df.sort_values(["ID","CorrSentenceID","_sort_corr"], kind="mergesort")

    def _text(value) -> str:
        """Cell value as text; None/NaN/NA become "" (not "None"/"nan")."""
        if value is None or value is pd.NA:
            return ""
        if isinstance(value, float) and value != value:
            return ""
        return str(value)

    def _first_content_row(g: pd.DataFrame):
        for idx, tok, title in zip(g.index, g["corr_token"].map(_text), g["TITLE"]):
            if title or tok in OPENING_PUNCT:  # skip title tokens and openers
                continue
            if re.search(r"\w", tok):
                return idx
        return None

    def _last_terminal_row(g: pd.DataFrame):
        toks = g["corr_token"].map(_text).tolist()
        candidates = list(zip(g.index, toks, g["TITLE"].tolist()))
        for idx, tok, title in reversed(candidates):
            if not title and tok in TERMINALS_HARD:
                return idx
        return None

    def _append(idx, column, label):
        prev = _text(df.at[idx, column])
        df.at[idx, column] = prev + (" | " if prev else "") + label

    def _is_title(idx) -> bool:
        return _text(df.at[idx, "Sentence Boundaries"]).strip() == "Title"

    for _key, g in df.groupby(["ID","CorrSentenceID"], sort=False):
        if g["TITLE"].all():
            df.loc[g.index, "Sentence Boundaries"] = "Title"
            continue

        g = g.sort_values("_sort_corr", kind="mergesort")
        b = _first_content_row(g)
        e = _last_terminal_row(g)

        if b is not None and not _is_title(b):
            _append(b, "Sentence Boundaries", "Sentence Beginning")
            # Capitalization-only correctness:
            rcap = _first_alpha_capitalized(_text(df.at[b, "raw_token"]))
            ccap = _first_alpha_capitalized(_text(df.at[b, "corr_token"]))
            if rcap is None or ccap is None:
                tag = "Unknown Beginning"
            else:
                tag = "Correct Beginning" if (rcap == ccap) else "Incorrect Beginning"
            _append(b, "BoundaryCheck", tag)

        if e is not None and not _is_title(e):
            _append(e, "Sentence Boundaries", "Sentence Ending")
            # Ending correctness: exact token match only (spelling fixes shouldn’t matter here)
            raw_end_tok = _text(df.at[e, "raw_token"])
            corr_end_tok = _text(df.at[e, "corr_token"])
            tag = "Correct Ending" if raw_end_tok == corr_end_tok else "Incorrect Ending"
            _append(e, "BoundaryCheck", tag)

    def _sid3(x):
        try:
            return f"{int(x):03d}"
        except (TypeError, ValueError):
            return "000"
    df["SentenceRef"] = df["ID"].astype(str) + "_s" + df["CorrSentenceID"].map(_sid3)

    df.drop(columns=["_rowpos","_sort_corr"], inplace=True, errors="ignore")
    return df

# -----------------------
# 9. Sentences table (exclude titles) + carry tags/dialogue
# -----------------------
# Tokens that attach to the preceding text without a space.
NO_SPACE_BEFORE = set(".,;:!?)]}\"'»”’…")
# Tokens after which the following token attaches without a space.
NO_SPACE_AFTER  = set("([{\"'«“‘")

def _detok(tokens):
    """Join tokens back into readable text.

    ``None``/NaN tokens are skipped. Tokens in ``NO_SPACE_BEFORE`` (and
    "...") attach to the text before them; the token following an opening
    bracket or opening curly quote attaches to that opener. Straight quotes
    are ambiguous, so they only act as an opener when they are the very first
    token. Spaced ellipses (". . .") are unified to "...".
    """
    pieces = []
    glue_next = False  # True when the previous token was an opener
    for t in tokens:
        if t is None or (isinstance(t, float) and math.isnan(t)):
            continue
        t = str(t)
        if not pieces:
            pieces.append(t)
            glue_next = t in NO_SPACE_AFTER
            continue
        if t in NO_SPACE_BEFORE or t == "..." or glue_next:
            pieces.append(t)
        else:
            pieces.append(" " + t)
        # Decide from the token itself, not from the already-joined text
        # (which carries a leading space and would never match an opener).
        glue_next = t in NO_SPACE_AFTER and t not in NO_SPACE_BEFORE
    s = "".join(pieces)
    s = re.sub(r"\.\s*\.\s*\.", "...", s)
    return s.strip()

def _span_coverage(rows):
    """Return the mean of *rows* as a float (0.0 for an empty sequence).

    With boolean flags this is the fraction of tokens that are flagged.
    """
    return float(np.mean(rows)) if len(rows) != 0 else 0.0

def _summarize_sentence(g: pd.DataFrame, tags_row: pd.Series) -> pd.Series:
    """Collapse the word-map rows of one sentence into a single summary record.

    *g* holds the rows of one (ID, sentence) group; *tags_row* supplies the
    document's ``NarrativeTagsJSON`` / ``DialogueSpansJSON`` (defaulting to
    "[]"). The record contains the detokenized corrected and raw sentence,
    edit counts by ``op``, boundary rows and their correctness (1 / 0 / NaN),
    punctuation indicators and the dialogue/title token fractions.
    """
    corr_tokens = g["corr_token"].tolist()
    if "raw_token" in g.columns:
        raw_tokens = [tok for tok in g["raw_token"].tolist() if not pd.isna(tok)]
    else:
        raw_tokens = []

    boundaries = g["Sentence Boundaries"]
    b_rows = g[boundaries.str.contains("Sentence Beginning", na=False)]
    e_rows = g[boundaries.str.contains("Sentence Ending",   na=False)]

    def _verdict(rows, good, bad):
        # 1 / 0 according to the BoundaryCheck labels, NaN when undecidable.
        if rows.empty:
            return np.nan
        chk = " | ".join(rows["BoundaryCheck"].dropna().astype(str))
        if good in chk:
            return 1
        if bad in chk:
            return 0
        return np.nan

    ops = g["op"] if "op" in g.columns else pd.Series([], dtype=object)
    have_ops = not ops.empty

    def _count(selected):
        return int(selected.sum()) if have_ops else np.nan

    def _first_index(rows):
        return np.nan if rows.empty else rows.index[0]

    rec = {
        "SentenceRef": g["SentenceRef"].iloc[0],
        "CorrectedSentence": _detok(corr_tokens),
        "RawSentence": _detok(raw_tokens) if raw_tokens else "",
        "TokensInSentence": int(len(g)),
        "EditsInSentence": _count(ops != "equal"),
        "EqualsInSentence": _count(ops == "equal"),
        "Insertions": _count(ops == "insert"),
        "Deletions": _count(ops == "delete"),
        "Replacements": _count(ops == "replace"),
        "BeginBoundaryRow": _first_index(b_rows),
        "EndBoundaryRow":   _first_index(e_rows),
        "CorrectBeginning": _verdict(b_rows, "Correct Beginning", "Incorrect Beginning"),
        "CorrectEnding":    _verdict(e_rows, "Correct Ending", "Incorrect Ending"),
        "HasHardTerminal":  any(t in TERMINALS_HARD for t in corr_tokens),
        "HasOpeningQuote":  any(t in {'"', "“", "‘", "«"} for t in corr_tokens),
        "HasClosingQuote":  any(t in {'"', "”", "’", "»"} for t in corr_tokens),
        # Dialogue/Title coverage by token count
        "DialogueTokenFrac": _span_coverage(g["DIALOGUE"].astype(bool).tolist()),
        "TitleTokenFrac": _span_coverage(g["TITLE"].astype(bool).tolist()),
    }

    # carry JSON for this doc (same for all sentences of the doc)
    for column in ("NarrativeTagsJSON", "DialogueSpansJSON"):
        rec[column] = tags_row.get(column, "[]")

    return pd.Series(rec)

def run_step8(df_preprocessed: pd.DataFrame,
              raw_col="Raw text",
              id_col="ID",
              client=None,
              model=MODEL_ID,
              use_mock=USE_MOCK):
    """Run the full Step 8 chain and return ``(df_texts, df_map)``.

    Order: correction → word mapping → sentence IDs → title/dialogue marks →
    boundary flags. The client is dropped when *use_mock* is true.
    """
    corrected_col = "Corrected text (8)"
    effective_client = None if use_mock else client

    corrected = run_correct_only(
        df_preprocessed,
        text_col=raw_col,
        id_col=id_col,
        client=effective_client,
        model=model,
        use_mock=use_mock,
        out_col=corrected_col,
    )
    word_map, texts = run_mapping_only(
        corrected, id_col=id_col, raw_col=raw_col, corr_col=corrected_col
    )
    word_map = assign_corr_sentence_ids(word_map)
    word_map = mark_title_and_dialogue(word_map, texts)
    word_map = add_sentence_boundary_flags(word_map)
    return texts, word_map

def run_step9(df_map: pd.DataFrame, df_texts_with_tags: pd.DataFrame) -> pd.DataFrame:
    """Build the sentence-level table from the word map (title rows excluded).

    Each (ID, CorrSentenceID) group of non-title rows is summarized with
    ``_summarize_sentence``; the document's tag JSON columns are carried
    along when available (first row wins for duplicate IDs). The result is
    sorted by ``SentenceRef``; when there is nothing to summarize an empty
    frame with a ``SentenceRef`` column is returned.

    Raises:
        KeyError: if *df_map* lacks a column Step 9 needs.
    """
    need = {"ID","CorrSentenceID","corr_token","Sentence Boundaries",
            "BoundaryCheck","SentenceRef","TITLE","DIALOGUE"}
    missing = need - set(df_map.columns)
    if missing:
        raise KeyError(f"df_map missing columns needed for Step 9: {missing}")

    sort_cols = ["ID","CorrSentenceID"]
    if "corr_index" in df_map.columns:
        sort_cols.append("corr_index")

    wm = df_map.sort_values(sort_cols, kind="mergesort")
    # Exclude title tokens from aggregation
    wm_nontitle = wm[~wm["TITLE"].astype(bool)]

    # Per-ID tag rows, keyed by the ID as string so they match df_map's IDs.
    tag_cols = [c for c in ("NarrativeTagsJSON", "DialogueSpansJSON")
                if c in df_texts_with_tags.columns]
    tags_lookup = {}
    if tag_cols and "ID" in df_texts_with_tags.columns:
        firsts = df_texts_with_tags.drop_duplicates("ID")
        for key, (_, row) in zip(firsts["ID"].astype(str), firsts[tag_cols].iterrows()):
            tags_lookup[key] = row
    no_tags = pd.Series({}, dtype=object)  # _summarize_sentence defaults to "[]"

    out = [
        _summarize_sentence(g, tags_lookup.get(str(doc_id), no_tags))
        for (doc_id, _sid), g in wm_nontitle.groupby(["ID","CorrSentenceID"], sort=False)
    ]
    if not out:
        # Nothing to sort: sort_values on a column-less frame would raise.
        return pd.DataFrame(columns=["SentenceRef"])

    sent_df = pd.DataFrame(out).sort_values(["SentenceRef"], kind="mergesort").reset_index(drop=True)
    return sent_df

# -----------------------
# Save + Download wrapper
# -----------------------
def _ensure_dir(path):
    """Create directory *path* (including parents) if needed and return it."""
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)
    return path

def save_and_download_step8_9(
    df_preprocessed: pd.DataFrame,
    *,
    raw_col="Raw text",
    id_col="ID",
    client=None,              # OpenAI client from your Step 6
    model=MODEL_ID,
    use_mock=USE_MOCK,        # True for offline tests
    out_dir=OUT_DIR
):
    """Run Step 8 and Step 9, write their outputs as timestamped CSVs plus a zip.

    The three CSVs (texts, word map, sentence mapping) and a zip bundling them
    are written into ``out_dir``. When ``_COLAB`` is truthy a browser download
    is attempted for each file; a failure there is printed, not raised.

    Returns:
        dict with the four output paths and the three DataFrames.
    """
    ts = time.strftime("%Y%m%d_%H%M%S")
    _ensure_dir(out_dir)

    # Step 8 (the client is withheld in mock mode)
    api_client = None if use_mock else client
    df_texts_8, df_map_8 = run_step8(
        df_preprocessed,
        raw_col=raw_col,
        id_col=id_col,
        client=api_client,
        model=model,
        use_mock=use_mock
    )

    # Step 9
    sent_df = run_step9(df_map_8, df_texts_8)

    # Paths: every artefact of one run shares the same timestamp suffix
    p_texts = os.path.join(out_dir, f"step8_texts_{ts}.csv")
    p_map   = os.path.join(out_dir, f"step8_wordmap_checked_{ts}.csv")
    p_sent  = os.path.join(out_dir, f"step9_sentence_mapping_with_boundaries_{ts}.csv")
    p_zip   = os.path.join(out_dir, f"step8_9_outputs_{ts}.zip")

    csv_targets = ((df_texts_8, p_texts), (df_map_8, p_map), (sent_df, p_sent))
    all_paths = (p_texts, p_map, p_sent, p_zip)

    # Save
    for frame, dest in csv_targets:
        frame.to_csv(dest, index=False, encoding="utf-8")

    # Zip bundle (entries are stored under their bare file names)
    with zipfile.ZipFile(p_zip, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for _frame, dest in csv_targets:
            zf.write(dest, arcname=os.path.basename(dest))

    print("Saved:")
    for dest in all_paths:
        print("  ", dest)

    if _COLAB:
        try:
            for dest in all_paths:
                files.download(dest)
        except Exception as e:
            # best effort: the files are already on disk
            print("Download hint:", e)

    return {
        "step8_texts_path": p_texts,
        "step8_map_path": p_map,
        "step9_sentences_path": p_sent,
        "zip_path": p_zip,
        "df_texts_8": df_texts_8,
        "df_map_8": df_map_8,
        "sent_df": sent_df,
    }

# ============== RUN IT ==============
# Ensure df_preprocessed (with "ID" and "Raw text") exists and `client` (OpenAI v1) is ready
# results = save_and_download_step8_9(
#     df_preprocessed,
#     raw_col="Raw text",
#     id_col="ID",
#     client=client,
#     model="gpt-4o",
#     use_mock=False,     # set True to test without API
#     out_dir="/content"
# )
# results["step8_map_path"], results["step9_sentences_path"], results["zip_path"]


In [None]:
results = save_and_download_step8_9(
    df_preprocessed,
    raw_col="Raw text",
    id_col="ID",
    client=client,           # from your Step 6
    model="gpt-4o",
    use_mock=False,
    out_dir="/content"       # change to your Drive folder if you want
)


  df["_sort_key"] = df.groupby("ID", group_keys=False).apply(_stable_sort_key)
  .apply(_assign)
  df = df.groupby("ID", group_keys=False).apply(per_id).reset_index(drop=True)


Saved:
   /content/step8_texts_20251025_010939.csv
   /content/step8_wordmap_checked_20251025_010939.csv
   /content/step9_sentence_mapping_with_boundaries_20251025_010939.csv
   /content/step8_9_outputs_20251025_010939.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
results = save_and_download_step8_9(
    df_preprocessed,
    raw_col="Raw text",
    id_col="ID",        # or whatever your ID is; code will synthesize one if missing
    client=client,      # from your Step 6 verify
    model="gpt-4o",
    out_dir="/content"
)

results["step8_map_path"], results["step9_sentences_path"]


  df["_sort_key"] = df.groupby("ID", group_keys=False).apply(_stable_sort_key)
  .apply(_assign)
  df = df.groupby("ID", group_keys=False).apply(per_id).reset_index(drop=True)


Saved:
   /content/step8_texts_20251025_011131.csv
   /content/step8_wordmap_checked_20251025_011131.csv
   /content/step9_sentence_mapping_with_boundaries_20251025_011131.csv
   /content/step8_9_outputs_20251025_011131.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

('/content/step8_wordmap_checked_20251025_011131.csv',
 '/content/step9_sentence_mapping_with_boundaries_20251025_011131.csv')

# Below is archived - please ignore


The cells below contain all the code that defines the functions.

## 8.1 Imports and loggers

In [None]:
 ===============================================
# Step 8A — Corrector (API or mock)
 ===============================================
import json
import re

MODEL_ID = "gpt-4o"
MAX_TOKENS = 1200
USE_MOCK = False  # set True to test without API

def correct_text(raw: str, client=None, model=MODEL_ID, use_mock=USE_MOCK):
    """Return ``(corrected_text, source)`` for one piece of raw text.

    Mock path (``use_mock`` true or ``client`` is None): strips the text,
    upper-cases the first ASCII letter and appends "." when the text does not
    already end in terminal punctuation; ``source`` is ``"mock"``.

    API path: sends a copy-editing prompt through
    ``client.chat.completions.create`` and returns the ``corrected_text`` field
    of the first JSON object found in the reply, or the raw reply when no such
    object can be parsed; ``source`` is ``model``.
    """
    source_text = str(raw or "")

    if use_mock or client is None:
        mocked = source_text.strip()
        first_letter = re.search(r"[A-Za-z]", mocked)
        if first_letter:
            pos = first_letter.start()
            mocked = mocked[:pos] + mocked[pos].upper() + mocked[pos+1:]
        if mocked and not re.search(r"[.!?…]\s*$", mocked):
            mocked += "."
        return mocked, "mock"

    # NOTE(review): the sentence "If there is a title, or ending But these in
    # title case." reads as garbled; it is runtime prompt text, so it is kept
    # verbatim here. Confirm the intended wording before changing it.
    prompt = f"""
You are a meticulous copy-editor. Fix punctuation, grammar, and spelling in the text.
Do not change meaning, voice, or level of formality. Keep paragraphing and spacing sane.
If there is a title, or ending But these in title case.
Return JSON only: {{"corrected_text":"..."}}

Text:
<<<BEGIN>>>
{source_text}
<<<END>>>
""".strip()

    resp = client.chat.completions.create(
        model=model,
        messages=[{"role":"user","content":prompt}],
        temperature=0.0,
        max_tokens=MAX_TOKENS
    )
    out = (resp.choices[0].message.content or "").strip()

    def _first_obj(txt):
        """Parse the first brace-balanced ``{...}`` span of *txt*; None on failure."""
        start = txt.find("{")
        if start < 0:
            return None
        depth = 0
        inside_string = False
        escaped = False
        for end, ch in enumerate(txt[start:], start):
            if inside_string:
                # braces inside JSON strings must not affect nesting depth
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == '"':
                    inside_string = False
                continue
            if ch == '"':
                inside_string = True
            elif ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    try:
                        return json.loads(txt[start:end+1])
                    except Exception:
                        return None
        return None

    # extract {"corrected_text": "..."} if present
    parsed = _first_obj(out)
    if isinstance(parsed, dict) and "corrected_text" in parsed:
        return str(parsed["corrected_text"]).strip(), model

    # fallback to plain text
    return out, model




SyntaxError: invalid syntax (ipython-input-2243263149.py, line 1)

In [None]:
# ===============================================
# Step 8B — Define run_correct_only used by 8C
# ===============================================
from tqdm import tqdm
import pandas as pd

def run_correct_only(
    df_in: pd.DataFrame,
    text_col: str = "Raw text",
    id_col: str = "ID",
    client=None,
    model: str = MODEL_ID,
    use_mock: bool = USE_MOCK,
    out_col: str = "Corrected text (8)"
) -> pd.DataFrame:
    """
    Copy df_in and add out_col holding correct_text(...) applied to each row of text_col.

    The ID column is created from the row position when absent; otherwise it is
    normalised with _normalize_id_series when that helper is defined, falling
    back to a plain string cast.

    Raises KeyError when text_col is missing.
    """
    if text_col not in df_in.columns:
        raise KeyError(f"Missing required column: {text_col}")

    df_out = df_in.copy()

    # Ensure ID column exists and is string-normalised
    if id_col in df_out.columns:
        try:
            # reuse the normaliser from Step 2A if present
            df_out[id_col] = _normalize_id_series(df_out[id_col])
        except NameError:
            df_out[id_col] = df_out[id_col].astype(str)
    else:
        df_out[id_col] = pd.RangeIndex(len(df_out)).astype(str)

    raw_texts = df_out[text_col].astype(str).tolist()
    # correct_text returns (text, source); only the text is kept
    df_out[out_col] = [
        correct_text(raw, client=client, model=model, use_mock=use_mock)[0]
        for raw in tqdm(raw_texts, desc="Correcting", total=len(df_out))
    ]
    return df_out


In [None]:
# ===============================================
# Patch: minimal build_word_map if missing
# Produces rows with raw_index, raw_token, raw_start, raw_end,
#         corr_index, corr_token, corr_start, corr_end,
#         op, equal_ci, error_type
# ===============================================
# --- PATCH: safe build_word_map (no 're' shadowing) ---
import re
from difflib import SequenceMatcher

_WORD_RX = re.compile(r"\w", flags=re.UNICODE)

def _simple_tokenize(s):
    # keep quotes and punctuation as separate tokens
    return re.findall(r"\w+|[^\w\s]", s or "", flags=re.UNICODE)

def _rebuild_offsets(text, tokens):
    spans = []
    i = 0
    for tok in tokens:
        start = text.find(tok, i)
        if start < 0:
            start = i
        end = start + len(tok)
        spans.append((start, end))
        i = end
    return spans

def _is_word(tok: str) -> bool:
    """True when *tok* is non-empty and contains at least one ``\\w`` character."""
    if not tok:
        return False
    return _WORD_RX.search(tok) is not None

def build_word_map(raw_text, corr_text, use_unmerge=True):
    """Align raw and corrected tokens and describe every edit as a row dict.

    Tokens are compared case-insensitively with difflib.SequenceMatcher. Each
    row carries raw_*/corr_* index, token and character span (None on the side
    that has no token), the opcode in ``op``, ``equal_ci`` and ``error_type``.
    In "equal" rows ``equal_ci`` is exact string equality; in "replace" rows it
    is lower-cased equality.

    ``use_unmerge`` is accepted but not used by this implementation.

    Returns:
        (rows, raw_text or "")
    """
    raw_src = raw_text or ""
    corr_src = corr_text or ""

    raw_tokens = _simple_tokenize(raw_src)
    corr_tokens = _simple_tokenize(corr_src)
    raw_spans = _rebuild_offsets(raw_src, raw_tokens)
    corr_spans = _rebuild_offsets(corr_src, corr_tokens)

    matcher = SequenceMatcher(a=[t.lower() for t in raw_tokens],
                              b=[t.lower() for t in corr_tokens],
                              autojunk=False)

    def _paired(ri, ci, op, equal_ci, error_type):
        # row that has a token on both sides
        r_start, r_end = raw_spans[ri]
        c_start, c_end = corr_spans[ci]
        return {
            "raw_index": ri, "raw_token": raw_tokens[ri], "raw_start": r_start, "raw_end": r_end,
            "corr_index": ci, "corr_token": corr_tokens[ci], "corr_start": c_start, "corr_end": c_end,
            "op": op,
            "equal_ci": equal_ci,
            "error_type": error_type
        }

    def _deleted(ri):
        # raw token with no corrected counterpart
        r_tok = raw_tokens[ri]
        r_start, r_end = raw_spans[ri]
        return {
            "raw_index": ri, "raw_token": r_tok, "raw_start": r_start, "raw_end": r_end,
            "corr_index": None, "corr_token": None, "corr_start": None, "corr_end": None,
            "op": "delete",
            "equal_ci": False,
            "error_type": "PunctuationDeletion" if not _is_word(r_tok) else "Deletion"
        }

    def _inserted(ci):
        # corrected token with no raw counterpart
        c_tok = corr_tokens[ci]
        c_start, c_end = corr_spans[ci]
        return {
            "raw_index": None, "raw_token": None, "raw_start": None, "raw_end": None,
            "corr_index": ci, "corr_token": c_tok, "corr_start": c_start, "corr_end": c_end,
            "op": "insert",
            "equal_ci": False,
            "error_type": "PunctuationInsertion" if not _is_word(c_tok) else "Insertion"
        }

    rows = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            for ri, ci in zip(range(i1, i2), range(j1, j2)):
                rows.append(_paired(ri, ci, "equal", raw_tokens[ri] == corr_tokens[ci], "Equal"))

        elif tag == "replace":
            # pair as many as possible, then spill the remainder
            paired = min(i2 - i1, j2 - j1)
            for k in range(paired):
                ri, ci = i1 + k, j1 + k
                r_tok, c_tok = raw_tokens[ri], corr_tokens[ci]
                same_ci = r_tok.lower() == c_tok.lower()
                both_alpha = r_tok.isalpha() and c_tok.isalpha()
                err = "Spelling" if (not same_ci and both_alpha) else "Replacement"
                rows.append(_paired(ri, ci, "replace", same_ci, err))
            # extra raw tokens become deletions, extra corrected tokens insertions
            rows.extend(_deleted(ri) for ri in range(i1 + paired, i2))
            rows.extend(_inserted(ci) for ci in range(j1 + paired, j2))

        elif tag == "delete":
            rows.extend(_deleted(ri) for ri in range(i1, i2))

        elif tag == "insert":
            rows.extend(_inserted(ci) for ci in range(j1, j2))

    # For Step 8C, return rows and an "unmerged" echo of raw text
    return rows, raw_src



In [None]:
# ===============================================
# Step 8D — Run correction and mapping, then save
# ===============================================
def run_mapping_only(df_with_corr,
                     id_col="ID",
                     raw_col="Raw text",
                     corr_col="Corrected text (8)",
                     add_unmerged_to_texts=True):
    """
    Build the per-token map between each row's raw and corrected text.

    Every input row contributes at least one map record: a row for which
    build_word_map returns nothing gets a single op="empty" placeholder, so
    its RowID still appears in the map.

    Returns (map_df, df_out) where df_out is a copy of df_with_corr, with a
    "Raw text (unmerged)" column when add_unmerged_to_texts is true.
    Raises KeyError when raw_col or corr_col is missing.
    """
    import numpy as np
    import pandas as pd
    from tqdm import tqdm

    # --- checks ---
    absent = {raw_col, corr_col} - set(df_with_corr.columns)
    if absent:
        raise KeyError(f"Need both raw and corrected columns present: missing {absent}")

    # --- safe IDs as strings (integral floats lose their '.0') ---
    def _collapse_integral(x):
        if any(c.isalpha() for c in x):  # alphanumeric: keep as-is
            return x
        if "." not in x and "e" not in x.lower():
            return x
        try:
            as_float = float(x)
            if as_float.is_integer():
                return str(int(as_float))
        except Exception:
            pass
        return x

    def _norm_id_series(s: pd.Series) -> pd.Series:
        as_text = s.astype(str).str.replace(r"\.0$", "", regex=True)
        return as_text.map(_collapse_integral)

    if id_col in df_with_corr.columns:
        row_ids = _norm_id_series(df_with_corr[id_col])
    else:
        row_ids = pd.Series(pd.RangeIndex(len(df_with_corr)).astype(str), index=df_with_corr.index)

    raws = df_with_corr[raw_col].astype(str)
    cors = df_with_corr[corr_col].astype(str)

    # Placeholder used when a document yields no token rows
    placeholder = {
        "raw_index": np.nan, "raw_token": None, "raw_start": np.nan, "raw_end": np.nan,
        "corr_index": np.nan, "corr_token": None, "corr_start": np.nan, "corr_end": np.nan,
        "op": "empty", "equal_ci": False, "error_type": "EmptyText"
    }

    all_rows = []
    per_row_unmerged = []

    triples = zip(row_ids.tolist(), raws.tolist(), cors.tolist())
    for order, (rid, raw, cor) in tqdm(enumerate(triples),
                                       total=len(df_with_corr), desc="Mapping"):
        rows, raw_unmerged = build_word_map(raw, cor, use_unmerge=True)
        per_row_unmerged.append(raw_unmerged)

        if not rows:
            # Ensure at least one record for this doc so the ID exists in map_df
            rows = [dict(placeholder)]

        all_rows.extend(
            {"RowID": rid, "DocOrder": order, **r, "Changed": r.get("op") != "equal"}
            for r in rows
        )

    map_df = pd.DataFrame(all_rows)

    # tidy column order: preferred columns first, anything else after
    pref = [
        "DocOrder", "RowID",
        "raw_index","raw_token","raw_start","raw_end",
        "corr_index","corr_token","corr_start","corr_end",
        "op","equal_ci","error_type","Changed"
    ]
    leading = [c for c in pref if c in map_df.columns]
    trailing = [c for c in map_df.columns if c not in pref]
    map_df = map_df.reindex(columns=leading + trailing)

    # carry unmerged raw text back out for reference
    df_out = df_with_corr.copy()
    if add_unmerged_to_texts:
        df_out["Raw text (unmerged)"] = per_row_unmerged

    return map_df, df_out

from tqdm import tqdm
from google.colab import files
import pandas as pd

# Safety: tiny saver
def _save_outputs(df_texts, df_map,
                  path_text="/content/step8_texts.csv",
                  path_map="/content/step8_wordmap.csv"):
    """Write both frames as UTF-8 CSVs, report the paths, then try a Colab download of each."""
    targets = ((df_texts, path_text), (df_map, path_map))

    for frame, dest in targets:
        frame.to_csv(dest, index=False, encoding="utf-8")
    for _frame, dest in targets:
        print("Saved:", dest)

    # Try to download in Colab; a failure is reported per file and does not stop the other
    for _frame, dest in targets:
        try:
            files.download(dest)
        except Exception as e:
            print(f"(Download hint for {dest}: {e})")

# --- 1) Correct (build df_corr) ---
# Uses your previously defined: correct_text(), MODEL_ID, USE_MOCK, client, and run_correct_only()
# NOTE: this executes when the cell runs; run_correct_only calls correct_text once per row,
# so with USE_MOCK False every row goes through the client.
df_corr = run_correct_only(
    df_preprocessed,
    text_col="Raw text",
    id_col="ID",
    client=None if USE_MOCK else client,
    model=MODEL_ID,
    use_mock=USE_MOCK
)

# --- 2) Map raw→corrected tokens; always guarantees one RowID row per doc ---
df_map, df_corr_with_unmerged = run_mapping_only(
    df_corr,
    id_col="ID",
    raw_col="Raw text",
    corr_col="Corrected text (8)",
    add_unmerged_to_texts=True
)

# --- 3) Sanity prints (counts should match thanks to placeholders) ---
# n_docs_texts counts rows, n_docs_map counts distinct IDs, so repeated IDs in the
# input make the two numbers differ even though every row is represented in the map.
n_docs_texts = len(df_corr_with_unmerged)
n_docs_map   = df_map["RowID"].nunique() if "RowID" in df_map.columns else df_map["ID"].nunique()
print("Docs in step8_texts:", n_docs_texts)
print("Unique RowID in map :", n_docs_map)

if n_docs_map != n_docs_texts:
    print("⚠️  Note: counts differ. Placeholders should still ensure each doc appears at least once in the map.")
else:
    print("✅ Map coverage matches text rows.")

# Quick peek
print("\nstep8_texts preview:")
print(df_corr_with_unmerged[["ID","Raw text","Corrected text (8)"]].head(3).to_string(index=False))

print("\ndf_map preview:")
# only show the columns that actually exist in df_map
show_cols = [c for c in ["RowID","DocOrder","corr_index","corr_token","op","error_type"] if c in df_map.columns]
print(df_map[show_cols].head(10).to_string(index=False))

# --- 4) Save outputs ---
# Fixed file names: each run overwrites the previous step8_texts.csv / step8_wordmap.csv.
_save_outputs(
    df_texts=df_corr_with_unmerged,
    df_map=df_map,
    path_text="/content/step8_texts.csv",
    path_map="/content/step8_wordmap.csv"
)



In [None]:
# ===============================================
# 8DE — CorrSentenceID with quote-aware boundaries
# ===============================================
import pandas as pd
import numpy as np
import re

# Preconditions: this cell mutates the notebook-global df_map in place.
if "df_map" not in globals():
    raise NameError("df_map is not defined. Run Step 8 first.")
if "corr_token" not in df_map.columns and "raw_token" not in df_map.columns:
    raise KeyError("Need at least 'corr_token' or 'raw_token' in df_map.")

# Ensure ID present
# Precedence: RowID (overwrites any existing ID column), then ID, then the row index.
if "RowID" in df_map.columns:
    df_map["ID"] = df_map["RowID"].astype(str)
elif "ID" in df_map.columns:
    df_map["ID"] = df_map["ID"].astype(str)
else:
    df_map["ID"] = df_map.index.astype(str)

# Stable order per ID
# Keep a one-time backup of corr_index; it is not overwritten if the cell is re-run.
has_corr_index = "corr_index" in df_map.columns
if has_corr_index and "corr_index_orig" not in df_map.columns:
    df_map["corr_index_orig"] = df_map["corr_index"]

def _stable_sort_key(group: pd.DataFrame) -> pd.Series:
    """Return a float sort key for the rows of one ID group.

    Without a corr_index column the key is the row position within the group.
    Otherwise it is the numeric corr_index (1e9 where missing or non-numeric)
    plus a position-based offset of at most 1e-6 that breaks ties in the
    original row order.
    """
    pos = pd.Series(np.arange(len(group)), index=group.index, dtype=float)
    if not has_corr_index:
        return pos

    ci = pd.to_numeric(group["corr_index"], errors="coerce")
    lowest = pos.min()
    span = max((pos.max() - lowest), 1)   # avoid dividing by zero for one-row groups
    bump = (pos - lowest) / span * 1e-6
    return ci.where(ci.notna(), 1e9) + bump

# Temporary per-row ordering key; it is dropped again after CorrSentenceID is assigned.
df_map["_sort_key"] = df_map.groupby("ID", group_keys=False).apply(_stable_sort_key)

# Heuristics
# Token sets consulted by the sentence-boundary helpers below.
TERMINALS = {".", "!", "?", "…", "...", "?!", "!?"}
CLOSERS   = {")", "]", "}", "”", "’", "»"}             # definite closers
OPENERS   = {"(", "[", "{", "“", "‘", "«"}             # definite openers

# Lower-case abbreviations, each stored WITH its trailing dot.
ABBREV = {
    "mr.", "mrs.", "ms.", "dr.", "prof.", "sr.", "jr.", "st.", "vs.", "etc.",
    "e.g.", "i.e.", "cf.", "fig.", "ex.", "no.", "approx.", "circa.", "ca.",
    "dept.", "est.", "misc.", "rev.", "jan.", "feb.", "mar.", "apr.", "jun.",
    "jul.", "aug.", "sep.", "sept.", "oct.", "nov.", "dec."
}

_RE_INITIAL      = re.compile(r"^[A-Z]\.$")      # a single capital followed by a dot, e.g. "J."
_RE_NUM_WITH_DOT = re.compile(r"^\d+\.$")        # digits followed by a dot, e.g. "12."
_RE_ELLIPSIS     = re.compile(r"^\.\.\.$")       # exactly three ASCII dots

def _tok(x):
    if x is None or pd.isna(x):
        return ""
    return str(x)

def _is_terminal_token(tok: str, prev_tok: str, next_tok: str) -> bool:
    """Decide whether *tok* ends a sentence, given its neighbouring tokens.

    "...", "?!", "!?" and "…" always count as terminal. A lone ".", "!" or "?"
    counts as terminal unless the previous token is a listed abbreviation, an
    initial, a number with a dot or a single letter, or the next token is all
    digits.
    """
    # NOTE(review): the ABBREV / _RE_INITIAL / _RE_NUM_WITH_DOT checks only fire
    # when prev_tok itself ends in "."; whether the tokens reaching this function
    # ever carry a trailing dot is not visible here — confirm against the tokenizer.
    t = tok.strip()
    if not t:
        return False

    if _RE_ELLIPSIS.fullmatch(t) or t in {"...", "?!", "!?", "…"}:
        return True
    if t not in {".", "!", "?"}:
        return False

    lower_prev = prev_tok.lower().strip()
    vetoed = (
        lower_prev in ABBREV
        or _RE_INITIAL.fullmatch(prev_tok)
        or _RE_NUM_WITH_DOT.fullmatch(prev_tok)
        or (len(prev_tok) == 1 and prev_tok.isalpha())
        or next_tok.isdigit()
    )
    return not vetoed

def _likely_ascii_opening(double_quote_prev: str, double_quote_next: str) -> bool:
    """
    Guess whether an ASCII double quote opens (True) or closes (False) a quotation.

    Opening when:
      • there is no previous token, or it is a terminal or an opener
      • otherwise, when a next token exists that is neither a terminal nor a closer
    """
    before = double_quote_prev.strip()
    after = double_quote_next.strip()

    starts_fresh = before == "" or before in TERMINALS or before in OPENERS
    if starts_fresh:
        return True
    return bool(after) and not (after in TERMINALS or after in CLOSERS)

def _assign_corr_sentence_ids(group: pd.DataFrame) -> pd.Series:
    """Assign a 0-based sentence id to every row of one ID group.

    Rows are walked in ``_sort_key`` order using ``corr_token`` (or
    ``raw_token`` when that column is absent). After a terminal token the
    sentence stays "pending": closers and further terminal punctuation remain
    in the finished sentence, and the next other token starts a new one.

    Returns:
        Series of sentence ids aligned to ``group.index``.
    """
    g = group.sort_values("_sort_key", kind="mergesort").copy()
    toks = g["corr_token"] if "corr_token" in g.columns else g["raw_token"]
    toks = [_tok(x).strip() for x in toks]

    terminal_chars = ".!?…"

    sids = []
    sent_id = 0
    pending_end = False

    for i, t in enumerate(toks):
        prev_tok = toks[i-1] if i > 0 else ""
        next_tok = toks[i+1] if i+1 < len(toks) else ""

        if pending_end:
            # A run of terminal punctuation ("?" "!", or "." "." ".") belongs to the
            # sentence it closes; previously each extra mark opened a new sentence.
            if t and all(ch in terminal_chars for ch in t):
                sids.append(sent_id)
                continue
            # closers stick to the old sentence
            if t in CLOSERS or (t == '"' and not _likely_ascii_opening(prev_tok, next_tok)):
                sids.append(sent_id)
                continue
            # openers belong to the new sentence, including ASCII " when it looks opening
            if t in OPENERS or (t == '"' and _likely_ascii_opening(prev_tok, next_tok)):
                sent_id += 1
                sids.append(sent_id)
                pending_end = False
                continue
            # first real token after the gap starts the next sentence
            sent_id += 1
            pending_end = False
            sids.append(sent_id)
        else:
            sids.append(sent_id)

        # Does current token end a sentence given its context?
        if _is_terminal_token(t, prev_tok, next_tok):
            pending_end = True

    # ids were built in sorted order; hand them back in the caller's row order
    return pd.Series(sids, index=g.index).reindex(group.index)

# Compute sentence ids per document and store them as nullable integers.
df_map["CorrSentenceID"] = (
    df_map.groupby("ID", group_keys=False)
          .apply(_assign_corr_sentence_ids)
          .astype("Int64")
)

# The helper sort key is no longer needed once the ids are assigned.
df_map.drop(columns=["_sort_key"], inplace=True, errors="ignore")
print("✅ CorrSentenceID rebuilt with quote-aware logic.")
# Highest sentence id per ID, for the first 8 IDs
print("Sample maxima:", df_map.groupby("ID")["CorrSentenceID"].max().head(8).to_dict())

# Save checkpoint for downstream
df_map.to_csv("/content/step8_wordmap_checked.csv", index=False, encoding="utf-8")


In [None]:
# ===============================================
# 8F — Sentence counts per ID (robust types)
# ===============================================
import pandas as pd
import numpy as np

# Sentence count per ID = highest sentence id + 1; IDs whose ids are all missing count as 0.
_sid_num = pd.to_numeric(df_map["CorrSentenceID"], errors="coerce")
max_sid = _sid_num.groupby(df_map["ID"]).max()
num_sentences = (max_sid.fillna(-1) + 1).astype(int).clip(lower=0)

print("Sentence counts per ID (first 10):")
print(num_sentences.head(10).rename("NumSentences"))

# Compact sample (safe sorts even if corr_index missing)
sort_cols = ["ID"]
if "CorrSentenceID" in df_map.columns: sort_cols.append("CorrSentenceID")
if "corr_index" in df_map.columns:     sort_cols.append("corr_index")

cols_to_show = [c for c in ["ID","CorrSentenceID","corr_index","corr_token","op","error_type"] if c in df_map.columns]
# Up to 8 rows from each (ID, sentence) pair, in sorted order
sample = (df_map.sort_values(sort_cols, kind="mergesort")
               .groupby(["ID","CorrSentenceID"], group_keys=False)
               .head(8)[cols_to_show])

print("\nSample tokens per sentence (first ~50 rows):")
print(sample.head(50).to_string(index=False))



In [None]:
# ===============================================
# 8G — Post-pass: move stray opening quotes to the next sentence
# Works directly on df_map and CorrSentenceID
# ===============================================
import pandas as pd
import numpy as np

# Preconditions for the quote post-pass.
if "df_map" not in globals():
    raise NameError("df_map is not defined.")

REQUIRED = {"ID","CorrSentenceID","corr_token"}
missing = REQUIRED - set(df_map.columns)
if missing:
    raise KeyError(f"Missing columns for quote fix: {missing}")

# Work on a sorted COPY: after this, df's row order can differ from df_map's,
# while df.index still holds the original df_map row labels.
df = df_map.copy()
df.sort_values(["ID",
                "CorrSentenceID" if "CorrSentenceID" in df.columns else "ID",
                "corr_index" if "corr_index" in df.columns else df.index.name or "ID"],
               inplace=True, kind="mergesort")

# Plain arrays in df's (sorted) row order, used by the scan that follows.
ids   = df["ID"].to_numpy()
sids  = pd.to_numeric(df["CorrSentenceID"], errors="coerce").to_numpy()
toks  = df["corr_token"].astype(str).to_numpy()   # missing tokens become the text "None"/"nan"

def _likely_ascii_opening(prev_tok, next_tok):
    TERMINALS = {".", "!", "?", "…", "...", "?!", "!?"}
    CLOSERS   = {")", "]", "}", "”", "’", "»"}
    OPENERS   = {"(", "[", "{", "“", "‘", "«"}
    prev = prev_tok.strip()
    nxt  = next_tok.strip()
    if prev == "" or prev in TERMINALS or prev in OPENERS:
        return True
    if nxt and nxt not in TERMINALS and nxt not in CLOSERS:
        return True
    return False

# Scan each document's tokens (in df's sorted order) and move an opening quote that was
# left on the previous sentence forward onto the sentence of the token that follows it.
moved = np.zeros(len(df), dtype=bool)

i = 0
n = len(df)
while i < n:
    # [i, j) is the run of rows that share one ID
    j = i + 1
    while j < n and ids[j] == ids[i]:
        j += 1

    for k in range(i, j):
        tok = toks[k]
        if tok not in {'"', '“', '‘', '«'}:
            continue

        cur_sid = sids[k]
        nxt = k + 1
        if nxt >= j:
            # last token of the document: nothing to attach to
            continue

        next_sid = sids[nxt]
        next_tok = toks[nxt]

        # If the very next token is on a newer sentence, this quote is leading noise
        if next_sid > cur_sid:
            if tok in {'“', '‘', '«'} or (tok == '"' and _likely_ascii_opening(toks[k-1] if k > i else "", next_tok)):
                sids[k] = next_sid
                moved[k] = True

    i = j

# `sids` is ordered like the sorted copy `df`, not like `df_map`. Assigning the bare
# array would pair values with rows by position and scramble sentence ids whenever the
# sort changed the row order, so write back through df.index to align on row labels.
# (Assumes df_map's index labels are unique, as with a default RangeIndex.)
df_map["CorrSentenceID"] = pd.Series(sids, index=df.index)
moved_count = int(moved.sum())
total_quotes = int(np.isin(toks, ['"', '“', '”', '‘', '’', '«', '»']).sum())
print(f"Moved opening quotes: {moved_count} of {total_quotes} quote tokens.")
df_map.to_csv("/content/step8_wordmap_checked.csv", index=False, encoding="utf-8")


In [None]:
# Compact preview - a few rows per ID
import pandas as pd

# NOTE(review): `need` is not referenced anywhere else in this cell; only `show` is used.
need = {"ID", "CorrSentenceID", "RowID", "corr_index", "corr_token", "op", "error_type"}
# Preview columns, restricted to those that exist in df_map
show = [c for c in ["ID","CorrSentenceID","RowID","corr_index","corr_token","op","error_type"] if c in df_map.columns]

print("Columns present:", show)

# The sort below names ID, CorrSentenceID and corr_index directly, so all three must exist.
sample_per_id = (
    df_map
      .sort_values(["ID","CorrSentenceID","corr_index"], kind="mergesort")
      .groupby("ID", group_keys=False)
      .head(8)   # up to 8 tokens per ID - bump if you want more
      [show]
)

print(sample_per_id.to_string(index=False)[:5000])  # trim long output


In [None]:
# --- Step 8T: Title detection + quote surgery (post-pass on df_map) ---

import re
import pandas as pd
import numpy as np

# Load from disk if needed (adjust the path as necessary)
# Fall back to the saved checkpoint only when df_map is not already in the session.
if "df_map" not in globals():
    PATH = "/content/step8_wordmap_checked.csv"  # or your /mnt/data/... path
    # IDs are read as strings so they are not parsed as numbers
    df_map = pd.read_csv(PATH, dtype={"ID": str, "RowID": str}, low_memory=False)

# --- helpers ---
# Token sets used by the quote and title heuristics in this cell.
TERMINALS = {".", "!", "?", "…", "...", "?!", "!?"}
CLOSERS   = {")", "]", "}", "”", "’", "»"}
OPENERS   = {"(", "[", "{", "“", "‘", "«"}

# Matches any single word character; used by is_word()
WORD_RX = re.compile(r"\w", flags=re.UNICODE)

def is_word(tok: str) -> bool:
    """True when *tok* is non-empty and contains at least one ``\\w`` character."""
    if not tok:
        return False
    return WORD_RX.search(tok) is not None

def is_ascii_quote_opening(prev_tok: str, next_tok: str) -> bool:
    """Guess whether an ASCII double quote between the two tokens opens a quotation.

    None is treated like an empty token. True when there is no previous token
    or it is a terminal/opener; otherwise True only when a next token exists
    that is neither a terminal nor a closer.
    """
    before = (prev_tok or "").strip()
    after = (next_tok or "").strip()

    if not before or before in TERMINALS or before in OPENERS:
        return True
    return bool(after) and not (after in TERMINALS or after in CLOSERS)

def is_titleish(tokens):
    """
    Heuristic title test for one sentence's tokens.

    True when the sentence has 1-12 word tokens, does not end in a terminal
    token, at least 60% of its words carry capitals (ALLCAPS, TitleCase or
    an inner capital) and at most 50% are stopwords.
    """
    words = [t for t in tokens if is_word(t)]
    n = len(words)
    if n == 0 or n > 12:
        return False

    # Ends with obvious terminal? probably not a title.
    if tokens and tokens[-1] in TERMINALS:
        return False

    def _has_caps(w):
        # ALLCAPS, TitleCase, or any capital after the first character
        title_case = w[:1].isupper() and w[1:].islower()
        return w.isupper() or title_case or any(c.isupper() for c in w[1:])

    capped = sum(1 for w in words if _has_caps(w))
    cap_ratio = capped / max(1, n)

    # tiny stopword penalty
    STOP = {"the","a","an","and","or","but","to","of","in","for","on","with","at","by","from","as"}
    stop_hits = sum(1 for w in words if w.lower() in STOP)
    stop_ratio = stop_hits / max(1, n)

    return (cap_ratio >= 0.6) and (stop_ratio <= 0.5)

# Ensure types we need
# Ensure types we need
# All four columns are required by the title and quote passes below.
for col in ["ID","CorrSentenceID","corr_index","corr_token"]:
    if col not in df_map.columns:
        raise KeyError(f"Missing required column: {col}")

df_map["ID"] = df_map["ID"].astype(str)
# Coerce to plain int unless the column is already a numpy integer dtype;
# unparseable or missing sentence ids become 0.
if df_map["CorrSentenceID"].dtype.kind not in "iu":
    df_map["CorrSentenceID"] = pd.to_numeric(df_map["CorrSentenceID"], errors="coerce").fillna(0).astype(int)

# --- 1) Title detection & reindexing of CorrSentenceID ---
df_map["TITLE"] = False  # initialize

def reassign_title_sentence(g: pd.DataFrame) -> pd.DataFrame:
    """Flag a title-like first sentence of one ID group and renumber its sentences.

    The group is sorted by (CorrSentenceID, corr_index) and its sentence ids are
    shifted so the lowest becomes 0. When the first sentence passes
    ``is_titleish`` its rows get TITLE=True and every later sentence id is
    increased by one more.
    """
    # NOTE(review): with a title present the ids come out as 0, 2, 3, ... (no
    # sentence 1), and re-running the cell shifts them again — confirm this is intended.
    g = g.sort_values(["CorrSentenceID","corr_index"], kind="mergesort").copy()

    # The very first sentence is the only title candidate
    first_sid = g["CorrSentenceID"].min()
    head = g[g["CorrSentenceID"] == first_sid]
    has_title = is_titleish(head["corr_token"].astype(str).tolist())

    if has_title:
        g.loc[head.index, "TITLE"] = True

    # normalize so the first sentence is 0
    if first_sid != 0:
        g["CorrSentenceID"] = g["CorrSentenceID"] - first_sid

    if has_title:
        # everything after the title bumps by +1 (title stays 0)
        after_title = g["CorrSentenceID"] > 0
        g.loc[after_title, "CorrSentenceID"] += 1

    return g

# Run title detection per document; the result replaces df_map with rows in each
# group's sorted order and a fresh 0..N-1 index.
df_map = (
    df_map
      .groupby("ID", group_keys=False)
      .apply(reassign_title_sentence)
      .reset_index(drop=True)
)

# --- 2) Quote marriage: fix closing/opening quotes at sentence edges ---

def fix_quotes(g: pd.DataFrame) -> pd.DataFrame:
    """Re-attach quote tokens that landed on the wrong side of a sentence break.

    Works on the rows of one ID, ordered by (CorrSentenceID, corr_index), and
    only ever rewrites ``CorrSentenceID``; tokens and row order are untouched.

    Three passes run in order over the token sequence:
      A. a closing quote that opens a new sentence is pulled back into the
         previous sentence;
      B. an ASCII ``"`` judged to be an opening quote but still carrying the
         previous sentence id is pushed forward to the next sentence;
      C. a terminal and an adjacent closing quote (either order) are forced
         into the same sentence.

    Relies on ``TERMINALS``, ``CLOSERS`` and ``is_ascii_quote_opening`` being
    defined elsewhere in the notebook.

    Returns the sorted copy of ``g`` with the adjusted ``CorrSentenceID``.
    """
    g = g.sort_values(["CorrSentenceID","corr_index"], kind="mergesort").copy()
    # copy=True: the ids are edited in place below. Without an explicit copy
    # to_numpy() may hand back a view of the frame's data, which is read-only
    # when pandas Copy-on-Write is active and would make the writes fail.
    sids = g["CorrSentenceID"].to_numpy(copy=True)
    toks = g["corr_token"].astype(str).to_numpy()

    # Tokens treated as closing quotes by passes A and C.
    closing_quotes = {'"', '”', '’', '»'}

    n = len(g)
    if n == 0:
        return g

    # Pass A: pull stray closing quotes that start a sentence back to previous sentence
    # e.g., [..., '.', SID=0] , ['”', SID=1]  -> make '”' SID=0
    for i in range(1, n):
        if toks[i] in closing_quotes and sids[i] > sids[i-1]:
            # Only if previous token looks like a sentence-ending token
            # NOTE(review): the second argument here is the quote itself, whereas
            # Pass B passes the token *after* the quote — confirm against the
            # helper's definition that this is the intended call.
            if toks[i-1] in TERMINALS or toks[i-1] in CLOSERS or not is_ascii_quote_opening(toks[i-1], toks[i]):
                sids[i] = sids[i-1]

    # Pass B: keep opening quotes at the start of a sentence with that sentence
    # If an opening quote was left attached to previous sentence, move it forward.
    for i in range(1, n):
        if toks[i] == '"':
            prev_tok = toks[i-1]
            next_tok = toks[i+1] if i+1 < n else ""
            if is_ascii_quote_opening(prev_tok, next_tok) and sids[i] == sids[i-1]:
                # If looks like an opening quote but stuck to previous sid, nudge it forward if possible
                # Only shift when the next token already starts a newer sentence
                if i+1 < n and sids[i+1] > sids[i]:
                    sids[i] = sids[i+1]

    # Pass C: conventional .” or !” patterns — keep the terminal and the quote together in the same sentence
    for i in range(1, n):
        # Case: terminal at i-1, closing quote at i → keep same SID (already true by Pass A usually)
        if toks[i-1] in TERMINALS and toks[i] in closing_quotes and sids[i] != sids[i-1]:
            sids[i] = sids[i-1]

        # Case: closing quote at i-1, terminal at i → also keep together
        if toks[i-1] in closing_quotes and toks[i] in TERMINALS and sids[i] != sids[i-1]:
            sids[i] = sids[i-1]

    g["CorrSentenceID"] = sids
    return g

# Repair quote placement ID by ID and rebuild a clean positional index.
_by_id = df_map.groupby("ID", group_keys=False)
df_map = _by_id.apply(fix_quotes).reset_index(drop=True)

# Optional: persist a checkpoint of the map after the title and quote passes.
out_path = "/content/step8_wordmap_checked_title_quotes.csv"
df_map.to_csv(out_path, index=False, encoding="utf-8")
print("✅ Updated map saved:", out_path)

# Quick audit: highest sentence id per ID, then title / non-title row counts.
print("Sample: IDs → max CorrSentenceID")
_max_sid_per_id = df_map.groupby("ID")["CorrSentenceID"].max()
print(_max_sid_per_id.head(10))
print("\nTITLE counts:")
_title_counts = df_map.groupby(["ID","TITLE"]).size().unstack(fill_value=0)
print(_title_counts.head(10))


In [None]:
# 8H Full head and a few stats
_preview = df_map.head(20)
display(_preview)

# Number of token rows in each (ID, sentence) pair.
print("\nCounts by ID and sentence:")
_rows_per_sentence = df_map.groupby(["ID","CorrSentenceID"]).size()
print(_rows_per_sentence.head(20))

# Highest sentence id seen for each ID.
print("\nMax CorrSentenceID per ID:")
_max_sid = df_map.groupby("ID")["CorrSentenceID"].max()
print(_max_sid.head(20))


In [None]:
# =========================
# 8Y — Sentence boundary flags + correctness checks
# =========================
import re
import numpy as np
import pandas as pd

# --- requirements ---
# Columns this cell reads; abort early if any of them is absent.
need = {"ID", "CorrSentenceID", "corr_index", "corr_token", "raw_token"}
missing = need - set(df_map.columns)
if missing:
    raise KeyError(f"df_map missing required columns: {missing}")

# sort for stable per-sentence order
_order_keys = ["ID", "CorrSentenceID", "corr_index"]
df_map = df_map.sort_values(_order_keys, kind="mergesort").copy()

# initialize target columns (left alone when an earlier run already made them)
for _target in ("Sentence Boundaries", "BoundaryCheck"):
    if _target not in df_map.columns:
        df_map[_target] = ""

# helpers
# Tokens that count as a sentence-final marker.
TERMINALS = {
    ".", "!", "?", "…", "...", "?!", "!?",
}
# Punctuation that may legitimately precede the first word of a sentence.
OPENING_PUNCT = {
    '"', "“", "‘", "«", "(", "[", "{",
}

def is_word(tok: str) -> bool:
    """Return True when ``tok`` is truthy and contains at least one ``\\w`` character."""
    if not tok:
        return False
    return re.search(r"\w", str(tok), flags=re.UNICODE) is not None

def first_content_row(g: pd.DataFrame) -> int | None:
    """
    Find the row index (label) of the first *word* token in this sentence,
    skipping opening punctuation like \" or ( if they occur first.
    """
    for idx, tok in zip(g.index, g["corr_token"].astype(str)):
        if is_word(tok):
            return idx
        # if it's opening punctuation, keep scanning
        if tok in OPENING_PUNCT:
            continue
        # if it's other punctuation, keep scanning until we hit a word
        continue
    return None

def last_boundary_row(g: pd.DataFrame) -> int | None:
    """
    Find the row index (label) of the final sentence-boundary marker token
    in corr_token within this sentence (., !, ?, …, ... , ?!, !?).
    If none present, return None (we won’t mark an ending).
    """
    # Go from the end toward the start
    toks = g["corr_token"].astype(str).tolist()
    for pos in range(len(toks)-1, -1, -1):
        t = toks[pos]
        if t in TERMINALS:
            return g.index[pos]
    return None

def begins_with_upper(raw_tok: str) -> bool | None:
    """
    True if the first alphabetic char in the *raw* token is uppercase.
    Returns None if no alphabetic char (e.g., insertion or pure punctuation).
    """
    s = str(raw_tok or "")
    m = re.search(r"[A-Za-z]", s)
    if not m:
        return None
    return s[m.start()].isupper()

def _append_tag(existing, tag):
    """Return ``existing`` with ``tag`` added to its " | "-separated tag list.

    A missing/NaN/empty ``existing`` yields just ``tag``. A tag that is already
    listed is not added again, so re-running this cell (or running another
    cell that writes the same tags) does not pile up duplicates.
    """
    if existing is None or (not isinstance(existing, str) and pd.isna(existing)):
        return tag
    existing = str(existing)
    if not existing:
        return tag
    if tag in existing.split(" | "):
        return existing
    return existing + " | " + tag

# work per sentence
for (id_, sid), g in df_map.groupby(["ID","CorrSentenceID"], sort=False):
    g = g.sort_values("corr_index", kind="mergesort")
    b_row = first_content_row(g)   # label of the first word token, or None
    e_row = last_boundary_row(g)   # label of the last terminal token, or None

    # mark boundaries (a single row can be both beginning and ending)
    if b_row is not None:
        df_map.at[b_row, "Sentence Boundaries"] = _append_tag(
            df_map.at[b_row, "Sentence Boundaries"], "Sentence Beginning"
        )
    if e_row is not None:
        df_map.at[e_row, "Sentence Boundaries"] = _append_tag(
            df_map.at[e_row, "Sentence Boundaries"], "Sentence Ending"
        )

    # correctness checks
    # 1) Beginning correctness (capitalisation of RAW token on the beginning row)
    if b_row is not None:
        cap = begins_with_upper(df_map.at[b_row, "raw_token"])
        if cap is True:
            v = "Correct Beginning"
        elif cap is False:
            v = "Incorrect Beginning"
        else:
            v = "Unknown Beginning"
        df_map.at[b_row, "BoundaryCheck"] = _append_tag(df_map.at[b_row, "BoundaryCheck"], v)

    # 2) Ending correctness (RAW token equals CORR token on the ending row)
    if e_row is not None:
        re_tok = str(df_map.at[e_row, "raw_token"] or "")
        ce_tok = str(df_map.at[e_row, "corr_token"] or "")
        # exact match requested (punct/quotes matter)
        v = "Correct Ending" if re_tok == ce_tok else "Incorrect Ending"
        df_map.at[e_row, "BoundaryCheck"] = _append_tag(df_map.at[e_row, "BoundaryCheck"], v)

# optional: ensure a stable SentenceRef if you want to join with sentence table
def _sid_str(x):
    try: return f"{int(x):03d}"
    except: return "000"
# Derive SentenceRef ("<ID>_s<3-digit sentence id>") only when no earlier cell
# has already provided one.
if "SentenceRef" not in df_map.columns:
    _id_part = df_map["ID"].astype(str)
    _sid_part = df_map["CorrSentenceID"].map(_sid_str)
    df_map["SentenceRef"] = _id_part + "_s" + _sid_part

# quick peek
_peek_cols = [
    "ID", "CorrSentenceID", "corr_index",
    "raw_token", "corr_token",
    "Sentence Boundaries", "BoundaryCheck",
]
display(df_map.head(20)[_peek_cols])


In [None]:
# === Patch A: Sentence boundaries + checks + SentenceRef on df_map ===
import re
import numpy as np
import pandas as pd

# Columns Patch A cannot work without.
REQUIRED = {"ID", "CorrSentenceID", "corr_token"}
missing = REQUIRED - set(df_map.columns)
if missing:
    raise KeyError(f"df_map missing required columns: {missing}")

# stable sort (works even if corr_index has NaNs/gaps)
if "corr_index" not in df_map.columns:
    df_map["corr_index"] = np.nan
# Current row position, used as a small tie-breaker inside the sort key.
df_map["_rowpos"] = np.arange(len(df_map))
# Non-numeric / missing corr_index values are pushed to the end (1e12).
_corr_numeric = pd.to_numeric(df_map["corr_index"], errors="coerce").fillna(1e12)
df_map["_sort_corr"] = _corr_numeric + (df_map["_rowpos"]*1e-9)
df_map = df_map.sort_values(
    ["ID", "CorrSentenceID", "_sort_corr"],
    kind="mergesort",
)

# columns to fill (kept as-is when they already exist)
for _target in ("Sentence Boundaries", "BoundaryCheck"):
    if _target not in df_map.columns:
        df_map[_target] = ""

# helpers
# Tokens that count as a sentence-final marker.
TERMINALS = {
    ".", "!", "?", "…", "...", "?!", "!?",
}
# Punctuation that may precede the first word of a sentence.
OPENING_PUNCT = {
    '"', "“", "‘", "«", "(", "[", "{",
}

def is_word(tok: str) -> bool:
    """Tell whether ``tok`` is non-empty and holds at least one ``\\w`` character."""
    has_word_char = bool(tok) and re.search(r"\w", str(tok), flags=re.UNICODE)
    return bool(has_word_char)

def first_content_row(g: pd.DataFrame):
    """Return the index label of the first word token in ``corr_token``.

    Tokens listed in ``OPENING_PUNCT`` are skipped outright; of the rest, the
    first one accepted by ``is_word`` wins. Returns None if no token qualifies.
    """
    candidates = (
        label
        for label, tok in zip(g.index, g["corr_token"].astype(str))
        if tok not in OPENING_PUNCT and is_word(tok)
    )
    return next(candidates, None)

def last_terminal_row(g: pd.DataFrame):
    """Return the index label of the last ``corr_token`` found in ``TERMINALS``.

    Walks backwards from the final row; returns None when the sentence has no
    terminal token.
    """
    tokens = g["corr_token"].astype(str).tolist()
    pos = len(tokens) - 1
    while pos >= 0 and tokens[pos] not in TERMINALS:
        pos -= 1
    if pos < 0:
        return None
    return g.index[pos]

def begins_with_upper_raw(raw_tok: str):
    """Report whether the first alphabetic character of the raw token is uppercase.

    Returns True or False according to that character, and None when there is
    nothing to judge: the token is missing (None / NaN, as happens for rows
    without a raw counterpart), empty, or contains no alphabetic character.

    Missing values are tested explicitly because ``str(float("nan"))`` is the
    text "nan" and would otherwise be reported as a lowercase beginning.
    ``str.isalpha`` is used so non-ASCII capitals such as "É" are recognised.
    """
    if raw_tok is None:
        return None
    # Strings are never "missing"; only test non-strings (NaN, pd.NA, ...).
    if not isinstance(raw_tok, str) and pd.isna(raw_tok):
        return None
    first_alpha = next((ch for ch in str(raw_tok) if ch.isalpha()), None)
    if first_alpha is None:
        return None
    return first_alpha.isupper()

def _append_tag(existing, tag):
    """Return ``existing`` with ``tag`` added to its " | "-separated tag list.

    A missing/NaN/empty ``existing`` yields just ``tag``. A tag that is already
    listed is not added again, so running this cell after an earlier cell has
    written the same tags (or running it twice) does not duplicate them.
    """
    if existing is None or (not isinstance(existing, str) and pd.isna(existing)):
        return tag
    existing = str(existing)
    if not existing:
        return tag
    if tag in existing.split(" | "):
        return existing
    return existing + " | " + tag

# fill per sentence
_has_raw = "raw_token" in df_map.columns
for (id_, sid), g in df_map.groupby(["ID","CorrSentenceID"], sort=False):
    g = g.sort_values("_sort_corr", kind="mergesort")
    b = first_content_row(g)   # label of the first word token, or None
    e = last_terminal_row(g)   # label of the last terminal token, or None

    if b is not None:
        df_map.at[b, "Sentence Boundaries"] = _append_tag(
            df_map.at[b, "Sentence Boundaries"], "Sentence Beginning"
        )
        # correctness on the beginning row uses RAW token capitalisation
        rawb = df_map.at[b, "raw_token"] if _has_raw else None
        cap = begins_with_upper_raw(rawb)
        tag = "Correct Beginning" if cap is True else ("Incorrect Beginning" if cap is False else "Unknown Beginning")
        df_map.at[b, "BoundaryCheck"] = _append_tag(df_map.at[b, "BoundaryCheck"], tag)

    if e is not None:
        df_map.at[e, "Sentence Boundaries"] = _append_tag(
            df_map.at[e, "Sentence Boundaries"], "Sentence Ending"
        )
        # ending is correct only when the raw token equals the corrected terminal
        rawe = str(df_map.at[e, "raw_token"] or "") if _has_raw else ""
        corre = str(df_map.at[e, "corr_token"] or "")
        tag = "Correct Ending" if rawe == corre else "Incorrect Ending"
        df_map.at[e, "BoundaryCheck"] = _append_tag(df_map.at[e, "BoundaryCheck"], tag)

# stable sentence key for joins
def _sid3(x):
    try: return f"{int(x):03d}"
    except: return "000"
# SentenceRef = "<ID>_s<3-digit sentence id>"; always rebuilt in this cell.
_id_part = df_map["ID"].astype(str)
_sid_part = df_map["CorrSentenceID"].map(_sid3)
df_map["SentenceRef"] = _id_part + "_s" + _sid_part

# cleanup temps (errors="ignore" tolerates them being gone already)
df_map.drop(columns=["_rowpos","_sort_corr"], inplace=True, errors="ignore")

print("✅ Sentence boundaries + SentenceRef added to df_map.")
_report_cols = [
    "ID", "CorrSentenceID", "SentenceRef", "corr_index",
    "raw_token", "corr_token", "Sentence Boundaries", "BoundaryCheck",
]
print(df_map[_report_cols].head(12).to_string(index=False))


In [None]:
#8I Save and download
from google.colab import files

# Write the checked word map to the Colab filesystem.
out_path = "/content/step8_wordmap_checked.csv"
df_map.to_csv(out_path, index=False, encoding="utf-8")
print("Saved to:", out_path)

# Best effort: trigger a browser download; fall back to a hint if it fails.
try:
    files.download(out_path)
except Exception:
    print("If automatic download is blocked, grab it from the Files pane on the left.")


# Step 9: Segment Text by Sentence
## Purpose:
To process the raw and corrected text by segmenting it into sentences and structuring the results into a standardized format.

## Actions:

* Define segment_and_correct Function:
 * Parameters: Accepts text (the raw input text).
 * Prompt Creation: Constructs a detailed prompt with instructions for the AI to perform specific tasks on the text.
 * API Call: Uses the previously defined call_chatgpt function to send the prompt to the OpenAI API.
 * Response Handling: Extracts JSON from the API response and parses it into a Python dictionary.
 * Error Handling: Catches JSON decoding errors and returns an empty dictionary if parsing fails.
* Mock Text for Testing:
 * In Step 8.3, a mock version of segment_and_correct is used to simulate API behavior for testing purposes.

In [None]:
# === Step 9 (drop-in): build sentence table using df_map’s flags ===
import pandas as pd, numpy as np, re
from IPython.display import display, HTML
from google.colab import files

# we expect df_map from Step 8 + Patch A
NEED = {
    "ID", "CorrSentenceID", "corr_token", "raw_token",
    "Sentence Boundaries", "BoundaryCheck", "SentenceRef",
}
missing = NEED - set(df_map.columns)
if missing:
    raise KeyError(f"df_map missing columns needed for Step 9: {missing}")

# sorting: by ID and sentence, then by token position when it is available
sort_cols = ["ID", "CorrSentenceID"]
if "corr_index" in df_map.columns:
    sort_cols = sort_cols + ["corr_index"]
# wm is an ordered working copy; df_map itself is left untouched.
wm = df_map.sort_values(sort_cols, kind="mergesort").copy()

# Characters that attach to the token on their left (no space before them).
NO_SPACE_BEFORE = set(list(".,;:!?)]}\"'»”’…"))
# Characters that attach to the token on their right (no space after them).
NO_SPACE_AFTER  = set(list("([{\"'«“‘"))

def detok(tokens):
    """Join a token sequence back into readable text.

    Rules, applied token by token:
      * None / NaN / empty tokens are skipped.
      * A token made up solely of NO_SPACE_BEFORE characters (".", "?!",
        "...", ")", "”", ...) is glued to the text on its left.
      * The token following an opener ("(", "“", "«", ...) is glued to that
        opener, wherever in the sentence the opener appears.
      * Everything else is preceded by a single space.

    The straight quotes ``"`` and ``'`` are in both character sets. They are
    treated as openers only when they are the very first token; elsewhere they
    attach to the left like closing quotes.

    Returns the joined string with surrounding whitespace stripped.
    """
    out = []
    # True when the most recently emitted token was placed as an opener, i.e.
    # the next ordinary token must follow it without a space. Tracked per
    # token because the accumulated chunk in out[-1] carries a leading space
    # and earlier glued text, so it cannot be compared with NO_SPACE_AFTER.
    prev_was_opener = False
    for t in tokens:
        if t is None or pd.isna(t):
            continue
        t = str(t)
        if not t:
            continue  # an empty token would only add a stray space
        if not out:
            out.append(t)
            prev_was_opener = t in NO_SPACE_AFTER
        elif all(ch in NO_SPACE_BEFORE for ch in t):
            out[-1] += t
            prev_was_opener = False
        elif prev_was_opener:
            out[-1] += t
            prev_was_opener = t in NO_SPACE_AFTER
        else:
            out.append(" " + t)
            prev_was_opener = t in NO_SPACE_AFTER
    return "".join(out).strip()

def _boundary_flag(check_text, ok_tag, bad_tag):
    """Translate a " | "-separated BoundaryCheck string into 1 / 0 / NaN.

    1 when ``ok_tag`` is listed, 0 when ``bad_tag`` is listed, NaN when neither
    is (empty text, or only an "Unknown ..." verdict).
    """
    tags = check_text.split(" | ") if check_text else []
    if ok_tag in tags:
        return 1
    if bad_tag in tags:
        return 0
    return np.nan


def summarize_sentence(g: pd.DataFrame):
    """Collapse the token rows of one (ID, CorrSentenceID) group into one record.

    Reads ``corr_token``, ``raw_token``, ``Sentence Boundaries``,
    ``BoundaryCheck`` and ``SentenceRef``; ``op`` and ``corr_index`` are used
    when present (``error_type`` is looked up but not summarised).

    Returns a ``pd.Series`` holding the detokenised corrected and raw
    sentences, token and edit counts (NaN when there is no ``op`` column), the
    index labels of the boundary rows, the boundary correctness flags, quote /
    terminal presence booleans and the min/max ``corr_index``.

    ``CorrectBeginning`` / ``CorrectEnding`` are 1 (correct), 0 (incorrect) or
    NaN (no boundary row, no verdict, or an "Unknown" verdict).
    """
    corr_tokens = g["corr_token"].tolist()
    # Rows without a raw counterpart carry NaN in raw_token; leave them out.
    raw_tokens  = [x for x in g["raw_token"].tolist() if not pd.isna(x)]
    corr_text   = detok(corr_tokens)
    raw_text    = detok(raw_tokens)

    # flags from boundary rows (may be empty if heuristics didn’t find one)
    b_rows = g[g["Sentence Boundaries"].str.contains("Sentence Beginning", na=False)]
    e_rows = g[g["Sentence Boundaries"].str.contains("Sentence Ending",   na=False)]

    begin_ok = np.nan
    end_ok   = np.nan
    if not b_rows.empty:
        chk = " | ".join(b_rows["BoundaryCheck"].dropna().astype(str))
        # An "Unknown Beginning" verdict stays NaN instead of counting as 0.
        begin_ok = _boundary_flag(chk, "Correct Beginning", "Incorrect Beginning")
    if not e_rows.empty:
        chk = " | ".join(e_rows["BoundaryCheck"].dropna().astype(str))
        end_ok = _boundary_flag(chk, "Correct Ending", "Incorrect Ending")

    # convenience booleans
    has_hard_terminal = any(t in {".","!","?","…","...","?!","!?"} for t in corr_tokens)
    has_opening_quote = any(t in {'"', "“", "‘", "«"} for t in corr_tokens)
    has_closing_quote = any(t in {'"', "”", "’", "»"} for t in corr_tokens)

    # edits summary (if present)
    op  = g["op"] if "op" in g.columns else pd.Series([], dtype=object)
    et  = g["error_type"] if "error_type" in g.columns else pd.Series([], dtype=object)

    return pd.Series({
        "SentenceRef": g["SentenceRef"].iloc[0],
        "CorrectedSentence": corr_text,
        "RawSentence": raw_text,
        "TokensInSentence": int(len(g)),
        "EditsInSentence": int((op != "equal").sum()) if not op.empty else np.nan,
        "EqualsInSentence": int((op == "equal").sum()) if not op.empty else np.nan,
        "Insertions": int((op == "insert").sum()) if not op.empty else np.nan,
        "Deletions": int((op == "delete").sum()) if not op.empty else np.nan,
        "Replacements": int((op == "replace").sum()) if not op.empty else np.nan,
        "BeginBoundaryRow": (b_rows.index[0] if not b_rows.empty else np.nan),
        "EndBoundaryRow":   (e_rows.index[0] if not e_rows.empty else np.nan),
        "CorrectBeginning": begin_ok,        # 1=true, 0=false, NaN=unknown/missing
        "CorrectEnding":    end_ok,          # 1=true, 0=false, NaN=missing
        "HasHardTerminal":  bool(has_hard_terminal),
        "HasOpeningQuote":  bool(has_opening_quote),
        "HasClosingQuote":  bool(has_closing_quote),
        "CorrIndexMin": g["corr_index"].min() if "corr_index" in g.columns else np.nan,
        "CorrIndexMax": g["corr_index"].max() if "corr_index" in g.columns else np.nan,
    })

# One summary row per (ID, sentence), in the order the groups first appear.
_per_sentence = wm.groupby(["ID","CorrSentenceID"], as_index=False, sort=False)
_summaries = _per_sentence.apply(summarize_sentence).reset_index(drop=True)
# SentenceRef is a string key, so this ordering is lexicographic.
sent_df = _summaries.sort_values(["ID","SentenceRef"], kind="mergesort")

# Save + preview
out_path = "/content/step9_sentence_mapping_with_boundaries.csv"
sent_df.to_csv(out_path, index=False, encoding="utf-8")
print("✅ Step 9 sentence mapping saved:", out_path)

_preview_title = HTML("<b>Sentence mapping preview (with boundary flags)</b>")
display(_preview_title)
display(sent_df.head(50))

# Best effort: trigger a browser download; otherwise point at the saved file.
try:
    files.download(out_path)
except Exception:
    print("If the download is blocked, fetch it from the Files pane:", out_path)
