In [1]:
# ---- Setup: imports, API client, paths, and our src modules ----
import os, sys, time, pandas as pd, numpy as np
from openai import OpenAI

# Build the API client; the key is taken from the OPENAI_API_KEY environment
# variable (export it in the terminal that launches the notebook).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Resolve the project root: when the kernel's working directory is the
# "notebooks" folder, the root is its parent; otherwise it is the cwd itself.
BASE = os.getcwd()
BASE = os.path.dirname(BASE) if os.path.basename(BASE).lower() == "notebooks" else BASE

# Data files and the local helper modules live in sibling folders of the root.
DATA, SRC = (os.path.join(BASE, folder) for folder in ("data", "src"))

# Make the src/ modules importable; the membership test keeps re-runs of this
# cell from appending the same entry again.
if SRC not in sys.path:
    sys.path.append(SRC)

# Echo the resolved locations so a wrong working directory is obvious at a glance.
print("BASE:", BASE)
print("DATA:", DATA)
print("SRC:", SRC)
print("SRC files:", os.listdir(SRC))

from retrieval import build_glossary_corpus, best_k_terms, select_constraints
from prompting import make_prompt, restore_spans
from evaluation import term_adherence, basic_metrics


BASE: e:\Data\translation-pipeline
DATA: e:\Data\translation-pipeline\data
SRC: e:\Data\translation-pipeline\src
SRC files: ['evaluation.py', 'prompting.py', 'retrieval.py']


In [2]:
# ---- Load data & prepare retrieval ----
import pandas as pd
from sentence_transformers import SentenceTransformer

# 1) Read the glossary and the English source sentences from the data folder
glossary, samples = (
    pd.read_csv(os.path.join(DATA, csv_name))
    for csv_name in ("glossary.csv", "samples_en.csv")
)

print("Glossary rows:", len(glossary))
display(glossary.head(3))
print("Sample rows:", len(samples))
display(samples.head(5))

# 2) Short language codes -> friendly names used inside prompts
lang_label_map = {"fr": "French", "it": "Italian", "ja": "Japanese"}

# 3) One retrieval corpus per target language, built from the glossary.
#    Iterating the label map keeps the language list defined in a single place.
corpora = {}
for tgt in lang_label_map:
    corpora[tgt] = build_glossary_corpus(glossary, tgt)

# 4) Sentence-embedding model used to match source text against glossary entries
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

print("Prepared corpora for:", list(corpora))


  from .autonotebook import tqdm as notebook_tqdm


Glossary rows: 9


Unnamed: 0,term,part_of_speech,domain,definition,en,fr,it,ja,notes
0,GPU,noun,tech,Graphics processing unit for parallel computat...,GPU,processeur graphique,GPU,ＧＰＵ,Keep as acronym; target forms preferred
1,Account ID,noun,finance,Unique identifier for user accounts,account ID,ID de compte,ID account,アカウントID,"Do not translate ""ID"""
2,Checkout,verb,commerce,The process of completing a purchase,checkout,paiement,cassa,チェックアウト,May be noun/verb; prefer payment flow sense


Sample rows: 30


Unnamed: 0,id,src,pair,has_html
0,1,Enter your email address to continue.,en-fr,0
1,2,Free shipping on orders over $50 at checkout.,en-it,0
2,3,Enable two-factor authentication (2FA) in Sett...,en-ja,0
3,4,Your refund has been issued to the original pa...,en-it,0
4,5,Copy the Account ID and share it with support.,en-fr,0


Prepared corpora for: ['fr', 'it', 'ja']


In [6]:
# --- Caching + budget guard (paste once) ---
import os, json, hashlib, time, random
import tiktoken
from openai import RateLimitError, APIError

# --------- adjust these to your current model prices ---------
# Prices are US dollars per 1M tokens. The numbers below are placeholders:
# replace them with your model's current input/output rates from your dashboard.
INPUT_COST_PER_M = 0.15   # dollars per 1M input tokens (example)
OUTPUT_COST_PER_M = 0.60  # dollars per 1M output tokens (example)
BUDGET_DOLLARS = 5.00     # hard cap for this run
# -------------------------------------------------------------

# Tokenizer used only to estimate token counts for cost accounting.
# NOTE(review): the default model below is gpt-4o-mini; gpt-4o-family models are
# presumably tokenized with "o200k_base" rather than "cl100k_base", so estimates
# here are approximate — confirm against the tiktoken model table.
enc = tiktoken.get_encoding("cl100k_base")  # decent default for OpenAI chat models
# Running total of the estimated spend; re-running this cell resets it to zero.
spent_dollars = 0.0

# Translation cache: appended to disk as one JSON record per line (JSONL) and
# mirrored in memory by the _cache dict, keyed by the hash from _cache_key.
CACHE_PATH = os.path.join(DATA, "cache.jsonl")
_cache = {}

def _load_cache():
    """Populate the in-memory ``_cache`` dict from the JSONL file at ``CACHE_PATH``.

    Every non-blank line is parsed as a JSON object and stored in ``_cache``
    under its ``"key"`` field (the full record, including ``"key"``, is kept).
    When the same key appears on several lines, the last one wins.

    Lines that are not valid JSON, or that lack a usable ``"key"``, are skipped
    so that one corrupt or half-written line does not discard the whole cache.
    Does nothing when the cache file does not exist.
    """
    if not os.path.exists(CACHE_PATH):
        return

    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
                _cache[rec["key"]] = rec
            except (ValueError, KeyError, TypeError):
                # ValueError covers json.JSONDecodeError (malformed line);
                # KeyError/TypeError cover records without a hashable "key"
                # or lines whose JSON value is not an object.
                # Anything else (KeyboardInterrupt, I/O failures) propagates.
                continue

def _save_cache_item(key, value):
    """Append one cache record to the JSONL file at ``CACHE_PATH``.

    The record is ``value`` (a mapping) merged with a ``"key"`` field, written
    as a single JSON line. Non-ASCII text is stored as-is, not escaped.
    """
    with open(CACHE_PATH, mode="a", encoding="utf-8") as out:
        record = {"key": key, **value}
        out.write(json.dumps(record, ensure_ascii=False))
        out.write("\n")

def _cache_key(source, tgt_lang_label, constraints, model):
    s = json.dumps({
        "src": source,
        "tgt": tgt_lang_label,
        "constraints": constraints,
        "model": model
    }, sort_keys=True, ensure_ascii=False)
    return hashlib.md5(s.encode("utf-8")).hexdigest()

def _estimate_cost_usd(prompt_text: str, completion_text: str = "") -> float:
    """Estimate the dollar cost of one request from its prompt and completion text.

    Token counts come from the module-level ``enc`` tokenizer; prices come from
    ``INPUT_COST_PER_M`` and ``OUTPUT_COST_PER_M`` (dollars per 1M tokens).
    An empty ``completion_text`` contributes zero output tokens, which gives an
    input-only estimate.
    """
    prompt_tokens = len(enc.encode(prompt_text))
    if completion_text:
        completion_tokens = len(enc.encode(completion_text))
    else:
        completion_tokens = 0

    input_cost = (prompt_tokens/1_000_000.0)*INPUT_COST_PER_M
    output_cost = (completion_tokens/1_000_000.0)*OUTPUT_COST_PER_M
    return input_cost + output_cost

# Replaces your previous translate_llm_safe (adds cache + cost guard)
def translate_llm_safe(source: str, tgt_lang_label: str, constraints, model="gpt-4o-mini",
                       temperature: float = 0.2, max_retries: int = 5, base_sleep: float = 0.8):
    """Translate ``source`` with caching, a spend cap, and retry/backoff.

    Args:
        source: Text to translate.
        tgt_lang_label: Target-language name passed to ``make_prompt``.
        constraints: Glossary constraints passed to ``make_prompt``; also part
            of the cache key.
        model: Chat model name; part of the cache key.
        temperature: Sampling temperature. It is NOT part of the cache key, so
            a cached translation is returned regardless of this value.
        max_retries: How many times to retry after ``RateLimitError``/``APIError``.
        base_sleep: Base delay in seconds for the exponential backoff.

    Returns:
        ``(translation, latency_seconds)``. On a cache hit the latency is the
        one recorded when the translation was first produced. If the model
        returns no text, the translation is the empty string.

    Raises:
        RuntimeError: if the estimated input cost would push ``spent_dollars``
            past ``BUDGET_DOLLARS``.
        RateLimitError, APIError: re-raised once ``max_retries`` is exhausted.
    """
    global spent_dollars

    prompt, spans = make_prompt(source, tgt_lang_label, constraints)
    key = _cache_key(source, tgt_lang_label, constraints, model)

    # Hydrate the in-memory cache from disk while it is still empty, so that
    # translations paid for in earlier sessions are reused instead of re-bought.
    if not _cache:
        _load_cache()

    # Cache hit? The cache stores the raw model output, so spans are restored
    # against the current prompt's placeholders.
    if key in _cache:
        rec = _cache[key]
        hyp = restore_spans(rec["hyp"], spans)
        return hyp, rec["latency_s"]

    # Budget check (input-side prediction; output tokens are unknown up front)
    est_in_cost = _estimate_cost_usd(prompt, "")
    if spent_dollars + est_in_cost > BUDGET_DOLLARS:
        raise RuntimeError(f"Budget guard tripped at ${spent_dollars:.2f}/${BUDGET_DOLLARS:.2f}. "
                           "Lower k, reduce rows, or increase budget.")

    attempt = 0
    while True:
        try:
            start = time.time()
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
            )
            # message.content may be None (e.g. a refusal or filtered reply);
            # treat that as an empty translation instead of crashing on .strip().
            content = resp.choices[0].message.content
            hyp = (content or "").strip()
            hyp_restored = restore_spans(hyp, spans)
            latency = time.time() - start

            # Estimate full (input + output) cost for accounting
            spent_dollars += _estimate_cost_usd(prompt, hyp)

            # Save to cache. Empty outputs are not cached, so a later run can
            # retry the request rather than replaying a failed result forever.
            if hyp:
                value = {"hyp": hyp, "latency_s": latency}
                _cache[key] = value
                _save_cache_item(key, value)

            # Tiny pause to avoid bursty calls
            time.sleep(0.2)
            return hyp_restored, latency

        except (RateLimitError, APIError):
            attempt += 1
            if attempt > max_retries:
                raise
            # Exponential backoff with a little jitter
            time.sleep(base_sleep * (2 ** (attempt - 1)) + random.uniform(0, 0.4))


In [3]:
# ---- Translator with light retry/backoff ----
import time, random
from openai import RateLimitError, APIError

def translate_llm_safe(source: str, tgt_lang_label: str, constraints, model="gpt-4o-mini",
                       temperature: float = 0.2, max_retries: int = 5, base_sleep: float = 0.8):
    """Translate ``source`` with retry/backoff (no caching, no budget guard).

    NOTE(review): this notebook defines ``translate_llm_safe`` twice. When the
    cells run top to bottom, this definition replaces the cached, budget-guarded
    one from the caching cell, so ``spent_dollars`` is never updated and the
    cache is never used. Delete this cell or move it above the caching cell.

    Args:
        source: Text to translate.
        tgt_lang_label: Target-language name passed to ``make_prompt``.
        constraints: Glossary constraints passed to ``make_prompt``.
        model: Chat model name.
        temperature: Sampling temperature.
        max_retries: How many times to retry after ``RateLimitError``/``APIError``.
        base_sleep: Base delay in seconds for the exponential backoff.

    Returns:
        ``(translation, latency_seconds)``. If the model returns no text, the
        translation is the empty string.

    Raises:
        RateLimitError, APIError: re-raised once ``max_retries`` is exhausted.
    """
    prompt, spans = make_prompt(source, tgt_lang_label, constraints)

    attempt = 0
    while True:
        try:
            start = time.time()
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
            )
            # message.content may be None (e.g. a refusal or filtered reply);
            # treat that as an empty translation instead of crashing on .strip().
            content = resp.choices[0].message.content
            hyp = (content or "").strip()
            hyp = restore_spans(hyp, spans)
            latency = time.time() - start
            # Tiny pause to avoid bursty calls
            time.sleep(0.2)
            return hyp, latency
        except (RateLimitError, APIError):
            attempt += 1
            if attempt > max_retries:
                raise
            # Exponential backoff with a little jitter
            sleep_s = base_sleep * (2 ** (attempt - 1)) + random.uniform(0, 0.4)
            time.sleep(sleep_s)


In [5]:
# ---- Quick single test (first row) ----
# Take the first sample and work out its target language
row = samples.iloc[0]
src = row["src"]
tgt = row["pair"].rsplit("-", 1)[-1]      # text after the last "-": 'fr', 'it', or 'ja'
tgt_label = lang_label_map[tgt]           # 'French', 'Italian', 'Japanese'

# Pick the glossary entries closest to the source sentence
docs = corpora[tgt]
idxs = best_k_terms(src, docs, embeddings=embedder, k=3)
cons = select_constraints(glossary, idxs, tgt)

# Same sentence twice: once with the retrieved constraints, once with none
hyp_with, lat_with = translate_llm_safe(src, tgt_label, constraints=cons)
hyp_wo, lat_wo = translate_llm_safe(src, tgt_label, constraints=[])

# Side-by-side report
print("SOURCE:", src)
print("TARGET:", tgt_label)
print("\n--- WITH retrieval ---")
print(hyp_with, f"\n(latency: {lat_with:.2f}s)")
print("\n--- WITHOUT retrieval ---")
print(hyp_wo, f"\n(latency: {lat_wo:.2f}s)")


SOURCE: Enter your email address to continue.
TARGET: French

--- WITH retrieval ---
Entrez votre adresse e-mail pour continuer. 
(latency: 1.01s)

--- WITHOUT retrieval ---
Entrez votre adresse e-mail pour continuer. 
(latency: 3.74s)


In [None]:
# --- Small-chunk batch with budget guard ---
from tqdm import tqdm

subset = samples.head(30)   # adjust; later do the rest
records = []

for _, row in tqdm(subset.iterrows(), total=subset.shape[0]):
    src = row["src"]
    tgt = row["pair"].rsplit("-", 1)[-1]   # target code is the text after the last "-"
    tgt_label = lang_label_map[tgt]

    # Retrieve the glossary constraints closest to this sentence
    docs = corpora[tgt]
    idxs = best_k_terms(src, docs, embeddings=embedder, k=3)  # try k=2 if you want shorter prompts
    cons = select_constraints(glossary, idxs, tgt)

    # Translate twice: with the retrieved constraints, then with none
    hyp_with, lat_with = translate_llm_safe(src, tgt_label, constraints=cons)
    hyp_wo, lat_wo = translate_llm_safe(src, tgt_label, constraints=[])

    # Both hypotheses are scored against the same retrieved constraints
    ta_with = term_adherence(hyp_with, cons)
    ta_wo = term_adherence(hyp_wo, cons)

    record = {
        "id": row["id"],
        "pair": row["pair"],
        "source": src,
        "hyp_with": hyp_with,
        "hyp_without": hyp_wo,
        "term_acc_with": ta_with,
        "term_acc_without": ta_wo,
        "lat_with_s": lat_with,
        "lat_without_s": lat_wo,
    }
    records.append(record)

# One DataFrame built from all rows at once
results = pd.DataFrame(records)
print(f"Spent (estimated): ${spent_dollars:.2f}")
results.head(3)
