In [3]:
%pip install transformers accelerate torch sentencepiece bitsandbytes aiohttp requests --quiet together

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
moviepy 1.0.3 requires decorator<5.0,>=4.0.2, but you have decorator 5.1.1 which is incompatible.
label-studio 1.12.0 requires jsonschema==3.2.0, but you have jsonschema 4.23.0 which is incompatible.
label-studio 1.12.0 requires pydantic<=1.11.0,>=1.7.3, but you have pydantic 2.10.6 which is incompatible.
label-studio 1.12.0 requires wheel<=0.40.0,>=0.38.1, but you have wheel 0.44.0 which is incompatible.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [30]:
from dotenv import load_dotenv
import os, json, time
from together import Together

# --- Run configuration ---------------------------------------------------
MODEL = "mistralai/Mistral-7B-Instruct-v0.1"
INPUT_FILE = "../data/query_data.jsonl"               # read line by line as JSON
OUTPUT_FILE = "../results/Mistral-Instruct-7b.jsonl"  # one result record per line

# --- API client ----------------------------------------------------------
# Read the Together API key from the environment after loading any .env file.
load_dotenv()
api_key = os.environ.get("TOGETHER_API_KEY")
client = Together(api_key=api_key)

In [31]:
# Generation settings forwarded as keyword arguments with every completion request.
# NOTE(review): `num_beams` does not appear to be a documented Together
# chat-completions parameter — confirm it has any effect server-side.
GEN_CFG = {
    "max_tokens": 512,
    "temperature": 0.2,
    "top_p": 0.9,
    "top_k": 60,
    "num_beams": 1,
    "repetition_penalty": 1.0,
    "stop": ["</s>"],
}

In [32]:
def run_query(prompt: str):
    """Send one prompt to the model and return ``(text, latency_seconds)``.

    The prompt is sent as a single user message to ``MODEL`` through the
    module-level ``client``, with the sampling options from ``GEN_CFG``.

    Args:
        prompt: Text sent as the content of the user message.

    Returns:
        A ``(text, elapsed)`` tuple: the first choice's message content with
        surrounding whitespace stripped (``""`` when there are no choices or
        the content is missing), and the request duration in seconds.

    Raises:
        Whatever the client raises for a failed request; nothing is caught here.
    """
    # perf_counter is a monotonic clock, so the interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    resp = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        **GEN_CFG
    )
    elapsed = time.perf_counter() - start

    # Guard both the empty-choices case and a message whose content is None;
    # calling .strip() on None would raise AttributeError.
    content = resp.choices[0].message.content if resp.choices else None
    return (content or "").strip(), elapsed


In [33]:
# Run every prompt in INPUT_FILE through the model and write one JSON record per
# input row to OUTPUT_FILE (the file is truncated at the start of the run).

# Create the results directory up front so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(OUTPUT_FILE) or ".", exist_ok=True)

with open(INPUT_FILE, "r", encoding="utf-8") as fin, \
        open(OUTPUT_FILE, "w", encoding="utf-8") as fout:

    for i, line in enumerate(fin, 1):
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing in
        # json.loads; `i` still counts physical lines, so fallback ids are stable.
        if not line.strip():
            continue
        row = json.loads(line)

        prompt = row.get("prompt", "")
        qid = row.get("id", i)

        # Fields shared by success and error records, in the output key order.
        record = {
            "id": qid,
            "context": row.get("context", ""),
            "prompt": prompt,
            "anchor": row.get("anchor", ""),
        }

        print(f"[{i}] id={qid} — sending prompt of length {len(prompt)} chars")

        # Only the model call is guarded: a failed request becomes an error
        # record, while a failure writing to disk propagates instead of
        # triggering a second write attempt.
        try:
            output, latency = run_query(prompt)
        except Exception as e:
            print(f"✖ Error on example {i}: {e}")
            record["error"] = str(e)
        else:
            record["output"] = output
            record["latency_sec"] = round(latency, 3)
            print(f"✔ Done in {latency:.2f}s, {len(output)} chars output.")

        record["model"] = MODEL
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")
        # Flush per row so an interrupted run keeps everything finished so far.
        fout.flush()

print(f"\nAll queries completed. Results saved to {OUTPUT_FILE}")



[1] id=1 — sending prompt of length 1137 chars
✔ Done in 0.84s, 356 chars output.
[2] id=2 — sending prompt of length 1695 chars
✔ Done in 0.56s, 367 chars output.
[3] id=3 — sending prompt of length 1335 chars
✔ Done in 1.91s, 1125 chars output.
[4] id=4 — sending prompt of length 1491 chars
✔ Done in 0.69s, 450 chars output.
[5] id=5 — sending prompt of length 1546 chars
✔ Done in 2.20s, 1742 chars output.
[6] id=6 — sending prompt of length 1514 chars
✔ Done in 1.42s, 874 chars output.
[7] id=7 — sending prompt of length 1411 chars
✔ Done in 0.50s, 233 chars output.
[8] id=8 — sending prompt of length 1349 chars
✔ Done in 0.86s, 547 chars output.
[9] id=9 — sending prompt of length 1356 chars
✔ Done in 1.23s, 732 chars output.
[10] id=10 — sending prompt of length 1468 chars
✔ Done in 0.62s, 371 chars output.
[11] id=11 — sending prompt of length 1279 chars
✔ Done in 0.89s, 610 chars output.
[12] id=12 — sending prompt of length 1218 chars
✔ Done in 1.45s, 853 chars output.
[13] id=

In [34]:
# Packages imported by the evaluation function defined in the next cell.
%pip install nltk rouge-score bert-score scikit-learn pandas tqdm


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Building wheels for collected packages: rouge-score
[33m  DEPRECATION: Building 'rouge-score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge-score'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=031924243ee1361e333adfc4df860d1446778a

In [35]:
def evaluate(results_path: str, out_csv: str = "evaluation_scores.csv"):
    """Score model outputs in a results JSONL file and report summary statistics.

    Every non-blank line of ``results_path`` is parsed as a JSON object. Rows
    whose ``"context"`` and ``"output"`` values are both strings are scored,
    with ``"context"`` as the reference and ``"output"`` as the hypothesis;
    all other rows (for example ones carrying only an ``"error"``) are skipped.

    Four per-row metrics are computed: ROUGE-L F-measure, smoothed sentence
    BLEU, a token-presence F1 and BERTScore F1. Per-row scores are written to
    ``out_csv`` and the mean of each metric is printed with a
    normal-approximation 95% confidence interval.

    Args:
        results_path: Path to the JSONL results file.
        out_csv: Destination path for the per-row score table.

    Returns:
        None. If no row is scorable a message is printed and nothing is written.
    """
    # Load results (expecting "context" and "output"); the context manager
    # closes the file, and blank lines are ignored rather than fed to json.loads.
    with open(results_path, "r", encoding="utf-8") as fin:
        rows = [json.loads(line) for line in fin if line.strip()]

    # Filter to rows that have both fields (graceful on errors)
    eval_rows = [
        r for r in rows
        if isinstance(r.get("context"), str) and isinstance(r.get("output"), str)
    ]

    if not eval_rows:
        print("No rows with both 'context' and 'output' found; skipping evaluation.")
        return

    # The scoring libraries are imported only once there is something to score,
    # so the empty case returns without paying for the heavy imports.
    import numpy as np
    import pandas as pd
    from tqdm import tqdm
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    from rouge_score import rouge_scorer
    from sklearn.metrics import f1_score
    from bert_score import score as bert_score

    ids = [r.get("id") for r in eval_rows]
    anchors = [r.get("anchor", "") for r in eval_rows]
    refs = [r["context"] for r in eval_rows]
    hyps = [r["output"] for r in eval_rows]

    # Metric helpers
    rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    smoothie = SmoothingFunction().method1

    def rouge_l(h, r):
        # rouge api expects (target, prediction) order; we compare ref vs hyp
        return rouge.score(r, h)["rougeL"].fmeasure

    def bleu(h, r):
        # Whitespace tokenisation with a single reference.
        return sentence_bleu([r.split()], h.split(), smoothing_function=smoothie)

    def token_f1(h, r):
        # Token-presence F1: repeated tokens count once. Sets make each
        # membership test O(1) instead of scanning the token list.
        h_set = set(h.split())
        r_set = set(r.split())
        vocab = h_set | r_set
        h_vec = [1 if t in h_set else 0 for t in vocab]
        r_vec = [1 if t in r_set else 0 for t in vocab]
        # Handle degenerate inputs gracefully
        try:
            return f1_score(r_vec, h_vec)
        except ValueError:
            return 0.0

    # Compute classical metrics
    rouge_vals, bleu_vals, f1_vals = [], [], []
    for h, r in tqdm(list(zip(hyps, refs)), total=len(hyps), desc="Classical metrics"):
        rouge_vals.append(rouge_l(h, r))
        bleu_vals.append(bleu(h, r))
        f1_vals.append(token_f1(h, r))

    # BERTScore (vectorized). Set device='cuda' if available.
    print("Computing BERTScore...")
    _, _, F = bert_score(hyps, refs, lang="en")
    bert_f1_vals = F.numpy().tolist()

    # 95% CI via normal approx
    def ci95(x):
        x = np.asarray(x, dtype=float)
        mean = x.mean()
        n = len(x)
        # The sample standard deviation is undefined for one observation;
        # report NaN bounds explicitly instead of triggering a numpy warning.
        se = x.std(ddof=1) / np.sqrt(n) if n > 1 else float("nan")
        half_width = 1.96 * se
        return mean, mean - half_width, mean + half_width

    # Per-row CSV
    df = pd.DataFrame({
        "id": ids,
        "anchor": anchors,
        "ROUGE-L": rouge_vals,
        "BLEU": bleu_vals,
        "Token-F1": f1_vals,
        "BERTScore": bert_f1_vals,
    })
    df.to_csv(out_csv, index=False)
    print(f"Saved detailed scores → {out_csv}")

    # Summary
    r_m, r_l, r_h = ci95(rouge_vals)
    b_m, b_l, b_h = ci95(bleu_vals)
    f_m, f_l, f_h = ci95(f1_vals)
    bs_m, bs_l, bs_h = ci95(bert_f1_vals)

    print("\n=== Mean ± 95% CI ===")
    print(f"ROUGE-L   : {r_m:.4f}  (95% CI {r_l:.4f}–{r_h:.4f})")
    print(f"BLEU      : {b_m:.4f}  (95% CI {b_l:.4f}–{b_h:.4f})")
    print(f"Token-F1  : {f_m:.4f}  (95% CI {f_l:.4f}–{f_h:.4f})")
    print(f"BERTScore : {bs_m:.4f} (95% CI {bs_l:.4f}–{bs_h:.4f})")

In [36]:
# Write the per-row scores next to the results file, same stem with a .csv extension.
csv_eval_path = f"{os.path.splitext(OUTPUT_FILE)[0]}.csv"
evaluate(results_path=OUTPUT_FILE, out_csv=csv_eval_path)

Classical metrics: 100%|██████████| 230/230 [00:01<00:00, 147.70it/s]


Computing BERTScore...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 