In [16]:
%pip install transformers accelerate torch sentencepiece bitsandbytes aiohttp requests --quiet together

In [None]:
from dotenv import load_dotenv
import os, json, time
from together import Together

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Read the key from the environment so it is never stored in the notebook.
api_key = os.environ.get("TOGETHER_API_KEY")
if not api_key:
    raise RuntimeError(
        "TOGETHER_API_KEY is not set; export it or add it to a .env file."
    )

client = Together(api_key=api_key)

# Model identifier passed as `model=` on every request.
MODEL = "chrisperez04_345b/meta-llama/Llama-2-7b-chat-hf-ce20084e"

# JSONL input read by the query loop, and the JSONL file it writes results to.
INPUT_FILE = "/content/data/query_data.jsonl"
OUTPUT_FILE = "/content/results/Llama-2-7b-chat-hf.jsonl"

In [28]:
# Generation settings gathered in one mapping.
# NOTE(review): in the visible cells, run_query() hard-codes its own
# max_tokens / temperature / top_p / stop values and never reads this dict;
# confirm whether GEN_CFG is still needed.
GEN_CFG = {
    "max_tokens": 512,
    "temperature": 0.2,
    "top_p": 0.9,
    "top_k": 60,
    "num_beams": 1,
    "repetition_penalty": 1.0,
    "stop": ["</s>"],
}

In [31]:
def run_query(prompt: str):
    """
    Send one prompt to the model and return ``(text, latency)``.

    The chat-completions endpoint is tried first. If that call raises for any
    reason, the plain completions endpoint is tried with the prompt wrapped in
    the Llama-2 chat template.

    Args:
        prompt: The user message to send.

    Returns:
        A tuple ``(text, latency)``. ``text`` is the stripped response text,
        or "" when the response has no choices or no text. ``latency`` is the
        number of seconds elapsed since this function was entered, so when the
        fallback is used it also covers the failed chat attempt.

    Raises:
        RuntimeError: If both endpoints fail. The message contains both
            errors and the completions error is attached as the cause.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments during the request.
    start = time.perf_counter()

    # --- First try chat.completions (some endpoints accept this) ---
    try:
        resp = client.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=512,
            temperature=0.2,
            top_p=0.9,
            stop=["</s>"],         # keep minimal safe args
        )
        content = resp.choices[0].message.content if resp.choices else ""
        # content may be None; treat that as empty text instead of letting
        # .strip() raise and silently trigger a second request below.
        return (content or "").strip(), time.perf_counter() - start
    except Exception as chat_exc:
        # The exception name is unbound when the except block ends, so keep
        # the message for the combined error report.
        chat_error = str(chat_exc)

    # --- Fallback: completions endpoint with Llama-2 chat template ---
    # In the Llama-2 chat format the <<SYS>> block sits inside the first
    # [INST] ... [/INST] span, directly before the user message.
    sys_prompt = "You are a helpful assistant."
    templated = f"[INST] <<SYS>>\n{sys_prompt}\n<</SYS>>\n\n{prompt} [/INST]"

    try:
        resp = client.completions.create(
            model=MODEL,
            prompt=templated,
            max_tokens=512,
            temperature=0.2,
            top_p=0.9,
            stop=["</s>"],
        )
        text = resp.choices[0].text if resp.choices else ""
        return (text or "").strip(), time.perf_counter() - start
    except Exception as completions_exc:
        # If both fail, show both errors for debugging
        raise RuntimeError(
            f"chat.completions and completions both failed.\n"
            f"chat error: {chat_error}\n"
            f"completions error: {completions_exc}"
        ) from completions_exc

In [32]:
# Run every prompt in INPUT_FILE through run_query() and write one JSON line
# per example (either an output record or an error record) to OUTPUT_FILE.

# open(..., "w") does not create missing parent directories.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(INPUT_FILE, "r", encoding="utf-8") as fin, \
        open(OUTPUT_FILE, "w", encoding="utf-8") as fout:

    for i, line in enumerate(fin, 1):
        if not line.strip():
            continue  # tolerate blank lines in the JSONL input

        try:
            row = json.loads(line)
        except json.JSONDecodeError as e:
            # One bad input line should not abort the whole run.
            print(f"✖ Skipping malformed input line {i}: {e}")
            continue

        context = row.get("context", "")
        prompt = row.get("prompt", "")
        anchor = row.get("anchor", "")
        qid = row.get("id", i)  # fall back to the 1-based line number

        print(f"[{i}] id={qid} — sending prompt of length {len(prompt)} chars")

        # Fields shared by success and error records; key order matches the
        # existing output format.
        record = {
            "id": qid,
            "context": context,
            "prompt": prompt,
            "anchor": anchor,
        }

        try:
            output, latency = run_query(prompt)
        except Exception as e:
            print(f"✖ Error on example {i}: {e}")
            record["error"] = str(e)
        else:
            record["output"] = output
            record["latency_sec"] = round(latency, 3)
            print(f"✔ Done in {latency:.2f}s, {len(output)} chars output.")

        record["model"] = MODEL
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")
        # Flush per row so an interrupted run leaves only complete lines behind.
        fout.flush()

print(f"\nAll queries completed. Results saved to {OUTPUT_FILE}")



[1] id=1 — sending prompt of length 1137 chars
✔ Done in 3.20s, 511 chars output.
[2] id=2 — sending prompt of length 1695 chars
✔ Done in 2.08s, 1234 chars output.
[3] id=3 — sending prompt of length 1335 chars
✔ Done in 2.40s, 1188 chars output.
[4] id=4 — sending prompt of length 1491 chars
✔ Done in 1.97s, 953 chars output.
[5] id=5 — sending prompt of length 1546 chars
✔ Done in 2.09s, 1214 chars output.
[6] id=6 — sending prompt of length 1514 chars
✔ Done in 2.32s, 1523 chars output.
[7] id=7 — sending prompt of length 1411 chars
✔ Done in 1.57s, 473 chars output.
[8] id=8 — sending prompt of length 1349 chars
✔ Done in 2.21s, 1094 chars output.
[9] id=9 — sending prompt of length 1356 chars
✔ Done in 1.56s, 979 chars output.
[10] id=10 — sending prompt of length 1468 chars
✔ Done in 1.97s, 835 chars output.
[11] id=11 — sending prompt of length 1279 chars
✔ Done in 1.50s, 417 chars output.
[12] id=12 — sending prompt of length 1218 chars
✔ Done in 1.85s, 1200 chars output.
[13]

In [33]:
# Install the packages used for evaluation.
%pip install nltk rouge-score bert-score scikit-learn pandas tqdm




In [34]:
# NOTE(review): this repeats the install command from the previous cell;
# one of the two cells can probably be removed.
%pip install nltk rouge-score bert-score scikit-learn pandas tqdm



In [39]:
def evaluate(results_path: str, out_csv: str = "evaluation_scores.csv"):
    """
    Score model outputs against their reference text and print a summary.

    Reads a JSONL results file and keeps the rows whose "context" and
    "output" values are both strings. For each kept row the "output" is
    scored against the "context" (used as the reference) with ROUGE-L, BLEU,
    a token-presence F1 and BERTScore F1. Per-row scores are written to
    ``out_csv``; the mean and a normal-approximation 95% confidence interval
    are printed for each metric.

    Args:
        results_path: Path to the JSONL results file.
        out_csv: Path of the CSV file that receives the per-row scores.

    Returns:
        None. When no row has both fields, a message is printed and no CSV is
        written.
    """
    # Load results (expecting "context" and "output")
    rows = []
    with open(results_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"⚠️ Skipping malformed line: {line[:60]}...")
                continue

    # Keep only JSON objects that have both fields as strings. Error records
    # carry no "output", and a non-object JSON value has no .get().
    eval_rows = [
        r for r in rows
        if isinstance(r, dict)
        and isinstance(r.get("context"), str)
        and isinstance(r.get("output"), str)
    ]

    if not eval_rows:
        print("No rows with both 'context' and 'output' found; skipping evaluation.")
        return

    ignored = len(rows) - len(eval_rows)
    if ignored:
        # Make it visible that the summary covers only part of the file.
        print(f"Ignoring {ignored} row(s) without both 'context' and 'output'.")

    # The scoring libraries are imported only once there is something to
    # score, so an empty results file is reported without needing them.
    import numpy as np
    import pandas as pd
    from tqdm import tqdm
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    from rouge_score import rouge_scorer
    from bert_score import score as bert_score

    ids = [r.get("id") for r in eval_rows]
    anchors = [r.get("anchor", "") for r in eval_rows]
    refs = [r["context"] for r in eval_rows]
    hyps = [r["output"] for r in eval_rows]

    # Metric helpers
    rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    smoothie = SmoothingFunction().method1

    def rouge_l(h, r):
        # rouge api expects (target, prediction) order; we compare ref vs hyp
        return rouge.score(r, h)["rougeL"].fmeasure

    def bleu(h, r):
        # Whitespace tokenisation; a single reference per hypothesis.
        return sentence_bleu([r.split()], h.split(), smoothing_function=smoothie)

    def token_f1(h, r):
        # Token-presence F1: each side is reduced to its set of distinct
        # whitespace tokens, so repeated tokens count once. With
        # tp = |H & R|, F1 = 2*tp / (|H| + |R|), which is the binary F1 of
        # the two presence vectors over the combined vocabulary.
        h_set = set(h.split())
        r_set = set(r.split())
        if not h_set or not r_set:
            return 0.0
        return 2.0 * len(h_set & r_set) / (len(h_set) + len(r_set))

    # Compute classical metrics
    rouge_vals, bleu_vals, f1_vals = [], [], []
    for h, r in tqdm(list(zip(hyps, refs)), total=len(hyps), desc="Classical metrics"):
        rouge_vals.append(rouge_l(h, r))
        bleu_vals.append(bleu(h, r))
        f1_vals.append(token_f1(h, r))

    # BERTScore (vectorized). Set device='cuda' if available.
    print("Computing BERTScore...")
    _, _, bert_f1 = bert_score(hyps, refs, lang="en")
    # Tensor.tolist() does not require the tensor to be on the CPU,
    # unlike .numpy().
    bert_f1_vals = bert_f1.tolist()

    # 95% CI via normal approx
    def ci95(x):
        x = np.asarray(x, dtype=float)
        mean = x.mean()
        if x.size < 2:
            # The sample standard deviation is undefined for a single value.
            return mean, float("nan"), float("nan")
        se = x.std(ddof=1) / np.sqrt(x.size)
        return mean, mean - 1.96 * se, mean + 1.96 * se

    # Per-row CSV
    df = pd.DataFrame({
        "id": ids,
        "anchor": anchors,
        "ROUGE-L": rouge_vals,
        "BLEU": bleu_vals,
        "Token-F1": f1_vals,
        "BERTScore": bert_f1_vals,
    })
    df.to_csv(out_csv, index=False)
    print(f"Saved detailed scores → {out_csv}")

    # Summary
    r_m, r_l, r_h = ci95(rouge_vals)
    b_m, b_l, b_h = ci95(bleu_vals)
    f_m, f_l, f_h = ci95(f1_vals)
    bs_m, bs_l, bs_h = ci95(bert_f1_vals)

    print("\n=== Mean ± 95% CI ===")
    print(f"ROUGE-L   : {r_m:.4f}  (95% CI {r_l:.4f}–{r_h:.4f})")
    print(f"BLEU      : {b_m:.4f}  (95% CI {b_l:.4f}–{b_h:.4f})")
    print(f"Token-F1  : {f_m:.4f}  (95% CI {f_l:.4f}–{f_h:.4f})")
    print(f"BERTScore : {bs_m:.4f} (95% CI {bs_l:.4f}–{bs_h:.4f})")

In [40]:
# Score the results file; the per-row CSV uses the same path with a .csv extension.
csv_eval_path = f"{os.path.splitext(OUTPUT_FILE)[0]}.csv"
evaluate(results_path=OUTPUT_FILE, out_csv=csv_eval_path)

⚠️ Skipping malformed line:                                                             ...


Classical metrics: 100%|██████████| 218/218 [00:03<00:00, 63.54it/s]


Computing BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Saved detailed scores → /content/results/Llama-2-7b-chat-hf.csv

=== Mean ± 95% CI ===
ROUGE-L   : 0.7848  (95% CI 0.7562–0.8135)
BLEU      : 0.6801  (95% CI 0.6424–0.7177)
Token-F1  : 0.8013  (95% CI 0.7759–0.8266)
BERTScore : 0.9428 (95% CI 0.9376–0.9480)
