In [None]:
!pip install instructor

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import instructor
import asyncio
import nest_asyncio
import json
import requests

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from typing import List, Optional, Tuple
from pydantic import BaseModel, Field
from openai import AsyncOpenAI

import pandas as pd
import numpy as np

In [150]:
# Async OpenAI-compatible client pointed at a local server on port 11434 and
# wrapped by `instructor` in JSON mode, so completions can be requested with a
# `response_model`. The "ollama" API key looks like a placeholder — TODO confirm.
client = instructor.from_openai(
    AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="ollama"),
    mode=instructor.Mode.JSON,
)

# Shared limiter: at most five holders of `async with sem` at any one time.
sem = asyncio.Semaphore(5)

In [151]:
# Load the dataset; only the "text" and "explanation" columns are read below.
data = pd.read_csv("data/redsm5.csv")

mlb = MultiLabelBinarizer()  # NOTE(review): not referenced by any cell visible here
texts = data["text"].tolist()
explanations = data["explanation"].tolist()

# The fixed random_state keeps the 80/20 split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    explanations,
    random_state=42,
    test_size=0.2,
)

# Persist the held-out pairs as JSON Lines: one {"text", "explanation"} object per line.
with open("data/redsm5_test_explanations_inputs.jsonl", "w") as f:
    for text, explanation in zip(X_test, y_test):
        print(json.dumps({"text": text, "explanation": explanation}), file=f)

In [152]:
# Structured output requested from the model in `explain`.
# Documented with `#` comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into its JSON schema description, so a
# docstring here would alter the schema that is derived from this class.
class Explanation(BaseModel):
    # Declared first so the schema asks for the reasoning *before* the final
    # answer; with the opposite order the explanation is produced before the
    # chain of thought it is supposed to follow from.
    chain_of_thought: str = Field(
        ..., description="The chain of thought that led to the explanation."
    )

    # The user-facing rationale; this is the field compared against the
    # reference explanation by the evaluation cells.
    explanation: str = Field(
        ...,
        description="Plain-language rationale that cites or paraphrases phrases from the text to justify why each symptom was tagged present or absent.",
    )

In [153]:
async def explain(data: str) -> Explanation:
    """Request a structured symptom explanation for a single text.

    The prompt consists of a system instruction, few-shot turns taken from the
    head of the module-level ``X_train`` / ``y_train`` lists (text as the user
    turn, reference explanation as the assistant turn), and finally ``data``
    as the last user turn. The request is made while holding the module-level
    ``sem``.

    Args:
        data: The text to explain.

    Returns:
        The parsed ``Explanation``. If building or sending the request raises
        an ``Exception``, the error is printed and a placeholder
        ``Explanation`` whose two fields read "Error during explanation." is
        returned instead of re-raising.
    """

    def _fewshot_messages(k=10):
        # Flatten each (text, reference) pair into a user turn + assistant turn.
        return [
            {"role": role, "content": content}
            for sample, reference in zip(X_train[:k], y_train[:k])
            for role, content in (("user", sample), ("assistant", reference))
        ]

    system_prompt = (
        "You are an expert mental-health assistant. "
        "Your job is to read a first-person text (e.g. a social-media post) and explain which DSM-5 A-criteria symptoms for Major Depressive Disorder are directly present or explicitly absent in that text."
    )

    async with sem:
        try:
            # Built inside the `try` so that a failure while assembling the
            # prompt is handled by the same fallback as a failed request.
            conversation = [
                {"role": "system", "content": system_prompt},
                *_fewshot_messages(k=50),
                {"role": "user", "content": data},
            ]
            return await client.chat.completions.create(
                model="gemma3:27b-it-qat",
                response_model=Explanation,
                max_retries=5,
                messages=conversation,
            )
        except Exception as e:
            print(f"Error during API call: {e}")
            placeholder = "Error during explanation."
            return Explanation(
                explanation=placeholder,
                chain_of_thought=placeholder,
            )

In [154]:
async def process_inputs(inputs: List[dict], output_path: Optional[str] = None) -> None:
    """Explain every input and optionally append the results to a JSONL file.

    All ``explain`` calls are scheduled up front so that several can be in
    flight at once; ``explain`` itself limits how many run simultaneously via
    the shared semaphore. (Awaiting them one by one, as before, meant only a
    single request was ever active and the semaphore had no effect.) Results
    are still consumed — and written — in input order, one record at a time.

    Args:
        inputs: Dicts that each provide a "text" and an "explanation" key.
        output_path: If given, one JSON object per input is appended to this
            file. The file is opened in append mode, so existing content is
            kept.

    Returns:
        None.
    """
    tasks = [asyncio.ensure_future(explain(item["text"])) for item in inputs]

    try:
        for input_data, task in zip(inputs, tasks):
            prediction = await task

            resp = {
                "text": input_data["text"],
                "explanation_true": input_data["explanation"],
                "explanation_predicted": prediction.explanation,
                "chain_of_thought": prediction.chain_of_thought,
            }

            if output_path:
                # Re-opened per record so each result is on disk as soon as it
                # is available, even if a later step fails.
                with open(output_path, "a") as f:
                    json.dump(resp, f)
                    f.write("\n")
    finally:
        # If we leave early (error or cancellation), do not keep unfinished
        # requests running in the background. Cancelling a finished task is a
        # no-op.
        for task in tasks:
            task.cancel()

In [155]:
with open("data/redsm5_test_explanations_inputs.jsonl", "r") as f:
    inputs = [json.loads(line) for line in f]

nest_asyncio.apply()
await process_inputs(inputs, output_path="data/redsm5_test_explanations_outputs.jsonl")

Error during API call: 1 validation error for Explanation
chain_of_thought
  Field required [type=missing, input_value={'explanation': "This is ...nation': 'This is ...}"}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
Error during API call: 1 validation error for Explanation
chain_of_thought
  Field required [type=missing, input_value={'explanation': "This nar...'mental health crisis']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
Error during API call: 1 validation error for Explanation
chain_of_thought
  Field required [type=missing, input_value={'explanation': "The text... deception', 'despair']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
Error during API call: 1 validation error for Explanation
  Invalid JSON: EOF while parsing an object at line 22 column 2 [type=json_invalid, input_value='{\n  "explanation": "Thi... \n\n  \n\n  \n  

In [168]:
def get_ollama_embedding(text, model="nomic-embed-text", timeout=120.0):
    """Fetch an embedding vector for ``text`` from the local embed endpoint.

    Args:
        text: The text to embed; sent as the "input" field of the request.
        model: Name of the embedding model to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled server would
            hang the notebook.

    Returns:
        A 1-D ``float32`` numpy array. If the response carries no embeddings,
        a warning is printed and a zero vector of length 768 is returned.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = "http://localhost:11434/api/embed"
    payload = {"model": model, "input": text}
    resp = requests.post(url, json=payload, timeout=timeout)
    resp.raise_for_status()

    embedding = resp.json().get("embeddings", [])
    if not embedding:
        print(f"[WARN] Empty embedding for text: '{text[:60]}...'")
        # Hard-coded fallback size — presumably the dimensionality of the
        # default model; TODO confirm if another model is used.
        return np.zeros(768, dtype=np.float32)

    # The endpoint may return a list of vectors (one per input); a single
    # input was sent, so keep the first one.
    if isinstance(embedding[0], list):
        embedding = embedding[0]
    return np.array(embedding, dtype=np.float32)

In [173]:
def compute_embedding_similarity(jsonl_path, model="nomic-embed-text"):
    """Print the mean cosine similarity between reference and predicted explanations.

    Each non-blank line of ``jsonl_path`` must be a JSON object with the keys
    "explanation_true" and "explanation_predicted". Both strings are embedded
    with ``get_ollama_embedding`` and compared by cosine similarity.

    Args:
        jsonl_path: Path to the JSONL file of evaluated records.
        model: Embedding model name forwarded to ``get_ollama_embedding``.

    Returns:
        None. The average is printed; it is reported as 0.0 when the file
        contains no records.
    """
    pairs = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing in json.loads.
            if not line.strip():
                continue
            obj = json.loads(line)
            pairs.append((obj["explanation_true"], obj["explanation_predicted"]))

    similarities = []
    for true, pred in pairs:
        emb_true = get_ollama_embedding(true, model)
        emb_pred = get_ollama_embedding(pred, model)

        norm_true = np.linalg.norm(emb_true)
        norm_pred = np.linalg.norm(emb_pred)

        if norm_true == 0 or norm_pred == 0:
            # A zero vector has no direction; score the pair as dissimilar
            # rather than dividing by zero.
            cos_pred = 0.0
        else:
            cos_pred = np.dot(emb_true, emb_pred) / (norm_true * norm_pred)
            # Guard against floating-point drift just outside [-1, 1].
            cos_pred = float(np.clip(cos_pred, -1.0, 1.0))

        similarities.append(cos_pred)

    def avg(scores):
        return float(np.mean(scores)) if scores else 0.0

    print(f"Avg. embedding similarity: {avg(similarities):.4f}")

In [174]:
# Compare each saved prediction with its reference; the average is printed.
compute_embedding_similarity(
    "data/redsm5_test_explanations_outputs.jsonl",
    model="nomic-embed-text",
)

Avg. embedding similarity: 0.7846


In [221]:
# Structured verdict requested from the judge model in `llm_judge`.
# Documented with `#` comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into its JSON schema description, so a
# docstring here would alter the schema that is derived from this class.
class JudgeScores(BaseModel):
    # Rubric scores. `llm_judge` rescales each one with (score - 1) / 4, which
    # only stays within 0-100 for scores in 1..5, so the range is enforced at
    # validation time instead of letting an out-of-range value skew the mean.
    accuracy: int = Field(..., ge=1, le=5)
    coverage: int = Field(..., ge=1, le=5)
    clarity: int = Field(..., ge=1, le=5)
    # Short free-text justification returned by the judge.
    explanation: str

In [222]:
async def llm_judge(
    reference: str, candidate: str, model: str
) -> Tuple[float, JudgeScores]:
    """Ask a judge model to rate ``candidate`` against ``reference``.

    The judge returns three 1-5 rubric scores (accuracy, coverage, clarity)
    plus a short explanation. The scores are rescaled to 0-1 and combined with
    fixed weights into an overall score on a 0-100 scale.

    Args:
        reference: The reference explanation.
        candidate: The explanation being evaluated.
        model: Name of the judge model to query.

    Returns:
        ``(overall, scores)`` where ``overall`` is the weighted score and
        ``scores`` is the ``JudgeScores`` it was computed from. If the request
        raises an ``Exception``, the error is printed and neutral scores
        (3/3/3, i.e. an overall of 50) are used instead of re-raising.
    """
    WEIGHTS = {"accuracy": 0.40, "coverage": 0.30, "clarity": 0.30}

    system_msg = {
        "role": "system",
        "content": (
            "You are an expert evaluator of short clinical explanations.\n"
            "Rate the CANDIDATE against the REFERENCE with this rubric:\n"
            "1. Clinical Accuracy (1-5): Does the candidate broadly match the clinical statements? "
            "If there are no significant contradictions and the main points align, you may rate 4 or 5.\n"
            "2. Coverage (1-5): If the candidate mentions most of the important symptoms or absences described in the reference, and does not invent major content, you may give a high score. "
            "Minor omissions are acceptable if the main message remains the same.\n"
            "3. Clarity & Conciseness (1-5): Is the candidate easy to read and generally clear?\n\n"
            "If the candidate explanation broadly covers the same clinical content or intent as the reference, you may give a high score, even if the phrasing or minor details differ.\n"
            "Return ONLY a JSON object with keys accuracy, coverage, clarity and explanation (≤2 lines)."
        ),
    }

    user_msg = {
        "role": "user",
        "content": (
            f"REFERENCE:\n<<<{reference}>>>\n\n" f"CANDIDATE:\n<<<{candidate}>>>"
        ),
    }

    async with sem:
        try:
            result: JudgeScores = await client.chat.completions.create(
                model=model,
                response_model=JudgeScores,
                max_retries=5,
                messages=[system_msg, user_msg],
            )
        except Exception as e:
            # Report the failure (as `explain` does) instead of hiding it: the
            # neutral fallback below is otherwise indistinguishable from a
            # genuine mid-range verdict in the aggregated score.
            print(f"Error during judging: {e}")
            result = JudgeScores(
                accuracy=3, coverage=3, clarity=3, explanation="Error during judging."
            )

    # Map each 1-5 score onto 0-1, weight it, and express the sum as a percentage.
    overall = sum(WEIGHTS[k] * (getattr(result, k) - 1) / 4 for k in WEIGHTS) * 100
    return overall, result

In [223]:
async def evaluate_jsonl_llm_judge(path: str, model: str) -> None:
    """Judge every record of a JSONL file and print the mean overall score.

    Args:
        path: JSONL file whose records provide "explanation_true" and
            "explanation_predicted".
        model: Judge model name forwarded to ``llm_judge``.

    Returns:
        None. The average of the overall scores is printed.
    """
    with open(path, encoding="utf-8") as f:
        records = list(map(json.loads, f))

    async def judge(ref: str, hyp: str):
        # Only the overall score is aggregated; the detailed scores are dropped.
        overall, _details = await llm_judge(ref, hyp, model)
        return overall

    pending = [
        judge(record["explanation_true"], record["explanation_predicted"])
        for record in records
    ]

    # Run all judgements together; results come back in record order.
    pred_scores = await asyncio.gather(*pending)

    print(f"Avg. llm judge score: {np.mean(pred_scores):6.2f}")

In [224]:
# Calling asyncio.run from a notebook cell relies on nest_asyncio.apply()
# having been executed earlier in the session.
asyncio.run(
    evaluate_jsonl_llm_judge(
        path="data/redsm5_test_explanations_outputs.jsonl",
        model="deepseek-r1:8b",
    )
)

Avg. llm judge score:  61.50
