# Automatic Hallucination Detection for replication and extension of "A Data-Centric Approach To Generate Faithful and High Quality Patient Summaries with Large Language Models"

## Install dependencies

In [None]:
# Quietly (-q) install exact, pinned versions of the Hugging Face libraries
# (transformers, accelerate, datasets, peft, trl) that the cells below import.
!pip install -q transformers==4.46.1 \
            accelerate==0.34.2 \
            datasets==3.0.0 \
            peft==0.11.1 \
            trl==0.9.4

## Imports

In [None]:
from collections import defaultdict
from pathlib import Path
import shutil

import numpy as np
import torch
from datasets import load_dataset
from huggingface_hub import login
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from peft import LoraConfig, PeftModel
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
import wandb
import json
from openai import OpenAI
import os

from google.colab import files
from google.colab import userdata

## Unzip LoRA adapters

In [None]:
# Extract a fine-tuned LoRA adapter archive from Google Drive into /content.
# NOTE(review): this archive is the Llama-2-7b adapter, whereas the Config cell
# sets model_name/adapter_path to Mistral-7B-Instruct-v0.3 -- confirm that the
# archive unzipped here is the one the rest of the notebook expects.
# NOTE(review): no drive mount call is visible in this notebook; presumably
# Drive is already mounted at /content/drive -- verify before running.
!unzip "/content/drive/MyDrive/CS 598 DLH Fine tuned model LoRA weights/meta-llama_Llama-2-7b-hf_cleaned_ft.zip" \
    -d "/content/meta-llama_Llama-2-7b-hf_cleaned_ft"

## Config

In [None]:
# Set model and paths here.
# NOTE(review): `device` is not referenced by any cell visible in this
# notebook; model loading uses device_map="auto" instead.
device = "cuda"
# Hub id of the base checkpoint; used for both the tokenizer and the model.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# JSON file with the validation rows (read by load_dataset below).
eval_path = "/content/data/hallucinations_mimic_di_validation_cleaned_improved.json"
# Directory holding the LoRA adapter that is applied on top of the base model.
adapter_path = "/content/mistralai_Mistral-7B-Instruct-v0.3_cleaned_improved_ft/mistralai_Mistral-7B-Instruct-v0.3_cleaned_improved_ft"
# Output directory for the merged model and tokenizer.
save_path = "/content/mistral_7b_instruct_v0.3_merged"

## Huggingface Login

In [None]:
# Authenticate against the Hugging Face Hub before downloading the base model.
# NOTE(review): called with no arguments, so this presumably prompts for a
# token interactively -- confirm this is acceptable for unattended runs.
login()

## Upload and load data

In [None]:
!mkdir -p data

uploaded = files.upload()

for filename in uploaded.keys():
    shutil.move(filename, f"data/{filename}")

In [None]:
# Read the evaluation JSON as a single split named "validation".
data_files = {"validation": eval_path}
data = load_dataset("json", data_files=data_files)
validation_data = data["validation"]

### Helpers

In [None]:
def iter_bhc_avs(validation_data):
    """Lazily yield ``(bhc, gold_avs)`` pairs from the validation rows.

    Args:
        validation_data: Iterable of row mappings; each row is indexed with
            the keys ``"text"`` (the BHC) and ``"summary"`` (the gold AVS).

    Yields:
        A 2-tuple ``(row["text"], row["summary"])`` per row, in input order.

    Raises:
        KeyError: If a row lacks the ``"text"`` or ``"summary"`` key.
    """
    for record in validation_data:
        yield record["text"], record["summary"]

def generate_prompt(text):
    """Wrap a doctor's note in the summarisation instruction template.

    Args:
        text: The note to summarise; it is interpolated as-is between the
            instruction line and the response header.

    Returns:
        The instruction, the note, a blank line, and the
        ``"Summary for the patient:"`` header, as one string.
    """
    header = "Summarize for the patient what happened during the hospital stay based on this doctor's note:\n"
    footer = "Summary for the patient:\n"
    # The note is followed by exactly one blank line before the response header.
    note_section = f"{text}\n\n"
    return header + note_section + footer


def generate_model_avs(model, tokenizer, bhc, max_new_tokens=350):
    """Generate a patient summary (AVS) for one BHC without sampling.

    Args:
        model: Model exposing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer used to encode the prompt and decode the output;
            its ``eos_token_id`` doubles as the padding id during generation.
        bhc: Brief hospital course text, wrapped via ``generate_prompt``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped.
    """
    encoded = tokenizer(generate_prompt(bhc), return_tensors="pt").to(model.device)
    # Length of the encoded prompt, used below to slice it off the output.
    n_prompt_tokens = encoded["input_ids"].shape[1]

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": False,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.eos_token_id,
    }
    # Inference only: no gradients are needed.
    with torch.no_grad():
        sequences = model.generate(**encoded, **generation_kwargs)

    # Keep only what follows the first n_prompt_tokens positions of the first
    # (and only) returned sequence -- presumably the newly generated tokens.
    completion_ids = sequences[0][n_prompt_tokens:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

In [None]:
# Tokenizer of the base checkpoint; the EOS token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Base model in bfloat16, with placement delegated to device_map="auto".
base_model_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_name, **base_model_kwargs)

# Apply the LoRA adapter, then merge it back into the base model so that
# `model` is a plain causal LM again.
model = PeftModel.from_pretrained(model, adapter_path).merge_and_unload()
model.eval()

## Optionally save model

In [None]:
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

!zip -r model_merged.zip $save_path

## Generate Predictions

In [None]:
# Number of validation rows to summarise. Previously the literal 5 was
# repeated in both the progress-bar total and the loop's stop condition.
num_samples = 5
# Clamp to the dataset size so the progress bar never advertises more rows
# than exist (and to zero so a negative setting simply generates nothing).
n_to_generate = max(0, min(num_samples, len(validation_data)))

# One dict per processed row: source BHC, reference AVS, and model output.
generated_results = []

for i, (bhc, gold_avs) in tqdm(
    enumerate(iter_bhc_avs(validation_data)),
    total=n_to_generate,
    desc="Generating model summaries"
):
    # Stop once the requested number of rows has been processed.
    if i >= n_to_generate:
        break

    model_avs = generate_model_avs(model, tokenizer, bhc)

    generated_results.append({
        "bhc": bhc,
        "gold_avs": gold_avs,
        "model_avs": model_avs
    })


## Build prompts

In [None]:
def build_copy_paste_prompt(bhc, model_avs):
    """Format one BHC / AVS pair as an "Evaluate-" block.

    Args:
        bhc: Brief hospital course text, placed under the ``BHC:`` label.
        model_avs: Model-written summary, placed under the ``AVS:`` label.

    Returns:
        A string starting with ``"Evaluate-"``, then the BHC section followed
        by a blank line, then the AVS section ending in a newline.
    """
    bhc_section = "BHC:\n" + f"{bhc}\n\n"
    avs_section = "AVS:\n" + f"{model_avs}\n"
    return "Evaluate-\n" + bhc_section + avs_section


In [None]:
# One record per generated summary: its position in generated_results, the
# formatted evaluation prompt, and the BHC / model AVS the prompt was built from.
ready_prompts = [
    {
        "index": position,
        "prompt": build_copy_paste_prompt(entry["bhc"], entry["model_avs"]),
        "bhc": entry["bhc"],
        "model_avs": entry["model_avs"],
    }
    for position, entry in enumerate(generated_results)
]

## Initial Prompt

We will present you with a pair of a brief hospital course (BHC) and a patient after visit summary
(AVS). The AVS is also referred to as discharge summary. The BHC contains a detailed summary of
the hospital stay written by medical service. It usually contains medical jargon, and it can
follow different structures based on the hospital course and responsible medical specialty. The
AVS summarizes the hospital stay for the patient in plain language. In practice, the BHC is not
the only source of information to write the AVS. However, in our setting we treat the BHC as the
only context for the summary.
## Instructions
For this labelling task, we are interested in errors in the AVS that are either unsupported by the
BHC, contradict content in the BHC, or are wrong medical facts. We allow statements that contain
general medical knowledge or advice that are often used in patient summaries. Most errors are due
to unsupported facts, so we further distinguish those based on their specific content. This leads
to the following error types or labels:
1. Unsupported facts, including condition/procedure/medication/time/location/
number/name/word/other
2. Contradicted fact
3. Incorrect fact
And below is the detailed guideline, and we label error spans with the <error> tag (e.g. <error
class="error type">incorrect fact</error>).
### Determining Span of Errors
We label the smallest possible consecutive span that specifies the error given the BHC as a
context. Removing further parts from the span would remove important information. A useful
heuristic is to identify the minimal span that must be replaced to obtain a correct statement that
is grammatically correct. For example
- "We performed an <error>esophageal-gastro-duodenoscopy (EGD)</error>." when no such procedure
is reported in the BHC. The article "an" is not labeled as an error. When no procedure at all was
performed "performed an esophageal-gastro-duodenoscopy (EGD)" should be labeled as error because
there is no suitable substitute for "esophageal-gastro-duodenoscopy (EGD)".
- "After the surgery, we <error>transitioned you to oral oxycodone</error>." when the BHC
contains no information for such a transition. If another medication transition is mentioned in
the BHC and makes sense in this sentence only "oral oxycodone" should be labeled. If another oral
medication transition is mentioned in the BHC only "oxycodone" should be labeled.
- "<error>Your symptoms responded well</error>." when no part of the sentence makes sense in the
given context of the AVS.
We allow general medical knowledge and advice that is often part of the AVS. Usually, these are
information that are not specific for the hospital course given in the BHC. For example
- "Please take your medications as prescribed" contains no error even though the BHC does not
contain this instruction because this is general medical advice.
- "If the symptoms get worse, please contact your doctor" contains no error even when the BHC
does not contain this fact, since it is general medical knowledge that a doctor should be seen
for worsening symptoms.
We try to ignore grammatical errors in the BHC and AVS. If the original meaning can still be
inferred (e.g. "medictaions" instead of "medications"), the most likely corrected form can be
used. If the meaning cannot be inferred, they can be ignored in the BHC or labeled as Unsupported
Other in the AVS.
If a sentence or phrase is repeated, then please treat it as you would any other sentence and
highlight all errors (even if you did so in a previous sentence). For example
- "Please take Tylenol. Please take Tylenol" when Tylenol was prescribed in the BHC.
- "Limit your <error>use of stairs</error>. Please limit <error>use of stairs</error>" when
movement was encouraged.
To get reliable error counts a span should only contain a single error.
- "You received <error>Tylenol</error> and <error>Ciprofloxacin</error>" when there is no
evidence in the BHC that the two medications were administered to the patient.
- "You have a <error>follow-up appointment with your PCP</error> and <error>your
cardiologist</error>" when no such follow up is mentioned in the BHC. Both errors are labeled
separately.
### Dealing with Deidentified Information
The data contains deidentified information shown with "___" in the text. We always treat this as
non-existent information. So, the annotators should not infer what the deidentified information
could be. In general, deidentified fields in the AVS should not be labeled as errors. However,
sometimes they belong to a wrong statement or clearly contain unsupported information (e.g., a
doctor’s name or phone numbers) that are not given in the BHC. In these cases, deidentified fields
should be included in the error span. For example
- "Take ___ <error>200mg daily</error> and try to rest" when no such dosage information is
provided in the BHC, but the statement to rest is. The deidentified medication name is excluded
from the error span.
- "Please avoid going up <error>more than ___ stairs</error> at a time" when restrictions for the
number of stairs taken at a time are not mentioned in the BHC.
- "<error>Dr. ___ will follow up with you</error>" when no follow-up is mentioned in the BHC.
- "Please stop taking Aspirin <error>on ___</error>" when no stopping date is given in the BHC.
- "Your RBC peaked <error>at ___ million</error>" if there is no hint of a specific red blood cell
count given in the BHC.
### Error Types
In general, we ask for the most specific error that is applicable. If there is uncertainty which
type applies, prefer the one mentioned first in the enumeration of all error types shown earlier.
For instance, if the error contains an unsupported medication name, the Unsupported medication type
should be used instead of the Unsupported name type. Here is a detailed description of the error
types:
- ‘Unsupported Condition‘: includes unsupported symptoms, diseases, or findings of the patient.
For example
- "You were found to have a <error class="unsupported condition">left clavicle
fracture</error>" when no information was given for this condition in the BHC.
- ‘Unsupported Procedure‘: includes any unsupported medical procedures. For example
- "You had a <error class="unsupported procedure">filter placed in your vein</error>" when no
intervention with a filter was mentioned.
- ‘Unsupported Medication‘: contains all errors related to unsupported medications. This includes
medication classes, substances, routes, frequencies, and dosages. For example
- "You were placed on <error class="unsupported medication">antibiotics</error>" when only
blood thinners were prescribed.
- ‘Unsupported Time‘: includes all errors for unsupported time or interval statements. For
example
- "Keep your arm in a sling for the <error class="unsupported time">next 6 weeks</error>"
when no specific duration is given.
- ‘Unsupported Location‘: Locations include both unsupported physical places as well as regions of
the patient. For example
- "The patient was admitted to the <error class="unsupported location">Acute Surgery
Service</error>" when no admission location was provided in the BHC.
- ‘Unsupported Number‘: any number either as digits or written that are unsupported. This also
includes words such as "a" and "an". For example
- "Your pacemaker rate was increased to <error class="unsupported number">50</error>" when
the rate of 50 is not given in the BHC.
- ‘Unsupported Name‘: named entities that are not supported by the BHC. For example
- "You were seen by the <error class="unsupported name">interventional pulmonary
service</error>" when no consult with this service was mentioned in the BHC.
- ‘Unsupported Word‘: incorrect or inappropriate words or phrases which do not fit in any of the
above types. For example
- "We will send you home with a <error class="unsupported word">drain</error> in place" when
drain not mentioned in the BHC.
- ‘Unsupported Other‘: If there is a mistake which clearly does not belong to any of the above
categories, you may use this category as a last resort. We cannot give precise instructions
because the "other" category is very broad.
- ‘Contradicted Fact‘: This error type is independent of the content and contains all facts that
clearly contradict information provided in the BHC. For example
- "Your pacemaker rate was increased to <error class="contradicted fact">50</error>" when the
context states a pacemaker rate of 40.
- ‘Incorrect Fact‘: This error type is independent of the content and contains all facts that
clearly contradict general medical knowledge or advice. For example
- "We diagnosed a seizure, and you <error class="incorrect fact">can continue driving your
car</error>" when no reason for allowing driving after a seizure is provided; this contradicts
common medical knowledge.

IMPORTANT: Return solely a structured list of JSON containing errors and their counts in the following format:
{"Error Type": <error type>, "Count": <count>, "Example": {"BHC Span": <BHC span>, "AVS Span": <AVS span>}}.

## Generate sample prompts

In [None]:
# Print every prompt beneath a banner that names its sample index.
banner = "=" * 80
for p in ready_prompts:
    # A single print with sep="\n" emits the same lines as separate prints:
    # banner, title, banner, prompt, then the blank-line spacer.
    print(banner, f"PROMPT FOR SAMPLE {p['index']}", banner, p["prompt"], "\n\n", sep="\n")

## Optionally save prompts to file

In [None]:
# Persist all prompts to one text file, each under a "### SAMPLE <index>"
# header and followed by a blank line.
# The encoding is pinned to UTF-8 so that any non-ASCII characters in the
# prompts are written identically regardless of the runtime's locale default.
with open("/content/hallucination_prompts.txt", "w", encoding="utf-8") as f:
    # writelines batches the per-sample chunks instead of one write() per sample.
    f.writelines(
        f"### SAMPLE {p['index']}\n{p['prompt']}\n\n" for p in ready_prompts
    )