In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from bert_score import score as bert_score
import torch
import numpy as np
from tqdm import tqdm

try:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == "cuda":
        _ = torch.zeros(1).to(device)
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        print("CUDA not available, using CPU instead.")
except Exception as e:
    print(f"CUDA not supported, defaulting to CPU: {e}")
    device = torch.device("cpu")


Using GPU: NVIDIA GeForce RTX 5060 Ti
   VRAM: 17.1 GB


In [4]:
from peft import PeftModel

BASE_MODEL = "microsoft/biogpt"
ADAPTER_PATH = "./biogpt_lora_finetuned"

print(f"\nLoading base model: {BASE_MODEL}")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)

# Fix BioGPT missing pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    base_model.config.pad_token_id = tokenizer.eos_token_id

# Attach PEFT adapter
print(f"Applying PEFT adapter from: {ADAPTER_PATH}")
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)

# Move to device and eval
model = model.to(device)
model.eval()

print(f"PEFT model loaded successfully on {device}")
THRESHOLD = 0.85



Loading base model: microsoft/biogpt
Applying PEFT adapter from: ./biogpt_lora_finetuned


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


PEFT model loaded successfully on cuda


Exception ignored in: <function tqdm.__del__ at 0x000001B4C0EC39C0>
Traceback (most recent call last):
  File "c:\Users\Jasper\anaconda3\envs\biogpt_gpu\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\Jasper\anaconda3\envs\biogpt_gpu\Lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm' object has no attribute 'disp'


In [5]:
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")["train"]  # evaluation subset

In [6]:
generated_answers = []
reference_answers = []

for sample in tqdm(dataset, desc="Generating answers"):
    question = sample["question"]
    context_passages = sample["context"]["contexts"]
    context = " ".join(context_passages)
    prompt = f"Question: {question}\nContext: {context}\nAnswer:"

    # Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        try:
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )
        except RuntimeError as e:
            if "no kernel image" in str(e):
                print("CUDA kernel mismatch, running on CPU instead.")
                model = model.to("cpu")
                device = torch.device("cpu")
                outputs = model.generate(**inputs.to(device), max_new_tokens=128, do_sample=False)

    # Decode
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_answer = generated_text.replace(prompt, "").strip()
    generated_answers.append(generated_answer)
    reference_answers.append(sample["long_answer"].strip())

print("Generation complete!")

Generating answers: 100%|██████████| 1000/1000 [25:01<00:00,  1.50s/it]

Generation complete!





In [7]:
P, R, F1 = bert_score(generated_answers, reference_answers, lang="en", verbose=True)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/32 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/16 [00:00<?, ?it/s]

done in 27.70 seconds, 36.10 sentences/sec


In [8]:
soft_matches = (F1 >= THRESHOLD).numpy()
accuracy = np.mean(soft_matches)

In [9]:
print(f"\nBioGPT Instruction fine tuning Text Answer Evaluation (on {len(dataset)} samples):")
print(f"1. Soft Accuracy (BERTScore F1 ≥ {int(THRESHOLD*100)}%): {accuracy:.4f}")
print(f"2. BERTScore Precision: {P.mean():.4f}")
print(f"3. BERTScore Recall:    {R.mean():.4f}")
print(f"4. BERTScore F1:        {F1.mean():.4f}")


BioGPT Instruction fine tuning Text Answer Evaluation (on 1000 samples):
1. Soft Accuracy (BERTScore F1 ≥ 85%): 0.1300
2. BERTScore Precision: 0.7947
3. BERTScore Recall:    0.8768
4. BERTScore F1:        0.8336


In [12]:
import pandas as pd
from bert_score import score as bert_score

questions = dataset["question"]
contexts = [" ".join(ctx["contexts"]) for ctx in dataset["context"]]
pubids = dataset["pubid"]

threshold = 0.85
f1_scores = F1.numpy()
wrong_indices = [i for i, score in enumerate(f1_scores) if score < threshold]

wrong_df = pd.DataFrame({
    "PubMed ID": [pubids[i] for i in wrong_indices],
    "Question": [questions[i] for i in wrong_indices],
    "Context": [contexts[i] for i in wrong_indices],
    "Generated Answer": [generated_answers[i] for i in wrong_indices],
    "Gold Answer": [reference_answers[i] for i in wrong_indices],
    "BERTScore F1": [f1_scores[i] for i in wrong_indices]
})

wrong_df.to_csv("biogpt_lora_0.85_wrong_answers.csv", index=False)
print(f"Saved {len(wrong_df)} wrong predictions to biogpt_wrong_answers.csv")

Saved 870 wrong predictions to biogpt_wrong_answers.csv
