In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from bert_score import score as bert_score
import torch
import numpy as np
from tqdm import tqdm

In [2]:
MODEL_NAME = "./biogpt_instruction_finetuned"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
THRESHOLD = 0.85
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

BioGptForCausalLM(
  (biogpt): BioGptModel(
    (embed_tokens): BioGptScaledWordEmbedding(42384, 1024, padding_idx=1)
    (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
    (layers): ModuleList(
      (0-23): 24 x BioGptDecoderLayer(
        (self_attn): BioGptAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((1024

In [3]:
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")["train"]  # evaluation subset

In [4]:
generated_answers = []
reference_answers = []

for sample in tqdm(dataset):
    question = sample["question"]
    context_passages = sample["context"]["contexts"]
    context = " ".join(context_passages)
    prompt = f"Question: {question}\nContext: {context}\nAnswer:"

    # Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_answer = generated_text.replace(prompt, "").strip()
    generated_answers.append(generated_answer)
    reference_answers.append(sample["long_answer"].strip())

100%|██████████| 1000/1000 [23:47<00:00,  1.43s/it]


In [5]:
P, R, F1 = bert_score(generated_answers, reference_answers, lang="en", verbose=True)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/32 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/16 [00:00<?, ?it/s]

done in 29.85 seconds, 33.50 sentences/sec


In [6]:
soft_matches = (F1 >= THRESHOLD).numpy()
accuracy = np.mean(soft_matches)

In [7]:
print(f"\nBioGPT Text Answer Evaluation (on {len(dataset)} samples):")
print(f"1. Soft Accuracy (BERTScore F1 ≥ {int(THRESHOLD*100)}%): {accuracy:.4f}")
print(f"2. BERTScore Precision: {P.mean():.4f}")
print(f"3. BERTScore Recall:    {R.mean():.4f}")
print(f"4. BERTScore F1:        {F1.mean():.4f}")


BioGPT Text Answer Evaluation (on 1000 samples):
1. Soft Accuracy (BERTScore F1 ≥ 85%): 0.0680
2. BERTScore Precision: 0.7888
3. BERTScore Recall:    0.8748
4. BERTScore F1:        0.8295


In [8]:
import pandas as pd
from bert_score import score as bert_score

questions = dataset["question"]
contexts = [" ".join(ctx["contexts"]) for ctx in dataset["context"]]
pubids = dataset["pubid"]

threshold = 0.85
f1_scores = F1.numpy()
wrong_indices = [i for i, score in enumerate(f1_scores) if score < threshold]

wrong_df = pd.DataFrame({
    "PubMed ID": [pubids[i] for i in wrong_indices],
    "Question": [questions[i] for i in wrong_indices],
    "Context": [contexts[i] for i in wrong_indices],
    "Generated Answer": [generated_answers[i] for i in wrong_indices],
    "Gold Answer": [reference_answers[i] for i in wrong_indices],
    "BERTScore F1": [f1_scores[i] for i in wrong_indices]
})

wrong_df.to_csv("biogpt_instruction_0.85_wrong_answers.csv", index=False)
print(f"Saved {len(wrong_df)} wrong predictions to biogpt_wrong_answers0.85.csv")

Saved 932 wrong predictions to biogpt_wrong_answers0.85.csv
