In [8]:
from pathlib import Path
import torch
import numpy as np
import os
from pathlib import Path
import sys
import numpy as np 
from datasets import load_dataset
import tqdm
import torch
parent_root = Path.cwd().parent
project_root = os.path.join(parent_root, "src")
sys.path.insert(0, str(project_root))
from pytorch_lightning import seed_everything
from accelerate import Accelerator
import matplotlib.pyplot as plt
import hydra

from hydra.core.global_hydra import GlobalHydra
from hydra import compose, initialize_config_dir
from omegaconf import DictConfig, OmegaConf
from hydra import compose, initialize

from datamodule import DataModule_llm
from lightning_module import llm_pl
from sklearn.metrics import f1_score, accuracy_score
from transformers import AutoTokenizer, DataCollatorWithPadding




In [2]:
out_dir = Path("..") / "src"
data_dir = out_dir / "data"

In [3]:
@hydra.main(config_path="config", config_name="config_seg", version_base="1.3")
def main(cfg: DictConfig):

    seed_everything(cfg.get("seed", 42), workers=True)

    cfg_data = cfg.data
    cfg_model = cfg.model
    cfg_lora = cfg_model.lora

    datamodule = DataModule_llm(cfg_data)
    model = llm_pl(cfg_model, cfg_lora)
    return model, datamodule


GlobalHydra.instance().clear()
config_path = os.path.join(out_dir, "config")

with initialize(version_base=None, config_path=config_path, job_name="nb"):
    cfg = compose(config_name="config")
    model, datamodule = main(cfg)
    

Seed set to 42


model token id </s>
ratio trainable parameters 0.40791125334440875


In [4]:
datamodule.setup(stage="test")
test_loader = datamodule.test_dataloader()



Map: 100%|██████████| 224562/224562 [00:52<00:00, 4251.82 examples/s]
Map: 100%|██████████| 4583/4583 [00:01<00:00, 4389.15 examples/s]
Map: 100%|██████████| 4677/4677 [00:00<00:00, 5074.59 examples/s]


In [5]:
ckpt_path = out_dir / "checkpoints/best.ckpt"
model = llm_pl.load_from_checkpoint(
    ckpt_path,
    weights_only = False
)

model token id </s>
ratio trainable parameters 0.40791125334440875


In [6]:
accelerator = Accelerator(mixed_precision="bf16")
model = accelerator.prepare(
    model
)

model.eval()

llm_pl(
  (model): PeftModelForCausalLM(
    (base_model): LoraModel(
      (model): LlamaForCausalLM(
        (model): LlamaModel(
          (embed_tokens): Embedding(32000, 2048)
          (layers): ModuleList(
            (0-21): 22 x LlamaDecoderLayer(
              (self_attn): LlamaAttention(
                (q_proj): lora.Linear(
                  (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=2048, out_features=16, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=16, out_features=2048, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): Modu

In [7]:
import math


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

total_loss = 0.0
total_tokens = 0

with torch.no_grad():
    for batch in tqdm.tqdm(test_loader, desc="Test loss"):
        batch = {k: v.to(device) for k, v in batch.items()}

        out = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        )
        loss = out.loss  

        # count how many tokens contribute to loss (labels != -100)
        n_tokens = (batch["labels"] != -100).sum().item()

        total_loss += loss.item() * n_tokens
        total_tokens += n_tokens

avg_loss = total_loss / max(total_tokens, 1)
ppl = math.exp(avg_loss)

print({"test_loss": avg_loss, "test_ppl": ppl, "n_tokens": total_tokens})


Test loss:   0%|          | 0/585 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Test loss: 100%|██████████| 585/585 [00:42<00:00, 13.85it/s]

{'test_loss': 1.018826152707034, 'test_ppl': 2.769941366667132, 'n_tokens': 20439}





In [12]:
def generate_answer(model, tokenizer, prompt_text, device, max_new_tokens = 128):
    inputs = tokenizer(prompt_text, return_tensors = "pt").to(device)
    
    with torch.no_grad():
        gen_ids = model.model.generate(
            **inputs,
            max_new_tokens = max_new_tokens,
            do_sample = False,
            temperature = 1.0,
            top_p = 1.0,
            eos_token_id = tokenizer.eos_token_id,
            pad_token_id = tokenizer.pad_token_id
        )
    decoded = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
    return decoded

In [None]:
import random
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", use_fast=True)
ds = load_dataset("json", data_files={"train": str("/home/clement/mnt/ssd_nvme/ai-med-portfolio/projects/dl/llm-med-project/src/data/train.jsonl"), 
                                                   "validation": str("/home/clement/mnt/ssd_nvme/ai-med-portfolio/projects/dl/llm-med-project/src/data/val.jsonl"), 
                                                   "test": str("/home/clement/mnt/ssd_nvme/ai-med-portfolio/projects/dl/llm-med-project/src/data/test.jsonl")})

idxs = random.sample(range(len(ds["test"])), 5)

for idx in idxs:
    ex = ds["test"][idx]
    prompt = ex["input"]
    gold = ex["output"]
    task = ex.get("task", "unknown")
    if task == "qa_medmcqa":
        pred = generate_answer(model, tokenizer, prompt, device, max_new_tokens=30)
    else:
        pred = generate_answer(model, tokenizer, prompt, device, max_new_tokens = 128)

    print("\n" + "="*80)
    print("TASK:", task)
    print("\nPROMPT:\n", prompt[:1200])
    print("\nGOLD:\n", gold[:600])
    print("\nPRED:\n", pred[:1200])


Token indices sequence length is longer than the specified maximum sequence length for this model (5608 > 2048). Running this sequence through the model will result in indexing errors



TASK: qa_medmcqa

PROMPT:
 System: You are a medical assistant. Answer based on the provided context. If the context is insufficient, say you don't know.
Task: medical_mcq
User: An 18-year-old man moves from sea level to an elevation of 2,400 m to train as a skier. The increased requirement for oxygen delivery to tissues at the higher elevation stimulates the synthesis of a renal hormone (erythropoietin), which targets hematopoietic stem cells in the bone marrow. Erythropoietin promotes the survival of early erythroid progenitor cells primarily through which of the following mechanisms?

Options:
A. Altered cell-matrix adhesion
B. Downregulation of p53
C. Enhanced glucose uptake
D. Inhibition of apoptosis
Reply with: Final: <A|B|C|D>
Assistant:

GOLD:
 Final: D

PRED:
 System: You are a medical assistant. Answer based on the provided context. If the context is insufficient, say you don't know.
Task: medical_mcq
User: An 18-year-old man moves from sea level to an elevation of 2,400 m t

This is a friendly reminder - the current text generation call has exceeded the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.



TASK: summarization_pubmed

PROMPT:
 System: You are a medical assistant. Answer based on the provided context. If the context is insufficient, say you don't know.
Task: summarization
User: Summarize the following biomedical article into a concise abstract:

the study was designed as a prospective , observational , referral center cohort study of consecutive diabetic patients who underwent pci . 
 all incident cases of cli were recorded and followed within a structured , collaborative framework ( diabetologist , foot care specialist , vascular surgeon , interventional cardiologist ) . 
 this model of strict collaboration among different professional figures with a dedicated pathway for diabetic patients and early , aggressive attempts at endovascular revascularization , has been previously described ( 21 ) and demonstrated to result in a very low amputation rate . 
 consecutive diabetic patients undergoing pci with or without stent implantation for either acute coronary syndrome or st