In [1]:
import json
import random
import wandb
from datasets import Dataset, DatasetDict, Features, Value, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score

In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
wandb.init(project="llama-medx-reasoning", name="v3.2-lora-pubmedqa", config={"model": "Llama-medx-v3.2"})

model_name = "skumar9/Llama-medx_v3.2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33measonwangzk[0m ([33measonwangzk-the-university-of-chicago[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/419 [00:00<?, ?B/s]

In [4]:
with open('./ori_pqal.json', 'r') as f:
    ori_data = json.load(f)

In [5]:
data_list = []
for pid, sample in ori_data.items():
    question = sample["QUESTION"]
    context = " ".join(sample["CONTEXTS"])
    final_decision = sample["final_decision"].lower()
    long_answer = sample["LONG_ANSWER"]

    prompt = f"Question: {question}\n\nContext: {context}\n\nAnswer:"
    data_list.append({"question": prompt, "answer": long_answer, "label": final_decision})

In [6]:
random.shuffle(data_list)
split_idx = int(0.8 * len(data_list))
train_data = data_list[:split_idx]
test_data = data_list[split_idx:]

features = Features({
    'question': Value('string'),
    'answer': Value('string'),
    'label': Value('string')
})

dataset = DatasetDict({
    "train": Dataset.from_list(train_data, features=features),
    "test": Dataset.from_list(test_data, features=features)
})

In [7]:
def tokenize_function(examples):
    full_texts = [
        f"Question: {q.strip()}\nAnswer: {a.strip()}"
        for q, a in zip(examples["question"], examples["answer"])
    ]


    tokenized = tokenizer(
        full_texts,
        truncation=True,
        padding="max_length",
        max_length=512
    )


    tokenized["labels"] = [
        [token if token != tokenizer.pad_token_id else -100 for token in seq]
        for seq in tokenized["input_ids"]
    ]

    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [8]:
tokenized_dataset = tokenized_dataset.remove_columns(["label",'question', 'answer' ])

In [9]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

In [10]:
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="eager",
)


peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"]
)


model = get_peft_model(model, peft_config)

model.enable_adapter_layers()

model.train()
model.to("cuda")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128258, 4096, padding_idx=128257)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (

In [None]:
print("=== Trainable parameters after prepare_model_for_kbit_training ===")
for name, param in model.named_parameters():
    print(f"{name}: {param.requires_grad}")

In [None]:
for name, param in model.named_parameters():
    if "lora" in name:
        param.requires_grad = True

print("\n=== Trainable parameters after manual unfreeze ===")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"‚úÖ {name} ‚Äî shape: {tuple(param.shape)}")

In [12]:
sample = tokenized_dataset["train"][0]
batch = {
    "input_ids": torch.tensor(sample["input_ids"]).unsqueeze(0).to("cuda"),
    "attention_mask": torch.tensor(sample["attention_mask"]).unsqueeze(0).to("cuda"),
    "labels": torch.tensor(sample["labels"]).unsqueeze(0).to("cuda"),
}

print("üîç ÂèØËÆ≠ÁªÉÂèÇÊï∞Ôºö")
for name, p in model.named_parameters():
    if p.requires_grad:
        print(f"‚úÖ {name} | shape: {p.shape}")


outputs = model(**batch)
loss = outputs.loss
print("üìå Loss value:", loss.item())
print("üß† loss.requires_grad:", loss.requires_grad)
print("üß¨ loss.grad_fn:", loss.grad_fn)


try:
    loss.backward()
    print("‚úÖ Backward successful!")
except Exception as e:
    print("‚ùå Backward failed:", e)

üîç ÂèØËÆ≠ÁªÉÂèÇÊï∞Ôºö
‚úÖ base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight | shape: torch.Size([16, 4096])
‚úÖ base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight | shape: torch.Size([4096, 16])
‚úÖ base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight | shape: torch.Size([16, 4096])
‚úÖ base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight | shape: torch.Size([1024, 16])
‚úÖ base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight | shape: torch.Size([16, 4096])
‚úÖ base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight | shape: torch.Size([4096, 16])
‚úÖ base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight | shape: torch.Size([16, 4096])
‚úÖ base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight | shape: torch.Size([1024, 16])
‚úÖ base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight | shape: torch.Size([16, 4096])
‚úÖ base_mode

In [13]:
training_args = TrainingArguments(
    output_dir="./llama-medx-ori_pqal-lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_dir="./logs",
    learning_rate=2e-4,
    weight_decay=0.01,
    bf16=True,
    save_steps=1000,
    logging_steps=50,
    report_to="wandb",
    run_name="ori_pqal-lora-run",
    eval_strategy="no",
    save_strategy="steps",
    load_best_model_at_end=False,
    label_names=["labels"]
)

In [14]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator
)


  trainer = Trainer(


In [15]:
trainer.train()

Step,Training Loss
50,1.7294
100,1.6679
150,1.6245
200,1.6405
250,1.6013
300,1.5778
350,1.6067
400,1.5604
450,1.5175
500,1.5292


TrainOutput(global_step=600, training_loss=1.590412228902181, metrics={'train_runtime': 473.3063, 'train_samples_per_second': 5.071, 'train_steps_per_second': 1.268, 'total_flos': 5.53826201370624e+16, 'train_loss': 1.590412228902181, 'epoch': 3.0})

In [16]:
model.save_pretrained("./lora-checkpoint")
tokenizer.save_pretrained("./lora-checkpoint")

('./lora-checkpoint/tokenizer_config.json',
 './lora-checkpoint/special_tokens_map.json',
 './lora-checkpoint/tokenizer.json')

In [17]:
import tempfile

with tempfile.TemporaryDirectory() as tmp_dir:

    model.save_pretrained(tmp_dir)
    tokenizer.save_pretrained(tmp_dir)

    artifact = wandb.Artifact("lora-llama-v3.2-pubmedqa", type="model", metadata={
        "model": "Llama-medx-v3.2",
        "task": "PubMedQA",
        "adapter_type": "LoRA"
    })
    artifact.add_dir(tmp_dir)

    wandb.log_artifact(artifact)

    print("‚úÖ LoRA Ê®°ÂûãÂ∑≤ÊàêÂäü‰øùÂ≠òÂπ∂‰∏ä‰º†Ëá≥ W&B artifact üéØ")

[34m[1mwandb[0m: Adding directory to artifact (/tmp/tmpnb_9xtbj)... Done. 0.1s


‚úÖ LoRA Ê®°ÂûãÂ∑≤ÊàêÂäü‰øùÂ≠òÂπ∂‰∏ä‰º†Ëá≥ W&B artifact üéØ


In [21]:
import wandb
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

wandb.init(project="llama-medx-reasoning")

artifact = wandb.use_artifact("easonwangzk-the-university-of-chicago/llama-medx-reasoning/lora-llama-v3.2-pubmedqa:latest", type="model")
artifact_dir = artifact.download()

base_model = AutoModelForCausalLM.from_pretrained(
    "skumar9/Llama-medx_v3.2",
    device_map="auto",
    torch_dtype=torch.float16,
)

model = PeftModel.from_pretrained(base_model, artifact_dir)

tokenizer = AutoTokenizer.from_pretrained(artifact_dir)
tokenizer.pad_token = tokenizer.eos_token

0,1
train/epoch,‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñá‚ñá‚ñà‚ñà
train/global_step,‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñá‚ñá‚ñà‚ñà
train/grad_norm,‚ñÑ‚ñÉ‚ñÅ‚ñÅ‚ñÇ‚ñÉ‚ñÑ‚ñÖ‚ñà‚ñà‚ñÜ‚ñÜ
train/learning_rate,‚ñà‚ñá‚ñá‚ñÜ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÅ
train/loss,‚ñà‚ñÜ‚ñÖ‚ñÖ‚ñÑ‚ñÉ‚ñÑ‚ñÉ‚ñÅ‚ñÇ‚ñÅ‚ñÅ

0,1
total_flos,5.53826201370624e+16
train/epoch,3.0
train/global_step,600.0
train/grad_norm,0.81592
train/learning_rate,0.0
train/loss,1.5136
train_loss,1.59041
train_runtime,473.3063
train_samples_per_second,5.071
train_steps_per_second,1.268


[34m[1mwandb[0m: \ 1 of 6 files downloaded...[34m[1mwandb[0m:   6 of 6 files downloaded.  


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [None]:
def generate_response(prompt, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Á§∫‰æã promptÔºàÈÄÇÂêà PubMedQA ‰ªªÂä°Ôºâ
prompt = "What is the recommended treatment for bacterial pneumonia in elderly patients?"
response = generate_response(prompt)
print("üß† Model Answer:\n", response)