In [None]:
!huggingface-cli download --token xxxxxxxxxxx --resume-download meta-llama/Meta-Llama-3-8B --local-dir /home/ubuntu/789pj/Meta-Llama-3-8B

In [3]:
# Install required libraries
!pip install "transformers==4.51.3" datasets "peft==0.14.0" bitsandbytes "accelerate==1.5.2" scikit-learn tqdm "trl==0.17.0" evaluate wandb ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!jupyter nbextension install --py widgetsnbextension


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m
Installing /opt/pytorch/lib/python3.12/site-packages/widgetsnbextension/static -> jupyter-js-widgets
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/pathlib.py", line 1311, in mkdir
    os.mkdir(self, mode)
FileNotFoundError: [Errno 2] No such file or directory: '/usr/local/share/jupyter/nbextensions'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/pytorch/bin/jupyter-nbextension", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/opt/pytorch/lib/python3.12/site-packages/jupyter_core/application.py", line 283, in launch_instance
    super()

In [1]:
import torch
import transformers
import accelerate
import evaluate
import gc
import os
import wandb
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from trl import SFTConfig, SFTTrainer
from tqdm import tqdm
from datasets import load_dataset

In [2]:
# Parameters
base_model = "/home/ubuntu/789pj/Meta-Llama-3-8B"  # or local path
lora_output = "/home/ubuntu/789pj/lora_output"
max_train_steps = 500
max_val_samples = 2000
max_seq_length = 4096
inference_max_length = 512
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on device:", device)

Running on device: cuda


In [3]:
def process_data(data):
    prompt = ("### Instruction:\nGiven multiple-choice medical question below, select the best answer (A, B, C, or D). \nOutput only the letter of the best choice followed by a short explanation starting with \"Explanation:\" on the same line.\n\n")
    prompt += f"Question: {data['question'].strip()}\n"
    prompt += f"A. {data['opa'].strip()}\nB. {data['opb'].strip()}\nC. {data['opc'].strip()}\nD. {data['opd'].strip()}\n### Response:\n"

    correct_option = chr(ord('A') + int(data['cop']))
    explanation = (data.get('exp') or '').strip()
    response = f"{correct_option} Explanation: {explanation}"

    return {"prompt": prompt, "label": response}

In [4]:
def tokenize_fn(batch, tokenizer):
    inputs = [prompt + answer for prompt, answer in zip(batch["prompt"], batch["label"])]
    tokenized_data = tokenizer(inputs, truncation=True, padding="max_length")
    labels = []
    for i, full_tokens in enumerate(tokenized_data["input_ids"]):
        prompt_tokens = tokenizer(batch["prompt"][i], add_special_tokens=False)["input_ids"]
        mask = [-100] * len(prompt_tokens) + full_tokens[len(prompt_tokens):]
        labels.append(mask[:tokenizer.model_max_length])
    tokenized_data["labels"] = labels
    return tokenized_data

In [5]:
def evaluate_model(model, tokenizer, dataset, device, max_length):
    acc_metric = evaluate.load('accuracy')
    preds, gts = [], []
    for data in tqdm(dataset, desc="Evaluating", leave=False):
        # prompt = ("### Instruction:\nPlease choose the correct answer (A, B, C, D) based on the multiple-choice medical question below. Output the letter corresponding to the best choice followed by a brief rationale starting with \"Explanation:\".\n\n")
        prompt = ("### Instruction:\nGiven multiple-choice medical question below, select the best answer (A, B, C, or D). \nOutput only the letter of the best choice followed by a short explanation starting with \"Explanation:\" on the same line.\n\n")
        prompt += f"Question: {data['question'].strip()}\n"
        prompt += f"A. {data['opa'].strip()}\nB. {data['opb'].strip()}\nC. {data['opc'].strip()}\nD. {data['opd'].strip()}\n### Response:\n"
        inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=max_length, padding='max_length').to(device)
        output = model.generate(**inputs, max_new_tokens=5, num_beams=4, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id)
        output_decoded = tokenizer.decode(output[0], skip_special_tokens=True)
        answer = output_decoded[len(prompt):].strip().upper()
        # print(output_decoded)
        pred_letter = next((option for option in answer if option in ['A','B','C','D']), None)
        if pred_letter is None:
            preds.append(-1)
        else:
            preds.append(ord(pred_letter) - ord('A'))
        gts.append(int(data['cop']))
    acc = acc_metric.compute(predictions=preds, references=gts)
    return acc


In [6]:
def print_metrics(acc, title):
    print(f"[{title}]-----Accuracy: {acc['accuracy']*100:.2f}%")

In [7]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"all params: {all_param}")
    print(f"trainable params: {trainable_params}")
    print(f"percentage of trainable params: {100 * (trainable_params / all_param)}%")

In [8]:
val_ds = load_dataset('medmcqa', split='validation')
if max_val_samples:
    val_ds = val_ds.select(range(max_val_samples))
print("Validation samples:", len(val_ds))

Validation samples: 2000


In [9]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, model_max_length=max_seq_length, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
print("Max tokens:", tokenizer.model_max_length)

Max tokens: 4096


In [10]:
# Evaluate base model
print("Evaluating base model...")

base_model_loaded = AutoModelForCausalLM.from_pretrained(base_model).to(device)
base_model_loaded.config.use_cache = True
base_model_loaded.eval()
start_time = time.time()
acc = evaluate_model(base_model_loaded, tokenizer, val_ds, device, max_length = inference_max_length)
end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.4f} seconds")
print("")
print_metrics(acc, "Base Model inference accuracy")

Evaluating base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

                                                               

Evaluation time: 2432.0289 seconds

[Base Model inference accuracy]-----Accuracy: 33.40%




In [11]:
gc.collect()
torch.cuda.empty_cache()

In [12]:
# Evaluate 4_bit Quantized model
print("Evaluating 4_bit Quantized base model...")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

base_model_4_bit_quant = AutoModelForCausalLM.from_pretrained(
    base_model, quantization_config=bnb_config, device_map="auto"
)
base_model_4_bit_quant.config.use_cache = True
base_model_4_bit_quant.eval()

start_time = time.time()
acc = evaluate_model(base_model_4_bit_quant, tokenizer, val_ds, device, max_length = inference_max_length)
end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.4f} seconds")
print("")
print_metrics(acc, "4-bit Quantized Base Model inference accuracy")

Evaluating 4_bit Quantized base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

                                                               

Evaluation time: 777.4429 seconds

[4-bit Quantized Base Model inference accuracy]-----Accuracy: 33.55%




In [13]:
gc.collect()
torch.cuda.empty_cache()

In [14]:
# Evaluate 8_bit Quantized model
print("Evaluating 8_bit Quantized base model...")

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16,
)

base_model_8_bit_quant = AutoModelForCausalLM.from_pretrained(
    base_model, quantization_config=bnb_config, device_map="auto"
)
base_model_8_bit_quant.config.use_cache = True
base_model_8_bit_quant.eval()

start_time = time.time()
acc = evaluate_model(base_model_8_bit_quant, tokenizer, val_ds, device, max_length = inference_max_length)
end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.4f} seconds")
print("")
print_metrics(acc, "8-bit Quantized Base Model inference accuracy")

Evaluating 8_bit Quantized base model...


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
wandb.login(key="xxxxxxxxxxx")
wandb.init(
    project="huggingface",
    entity="789team",
)

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjdou2[0m ([33m789team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
# preprocessing training data
train_ds = load_dataset('medmcqa', split='train')
train_ds = train_ds.map(process_data, remove_columns=train_ds.column_names, num_proc=4)
tokenized = train_ds.map(lambda b: tokenize_fn(b, tokenizer), batched=True, remove_columns=train_ds.column_names, num_proc=4)

In [14]:
print("1 data instance")
print(train_ds[1])
labels = tokenized[1]["labels"]
non_negative_labels = [x for x in labels if x >= 0]
print(tokenizer.decode(non_negative_labels))
print(tokenized[1].keys())
print(tokenized[1]["labels"])
print(len(tokenized[1]["labels"]))
print(len(tokenized[1]["input_ids"]))
print(len(tokenized[1]["attention_mask"]))
print("")
print("10 data instance")
print(train_ds[10])
labels = tokenized[10]["labels"]
non_negative_labels = [x for x in labels if x >= 0]
print(tokenizer.decode(non_negative_labels))
print(tokenized[10].keys())
print(tokenized[10]["labels"])
print(len(tokenized[10]["labels"]))
print(len(tokenized[10]["input_ids"]))
print(len(tokenized[10]["attention_mask"]))

1 data instance
{'prompt': '### Instruction:\nPlease choose the correct answer (A, B, C, D) based on the multiple-choice medical question below. Output the letter corresponding to the best choice followed by a brief rationale starting with "Explanation:".\n\nQuestion: Which vitamin is supplied from only animal source:\nA. Vitamin C\nB. Vitamin B7\nC. Vitamin B12\nD. Vitamin D\n### Response: ', 'label': "C Explanation: Ans. (c) Vitamin B12 Ref: Harrison's 19th ed. P 640* Vitamin B12 (Cobalamin) is synthesized solely by microorganisms.* In humans, the only source for humans is food of animal origin, e.g., meat, fish, and dairy products.* Vegetables, fruits, and other foods of nonanimal origin doesn't contain Vitamin B12 .* Daily requirements of vitamin Bp is about 1-3 pg. Body stores are of the order of 2-3 mg, sufficient for 3-4 years if supplies are completely cut off."}
 C Explanation: Ans. (c) Vitamin B12 Ref: Harrison's 19th ed. P 640* Vitamin B12 (Cobalamin) is synthesized solely b

In [15]:
# 4bit-QLoRA fine-tuning with noise=0.2
print("Starting 4bit QLoRA fine-tuning with noise...")
path_4bit_qlora = "path_4bit_qlora"
final_tokenizer = "final_tokenizer"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model_4bit = AutoModelForCausalLM.from_pretrained(
    base_model, quantization_config=bnb_config, device_map="auto"
)
print_trainable_parameters(model_4bit)
model_4bit = prepare_model_for_kbit_training(model_4bit)


lora_cfg = LoraConfig(r=8, lora_alpha=16, target_modules=['q_proj','k_proj','v_proj','o_proj'], lora_dropout=0.05, bias='none', task_type='CAUSAL_LM')
peft_model = get_peft_model(model_4bit, lora_cfg)
peft_model.config.use_cache = False

trainer = SFTTrainer(
    model=peft_model,
    args=SFTConfig(output_dir=os.path.join(lora_output, path_4bit_qlora), neftune_noise_alpha=0.2, warmup_steps=5,
                    per_device_train_batch_size=4, gradient_accumulation_steps=4,
                    max_steps=max_train_steps, learning_rate=2.5e-5, fp16=True, logging_steps=50,
                    logging_dir="./logs", save_strategy="steps", run_name="medmcqa_FT",
                    save_steps=max_train_steps//5, optim='paged_adamw_8bit', report_to="wandb",
                    lr_scheduler_type='cosine'),
    train_dataset=tokenized,
)
trainer.train()
tokenizer.save_pretrained(os.path.join(lora_output, path_4bit_qlora, final_tokenizer))

Starting 4bit QLoRA fine-tuning with noise...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 1050939392 || all params: 4540600320 || trainable%: 23.145384264959926


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss
50,3.3798
100,0.355
150,0.2867
200,0.2687
250,0.26
300,0.2683
350,0.2642
400,0.2706
450,0.2582
500,0.2625


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


('/root/789pj/lora_output/path_4bit_qlora/final_tokenizer/tokenizer_config.json',
 '/root/789pj/lora_output/path_4bit_qlora/final_tokenizer/special_tokens_map.json',
 '/root/789pj/lora_output/path_4bit_qlora/final_tokenizer/tokenizer.json')

In [None]:
# 8bit-QLoRA fine-tuning with noise=0.2
print("Starting 8bit QLoRA fine-tuning with noise...")
path_8bit_qlora = "path_8bit_qlora"
final_tokenizer = "final_tokenizer"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16,
)
model_8bit = AutoModelForCausalLM.from_pretrained(
    base_model, quantization_config=bnb_config, device_map="auto"
)
print_trainable_parameters(model_8bit)
model_8bit = prepare_model_for_kbit_training(model_8bit)

lora_cfg = LoraConfig(r=8, lora_alpha=16, target_modules=['q_proj','k_proj','v_proj','o_proj'], lora_dropout=0.05, bias='none', task_type='CAUSAL_LM')
peft_model = get_peft_model(model_8bit, lora_cfg)
peft_model.config.use_cache = False

trainer = SFTTrainer(
    model=peft_model,
    args=SFTConfig(output_dir=os.path.join(lora_output, path_8bit_qlora), neftune_noise_alpha=0.2, warmup_steps=5,
                    per_device_train_batch_size=4, gradient_accumulation_steps=4,
                    max_steps=max_train_steps, learning_rate=2.5e-5, fp16=True, logging_steps=50,
                    logging_dir="./logs", save_strategy="steps", run_name="medmcqa_FT",
                    save_steps=max_train_steps//5, optim='paged_adamw_8bit', report_to="wandb",
                    lr_scheduler_type='cosine'),
    train_dataset=tokenized,
)
trainer.train()
tokenizer.save_pretrained(os.path.join(lora_output, path_8bit_qlora, final_tokenizer))

In [None]:
# 4bit-QLoRA fine-tuning
print("Starting 4bit QLoRA fine-tuning without noise...")
path_4bit_qlora_no_noise = "path_4bit_qlora_no_noise"
final_tokenizer = "final_tokenizer"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model_4bit = AutoModelForCausalLM.from_pretrained(
    base_model, quantization_config=bnb_config, device_map="auto"
)
print_trainable_parameters(model_4bit)
model_4bit = prepare_model_for_kbit_training(model_4bit)


lora_cfg = LoraConfig(r=8, lora_alpha=16, target_modules=['q_proj','k_proj','v_proj','o_proj'], lora_dropout=0.05, bias='none', task_type='CAUSAL_LM')
peft_model = get_peft_model(model_4bit, lora_cfg)
peft_model.config.use_cache = False

trainer = SFTTrainer(
    model=peft_model,
    args=SFTConfig(output_dir=os.path.join(lora_output, path_4bit_qlora_no_noise), neftune_noise_alpha=0, warmup_steps=5,
                    per_device_train_batch_size=4, gradient_accumulation_steps=4,
                    max_steps=max_train_steps, learning_rate=2.5e-5, fp16=True, logging_steps=50,
                    logging_dir="./logs", save_strategy="steps", run_name="medmcqa_FT",
                    save_steps=max_train_steps//5, optim='paged_adamw_8bit', report_to="wandb",
                    lr_scheduler_type='cosine'),
    train_dataset=tokenized,
)
trainer.train()
tokenizer.save_pretrained(os.path.join(lora_output, path_4bit_qlora_no_noise, final_tokenizer))

In [None]:
# 8bit-QLoRA fine-tuning
print("Starting 8bit QLoRA fine-tuning without noise...")
path_8bit_qlora_no_noise = "path_8bit_qlora_no_noise"
final_tokenizer = "final_tokenizer"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16,
)
model_8bit = AutoModelForCausalLM.from_pretrained(
    base_model, quantization_config=bnb_config, device_map="auto"
)
print_trainable_parameters(model_8bit)
model_8bit = prepare_model_for_kbit_training(model_8bit)

lora_cfg = LoraConfig(r=8, lora_alpha=16, target_modules=['q_proj','k_proj','v_proj','o_proj'], lora_dropout=0.05, bias='none', task_type='CAUSAL_LM')
peft_model = get_peft_model(model_8bit, lora_cfg)
peft_model.config.use_cache = False

trainer = SFTTrainer(
    model=peft_model,
    args=SFTConfig(output_dir=os.path.join(lora_output, path_8bit_qlora_no_noise), neftune_noise_alpha=0, warmup_steps=5,
                    per_device_train_batch_size=4, gradient_accumulation_steps=4,
                    max_steps=max_train_steps, learning_rate=2.5e-5, fp16=True, logging_steps=50,
                    logging_dir="./logs", save_strategy="steps", run_name="medmcqa_FT",
                    save_steps=max_train_steps//5, optim='paged_adamw_8bit', report_to="wandb",
                    lr_scheduler_type='cosine'),
    train_dataset=tokenized,
)
trainer.train()
tokenizer.save_pretrained(os.path.join(lora_output, path_8bit_qlora_no_noise, final_tokenizer))

In [None]:
wandb.finish()

In [None]:
# Evaluate LoRA model
path_4bit_qlora = "path_4bit_qlora"
print("Evaluating 4bit QLoRA model with noise...")

lora_model_loaded = AutoModelForCausalLM.from_pretrained(os.path.join(lora_output, path_4bit_qlora, "checkpoint-500")).to(device)
lora_model_loaded.config.use_cache = True
lora_model_loaded.eval()

start_time = time.time()
acc2 = evaluate_model(lora_model_loaded, tokenizer, val_ds, device, max_length = inference_max_length)
end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.4f} seconds")
print("")
print_metrics(acc2, "4bit QLoRA with noise model inference accuracy")

Evaluating 4bit QLoRA model with noise...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

                                                                 

--- LoRA Model ---
Accuracy       : 57.69%
Macro Precision: 57.86%
Macro Recall   : 57.34%
Macro F1       : 57.44%
Confusion Matrix:
    A  B  C  D
A | 829  214  196  109
B | 261  573  159   92
C | 171  105  571   78
D | 146  109  130  440

Classification Report:
               precision    recall  f1-score   support

           A       0.59      0.61      0.60      1348
           B       0.57      0.53      0.55      1085
           C       0.54      0.62      0.58       925
           D       0.61      0.53      0.57       825

    accuracy                           0.58      4183
   macro avg       0.58      0.57      0.57      4183
weighted avg       0.58      0.58      0.58      4183





In [None]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Evaluate LoRA model
print("Evaluating 8bit QLoRA model with noise...")

lora_model_loaded = AutoModelForCausalLM.from_pretrained(os.path.join(lora_output, path_8bit_qlora, "checkpoint-500")).to(device)
lora_model_loaded.config.use_cache = True
lora_model_loaded.eval()

start_time = time.time()
acc2 = evaluate_model(lora_model_loaded, tokenizer, val_ds, device, max_length = inference_max_length)
end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.4f} seconds")
print("")
print_metrics(acc2, "8bit QLoRA with noise model inference accuracy")

In [None]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Evaluate LoRA model
print("Evaluating 4bit QLoRA model without noise...")

lora_model_loaded = AutoModelForCausalLM.from_pretrained(os.path.join(lora_output, path_4bit_qlora_no_noise, "checkpoint-500")).to(device)
lora_model_loaded.config.use_cache = True
lora_model_loaded.eval()

start_time = time.time()
acc2 = evaluate_model(lora_model_loaded, tokenizer, val_ds, device, max_length = inference_max_length)
end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.4f} seconds")
print("")
print_metrics(acc2, "4bit QLoRA without noise model inference accuracy")

In [None]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Evaluate LoRA model
print("Evaluating 8bit QLoRA model without noise...")

lora_model_loaded = AutoModelForCausalLM.from_pretrained(os.path.join(lora_output, path_8bit_qlora_no_noise, "checkpoint-500")).to(device)
lora_model_loaded.config.use_cache = True
lora_model_loaded.eval()

start_time = time.time()
acc2 = evaluate_model(lora_model_loaded, tokenizer, val_ds, device, max_length = inference_max_length)
end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.4f} seconds")
print("")
print_metrics(acc2, "8bit QLoRA without noise model inference accuracy")

In [None]:
gc.collect()
torch.cuda.empty_cache()