In [1]:
import os

In [2]:
# Pin this kernel to GPU index 5; set here, ahead of the torch import in the next cell.
os.environ.update(CUDA_VISIBLE_DEVICES="5")

In [3]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


### Perform a basic inference using the base instruct model from HuggingFace

In [None]:
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Chat-style input: a system instruction followed by a single user question.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Should I let my dog sleep on the bed with me?"},
]

# Text-generation pipeline around the base instruct model, loaded in bfloat16.
pipe = pipeline("text-generation", model=model_id, device_map="auto", torch_dtype=torch.bfloat16)

# Sample (rather than greedy-decode) up to 1024 new tokens.
outputs = pipe(messages, do_sample=True, max_new_tokens=1024)

# Print the content of the last message of the first returned conversation.
print(outputs[0]["generated_text"][-1]['content'])

### Writing the Fine-Tuning Code

In [4]:
# Read the Hugging Face access token from a local key file so the secret
# never appears in the notebook itself.
# NOTE(review): hf_token is not passed to any call in the visible cells —
# confirm whether authentication relies on a cached login instead.
with open('hf_token.key', 'r') as f:
    # Strip surrounding whitespace: key files normally end with a newline,
    # which is not part of the token and would corrupt an auth header.
    hf_token = f.read().strip()

model_id = "meta-llama/Llama-3.2-3B-Instruct"

In [5]:
dataset_name = "ruslanmv/ai-medical-chatbot"
# Load every split as one dataset, shuffle with a fixed seed, and keep the
# first 1000 rows as the working sample.
dataset = (
    load_dataset(dataset_name, split="all")
    .shuffle(seed=42)
    .select(range(1000))
)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Cap sequences at 1024 tokens and pad on the right-hand side.
tokenizer.model_max_length = 1024
tokenizer.padding_side = "right"
# Use a dedicated pad token instead of reusing the eos token, so that the eos
# token is still recognized. Background:
#   https://github.com/unslothai/unsloth/issues/416
#   https://github.com/huggingface/transformers/issues/22794
#   https://github.com/huggingface/transformers/issues/23230
tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,  # Must be float32 for MacBooks!
)

# Point the model config at the special pad token registered on the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
# Turn off the generation key/value cache for training.
model.config.use_cache = False

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]


In [12]:
# Define a function to apply the chat template
def format_chat_template(example):
    """Render one Patient/Doctor pair as a chat-template training string.

    Args:
        example: A dataset row with a ``'Patient'`` field (used as the user
            turn) and a ``'Doctor'`` field (used as the assistant turn).

    Returns:
        A dict with the single key ``"prompt"`` holding the rendered text.
    """
    messages = [
        {"role": "user", "content": example['Patient']},
        {"role": "assistant", "content": example['Doctor']}
    ]
    # The conversation already ends with the assistant's answer, so no
    # generation prompt is requested: with add_generation_prompt=True the
    # template appends an empty assistant header after the final turn, and the
    # model would be trained to emit that header once it finishes answering.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return {"prompt": prompt}

In [19]:
# Render every row with the chat template (adds a "prompt" column).
new_dataset = dataset.map(format_chat_template)
# Hold out 5% of the rows for evaluation. The split is seeded so it is
# reproducible across kernel restarts, like the seeded shuffle that sampled
# the rows.
new_dataset = new_dataset.train_test_split(test_size=0.05, seed=42)

In [24]:
# Grab the rendered prompt of the first training row for a quick look.
test_example = new_dataset['train'][0]['prompt']

In [28]:
# Tokenize the sample prompt for inspection. The prompt text was rendered by
# the chat template and already carries its special tokens, so the tokenizer
# is told not to add them again (otherwise the beginning-of-text token is
# duplicated).
test_example_tokenizer = tokenizer(
    test_example,
    padding="max_length",
    truncation=True,
    add_special_tokens=False,
)

In [36]:
# Tokenize the data
def tokenize_function(example):
    """Tokenize one chat-formatted example and attach its training labels.

    Args:
        example: A dataset row whose ``'prompt'`` field holds text already
            rendered by the chat template.

    Returns:
        The tokenizer output, padded/truncated to the tokenizer's maximum
        length, with an added ``'labels'`` list that copies ``input_ids`` but
        replaces every pad token id with -100.
    """
    # The chat template has already written the special tokens (including the
    # beginning-of-text token) into the prompt string; letting the tokenizer
    # add them again would put a duplicate at the start of every sample.
    tokens = tokenizer(
        example['prompt'],
        padding="max_length",
        truncation=True,
        add_special_tokens=False,
    )
    # Set padding token labels to -100 to ignore them in loss calculation
    pad_id = tokenizer.pad_token_id
    tokens['labels'] = [
        -100 if token == pad_id else token for token in tokens['input_ids']
    ]
    return tokens

In [44]:
# Tokenize row by row, then drop the raw text columns.
tokenized_dataset = (
    new_dataset
    .map(tokenize_function)
    .remove_columns(['Description', 'Patient', 'Doctor', 'prompt'])
)

In [29]:
model.train()
training_args = TrainingArguments(
    # --- where checkpoints go and how often they are written
    output_dir="./llama32-sft-fine-tuned-model",
    save_steps=150,
    # --- evaluation / logging cadence (evaluate during training)
    eval_strategy="steps",
    eval_steps=40,
    logging_steps=40,
    log_level="info",
    report_to="none",  # Here we can use something like tensorboard to see the training metrics
    # --- batch sizes (adjust based on your hardware)
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # --- optimization
    num_train_epochs=2,  # How many times to loop through the dataset
    learning_rate=1e-5,  # Would avoid larger values here
    max_grad_norm=2,  # Clipping the gradients is always a good idea
    fp16=False,  # Must be False for MacBooks
)

In [None]:
# Wire the model, the training arguments, both dataset splits and the
# tokenizer into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
)

In [None]:
# Train the model: runs the fine-tuning loop on the Trainer's train dataset
# using the settings in `training_args` (step-based evaluation and logging).
# Left as the cell's last expression so its return value is displayed.
trainer.train()

In [None]:
# Write the trained model and the tokenizer into the same directory.
save_dir = "./llama32-sft-fine-tuned-model"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Run a sample medical question through the fine-tuned checkpoint saved in
# the local "llama32-sft-fine-tuned-model" directory.
# (`pipeline` and `torch` come from the import cell at the top of the notebook.)
model_id = "llama32-sft-fine-tuned-model"
pipe = pipeline(
    "text-generation",
    model=model_id,
    # Load in bfloat16, the dtype used for the base-model inference and for
    # training, so the before/after comparison runs under the same precision
    # and the checkpoint is not loaded in a wider dtype than necessary.
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
messages = [
    {"role": "user", "content": "I am feeling a high fever. What do you recommend?"},
]
outputs = pipe(
    messages,
    max_new_tokens=1024
)
# Print the content of the last message of the first returned conversation.
print(outputs[0]["generated_text"][-1]['content'])