In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, get_peft_model, prepare_model_for_int8_training, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from transformers.trainer import TrainingArguments
from trl import SFTTrainer, setup_chat_format

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the Hugging Face access token from a local key file so the secret
# never has to be hardcoded in the notebook.
with open('hf_token.key', 'r') as f:
    # Strip surrounding whitespace: key files usually end with a newline,
    # which would otherwise be passed along as part of the token.
    hf_token = f.read().strip()

In [3]:
# Hub id of the checkpoint to fine-tune, and the name used for the training
# output directory / saved adapter.
base_model, new_model = "meta-llama/Meta-Llama-3-8B-Instruct", "llama-3-8b-counsel"

# Single dtype shared by the quantization compute dtype and the model load.
torch_dtype = torch.float16

In [4]:
# QLoRA quantization settings: load weights in 4-bit NF4 with double
# quantization, and run the compute in `torch_dtype`.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch_dtype,
    load_in_4bit=True,
)

In [5]:
# Load the base model with the QLoRA quantization config applied.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch_dtype,
    device_map={"": 0},
    # Pass the access token explicitly, exactly as the tokenizer load does;
    # otherwise the download depends on an ambient `huggingface-cli login`.
    token=hf_token,
)

# Turn the generation cache off for training; the evaluation cell turns it
# back on before calling `generate`.
model.config.use_cache = False

Loading checkpoint shards: 100%|██████████| 4/4 [00:22<00:00,  5.52s/it]


In [6]:
# Tokenizer for the base checkpoint (authenticated with the same token).
tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token)

# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# The instruct checkpoint's tokenizer already carries its own chat template.
# Newer TRL releases refuse to run `setup_chat_format` in that case, so clear
# the template first; on older releases it is simply overwritten anyway.
tokenizer.chat_template = None

# Install TRL's default chat format on both the tokenizer and the model.
model, tokenizer = setup_chat_format(model, tokenizer)

In [7]:
# LoRA adapter hyper-parameters: rank 16, alpha 32, 5% dropout, no bias
# terms, attached to the attention and MLP projection layers listed below.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the LoRA adapters.
# NOTE(review): `peft_config` is also handed to SFTTrainer further down while
# `model` is already wrapped here — confirm the trainer does not re-wrap it.
model = get_peft_model(model, peft_config)

NameError: name 'model' is not defined

In [8]:
dataset_name = "ruslanmv/ai-medical-chatbot"

# Pull every split, shuffle deterministically, and keep a 1000-row sample.
dataset = (
    load_dataset(dataset_name, split="all")
    .shuffle(seed=42)
    .select(range(1000))
)

In [9]:
def format_chat_template(row):
    """Render one Patient/Doctor exchange as chat-template text.

    Reads ``row['Patient']`` and ``row['Doctor']``, builds a two-turn
    user/assistant conversation, and stores the string produced by the
    notebook-level ``tokenizer``'s chat template under ``row["text"]``.

    Returns the same ``row`` object with the added ``"text"`` entry.
    """
    turns = (("user", 'Patient'), ("assistant", 'Doctor'))
    conversation = [
        {"role": speaker, "content": row[column]}
        for speaker, column in turns
    ]

    # tokenize=False: produce the formatted string, not token ids.
    row["text"] = tokenizer.apply_chat_template(conversation, tokenize=False)
    return row

In [10]:
# Add the chat-formatted "text" column to every row, using 4 worker processes.
dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)

# Hold out 10% of the rows for evaluation. The split is seeded (matching the
# seed used for the shuffle) so a Restart & Run All yields the same
# train/test partition every time.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

Map (num_proc=4): 100%|██████████| 1000/1000 [00:00<00:00, 4767.39 examples/s]


In [None]:
# Trainer hyper-parameters, grouped by concern.
training_arguments = TrainingArguments(
    # Output location and checkpointing.
    output_dir=new_model,
    save_strategy="epoch",
    # Batch sizing: one sample per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Optimisation schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    warmup_steps=10,
    num_train_epochs=1,
    # Mixed-precision flags are both left off.
    fp16=False,
    bf16=False,
    # Evaluation cadence.
    # NOTE(review): a fractional eval_steps is presumably read as a share of
    # total training steps — confirm against the installed transformers.
    evaluation_strategy="steps",
    eval_steps=0.2,
    # Logging: every step, no external reporting integration.
    logging_strategy="steps",
    logging_steps=1,
    report_to="none",
)

In [None]:
# Supervised fine-tuning trainer over the chat-formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)

In [None]:
# Run the fine-tuning loop with the trainer built in the previous cell.
trainer.train()

### Saving the trained model

In [None]:
# Save the trainer's model under the `new_model` name.
# NOTE(review): since the model was wrapped by get_peft_model, this presumably
# writes only the adapter weights — confirm before relying on it.
trainer.model.save_pretrained(new_model)

### Model Evaluation

In [None]:
# Re-enable the generation cache that was switched off for training.
model.config.use_cache = True

# Single-turn smoke-test conversation.
messages = [
    {
        "role": "user",
        "content": "Hello doctor, I have bad acne. How do I get rid of it?"
    }
]

# Render the conversation as text, ending with the assistant generation prompt.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_length=150, num_return_sequences=1)

# The generated sequence starts with the prompt tokens. Slice them off and
# decode only the newly generated part, instead of splitting the decoded text
# on the word "assistant" — that breaks whenever the prompt or the reply
# itself contains that word.
prompt_length = inputs["input_ids"].shape[1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(text)

### Merging the base model with the adapter

In [None]:
# Re-declare the checkpoint id, adapter name and dtype for the merge section.
base_model, new_model = "meta-llama/Meta-Llama-3-8B-Instruct", "llama-3-8b-counsel"
torch_dtype = torch.float16

In [None]:
# Reload the base model without quantization (in `torch_dtype`) so the
# adapter can be merged into it.
base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    # NOTE(review): trust_remote_code permits running code shipped in the
    # model repo — confirm it is actually needed for this checkpoint.
    trust_remote_code=True,
    torch_dtype=torch_dtype,
    device_map={"": 0},
    # Pass the access token explicitly, exactly as the tokenizer load does;
    # otherwise the download depends on an ambient `huggingface-cli login`.
    token=hf_token,
)

In [None]:
# Fresh tokenizer for the merge section, configured like the training one.
tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token)

# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# The instruct checkpoint's tokenizer already carries its own chat template.
# Newer TRL releases refuse to run `setup_chat_format` in that case, so clear
# the template first; on older releases it is simply overwritten anyway.
tokenizer.chat_template = None

# Apply the same chat format that was used for training, so the reloaded
# base model and tokenizer line up with the saved adapter.
base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

### Merge the adapter with the base model

In [1]:
# Attach the adapter stored under `new_model` to the reloaded base model.
model = PeftModel.from_pretrained(base_model_reload, new_model)

In [None]:
# Merge the adapter into the base model; `model` is rebound to the result.
model = model.merge_and_unload()

### Model inference from the merged model

In [None]:
# Single-turn smoke-test conversation for the merged model.
messages = [{"role": "user", "content": "Hello doctor, I have bad acne. How do I get rid of it?"}]

# Render the conversation as text, ending with the assistant generation prompt.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# `pipeline` is the transformers factory function; it has to be imported
# together with the other transformers names for this cell to run.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch_dtype,  # same float16 dtype used throughout the notebook
    device_map="auto",
)

# Sampled generation, capped at 120 new tokens.
outputs = pipe(prompt, max_new_tokens=120, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])

In [None]:
# Write the merged model and then its tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("llama-3-8b-chat-doctor")