In [1]:
import os

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.trainer import TrainingArguments
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer
from datasets import load_dataset
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset_name = "ruslanmv/ai-medical-chatbot"
dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=42).select(range(1000))

In [5]:
def format_generation_template(row):    
    row["text"] = f"Patient: {row['Patient']}\n\nDoctor: {row['Doctor']}"
    return row

In [6]:
dataset = dataset.map(
            format_generation_template,
            num_proc=4
        )

dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [10]:
with open('hf_token.key', 'r') as f:
    hf_token = f.read()

base_model = "meta-llama/Meta-Llama-3-8B"
new_model = "llama-3-8b-base-counsel"

In [11]:
tokenizer = AutoTokenizer.from_pretrained(base_model, padding='max_length', truncation=True, token = hf_token)
tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})
tokenizer.padding_side = "right"
tokenizer.model_max_length = 256

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# QLoRA Config for 4-bit quntization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

# # For 8 bit quantization
# bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=200.0)

In [14]:
# Load Model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config = bnb_config,
    torch_dtype = torch.bfloat16,
    device_map={'':torch.cuda.current_device()}
)

model.config.use_cache=False
model.config.pad_token_id = tokenizer.pad_token_id # Updating the model config to use the special pad token

Loading checkpoint shards: 100%|██████████| 4/4 [00:22<00:00,  5.58s/it]


In [15]:
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type = "CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
)

model = get_peft_model(model, peft_config)

In [16]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    overwrite_output_dir=True,
    bf16=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=0.1,
    learning_rate=2e-4,
    logging_steps=5,
    logging_strategy="steps",
    log_level="info",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    warmup_steps=10,
    group_by_length=True,
    report_to="none",
    seed=42
)

In [17]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    peft_config=peft_config,
    max_seq_length=tokenizer.model_max_length,
    packing= False
)

Map: 100%|██████████| 900/900 [00:00<00:00, 1944.16 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1735.23 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Using auto half precision backend


In [18]:
trainer.train()

***** Running training *****
  Num examples = 900
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 2
  Total optimization steps = 900
  Number of trainable parameters = 167,772,160


Step,Training Loss,Validation Loss
90,2.541,2.553343
180,2.544,2.52022
270,2.4244,2.50589
360,2.5493,2.49078
450,2.6496,2.475557
540,2.1728,2.494133
630,2.1613,2.497191
720,2.1026,2.484085
810,1.9621,2.497351
900,2.0402,2.4855


***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
Saving model checkpoint to llama-3-8b-base-counsel/tmp-checkpoint-450
loading configuration file config.json from cache at /data/mn27889/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_laye

TrainOutput(global_step=900, training_loss=2.317001262770759, metrics={'train_runtime': 766.1622, 'train_samples_per_second': 2.349, 'train_steps_per_second': 1.175, 'total_flos': 1.665681100652544e+16, 'train_loss': 2.317001262770759, 'epoch': 2.0})

In [19]:
trainer.model.save_pretrained(new_model)

loading configuration file config.json from cache at /data/mn27889/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 128256
}



### Merging the base model with the adapter to get full model

In [23]:
base_model = "meta-llama/Meta-Llama-3-8B"
new_model = "llama-3-8b-base-counsel"

In [24]:
tokenizer = AutoTokenizer.from_pretrained(base_model, padding='max_length', truncation=True, token = hf_token)
tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})
tokenizer.padding_side = "right"
tokenizer.model_max_length = 256

loading file tokenizer.json from cache at /data/mn27889/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /data/mn27889/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/special_tokens_map.json
loading file tokenizer_config.json from cache at /data/mn27889/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/tokenizer_config.json


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
# Load Model
base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,    
    trust_remote_code=True,
    torch_dtype = torch.bfloat16,
    device_map={"":torch.cuda.current_device()}
)

loading configuration file config.json from cache at /data/mn27889/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Meta-Llama-3-8B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 128256
}

loading weights file model.safetensors from cache at /data/mn27889/.cache/huggingface/h

In [27]:
model = PeftModel.from_pretrained(base_model_reload, new_model)

In [28]:
model = model.merge_and_unload()
model.config.pad_token_id = tokenizer.pad_token_id # Updating the model config to use the special pad token

In [29]:
model.save_pretrained("llama-3-8b-base-chat-doctor")
tokenizer.save_pretrained("llama-3-8b-base-chat-doctor")

Configuration saved in llama-3-8b-base-chat-doctor/config.json
Configuration saved in llama-3-8b-base-chat-doctor/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at llama-3-8b-base-chat-doctor/model.safetensors.index.json.
tokenizer config file saved in llama-3-8b-base-chat-doctor/tokenizer_config.json
Special tokens file saved in llama-3-8b-base-chat-doctor/special_tokens_map.json


('llama-3-8b-base-chat-doctor/tokenizer_config.json',
 'llama-3-8b-base-chat-doctor/special_tokens_map.json',
 'llama-3-8b-base-chat-doctor/tokenizer.json')

### Load merged Model and Tokenizer for Inference

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    "llama-3-8b-base-chat-doctor",
    torch_dtype = torch.bfloat16,
    device_map={'':torch.cuda.current_device()}
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.58s/it]


In [5]:
tokenizer = AutoTokenizer.from_pretrained("llama-3-8b-base-chat-doctor")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [26]:
patient_text = "I am suffering from fever. How can I get rid of this?"
input_text = "Patient: " + patient_text + "\n\nDoctor:"

In [8]:
inputs = tokenizer(input_text, return_tensors='pt').to(model.device)

In [12]:
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

In [23]:
generation_config = model.generation_config
generation_config.pad_token_id = tokenizer.pad_token_id
generation_config.repetition_penalty = 2.0

outputs = model.generate(
    **inputs,
    max_new_tokens=tokenizer.model_max_length,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    num_return_sequences=1,
    generation_config = generation_config
)

In [24]:
text = tokenizer.decode(outputs[0], skip_special_tokens=False)

In [25]:
print(text)

<|begin_of_text|>Patient: I am suffering from fever. How can I get rid of this?

Doctor:  Hello, Welcome to HCM!     The symptoms you describe are likely due viral illness or infection like upper respiratory tract infections (URTI) which usually last for about a week and may include coughing that is worse at night as well the sore throat. So it's important not just take medicine but also rest, drink plenty fluids such water juice etc., use warm compresses on your chest if needed while avoiding smoking alcohol caffeine sugar spicy foods until better hope my answer was helpful Regards Dr.Sohil Patel General & Family Physician @Hippocrates Clinic Ahmedabad India   If u have more questions feel free ask me anytime...regards!!    Thanks For Choosing HCm!! :)            Good Luck And Take Care.. :):).Hope ur problem solved now.if any clarification then do nt hesitate in asking.welcome again.:D                   Thank You..:0)..Good Day To U....!!!!!!!:)                 Wish A Very Happy New 