In [3]:
import os

In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.trainer import TrainingArguments
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer
from datasets import load_dataset
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset_name = "ruslanmv/ai-medical-chatbot"
dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=42).select(range(1000))

In [5]:
def format_generation_template(row):    
    row["text"] = f"Patient: {row['Patient']}\n\nDoctor: {row['Doctor']}"
    return row

In [6]:
dataset = dataset.map(
            format_generation_template,
            num_proc=4
        )

dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [7]:
with open('hf_token.key', 'r') as f:
    hf_token = f.read()

base_model = "meta-llama/Meta-Llama-3-8B"
new_model = "llama-3-8b-base-counsel"

In [8]:
tokenizer = AutoTokenizer.from_pretrained(base_model, padding='max_length', truncation=True, token = hf_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.model_max_length = 256

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# QLoRA Config for 4-bit quntization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

# # For 8 bit quantization
# bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=200.0)

In [10]:
# Load Model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config = bnb_config,
    torch_dtype = torch.bfloat16,
    device_map={'':torch.cuda.current_device()}
)

model.config.use_cache=False

Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.09s/it]


In [11]:
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type = "CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
)

model = get_peft_model(model, peft_config)

In [12]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    overwrite_output_dir=True,
    bf16=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=0.1,
    learning_rate=2e-4,
    logging_steps=5,
    logging_strategy="steps",
    log_level="info",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    warmup_steps=10,
    group_by_length=True,
    report_to="none",
    seed=42
)

In [13]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    peft_config=peft_config,
    max_seq_length=tokenizer.model_max_length,
    packing= False
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Using auto half precision backend


In [14]:
trainer.train()

***** Running training *****
  Num examples = 900
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 2
  Total optimization steps = 900
  Number of trainable parameters = 167,772,160


Step,Training Loss,Validation Loss
90,2.5428,2.55643
180,2.5452,2.519853
270,2.4263,2.509076
360,2.5512,2.492161
450,2.6524,2.475791
540,2.168,2.495593
630,2.164,2.487162
720,2.1033,2.483036
810,1.9489,2.495726
900,2.0378,2.484911


***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
Saving model checkpoint to llama-3-8b-base-counsel/tmp-checkpoint-450
loading configuration file config.json from cache at /data/mn27889/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_laye

TrainOutput(global_step=900, training_loss=2.318307656182183, metrics={'train_runtime': 781.1668, 'train_samples_per_second': 2.304, 'train_steps_per_second': 1.152, 'total_flos': 1.665681100652544e+16, 'train_loss': 2.318307656182183, 'epoch': 2.0})

In [15]:
trainer.model.save_pretrained(new_model)

loading configuration file config.json from cache at /data/mn27889/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 128256
}



### Merging the base model with the adapter to get full model

In [16]:
base_model = "meta-llama/Meta-Llama-3-8B"
new_model = "llama-3-8b-base-counsel"

In [17]:
# Load Model
base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,    
    trust_remote_code=True,
    torch_dtype = torch.bfloat16,
    device_map={"":torch.cuda.current_device()}
)

loading configuration file config.json from cache at /data/mn27889/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Meta-Llama-3-8B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 128256
}

loading weights file model.safetensors from cache at /data/mn27889/.cache/huggingface/h

In [18]:
tokenizer = AutoTokenizer.from_pretrained(base_model, padding='max_length', truncation=True, token = hf_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.model_max_length = 256

loading file tokenizer.json from cache at /data/mn27889/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /data/mn27889/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/special_tokens_map.json
loading file tokenizer_config.json from cache at /data/mn27889/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
model = PeftModel.from_pretrained(base_model_reload, new_model)

In [20]:
model = model.merge_and_unload()

In [21]:
model.save_pretrained("llama-3-8b-base-chat-doctor")
tokenizer.save_pretrained("llama-3-8b-base-chat-doctor")

Configuration saved in llama-3-8b-base-chat-doctor/config.json
Configuration saved in llama-3-8b-base-chat-doctor/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at llama-3-8b-base-chat-doctor/model.safetensors.index.json.
tokenizer config file saved in llama-3-8b-base-chat-doctor/tokenizer_config.json
Special tokens file saved in llama-3-8b-base-chat-doctor/special_tokens_map.json


('llama-3-8b-base-chat-doctor/tokenizer_config.json',
 'llama-3-8b-base-chat-doctor/special_tokens_map.json',
 'llama-3-8b-base-chat-doctor/tokenizer.json')

### Load merged Model and Tokenizer for Inference

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    "llama-3-8b-base-chat-doctor",
    torch_dtype = torch.bfloat16,
    device_map={'':torch.cuda.current_device()}
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.53s/it]


In [8]:
tokenizer = AutoTokenizer.from_pretrained("llama-3-8b-base-chat-doctor", padding='max_length', truncation=True)
tokenizer.padding_side = "left"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
input_text = "I am suffering from Acne. How can I get rid of this?"

In [10]:
inputs = tokenizer(input_text, return_tensors='pt', padding='max_length', truncation=True).to(model.device)

In [11]:
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

In [12]:
generation_config = model.generation_config
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.repetition_penalty = 2.0

outputs = model.generate(
    **inputs,
    max_new_tokens=tokenizer.model_max_length,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    num_return_sequences=1,
    generation_config = generation_config
)

In [13]:
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [14]:
print(text)

I am suffering from Acne. How can I get rid of this? What is the best way to clear acne?
Hi... Welcome on HCM.. There are many ways you could try for getting away with your problem.... but before that, please tell us if its really a serious one or just something which bothers u sometimes and doesn't bother anyone else??? 1) Use any soap containing benzoyl peroxide (like Clearasil). You may have dry skin after using it so use moisturizing lotion as well. Do not overuse though coz then pimples will come back more fiercely than ever! :) Wash face daily morning & evening thoroughly, twice in case required(especially when coming out due some reason like partying etc.)2 ) Keep hands off ur nose&face all day long especially while eating because oily foods cause breakouts too3)Vit B complex helps tremendously4)Lemon juice works wonders.. Apply at night regularly5)treatment cream by dermatologist6)get facial once every fortnight7)dont touch / squeeze them8)papaya leaves apply paste9)baking soda