In [None]:
%%capture
# The %%capture magic above hides all pip output for this cell and must stay on the first line.
# Step 1: install the released unsloth package together with its dependencies.
!pip install unsloth
# Also get the latest nightly Unsloth!
# Step 2: replace the release with the current GitHub HEAD. --no-deps leaves the
# dependencies from step 1 in place; --no-cache-dir bypasses pip's download cache.
# NOTE(review): nothing is version-pinned here, so results depend on the day this cell runs.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options reused by the later FastLanguageModel.from_pretrained calls.
max_seq_length = 512  # Any value works; Unsloth handles RoPE scaling internally.
dtype = None          # None = auto-detect. Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
load_in_4bit = True   # 4-bit quantization to reduce memory usage; may be set to False.

# Start from the base Llama 3.1 8B checkpoint (not the instruct variant).
base_model_options = {
    "model_name": "unsloth/Meta-Llama-3.1-8B",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.2: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [None]:
# Projection layers that receive LoRA adapters: attention (q/k/v/o) and MLP (gate/up/down).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Rebinds `model` to the adapter-wrapped model returned by get_peft_model.
# Re-running this cell would pass the already wrapped model in again.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,                                  # LoRA rank; any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,                        # Any value is supported, 0 is the optimized path
    bias = "none",                           # Any value is supported, "none" is the optimized path
    # "unsloth" checkpointing uses 30% less VRAM and fits 2x larger batch sizes.
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,                      # Rank-stabilized LoRA is supported but not used
    loftq_config = None,                     # LoftQ is supported but not used
)

Unsloth 2024.12.2 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from datasets import Dataset, load_dataset, DatasetDict
# Load the first 2% of the ChatDoctor HealthCareMagic training split.
raw_dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k", split='train[:2%]')

# Fixed seed so that train/validation/test membership is the same on every run.
# Without it, each fresh kernel produced a different split, so the "test" examples
# used for the qualitative checks further down were not guaranteed to be held out.
SPLIT_SEED = 3407

# 80/20 -> (train + validation) / test, then 80/20 again -> train / validation.
dev_test = raw_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_valid = dev_test['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)

# `dataset` is the DatasetDict every later cell works with.
dataset = DatasetDict({
    "train": train_valid['train'],
    "validation": train_valid['test'],
    "test": dev_test['test'],
})

README.md:   0%|          | 0.00/542 [00:00<?, ?B/s]

(…)-00000-of-00001-5e7cb295b9cff0bf.parquet:   0%|          | 0.00/70.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/112165 [00:00<?, ? examples/s]

In [None]:
def to_alpaca(data_point, deploy=False):
    """
    Convert one dataset row into the Alpaca-style prompt layout used for fine-tuning.

    Args:
        data_point: Mapping with an 'input' entry (the patient's query) and an
            'output' entry (the reference answer).
        deploy: When True, the prompt stops right after the "### Response:" marker,
            leaving the answer to the model. When False (default), the reference
            answer is appended after the marker.

    Returns:
        dict with "question" (the raw input), "answer" (the raw output) and
        "text" (the formatted prompt, which is the field used for fine-tuning).
    """
    COMMAND = "You are a doctor. Answer the following query by a patient."

    question = data_point['input']
    answer = data_point['output']

    # Only the non-deploy variant carries the reference answer after the marker.
    completion = "" if deploy else answer

    # Sections are separated by a newline followed by 12 spaces, so every section
    # after the first is indented in the resulting text. The final strip() removes
    # surrounding whitespace, including trailing whitespace of the answer.
    section_break = "\n" + " " * 12
    text = section_break.join((
        f"### Instruction:{COMMAND}",
        f"### Input:{question}",
        f"### Response:{completion}",
    )).strip()

    return {"question": question, "answer": answer, "text": text}

# Shuffle every split with a fixed seed, then add the columns produced by to_alpaca
# ("question", "answer", "text"). The original 'input' / 'output' columns are kept.
for split_name in ('train', 'validation', 'test'):
    shuffled_split = dataset[split_name].shuffle(seed=42)
    dataset[split_name] = shuffled_split.map(to_alpaca)

Map:   0%|          | 0/1435 [00:00<?, ? examples/s]

Map:   0%|          | 0/359 [00:00<?, ? examples/s]

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Mixed precision follows the hardware: bf16 where supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,   # total batch size = 2 * 4 = 8
    warmup_steps = 5,
    num_train_epochs = 1,              # has no effect while max_steps is set (max_steps takes precedence)
    max_steps = 60,
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",                # Use this for WandB etc
)

# Supervised fine-tuning on the "text" column of the training split.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,                   # Can make training 5x faster for short sequences.
    args = training_args,
)

Map (num_proc=2):   0%|          | 0/1435 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run the fine-tuning. The trainer is configured with max_steps = 60, which takes
# precedence over num_train_epochs, so this is a 60-step run (at a total batch size
# of 8 that is 480 of the 1,435 training examples), not a full epoch.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,435 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.7915
2,3.0954
3,3.0369
4,2.8106
5,2.5889
6,2.7646
7,2.6016
8,2.44
9,2.4342
10,2.4895


In [None]:
import os
from getpass import getpass

# SECURITY: never paste a Hugging Face token into the notebook. The token is read from
# the HF_TOKEN environment variable, with an interactive prompt as fallback. The token
# that used to be hardcoded in this cell must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
model.push_to_hub("h3lmi/lora_model", token = hf_token) # Online saving
tokenizer.push_to_hub("h3lmi/lora_model", token = hf_token) # Online saving
# This saves only the LoRA adapters to Hugging Face, so the model is also exported with
# two other methods: the full merged model (not practical, as tested) and GGUF (the most
# practical one).

README.md:   0%|          | 0.00/586 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/h3lmi/lora_model


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
import os
from getpass import getpass

# SECURITY: read the Hugging Face token from the HF_TOKEN environment variable (or ask
# for it interactively) instead of hardcoding it. The token that used to be embedded in
# this cell must be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Save the full fine-tuned model (LoRA weights merged into the base model), first
# locally and then to the Hub.
model.save_pretrained_merged("lora_full_model", tokenizer)
model.push_to_hub_merged("h3lmi/lora_full_model", tokenizer, token = hf_token)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.93 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [04:34<00:00,  8.58s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving lora_full_model/pytorch_model-00001-of-00004.bin...
Unsloth: Saving lora_full_model/pytorch_model-00002-of-00004.bin...
Unsloth: Saving lora_full_model/pytorch_model-00003-of-00004.bin...
Unsloth: Saving lora_full_model/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: You are pushing to hub, but you passed your HF username = h3lmi.
We shall truncate h3lmi/lora_full_model to lora_full_model


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.94 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [03:02<00:00,  5.69s/it]


Unsloth: Saving tokenizer...

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

 Done.
Unsloth: Saving lora_full_model/pytorch_model-00001-of-00004.bin...
Unsloth: Saving lora_full_model/pytorch_model-00002-of-00004.bin...
Unsloth: Saving lora_full_model/pytorch_model-00003-of-00004.bin...
Unsloth: Saving lora_full_model/pytorch_model-00004-of-00004.bin...


README.md:   0%|          | 0.00/586 [00:00<?, ?B/s]

  0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/h3lmi/lora_full_model


In [None]:
import os
from getpass import getpass

# SECURITY: read the Hugging Face token from the HF_TOKEN environment variable (or ask
# for it interactively) instead of hardcoding it. The token that used to be embedded in
# this cell must be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Save and push the GGUF version to the Hub with the default quantization_method.
model.push_to_hub_gguf("h3lmi/modelgguf", tokenizer, token = hf_token)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.58 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 44%|████▍     | 14/32 [00:01<00:01, 10.84it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [01:18<00:00,  2.46s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving h3lmi/modelgguf/pytorch_model-00001-of-00004.bin...
Unsloth: Saving h3lmi/modelgguf/pytorch_model-00002-of-00004.bin...
Unsloth: Saving h3lmi/modelgguf/pytorch_model-00003-of-00004.bin...
Unsloth: Saving h3lmi/modelgguf/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at h3lmi/modelgguf into q8_0 GGUF format.
The output location will be /content/h3lmi/modelgguf/unsloth.Q8_0.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: modelgguf
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q8_0.gguf:   0%|          | 0.00/8.54G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/h3lmi/modelgguf


In [None]:
from unsloth import FastLanguageModel
# Reload the adapter saved locally under "lora_model" into its own pair of variables.
model2, tokenizer2 = FastLanguageModel.from_pretrained(
    model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
# Fix: put the freshly loaded model (model2) into inference mode. The previous code
# passed `model`, i.e. the model object used for training, by mistake.
FastLanguageModel.for_inference(model2)

==((====))==  Unsloth 2024.12.2: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [None]:
# Inference test with the locally reloaded adapter.
FastLanguageModel.for_inference(model2)  # Enable native 2x faster inference

# Hand-written prompt for one patient query. It ends at "### Response" so the model
# produces the rest.
scabies_prompt = ' ### Instruction:You are a doctor. Answer the following query by a patient.\n            ### Input:Dear Doc,More than 6 months ago I got treated for scabies. all the rashes and marks on my body were gone except for those on my scrotum.a month after treatment, itching and the bumps on the sac reappearred. now from couple of days ago I have identified another bump with black dot in the centre. when touched or pressed I can feel the pain and it is on the penis.Kindly help me understand why I am facing this kind of problems near my sex organ when I never indulged in sexual activities till date.Also suggest me my immediate course of action.Thanks a lot.\n            ### Response'

inputs = tokenizer2([scabies_prompt], return_tensors = "pt").to("cuda")

# Generate at most 64 new tokens and show the decoded sequence (prompt + completion).
outputs = model2.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer2.batch_decode(outputs)

['<|begin_of_text|> ### Instruction:You are a doctor. Answer the following query by a patient.\n            ### Input:Dear Doc,More than 6 months ago I got treated for scabies. all the rashes and marks on my body were gone except for those on my scrotum.a month after treatment, itching and the bumps on the sac reappearred. now from couple of days ago I have identified another bump with black dot in the centre. when touched or pressed I can feel the pain and it is on the penis.Kindly help me understand why I am facing this kind of problems near my sex organ when I never indulged in sexual activities till date.Also suggest me my immediate course of action.Thanks a lot.\n            ### Response:Hi, Welcome to Chat Doctor. I can understand your concern. It is not uncommon for scabies to return. I would suggest you to consult a dermatologist and get the treatment. You should also be evaluated for genital warts and other STDs. I hope I have answered your query. Let me know if']

In [None]:
# Show the formatted "text" field of the first test example, for comparison
# with the generated completions.
dataset['test'][0]['text']

'### Instruction:You are a doctor. Answer the following query by a patient.\n            ### Input:Dear Doc,More than 6 months ago I got treated for scabies. all the rashes and marks on my body were gone except for those on my scrotum.a month after treatment, itching and the bumps on the sac reappearred. now from couple of days ago I have identified another bump with black dot in the centre. when touched or pressed I can feel the pain and it is on the penis.Kindly help me understand why I am facing this kind of problems near my sex organ when I never indulged in sexual activities till date.Also suggest me my immediate course of action.Thanks a lot.\n            ### Response:Hi, Scabies is notorious disease and sometimes remains for long time. Apply anti scabies lotion after forceful cleaning the part so that eruptions on penis get opened and lotion may enter the site properly. Scabies germs remain under eruption, so opening of eruptions is a must. Keep local hygiene clean and proper. G

# Test before and after fine-tuning

In [None]:
# llama-cpp-python provides the `llama_cpp` module used below to load the GGUF files.
# NOTE(review): the version is not pinned.
!pip install llama-cpp-python



In [None]:
from llama_cpp import Llama
# Fine-tuned version: the Q8_0 GGUF file from the h3lmi/modelgguf Hub repository.
finetuned_gguf = {
    "repo_id": "h3lmi/modelgguf",
    "filename": "unsloth.Q8_0.gguf",
}
llm = Llama.from_pretrained(**finetuned_gguf)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


unsloth.Q8_0.gguf:   0%|          | 0.00/8.54G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 29 key-value pairs and 292 tensors from /root/.cache/huggingface/hub/models--h3lmi--modelgguf/snapshots/18cfad8ce7e0170c52a2447b7a4af51bf4aee65c/./unsloth.Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8b Bnb 4bit
llama_model_loader: - kv   3:                       general.organization str              = Unsloth
llama_model_loader: - kv   4:                           general.finetune str              = bnb-4bit
llama_model_loader: - kv   5:                           general.basename str              = meta-llama-3.1
llama_model_loader: - kv   6:                

In [None]:
from llama_cpp import Llama
# Default (not fine-tuned) version used as the comparison baseline.
# NOTE(review): this is the Instruct model at IQ2_M quantization, while the fine-tuned
# model is based on the non-instruct checkpoint and exported as Q8_0, so the two differ
# in more than the fine-tuning. Confirm this is the intended baseline.
default_gguf = {
    "repo_id": "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
    "filename": "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf",
}
default_llm = Llama.from_pretrained(**default_gguf)


Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from /root/.cache/huggingface/hub/models--bartowski--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/bf5b95e96dac0462e2a09145ec66cae9a3f12067/./Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str              = 8B
llam

In [None]:
# The test input is a query taken from the dataset; it is used to compare the
# responses of the two versions.
# The base Llama 3.1 8B (not the instruct version) was fine-tuned, so this is a plain
# completion: in a chatbot the user would only type the question (the input) and the
# rest of the prompt would be added behind the scenes.
# NOTE(review): the lines of this template are indented with 8 spaces, whereas the
# to_alpaca training template indents with 12.
prompt = (
    "\n"
    "        ### Instruction:You are a doctor. Answer the following query by a patient.\n"
    "        ### Input:{}\n"
    "        ### Response:\n"
    "        "
)
test_input = 'I woke up this morning feeling the whole room is spinning when i was sitting down. I went to the bathroom walking unsteadily, as i tried to focus i feel nauseous. I try to vomit but it wont come out.. After taking panadol and sleep for few hours, i still feel the same.. By the way, if i lay down or sit down, my head do not spin, only when i want to move around then i feel the whole world is spinning.. And it is normal stomach discomfort at the same time? Earlier after i relieved myself, the spinning lessen so i am not sure whether its connected or coincidences.. Thank you doc!'
prompt.format(test_input)

'\n        ### Instruction:You are a doctor. Answer the following query by a patient.\n        ### Input:I woke up this morning feeling the whole room is spinning when i was sitting down. I went to the bathroom walking unsteadily, as i tried to focus i feel nauseous. I try to vomit but it wont come out.. After taking panadol and sleep for few hours, i still feel the same.. By the way, if i lay down or sit down, my head do not spin, only when i want to move around then i feel the whole world is spinning.. And it is normal stomach discomfort at the same time? Earlier after i relieved myself, the spinning lessen so i am not sure whether its connected or coincidences.. Thank you doc!\n        ### Response:\n        '

In [None]:
# Ask the fine-tuned GGUF model to complete the formatted test prompt.
finetuned_prompt = prompt.format(test_input)
output = llm(
    finetuned_prompt,
    max_tokens=64,
    echo=False,  # show only the completion in the output, more practical in deployment
)
print(output)

llama_perf_context_print:        load time =   74893.01 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   161 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    63 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =  148466.51 ms /   224 tokens


{'id': 'cmpl-0639f56c-db15-46fb-9ee9-c81887629db8', 'object': 'text_completion', 'created': 1733470784, 'model': '/root/.cache/huggingface/hub/models--h3lmi--modelgguf/snapshots/18cfad8ce7e0170c52a2447b7a4af51bf4aee65c/./unsloth.Q8_0.gguf', 'choices': [{'text': '           Hello, Thanks for your query. I can understand your concern. I can understand your concern. This is called vertigo. It is a symptom of a disorder of the vestibular apparatus in the inner ear or the vestibular nerve. The symptoms are generally brief but severe episodes of dizziness. Vertigo can be', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 161, 'completion_tokens': 64, 'total_tokens': 225}}


In [None]:
# Completion text of the fine-tuned model.
output['choices'][0]['text']
# Original output for this query, for comparison:
# Hi, Thank you for posting your query. The most likely cause for your symptoms is benign paroxysmal positional vertigo (BPPV),
# a type of peripheral vertigo. In this condition, the most common symptom is dizziness or giddiness, which is made worse with movements.
# Accompanying nausea and vomiting are common. The condition is due to problem in the ear, and improves in a few days on own.
# Betahistine tablets would help relieve your symptoms. Doing vestibular rehabilitation or adaptation exercises would prevent the recurrence
# of these symptoms. An ENT evaluation would also help. I hope it helps. Best wishes, Chat Doctor.

'           Hello, Thanks for your query. I can understand your concern. I can understand your concern. This is called vertigo. It is a symptom of a disorder of the vestibular apparatus in the inner ear or the vestibular nerve. The symptoms are generally brief but severe episodes of dizziness. Vertigo can be'

In [None]:
# Ask the default (not fine-tuned) GGUF model to complete the same formatted prompt.
default_prompt = prompt.format(test_input)
default_output = default_llm(
    default_prompt,
    max_tokens=64,
    echo=False,  # return only the completion, not the prompt
)
print(default_output)

llama_perf_context_print:        load time =  190699.15 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   161 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    63 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =  276958.62 ms /   224 tokens


{'id': 'cmpl-a8b204a7-3556-4793-9169-74c1f4fb9bf0', 'object': 'text_completion', 'created': 1733471321, 'model': '/root/.cache/huggingface/hub/models--bartowski--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/bf5b95e96dac0462e2a09145ec66cae9a3f12067/./Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf', 'choices': [{'text': ' When you wake up this morning, feeling like the whole room is spinning when you sit down, it could be a sign of a few different things. The first thing I want to ask you is, have you had any head injuries or trauma recently? Have you ever had a head injury in the past? It could be', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 161, 'completion_tokens': 64, 'total_tokens': 225}}


In [None]:
# The default model's completion does not name the expected condition (vertigo).
default_output['choices']

[{'text': ' When you wake up this morning, feeling like the whole room is spinning when you sit down, it could be a sign of a few different things. The first thing I want to ask you is, have you had any head injuries or trauma recently? Have you ever had a head injury in the past? It could be',
  'index': 0,
  'logprobs': None,
  'finish_reason': 'length'}]

## Load the fine-tuned adapter (works only on GPU)

In [None]:
from unsloth import FastLanguageModel
import torch
# Same loading options as for the base model.
max_seq_length = 512  # Any value works; Unsloth handles RoPE scaling internally.
dtype = None          # None = auto-detect. Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
load_in_4bit = True   # 4-bit quantization to reduce memory usage; may be set to False.

# Load the fine-tuned adapter from the Hub. This rebinds `model` and `tokenizer`.
hub_adapter_options = {
    "model_name": "h3lmi/lora_model",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**hub_adapter_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.2: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Unsloth 2024.12.2 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Same patient query as in the other inference cells. Fix: the "### Input" line is
# indented with 12 spaces again (it had drifted to 10), matching the to_alpaca
# training template and the prompt given to the default model.
scabies_prompt = ' ### Instruction:You are a doctor. Answer the following query by a patient.\n            ### Input:Dear Doc,More than 6 months ago I got treated for scabies. all the rashes and marks on my body were gone except for those on my scrotum.a month after treatment, itching and the bumps on the sac reappearred. now from couple of days ago I have identified another bump with black dot in the centre. when touched or pressed I can feel the pain and it is on the penis.Kindly help me understand why I am facing this kind of problems near my sex organ when I never indulged in sexual activities till date.Also suggest me my immediate course of action.Thanks a lot.\n            ### Response'

inputs = tokenizer([scabies_prompt], return_tensors = "pt").to("cuda")

# Generate at most 64 new tokens and show the decoded sequence (prompt + completion).
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|> ### Instruction:You are a doctor. Answer the following query by a patient.\n            ### Input:Dear Doc,More than 6 months ago I got treated for scabies. all the rashes and marks on my body were gone except for those on my scrotum.a month after treatment, itching and the bumps on the sac reappearred. now from couple of days ago I have identified another bump with black dot in the centre. when touched or pressed I can feel the pain and it is on the penis.Kindly help me understand why I am facing this kind of problems near my sex organ when I never indulged in sexual activities till date.Also suggest me my immediate course of action.Thanks a lot.\n            ### Response:Hi,  Welcome to Chat Doctor.  I can understand your concern.  I would suggest you to consult a dermatologist and get a thorough examination done.  You may be having scabies again or some other skin infection.  You may need oral and topical antibiotics.  I hope this information helps you.']

In [None]:
#test the defaut model in the same question
model_default, tokenizer_default = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.12.2: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
FastLanguageModel.for_inference(model_default) # Enable native 2x faster inference

# Same patient query as given to the fine-tuned adapter.
scabies_prompt = ' ### Instruction:You are a doctor. Answer the following query by a patient.\n            ### Input:Dear Doc,More than 6 months ago I got treated for scabies. all the rashes and marks on my body were gone except for those on my scrotum.a month after treatment, itching and the bumps on the sac reappearred. now from couple of days ago I have identified another bump with black dot in the centre. when touched or pressed I can feel the pain and it is on the penis.Kindly help me understand why I am facing this kind of problems near my sex organ when I never indulged in sexual activities till date.Also suggest me my immediate course of action.Thanks a lot.\n            ### Response'

# Fixes:
#  * removed a stray ")" inside the list, which made the cell a SyntaxError;
#  * tokenize with tokenizer_default (the tokenizer loaded with model_default) rather
#    than the adapter's `tokenizer`, consistent with the decode call below;
#  * move the inputs to "cuda" like the other inference cells; this section of the
#    notebook is noted as working only on GPU.
inputs = tokenizer_default([scabies_prompt], return_tensors = "pt").to("cuda")

# Generate at most 64 new tokens and show the decoded sequence (prompt + completion).
outputs = model_default.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer_default.batch_decode(outputs)

['<|begin_of_text|> ### Instruction:You are a doctor. Answer the following query by a patient.\n            ### Input:Dear Doc,More than 6 months ago I got treated for scabies. all the rashes and marks on my body were gone except for those on my scrotum.a month after treatment, itching and the bumps on the sac reappearred. now from couple of days ago I have identified another bump with black dot in the centre. when touched or pressed I can feel the pain and it is on the penis.Kindly help me understand why I am facing this kind of problems near my sex organ when I never indulged in sexual activities till date.Also suggest me my immediate course of action.Thanks a lot.\n            ### Response:Dear Patient,Thank you for reaching out to me. I am sorry to hear about your health issues. I would like to know more about your symptoms. Could you please elaborate on the following points:1. What are the symptoms you are experiencing?2. Have you consulted any other doctors before?3. What']