In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/doctor-healthcare-100k/Doctor-HealthCare-100k.csv


In [2]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

# Accessing Gemma 2 Model from Hugging Face

We are loading the model in 4-bit quantization. 

In [3]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig

modelName = "google/gemma-2-2b-it"

bnbConfig = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(modelName)

model = AutoModelForCausalLM.from_pretrained(
    modelName,
    device_map = "auto",
    quantization_config=bnbConfig
)

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [8]:
input_text = "How are you?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**input_ids, max_new_tokens=200)
print(outputs)
print(tokenizer.decode(outputs[0]))

tensor([[     2,   2299,    708,    692, 235336,    109, 235285, 235303, 235262,
           3900,   1578, 235269,   7593,    692, 235341,   2250,   1105,    692,
         235336,  44416, 235248,    108,    107]], device='cuda:0')
<bos>How are you?

I'm doing well, thank you! How about you? 😊 
<end_of_turn>


# Fine-tuning Steps for Gemma 2 Using LoRA On top of Qlora 4 bit Quantizattion

Load the necessary Python packages and the functions. 

In [9]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

Log in to Hugging Face CLI using the API key that we have saved using the Kaggle Secrets. 

In [10]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token = hf_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Load the Weights and Biases API key from Kaggle secrets to initiate the project for model performance tracking. 
* Will use wandb Api key to track the progress

In [11]:
wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Gemma-2-2b-it on doctor-healthcare dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdiv20023041[0m ([33mdiv20023041-tata-consultancy-services[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
base_model = "google/gemma-2-2b-it"
dataset_name = "/kaggle/input/doctor-healthcare-100k/Doctor-HealthCare-100k.csv"
new_model = "Gemma-2-2b-it"

# Loading the model and tokenizer
* Setting the data type and attention implementation based on GPU.

In [13]:
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

* We must create the QLoRA configuration so that we can load the model with 4-bit precision, reducing memory usage and speeding up the fine-tuning process.
* QLoRA builds upon LoRA by loading the model in a quantized format (like 4-bit or 8-bit precision) to reduce memory usage even further.
* QLoRA enables the model to run with lower precision (4-bit or 8-bit quantization) while still being fine-tuned with LoRA’s low-rank adaptation matrices. By quantizing the main model, QLoRA allows much larger models to run on limited hardware, like consumer GPUs.

In [14]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)


Using the Model URL, LoRA configuration, and attention implementation, load the Gemma 2 2B-It model and the tokenizer.

In [15]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Adding the adapter to the layer
Create the Python function that will use the model and extract the names of all the linear modules. 

In [17]:
model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear4bit(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear4bit(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear4bit(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear4bit(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_feedforward_layernorm)

Purpose of the Function: The function helps identify which layers are eligible for lora modifications (in this case, bnb.nn.Linear4bit layers), enabling efficient fine-tuning. By pinpointing these layers, practitioners can focus adaptation efforts where they are most impactful, without changing the entire model.

In [20]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

Fine-tuning the full model will take a lot of time, so to accelerate the training process, we will create and attach the adapter layer, resulting in a faster and more memory-efficient process. 

The adoption layer is created using the target modules and task type. Next, we set up the chat format for the model and tokenizer. Finally, we attach the base model to the adapter to create a Parameter Efficient Fine-Tuning (PEFT) model.

* LoRA adds small, low-rank matrices to each layer, allowing only these matrices to be trained. This minimizes the computational load and memory needed.

In [22]:
# LoRA config LoRA adds small, low-rank matrices to each layer, allowing only these matrices to be trained. 
#This minimizes the computational load and memory needed.
tokenizer.chat_template = None
peft_config = LoraConfig(
    r=16,     #  the rank in LoRA directly affects the number of trainable parameters in the model. more rank more parameters to train
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [23]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): Gemma2ForCausalLM(
          (model): Gemma2Model(
            (embed_tokens): Embedding(256002, 2304, padding_idx=0)
            (layers): ModuleList(
              (0-25): 26 x Gemma2DecoderLayer(
                (self_attn): Gemma2Attention(
                  (q_proj): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=2304, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=2048, bias=False)
                    )
                    (lora_embedding_A): Paramete

# Loading the dataset

In [24]:
df = pd.read_csv(dataset_name)
# df.head()

There should be no null values in dataframe

In [25]:
df.isna().sum()

instruction    0
input          0
output         0
dtype: int64

In [26]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files=dataset_name, split='all')
# Shuffle the dataset and select the first 1000 samples
dataset = dataset.shuffle(seed=65).select(range(2000))
dataset


Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 2000
})

In [27]:
dataset['instruction'][0], dataset['input'][0], dataset['output'][0]

("If you are a doctor, please answer the medical questions based on the patient's description.",
 'Hi i am a teenager. about 2 days a go i found about 5 slim lumps across my forehead. do you no what this could be my mum says that it is just boils but im worried could help me. also i have been have a lot of headaches/migraines as well. it also herts when i touch them.',
 "Hi, Dear I studied your query in all it details and I understood your concerns. Cause - On whatever limited facts given you seem to have Acne, or pimples, and they are painful to touch. The migraine or headaches is a separate ailment and don't correlate with painful acne on forehead. So don't worry.Hence, To reduce your worry Please consult for opinion from ER doctor. Plz hit thanks and write excellent Reviews if this would resolve your query. Plz don't worry and do Welcome for any further query in this regard to me. Have a Good Day. Chat Doctor. N.")

# Model Output before fine tunning

In [28]:
input_text = dataset['input'][0]
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))

<bos>Hi i am a teenager. about 2 days a go i found about 5 slim lumps across my forehead. do you no what this could be my mum says that it is just boils but im worried could help me. also i have been have a lot of headaches/migraines as well. it also herts when i touch them.

It's important to remember that I am not a medical professional and cannot give medical advice. 

**What you should do:**

1. **See a doctor:** The best thing to do is to see a doctor as soon as possible. They can examine the lumps, determine the cause, and recommend the appropriate treatment. 
2. **Keep a record:** Keep a record of the lumps, their location, size, and any other symptoms you experience. This will help your doctor make a diagnosis.
3. **Don't self-treat:** Avoid trying to treat the lumps yourself. This could worsen the condition or delay proper treatment.
4. **Be honest with your doctor:** Be honest with your doctor about your symptoms, including the headaches and migraines. This will help them mak

# Dataset for Gemma model

In [29]:
def format_chat_template(row):
    row_json = [{"role": "system", "content": row["instruction"]},
               {"role": "user", "content": row["input"]},
               {"role": "assistant", "content": row["output"]}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc= 4,
)

dataset

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 2000
})

In [30]:
print(dataset['text'][0])

<|im_start|>system
If you are a doctor, please answer the medical questions based on the patient's description.<|im_end|>
<|im_start|>user
Hi i am a teenager. about 2 days a go i found about 5 slim lumps across my forehead. do you no what this could be my mum says that it is just boils but im worried could help me. also i have been have a lot of headaches/migraines as well. it also herts when i touch them.<|im_end|>
<|im_start|>assistant
Hi, Dear I studied your query in all it details and I understood your concerns. Cause - On whatever limited facts given you seem to have Acne, or pimples, and they are painful to touch. The migraine or headaches is a separate ailment and don't correlate with painful acne on forehead. So don't worry.Hence, To reduce your worry Please consult for opinion from ER doctor. Plz hit thanks and write excellent Reviews if this would resolve your query. Plz don't worry and do Welcome for any further query in this regard to me. Have a Good Day. Chat Doctor. N.<|i

For model evaluation, we will split out the dataset into training and test split. 

In [31]:
dataset = dataset.train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 1800
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 200
    })
})

# Complaining and training the model

We will now set the training argument and STF(Soft Token Finetuning (STF)) parameters and then start the training process. 

You can change the various hyperparameters based on your environment, compute, and memory. The hyperparameters below are optimized for the Kaggle Notebook. So, if you want to run the same thing on Google Colab, please consider experimenting with training algorithms. 

In [32]:
# Setting Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

model.config.use_cache = False
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
180,2.6828,2.509114
360,2.8809,2.461782
540,2.4908,2.417842
720,2.9915,2.394087
900,2.1671,2.382602


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=900, training_loss=2.5211652338504793, metrics={'train_runtime': 1543.2458, 'train_samples_per_second': 1.166, 'train_steps_per_second': 0.583, 'total_flos': 5689470458304000.0, 'train_loss': 2.5211652338504793, 'epoch': 1.0})

# Model evaluation

In [33]:
wandb.finish()
model.config.use_cache = True

VBox(children=(Label(value='0.573 MB of 0.573 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▅▃▂▁
eval/runtime,▆▁█▆▅
eval/samples_per_second,▂█▁▂▃
eval/steps_per_second,▂█▁▂▃
train/epoch,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇█
train/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇██
train/grad_norm,▅▅▇▅▆▅▃▆▄▄▃▂▄▃▂▄▂█▂▅▄▆▆▁▄▅▄▆▅▄▃▄▄▃▅▄▇▂▃▆
train/learning_rate,▄▆███▇▇▇▇▇▆▆▆▅▅▅▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁
train/loss,█▂▅▃▃▃▄▅▆▁▄▁▂▃▃▂▄▄▅▂▂▁▂▄▃▄▃▅▃▂▃▃▂▂▃▅▂▂▃▃

0,1
eval/loss,2.3826
eval/runtime,62.1438
eval/samples_per_second,3.218
eval/steps_per_second,3.218
total_flos,5689470458304000.0
train/epoch,1.0
train/global_step,900.0
train/grad_norm,4.08873
train/learning_rate,0.0
train/loss,2.1671


# Results after fine tuning
* To get more accuracy you can use heavy compute and increase batch size and many other parameters. This notebook just shows us the methods. 
* Although, we can see the outputs are different and related to dataset which shows our code is working fine.

In [34]:
input_text = "Hi i am a teenager. about 2 days a go i found about 5 slim lumps across my forehead. do you no what this could be my mum says that it is just boils but im worried could help me. also i have been have a lot of headaches/migraines as well. it also herts when i touch them."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))

<bos>Hi i am a teenager. about 2 days a go i found about 5 slim lumps across my forehead. do you no what this could be my mum says that it is just boils but im worried could help me. also i have been have a lot of headaches/migraines as well. it also herts when i touch them. i have been having a lot of acne as well. i have been having a lot of headaches/migraines as well. it also herts when i touch them. i have been having a lot of acne as well. i have been having a lot of headaches/migraines as well. it also herts when i touch them. i have been having a lot of acne as well. i have been having a lot of headaches/migraines as well. it also herts when i touch them. i have been having a lot of acne as well. i have been having a lot of headaches/migraines as well. it also herts when i touch them. i have been having a lot of acne as well. i have been having a lot of headaches/migraines as well. it also herts when i touch them. i have been having a lot of acne as well. i have been having a l

# Saving the changes

In [36]:
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Divyanshugard/Gemma-2-2b-it/commit/5293f468cdb85a47382cf33c5f03e76268337a55', commit_message='Upload model', commit_description='', oid='5293f468cdb85a47382cf33c5f03e76268337a55', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Divyanshugard/Gemma-2-2b-it', endpoint='https://huggingface.co', repo_type='model', repo_id='Divyanshugard/Gemma-2-2b-it'), pr_revision=None, pr_num=None)