In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/customer-support-training-dataset-27k/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv
/kaggle/input/llama-3.2/transformers/3b-instruct/1/model.safetensors.index.json
/kaggle/input/llama-3.2/transformers/3b-instruct/1/config.json
/kaggle/input/llama-3.2/transformers/3b-instruct/1/model-00001-of-00002.safetensors
/kaggle/input/llama-3.2/transformers/3b-instruct/1/model-00002-of-00002.safetensors
/kaggle/input/llama-3.2/transformers/3b-instruct/1/README.md
/kaggle/input/llama-3.2/transformers/3b-instruct/1/USE_POLICY.md
/kaggle/input/llama-3.2/transformers/3b-instruct/1/tokenizer.json
/kaggle/input/llama-3.2/transformers/3b-instruct/1/tokenizer_config.json
/kaggle/input/llama-3.2/transformers/3b-instruct/1/LICENSE.txt
/kaggle/input/llama-3.2/transformers/3b-instruct/1/special_tokens_map.json
/kaggle/input/llama-3.2/transformers/3b-instruct/1/.gitattributes
/kaggle/input/llama-3.2/transformers/3b-instruct/1/generation_config.json


In [3]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

Access LLama from Kaggle. We can also access it from hugging face but we will use Kaggle this time.
* Go to the https://www.kaggle.com/models/metaresearch/llama-3.2 website
* There will be a link fill out the form at https://www.llama.com/llama-downloads/ website
* Select both lightweight and vision models. Your Name, DOB should be same as in your Gmail ID and your Kaggle Account. Fill Organization as Kaggle if logining from Kaggle if you are trying to access model from Hugging Face fill organization as Hugging Face.
* After getting access from make a new notebook click on add input and add the meta llama model.

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
import torch


base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"

tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Set pad_token_id to avoid receiving warning messages.


In [5]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

In [8]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [9]:
messages = [{"role": "user", "content": "Where is delhi"}]

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(prompt, max_new_tokens=120, do_sample=True)

print(outputs[0]["generated_text"])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Where is delhi<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Delhi is the capital city of India, located in the northern part of the country. It is situated in the Indo-Gangetic Plain, which is a fertile region along the Yamuna River.

Geographically, Delhi is located in the National Capital Territory of Delhi (NCT) and is surrounded by the following states:

* Haryana to the north and west
* Uttar Pradesh to the east
* Rajasthan to the southwest

Delhi is also a major urban agglomeration, with a population of over 29 million people, making it one of the largest cities in the world.




# Fine-tuning Llama 3.2 3B Instruct Using LoRA On top of Qlora 4 bit Quantizattion

In [10]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [11]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3.2 on Customer Support Dataset', 
    job_type="training", 
    anonymous="allow"
)

In [13]:
base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
new_model = "llama-3.2-3b-it-CustomerSupport-ChatBot"
dataset_name = "/kaggle/input/customer-support-training-dataset-27k/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"

# Loading the model and tokenizer
* Setting the data type and attention implementation based on GPU.

In [14]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

Load the model and tokenizer by providing the local model directory. Even though our model is small, loading the full model and fine-tuning it will take some time. Instead, we will load the model in 4-bit quantization which is referred as QLora.s.

In [15]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)

# Loading and processing the dataset

In [17]:
df = pd.read_csv(dataset_name)
df.isna().sum()

flags          0
instruction    0
category       0
intent         0
response       0
dtype: int64

In [18]:
df.head(3)

Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...


Already the dataset is quite cleaned. But we will take only those columns which are requrired for best Fine Tuning.

In [19]:
df = df.drop(['flags', 'category','intent'], axis=1)
df.head(3)

Unnamed: 0,instruction,response
0,question about cancelling order {{Order Number}},I've understood you have a question regarding ...
1,i have a question about cancelling oorder {{Or...,I've been informed that you have a question ab...
2,i need help cancelling puchase {{Order Number}},I can sense that you're seeking assistance wit...


In [20]:
df.to_csv('updated_dataset.csv', index=False)
dataset_nam = "/kaggle/working/updated_dataset.csv"
len(pd.read_csv(dataset_name))

26872

In [21]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files=dataset_nam, split='all')
# Shuffle the dataset and select the first 1000 samples
dataset = dataset.shuffle(seed=65).select(range(1000))
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['instruction', 'response'],
    num_rows: 1000
})

In [22]:
dataset['instruction'][0],dataset['response'][0]

('where do i enter a different shipping address',
 'Ah, I understand that you\'re looking to enter a different shipping address. Allow me to guide you through the process:\n\n1. Log in to your account on our website.\n2. Navigate to the "My Account" or "Profile" section, which can usually be found in the top right corner of the page.\n3. Look for the "Shipping Addresses" or similar option. It may also be labeled as "Manage Addresses" or "Delivery Information."\n4. Click on that option to access your saved addresses.\n5. To enter a different shipping address, you\'ll most likely have the choice to either "Edit" an existing address or "Add a New Address."\n6. If you choose to edit, find the address you want to update and click on the "Edit" or "Modify" button. Make your changes and then save.\n7. If you want to add a completely new address, click on the "Add a New Address" or a similar option. Fill in the necessary details and save.\n\nRemember to double-check all the information you ent

# Model Response Before Fine Tuning

In [23]:
messages = [{"role": "user", "content": "where do i enter a different shipping address"}]

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(prompt, max_new_tokens=120, do_sample=True)

print(outputs[0]["generated_text"])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

where do i enter a different shipping address<|eot_id|><|start_header_id|>assistant<|end_header_id|>

To enter a different shipping address, the steps may vary slightly depending on the platform or website you're using. Here are some general steps for popular online shopping platforms:

**Amazon:**

1. Sign in to your Amazon account.
2. Go to the product details page.
3. Click on "Shipping" or "Delivery" (usually located at the top of the page).
4. Select "Enter a different shipping address" or "Add a new shipping address".
5. Enter the new shipping address and click "Save".

**eBay:**

1. Sign in to your eBay account.



# Preparing Dataset For LLama 3.2 Fine Tuning

In [24]:
instruction = """You are a top-rated customer service agent named John. 
    Be polite to customers and answer all their questions.
    """
def format_chat_template(row):
    
    row_json = [{"role": "system", "content": instruction },
               {"role": "user", "content": row["instruction"]},
               {"role": "assistant", "content": row["response"]}]
    
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc= 4,
)
dataset


  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

  self.pid = os.fork()


Dataset({
    features: ['instruction', 'response', 'text'],
    num_rows: 1000
})

In [25]:
print(dataset['text'][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a top-rated customer service agent named John. 
    Be polite to customers and answer all their questions.<|eot_id|><|start_header_id|>user<|end_header_id|>

where do i enter a different shipping address<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Ah, I understand that you're looking to enter a different shipping address. Allow me to guide you through the process:

1. Log in to your account on our website.
2. Navigate to the "My Account" or "Profile" section, which can usually be found in the top right corner of the page.
3. Look for the "Shipping Addresses" or similar option. It may also be labeled as "Manage Addresses" or "Delivery Information."
4. Click on that option to access your saved addresses.
5. To enter a different shipping address, you'll most likely have the choice to either "Edit" an existing address or "Add a New Address."
6. If you choose to edit, find the address you want to update and cli

# Setting up the model
1. Purpose of the Function: The function helps identify which layers are eligible for lora modifications (in this case, bnb.nn.Linear4bit layers), enabling efficient fine-tuning. By pinpointing these layers, practitioners can focus adaptation efforts where they are most impactful, without changing the entire model.

In [26]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

Lora Technique for efficent Fine Tuning

In [27]:
# LoRA config
tokenizer.chat_template = None
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


# Hyperparameters
# Below is a list of hyperparameters that can be used to optimize the training process:
* 
output_dir: The output directory is where the model predictions and checkpoints will be store
* .
num_train_epochs: One training epo
* h.
fp16/bf16: Disable fp16/bf16 train
* ng.
per_device_train_batch_size: Batch size per GPU for trai
* 
per_device_eval_batch_size: Batch size per GPU for evaluation
* 
gradient_accumulation_steps: This refers to the number of steps required to accumulate the gradients during the update proces
* .
gradient_checkpointing: Enabling gradient checkpointi
* ng.
max_grad_norm: Gradient clipp
* ing.
learning_rate: Initial learning 
* rate.
weight_decay: Weight decay is applied to all layers except bias/LayerNorm we
* ights.
Optim: Model optimizer (AdamW opti
* mizer).
lr_scheduler_type: Learning rate s
* chedule.
max_steps: Number of traini
* ng steps.
warmup_ratio: Ratio of steps for a line
* ar warmup.
group_by_length: This can significantly improve performance and accelerate the traini
* ng process.
save_steps: Save checkpoint every 25 u
* pdate steps.
logging_st
eps: Log every 25 update steps.

In [28]:
dataset = dataset.train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'response', 'text'],
        num_rows: 900
    })
    test: Dataset({
        features: ['instruction', 'response', 'text'],
        num_rows: 100
    })
})

In [29]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

Setting Supervised Fine tuning parameters

In [30]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)
model.config.use_cache = False
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
90,0.8123,0.814006
180,0.8411,0.731385
270,0.9992,0.698661
360,0.7167,0.669213
450,0.4897,0.653303


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=450, training_loss=0.7693972663084666, metrics={'train_runtime': 708.9126, 'train_samples_per_second': 1.27, 'train_steps_per_second': 0.635, 'total_flos': 2768436576362496.0, 'train_loss': 0.7693972663084666, 'epoch': 1.0})

The training loss gradually reduced. Which means we are doing fine with small dataset, so we can increase the data depending on the gpu available.


In [None]:
wandb.finish()

# Testing the model Again 
* We will see that accuracy is changed know, w.r.t previous results.

In [31]:
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": "where do i enter a different shipping address?"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=200, num_return_sequences=1)
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])


I've got that you're looking to enter a different shipping address. To do this, you can visit the "My Account" section on our website and navigate to the "Shipping Addresses" or "Address Book" section. From there, you can add a new shipping address or edit an existing one. If you need any further assistance or have any other questions, please don't hesitate to ask.system


#  Saving the tokenizer and model

In [None]:
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)

# Make sure to save the notebook with the outputs
* Follow Phase 2 Merging and Exporting Fine-tuned Llama 3.2 