In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset, Dataset
from trl import SFTTrainer, setup_chat_format
import numpy as np
import pandas as pd
import transformers
from tqdm.auto import tqdm

2025-10-13 13:12:12.229152: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760361132.409409     130 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760361132.461146     130 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token = hf_token)

In [5]:
wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='fine-tuning bert', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgargibendale[0m ([33mgargibendale-university-of-mumbai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
new_model = "llama-3.2-3b-it-Ecommerce-ChatBot"
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

In [6]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [42]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

In [43]:
# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    quantization_config=bnb_config,
    attn_implementation=attn_implementation,
    num_labels=1
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
# 3. Prepare for k-bit LoRA training
model = prepare_model_for_kbit_training(model)

In [11]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
df = pd.read_csv('/kaggle/input/my-dataset/final_train_dataset.csv')

In [13]:
import ast

df["bullet_points"] = df["bullet_points"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
# Merge the list into a single string
df["bullet_points"] = df["bullet_points"].apply(
    lambda x: " ".join(x) if isinstance(x, list) else x
)

In [14]:
df["text"] = (df["item_name"].fillna("") + " " +
    df["bullet_points"].fillna("") + " " +
    "value: " + df["value"].fillna("").astype(str) + " " +
    "unit: " + df["unit"].fillna("")
                )

In [15]:
# Create a new DataFrame with selected columns
df_train = df[['text', 'price']].copy()
df_train = df_train.rename(columns={"price": "label"})

In [16]:
y = df.price.values

In [38]:
input_ids = []
attention_masks = []
for sentence in tqdm(df_train["text"]):
    encoded_dict = tokenizer.encode_plus(
                        sentence,                      # Sentence to encode.
                        truncation=True,
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        padding="max_length",
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
y = torch.tensor(y)

  0%|          | 0/56102 [00:00<?, ?it/s]

  y = torch.tensor(y)


In [None]:
type(input_ids)

In [40]:
from sklearn.model_selection import train_test_split

train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    input_ids, attention_masks, y, test_size=0.2, random_state=42
)

train_dataset = Dataset.from_dict({
    "input_ids": train_inputs,
    "attention_mask": train_masks,
    "labels": train_labels
})

eval_dataset = Dataset.from_dict({
    "input_ids": val_inputs,
    "attention_mask": val_masks,
    "labels": val_labels
})

In [None]:
#Importing the dataset
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.shuffle(seed=65).select(range(1000)) # Only use 1000 samples for quick demo
# Create a 90/10 train-test split
dataset = dataset.train_test_split(test_size=0.1)
instruction = """You are a top-rated customer service agent named John. 
    Be polite to customers and answer all their questions.
    """
def format_chat_template(row):
    
    row_json = [{"role": "system", "content": instruction },
               {"role": "user", "content": row["instruction"]},
               {"role": "assistant", "content": row["response"]}]
    
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc= 4,
)

In [None]:
dataset['train']['text'][3]

In [None]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [None]:
modules

In [45]:
# LoRA config
peft_config = LoraConfig(
    r=16, #rank of the lora matrix
    lora_alpha=32,#learning rate
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",
    target_modules=["query", "value"]
)
model = get_peft_model(model, peft_config)

In [46]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir="/kaggle/working/new-model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=1e-4,
    group_by_length=True,
    report_to="wandb",
    fp16=True,
    bf16=False,
    dataloader_num_workers=4,  # Add to TrainingArguments
dataloader_pin_memory=True,
    gradient_checkpointing=True,  # Add to TrainingArguments
)

In [35]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze()
    
    # SMAPE
    numerator = np.abs(predictions - labels)
    denominator = (np.abs(predictions) + np.abs(labels)) / 2
    mask = denominator != 0
    smape = np.where(mask, numerator[mask] / denominator[mask], 0)
    smape = np.mean(smape) * 100
    
    # Other metrics
    mae = mean_absolute_error(labels, predictions)
    rmse = np.sqrt(mean_squared_error(labels, predictions))
    
    return {
        "smape": smape,
        "mae": mae,
        "rmse": rmse
    }

In [47]:
# Setting sft parameters
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_arguments,
    compute_metrics=compute_metrics,
)

In [48]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss,Smape,Mae,Rmse
500,328.6042,312.954254,63.363272,11.767222,17.690512
1000,292.2014,281.591339,62.098002,11.586773,16.780684


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=1403, training_loss=329.0363700360973, metrics={'train_runtime': 3429.8185, 'train_samples_per_second': 13.086, 'train_steps_per_second': 0.409, 'total_flos': 1.1890008924592128e+16, 'train_loss': 329.0363700360973, 'epoch': 1.0})

In [49]:
wandb.finish()

0,1
eval/loss,█▁
eval/mae,█▁
eval/rmse,█▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/smape,█▁
eval/steps_per_second,▁▁
train/epoch,▁▂▂▃▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
train/global_step,▁▂▂▃▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
train/grad_norm,▃▁▂▁▃▁▂▂█▅▃▂▃▃▂▂▄▂

0,1
eval/loss,281.59134
eval/mae,11.58677
eval/rmse,16.78068
eval/runtime,226.5578
eval/samples_per_second,49.528
eval/smape,62.098
eval/steps_per_second,3.099
total_flos,1.1890008924592128e+16
train/epoch,1
train/global_step,1403


In [None]:
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": "I bought the same item twice, cancel order {{Order Number}}"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])

In [None]:
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)

In [50]:
# Save the fine-tuned model
trainer.save_model("/kaggle/working/final-model-large")
tokenizer.save_pretrained("/kaggle/working/final-model-large")

('/kaggle/working/final-model-large/tokenizer_config.json',
 '/kaggle/working/final-model-large/special_tokens_map.json',
 '/kaggle/working/final-model-large/vocab.txt',
 '/kaggle/working/final-model-large/added_tokens.json',
 '/kaggle/working/final-model-large/tokenizer.json')