# Creating dataset link

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        link = os.path.join(dirname, filename)
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/final_dataset.json


# Installing necessary Libraries

In [2]:
!pip install transformers peft accelerate
!pip install huggingface_hub
!pip install -U bitsandbytes
!pip install datasets

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.2
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


# Logging into Hugging face client for Loading Models

In [None]:
!huggingface-cli login --token <enter you HF token>

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Importing necessary libraries

In [4]:
import torch
from transformers import AutoTokenizer,AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, EarlyStoppingCallback
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel
import bitsandbytes as bnb
from datasets import Dataset,load_dataset
import json

# Loading the dataset

In [5]:
with open(link, 'r') as file:
    data = json.load(file)

In [6]:
print(type(data))

<class 'list'>


The code converts a Python list (where each element is typically a dictionary or structured data) into a Dataset object.

In [7]:
dataset = Dataset.from_list(data)

# Splitting the dataset in train, test, validation data 

In [8]:
dataset_split = dataset.train_test_split(test_size = 0.2)
train_data = dataset_split['train']
temp_data = dataset_split['test']

In [9]:
validation_data_split = temp_data.train_test_split(test_size = 0.5)
validation_data = validation_data_split['train']
test_data = validation_data_split['test']

In [10]:
train_data

Dataset({
    features: ['question', 'answer', 'context'],
    num_rows: 13762
})

In [11]:
validation_data

Dataset({
    features: ['question', 'answer', 'context'],
    num_rows: 1720
})

In [12]:
test_data

Dataset({
    features: ['question', 'answer', 'context'],
    num_rows: 1721
})

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Efficient Model Loading and Fine-Tuning with Quantization and LoRA Configuration

1) The model is loaded with automatic device mapping (device_map="auto"), ensuring optimal use of available hardware resources.
2) The LoraConfig is set up to apply low-rank adaptation (LoRA), allowing for efficient fine-tuning by modifying only a small number of parameters in key layers
3) By utilizing BitsAndBytesConfig, the model is loaded with 4-bit precision to reduce memory footprint, which is particularly beneficial for deploying large models on limited hardware.

In [14]:
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer =  AutoTokenizer.from_pretrained(model_name)
bnb_config = BitsAndBytesConfig(

    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="fp4",
    bnb_4bit_compute_dtype="float16"
)

model =  AutoModelForCausalLM.from_pretrained(
    
    model_name,
    quantization_config = bnb_config,
    device_map="auto",
)

peft_config = LoraConfig(

    r=32,
    lora_alpha=32,
    target_modules=["q_proj","o_proj","k_proj","down_proj","gate_proj","v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="QUESTION_ANSWERING"
)


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [15]:
print(f"Model is on: {next(model.parameters()).device}")

Model is on: cuda:0


### Loading the base model with peft configuration in peft_modle object

In [16]:
peft_model = get_peft_model(model, peft_config)

1) The preprocess_function takes a batch of examples and creates a formatted string for each question-context pair by combining them into a single input format: "Question: {question} Context: {context}".

2) It then tokenizes these inputs with a maximum length of 128 tokens, applying padding and truncation as needed. The corresponding answers are also tokenized in the same manner, treating them as labels for training.

3) The resulting tokenized inputs and their associated labels are returned in a format suitable for model training, allowing for efficient handling of the question-answering task.

In [17]:
tokenizer.pad_token = tokenizer.eos_token
def preprocess_function(examples):

    inputs = [f"Question: {q} Context: {c}" for q, c in zip(examples['question'], examples['context'])]
    outputs = examples['answer']
    model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(outputs, max_length=128, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


# Tokenizing the Data

### Applies the previously defined preprocess_function to the train_data dataset, resulting in a tokenized version of the training data. 

In [18]:
tokenized_train_dataset = train_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/13762 [00:00<?, ? examples/s]



In [19]:
tokenized_eval_dataset = validation_data.map(preprocess_function, batched = True)

Map:   0%|          | 0/1720 [00:00<?, ? examples/s]

In [20]:
tokenized_test_dataset = test_data.map(preprocess_function, batched = True)

Map:   0%|          | 0/1721 [00:00<?, ? examples/s]

In [21]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Training the Model

The TrainingArguments object defines essential configurations for the training process, such as the output directory for saved models, evaluation strategies, learning rate, batch sizes, and the number of training epochs

In [22]:
training_args = TrainingArguments(

    output_dir="./llama-3.2-3b-chatbot",

    eval_strategy="steps",

    eval_steps = 200,

    learning_rate=5e-5,

    per_device_train_batch_size=16,

    per_device_eval_batch_size=4,

    num_train_epochs=10,

    gradient_accumulation_steps=4,

    weight_decay=0.1,

    logging_dir="./logs",

    fp16=True,

    logging_steps=200,

    save_strategy="steps",

    save_total_limit=3,

    save_steps=200,

    optim="paged_adamw_8bit",

    label_names = ["input_ids","attention_mask","labels"],

    report_to=['wandb'],

    lr_scheduler_type="linear",

    load_best_model_at_end=True,
    
    metric_for_best_model="eval_loss", 
    
    greater_is_better=False,

)


trainer = Trainer(


    model = peft_model,

    args = training_args,

    eval_dataset=tokenized_eval_dataset,

    train_dataset=tokenized_train_dataset,

    data_collator = data_collator,

    tokenizer = tokenizer,

    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],

)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


### Total parameter count for training

In [23]:
def count_trainable_parameters(model):

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return trainable_params

num_trainable_parameters = count_trainable_parameters(model)
print(f"Number of trainable parameters: {num_trainable_parameters}")

Number of trainable parameters: 38535168


In [24]:
# trainer.train(resume_from_checkpoint=True)
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112662411111134, max=1.0…

Step,Training Loss,Validation Loss
200,2.08,1.866203
400,1.78,1.779634
600,1.6778,1.732178
800,1.5875,1.70929
1000,1.5276,1.697249
1200,1.4633,1.686626
1400,1.4138,1.692958
1600,1.368,1.686423
1800,1.3349,1.689047
2000,1.3008,1.690194


TrainOutput(global_step=2000, training_loss=1.553379364013672, metrics={'train_runtime': 15276.4798, 'train_samples_per_second': 9.009, 'train_steps_per_second': 0.141, 'total_flos': 2.8060581125947392e+17, 'train_loss': 1.553379364013672, 'epoch': 9.291521486643438})

# Saving the model to root directory

In [25]:
save_directory = "./llama-3.2-3b-chatbot-instruct"
peft_model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./llama-3.2-3b-chatbot-instruct/tokenizer_config.json',
 './llama-3.2-3b-chatbot-instruct/special_tokens_map.json',
 './llama-3.2-3b-chatbot-instruct/tokenizer.json')

### Merging the peft model's weights with the base model weigts to get a combined model

In [26]:
merged_model = PeftModel.from_pretrained(model, save_directory)
merged_model = merged_model.merge_and_unload()



In [27]:
merged_model.save_pretrained(save_directory)

# Inference

In [35]:
input_text = "Question: which is the top performing stock in india? Answer:"

inputs = tokenizer(input_text, return_tensors="pt",padding = True)
device = merged_model.device
print(device)
inputs = {key: value.to(device) for key, value in inputs.items()}


with torch.no_grad():

    outputs = merged_model.generate(
        
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        pad_token_id=tokenizer.eos_token_id,
        max_length=200
    )

output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)

cuda:0
Question: which is the top performing stock in india? Answer: Reliance Industries Limited (RIL)
Explanation: Reliance Industries Limited (RIL) is the top-performing stock in India, driven by its diversified business portfolio, including petrochemicals, energy, and retail. The company has consistently delivered strong financial performance, with a market capitalization of over $100 billion. RIL's top-line growth has been driven by its oil and gas business, as well as its retail and petrochemicals segments. The company's stock price has been volatile in recent times, influenced by global commodity prices, but it remains a favorite among investors due to its strong fundamentals and growth prospects. As of now, RIL is the top-performing stock in India, with a market capitalization of over $100 billion. 
Note: This is a hypothetical example, please note that the actual data and performance of the stocks may vary. The above information is for illustrative purposes only.


In [36]:
trainer.model = merged_model

# Pushing the finetuned model to HUGGING FACE HUB

In [30]:
# trainer.push_to_hub()

In [31]:
# peft_model.push_to_hub("Akshit-77/llama-3.2-3b-chatbot")