### **Transfer Learning on TinyLlama with PEFT (LoRA)**

### Installing dependencies

In [None]:
!pip install peft accelerate transformers datasets trl bitsandbytes tensorflow nltk absl-py rouge-score sacrebleu bert-score

### Importing libraries

In [None]:
import os
import torch
import pandas as pd
import re
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
)
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset
from transformers.trainer_utils import set_seed
from evaluate import load as load_metric
import matplotlib.pyplot as plt
import numpy as np

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Setting seed value for reproducibility

In [None]:
set_seed(42)

### Function to clean text

In [None]:
def clean_text(text):
    text = text.lower().strip()
    text = re.sub(r"[“”\"'`]+", "", text)
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text

### Function to load and split data into **train and test set**

In [None]:
def load_and_split_data(sample_size=2000):
    df = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-bioasq/data/test.parquet/part.0.parquet")
    df.dropna(subset=['question', 'answer'], inplace=True)
    df['text'] = df.apply(lambda x: f"Question: {x['question']}\nAnswer: {x['answer']}", axis=1)
    if sample_size is not None and len(df) > sample_size:
        df = df.sample(sample_size, random_state=42).reset_index(drop=True)
    train_df, test_df = train_test_split(df[['text']], test_size=0.2, random_state=42)
    return DatasetDict({
        "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
        "test": Dataset.from_pandas(test_df.reset_index(drop=True))
    })

In [None]:
dataset = load_and_split_data(sample_size=600)

### Function to **tokenize** and **preprocess data**

In [None]:
def preprocess(example, tokenizer, max_length=256):
    encoding = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    encoding["labels"] = encoding["input_ids"].copy()  # Ensure labels are present for CausalLM
    return encoding

In [None]:
def tokenize_dataset(dataset, tokenizer):
    return dataset.map(lambda x: preprocess(x, tokenizer), batched=True)

In [None]:
def get_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

In [None]:
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = get_tokenizer(MODEL_NAME)
tokenized_dataset = tokenize_dataset(dataset, tokenizer)

Map: 100%|██████████| 480/480 [00:00<00:00, 4580.33 examples/s]
Map: 100%|██████████| 120/120 [00:00<00:00, 3583.37 examples/s]


### Setting up the model with **LoRA**

In [None]:
def setup_model_with_lora(model_name):
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map=None  # avoid lazy loading to meta
    )
    model = model.to("cuda" if torch.cuda.is_available() else "cpu")  # ensure proper loading

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=["q_proj", "v_proj"]
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    return model


In [None]:
model = setup_model_with_lora(MODEL_NAME)

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


### Setting up the **training process**

In [None]:
def setup_training(model, tokenizer, dataset, num_of_epochs):
    args = TrainingArguments(
        output_dir="../models/lora_bioasq_tinyllama",
        per_device_train_batch_size=2,
        num_train_epochs=num_of_epochs,
        learning_rate=2e-4,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        fp16=False,
        gradient_accumulation_steps=1,
        logging_dir="./logs",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        report_to="none"
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        # tokenizer=tokenizer,
    )
    return trainer

In [None]:
NO_OF_EPOCHS = 2
trainer = setup_training(model, tokenizer, tokenized_dataset, num_of_epochs=NO_OF_EPOCHS)
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
25,4.6473
50,0.8693
75,0.6868
100,0.5808
125,0.636
150,0.5864
175,0.6225
200,0.5989
225,0.4951
250,0.5438




TrainOutput(global_step=480, training_loss=0.8134771890938282, metrics={'train_runtime': 11445.7484, 'train_samples_per_second': 0.084, 'train_steps_per_second': 0.042, 'total_flos': 1527111525335040.0, 'train_loss': 0.8134771890938282, 'epoch': 2.0})

### Saving the model and tokenizer

In [None]:
trainer.save_model("../models/lora_bioasq_tinyllama")

In [None]:
tokenizer.save_pretrained("../models/lora_bioasq_tinyllama")

('../models/lora_bioasq_tinyllama\\tokenizer_config.json',
 '../models/lora_bioasq_tinyllama\\special_tokens_map.json',
 '../models/lora_bioasq_tinyllama\\chat_template.jinja',
 '../models/lora_bioasq_tinyllama\\tokenizer.json')

### **Validation Loss**

In [None]:
eval_metrics = trainer.evaluate()
print(f"Validation Loss (eval_loss): {eval_metrics.get('eval_loss', 'N/A')}")



Validation Loss (eval_loss): 0.5355337858200073


In [None]:
def generate_answer(model, tokenizer, prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    model.eval()
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_new_tokens=100,
            num_return_sequences=1,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True).strip()

In [None]:
example_prompt = "Question: What is the treatment for tuberculosis?\nAnswer:"
print(generate_answer(model, tokenizer, example_prompt))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Question: What is the treatment for tuberculosis?
Answer: Tuberculosis is a bacterial infection caused by Mycobacterium tuberculosis. Treatment for tuberculosis is based on the patient's stage of disease, the type of tuberculosis, and the patient's immune status.


### Evaluating the model

In [None]:
def evaluate_model(model, tokenizer, dataset, label, max_samples=30):
    rouge = load_metric("rouge")
    bertscore = load_metric("bertscore")
    meteor = load_metric("meteor")
    bleu = load_metric("bleu")

    refs = []
    preds = []
    print(f"\nEvaluating on {label} set (first {max_samples} samples)...")
    for i, example in enumerate(dataset):
        if i >= max_samples:
            break
        question = example['text'].split('\n')[0]
        prompt = f"{question}\nAnswer:"
        pred = generate_answer(model, tokenizer, prompt)
        preds.append(pred.strip())
        refs.append(example['text'].split('\n')[1].replace('Answer: ', '').strip())
        if i % 5 == 0:
            print(f"Sample {i+1}/{max_samples} done.")

    rouge_result = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    bert_result = bertscore.compute(predictions=preds, references=refs, lang="en")
    meteor_result = meteor.compute(predictions=preds, references=refs)
    bleu_result = bleu.compute(
        predictions=[' '.join(pred.split()) for pred in preds],
        references=[[' '.join(ref.split())] for ref in refs]
    )

    print("\nROUGE-L Score:", rouge_result['rougeL'])
    print("BERTScore F1:", sum(bert_result['f1']) / len(bert_result['f1']))
    print("METEOR Score:", meteor_result['meteor'])
    print("BLEU Score:", bleu_result['bleu'])

### Evaluation on **training set**

In [None]:
evaluate_model(model, tokenizer, dataset["train"], label="train")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



Evaluating on train set (first 30 samples)...
Sample 1/30 done.
Sample 6/30 done.
Sample 11/30 done.
Sample 16/30 done.
Sample 21/30 done.
Sample 26/30 done.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



ROUGE-L Score: 0.25717784755980444
BERTScore F1: 0.8693346540133159
METEOR Score: 0.2840768499963113
BLEU Score: 0.08057544283108803


### Evaluation on **testing set**

In [None]:
evaluate_model(model, tokenizer, dataset["test"], label="test")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



Evaluating on test set (first 30 samples)...
Sample 1/30 done.
Sample 6/30 done.
Sample 11/30 done.
Sample 16/30 done.
Sample 21/30 done.
Sample 26/30 done.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



ROUGE-L Score: 0.26234347192124596
BERTScore F1: 0.8629373371601105
METEOR Score: 0.2918086697696288
BLEU Score: 0.05961356375973927


### Displaying **Token Importance**

In [None]:
def input_token_importance(model, tokenizer, prompt, max_new_tokens=20):
    """
    Print input tokens that most influence the generated answer.
    """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids[0]
    orig_output = generate_answer(model, tokenizer, prompt)
    orig_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    print(f"Original generated answer: {orig_output}")
    for i in range(1, len(input_ids)-1):  # skip special tokens
        perturbed_ids = torch.cat([input_ids[:i], input_ids[i+1:]])
        perturbed_prompt = tokenizer.decode(perturbed_ids, skip_special_tokens=True)
        perturbed_output = generate_answer(model, tokenizer, perturbed_prompt)
        if orig_output != perturbed_output:
            print(f"Token '{orig_tokens[i]}' is IMPORTANT: changes output to: {perturbed_output}")
        else:
            print(f"Token '{orig_tokens[i]}' is not important.")

In [None]:
prompt = "Question: What is the treatment for tuberculosis?\nAnswer:"
input_token_importance(model, tokenizer, prompt)

Original generated answer: Question: What is the treatment for tuberculosis?
Answer: Tuberculosis is a bacterial infection caused by Mycobacterium tuberculosis. Treatment for tuberculosis is based on the patient's stage of disease, the type of tuberculosis, and the patient's immune status.
Token '▁Question' is IMPORTANT: changes output to: : What is the treatment for tuberculosis?
Answer: Tuberculosis is a bacterial infection caused by Mycobacterium tuberculosis. It is a major cause of death worldwide.
Token ':' is IMPORTANT: changes output to: Question What is the treatment for tuberculosis?
Answer: The treatment for tuberculosis is multidrug therapy.
Token '▁What' is IMPORTANT: changes output to: Question: is the treatment for tuberculosis?
Answer: The treatment for tuberculosis is multidrug therapy with isoniazid, rifampicin, and pyrazinamide.
Token '▁is' is IMPORTANT: changes output to: Question: What the treatment for tuberculosis?
Answer: Tuberculosis is a bacterial infection cau

# Fine tuning for TinyLlama

### First Round of Tuning

Reload the Dataset and Tokenizer

In [None]:
# Re-import necessary libraries (already in the notebook, but ensuring they're available)
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split

# Reload and split the dataset (same as original)
def load_and_split_data(sample_size=600):
    df = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-bioasq/data/test.parquet/part.0.parquet")
    df.dropna(subset=['question', 'answer'], inplace=True)
    df['text'] = df.apply(lambda x: f"Question: {x['question']}\nAnswer: {x['answer']}", axis=1)
    if sample_size is not None and len(df) > sample_size:
        df = df.sample(sample_size, random_state=42).reset_index(drop=True)
    train_df, test_df = train_test_split(df[['text']], test_size=0.2, random_state=42)
    return DatasetDict({
        "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
        "test": Dataset.from_pandas(test_df.reset_index(drop=True))
    })

dataset = load_and_split_data(sample_size=600)

# Reload tokenizer
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize dataset (reusing original preprocess function)
def preprocess(example, tokenizer, max_length=256):
    encoding = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

def tokenize_dataset(dataset, tokenizer):
    return dataset.map(lambda x: preprocess(x, tokenizer), batched=True)

tokenized_dataset = tokenize_dataset(dataset, tokenizer)

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Update LoRA Configuration and Reload Model

In [None]:
from transformers import AutoConfig, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType
import torch

def setup_model_with_lora(model_name):
    config = AutoConfig.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map=None
    )
    model = model.to("cuda" if torch.cuda.is_available() else "cpu")

    # Updated LoRA configuration
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,  # Increased from 8 to 16
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]  # Added k_proj, o_proj
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    return model

# Load model with updated LoRA configuration
model = setup_model_with_lora(MODEL_NAME)

trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079


Update Training Configuration

In [None]:
from transformers import Trainer, TrainingArguments

def setup_training(model, tokenizer, dataset, num_of_epochs):
    args = TrainingArguments(
        output_dir="../models/lora_bioasq_tinyllama_tuned",
        per_device_train_batch_size=2,
        num_train_epochs=num_of_epochs,  # Increased to 4
        learning_rate=1e-4,  # Reduced from 2e-4
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        fp16=False,
        gradient_accumulation_steps=1,
        logging_dir="./logs",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        report_to="none"
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
    )
    return trainer

# Set up trainer with 4 epochs
NO_OF_EPOCHS = 1
trainer = setup_training(model, tokenizer, tokenized_dataset, num_of_epochs=NO_OF_EPOCHS)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Train the Model

In [None]:
# Train the model
trainer.train()

Step,Training Loss
25,3.881
50,0.7927
75,0.6743
100,0.5753
125,0.6343
150,0.5839
175,0.6189
200,0.5976
225,0.4956


TrainOutput(global_step=240, training_loss=0.9584276914596558, metrics={'train_runtime': 9155.3539, 'train_samples_per_second': 0.052, 'train_steps_per_second': 0.026, 'total_flos': 766047179243520.0, 'train_loss': 0.9584276914596558, 'epoch': 1.0})

In [None]:
# Save the model
trainer.save_model("./lora_bioasq_tinyllama_tuned1")

# Save the tokenizer
tokenizer.save_pretrained("./lora_bioasq_tinyllama_tuned1")

('./lora_bioasq_tinyllama_tuned1/tokenizer_config.json',
 './lora_bioasq_tinyllama_tuned1/special_tokens_map.json',
 './lora_bioasq_tinyllama_tuned1/chat_template.jinja',
 './lora_bioasq_tinyllama_tuned1/tokenizer.model',
 './lora_bioasq_tinyllama_tuned1/added_tokens.json',
 './lora_bioasq_tinyllama_tuned1/tokenizer.json')

Evaluate the Model

In [None]:
from evaluate import load as load_metric

def evaluate_model(model, tokenizer, dataset, label, max_samples=30):
    rouge = load_metric("rouge")
    bertscore = load_metric("bertscore")
    meteor = load_metric("meteor")
    bleu = load_metric("bleu")

    refs = []
    preds = []
    print(f"\nEvaluating on {label} set (first {max_samples} samples)...")
    for i, example in enumerate(dataset):
        if i >= max_samples:
            break
        question = example['text'].split('\n')[0]
        prompt = f"{question}\nAnswer:"
        pred = generate_answer(model, tokenizer, prompt)
        preds.append(pred.strip())
        refs.append(example['text'].split('\n')[1].replace('Answer: ', '').strip())
        if i % 5 == 0:
            print(f"Sample {i+1}/{max_samples} done.")

    rouge_result = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    bert_result = bertscore.compute(predictions=preds, references=refs, lang="en")
    meteor_result = meteor.compute(predictions=preds, references=refs)
    bleu_result = bleu.compute(
        predictions=[' '.join(pred.split()) for pred in preds],
        references=[[' '.join(ref.split())] for ref in refs]
    )

    print("\nROUGE-L Score:", rouge_result['rougeL'])
    print("BERTScore F1:", sum(bert_result['f1']) / len(bert_result['f1']))
    print("METEOR Score:", meteor_result['meteor'])
    print("BLEU Score:", bleu_result['bleu'])

# Evaluate on test set
evaluate_model(model, tokenizer, dataset["test"], label="test")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Evaluating on test set (first 30 samples)...
Sample 1/30 done.
Sample 6/30 done.
Sample 11/30 done.
Sample 16/30 done.
Sample 21/30 done.
Sample 26/30 done.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



ROUGE-L Score: 0.24578130142550025
BERTScore F1: 0.8618259251117706
METEOR Score: 0.27741167751265405
BLEU Score: 0.05358951227114722


Test Sample Output

In [None]:
def generate_answer(model, tokenizer, prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    model.eval()
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_new_tokens=100,
            num_return_sequences=1,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True).strip()

# Test the prompt
example_prompt = "Question: What is the treatment for tuberculosis?\nAnswer:"
print(generate_answer(model, tokenizer, example_prompt))

Question: What is the treatment for tuberculosis?
Answer: Tuberculosis is a bacterial infection caused by Mycobacterium tuberculosis. Treatment for tuberculosis is multidrug therapy with isoniazid, rifampicin, and pyrazinamide.


###Issue Faced in First Round

Issue: Decreased evaluation metrics despite improved sample output

- Description: The first round improved the specificity of the tuberculosis answer, but test set metrics (ROUGE-L, METEOR, BLEU) dropped, indicating poor generalization.
-This could be due to:
 - Insufficient Training: 1 epoch was not enough for the model to fully adapt with the increased LoRA rank (r=16), leading to underfitting or incomplete convergence.
 - Overfitting Risk: The increased model capacity (more trainable parameters with r=16 and additional target modules) may cause the model to memorize training data patterns, reducing performance on diverse test samples.
 - Dataset Limitations: The small dataset (480 training samples) and lack of input variation may limit robustness, as seen in the token importance analysis where small input changes significantly alter outputs.
 - Learning Rate: The reduced learning rate (1e-4) may still be too high for stable convergence in 1 epoch, causing suboptimal weight updates.



### Second Round of Tuning

In [None]:
# Re-import necessary libraries
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
import random

# Simple paraphrase function (manual for efficiency, no external model)
def paraphrase_question(question):
    question = question.replace("Question: ", "")
    synonyms = {
        "what is": ["what are", "describe", "explain"],
        "treatment": ["therapy", "management", "cure"],
        "cause": ["reason", "etiology", "source"],
        "for": ["of", "related to"],
    }
    for key, options in synonyms.items():
        if key in question.lower():
            question = question.replace(key, random.choice(options))
    return f"Question: {question}"

# Reload and augment dataset
def load_and_split_data(sample_size=600):
    df = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-bioasq/data/test.parquet/part.0.parquet")
    df.dropna(subset=['question', 'answer'], inplace=True)
    df['text'] = df.apply(lambda x: f"Question: {x['question']}\nAnswer: {x['answer']}", axis=1)
    if sample_size is not None and len(df) > sample_size:
        df = df.sample(sample_size, random_state=42).reset_index(drop=True)

    # Augment training data
    augmented = []
    for _, row in df.iterrows():
        paraphrased_text = paraphrase_question(row['text'].split('\n')[0]) + '\n' + row['text'].split('\n')[1]
        augmented.append({'text': paraphrased_text})
    augmented_df = pd.DataFrame(augmented)
    df = pd.concat([df[['text']], augmented_df]).reset_index(drop=True)

    train_df, test_df = train_test_split(df[['text']], test_size=0.2, random_state=42)
    return DatasetDict({
        "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
        "test": Dataset.from_pandas(test_df.reset_index(drop=True))
    })

dataset = load_and_split_data(sample_size=600)

# Reload tokenizer
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize dataset
def preprocess(example, tokenizer, max_length=256):
    encoding = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

def tokenize_dataset(dataset, tokenizer):
    return dataset.map(lambda x: preprocess(x, tokenizer), batched=True)

tokenized_dataset = tokenize_dataset(dataset, tokenizer)

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

### Update LoRA Configuration with Higher Dropout

In [None]:
from transformers import AutoConfig, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType
import torch

def setup_model_with_lora(model_name):
    config = AutoConfig.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map=None
    )
    model = model.to("cuda" if torch.cuda.is_available() else "cpu")

    # Updated LoRA configuration
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,  # Increased from 0.05
        bias="none",
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    return model

# Load model
model = setup_model_with_lora(MODEL_NAME)



trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079


### Update Training Configuration with Gradient Accumulation

In [None]:
from transformers import Trainer, TrainingArguments

def setup_training(model, tokenizer, dataset, num_of_epochs):
    args = TrainingArguments(
        output_dir="../models/lora_bioasq_tinyllama_tuned2",
        per_device_train_batch_size=2,
        num_train_epochs=num_of_epochs,
        learning_rate=1e-4,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        fp16=False,  # Set to True if using GPU
        gradient_accumulation_steps=2,  # Simulate batch size of 4
        logging_dir="./logs",
        logging_steps=25,
        save_strategy="steps",
        save_steps=100,  # Save checkpoint every 100 steps
        eval_strategy="steps",  # Evaluate every 100 steps
        eval_steps=100,
        save_total_limit=2,
        report_to="none"
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
    )
    return trainer

# Set up trainer with 1 epoch
NO_OF_EPOCHS = 1
trainer = setup_training(model, tokenizer, tokenized_dataset, num_of_epochs=NO_OF_EPOCHS)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


### Train the Model with Early Stopping Simulation

In [None]:
# Train the model
trainer.train()

# Load the best checkpoint based on validation loss (manually check logs)
# Example: If lowest eval_loss is at step 400, load that checkpoint
from peft import PeftModel
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
).to("cuda" if torch.cuda.is_available() else "cpu")
model = PeftModel.from_pretrained(model, "./lora_bioasq_tinyllama_tuned2/checkpoint-400")

Step,Training Loss,Validation Loss
100,0.5591,0.594777


### Issue Faced in Second Round

Issue: Second round of tuning stopped prematurely and took excessive time, likely due to increased dataset size (~960 samples) and computational demands of LoRA configuration (r=16, additional target modules).

- Next Steps:
  - Reduce sample_size to 400 (~640 training samples).
  - Enable gradient_checkpointing=True to lower memory usage.
  - Lower LoRA rank to r=12.
  - Set tokenizer.pad_token = "[PAD]", update model.config.pad_token_id.
  - Train 1 epoch, learning rate 5e-5, lora_dropout=0.1.
  - Monitor CPU/memory; use GPU with fp16=True if available.
  - Target ROUGE-L > 0.27, METEOR > 0.29, BLEU > 0.06.
  - Verify output specificity and robustness with input_token_importance
  - Document metrics, losses, outputs, and runtime.

### Save the Tuned Model and Tokenizer

In [None]:
# Save the model
trainer.save_model("./lora_bioasq_tinyllama_tuned2")

# Save the tokenizer
tokenizer.save_pretrained("./lora_bioasq_tinyllama_tuned2")

### Evaluate the Model

In [None]:
from evaluate import load as load_metric

def evaluate_model(model, tokenizer, dataset, label, max_samples=30):
    rouge = load_metric("rouge")
    bertscore = load_metric("bertscore")
    meteor = load_metric("meteor")
    bleu = load_metric("bleu")

    refs = []
    preds = []
    print(f"\nEvaluating on {label} set (first {max_samples} samples)...")
    for i, example in enumerate(dataset):
        if i >= max_samples:
            break
        question = example['text'].split('\n')[0]
        prompt = f"{question}\nAnswer:"
        pred = generate_answer(model, tokenizer, prompt)
        preds.append(pred.strip())
        refs.append(example['text'].split('\n')[1].replace('Answer: ', '').strip())
        if i % 5 == 0:
            print(f"Sample {i+1}/{max_samples} done.")

    rouge_result = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    bert_result = bertscore.compute(predictions=preds, references=refs, lang="en")
    meteor_result = meteor.compute(predictions=preds, references=refs)
    bleu_result = bleu.compute(
        predictions=[' '.join(pred.split()) for pred in preds],
        references=[[' '.join(ref.split())] for ref in refs]
    )

    print("\nROUGE-L Score:", rouge_result['rougeL'])
    print("BERTScore F1:", sum(bert_result['f1']) / len(bert_result['f1']))
    print("METEOR Score:", meteor_result['meteor'])
    print("BLEU Score:", bleu_result['bleu'])

# Evaluate on test set
evaluate_model(model, tokenizer, dataset["test"], label="test")

### Test Sample Output

In [None]:
def generate_answer(model, tokenizer, prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    model.eval()
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_new_tokens=100,
            num_return_sequences=1,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True).strip()

# Test the prompt
example_prompt = "Question: What is the treatment for tuberculosis?\nAnswer:"
print(generate_answer(model, tokenizer, example_prompt))