## NLP Assignment 3

### QA Model

#### Necessary Installs

In [1]:
! pip install evaluate sacrebleu rouge_score -U nltk requests

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nltk-3.9.1-py3-none-any.whl 

#### Imports

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os, sys, json, torch, warnings

from torch import nn
from torch.nn import functional as F

from transformers import (AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModelForSequenceClassification, 
                        TrainingArguments, Trainer, DataCollatorWithPadding)

from datasets import load_dataset
from evaluate import load

from huggingface_hub import login
from prettytable import PrettyTable

#### Disabling Warnings

In [3]:
print("Current Working Directory: ",os.getcwd())
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

Current Working Directory:  /kaggle/working


#### Setting Up Access Token

In [4]:
os.environ["Bpp06"] = "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"  # Replace it with your Access Token

In [5]:
login(token=os.getenv("Bpp06"), add_to_git_credential=True)

Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [6]:
model_name = "meta-llama/Llama-3.2-1B"

#### Loading Pretrained Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.getenv("Bp06"))
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


#### Number of Parameters of Pretrained Model

In [9]:
def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)

#### Loading SQUAD Dataset

In [None]:
squad = load_dataset("rajpurkar/squad_v2")

#### Preprocessing SQUAD Data

In [11]:
def preprocess_squad_data(examples):
    inputs = [q + " " + c for q, c in zip(examples["question"], examples["context"])]

    start_positions = []
    end_positions = []
    ans = []
    for answer, context in zip(examples["answers"], examples["context"]):
        if len(answer["text"]) > 0:
            answer_text = answer["text"][0]
            start_idx = answer["answer_start"][0]
            end_idx = start_idx + len(answer_text)
        else:
            start_idx = 0
            end_idx = 0
            answer_text = ""

        start_positions.append(start_idx)
        end_positions.append(end_idx)
        ans.append(answer_text)
        
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    model_inputs["start_positions"] = start_positions
    model_inputs["end_positions"] = end_positions
    model_inputs["answers"] = ans

    return {k: v for k, v in model_inputs.items()}

In [None]:
train_dataset = squad["train"].select(range(8000)).map(preprocess_squad_data, batched=True)
test_dataset = squad["validation"].select(range(2000)).map(preprocess_squad_data, batched=True)

#### CustomQA Model Class

In [13]:
class CustomQAModel(nn.Module):
    def __init__(self, model_name):
        super(CustomQAModel, self).__init__()
        
        # Load configuration and base model
        self.config = AutoConfig.from_pretrained(model_name)
        self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=self.config)

        # Add a question answering head (linear layer)
        hidden_size = self.config.hidden_size
        self.qa_outputs = nn.Linear(hidden_size, 2)  # Start and end ind

    def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None):
        # Pass inputs through the base model
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )

        # Use the last hidden state for QA head
        hidden_states = outputs.hidden_states[-1]
        logits = self.qa_outputs(hidden_states)

        # Split into start and end logits
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        loss = None
        if start_positions is not None and end_positions is not None:
            # Ignore padding positions when calculating loss
            ignored_index = start_logits.size(1)  # seq_length
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            # Compute the loss using CrossEntropyLoss
            start_loss = F.cross_entropy(start_logits, start_positions, ignore_index=ignored_index)
            end_loss = F.cross_entropy(end_logits, end_positions, ignore_index=ignored_index)
            loss = (start_loss + end_loss) / 2

        return {
            "start_logits": start_logits,
            "end_logits": end_logits,
            "loss": loss,
        }
    def save_pretrained(self, save_directory):
        os.makedirs(save_directory, exist_ok=True)
        # Save model weights
        model_path = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path)

        # Save configuration (replace with actual configuration logic)
        config = {"model_name": "CustomQAModel"}
        config_path = os.path.join(save_directory, "config.json")
        with open(config_path, "w") as f:
            json.dump(config, f)
    
    @classmethod
    def from_pretrained(cls, save_directory, model_name):
        model = cls(model_name)
        model.base_model = AutoModelForCausalLM.from_pretrained(save_directory)
        model.qa_outputs.load_state_dict(torch.load(f"{save_directory}/qa_outputs.bin"))
        return model

In [None]:
qa_model = CustomQAModel(model_name)
qa_model.config.pad_token_id = qa_model.config.eos_token_id

#### Counting the Parameters of QA Mode

In [15]:
count_parameters(qa_model)

+------------------------------------------------------------+------------+
|                          Modules                           | Parameters |
+------------------------------------------------------------+------------+
|            base_model.model.embed_tokens.weight            | 262668288  |
|     base_model.model.layers.0.self_attn.q_proj.weight      |  4194304   |
|     base_model.model.layers.0.self_attn.k_proj.weight      |  1048576   |
|     base_model.model.layers.0.self_attn.v_proj.weight      |  1048576   |
|     base_model.model.layers.0.self_attn.o_proj.weight      |  4194304   |
|       base_model.model.layers.0.mlp.gate_proj.weight       |  16777216  |
|        base_model.model.layers.0.mlp.up_proj.weight        |  16777216  |
|       base_model.model.layers.0.mlp.down_proj.weight       |  16777216  |
|      base_model.model.layers.0.input_layernorm.weight      |    2048    |
| base_model.model.layers.0.post_attention_layernorm.weight  |    2048    |
|     base_m

#### Number of Parameters before Freezing the Base Model Parameters

In [16]:
total_params = sum(p.numel() for p in qa_model.parameters())
base_total_params = sum(p.numel() for p in qa_model.base_model.parameters())
trainable_params = sum(p.numel() for p in qa_model.parameters() if p.requires_grad)
print("Parameters before freezing the base model parameters")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Total Base model parameters: {base_total_params:,}")

Parameters before freezing the base model parameters
Total parameters: 1,235,818,498
Trainable parameters: 1,235,818,498
Total Base model parameters: 1,235,814,400


#### Freezing the Base Model Parameters

In [17]:
for param in qa_model.base_model.parameters():
    param.requires_grad = False

#### Total Number of Parameters after Freezing the Base Model Parameters

In [18]:
# Count total parameters
total_params = sum(p.numel() for p in qa_model.parameters())
base_total_params = sum(p.numel() for p in qa_model.base_model.parameters())
trainable_params = sum(p.numel() for p in qa_model.parameters() if p.requires_grad)
print("Parameters after Freezing the Base Model Parameters:")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Total Base model parameters: {base_total_params:,}")

Parameters after Freezing the Base Model Parameters:
Total parameters: 1,235,818,498
Trainable parameters: 4,098
Total Base model parameters: 1,235,814,400


#### Loading Evaluation Metrics

In [None]:
exact_match_metric = load("exact_match")
f1_metric = load("f1")
bleu_metric = load("bleu")
rouge_metric = load("rouge")
meteor_metric = load("meteor")
squad_metric = load("squad_v2")

In [20]:
def evaluate_model(model, device, dataset, tokenizer, metrics=["f1", "squad_v2", "bleu", "exact_match", "meteor", "rouge"]):
    model.eval()  # Set the model to evaluation mode
    model.to(device)  # Ensure the model is on the correct device
    predictions = []
    references = []
    print("Evaluating...")
    for example in dataset:
        inputs = tokenizer(
            example["question"],
            example["context"],
            max_length=512,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
    
        start_logits = outputs["start_logits"].cpu().numpy()
        end_logits = outputs["end_logits"].cpu().numpy()


        start_idx = np.argmax(start_logits)
        end_idx = np.argmax(end_logits)

        predicted_text = tokenizer.decode(
            inputs["input_ids"][0][start_idx:end_idx + 1],
            skip_special_tokens=True
        )
        predictions.append(predicted_text)

        # Ground truth answer
        references.append(example["answers"])

    # Calculate metrics
    results = {}
    if "f1" in metrics:
        f1_scores = [
            compute_f1(pred, ref) for pred, ref in zip(predictions, references)
        ]
        results["f1"] = np.mean(f1_scores)

    if "exact_match" in metrics:
        em_scores = [
            compute_exact_match(pred, ref) for pred, ref in zip(predictions, references)
        ]
        results["exact_match"] = np.mean(em_scores)

    if "bleu" in metrics:
        valid_references = [[r] for r in references]
        if predictions and valid_references:
            try:
                bleu_scores = bleu_metric.compute(
                    predictions=predictions,
                    references=valid_references
                )
                results["bleu"] = bleu_scores["bleu"]
            except ZeroDivisionError as e:
                print("ZeroDivisionError during BLEU calculation. Returning partial result.")
                results["bleu"] = '0.0 (because ZeroDivisionError during BLEU Calculation)'
        else:
            print("No valid data points for BLEU calculation. Setting BLEU score to 0.")
            results["bleu"] = '0.0 (because no valid data points for BLEU Calculation)'


    if "meteor" in metrics:
        meteor_scores = meteor_metric.compute(
            predictions=predictions,
            references=references
        )
        results["meteor"] = meteor_scores["meteor"]

    if "rouge" in metrics:
        rouge_scores = rouge_metric.compute(
            predictions=predictions,
            references=references
        )
        results["rouge"] = rouge_scores
    
    if "squad_v2" in metrics:
        # Restructure predictions
        formatted_predictions = [
            {
                "id": example["id"],
                "prediction_text": prediction,
                "no_answer_probability": 0.0  # Adjust if using no-answer probabilities
            }
            for example, prediction in zip(dataset, predictions)
        ]
    
        # Restructure references
        formatted_references = [
            {
                "id": example["id"],
                "answers": [{"text": example["answers"], "answer_start": example["start_positions"]}]
            }
            for example in dataset
        ]
    
        # Compute SQuAD v2 scores
        squad_v2_scores = squad_metric.compute(
            predictions=formatted_predictions,
            references=formatted_references
        )
        results["squad_v2"] = squad_v2_scores
    for key in metrics:
        if(type(results[key]) != dict):
            print(key, ":", results[key])
        else:
            print(key, ":")
            pprint(results[key])

from pprint import pprint
def compute_f1(pred, ref):
    pred_tokens = pred.split()
    ref_tokens = ref.split()
    common = set(pred_tokens) & set(ref_tokens)
    num_same = len(common)

    if len(pred_tokens) == 0 or len(ref_tokens) == 0:
        return int(pred_tokens == ref_tokens)
    if num_same == 0:
        return 0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(ref_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return f1


def compute_exact_match(pred, ref):
    return int(pred.strip() == ref.strip())


In [21]:
evaluate_model(qa_model, device, test_dataset, tokenizer)

Evaluating...


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


f1 : 0.3144356825189179
squad_v2 :
{'HasAns_exact': 30.2,
 'HasAns_f1': 31.836490539007123,
 'HasAns_total': 500,
 'best_exact': 30.2,
 'best_exact_thresh': 0.0,
 'best_f1': 31.836490539007123,
 'best_f1_thresh': 0.0,
 'exact': 30.2,
 'f1': 31.836490539007123,
 'total': 500}
bleu : 0.004205215763694626
exact_match : 0.302
meteor : 0.028120382620535067
rouge :
{'rouge1': 0.017202131138604027,
 'rouge2': 0.005994900218554977,
 'rougeL': 0.016735073197505806,
 'rougeLsum': 0.01678976408627315}


#### Training Arguments

In [22]:
import wandb
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    fp16=True,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=False,
    report_to="wandb", 
)

#### Defining the Trainer

In [23]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=qa_model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    eval_dataset=test_dataset
)

#### Training the Model

In [24]:
wandb.init(project="huggingface", name="llama_telugu")
wandb.watch(qa_model, log="all", log_freq=100)
trainer.train()
trainer.model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112965733333466, max=1.0…



Epoch,Training Loss,Validation Loss
0,4.8347,No log


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/tokenizer.json')

#### Evaluating Model after Training

In [25]:
qa_model.to(device)
evaluate_model(qa_model, device, test_dataset, tokenizer)

Evaluating...
f1 : 0.06528022822167888
squad_v2 :
{'HasAns_exact': 5.0,
 'HasAns_f1': 7.761453452808612,
 'HasAns_total': 500,
 'best_exact': 5.0,
 'best_exact_thresh': 0.0,
 'best_f1': 7.761453452808612,
 'best_f1_thresh': 0.0,
 'exact': 5.0,
 'f1': 7.761453452808612,
 'total': 500}
bleu : 0.005040420825422111
exact_match : 0.042
meteor : 0.05902862809853723
rouge :
{'rouge1': 0.027745307977290832,
 'rouge2': 0.013237317729345024,
 'rougeL': 0.02687145779982931,
 'rougeLsum': 0.027163246918168953}


In [26]:
wandb.finish()

VBox(children=(Label(value='0.307 MB of 0.307 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇████
train/grad_norm,█▃▆▂▅▃▅▂▁▂▂▄▅▁▂▂▂▅▂▁▁▁▂▃▂▃▁▁▃▁▁
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▆▅▄▄▂▃▃▃▃▂▃▃▄▂▃▃▃▂▂▃▂▂▂▂▂▃▁▃▁▂

0,1
eval/runtime,120.964
eval/samples_per_second,4.133
eval/steps_per_second,0.265
total_flos,0.0
train/epoch,0.9984
train/global_step,312.0
train/grad_norm,357127.15625
train/learning_rate,1e-05
train/loss,4.8347
train_loss,4.92475


#### Uploading Model to HuggingFace

In [None]:
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./saved_model",
    repo_id="bp03/QuestionAnswering_SQUADV2_Llamma_3.2_1B",
    commit_message="Added fine-tuned model"
)