## Task 2 : Claim Normalization using BART (Base) & T5 (small)
### By Gour Krishna Dey | MT24035

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Pretrained Model (BART-base & T5-small)

If you are running on Kaggle, unzip the previously trained model archive so that it can be loaded for testing.

In [None]:
# Extract the zipped T5 model into /kaggle/working/Model (-q = quiet).
# NOTE(review): "your-dataset-name" looks like a placeholder - replace it with the real dataset folder.
!unzip -q "/kaggle/input/your-dataset-name/t5_clan_model.zip" -d "/kaggle/working/Model"

# All Dependencies

In [None]:
!pip install -U transformers datasets sentencepiece evaluate bert-score rouge-score matplotlib nltk sacrebleu contractions
!pip install --upgrade transformers
!pip install transformers datasets evaluate rouge-score sacrebleu bert-score --upgrade --quiet

# Preprocessing Pipeline

This code preprocesses the given noisy input `.csv` file and writes a cleaned `.csv` file that can be used for further inference.

In [None]:
import pandas as pd
import re  # Regular expressions
import string
import contractions  # contraxtions library to expand contractions


# Characters stripped by text_cleaner: everything in string.punctuation except . , ! ?
# Compiled once instead of being rebuilt on every call.
_NOISE_PUNCT_RE = re.compile(
    "[" + re.escape("".join(ch for ch in string.punctuation if ch not in ".,!?")) + "]"
)


def text_cleaner(text):
    """Normalize one raw post/claim string for model input.

    Steps, in order: expand contractions, drop @handles, URLs and HTML tags,
    drop straight/curly double quotes, unwrap #hashtags (keeping the word),
    lowercase, strip all punctuation except ``. , ! ?`` and finally collapse
    every run of whitespace into a single space.

    Args:
        text: The raw value of a cell. Missing values (``None``/``NaN``) are
            accepted; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned, lowercased string, or ``""`` for a missing value.
    """
    if pd.isna(text):
        return ""

    # str() keeps numeric cells from crashing the string-only cleaning steps below.
    text = contractions.fix(str(text))  # Expand contractions: "can't" -> "cannot"

    text = re.sub(r"@\w+", "", text)  # Remove Twitter handles
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags

    # Remove straight and curly double quotes
    text = text.replace('"', "").replace("“", "").replace("”", "")

    text = re.sub(r"#(\w+)", r"\1", text)  # Remove the '#' but keep the hashtag word

    text = text.lower()  # Convert to lowercase

    # Remove non-informative punctuation (sentence punctuation . , ! ? is kept)
    text = _NOISE_PUNCT_RE.sub("", text)

    # Collapse whitespace LAST: removing a standalone symbol ("a - b") would otherwise
    # leave a double space behind. \s+ also covers newlines, carriage returns and tabs.
    text = re.sub(r"\s+", " ", text)

    return text.strip()


def raw_data_preprocesser(input_path, output_path):
    """Clean a raw claim-normalization CSV and write the result to disk.

    Reads ``input_path``, keeps the ``PID``, ``Social Media Post`` and
    ``Normalized Claim`` columns, drops rows where either text column is
    missing, cleans both text columns with ``text_cleaner``, drops rows that
    became empty and exact duplicate (post, claim) pairs, then saves the
    frame to ``output_path`` without the index.

    Args:
        input_path: Path of the raw CSV file.
        output_path: Path the cleaned CSV is written to.

    Returns:
        The cleaned ``pandas.DataFrame`` (original row labels are preserved).

    Raises:
        KeyError: If one of the three required columns is missing.
    """
    text_columns = ["Social Media Post", "Normalized Claim"]

    raw = pd.read_csv(input_path)

    # Work on an explicit copy so the column assignments below never write into a
    # slice of `raw` (avoids SettingWithCopyWarning and in-place mutation).
    cleaned = raw[["PID", *text_columns]].dropna(subset=text_columns).copy()

    for column in text_columns:
        cleaned[column] = cleaned[column].apply(text_cleaner)

    # Rows whose text was only noise (handles, URLs, symbols) are empty after cleaning.
    has_text = (cleaned["Social Media Post"].str.strip() != "") & (
        cleaned["Normalized Claim"].str.strip() != ""
    )
    cleaned = cleaned[has_text].drop_duplicates(subset=text_columns)

    cleaned.to_csv(output_path, index=False)
    print(f"Cleaned data saved to {output_path}")

    return cleaned


# Absolute, machine-specific paths: both must be edited before running elsewhere.
input_raw_csv = "/home/slimsense/GourKrishna/NLP3/Data/Test_data_2.csv"  ## Modify this variable with ACTUAL RAW CSV data
output_cleaned_csv = "/home/slimsense/GourKrishna/NLP3/Data/Test_data2_Processed.csv"  ## Modify this variable with ACTUAL CLEANED CSV data
# Runs the full cleaning pipeline; the cleaned frame is also kept in memory as `cleaned_df`.
cleaned_df = raw_data_preprocesser(
    input_raw_csv, output_cleaned_csv
)  ## convert input csv -> output csv

In [None]:
# For testing: load a cleaned CSV and show it as the cell output.
import pandas as pd
df = pd.read_csv("/kaggle/working/test_data_cleaned.csv")
df

# Code to Train BART-BASE Pipeline

In [None]:
import pandas as pd
import torch  # PyTorch library for tensor operations
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
)  # BART tokenizer and model
from torch.utils.data import DataLoader  # DataLoader for batching and shuffling data
from torch.optim import AdamW  # AdamW optimizer for training
from datasets import Dataset  # Hugging Face Datasets library
from sklearn.model_selection import train_test_split  # Train-test split
import evaluate  # Evaluation library for NLP tasks
from bert_score import score  # bert-score library for evaluation
from tqdm import tqdm  # Progress bar library to see progress


# Load and pre-process the dataset (only the two text columns are kept; rows with NaN are dropped)
df = pd.read_csv(
    "/home/slimsense/GourKrishna/NLP3/Data/CLAN_data_cleaned.csv"
)  ## UPDATE THIS PATH WITH THE ACTUAL PATH OF THE TRAINING CSV FILE
df = df[["Social Media Post", "Normalized Claim"]].dropna()

# Split the dataset into : 70% train, 15% validation, 15% test as mentioned in the question
# (30% is held out first, then split in half; random_state fixes both splits)
train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Load BART tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

def tokenize_function(examples):
    """Tokenize a batch of (post, claim) pairs for seq2seq training.

    Both the posts and the claims are encoded with the notebook-level
    ``tokenizer``, truncated/padded to 128 tokens. Padding positions in the
    target are replaced by -100 before being returned as ``labels``.

    Illustrative example (ids are made up):

        From - {
          'Social Media Post': 'Joe Biden is great',
          'Normalized Claim': 'Biden is amazing'
        }
        To - {
          'input_ids': [0, 314, 567, 17, 1024, 2, 1, 1, 1],
          'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0],
          'labels': [0, 201, 421, 90, 2, -100, -100, -100, -100],
        }

    Args:
        examples: Batch mapping with the keys "Social Media Post" and
            "Normalized Claim", each holding a list of strings.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels".
    """

    def encode(texts):
        # Same fixed-length settings for source and target sequences.
        return tokenizer(texts, max_length=128, truncation=True, padding="max_length")

    source_enc = encode(examples["Social Media Post"])
    target_enc = encode(examples["Normalized Claim"])

    pad_id = tokenizer.pad_token_id
    masked_targets = []
    for target_ids in target_enc["input_ids"]:
        masked_targets.append([-100 if tok == pad_id else tok for tok in target_ids])

    return {
        "input_ids": source_enc["input_ids"],
        "attention_mask": source_enc["attention_mask"],
        "labels": masked_targets,
    }


# Tokenize datasets (batched=True: tokenize_function receives lists of examples)
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Expose only the model inputs, as torch tensors
train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Load model and device (uses the GPU if available; preferable for both training & testing)
model = BartForConditionalGeneration.from_pretrained(
    "facebook/bart-base"
)  # Model for conditional generation (ex- text summarization)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# DataLoaders (only the training data is shuffled)
# NOTE(review): test_loader is created here but not used by the training loop in this cell.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=1)

# Use the AdamW optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Load evaluation metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

# Training loop: each epoch runs one pass over train_loader, then evaluates on val_loader
# (loss + ROUGE-L / BLEU / BERTScore on generated text) and saves the model whenever the
# average validation loss improves.
num_epochs = 5
best_val_loss = float("inf")

for epoch in range(num_epochs):
    # ---- Training phase ----
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing labels makes the model return the training loss in outputs.loss
        outputs = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )
        loss = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)

    # ---- Validation phase ----
    model.eval()
    val_loss = 0.0
    predictions = []
    references = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            )
            val_loss += outputs.loss.item()

            # Generate predictions
            # NOTE(review): generation is capped at 64 tokens here while targets are tokenized
            # with max_length=128 - confirm this difference is intended.
            generated_ids = model.generate(
                input_ids=input_ids, attention_mask=attention_mask, max_length=64
            )  # Here actual predictions are generated based on the input_ids and attention_mask

            # Replace -100 in labels with the pad token id so the labels can be decoded
            labels = torch.where(
                labels != -100,
                labels,
                torch.tensor(tokenizer.pad_token_id).to(labels.device),
            )

            # Decode predictions and labels
            decoded_preds = tokenizer.batch_decode(
                generated_ids, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(
                labels, skip_special_tokens=True
            )  # skip_special_tokens ex - <pad>, <s>, </s>

            predictions.extend(
                decoded_preds
            )  # Append the decoded predictions to the list
            references.extend(decoded_labels)  # Append the decoded labels to the list

    avg_val_loss = val_loss / len(val_loader)

    rouge_result = rouge.compute(
        predictions=predictions, references=references, rouge_types=["rougeL"]
    )  # ROUGE-L score computation
    bleu_result = bleu.compute(
        predictions=predictions, references=references
    )  # BLEU-4 score computation
    bertscore_result = score(
        predictions, references, lang="en", verbose=False
    )  # BERTScore computation
    # Index 2 of the BERTScore result is averaged and reported as the epoch's BERTScore
    bertscore_avg = bertscore_result[2].mean().item()

    print(f"\nEpoch {epoch+1} Summary:")  # Print the summary of the epoch
    print(f"Train Loss : {avg_train_loss:.4f}")
    print(f"Val Loss   : {avg_val_loss:.4f}")
    print(f"ROUGE-L    : {rouge_result['rougeL']:.4f}")
    print(f"BLEU-4     : {bleu_result['bleu']:.4f}")
    print(f"BERTScore  : {bertscore_avg:.4f}")

    if avg_val_loss < best_val_loss:  # Save the best model based on validation loss
        best_val_loss = avg_val_loss
        # Model and tokenizer go to the same directory so both can be reloaded from one path
        model.save_pretrained(
            "/home/slimsense/GourKrishna/NLP3/Model/bart_model_output/final"
        )
        tokenizer.save_pretrained(
            "/home/slimsense/GourKrishna/NLP3/Model/bart_model_output/final"
        )
        print("Best model saved.")

# Save the test split for inference
test_df.to_csv(
    "/home/slimsense/GourKrishna/NLP3/Data/test_data_cleaned.csv", index=False
)

# INFERENCE PIPELINE (BART)

In [1]:
import pandas as pd  # Pandas library for data manipuletion
import torch  # PyTorch library for tensor operations
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
)  # BART tokenizar and model
from datasets import Dataset  # Hugging-Face Datasets library
from torch.utils.data import DataLoader  # DataLoader for batching and shufling data
import evaluate  # evalution library for NLP tasks
from bert_score import score  # bert-score library

# Load the previously saved model and tokenizer (both are read from the same directory)
# model_path = "/Model/BART/bart_model_output/final"  # CHANGE WITH SAVED MODEL'S PATH
model_path = r"E:\MTechCSE\Study\Sem2\NLP\Assignment\Assignment_3\Final\Model\BART\bart_model_output\final"

model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained(model_path)
model.eval()  # Inference only: switch the model to evaluation mode

# Load test data for inference
test_df = pd.read_csv(
    r"E:\MTechCSE\Study\Sem2\NLP\Assignment\Assignment_3\Final\Data\Test_data.csv"
)  # CHANGE WITH PROCESSED TEST DATA's PATH (File upon which we need to test)
test_df = test_df[["Social Media Post", "Normalized Claim"]].dropna()

# Wrap the test frame as a Hugging Face Dataset (tokenized further below)
test_dataset = Dataset.from_pandas(test_df)


def tokenize_function(examples):
    """Tokenize a batch of (post, claim) pairs for inference/evaluation.

    Posts and claims are encoded with the notebook-level ``tokenizer``,
    truncated/padded to 128 tokens.

    Args:
        examples: Batch mapping with the keys "Social Media Post" and
            "Normalized Claim", each holding a list of strings.

    Returns:
        Dict with "input_ids" and "attention_mask" of the posts, "labels"
        (claim ids with every pad token id replaced by -100) and "label_ids"
        (the untouched claim ids, kept so references can be decoded).
    """
    shared_kwargs = dict(max_length=128, truncation=True, padding="max_length")

    post_enc = tokenizer(examples["Social Media Post"], **shared_kwargs)  # input sequence
    claim_enc = tokenizer(examples["Normalized Claim"], **shared_kwargs)  # target sequence

    claim_ids = claim_enc["input_ids"]
    pad_id = tokenizer.pad_token_id

    # Pad positions in the target become -100
    masked_labels = [
        [-100 if tok == pad_id else tok for tok in row] for row in claim_ids
    ]

    result = {}
    result["input_ids"] = post_enc["input_ids"]
    result["attention_mask"] = post_enc["attention_mask"]
    result["labels"] = masked_labels
    result["label_ids"] = claim_ids  # Keep original label IDs for decoding
    return result


# Apply tokenization
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels", "label_ids"]
)  # expose only these columns, as torch tensors

# Move model to device (GPU is preferred)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# DataLoader for test set (one example per batch)
test_loader = DataLoader(tokenized_test_dataset, batch_size=1)

predictions = []  # List to store the predictions
references = []  # List to store the references (actual labels)

# Inference loop
for batch in test_loader:  # Iterate over the test DataLoader
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():  # Disable gradient calculation
        # Beam search with 4 beams, at most 128 tokens
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=4,
            early_stopping=True,
        )

    decoded_preds = tokenizer.batch_decode(
        outputs, skip_special_tokens=True
    )  # Decode the generated predictions
    # "label_ids" still contain pad ids (not -100), so they can be decoded directly
    decoded_labels = tokenizer.batch_decode(
        batch["label_ids"], skip_special_tokens=True
    )  # Decode the original labels

    predictions.extend(decoded_preds)  # Predictions are appended to the list
    references.extend(decoded_labels)  # References are appended to the list

# Evaluation
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

rouge_result = rouge.compute(
    predictions=predictions, references=references, rouge_types=["rougeL"]
)  # ROUGE-L score computation
bleu_result = bleu.compute(
    predictions=predictions, references=references
)  # BLEU-4 score computation
bertscore_result = score(
    predictions, references, lang="en", verbose=False
)  # BERTScore computation
# Index 2 of the BERTScore result is averaged and printed below as the F1 score
bertscore_avg = bertscore_result[2].mean().item()

print("\nTest Set Evaluation:")  # Print the evaluation results
print(f"ROUGE-L : {rouge_result['rougeL']:.4f}")
print(f"BLEU-4  : {bleu_result['bleu']:.4f}")
print(f"BERTScore (F1): {bertscore_avg:.4f}")

# One row per test example: original post, decoded reference and generated claim
output_df = pd.DataFrame(
    {  # DataFrame to store the results
        "Original Post": test_df["Social Media Post"],
        "Reference Claim": references,
        "Predicted Claim": predictions,
    }
)
output_df.to_csv(
    r"E:\MTechCSE\Study\Sem2\NLP\Assignment\Assignment_3\Final\Data\bart_test_predictions.csv", index=False
)  # WILL SAVE THE predictions

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 20/20 [00:00<00:00, 312.66 examples/s]
Using the latest cached version of the module from C:\Users\Gour krishna Dey\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--bleu\9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Wed Jun  4 10:47:23 2025) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Test Set Evaluation:
ROUGE-L : 0.2407
BLEU-4  : 0.0592
BERTScore (F1): 0.8721


# Code to train T5-Small Pipeline

In [None]:
# Replace whatever transformers version is installed with the pinned 4.51.1 release.
!pip uninstall -y transformers
!pip install transformers==4.51.1 --quiet

In [None]:
# Print the transformers version that the kernel actually imports.
import transformers
print(transformers.__version__)

In [None]:
import pandas as pd  # Data manipulation and analysis
import torch  # PyTorch library for tensor opn
from datasets import Dataset  # Hugging Face Datasets library
from transformers import (  # Tokenizer and model for T5
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)
from nltk.translate.bleu_score import (
    sentence_bleu,
    SmoothingFunction,
)  # BLEU score computation
from rouge_score import rouge_scorer  # ROUGE score computation
from bert_score import score as bertscore  # BERTScore computation
from transformers import TrainingArguments

# Diagnostic: show which module the TrainingArguments class is imported from
print("TrainingArguments from:", TrainingArguments.__module__)


# Check for GPU (preferable)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # name of the first CUDA device
else:
    print("CUDA not available.")

# Data Loading and Preprocessing
df = pd.read_csv("/home/slimsense/GourKrishna/NLP3/Data/CLAN_data_cleaned.csv")
df = df[["Social Media Post", "Normalized Claim"]].dropna().drop_duplicates()
df = df[
    (df["Social Media Post"].str.strip() != "")
    & (df["Normalized Claim"].str.strip() != "")
]  # Drop empty strings after cleaning

# Split the dataset into train, validation, and test sets (70-15-15 as mentioned in the question)
# Each split is sampled, then removed by index label from what remains.
train_df = df.sample(frac=0.7, random_state=42)  # 70% for training
temp_df = df.drop(train_df.index)
val_df = temp_df.sample(frac=0.5, random_state=42)  # 15% for validation
test_df = temp_df.drop(val_df.index)

# Convert pandas DataFrames to Hugging Face Datasets (index reset so each split starts at 0)
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))
test_ds = Dataset.from_pandas(test_df.reset_index(drop=True))

# Load T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small", legacy=False)
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(
    device
)  # Load T5 model


# Preprocessing function
def preprocess(example):
    """Tokenize one (post, claim) example for T5.

    The post is prefixed with "normalize: " and encoded to a fixed length of
    256 tokens; the claim is encoded to a fixed length of 128 tokens and its
    pad positions are replaced by -100 before being stored as "labels".

    Args:
        example: Mapping with the string fields "Social Media Post" and
            "Normalized Claim".

    Returns:
        The tokenizer output for the post, with an added "labels" entry.
    """
    prompt = "normalize: " + example["Social Media Post"]  # task prefix + raw post
    features = tokenizer(
        prompt, max_length=256, padding="max_length", truncation=True
    )

    target_ids = tokenizer(
        example["Normalized Claim"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )["input_ids"]

    pad_id = tokenizer.pad_token_id
    features["labels"] = [-100 if tok == pad_id else tok for tok in target_ids]
    return features


# Apply preprocessing to the datasets (not batched: preprocess receives one example at a time)
train_ds = train_ds.map(preprocess)
val_ds = val_ds.map(preprocess)
test_ds = test_ds.map(preprocess)

# Set the format for PyTorch
train_ds.set_format(type="torch")
val_ds.set_format(type="torch")
test_ds.set_format(type="torch")


# Evaluation-metrics callback
class EvalMetricsCallback(TrainerCallback):
    """Print BLEU-4, ROUGE-L and BERTScore after every Trainer evaluation.

    Uses the notebook-level ``model``, ``tokenizer``, ``val_ds`` and ``device``.
    Only the first 100 validation examples (or fewer, if the set is smaller)
    are generated and scored.
    """

    def on_evaluate(self, args, state, control, **kwargs):
        """Generate claims for a slice of the validation set and print the metrics.

        Args:
            args: Training arguments passed by the Trainer (unused).
            state: Trainer state; only ``state.epoch`` is read, for the header line.
            control: Trainer control object (unused).
            **kwargs: Extra objects passed by the Trainer (unused).
        """
        model.eval()
        predictions, references = [], []

        for i in range(min(100, len(val_ds))):
            sample = val_ds[i]  # Get the sample from the validation dataset
            input_ids = sample["input_ids"].unsqueeze(0).to(device)  # Add batch dimension
            attn_mask = sample["attention_mask"].unsqueeze(0).to(device)
            labels = sample["labels"]

            with torch.no_grad():  # Disable gradient calculation
                gen_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attn_mask,
                    max_length=128,
                    num_beams=4,
                )

            pred = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
            # -100 marks masked label positions and cannot be decoded, so it is filtered out
            ref = tokenizer.decode(
                [t for t in labels if t != -100], skip_special_tokens=True
            )

            predictions.append(pred)
            references.append(ref)

        # Nothing to score (empty validation set): avoid dividing by zero below.
        if not predictions:
            print("\nNo validation samples available; skipping metric computation.")
            return

        # state.epoch is None when evaluation runs before any training step.
        epoch_label = int(state.epoch) if state.epoch is not None else 0

        # Compute BLEU-4 (sentence level, averaged); the smoothing function is built once
        smoothing = SmoothingFunction().method1
        bleu_scores = [
            sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothing)
            for pred, ref in zip(predictions, references)
        ]
        print(f"\nEpoch {epoch_label} Metrics:")
        print("BLEU-4    :", round(sum(bleu_scores) / len(bleu_scores), 4))

        # ROUGE-L (F-measure per pair, averaged)
        rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
        rouge_scores = [
            rouge.score(r, p)["rougeL"].fmeasure
            for p, r in zip(predictions, references)
        ]
        print("ROUGE-L   :", round(sum(rouge_scores) / len(rouge_scores), 4))

        # BERTScore (mean F1)
        P, R, F1 = bertscore(predictions, references, lang="en", verbose=False)
        print("BERTScore :", round(F1.mean().item(), 4))


# Training arguments
# `eval_strategy` is the current name of the former `evaluation_strategy` argument;
# the old spelling is deprecated in recent transformers releases, so the new name is
# used to stay compatible with the transformers version pinned for this notebook.
training_args = TrainingArguments(
    output_dir="/home/slimsense/GourKrishna/NLP3/Model/T5/t5_clan",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    num_train_epochs=5,
    do_eval=True,
    do_train=True,
    logging_dir="./logs",
    logging_steps=100,  # log every 100 steps
    save_steps=200,  # checkpoint every 200 steps ...
    eval_steps=200,  # ... and evaluate at the same interval
    save_total_limit=2,  # keep at most two checkpoints on disk
    eval_strategy="steps",
    report_to="none",
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model)  # Data collator for T5

# The callback prints BLEU / ROUGE-L / BERTScore each time the Trainer evaluates
trainer = Trainer(  # Trainer for T5
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    callbacks=[EvalMetricsCallback()],
)

trainer.train()  # Train the model
# Save the model and tokenizer into the same directory
trainer.save_model("/home/slimsense/GourKrishna/NLP3/Model/T5/t5_clan")
tokenizer.save_pretrained("/home/slimsense/GourKrishna/NLP3/Model/T5/t5_clan")

# Inference: quick sanity check on the first test example
sample = test_ds[0]
input_ids = sample["input_ids"].unsqueeze(0).to(device)  # add batch dimension
attn_mask = sample["attention_mask"].unsqueeze(0).to(device)

# Generate predictions
with torch.no_grad():
    output = model.generate(
        input_ids=input_ids, attention_mask=attn_mask, max_length=128
    )

print("\nPrediction Example :")  # Print the example for debug
print("Input     :", tokenizer.decode(sample["input_ids"], skip_special_tokens=True))
print("Prediction:", tokenizer.decode(output[0], skip_special_tokens=True))
# -100 entries are masked label positions and are filtered out before decoding
print(
    "Reference :",
    tokenizer.decode(
        [i for i in sample["labels"] if i != -100], skip_special_tokens=True
    ),
)

# INFERENCE PIPELINE (T5-Small)

In [1]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from nltk.translate.bleu_score import (
    sentence_bleu,
    SmoothingFunction,
)  # BLEU score computation
from rouge_score import rouge_scorer
from bert_score import score as bertscore
from tqdm import tqdm


# Data Loading and Preprocessing
df = pd.read_csv(
    r"E:\MTechCSE\Study\Sem2\NLP\Assignment\Assignment_3\Final\Data\Test_data.csv"
)  ## CHANGE this path to your processed test.csv data (the file the pipeline is applied to)
df = df[["Social Media Post", "Normalized Claim"]].dropna().drop_duplicates()
# Keep only rows where both texts are non-empty. The explicit .copy() makes `df` an
# independent frame, so adding the "Predicted Claim" column later does not write into
# a slice of the loaded data (avoids pandas' SettingWithCopyWarning).
df = df[
    (df["Social Media Post"].str.strip() != "")
    & (df["Normalized Claim"].str.strip() != "")
].copy()


# Load the T5 tokenizer and the previously saved model (both from the same directory)
model_path = r"E:\MTechCSE\Study\Sem2\NLP\Assignment\Assignment_3\Final\Model\T5\t5_clan"  # Change the model path also (to the saved checkpoint directory)
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Move model to device (GPU is preferred)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only: switch the model to evaluation mode


# Inference function
def generate_prediction(text):
    """Generate the normalized claim for a single social-media post.

    The post is prefixed with "normalize: ", tokenized (truncated to 256
    tokens), and decoded from a 4-beam search limited to 128 tokens. Uses the
    notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The (already cleaned) post as a string.

    Returns:
        The decoded first generated sequence, without special tokens.
    """
    batch = tokenizer(
        "normalize: " + text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
    )

    # No gradients are needed for generation
    with torch.no_grad():
        generated = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            max_length=128,
            num_beams=4,
        )

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


# Generate predictions for the test set (one post at a time, with a progress bar)
predictions = []
for text in tqdm(df["Social Media Post"]):
    pred = generate_prediction(text)
    predictions.append(pred)

# Store the predictions next to the posts and references, in row order
df["Predicted Claim"] = predictions


# Evaluation metrics
references = df["Normalized Claim"].tolist()
preds = df["Predicted Claim"].tolist()

# BLEU-4 (sentence level on whitespace tokens, smoothed, then averaged)
bleu_scores = [
    sentence_bleu(
        [ref.split()], pred.split(), smoothing_function=SmoothingFunction().method1
    )
    for ref, pred in zip(references, preds)
]
bleu4 = round(sum(bleu_scores) / len(bleu_scores), 4)

# ROUGE-L (F-measure per pair, then averaged)
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
rouge_scores = [
    rouge.score(ref, pred)["rougeL"].fmeasure for ref, pred in zip(references, preds)
]
rougeL = round(sum(rouge_scores) / len(rouge_scores), 4)

# BERTScore (mean of the third returned value, named f1 here)
_, _, f1 = bertscore(preds, references, lang="en", verbose=False)
bert_score = round(f1.mean().item(), 4)


# Print evaluation results
print("\nFinal Evaluation on Test Set:")
print("BLEU-4    :", bleu4)
print("ROUGE-L   :", rougeL)
print("BERTScore :", bert_score)

# Save predictions to CSV (posts, references and predictions; index not written)
df.to_csv(r"E:\MTechCSE\Study\Sem2\NLP\Assignment\Assignment_3\Final\Data\T5_Test_Predictions.csv", index=False)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 20/20 [00:26<00:00,  1.34s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Final Evaluation on Test Set:
BLEU-4    : 0.0442
ROUGE-L   : 0.2804
BERTScore : 0.8659
