In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
!pip install transformers datasets evaluate torch sentencepiece sacrebleu



In [7]:
import pandas as pd
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainerCallback
from datasets import Dataset
import evaluate
import numpy as np

In [8]:
df = pd.read_csv("/content/drive/MyDrive/ClaimNormalization/CLAN_data_cleaned.csv")
df = df[["Social Media Post", "Normalized Claim"]]
df.columns = ["noisy_claim", "normalized_claim"]

train_df = df.sample(frac=0.7, random_state=42)
temp_df = df.drop(train_df.index)
val_df = temp_df.sample(frac=0.5, random_state=42)
test_df = temp_df.drop(val_df.index)

# Convert to Dataset format
dataset = {
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
    "test": Dataset.from_pandas(test_df)
}
dataset

{'train': Dataset({
     features: ['noisy_claim', 'normalized_claim', '__index_level_0__'],
     num_rows: 1603
 }),
 'validation': Dataset({
     features: ['noisy_claim', 'normalized_claim', '__index_level_0__'],
     num_rows: 344
 }),
 'test': Dataset({
     features: ['noisy_claim', 'normalized_claim', '__index_level_0__'],
     num_rows: 343
 })}

In [9]:
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Add special tokens
tokenizer.add_tokens(["<claim>", "</claim>", "<normalized>"])
model.resize_token_embeddings(len(tokenizer))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Embedding(32103, 768)

In [10]:
def preprocess_function(examples):
    inputs = ["normalize claim: <claim>" + str(claim) + "</claim>" for claim in examples["noisy_claim"]]
    targets = ["<normalized>" + str(norm) + "</normalized>" for norm in examples["normalized_claim"]]

    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        targets,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = {k: v.map(preprocess_function, batched=True) for k, v in dataset.items()}

Map:   0%|          | 0/1603 [00:00<?, ? examples/s]

Map:   0%|          | 0/344 [00:00<?, ? examples/s]

Map:   0%|          | 0/343 [00:00<?, ? examples/s]

In [11]:
from transformers import GenerationConfig

training_args = Seq2SeqTrainingArguments(
    output_dir="./T5-claim-normalization",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    generation_config=GenerationConfig(
        max_length=128,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    ),
    fp16=True,
    logging_steps=50,
    report_to="none"
)




In [12]:
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [[l.strip()] for l in decoded_labels]

    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    bertscore_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "rougeL": rouge_result["rougeL"].mid.fmeasure,
        "bertscore": np.mean(bertscore_result["f1"]),
        "bleu4": bleu_result["bleu"]
    }

In [13]:
class TrainingLogger(TrainerCallback):
    def on_train_begin(self, args, state, control, **kwargs):
        print("\n| Epoch | Training Loss | Validation Loss | ROUGE-L | BERTScore | BLEU-4 |")

    def on_epoch_end(self, args, state, control, **kwargs):
        logs = {}
        for log in state.log_history:
            if "eval_rougeL" in log:
                logs = {
                    "epoch": int(log["epoch"]),
                    "eval_loss": log["eval_loss"],
                    "rougeL": log["eval_rougeL"],
                    "bertscore": log["eval_bertscore"],
                    "bleu4": log["eval_bleu4"]
                }
            if "loss" in log and "eval_loss" not in log:
                logs["train_loss"] = log["loss"]

        print(f"| {logs['epoch']:<5} | {logs['train_loss']:.4f} | {logs['eval_loss']:.4f} | "
              f"{logs['rougeL']:.4f} | {logs['bertscore']:.4f} | {logs['bleu4']:.4f} |")

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[TrainingLogger()]
)

trainer.train()

  trainer = Seq2SeqTrainer(



| Epoch | Training Loss | Validation Loss | ROUGE-L | BERTScore | BLEU-4 |


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [None]:
model_save_path = "/content/drive/MyDrive/ClaimNormalization/T5_Small_5/t5_base_claim_norm.pth"
torch.save(model.state_dict(), model_save_path)
tokenizer.save_pretrained("/content/drive/MyDrive/ClaimNormalization/T5_Small_5/t5_tokenizer")

In [None]:
test_results = trainer.predict(tokenized_dataset["test"])
print("\nTest Set Metrics:")
print(f"ROUGE-L: {test_results.metrics['test_rougeL']:.4f}")
print(f"BERTScore: {test_results.metrics['test_bertscore']:.4f}")
print(f"BLEU-4: {test_results.metrics['test_bleu4']:.4f}")

In [None]:
!pip install torch transformers  # First install dependencies

from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

def load_claim_normalizer(model_path, tokenizer_path):
    """Load saved model and tokenizer"""
    # Load tokenizer with special tokens
    tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)

    # Initialize model with proper embedding size
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    model.resize_token_embeddings(len(tokenizer))  # Critical for custom tokens

    # Load trained weights
    model.load_state_dict(torch.load(model_path))
    model.eval()

    return model, tokenizer

def normalize_claim(text, model, tokenizer):

    input_text = f"normalize claim: <claim>{text}</claim>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    outputs = model.generate(
        input_ids,
        max_length=128,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Usage
model, tokenizer = load_claim_normalizer(
    model_path="/content/drive/MyDrive/ClaimNormalization/T5_Base/t5_base_claim_norm.pth",
    tokenizer_path="/content/drive/MyDrive/ClaimNormalization/T5_Base/t5_tokenizer"
)

test_claim = "COVID vax causes heart attacks!!"
print(f"Input: {test_claim}")
print(f"Normalized: {normalize_claim(test_claim, model, tokenizer)}")
