In [35]:
!pip install transformers datasets evaluate torch sentencepiece sacrebleu rouge_score bert-score



In [36]:
import pandas as pd
import torch
import numpy as np
import os
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    TrainerCallback,
    DataCollatorForSeq2Seq,
    GenerationConfig
)
from datasets import Dataset
import evaluate

In [37]:
# Load the CSV, keep the two text columns under shorter names, and drop
# rows with missing values (row labels are renumbered once, right here).
df = (
    pd.read_csv("/kaggle/input/clan-data/CLAN_data_cleaned.csv")
    .loc[:, ["Social Media Post", "Normalized Claim"]]
    .rename(columns={
        "Social Media Post": "noisy_claim",
        "Normalized Claim": "normalized_claim",
    })
    .dropna()
    .reset_index(drop=True)
)

# Discard rows where either field is an empty string; surviving rows keep
# their labels, which the splits below use to stay disjoint.
has_text = df["noisy_claim"].ne("") & df["normalized_claim"].ne("")
df = df[has_text]

# 70 / 15 / 15 split: sample the training rows, then halve the remainder.
train_df = df.sample(frac=0.7, random_state=42)
temp_df = df.loc[~df.index.isin(train_df.index)]
val_df = temp_df.sample(frac=0.5, random_state=42)
test_df = temp_df.loc[~temp_df.index.isin(val_df.index)]

dataset = {
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (
        ("train", train_df),
        ("validation", val_df),
        ("test", test_df),
    )
}
dataset

{'train': Dataset({
     features: ['noisy_claim', 'normalized_claim', '__index_level_0__'],
     num_rows: 1597
 }),
 'validation': Dataset({
     features: ['noisy_claim', 'normalized_claim', '__index_level_0__'],
     num_rows: 342
 }),
 'test': Dataset({
     features: ['noisy_claim', 'normalized_claim', '__index_level_0__'],
     num_rows: 342
 })}

In [38]:
# Initialize model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Initialize model
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto"
)

# Register every markup tag that the prompts and targets use. The closing
# "</normalized>" tag is appended to each target during preprocessing, so it
# is registered too; otherwise it would be broken into sub-word pieces while
# the other three tags are single tokens.
tokenizer.add_tokens(["<claim>", "</claim>", "<normalized>", "</normalized>"])
# Grow the embedding matrix so the newly added token ids have rows.
model.resize_token_embeddings(len(tokenizer))

# Decoding starts from the pad token, so make that explicit on the config.
model.config.decoder_start_token_id = tokenizer.pad_token_id

In [39]:
# The markup tags are registered (and the embeddings resized) once, in the
# model set-up cell; repeating that registration here was redundant.

# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of noisy/normalized claim pairs for seq2seq training.

    Args:
        examples: batch mapping with "noisy_claim" and "normalized_claim"
            lists, as passed by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenized inputs (padded/truncated to 512 tokens) with an extra
        "labels" entry holding the target ids (padded/truncated to 128
        tokens). Padded label positions are set to -100 so they are ignored
        by the loss; anything that decodes these labels must map -100 back
        to `tokenizer.pad_token_id` first.
    """
    inputs = ["normalize claim: <claim>" + str(claim) + "</claim>"
              for claim in examples["noisy_claim"]]
    targets = ["<normalized>" + str(norm) + "</normalized>"
               for norm in examples["normalized_claim"]]

    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        targets,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Without this masking the loss is also computed on the padding that
    # fills each target up to 128 tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

tokenized_dataset = {k: v.map(preprocess_function, batched=True) for k, v in dataset.items()}

Map:   0%|          | 0/1597 [00:00<?, ? examples/s]

Map:   0%|          | 0/342 [00:00<?, ? examples/s]

Map:   0%|          | 0/342 [00:00<?, ? examples/s]

In [40]:
# Metrics
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
    """Decode generated and reference ids and score them.

    Args:
        eval_pred: pair of (predictions, labels) token-id arrays handed over
            by the trainer; predictions may arrive wrapped in a tuple.

    Returns:
        dict with "rougeL", "bertscore" (mean F1) and "bleu4" as floats.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored positions and is not a valid token id, so swap it
    # for the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [p.strip() for p in decoded_preds]
    # One reference per prediction, wrapped in a list for the metric APIs.
    decoded_labels = [[l.strip()] for l in decoded_labels]

    rouge_result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    bertscore_result = bertscore.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        lang="en"
    )

    bleu_result = bleu.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )

    return {
        # The `evaluate` ROUGE metric already returns the aggregated
        # F-measure as a plain number; there is no `.mid.fmeasure` to unwrap.
        "rougeL": float(rouge_result["rougeL"]),
        "bertscore": float(np.mean(bertscore_result["f1"])),
        "bleu4": float(bleu_result["bleu"])
    }

In [41]:
# Training arguments (FP16 stays disabled)
training_args = Seq2SeqTrainingArguments(
    output_dir="./T5-claim-normalization",
    eval_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    generation_config=GenerationConfig(
        max_length=128,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        # A GenerationConfig built from scratch has no special-token ids, and
        # the trainer installs it as the model's generation config. Copy the
        # ids from the model config so generation during evaluation knows
        # where decoding starts and stops.
        decoder_start_token_id=model.config.decoder_start_token_id,
        eos_token_id=model.config.eos_token_id,
        pad_token_id=model.config.pad_token_id
    ),
    fp16=False,  # Disabled FP16 to avoid gradient issues
    logging_steps=50,
    report_to="none"
)


In [42]:
# Training logger callback
class TrainingLogger(TrainerCallback):
    """Print one markdown-style table row per evaluation pass.

    The row is emitted from `on_evaluate`, which receives the metrics of the
    evaluation that just finished. `on_epoch_end` runs before the trainer
    evaluates that epoch, so printing from there showed the previous epoch's
    metrics (and never the final epoch's).
    """

    def on_train_begin(self, args, state, control, **kwargs):
        """Print the table header once, when training starts."""
        print("\n| Epoch | Train Loss | Valid Loss | ROUGE-L | BERTScore | BLEU-4 |")

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Print the row for the evaluation whose `metrics` were just computed."""
        metrics = metrics or {}
        if "eval_loss" not in metrics:
            return

        # Most recent training-loss entry logged so far (None if nothing has
        # been logged yet).
        train_log = next(
            (log for log in reversed(state.log_history)
             if "loss" in log and "eval_loss" not in log),
            None
        )
        train_loss = f"{train_log['loss']:.4f}" if train_log else "n/a"

        # state.epoch is a float; round it instead of truncating so a value
        # such as 0.999 is not reported as epoch 0.
        epoch = int(round(state.epoch)) if state.epoch is not None else 0

        print(f"| {epoch:<5} | "
              f"{train_loss} | "
              f"{metrics['eval_loss']:.4f} | "
              f"{metrics.get('eval_rougeL', 0):.4f} | "
              f"{metrics.get('eval_bertscore', 0):.4f} | "
              f"{metrics.get('eval_bleu4', 0):.4f} |")


In [43]:
# Release cached, currently unused GPU memory before the trainer is built
# (the trainer cell below issues the same call again).
torch.cuda.empty_cache()

In [None]:
# Batch collator; it is given the model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# Free cached GPU memory before the trainer is created.
torch.cuda.empty_cache()

# Gather the trainer's pieces in one place, then build it.
trainer_setup = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[TrainingLogger()],
)
trainer = Seq2SeqTrainer(**trainer_setup)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()


| Epoch | Train Loss | Valid Loss | ROUGE-L | BERTScore | BLEU-4 |


Epoch,Training Loss,Validation Loss


In [None]:
# Write the fine-tuned model and its tokenizer side by side in one folder.
export_dir = "/kaggle/working/T5-claim-normalization"
os.makedirs(export_dir, exist_ok=True)
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

In [None]:
# Score the held-out test split; metric keys carry the "test_" prefix.
test_results = trainer.predict(tokenized_dataset["test"])
test_metrics = test_results.metrics

print("\nTest Set Metrics:")
print(f"ROUGE-L: {test_metrics['test_rougeL']:.4f}")
print(f"BERTScore: {test_metrics['test_bertscore']:.4f}")
print(f"BLEU-4: {test_metrics['test_bleu4']:.4f}")

In [None]:
def load_claim_normalizer(model_dir, tokenizer_dir):
    """Load a fine-tuned claim normalizer and place it on the best device.

    Args:
        model_dir: directory holding the fine-tuned weights, either a
            `save_pretrained` export or a legacy "t5_claim_norm.pth"
            state-dict file.
        tokenizer_dir: directory the tokenizer was saved to.

    Returns:
        (model, tokenizer, device), with the model in eval mode on `device`.
    """
    tokenizer = T5Tokenizer.from_pretrained(tokenizer_dir)

    legacy_weights = os.path.join(model_dir, "t5_claim_norm.pth")
    if os.path.isfile(legacy_weights):
        # Legacy layout: a bare state dict. The base model's embeddings must
        # be resized to the extended vocabulary first, otherwise the saved
        # tensors do not match the freshly loaded model's shapes.
        model = T5ForConditionalGeneration.from_pretrained("t5-base")
        model.resize_token_embeddings(len(tokenizer))
        model.load_state_dict(torch.load(legacy_weights, map_location="cpu"))
    else:
        # Layout produced by `model.save_pretrained(model_dir)`.
        model = T5ForConditionalGeneration.from_pretrained(model_dir)

    # Decoding starts from the pad token.
    model.config.decoder_start_token_id = tokenizer.pad_token_id

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()
    return model, tokenizer, device

def normalize_claim(text, model, tokenizer, device):
    """Generate the normalized form of one noisy claim.

    Args:
        text: raw claim text; it is wrapped in the "normalize claim:" prompt.
        model: object exposing `generate`.
        tokenizer: callable tokenizer that also provides `decode`.
        device: device the encoded prompt is moved to before generation.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    prompt = f"normalize claim: <claim>{text}</claim>"
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    encoded = encoded.to(device)

    # Beam-search settings used for decoding.
    generation_options = {
        "max_length": 128,
        "num_beams": 5,
        "early_stopping": True,
        "no_repeat_ngram_size": 2,
    }
    generated = model.generate(encoded.input_ids, **generation_options)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

sample_input = "COVID vaccines contain microchips to track people"

# `device` only exists as a local inside load_claim_normalizer, so take the
# device from the in-memory model instead of relying on an undefined name.
device = model.device
# Make sure dropout is disabled for this inference call.
model.eval()

normalized_output = normalize_claim(sample_input, model, tokenizer, device)
print(f"\nInput: {sample_input}")
print(f"Normalized: {normalized_output}")