In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/clan-complete/test_data_cleaned.csv
/kaggle/input/clan-complete/CLAN_data.csv
/kaggle/input/clan-complete/CLAN_data_cleaned_test.csv
/kaggle/input/clan-complete/CLAN_data_cleaned.csv


In [5]:
!unzip -q "/kaggle/input/your-dataset-name/t5_clan_model.zip" -d "/kaggle/working/Model"

unzip:  cannot find or open /kaggle/input/your-dataset-name/t5_clan_model.zip, /kaggle/input/your-dataset-name/t5_clan_model.zip.zip or /kaggle/input/your-dataset-name/t5_clan_model.zip.ZIP.


# All Dependencies

In [2]:
!pip install -U transformers datasets sentencepiece evaluate bert-score rouge-score matplotlib nltk sacrebleu contractions
!pip install --upgrade transformers
!pip install transformers datasets evaluate rouge-score sacrebleu bert-score --upgrade --quiet



# Preprocessing Pipeline

In [None]:
import pandas as pd
import re
import string
import contractions

def clean_text(text):
    if pd.isna(text):
        return ""

    # Expand contractions
    text = contractions.fix(text)

    # Remove twitter handles, URLs, HTML tags
    text = re.sub(r"@\w+", "", text)                     # Remove Twitter handles
    text = re.sub(r"http\S+|www\S+", "", text)           # Remove URLs
    text = re.sub(r"<.*?>", "", text)                    # Remove HTML tags

    # Remove quotes but keep the content
    text = text.replace('"', "").replace("“", "").replace("”", "")

    # Replace hashtags with just the word
    text = re.sub(r"#(\w+)", r"\1", text)

    # Remove newlines, tabs, extra whitespace
    text = text.replace("\n", " ").replace("\r", " ").replace("\t", " ")
    text = re.sub(r"\s+", " ", text)

    # Lowercase everything
    text = text.lower()

    # Remove non-informative punctuation (preserve . , ! ? for structure if needed)
    text = re.sub(f"[{re.escape(string.punctuation.replace('.', '').replace(',', '').replace('!', '').replace('?', ''))}]", "", text)

    return text.strip()

def preprocess_clan_raw_data(input_path, output_path):
    # Load CSV
    df = pd.read_csv(input_path)

    # Keep only necessary columns
    df = df[["PID", "Social Media Post", "Normalized Claim"]]

    # Drop rows with missing data
    df.dropna(subset=["Social Media Post", "Normalized Claim"], inplace=True)

    # Clean both post and normalized claim
    df["Social Media Post"] = df["Social Media Post"].apply(clean_text)
    df["Normalized Claim"] = df["Normalized Claim"].apply(clean_text)

    # Drop rows where cleaned content is now empty
    df = df[(df["Social Media Post"].str.strip() != "") & (df["Normalized Claim"].str.strip() != "")]

    # Drop duplicates
    df.drop_duplicates(subset=["Social Media Post", "Normalized Claim"], inplace=True)

    # Save cleaned dataset
    df.to_csv(output_path, index=False)
    print(f"Cleaned data saved to {output_path}")

    return df

cleaned_df = preprocess_clan_raw_data("/kaggle/input/clan-complete/CLAN_data.csv", "/kaggle/working//CLAN_data_processed.csv")


In [None]:
# For testing
import pandas as pd
df = pd.read_csv("/kaggle/working/test_data_cleaned.csv")
df

# Code to Train BART-BASE Piepeline

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate
from bert_score import score
from tqdm import tqdm


# Load and preprocess the dataset
df = pd.read_csv("/kaggle/input/clan-complete/CLAN_data_cleaned.csv")
df = df[["Social Media Post", "Normalized Claim"]].dropna()

# Split the dataset: 70% train, 15% validation, 15% test
train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Load BART tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Tokenization function
def tokenize_function(examples):
    inputs = tokenizer(examples["Social Media Post"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["Normalized Claim"], max_length=128, truncation=True, padding="max_length")
    
    labels = targets["input_ids"]
    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_seq]
        for label_seq in labels
    ]
    
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels
    }


# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Load model and device
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=1)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Load evaluation metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

# Training loop
num_epochs = 5
best_val_loss = float("inf")

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)

    model.eval()
    val_loss = 0.0
    predictions = []
    references = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            # Generate predictions
            generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=64)

            # Replace -100 in labels before decoding
            labels = torch.where(labels != -100, labels, torch.tensor(tokenizer.pad_token_id).to(labels.device))

            # Decode
            decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)


            predictions.extend(decoded_preds)
            references.extend(decoded_labels)

    avg_val_loss = val_loss / len(val_loader)

    rouge_result = rouge.compute(predictions=predictions, references=references, rouge_types=["rougeL"])
    bleu_result = bleu.compute(predictions=predictions, references=references)
    bertscore_result = score(predictions, references, lang="en", verbose=False)
    bertscore_avg = bertscore_result[2].mean().item()

    print(f"\nEpoch {epoch+1} Summary:")
    print(f"Train Loss : {avg_train_loss:.4f}")
    print(f"Val Loss   : {avg_val_loss:.4f}")
    print(f"ROUGE-L    : {rouge_result['rougeL']:.4f}")
    print(f"BLEU-4     : {bleu_result['bleu']:.4f}")
    print(f"BERTScore  : {bertscore_avg:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained("/kaggle/working//bart_model_output/final")
        tokenizer.save_pretrained("/kaggle/working//bart_model_output/final")
        print("Best model saved.")

# Save the test split for inference
test_df.to_csv("/kaggle/working/test_data_cleaned.csv", index=False)

# INFERENCE PIPELINE (BART)

In [7]:
import pandas as pd
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import Dataset
from torch.utils.data import DataLoader
import evaluate
from bert_score import score

# Load saved model and tokenizer
model_path = "/kaggle/input/t5_small/pytorch/default/1/bart_model_output/final"                                # CHNAGED WITH SAVE MODELS PATH
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained(model_path)
model.eval()

# Load test data
test_df = pd.read_csv("/kaggle/working/test_data_cleaned.csv")       # CHANGED WITH PROCESSED TEST DATA's PATH
test_df = test_df[["Social Media Post", "Normalized Claim"]].dropna()

# Tokenize test data
test_dataset = Dataset.from_pandas(test_df)

def tokenize_function(examples):
    inputs = tokenizer(examples["Social Media Post"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["Normalized Claim"], max_length=128, truncation=True, padding="max_length")
    
    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_seq]
        for label_seq in targets["input_ids"]
    ]
    
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
        "label_ids": targets["input_ids"]  # Keep original label IDs for decoding
    }

# Apply tokenization
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels", "label_ids"])

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# DataLoader for test set
test_loader = DataLoader(tokenized_test_dataset, batch_size=1)

predictions = []
references = []

# Inference loop
for batch in test_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=4,
            early_stopping=True
        )

    decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(batch["label_ids"], skip_special_tokens=True)

    predictions.extend(decoded_preds)
    references.extend(decoded_labels)

# Evaluation
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

rouge_result = rouge.compute(predictions=predictions, references=references, rouge_types=["rougeL"])
bleu_result = bleu.compute(predictions=predictions, references=references)
bertscore_result = score(predictions, references, lang="en", verbose=False)
bertscore_avg = bertscore_result[2].mean().item()

print("\nTest Set Evaluation:")
print(f"ROUGE-L : {rouge_result['rougeL']:.4f}")
print(f"BLEU-4  : {bleu_result['bleu']:.4f}")
print(f"BERTScore (F1): {bertscore_avg:.4f}")

# Optional: Save predictions to CSV
output_df = pd.DataFrame({
    "Original Post": test_df["Social Media Post"],
    "Reference Claim": references,
    "Predicted Claim": predictions
})
output_df.to_csv("/kaggle/working/bart_test_predictions.csv", index=False)


Map:   0%|          | 0/343 [00:00<?, ? examples/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Test Set Evaluation:
ROUGE-L : 0.3314
BLEU-4  : 0.2220
BERTScore (F1): 0.8822


# Code to train T5-Small Pipeline

In [None]:
!pip uninstall -y transformers
!pip install transformers==4.51.1 --quiet

In [3]:
import transformers
print(transformers.__version__)

4.51.1


In [4]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration, 
    Trainer, TrainingArguments, 
    DataCollatorForSeq2Seq, TrainerCallback
)
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bertscore

# --- 0. Check for GPU ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA not available.")

# --- 1. Load and Prepare Data ---
df = pd.read_csv("/kaggle/input/clan-complete/CLAN_data_cleaned_test.csv")
df = df[["Social Media Post", "Normalized Claim"]].dropna().drop_duplicates()
df = df[(df["Social Media Post"].str.strip() != "") & (df["Normalized Claim"].str.strip() != "")]

train_df = df.sample(frac=0.7, random_state=42)
temp_df = df.drop(train_df.index)
val_df = temp_df.sample(frac=0.5, random_state=42)
test_df = temp_df.drop(val_df.index)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))
test_ds = Dataset.from_pandas(test_df.reset_index(drop=True))

# --- 2. Tokenizer and Model ---
tokenizer = T5Tokenizer.from_pretrained("t5-small", legacy=False)
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# --- 3. Preprocessing ---
def preprocess(example):
    input_enc = tokenizer(
        "normalize: " + example["Social Media Post"],
        max_length=256,
        padding="max_length",
        truncation=True
    )
    target_enc = tokenizer(
        example["Normalized Claim"],
        max_length=128,
        padding="max_length",
        truncation=True
    )
    input_enc["labels"] = [
        (label if label != tokenizer.pad_token_id else -100)
        for label in target_enc["input_ids"]
    ]
    return input_enc

train_ds = train_ds.map(preprocess)
val_ds = val_ds.map(preprocess)
test_ds = test_ds.map(preprocess)

train_ds.set_format(type="torch")
val_ds.set_format(type="torch")
test_ds.set_format(type="torch")

# --- 4. Evaluation Callback ---
class EvalMetricsCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, **kwargs):
        model.eval()
        predictions, references = [], []

        for i in range(min(100, len(val_ds))):
            sample = val_ds[i]
            input_ids = sample["input_ids"].unsqueeze(0).to(device)
            attn_mask = sample["attention_mask"].unsqueeze(0).to(device)
            labels = sample["labels"]

            with torch.no_grad():
                gen_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attn_mask,
                    max_length=128,
                    num_beams=4
                )

            pred = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
            ref = tokenizer.decode([t for t in labels if t != -100], skip_special_tokens=True)

            predictions.append(pred)
            references.append(ref)

        # Compute BLEU-4
        bleu_scores = [
            sentence_bleu([ref.split()], pred.split(), smoothing_function=SmoothingFunction().method1)
            for pred, ref in zip(predictions, references)
        ]
        print(f"\nEpoch {int(state.epoch)} Metrics:")
        print("BLEU-4    :", round(sum(bleu_scores) / len(bleu_scores), 4))

        # ROUGE-L
        rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        rouge_scores = [rouge.score(r, p)['rougeL'].fmeasure for p, r in zip(predictions, references)]
        print("ROUGE-L   :", round(sum(rouge_scores) / len(rouge_scores), 4))

        # BERTScore
        P, R, F1 = bertscore(predictions, references, lang="en", verbose=False)
        print("BERTScore :", round(F1.mean().item(), 4))

# --- 5. Training ---
training_args = TrainingArguments(
    output_dir="./t5_clan",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    num_train_epochs=5,
    do_eval=True,
    do_train=True,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=200,
    eval_steps=200,
    save_total_limit=2,
    evaluation_strategy="steps",
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    callbacks=[EvalMetricsCallback()]
)

trainer.train()
trainer.save_model("/kaggle/working/t5_clan")
tokenizer.save_pretrained("/kaggle/working/t5_clan")

# --- 6. Predict One Sample ---
sample = test_ds[0]
input_ids = sample["input_ids"].unsqueeze(0).to(device)
attn_mask = sample["attention_mask"].unsqueeze(0).to(device)

with torch.no_grad():
    output = model.generate(input_ids=input_ids, attention_mask=attn_mask, max_length=128)

print("\nPrediction Example:")
print("Input     :", tokenizer.decode(sample["input_ids"], skip_special_tokens=True))
print("Prediction:", tokenizer.decode(output[0], skip_special_tokens=True))
print("Reference :", tokenizer.decode([i for i in sample["labels"] if i != -100], skip_special_tokens=True))

Using device: cuda
Tesla T4


Map:   0%|          | 0/330 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

# INFERENCE PIPELINE (T5-Small)

In [6]:
# --- 1. Imports ---
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bertscore
from tqdm import tqdm


# --- 2. Load Test Data ---
df = pd.read_csv("/kaggle/working/test_data_cleaned.csv")
df = df[["Social Media Post", "Normalized Claim"]].dropna().drop_duplicates()
df = df[(df["Social Media Post"].str.strip() != "") & (df["Normalized Claim"].str.strip() != "")]


# --- 3. Load Trained Model and Tokenizer ---
model_path = "/kaggle/input/t5_small/pytorch/default/1/t5_clan"  # same as used in training
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


# --- 4. Inference Function ---
def generate_prediction(text):
    input_text = "normalize: " + text
    encodings = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=256)
    input_ids = encodings["input_ids"].to(device)
    attn_mask = encodings["attention_mask"].to(device)

    with torch.no_grad():
        gen_ids = model.generate(input_ids=input_ids, attention_mask=attn_mask, max_length=128, num_beams=4)

    return tokenizer.decode(gen_ids[0], skip_special_tokens=True)


# --- 5. Predict on Test Data ---
predictions = []
for text in tqdm(df["Social Media Post"]):
    pred = generate_prediction(text)
    predictions.append(pred)

df["Predicted Claim"] = predictions


# --- 6. Evaluation Metrics ---
references = df["Normalized Claim"].tolist()
preds = df["Predicted Claim"].tolist()

# BLEU-4
bleu_scores = [
    sentence_bleu([ref.split()], pred.split(), smoothing_function=SmoothingFunction().method1)
    for ref, pred in zip(references, preds)
]
bleu4 = round(sum(bleu_scores) / len(bleu_scores), 4)

# ROUGE-L
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_scores = [rouge.score(ref, pred)['rougeL'].fmeasure for ref, pred in zip(references, preds)]
rougeL = round(sum(rouge_scores) / len(rouge_scores), 4)

# BERTScore
_, _, f1 = bertscore(preds, references, lang="en", verbose=False)
bert_score = round(f1.mean().item(), 4)


# --- 7. Output Results ---
print("\nFinal Evaluation on Test Set:")
print("BLEU-4    :", bleu4)
print("ROUGE-L   :", rougeL)
print("BERTScore :", bert_score)

# Save predictions to CSV
df.to_csv("T5_Test_Predictions.csv", index=False)


100%|██████████| 343/343 [02:44<00:00,  2.08it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Final Evaluation on Test Set:
BLEU-4    : 0.1731
ROUGE-L   : 0.3564
BERTScore : 0.8815
