In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/clan-dataset/CLAN_data.csv
/kaggle/input/clan-dataset/2_Cleaning_Paper.ipynb
/kaggle/input/clan-dataset/CLAN_data_cleaned.csv


## Dependencies Install

In [8]:
!pip install pandas torch scikit-learn transformers datasets evaluate bert-score tqdm bleu rogue rouge-score sacrebleu bert-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=e073351eb93d771f03ba52d5735ede0f5f6c41286a960f1a2864705820e929ed
  Stored in directory: /root/.cache/pip/wheels/5f/dd/

# Training

In [10]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate
from bert_score import score
from tqdm import tqdm


# Load and preprocess the dataset
df = pd.read_csv("/kaggle/input/clan-dataset/CLAN_data_cleaned.csv")
df = df[["Social Media Post", "Normalized Claim"]].dropna()

# Split the dataset: 70% train, 15% validation, 15% test
train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Load BART tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Tokenization function
def tokenize_function(examples):
    inputs = tokenizer(examples["Social Media Post"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["Normalized Claim"], max_length=128, truncation=True, padding="max_length")
    
    labels = targets["input_ids"]
    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_seq]
        for label_seq in labels
    ]
    
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels
    }


# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Load model and device
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=1)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Load evaluation metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

# Training loop
num_epochs = 5
best_val_loss = float("inf")

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)

    model.eval()
    val_loss = 0.0
    predictions = []
    references = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            # Generate predictions
            generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=64)

            # Replace -100 in labels before decoding
            labels = torch.where(labels != -100, labels, torch.tensor(tokenizer.pad_token_id).to(labels.device))

            # Decode
            decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)


            predictions.extend(decoded_preds)
            references.extend(decoded_labels)

    avg_val_loss = val_loss / len(val_loader)

    rouge_result = rouge.compute(predictions=predictions, references=references, rouge_types=["rougeL"])
    bleu_result = bleu.compute(predictions=predictions, references=references)
    bertscore_result = score(predictions, references, lang="en", verbose=False)
    bertscore_avg = bertscore_result[2].mean().item()

    print(f"\nEpoch {epoch+1} Summary:")
    print(f"Train Loss : {avg_train_loss:.4f}")
    print(f"Val Loss   : {avg_val_loss:.4f}")
    print(f"ROUGE-L    : {rouge_result['rougeL']:.4f}")
    print(f"BLEU-4     : {bleu_result['bleu']:.4f}")
    print(f"BERTScore  : {bertscore_avg:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained("/kaggle/working//bart_model_output/final")
        tokenizer.save_pretrained("/kaggle/working//bart_model_output/final")
        print("Best model saved.")

# Save the test split for inference
test_df.to_csv("/kaggle/working/test_data_cleaned.csv", index=False)


Map:   0%|          | 0/1596 [00:00<?, ? examples/s]

Map:   0%|          | 0/342 [00:00<?, ? examples/s]

Map:   0%|          | 0/343 [00:00<?, ? examples/s]

Epoch 1 Training: 100%|██████████| 200/200 [01:07<00:00,  2.96it/s]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1 Summary:
Train Loss : 2.9860
Val Loss   : 2.6923
ROUGE-L    : 0.3349
BLEU-4     : 0.2014
BERTScore  : 0.8812




Best model saved.


Epoch 2 Training: 100%|██████████| 200/200 [01:07<00:00,  2.98it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 2 Summary:
Train Loss : 2.3344
Val Loss   : 2.6490
ROUGE-L    : 0.3410
BLEU-4     : 0.2143
BERTScore  : 0.8820
Best model saved.


Epoch 3 Training: 100%|██████████| 200/200 [01:06<00:00,  2.99it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 3 Summary:
Train Loss : 1.9392
Val Loss   : 2.6619
ROUGE-L    : 0.3484
BLEU-4     : 0.2210
BERTScore  : 0.8835


Epoch 4 Training: 100%|██████████| 200/200 [01:06<00:00,  3.00it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 4 Summary:
Train Loss : 1.5860
Val Loss   : 2.7456
ROUGE-L    : 0.3412
BLEU-4     : 0.2142
BERTScore  : 0.8839


Epoch 5 Training: 100%|██████████| 200/200 [01:06<00:00,  3.01it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 5 Summary:
Train Loss : 1.4011
Val Loss   : 2.8829
ROUGE-L    : 0.3493
BLEU-4     : 0.2231
BERTScore  : 0.8846


## Test Data Evaluate

In [21]:
# Final evaluation on the test set
model.eval()
test_predictions = []
test_references = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating Test Set"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Generate predictions
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=64)

        # Replace -100 in labels with pad_token_id before decoding
        labels = torch.where(labels != -100, labels, torch.tensor(tokenizer.pad_token_id).to(labels.device))

        # Decode predictions and references
        decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        test_predictions.extend(decoded_preds)
        test_references.extend(decoded_labels)

# Compute metrics
rouge_result = rouge.compute(predictions=test_predictions, references=test_references, rouge_types=["rougeL"])
bleu_result = bleu.compute(predictions=test_predictions, references=test_references)
bertscore_result = score(test_predictions, test_references, lang="en", verbose=False)
bertscore_avg = bertscore_result[2].mean().item()

# Print results
print("\n✅ Test Set Metrics:")
print(f"ROUGE-L   : {rouge_result['rougeL']:.4f}")
print(f"BLEU-4    : {bleu_result['bleu']:.4f}")
print(f"BERTScore : {bertscore_avg:.4f}")


Evaluating Test Set:   0%|          | 0/100 [00:00<?, ?it/s]


KeyError: 'input_ids'

# Inference

In [22]:
import pandas as pd
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import Dataset
from torch.utils.data import DataLoader
import evaluate
from bert_score import score

# Load saved model and tokenizer
model_path = "/kaggle/working/bart_model_output/final"
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained(model_path)
model.eval()

# Load test data
test_df = pd.read_csv("/kaggle/input/testdataclan/test_data_cleaned.csv")
test_df = test_df[["Social Media Post", "Normalized Claim"]].dropna()

# Tokenize test data
test_dataset = Dataset.from_pandas(test_df)

def tokenize_function(examples):
    inputs = tokenizer(examples["Social Media Post"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["Normalized Claim"], max_length=128, truncation=True, padding="max_length")
    
    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_seq]
        for label_seq in targets["input_ids"]
    ]
    
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
        "label_ids": targets["input_ids"]  # Keep original label IDs for decoding
    }

# Apply tokenization
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels", "label_ids"])

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# DataLoader for test set
test_loader = DataLoader(tokenized_test_dataset, batch_size=1)

predictions = []
references = []

# Inference loop
for batch in test_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=4,
            early_stopping=True
        )

    decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(batch["label_ids"], skip_special_tokens=True)

    predictions.extend(decoded_preds)
    references.extend(decoded_labels)

# Evaluation
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

rouge_result = rouge.compute(predictions=predictions, references=references, rouge_types=["rougeL"])
bleu_result = bleu.compute(predictions=predictions, references=references)
bertscore_result = score(predictions, references, lang="en", verbose=False)
bertscore_avg = bertscore_result[2].mean().item()

print("\nTest Set Evaluation:")
print(f"ROUGE-L : {rouge_result['rougeL']:.4f}")
print(f"BLEU-4  : {bleu_result['bleu']:.4f}")
print(f"BERTScore (F1): {bertscore_avg:.4f}")

# Optional: Save predictions to CSV
output_df = pd.DataFrame({
    "Original Post": test_df["Social Media Post"],
    "Reference Claim": references,
    "Predicted Claim": predictions
})
output_df.to_csv("/kaggle/working/bart_test_predictions.csv", index=False)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Test Set Evaluation:
ROUGE-L : 0.1981
BLEU-4  : 0.0000
BERTScore (F1): 0.8605


In [23]:
import pandas as pd
from datasets import Dataset
from torch.utils.data import DataLoader

# Load external test CSV
external_df = pd.read_csv("/kaggle/input/complete/CLAN_data_cleaned_test.csv")
external_df = external_df[["Social Media Post", "Normalized Claim"]].dropna()

# Tokenize external test set
external_dataset = Dataset.from_pandas(external_df)

def tokenize_function(examples):
    inputs = tokenizer(examples["Social Media Post"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["Normalized Claim"], max_length=128, truncation=True, padding="max_length")
    
    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_seq]
        for label_seq in targets["input_ids"]
    ]
    
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
        "label_ids": targets["input_ids"]
    }

# Tokenize and set format
external_dataset = external_dataset.map(tokenize_function, batched=True)
external_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels", "label_ids"])

external_loader = DataLoader(external_dataset, batch_size=1)

# Inference on external dataset
external_preds = []
external_refs = []

with torch.no_grad():
    for batch in tqdm(external_loader, desc="Evaluating External Test Set"):
        input_ids = batch["input_ids"].to(model.device)
        attention_mask = batch["attention_mask"].to(model.device)

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=4,
            early_stopping=True
        )

        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(batch["label_ids"], skip_special_tokens=True)

        external_preds.extend(decoded_preds)
        external_refs.extend(decoded_labels)

# Compute metrics
rouge_result = rouge.compute(predictions=external_preds, references=external_refs, rouge_types=["rougeL"])
bleu_result = bleu.compute(predictions=external_preds, references=external_refs)
bertscore_result = score(external_preds, external_refs, lang="en", verbose=False)

print("\n[External Test CSV Evaluation]")
print(f"ROUGE-L    : {rouge_result['rougeL']:.4f}")
print(f"BLEU-4     : {bleu_result['bleu']:.4f}")
print(f"BERTScore  : {bertscore_result[2].mean().item():.4f}")

# Save predictions (optional)
output_df = pd.DataFrame({
    "Original Post": external_df["Social Media Post"],
    "Reference Claim": external_refs,
    "Predicted Claim": external_preds
})
output_df.to_csv("/kaggle/working/bart_external_test_predictions.csv", index=False)


Map:   0%|          | 0/471 [00:00<?, ? examples/s]

Evaluating External Test Set: 100%|██████████| 471/471 [01:35<00:00,  4.95it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[External Test CSV Evaluation]
ROUGE-L    : 0.3694
BLEU-4     : 0.2470
BERTScore  : 0.8890
