Code to Mount Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
!cp "/content/sample_data/CLAN_data_cleaned.csv" .


Imports

In [6]:
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install sacrebleu
!pip install bert-score
!pip install transformers torch



In [25]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
from transformers import BartForConditionalGeneration, BartTokenizer
from evaluate import load
from transformers import AutoModelForSeq2SeqLM
from torch.optim import AdamW
from transformers import get_scheduler
import torch.nn as nn

Checking for GPU

In [13]:
print(torch.cuda.is_available())  # Should return True

True


Load Dataset

In [18]:
import pandas as pd
from datasets import Dataset

# Load cleaned data
df = pd.read_csv("/content/sample_data/CLAN_data_cleaned.csv")

# Rename columns for easier processing
df = df.rename(columns={"Social Media Post": "text", "Normalized Claim": "claim"})

# Drop NaN values if any
df = df.dropna().reset_index(drop=True)

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['PID', 'text', 'claim'],
    num_rows: 2281
})

Tokenization using BERT & T5

In [19]:
# Load tokenizers
bart_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
t5_tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Define tokenization function
def tokenize_data(examples):
    return {
        "input_ids": bart_tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)["input_ids"],
        "labels": bart_tokenizer(examples["claim"], padding="max_length", truncation=True, max_length=128)["input_ids"]
    }

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize_data, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/2281 [00:00<?, ? examples/s]

Train Test Split

In [20]:
train_test_split = tokenized_dataset.train_test_split(test_size=0.15)
train_data = train_test_split["train"]
val_data = train_test_split["test"]


Prepare Pytorch Dataloader

In [21]:
# Define function to format dataset
def collate_fn(batch):
    input_ids = torch.tensor([item["input_ids"] for item in batch])
    labels = torch.tensor([item["labels"] for item in batch])
    return {"input_ids": input_ids, "labels": labels}

# Create Dataloaders
batch_size = 8
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


Load Pretrained BERT

In [22]:
# Load Models
bart_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large").cuda()
t5_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base").cuda()


pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Optimizer & Scheduler

In [23]:

# Define Optimizer
optimizer = AdamW(bart_model.parameters(), lr=5e-5)

# Learning Rate Scheduler
num_training_steps = len(train_dataloader) * 5  # 5 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


Training Loop with **BART**

In [None]:
# Define Loss Function
loss_fn = nn.CrossEntropyLoss()

# Training Loop
num_epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for epoch in range(num_epochs):
    bart_model.train()
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")

    for batch in progress_bar:
        input_ids, labels = batch["input_ids"].to(device), batch["labels"].to(device)

        # Forward pass
        outputs = bart_model(input_ids=input_ids, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({"Loss": loss.item()})

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} Loss: {avg_loss}")


Epoch 1: 100%|██████████| 243/243 [03:56<00:00,  1.03it/s, Loss=1.15]


Epoch 1 Loss: 2.438393953896354


Epoch 2: 100%|██████████| 243/243 [03:58<00:00,  1.02it/s, Loss=0.562]


Epoch 2 Loss: 0.7646152332977012


Epoch 3: 100%|██████████| 243/243 [03:57<00:00,  1.02it/s, Loss=0.416]


Epoch 3 Loss: 0.6795876665125168


Epoch 4: 100%|██████████| 243/243 [03:58<00:00,  1.02it/s, Loss=0.382]


Epoch 4 Loss: 0.6110214032999282


Epoch 5: 100%|██████████| 243/243 [03:58<00:00,  1.02it/s, Loss=0.667]

Epoch 5 Loss: 0.5694837722268125





Evaluate The Model **BART**

In [None]:
# Load Metrics
rouge = load("rouge")
bleu = load("bleu")
bertscore = load("bertscore")

# Evaluation Function
def evaluate(model, dataloader):
    model.eval()
    predictions, references = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            outputs = model.generate(input_ids)
            decoded_preds = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True)
            decoded_labels = bart_tokenizer.batch_decode(labels, skip_special_tokens=True)

            predictions.extend(decoded_preds)
            references.extend(decoded_labels)

    # Compute Metrics
    rouge_scores = rouge.compute(predictions=predictions, references=references)
    bleu_scores = bleu.compute(predictions=predictions, references=references)
    bert_scores = bertscore.compute(predictions=predictions, references=references, lang="en")

    return rouge_scores, bleu_scores, bert_scores

# Run Evaluation
rouge_scores, bleu_scores, bert_scores = evaluate(bart_model, val_dataloader)

# Print Scores
print("ROUGE-L:", rouge_scores["rougeL"])
print("BLEU-4:", bleu_scores["bleu"])
print("BERTScore:", sum(bert_scores["f1"]) / len(bert_scores["f1"]))


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Evaluating: 100%|██████████| 43/43 [00:27<00:00,  1.58it/s]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ROUGE-L: 0.30995344585214
BLEU-4: 0.16790239521079575
BERTScore: 0.8711567272597778




Save **BART** Model

In [None]:
bart_model.save_pretrained("/content/ClaimNormalization_bart_model")
bart_tokenizer.save_pretrained("/content/ClaimNormalization_bart_model")

('/content/ClaimNormalization_bart_model/tokenizer_config.json',
 '/content/ClaimNormalization_bart_model/special_tokens_map.json',
 '/content/ClaimNormalization_bart_model/vocab.json',
 '/content/ClaimNormalization_bart_model/merges.txt',
 '/content/ClaimNormalization_bart_model/added_tokens.json',
 '/content/ClaimNormalization_bart_model/tokenizer.json')

Save the Model Checkpoint

In [None]:
checkpoint_path = "/content/drive/MyDrive/ClaimNormalization_bart_model/checkpoint_epoch_5.pth"
torch.save(bart_model.state_dict(), checkpoint_path)


Check the Model Output

In [9]:
# Define Paths to Model and Checkpoint
checkpoint_path = "/content/drive/MyDrive/ClaimNormalization_bart_model/checkpoint_epoch_5.pth"
model_path = "/content/drive/MyDrive/ClaimNormalization_bart_model"

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load Tokenizer and Model
bart_tokenizer = BartTokenizer.from_pretrained(model_path)
bart_model = BartForConditionalGeneration.from_pretrained(model_path).to(device)

# Load Saved Checkpoint Weights
bart_model.load_state_dict(torch.load(checkpoint_path, map_location=device))
bart_model.eval()  # Set model to evaluation mode

# Define Claim Normalization Function
def normalize_claim(text):
    input_ids = bart_tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128).input_ids.to(device)
    output_ids = bart_model.generate(input_ids)
    return bart_tokenizer.decode(output_ids[0], skip_special_tokens=True)


Using device: cuda


In [12]:
# Test Inference on a Noisy Claim
# test_text = "BREAKING:!!! Prez B!den's enorgy plAn W1LL baN ACs 4 seniors!!! 😱 #NoMoreCooling"
test_text = input("Enter Original Claim : ")
print("Normalized Claim:", normalize_claim(test_text))


Enter Original Claim : BREAKING🚨: Scientists PROVE EARTH IS FLAT!!! 🌎🤯 #WakeUp
Normalized Claim: BREAKING🚨: Scientists PROVE EARTH is FLAT!!!
