Code to Mount Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
## To disable weihgt & bias
report_to="none"

### Imports

In [19]:
!pip install transformers datasets evaluate rouge_score sacrebleu bert-score --quiet
!pip install -U transformers datasets



In [22]:
!pip install -q transformers datasets evaluate rouge-score sacrebleu bert-score
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset, DatasetDict
import evaluate


Checking for GPU

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cuda


### Create Hugging Face Dataset

In [23]:
# Load data
df = pd.read_csv("/content/drive/MyDrive/ClaimNormalization/CLAN_data_cleaned.csv")
df = df[["Social Media Post", "Normalized Claim"]].dropna()

# Create proper 70-15-15 splits
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Convert to Hugging Face dataset
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
    "validation": Dataset.from_pandas(val_df.reset_index(drop=True)),
    "test": Dataset.from_pandas(test_df.reset_index(drop=True))
})


### Model

In [24]:
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

### Dataset Preparation

In [25]:
def preprocess_function(examples):
    inputs = examples["Social Media Post"]
    targets = examples["Normalized Claim"]

    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Updated tokenization for targets
    labels = tokenizer(
        text_target=targets,
        max_length=64,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/1596 [00:00<?, ? examples/s]

Map:   0%|          | 0/342 [00:00<?, ? examples/s]

Map:   0%|          | 0/343 [00:00<?, ? examples/s]

### Evaluation Metric Define & Set for Epoch wise output

In [27]:
import evaluate
import numpy as np
from transformers import TrainerCallback

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in labels with the pad token ID
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)["rougeL"]
    bleu_score = bleu.compute(predictions=decoded_preds, references=[[ref] for ref in decoded_labels])["bleu"]
    bert_score = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")["f1"]

    return {
        "rougeL": rouge_score,
        "bleu4": bleu_score ,
        "bertscore": np.mean(bert_score)
    }



class MetricLogger(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and 'loss' in logs:
            print(f"[Epoch {int(state.epoch)}] Training Loss: {logs['loss']:.4f}")
        if logs and 'eval_loss' in logs:
            print(f"[Epoch {int(state.epoch)}] Validation Loss: {logs['eval_loss']:.4f}")
        if logs and 'eval_rougeL' in logs:
            print(f"[Epoch {int(state.epoch)}] ROUGE-L: {logs['eval_rougeL']:.4f}, BLEU-4: {logs['eval_bleu4']:.2f}, BERTScore: {logs['eval_bertscore']:.4f}")


### Training Args

In [26]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart_model_output",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=10,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    report_to="none"
)

### Training Code

In [28]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[MetricLogger()]
)

trainer.train()


Epoch,Training Loss,Validation Loss,Rougel,Bleu4,Bertscore
1,0.9932,0.973333,0.318913,17.295929,0.87429
2,0.9515,0.943216,0.320123,17.097373,0.875584
3,0.7058,0.949222,0.311752,16.106677,0.875306
4,0.6605,0.960861,0.321251,16.94216,0.876626
5,0.4896,0.971404,0.322755,17.071146,0.877248


[Epoch 0] Training Loss: 9.3768
[Epoch 0] Training Loss: 6.3778
[Epoch 0] Training Loss: 4.7921
[Epoch 0] Training Loss: 3.6324
[Epoch 0] Training Loss: 2.8710
[Epoch 0] Training Loss: 1.9688
[Epoch 0] Training Loss: 1.5644
[Epoch 0] Training Loss: 1.2564
[Epoch 0] Training Loss: 1.2072
[Epoch 0] Training Loss: 1.2265
[Epoch 0] Training Loss: 1.0601
[Epoch 0] Training Loss: 1.1334
[Epoch 0] Training Loss: 1.0672
[Epoch 0] Training Loss: 1.1197
[Epoch 0] Training Loss: 1.0319
[Epoch 0] Training Loss: 1.2004
[Epoch 0] Training Loss: 1.0826
[Epoch 0] Training Loss: 0.9198
[Epoch 0] Training Loss: 1.0807
[Epoch 1] Training Loss: 0.9932


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1] Validation Loss: 0.9733
[Epoch 1] ROUGE-L: 0.3189, BLEU-4: 17.30, BERTScore: 0.8743




[Epoch 1] Training Loss: 0.7434
[Epoch 1] Training Loss: 0.8533
[Epoch 1] Training Loss: 0.9803
[Epoch 1] Training Loss: 0.7849
[Epoch 1] Training Loss: 0.8424
[Epoch 1] Training Loss: 0.8830
[Epoch 1] Training Loss: 0.9213
[Epoch 1] Training Loss: 0.7719
[Epoch 1] Training Loss: 0.9494
[Epoch 1] Training Loss: 0.7498
[Epoch 1] Training Loss: 0.8901
[Epoch 1] Training Loss: 0.8547
[Epoch 1] Training Loss: 0.8260
[Epoch 1] Training Loss: 0.7773
[Epoch 1] Training Loss: 0.7875
[Epoch 1] Training Loss: 0.8928
[Epoch 1] Training Loss: 0.8144
[Epoch 1] Training Loss: 0.9154
[Epoch 1] Training Loss: 0.8173
[Epoch 2] Training Loss: 0.9515
[Epoch 2] Validation Loss: 0.9432
[Epoch 2] ROUGE-L: 0.3201, BLEU-4: 17.10, BERTScore: 0.8756
[Epoch 2] Training Loss: 0.6193
[Epoch 2] Training Loss: 0.6114
[Epoch 2] Training Loss: 0.7101
[Epoch 2] Training Loss: 0.6851
[Epoch 2] Training Loss: 0.6225
[Epoch 2] Training Loss: 0.6358
[Epoch 2] Training Loss: 0.6635
[Epoch 2] Training Loss: 0.6970
[Epoch 2] 

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1000, training_loss=0.9852107567787171, metrics={'train_runtime': 421.9561, 'train_samples_per_second': 18.912, 'train_steps_per_second': 2.37, 'total_flos': 608212117094400.0, 'train_loss': 0.9852107567787171, 'epoch': 5.0})

### Save Model

In [31]:
model_save_path = "/content/drive/MyDrive/ClaimNormalization/BART_Small_5/checkpoint_small_epoch_5.pth"
torch.save(model.state_dict(), model_save_path)

In [32]:
tokenizer.save_pretrained("/content/drive/MyDrive/ClaimNormalization/BART_Small_5/claim_norm_tokenizer")

('/content/drive/MyDrive/ClaimNormalization/BART_Small_5/claim_norm_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/ClaimNormalization/BART_Small_5/claim_norm_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/ClaimNormalization/BART_Small_5/claim_norm_tokenizer/vocab.json',
 '/content/drive/MyDrive/ClaimNormalization/BART_Small_5/claim_norm_tokenizer/merges.txt',
 '/content/drive/MyDrive/ClaimNormalization/BART_Small_5/claim_norm_tokenizer/added_tokens.json')

###Evaluate Test Data

In [34]:
test_results = trainer.predict(tokenized_dataset["test"])
print("\nTest Set Metrics:")
print(f"ROUGE-L: {test_results.metrics['test_rougeL']:.4f}")
print(f"BLEU-4: {test_results.metrics['test_bleu4']:.4f}")
print(f"BERTScore: {test_results.metrics['test_bertscore']:.4f}")



Test Set Metrics:
ROUGE-L: 0.3280
BLEU-4: 17.2328
BERTScore: 0.8778


###Inference Test

In [36]:
from transformers import BartForConditionalGeneration, AutoTokenizer
import torch

def generate_claim(user_input: str) -> str:
    # Define paths
    model_path = "/content/drive/MyDrive/ClaimNormalization/BART_Small_5/checkpoint_small_epoch_5.pth"
    tokenizer_path = "/content/drive/MyDrive/ClaimNormalization/BART_Small_5/claim_norm_tokenizer"

    # 1. Load tokenizer from saved directory
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # 2. Initialize model architecture then load weights
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
    model.load_state_dict(torch.load(model_path))

    # 3. Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    # 4. Tokenize with loaded tokenizer
    inputs = tokenizer(
        user_input,
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    ).to(device)

    # 5. Generate claim
    outputs = model.generate(
        inputs.input_ids,
        max_length=64,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)



In [38]:
# Usage
test_post = "Did you know 🤯 that eating garlic 🧄 every day will make you IMMUNE to ALL viruses 🦠?! Try it NOW! 🔥🔥🔥"
print("Generated Claim:", generate_claim(test_post))

Generated Claim: eating garlic every day will make you IMMUNE to all viruses
