Code to Mount Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
## To disable weihgt & bias
report_to="none"

### Imports

In [19]:
!pip install transformers datasets evaluate rouge_score sacrebleu bert-score --quiet
!pip install -U transformers datasets



In [3]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
import evaluate
from transformers import TrainerCallback
import random

Checking for GPU

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cuda


Load Dataset

In [5]:
df = pd.read_csv("/content/drive/MyDrive/ClaimNormalization/CLAN_data_cleaned.csv").dropna()
df = df.dropna().drop_duplicates().reset_index(drop=True)
df = df[["Social Media Post", "Normalized Claim"]]

train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42)

### Create Hugging Face Dataset

In [6]:
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
    "test": Dataset.from_pandas(test_df)
})


### Model

In [7]:
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Tokenizer

In [8]:
def preprocess(examples):
    model_inputs = tokenizer(
        examples["Social Media Post"],
        max_length=128,
        padding="max_length",
        truncation=True
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["Normalized Claim"],
            max_length=64,
            padding="max_length",
            truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/1596 [00:00<?, ? examples/s]



Map:   0%|          | 0/342 [00:00<?, ? examples/s]

Map:   0%|          | 0/343 [00:00<?, ? examples/s]

### Evaluation Metric Define & Set for Epoch wise output

In [13]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_score = rouge.compute(predictions=pred_str, references=label_str)["rougeL"]
    bleu_score = bleu.compute(predictions=pred_str, references=label_str)["bleu"]
    bert_score = bertscore.compute(predictions=pred_str, references=label_str, lang="en")["f1"]

    return {
        "rougeL": rouge_score,
        "bleu4": bleu_score * 100,
        "bertscore": np.mean(bert_score)
    }

class MetricLogger(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and 'loss' in logs:
            print(f"[Epoch {int(state.epoch)}] Training Loss: {logs['loss']:.4f}")
        if logs and 'eval_loss' in logs:
            print(f"[Epoch {int(state.epoch)}] Validation Loss: {logs['eval_loss']:.4f}")
        if logs and 'eval_rougeL' in logs:
            print(f"[Epoch {int(state.epoch)}] ROUGE-L: {logs['eval_rougeL']:.4f}, BLEU-4: {logs['eval_bleu4']:.2f}, BERTScore: {logs['eval_bertscore']:.4f}")


### Training Args

In [21]:
training_args = TrainingArguments(
    output_dir="./bart_model_output",
    do_train=True,
    do_eval=True,
    logging_dir="./logs",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=10
)


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

### Training Code

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


trainer.train()
trainer.evaluate()

NameError: name 'train_dataset' is not defined

### Save Model

In [None]:
model_path = "best_bart_claim_model.pth"
torch.save(model.state_dict(), model_path)
tokenizer.save_pretrained("./best_tokenizer")

###Evaluate Test Data

In [None]:
rouge_final = rouge.compute(predictions=test_preds, references=test_refs)["rougeL"]
bleu_final = bleu.compute(predictions=test_preds, references=test_refs)["bleu"]
bert_final = bertscore.compute(predictions=test_preds, references=test_refs, lang="en")["f1"]

print(f"ROUGE-L: {rouge_final:.4f}")
print(f"BLEU-4 : {bleu_final * 100:.2f}")
print(f"BERTScore: {np.mean(bert_final):.4f}")

###Inference Test

In [None]:
def generate_claims(posts):
    model.eval()
    inputs = tokenizer(posts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=64, num_beams=4, early_stopping=True)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

test_inputs = dataset["test"]["post"]
test_refs = dataset["test"]["normalized_claim"]
test_preds = generate_claims(test_inputs)

In [11]:
import transformers
print(transformers.__version__)


4.51.0
