In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import torch

if torch.cuda.is_available():
    print(f"GPU : {torch.cuda.get_device_name(0)}")
else:
    print("No GPU")

GPU : Tesla T4


In [None]:
!cp "/content/drive/MyDrive/ClaimNormalization/CLAN_data_cleaned.csv" .

### Install Depndencies

In [None]:
!pip install -q transformers datasets evaluate rouge-score sacrebleu bert-score --use-deprecated=legacy-resolver

In [None]:
import pkg_resources

required_packages = ['transformers', 'datasets', 'evaluate', 'rouge-score', 'sacrebleu', 'bert-score']
installed_packages = {pkg.key for pkg in pkg_resources.working_set}

missing_packages = [pkg for pkg in required_packages if pkg not in installed_packages]

if not missing_packages:
    print("All dependencies are installed.")
else:
    print(f"Missing dependencies: {missing_packages}")


All dependencies are installed.


### All Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

### Dataset Loading

In [None]:
df = pd.read_csv('/content/drive/MyDrive/ClaimNormalization/CLAN_data_cleaned.csv')

# Strict 70-15-15 split (seed=42 for reproducibility)
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df.reset_index(drop=True)),
    'validation': Dataset.from_pandas(val_df.reset_index(drop=True)),
    'test': Dataset.from_pandas(test_df.reset_index(drop=True))
})

### Model Initialization

In [None]:
from transformers import AutoTokenizer, BartForConditionalGeneration

model_checkpoint = "facebook/bart-large"

# Initialize tokenizer and model FIRST
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
model = BartForConditionalGeneration.from_pretrained(model_checkpoint)

# Verify initialization
print("Model architecture:", model.config.architectures[0])
print("Tokenizer vocab size:", tokenizer.vocab_size)

Model architecture: BartModel
Tokenizer vocab size: 50265


### Data Load and Modify

In [None]:
def preprocess_function(examples):
    inputs = [str(post) for post in examples["Social Media Post"]]
    targets = [str(claim) for claim in examples["Normalized Claim"]]

    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=64,
            truncation=True,
            padding="max_length"
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Create tokenized dataset with proper splits
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

# Verify dataset structure
print("Tokenized dataset keys:", tokenized_dataset.keys())

Map:   0%|          | 0/1603 [00:00<?, ? examples/s]



Map:   0%|          | 0/343 [00:00<?, ? examples/s]

Map:   0%|          | 0/344 [00:00<?, ? examples/s]

Tokenized dataset keys: dict_keys(['train', 'validation', 'test'])


###  Hyperparams Tune

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Updated from evaluation_strategy
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    weight_decay=0.01,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    report_to="none",
    logging_strategy="epoch"
)


### Evaluation Metric set

In [None]:
import evaluate
import numpy as np

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    # Clamp predictions to valid token IDs
    preds = np.clip(preds, 0, len(tokenizer)-1)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Handle empty predictions
    decoded_preds = [pred if pred.strip() else " " for pred in decoded_preds]

    # Compute metrics with error handling
    try:
        rouge_results = rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            use_stemmer=True
        )
    except:
        rouge_results = {"rougeL": 0.0}

    try:
        bleu_results = bleu.compute(
            predictions=decoded_preds,
            references=[[ref] for ref in decoded_labels]
        )
    except:
        bleu_results = {"bleu": 0.0}

    try:
        bert_results = bertscore.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            lang="en"
        )
    except:
        bert_results = {"f1": [0.0]}

    return {
        "rougeL": round(rouge_results["rougeL"], 4),
        "bleu": round(bleu_results["bleu"], 4),
        "bert_score": round(np.mean(bert_results["f1"]), 4)
    }

### Training

In [None]:
from transformers import Seq2SeqTrainer, EarlyStoppingCallback

# Update the CustomTrainer class
class CustomTrainer(Seq2SeqTrainer):
    def log(self, logs, start_time=None):
        # Add learning rate to logs
        logs["learning_rate"] = self.optimizer.param_groups[0]["lr"]
        # Call parent method with proper signature
        super().log(logs, start_time=start_time)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,  # Changed from bart_tokenizer
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

train_results = trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Rougel,Bleu,Bert Score
1,8.8564,6.053843,0.3171,0.1898,0.8714
2,5.5679,4.753925,0.3117,0.1845,0.8713
3,4.8316,4.354738,0.3134,0.1824,0.8708
4,4.3998,4.065684,0.3149,0.1868,0.8717


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


###  Evaluation

In [None]:
print("\nTraining completed. Final metrics:")
print(f"Training Loss: {train_results.metrics['train_loss']:.4f}")
print(f"Validation Loss: {train_results.metrics['eval_loss']:.4f}")
print(f"ROUGE-L: {train_results.metrics['eval_rougeL']:.4f}")
print(f"BLEU-4: {train_results.metrics['eval_bleu']:.4f}")
print(f"BERTScore: {train_results.metrics['eval_bert_score']:.4f}")


Training completed. Final metrics:
Training Loss: 5.6520


KeyError: 'eval_loss'

###  Model save

In [None]:
trainer.save_model("/content/drive/MyDrive/ClaimNormalization/BART_Large_5")
print("Model saved")

In [None]:
test_results = trainer.predict(tokenized_datasets["test"])
print("\nTest Set Metrics:")
print(f"ROUGE-L: {test_results.metrics['test_rougeL']:.4f}")
print(f"BLEU-4: {test_results.metrics['test_bleu']:.4f}")
print(f"BERTScore: {test_results.metrics['test_bert_score']:.4f}")

In [None]:
# Save the trained model to a specific path
import torch

checkpoint_path = "/content/drive/MyDrive/ClaimNormalization/BART_Large_5/checkpoint_large_epoch_5.pth"
torch.save(model.state_dict(), checkpoint_path)

print(f"Model saved at {checkpoint_path}")

###  Inference With Saved Model

In [None]:
from transformers import BartForConditionalGeneration, AutoTokenizer
import torch

# Load components
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")
model.load_state_dict(torch.load("/content/drive/MyDrive/ClaimNormalization/BART_Large_10/checkpoint_large_epoch_10.pth"))
model.eval()  # Set to evaluation mode
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")

# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)


# Check for Claims
def normalize_claim(post: str) -> str:
    inputs = tokenizer(post, return_tensors="pt", max_length=128, truncation=True).to(device)
    outputs = model.generate(
        inputs.input_ids,
        max_length=64,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


### Check with User Input

In [None]:
test_text = input("Enter Original Claim : ")
print("Normalized Claim:", normalize_claim(test_text))