<a href="https://colab.research.google.com/github/gilbiton1/gilbiton1/blob/main/Mobilebert_with_augmentation_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
!pip install transformers datasets evaluate peft nltk

from evaluate import load as load_metric
import numpy as np
import random
from nltk.corpus import wordnet
from transformers import MobileBertTokenizerFast, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
from datasets import load_dataset, concatenate_datasets
import pandas as pd
import evaluate
import torch
import nltk
from peft import LoraConfig, get_peft_model, TaskType
import json
import os
from sklearn.metrics import f1_score

# Download WordNet resources for synonym replacement
nltk.download('wordnet')
nltk.download('omw-1.4')

# Disable Weights & Biases (WandB) logging
os.environ["WANDB_DISABLED"] = "true"

# Define GLUE tasks
glue_tasks = ["qnli", "rte", "sst2", "mrpc", "stsb", "qqp", "mnli", "cola"]
results = {}
results_df = pd.DataFrame(columns=["Dataset", "Steps", "Train Loss", "Validation Loss", "F1"])

# Mapping of fine-tuned models per task
finetuned_model_map = {
    "qnli": "Alireza1044/mobilebert_QNLI",
    "mnli": "Alireza1044/mobilebert_mnli",
    "sst2": "Alireza1044/mobilebert_sst2",
    "qqp": "Alireza1044/mobilebert_QQP",
    "rte": "Alireza1044/mobilebert_RTE",
    "mrpc": "Alireza1044/mobilebert_MRPC",
    "cola": "Alireza1044/mobilebert_CoLA",
    "stsb": "Alireza1044/mobilebert_stsb"
}

def synonym_replacement(sentence):
    """Replaces words in a sentence with their synonyms."""
    words = sentence.split()
    new_sentence = []
    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            if synonym != word:
                new_sentence.append(synonym)
            else:
                new_sentence.append(word)
        else:
            new_sentence.append(word)
    return ' '.join(new_sentence)

def random_word_swap(sentence):
    """Swaps exactly 2 pairs of words in the sentence (total 4 words involved)."""
    words = sentence.split()
    if len(words) < 4:
        return sentence  # Requires at least 4 words for 2 swaps

    indices = list(range(len(words)))
    random.shuffle(indices)

    # Take 4 distinct indices, divide them into pairs
    idx1, idx2, idx3, idx4 = indices[:4]

    # First swap
    words[idx1], words[idx2] = words[idx2], words[idx1]

    # Second swap
    words[idx3], words[idx4] = words[idx4], words[idx3]

    return ' '.join(words)

def inject_typos(sentence, max_typos=2):
    """Injects up to 2 typos into the sentence by replacing characters."""
    if len(sentence) < 2:
        return sentence  # Cannot alter anything

    indices = list(range(len(sentence)))
    random.shuffle(indices)
    indices = indices[:min(max_typos, len(sentence))]

    chars = list(sentence)
    for i in indices:
        if chars[i].isalpha():
            chars[i] = random.choice('abcdefghijklmnopqrstuvwxyz')

    return ''.join(chars)

def augment_sentence(sentence):
    """Applies a random augmentation method to a sentence."""
    methods = [synonym_replacement, random_word_swap, inject_typos]
    method = random.choice(methods)
    return method(sentence)

for task_name in glue_tasks:
    print(f"Running training and evaluation for task: {task_name}")
    dataset = load_dataset("glue", task_name)

    # --- Augment 10% of training set ---
    augmentation_ratio = 0.1
    train_dataset_full = dataset["train"]

    shuffled = train_dataset_full.shuffle(seed=42)
    num_to_augment = int(len(shuffled) * augmentation_ratio)

    to_augment_10 = shuffled.select(range(num_to_augment))
    original_90 = shuffled.select(range(num_to_augment, len(shuffled)))

    # Augmentation function per task
    def augment_examples(example):
        if task_name in ["sst2", "cola"]:
            example["sentence"] = augment_sentence(example["sentence"])
        elif task_name in ["stsb", "mrpc", "rte"]:
            example["sentence1"] = augment_sentence(example["sentence1"])
            example["sentence2"] = augment_sentence(example["sentence2"])
        elif task_name == "qqp":
            example["question1"] = augment_sentence(example["question1"])
        elif task_name == "qnli":
            example["question"] = augment_sentence(example["question"])
        return example

    # Apply augmentation
    augmented_10 = to_augment_10.map(augment_examples)

    # Merge back: 90% original + 10% augmented
    augmented_train_dataset = concatenate_datasets([original_90, augmented_10])
    print(f"\n Augmentation Example for task: {task_name}")
    original_example = to_augment_10[0]
    augmented_example = augmented_10[0]

    # Display based on task
    if task_name in ["sst2", "cola"]:
        print("Original :", original_example["sentence"])
        print("Augmented:", augmented_example["sentence"])
    elif task_name in ["stsb", "mrpc", "rte"]:
        print("Original sentence1 :", original_example["sentence1"])
        print("Augmented sentence1:", augmented_example["sentence1"])
        print("Original sentence2 :", original_example["sentence2"])
        print("Augmented sentence2:", augmented_example["sentence2"])
    elif task_name == "qqp":
        print("Original question1 :", original_example["question1"])
        print("Augmented question1:", augmented_example["question1"])
        print("Original question2 :", original_example["question2"])

    # Determine number of labels dynamically
    num_labels = 1 if task_name == "stsb" else len(set(dataset["train"]["label"]))

    model_name = finetuned_model_map.get(task_name, "google/mobilebert-uncased")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )
    print(f"Using device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")

    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        target_modules=["query", "value"],
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    tokenizer = MobileBertTokenizerFast.from_pretrained('google/mobilebert-uncased')

    def tokenize_function(examples):
        """Tokenizes inputs based on the specific GLUE task format."""
        if task_name in ["mnli", "rte"]:
            return tokenizer(
                examples["premise"] if "premise" in examples else examples["sentence1"],
                examples["hypothesis"] if "hypothesis" in examples else examples["sentence2"],
                padding="longest", truncation=True, max_length=512
            )
        elif task_name in ["qqp", "mrpc"]:
            return tokenizer(
                examples["question1"] if "question1" in examples else examples["sentence1"],
                examples["question2"] if "question2" in examples else examples["sentence2"],
                padding="longest", truncation=True, max_length=512
            )
        elif task_name in ["stsb"]:
            return tokenizer(
                examples["sentence1"],
                examples["sentence2"],
                 padding="longest", truncation=True, max_length=512
            )
        elif task_name == "qnli":
            return tokenizer(
                examples["question"],
                examples["sentence"],
                padding="longest", truncation=True, max_length=512
            )
        else:  # Tasks like SST-2, CoLA, etc.
            return tokenizer(
                examples["sentence"],
                padding="longest", truncation=True, max_length=512
            )

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_train = augmented_train_dataset.map(tokenize_function, batched=True)
    train_dataset = tokenized_train
    print(f"Train dataset size: {len(train_dataset)}")

    available_splits = tokenized_datasets.keys()
    eval_split = next((split for split in ["validation", "dev", "validation_matched", "validation_mismatched", "test"] if split in available_splits), None)
    if not eval_split:
        raise ValueError(f"No valid evaluation split found for task: {task_name}")
    eval_dataset = tokenized_datasets[eval_split]

    # Dynamic calculation based on dataset size
    steps_per_epoch = len(train_dataset) // 64
    eval_steps = max(steps_per_epoch // 2, 10) // 2  # Evaluate every half epoch at least

    training_args = TrainingArguments(
        output_dir=f'drive/MyDrive/mobilebert1303/results_{task_name}',
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=eval_steps,
        eval_steps=eval_steps,
        learning_rate=2e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        num_train_epochs=2,
        weight_decay=0.01,
        save_strategy="steps",
        save_steps=eval_steps,
        save_total_limit=1,
        dataloader_num_workers=8,
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred

        if task_name == "stsb":  # Regression task
            predictions = logits.squeeze()
            metric = load_metric("glue", task_name)
            return metric.compute(predictions=predictions, references=labels)
        else:
            predictions = np.argmax(logits, axis=-1)
            metric = load_metric("glue", task_name)
            accuracy = (predictions == labels).mean()
            f1 = f1_score(labels, predictions, average='macro')  # Calculate F1
            return {"accuracy": accuracy, "f1": f1}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )

    train_history = trainer.train()
    task_results = trainer.evaluate()

    # Store results in DataFrame
    train_loss_values = []
    eval_loss_values = []
    steps = []
    f1_scores = []
    log_history = trainer.state.log_history
    df = pd.DataFrame(log_history)

    df.to_csv("drive/MyDrive/1303mobilebert/training_log_full.csv", index=False)
    for log in trainer.state.log_history:
        if "loss" in log:
            train_loss_values.append(log["loss"])
            steps.append(log["step"])
        if "eval_loss" in log:
            eval_loss_values.append(log["eval_loss"])
        if "eval_f1" in log:
            f1_scores.append(log["eval_f1"])

    # Insert new information into results_df
    task_results_entry = {
        "Dataset": task_name,
        "Steps": training_args.eval_steps,
        "Train Loss": train_loss_values[-1] if train_loss_values else None,  # Last training loss
        "Validation Loss": eval_loss_values[-1] if eval_loss_values else None,  # Last evaluation loss
        "F1": f1_scores[-1] if f1_scores else None  # Last measured F1
    }

    task_results_entry = pd.DataFrame([task_results_entry])  # Create a new DataFrame
    results_df = pd.concat([results_df, task_results_entry], ignore_index=True)

    # Save training history
    with open(f'drive/MyDrive/1303mobilebert/train_history_{task_name}.json', 'w') as f:
        json.dump(trainer.state.log_history, f)

    # Save evaluation results
    with open(f'drive/MyDrive/1303mobilebert/eval_results_{task_name}.json', 'w') as f:
        json.dump(task_results, f)
    results[task_name] = task_results

# Save DataFrame to CSV in Google Drive
results_df.to_csv("drive/MyDrive/1303mobilebert/results_summary.csv", index=False)
print("Final Results for all tasks:", results)



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Running training and evaluation for task: qnli

🔍 Augmentation Example for task: qnli
Using device: cuda
trainable params: 173,058 || all params: 24,755,972 || trainable%: 0.6991


Map:   0%|          | 0/5463 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Train dataset size: 104743


Step,Training Loss,Validation Loss,Accuracy,F1
409,0.1699,0.406753,0.905546,0.905485
818,0.1663,0.376679,0.906645,0.906596
1227,0.1699,0.405779,0.906096,0.906046
1636,0.1633,0.418275,0.906279,0.906213
2045,0.1668,0.355285,0.907011,0.906964
2454,0.1729,0.377742,0.906645,0.9066
2863,0.1672,0.362297,0.907011,0.906961
3272,0.1734,0.363528,0.907011,0.906961


Downloading builder script: 0.00B [00:00, ?B/s]

  results_df = pd.concat([results_df, task_results_entry], ignore_index=True)


Running training and evaluation for task: rte


train-00000-of-00001.parquet:   0%|          | 0.00/584k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/69.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/621k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/277 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]


🔍 Augmentation Example for task: rte
Original sentence1 : Thanks to a global ban on the ivory trade that was passed in 1989 by the Convention on International Trade in Endangered Species of Wild Fauna and Flora (CITES), the African elephant population may be reversing its spiral toward extinction
Augmented sentence1: Thanks to a global ban on the ivory trade thax was passed in 1989 by the Convention on International Trade in Endsngered Species of Wild Fauna and Flora (CITES), the African elephant population may be reversing its spiral toward extinction
Original sentence2 : The ban on ivory trade has been effective in protecting the elephant from extinction.
Augmented sentence2: The ban on ivory trvde has been effecthve in protecting the elephant from extinction.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/98.7M [00:00<?, ?B/s]

Using device: cuda
trainable params: 173,058 || all params: 24,755,972 || trainable%: 0.6991


Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Train dataset size: 2490


Step,Training Loss,Validation Loss,Accuracy,F1
9,0.2895,0.856942,0.66787,0.659669
18,0.2783,0.867423,0.66426,0.655569
27,0.3382,0.871303,0.66426,0.654738
36,0.2975,0.869444,0.66426,0.654738
45,0.2955,0.863453,0.66787,0.658867
54,0.2881,0.86212,0.66787,0.658867
63,0.3252,0.861202,0.66787,0.658867
72,0.2997,0.860383,0.66787,0.658867


Running training and evaluation for task: sst2


train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/6734 [00:00<?, ? examples/s]


🔍 Augmentation Example for task: sst2
Original : klein , charming in comedies like american pie and dead-on in election , 
Augmented: klein , charming iv comedies like american pie and gead-on in election , 


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/98.7M [00:00<?, ?B/s]

Using device: cuda
trainable params: 173,058 || all params: 24,755,972 || trainable%: 0.6991


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/98.5M [00:00<?, ?B/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Train dataset size: 67349


Step,Training Loss,Validation Loss,Accuracy,F1
263,0.098,0.411742,0.901376,0.901124
526,0.1488,0.676432,0.902523,0.902307
789,0.0956,0.615755,0.902523,0.902285
1052,0.12,0.570842,0.902523,0.902285
1315,0.1031,0.557294,0.90367,0.903446
1578,0.0933,0.521855,0.902523,0.902285
1841,0.211,0.653973,0.90367,0.903446
2104,0.0928,0.607897,0.904817,0.904606


Running training and evaluation for task: mrpc


train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/366 [00:00<?, ? examples/s]


🔍 Augmentation Example for task: mrpc
Original sentence1 : " The public is understandably losing patience with these unwanted phone calls , unwanted intrusions , " he said at a White House ceremony .
Augmented sentence1: " The public is understandably losing patience with these unwanted phone calls , unwonted intrusions , " he said at a White Houst ceremony .
Original sentence2 : " While many good people work in the telemarketing industry , the public is understandably losing patience with these unwanted phone calls , unwanted intrusions , " Mr. Bush said .
Augmented sentence2: " While many good people work in the telemarketing industry , the public is understandably losing patience with these unwanted phone calls , unwanted intrvsions , " Mr. Bush said .


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/98.7M [00:00<?, ?B/s]

Using device: cuda
trainable params: 173,058 || all params: 24,755,972 || trainable%: 0.6991


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Train dataset size: 3668


Step,Training Loss,Validation Loss,Accuracy,F1
14,0.3746,0.366798,0.835784,0.794428
28,0.3673,0.365086,0.835784,0.796642
42,0.3839,0.365898,0.835784,0.796642
56,0.4296,0.36571,0.835784,0.796642
70,0.3688,0.365158,0.838235,0.800208
84,0.382,0.364824,0.838235,0.800208
98,0.3836,0.364565,0.835784,0.796642
112,0.4299,0.364793,0.835784,0.796642


Running training and evaluation for task: stsb


train-00000-of-00001.parquet:   0%|          | 0.00/502k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/151k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/114k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Map:   0%|          | 0/574 [00:00<?, ? examples/s]


🔍 Augmentation Example for task: stsb
Original sentence1 : Five killed in China building collapse
Augmented sentence1: Five killed in China building colmapse
Original sentence2 : At least 20 killed in Syria Saturday
Augmented sentence2: astatine least twenty kill inch Syria Saturday


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/98.7M [00:00<?, ?B/s]

Using device: cuda
trainable params: 172,545 || all params: 24,754,946 || trainable%: 0.6970


model.safetensors:   0%|          | 0.00/98.5M [00:00<?, ?B/s]

Map:   0%|          | 0/5749 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

Map:   0%|          | 0/5749 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Train dataset size: 5749


Step,Training Loss,Validation Loss,Pearson,Spearmanr
22,0.2813,0.544474,0.877205,0.873386
44,0.2304,0.538225,0.877022,0.873141
66,0.4235,0.538367,0.876995,0.873097
88,0.2472,0.53901,0.876746,0.872792
110,0.2441,0.534239,0.876649,0.872714
132,0.272,0.541737,0.87663,0.872652
154,0.2351,0.540698,0.876563,0.872609
176,0.2394,0.539726,0.876547,0.87258


  results_df = pd.concat([results_df, task_results_entry], ignore_index=True)


Running training and evaluation for task: qqp


train-00000-of-00001.parquet:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/3.73M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/36.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/363846 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/40430 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/390965 [00:00<?, ? examples/s]

Map:   0%|          | 0/36384 [00:00<?, ? examples/s]


🔍 Augmentation Example for task: qqp
Original question1 : What would Rhaegar and Jon have thought of each other?
Augmented question1: What would Rhaegar and ton havu thought of each other?
Original question2 : What is the amount of torque a 2000 Jeep Cherokee can output?


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/98.7M [00:00<?, ?B/s]

Using device: cuda
trainable params: 173,058 || all params: 24,755,972 || trainable%: 0.6991


Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Train dataset size: 363846


Step,Training Loss,Validation Loss,Accuracy,F1
1421,0.2358,0.246003,0.896339,0.890586
2842,0.1969,0.24773,0.896884,0.890835
4263,0.1916,0.250657,0.896958,0.890705
5684,0.196,0.245708,0.897329,0.891119
7105,0.1903,0.248293,0.897329,0.891223
8526,0.1865,0.249781,0.897428,0.891243
9947,0.1888,0.249365,0.897106,0.890939
11368,0.1898,0.249385,0.89718,0.891014


Running training and evaluation for task: mnli


Map:   0%|          | 0/39270 [00:00<?, ? examples/s]


🔍 Augmentation Example for task: mnli
Using device: cuda
trainable params: 173,571 || all params: 24,756,998 || trainable%: 0.7011


Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Train dataset size: 392702


Step,Training Loss,Validation Loss,Accuracy,F1
1533,0.4444,0.463465,0.819969,0.818639
3066,0.4387,0.465239,0.820173,0.81882
4599,0.4405,0.466233,0.819868,0.818484


Step,Training Loss,Validation Loss,Accuracy,F1
1533,0.4444,0.463465,0.819969,0.818639
3066,0.4387,0.465239,0.820173,0.81882
4599,0.4405,0.466233,0.819868,0.818484
6132,0.4443,0.464341,0.820886,0.81962
7665,0.4435,0.464343,0.820071,0.818684
9198,0.4672,0.465118,0.820275,0.818865
10731,0.4362,0.464977,0.820479,0.81905
12264,0.4398,0.465398,0.820581,0.819129


Running training and evaluation for task: cola


Map:   0%|          | 0/855 [00:00<?, ? examples/s]


🔍 Augmentation Example for task: cola
Original : Sodium is a little too peppy for me to want to try mixing and water in a teacup.
Augmented: Sodium is a little too peppy for me to want to trr mixing and wxter in a teacup.
Using device: cuda
trainable params: 173,058 || all params: 24,755,972 || trainable%: 0.6991


Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Train dataset size: 8551


Step,Training Loss,Validation Loss,Accuracy,F1
33,0.364,0.668255,0.797699,0.708926
66,0.3776,0.679862,0.793864,0.700657
99,0.3904,0.657068,0.794823,0.701586
132,0.366,0.639465,0.794823,0.701586
165,0.3381,0.637935,0.794823,0.701586
198,0.3747,0.635682,0.793864,0.698779
231,0.3453,0.629965,0.793864,0.698779
264,0.3127,0.62937,0.793864,0.698779


Final Results for all tasks: {'qnli': {'eval_loss': 0.3635263741016388, 'eval_accuracy': 0.9070107999267801, 'eval_f1': 0.9069605179521152, 'eval_runtime': 6.6838, 'eval_samples_per_second': 817.354, 'eval_steps_per_second': 12.867, 'epoch': 2.0}, 'rte': {'eval_loss': 0.8601447939872742, 'eval_accuracy': 0.6678700361010831, 'eval_f1': 0.6588669950738917, 'eval_runtime': 1.1768, 'eval_samples_per_second': 235.379, 'eval_steps_per_second': 4.249, 'epoch': 2.0}, 'sst2': {'eval_loss': 0.6078978180885315, 'eval_accuracy': 0.9048165137614679, 'eval_f1': 0.9046056230171753, 'eval_runtime': 1.5092, 'eval_samples_per_second': 577.782, 'eval_steps_per_second': 9.276, 'epoch': 2.0}, 'mrpc': {'eval_loss': 0.3647850751876831, 'eval_accuracy': 0.8357843137254902, 'eval_f1': 0.7966419437149892, 'eval_runtime': 1.8915, 'eval_samples_per_second': 215.706, 'eval_steps_per_second': 3.701, 'epoch': 2.0}, 'stsb': {'eval_loss': 0.5397212505340576, 'eval_pearson': 0.8765463208278277, 'eval_spearmanr': 0.8725

In [None]:
####Loss PLOTS#########

import pandas as pd
import matplotlib.pyplot as plt

log_df = pd.read_csv("/content/Cleaned_Training___Evaluation_Logs_Summary.csv")

for dataset in log_df["Dataset"].unique():
    df = log_df[log_df["Dataset"] == dataset].sort_values(by="Step")

    plt.figure(figsize=(10, 5))

    train_steps = df[df["Train_Loss"].notna()]["Step"]
    train_loss = df[df["Train_Loss"].notna()]["Train_Loss"]
    eval_steps = df[df["Eval_Loss"].notna()]["Step"]
    eval_loss = df[df["Eval_Loss"].notna()]["Eval_Loss"]

    plt.plot(train_steps, train_loss, label="Train Loss", marker='o', linestyle='-')
    plt.plot(eval_steps, eval_loss, label="Eval Loss", marker='x', linestyle='--')

    plt.title(f"Loss over Steps - {dataset}")
    plt.xlabel("Step")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
###BOOLQ Regular Mobilebert##########
!pip install transformers datasets

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from tqdm import tqdm

# Load the BoolQ dataset
dataset = load_dataset("boolq", split="validation")

# Define the pre-trained model name
model_name = 'csarron/mobilebert-uncased-squad-v2'

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # Set model to evaluation mode

# Preprocessing function for tokenization
def preprocess(example):
    """Tokenizes question and passage, truncates and pads to max length."""
    return tokenizer(example["question"], example["passage"],
                       truncation=True, padding="max_length", max_length=512, return_tensors="pt")

# Evaluation loop
correct = 0
for example in tqdm(dataset):
    inputs = preprocess(example)
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**{k: v.to(model.device) for k, v in inputs.items()}) # Move inputs to model's device
    pred = torch.argmax(outputs.logits).item()  # Get predicted label
    label = int(example["answer"])  # Get true label
    correct += int(pred == label)  # Increment correct count if prediction matches label

# Calculate accuracy
accuracy = correct / len(dataset)
print(f"✅ Accuracy on BoolQ validation set: {accuracy:.4f}")

In [None]:
##########BOOLQ Mobilebert with augmentation############

!pip install transformers datasets evaluate peft nltk

import random
from nltk.corpus import wordnet
from transformers import MobileBertTokenizerFast, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset, Dataset
import evaluate
import numpy as np
import torch
import nltk
from peft import LoraConfig, get_peft_model, TaskType
import os
import json

# Basic settings
nltk.download('wordnet')
nltk.download('omw-1.4')
os.environ["WANDB_DISABLED"] = "true"  # Disable Weights & Biases logging

# Load the BOOLQ dataset
dataset = load_dataset("boolq")

# Augmentation functions
def synonym_replacement(sentence):
    """Replaces words in a sentence with their synonyms."""
    words = sentence.split()
    new_sentence = []
    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            new_sentence.append(synonym if synonym != word else word)
        else:
            new_sentence.append(word)
    return ' '.join(new_sentence)

def random_word_swap(sentence, swap_ratio=0.3):
    """Randomly swaps words in a sentence."""
    words = sentence.split()
    num_swaps = max(1, int(len(words) * swap_ratio))
    if len(words) < 2:
        return sentence
    indices = list(range(len(words)))
    random.shuffle(indices)
    for i in range(0, min(num_swaps, len(words) - 1), 2):
        idx1, idx2 = indices[i], indices[i + 1]
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

def inject_typos(sentence, prob=0.2):
    """Injects typos into a sentence."""
    chars = list(sentence)
    for i in range(len(chars)):
        if random.random() < prob:
            chars[i] = random.choice('abcdefghijklmnopqrstuvwxyz')
    return ''.join(chars)

def augment_sentence(sentence):
    """Applies a random augmentation method to a sentence."""
    methods = [synonym_replacement, random_word_swap, inject_typos]
    return random.choice(methods)(sentence)

# Convert labels to integers
def convert_labels(example):
    """Converts boolean answers to integer labels."""
    example["label"] = int(example["answer"])
    return example

# Tokenizer initialization
tokenizer = MobileBertTokenizerFast.from_pretrained("google/mobilebert-uncased")

def tokenize_function(examples):
    """Tokenizes question and passage."""
    return tokenizer(
        examples["question"],
        examples["passage"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

# Tokenization and label conversion
dataset = dataset.map(convert_labels)
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["question", "passage", "answer"])

# Augmentation of 10% of the training data
train_dataset = dataset["train"]
num_augment = int(0.1 * len(train_dataset))
augmented_questions = []
augmented_passages = []
augmented_labels = []

indices = random.sample(range(len(train_dataset)), num_augment)
for idx in indices:
    q = augment_sentence(train_dataset[idx]["question"])
    p = augment_sentence(train_dataset[idx]["passage"])
    augmented_questions.append(q)
    augmented_passages.append(p)
    augmented_labels.append(int(train_dataset[idx]["answer"]))

# Create augmented dataset
augmented_dataset = Dataset.from_dict({
    "question": augmented_questions,
    "passage": augmented_passages,
    "answer": augmented_labels
}).map(convert_labels).map(tokenize_function, batched=True, remove_columns=["question", "passage", "answer"])

# Combine original (non-augmented) and augmented datasets
new_train = train_dataset.select([i for i in range(len(train_dataset)) if i not in indices])
new_train_tokenized = new_train.map(convert_labels).map(tokenize_function, batched=True, remove_columns=["question", "passage", "answer"])
final_train_dataset = torch.utils.data.ConcatDataset([new_train_tokenized.with_format("torch"), augmented_dataset.with_format("torch")])

# Load MobileBERT model
model = AutoModelForSequenceClassification.from_pretrained(
    'csarron/mobilebert-uncased-squad-v2',
    num_labels=2
)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "value"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Evaluation metrics
accuracy_metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=preds, references=labels)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results_boolq",
    evaluation_strategy="steps",
    eval_steps=1000,
    learning_rate=5e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=1
)

# Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_train_dataset,
    eval_dataset=tokenized_datasets["validation"].with_format("torch"),
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics
)

# Training
trainer.train()

# Save model and tokenizer
model.save_pretrained("./mobilebert_boolq_lora")
tokenizer.save_pretrained("./mobilebert_boolq_lora")

# Final evaluation
results = trainer.evaluate()
with open("./results_boolq.json", "w") as f:
    json.dump(results, f)

print("Final Evaluation Results:", results)

In [None]:
###Augmentation Precetange check comparison###
!pip install transformers datasets evaluate peft nltk

import numpy as np, pandas as pd, random, torch, matplotlib.pyplot as plt
from datasets import load_dataset, concatenate_datasets
from transformers import MobileBertTokenizerFast, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model, TaskType
from evaluate import load as load_metric
from sklearn.metrics import f1_score
from nltk.corpus import wordnet
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')
tokenizer = MobileBertTokenizerFast.from_pretrained('google/mobilebert-uncased')
task_name = "rte"
dataset = load_dataset("glue", task_name)
results = {}

def synonym_replacement(sentence):
    """Replaces words in a sentence with their synonyms."""
    words = sentence.split()
    new_sentence = []
    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            if synonym != word:
                new_sentence.append(synonym)
            else:
                new_sentence.append(word)
        else:
            new_sentence.append(word)
    return ' '.join(new_sentence)

def random_word_swap(sentence):
    """Swaps exactly 2 pairs of words in the sentence (total 4 words involved)."""
    words = sentence.split()
    if len(words) < 4:
        return sentence  # Requires at least 4 words for 2 swaps

    indices = list(range(len(words)))
    random.shuffle(indices)

    # Take 4 distinct indices, divide them into pairs
    idx1, idx2, idx3, idx4 = indices[:4]

    # First swap
    words[idx1], words[idx2] = words[idx2], words[idx1]

    # Second swap
    words[idx3], words[idx4] = words[idx4], words[idx3]

    return ' '.join(words)

def inject_typos(sentence, max_typos=2):
    """Injects up to 2 typos into the sentence by replacing characters."""
    if len(sentence) < 2:
        return sentence  # Cannot alter anything

    indices = list(range(len(sentence)))
    random.shuffle(indices)
    indices = indices[:min(max_typos, len(sentence))]

    chars = list(sentence)
    for i in indices:
        if chars[i].isalpha():
            chars[i] = random.choice('abcdefghijklmnopqrstuvwxyz')

    return ''.join(chars)

def augment_sentence(sentence):
    """Applies a random augmentation method to a sentence."""
    methods = [synonym_replacement, random_word_swap, inject_typos]
    method = random.choice(methods)
    return method(sentence)

def augment_examples(example):
    """Applies augmentation to sentence1 and sentence2."""
    example["sentence1"] = augment_sentence(example["sentence1"])
    example["sentence2"] = augment_sentence(example["sentence2"])
    return example

def tokenize_function(examples):
    """Tokenizes sentence1 and sentence2."""
    return tokenizer(examples["sentence1"], examples["sentence2"],
                       truncation=True, padding="longest", max_length=512)

num_labels = 2
eval_dataset = dataset["validation"].map(tokenize_function, batched=True)

for ratio in range(10, 50, 10):
    print(f"\n🔁 Running with augmentation ratio: {ratio}%")
    aug_ratio = ratio / 100.0
    train_data = dataset["train"].shuffle(seed=42)
    n = int(len(train_data) * aug_ratio)
    augmented = train_data.select(range(n)).map(augment_examples)
    rest = train_data.select(range(n, len(train_data)))
    train_dataset = concatenate_datasets([rest, augmented]).map(tokenize_function, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained("Alireza1044/mobilebert_RTE", num_labels=num_labels)
    model = get_peft_model(model, LoraConfig(
        task_type=TaskType.SEQ_CLS, r=8, lora_alpha=32,
        lora_dropout=0.1, bias="none", target_modules=["query", "value"]
    ))

    training_args = TrainingArguments(
        output_dir=f"./results_rte_aug{ratio}",
        evaluation_strategy="steps",
        eval_steps=max(len(train_dataset) // 128, 10),
        per_device_train_batch_size=32,
        num_train_epochs=3,
        logging_steps=10,
        save_strategy="no",
        learning_rate=2e-5,
        weight_decay=0.01,
        report_to="none"
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        metric = load_metric("glue", task_name)
        return metric.compute(predictions=preds, references=labels)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics
    )

    trainer.train()
    logs = trainer.state.log_history
    steps, f1s = [], []
    for log in logs:
        if "eval_f1" in log:
            steps.append(log["step"])
            f1s.append(log["eval_f1"])
    results[ratio] = {"steps": steps, "f1s": f1s}

# 📊 Plot F1 vs Steps for each augmentation percentage
plt.figure(figsize=(10, 6))
for ratio, data in results.items():
    plt.plot(data["steps"], data["f1s"], label=f"{ratio}% Aug")
plt.xlabel("Steps")
plt.ylabel("F1 Score")
plt.title("F1 Score vs Steps for Different Augmentation Ratios (RTE)")
plt.legend()
plt.grid(True)
plt.show()