Source code: https://github.com/Adapter-Hub/adapters/tree/main/notebooks

In [None]:
!pip install -Uq adapters
!pip install -q datasets
!pip install -q accelerate
!pip install --upgrade pip setuptools wheel
!pip install pyarrow



In [None]:
import pandas as pd
from datasets import Dataset
from adapters import AutoAdapterModel
from transformers import AutoTokenizer, TrainingArguments, EvalPrediction
from adapters import AdapterTrainer
import torch
import numpy as np
from adapters.composition import Fuse
from sklearn.model_selection import train_test_split

In [None]:
# Read the labelled training data from the Excel workbook at this absolute path.
# NOTE(review): the path is hard-coded; the file must exist at this location before the cell runs.
trainDf = pd.read_excel('/content/train_set(2).xlsx')

In [None]:
# Read the held-out test data from the Excel workbook at this absolute path.
# NOTE(review): the path is hard-coded; the file must exist at this location before the cell runs.
testDf = pd.read_excel('/content/test_set(1).xlsx')

In [None]:
# Integer class id assigned to each sentiment tag found in the "Label" column.
labelMapping = {
    "POS": 1,
    "NEG": 0,
    "NTL": 2,
}

In [None]:
# Convert the string sentiment labels of the train and test frames to integer ids.
# Series.map turns every value missing from labelMapping into NaN -- this includes
# the already-converted integers if the cell is run a second time -- so the mapped
# column is validated first and the frame is only overwritten when it is complete.
for _splitName, _frame in (("train", trainDf), ("test", testDf)):
    _encodedLabels = _frame["Label"].map(labelMapping)
    _unmapped = _encodedLabels.isna()
    if _unmapped.any():
        raise ValueError(
            f"{_splitName} set has {int(_unmapped.sum())} label(s) outside "
            f"{sorted(labelMapping)}: {_frame.loc[_unmapped, 'Label'].unique()[:10].tolist()}"
        )
    # Same frame object as trainDf / testDf, so this updates them in place.
    _frame["Label"] = _encodedLabels.astype("int64")

In [None]:
# Hold out 20% of the training rows as a validation set; the fixed random_state
# makes the shuffle repeatable.
# NOTE: trainDf is both the input and an output of this call, so running the cell
# again splits the already-reduced training frame a second time.
trainDf, valDf = train_test_split(
    trainDf,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap each pandas frame in a Hugging Face Dataset.
# preserve_index=False stops the shuffled pandas index left behind by the
# train/validation split from being stored as an extra "__index_level_0__" column.
trainDataset = Dataset.from_pandas(trainDf, preserve_index=False)
valDataset = Dataset.from_pandas(valDf, preserve_index=False)
testDataset = Dataset.from_pandas(testDf, preserve_index=False)

In [None]:
# The tokenizer and the adapter-enabled model are both created from the same
# multilingual BERT checkpoint name.
baseCheckpoint = "bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(baseCheckpoint)
model = AutoAdapterModel.from_pretrained(baseCheckpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the three source adapters (Telugu-translated, English-translated and
# code-mixed) and register each one under a short local name that the fusion
# setup refers to later.
model.load_adapter(
    "Blue7Bird/my-Telugu_Translated-mBERT-adapter3",
    load_as="myTeluguAdapter",
    set_active=True,
)
model.load_adapter(
    "Blue7Bird/my-English_Translated_-mBERT-adapter2",
    load_as="myEnglishAdapter",
    set_active=True,
)
model.load_adapter(
    "Blue7Bird/my-codemix_mBERT-adapter2",
    load_as="myCodemixAdapter",
    set_active=True,
)

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

'myCodemixAdapter'

In [None]:
# Build a Fuse composition over the three loaded adapters (order kept as
# English, code-mix, Telugu), add a fusion layer for it, and select that
# composition as the one to train.
fusedAdapterNames = ["myEnglishAdapter", "myCodemixAdapter", "myTeluguAdapter"]
adapterSetup = Fuse(*fusedAdapterNames)

model.add_adapter_fusion(adapterSetup)
model.train_adapter_fusion(adapterSetup)

In [None]:
# Attach the sentiment classification head, one output per class in labelMapping.
# id2label must map integer class id -> label name, i.e. the inverse of
# labelMapping (which maps label name -> id); passing labelMapping itself would
# store the label metadata the wrong way round.
numLabels = len(labelMapping)
model.add_classification_head(
    "Telugu_codemix_SentimentTask",
    num_labels=numLabels,
    id2label={labelId: labelName for labelName, labelId in labelMapping.items()},
)

In [None]:
# Helper used with Dataset.map to tokenize a batch and attach its labels.
def encodeBatch(batch):
    """Tokenize the ``Text`` column of a batch and copy ``Label`` into ``labels``.

    Each text is truncated or padded to a fixed length of 180 tokens
    (``truncation=True`` with ``padding="max_length"``).

    Args:
        batch: mapping with a ``"Text"`` entry (inputs for the tokenizer) and a
            ``"Label"`` entry (class ids for the same rows).

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    tokenizerOptions = dict(max_length=180, truncation=True, padding="max_length")
    encoded = tokenizer(batch["Text"], **tokenizerOptions)
    encoded["labels"] = batch["Label"]
    return encoded


In [None]:
# Run the batched encoder over the train, validation and test splits and rebind
# each name to its tokenized dataset.
trainDataset, valDataset, testDataset = [
    split.map(encodeBatch, batched=True)
    for split in (trainDataset, valDataset, testDataset)
]

Map:   0%|          | 0/13494 [00:00<?, ? examples/s]

Map:   0%|          | 0/3374 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# On every split, expose only the model inputs and the labels, returned as
# PyTorch tensors.
for _split in (trainDataset, valDataset, testDataset):
    _split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# Here, I Define metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (reference class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the original keys ``Accuracy``, ``F1``, ``Precision`` and
        ``Recall`` (micro-averaged, unchanged) plus ``MacroF1``,
        ``MacroPrecision`` and ``MacroRecall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Micro-averaged scores are kept for backward compatibility. For single-label
    # multiclass data they all coincide with accuracy, so they add no information.
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)

    # Macro averaging weights every class equally and therefore exposes classes the
    # model handles badly. zero_division=0 scores a never-predicted class as 0
    # instead of emitting a warning.
    macroPrecision, macroRecall, macroF1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )

    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall,
        'MacroF1': macroF1,
        'MacroPrecision': macroPrecision,
        'MacroRecall': macroRecall,
    }


In [None]:
import os

# Set the WANDB_DISABLED environment variable to "true" before the trainer is built.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
from transformers import TrainingArguments, EarlyStoppingCallback, TrainerCallback

In [None]:
# Training configuration, grouped by concern. All values are unchanged.
# NOTE(review): the run log reports that WANDB_DISABLED is deprecated in favour of
# the report_to option; consider setting report_to explicitly here.
trainingArgs = TrainingArguments(
    # --- output locations ---
    output_dir="./training_output",
    overwrite_output_dir=True,
    logging_dir='./logs',
    logging_steps=10,
    # --- optimisation ---
    learning_rate=1e-4,
    weight_decay=1e-3,
    warmup_steps=500,
    num_train_epochs=20,
    # --- batching / precision ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    fp16=True,
    # --- evaluation and model selection ---
    evaluation_strategy="steps",
    eval_steps=10,
    load_best_model_at_end=True,
    # --- data handling ---
    remove_unused_columns=False,
)

# Build the adapter trainer: it trains on trainDataset, evaluates on valDataset
# during training, and stops early after 100 evaluations without improvement.
earlyStopping = EarlyStoppingCallback(early_stopping_patience=100)

trainer = AdapterTrainer(
    model=model,
    args=trainingArgs,
    train_dataset=trainDataset,
    eval_dataset=valDataset,
    compute_metrics=compute_metrics,
    callbacks=[earlyStopping],
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Run training; left as the last expression so the returned TrainOutput is displayed.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
10,2.2283,1.108383,0.268228,0.268228,0.268228,0.268228
20,2.2108,1.092199,0.334618,0.334618,0.334618,0.334618
30,2.1718,1.064167,0.4508,0.4508,0.4508,0.4508
40,2.0978,1.022757,0.536752,0.536752,0.536752,0.536752
50,2.0016,0.967375,0.548607,0.548607,0.548607,0.548607
60,1.8981,0.903395,0.557499,0.557499,0.557499,0.557499
70,1.8129,0.798583,0.682276,0.682276,0.682276,0.682276
80,1.6301,0.678283,0.717842,0.717842,0.717842,0.717842
90,1.4183,0.586804,0.729994,0.729994,0.729994,0.729994
100,1.182,0.519476,0.774748,0.774748,0.774748,0.774748


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=1670, training_loss=0.9190535416860066, metrics={'train_runtime': 1871.799, 'train_samples_per_second': 144.182, 'train_steps_per_second': 4.509, 'total_flos': 6498863983760400.0, 'train_loss': 0.9190535416860066, 'epoch': 3.957345971563981})

In [None]:
# Make the fusion composition the active adapter setup before the evaluation cells run.
model.set_active_adapters(adapterSetup)

In [None]:
# Evaluate on the trainer's default eval_dataset (the validation split) and print the metrics dict.
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.4131855070590973, 'eval_Accuracy': 0.8236514522821576, 'eval_F1': 0.8236514522821576, 'eval_Precision': 0.8236514522821576, 'eval_Recall': 0.8236514522821576, 'eval_runtime': 9.539, 'eval_samples_per_second': 353.705, 'eval_steps_per_second': 22.12, 'epoch': 3.957345971563981}


In [None]:
# Evaluate on the held-out test split by passing it explicitly, then print the metrics dict.
testEvalResult = trainer.evaluate(eval_dataset=testDataset)
print(testEvalResult)

{'eval_loss': 0.586686909198761, 'eval_Accuracy': 0.76, 'eval_F1': 0.76, 'eval_Precision': 0.76, 'eval_Recall': 0.76, 'eval_runtime': 8.502, 'eval_samples_per_second': 352.857, 'eval_steps_per_second': 22.112, 'epoch': 3.957345971563981}
