In [None]:
# Install the required libraries
!pip install -Uq adapters
!pip install -q datasets
!pip install -q accelerate
!pip install pyarrow==8.0.0
!pip install wandb

In [None]:
# Here I imported required libraries
import pandas as pd
from datasets import Dataset
from adapters import AutoAdapterModel
from transformers import AutoTokenizer, TrainingArguments, EvalPrediction
from transformers import TrainingArguments, EarlyStoppingCallback, TrainerCallback
from adapters import AdapterTrainer
import torch
import numpy as np
from adapters.composition import Fuse
from sklearn.model_selection import train_test_split
import wandb


In [None]:
# Read the Chungli Ao training sheet and the Mizo sentiment sheet from Excel.
df1, df2 = (
    pd.read_excel(excelPath)
    for excelPath in (
        "/content/Chungli_ao_Train.xlsx",
        "/content/mizo_sentiment_dataset.xlsx",
    )
)

In [None]:
# Stack both corpora into a single frame. ignore_index=True renumbers the rows
# 0..n-1; without it each source keeps its own 0-based labels, so the combined
# frame carries duplicated index values.
df = pd.concat([df1, df2], ignore_index=True)

In [None]:
# The test split comes from its own Excel sheet; the combined frame is used for training.
testDf = pd.read_excel('/content/Chungli_Ao_test.xlsx')
trainDf = df

In [None]:
# Remove a leftover __index_level_0__ column in place; errors="ignore" makes this
# a no-op when the column is absent.
trainDf.drop(columns="__index_level_0__", errors="ignore", inplace=True)

In [None]:
# Encode the string sentiment labels as integer class ids.
labelMapping = {"POSITIVE": 1, "NEGATIVE": 0}


def _encodeSentiment(frame, splitName):
    """Return frame["Sentiment"] as integer class ids, failing loudly on unknown labels.

    Series.map() replaces every value that is not a key of labelMapping with NaN,
    so a misspelled, differently-cased or missing label would otherwise slip
    through silently as a NaN target.

    Args:
        frame: DataFrame with a "Sentiment" column.
        splitName: Human-readable split name used in the error message.

    Returns:
        Integer Series aligned with frame's index.

    Raises:
        ValueError: if any value is neither a key nor a value of labelMapping.
    """
    sentiments = frame["Sentiment"]
    # Already-encoded ids are passed through so that re-running this cell does
    # not map the integers a second time and wipe every label to NaN.
    if sentiments.isin(list(labelMapping.values())).all():
        return sentiments.astype(int)
    encoded = sentiments.map(labelMapping)
    unmapped = sentiments[encoded.isna()]
    if not unmapped.empty:
        raise ValueError(
            f"{splitName}: {len(unmapped)} row(s) have a Sentiment outside "
            f"{sorted(labelMapping)}: {unmapped.unique().tolist()[:10]}"
        )
    return encoded.astype(int)


trainDf["Sentiment"] = _encodeSentiment(trainDf, "train")
testDf["Sentiment"] = _encodeSentiment(testDf, "test")

In [None]:
# Hold out 20% of the training pool as a validation split (seeded for repeatability).
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42
trainDf, valDf = train_test_split(
    trainDf,
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Convert the pandas frames into Hugging Face datasets.
# preserve_index=False stops from_pandas from storing the DataFrame index as an
# extra "__index_level_0__" column. The train/validation frames have a shuffled
# index after train_test_split, so that column would otherwise be added here,
# after the point where the notebook tries to drop it from the DataFrame.
trainDataset = Dataset.from_pandas(trainDf, preserve_index=False)
valDataset = Dataset.from_pandas(valDf, preserve_index=False)
testDataset = Dataset.from_pandas(testDf, preserve_index=False)

In [None]:
# Tokenizer and base model both come from the same MizBERT checkpoint
# (used here instead of a Chungli Ao BERT).
baseCheckpoint = "robzchhangte/MizBERT"
tokenizer = AutoTokenizer.from_pretrained(baseCheckpoint)
model = AutoAdapterModel.from_pretrained(baseCheckpoint)

In [None]:
# Load the two pre-trained adapters (Chungli Ao first, then Mizo) under local names.
chungliaoAdapterRepo = "tona3738/my-chungliao-adapter"
mizoAdapterRepo = "tona3738/my-mizo-adapter"
model.load_adapter(chungliaoAdapterRepo, load_as="myChungliaoAdapter", set_active=True)
model.load_adapter(mizoAdapterRepo, load_as="myMizoAdapter", set_active=True)

In [None]:
# AdapterFusion setup: add a fusion layer that combines the two loaded adapters.
adapterSetup = Fuse("myChungliaoAdapter", "myMizoAdapter")
model.add_adapter_fusion(adapterSetup)
# train_adapter_fusion() is the call that puts the model into fusion-training
# mode: it activates the setup and makes the fusion layer trainable while the
# fused adapters stay frozen. train_adapter() is meant for training the adapters
# themselves, so using it here would fine-tune the two source adapters instead
# of learning how to combine them.
model.train_adapter_fusion(adapterSetup)

In [None]:
# Attach a prediction head for the target task (binary sentiment classification).
numLabels = 2
sentimentId2Label = {0: "NEGATIVE", 1: "POSITIVE"}
model.add_classification_head(
    "mizoSentimentTask", num_labels=numLabels, id2label=sentimentId2Label
)

# Here, I built a function to encode the datasets
def encodeBatch(batch):
    """Tokenize a batch of examples and attach their labels.

    Reads the "Text" column, tokenizes it with the module-level `tokenizer`
    (truncated and padded to 180 tokens), and copies the "Sentiment" column
    into a "labels" entry of the returned encoding.
    """
    tokenizerOptions = {
        "max_length": 180,
        "truncation": True,
        "padding": "max_length",
    }
    encodedBatch = tokenizer(batch["Text"], **tokenizerOptions)
    encodedBatch["labels"] = batch["Sentiment"]
    return encodedBatch

In [None]:
# Tokenize every split in batched mode with encodeBatch.
trainDataset, valDataset, testDataset = (
    datasetSplit.map(encodeBatch, batched=True)
    for datasetSplit in (trainDataset, valDataset, testDataset)
)

In [None]:
# Expose only the model inputs and labels, as PyTorch tensors, on every split.
torchColumns = ["input_ids", "attention_mask", "labels"]
for datasetSplit in (trainDataset, valDataset, testDataset):
    datasetSplit.set_format(type="torch", columns=torchColumns)

In [None]:
# Here, I define the evaluation metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing `predictions` (per-class scores; the argmax over
            the last axis is taken as the predicted class) and `label_ids`
            (the reference class ids).

    Returns:
        dict with the keys 'Accuracy', 'F1', 'Precision' and 'Recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # With one label per example, micro-averaged precision, recall and F1 are all
    # equal to accuracy, so the report carried the same number four times.
    # Macro averaging scores each class separately and then averages, which also
    # exposes a model that neglects one class. zero_division=0 keeps a class that
    # is never predicted from emitting warnings.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'Accuracy': acc, 'F1': f1, 'Precision': precision, 'Recall': recall}

In [None]:
# Training configuration for the fusion fine-tuning run.
# Evaluation and checkpointing share one interval so that every evaluated step
# also has a checkpoint that load_best_model_at_end can restore.
evalEverySteps = 10

trainingArgs = TrainingArguments(
    learning_rate=0.0001,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=10,
    output_dir="./training_output",
    logging_dir='./logs',
    overwrite_output_dir=True,
    remove_unused_columns=False,
    # Mixed precision needs a CUDA device; a hard-coded True makes this call fail
    # on a CPU-only runtime, so fall back to full precision there.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    eval_steps=evalEverySteps,
    # With the default save interval (500 steps) only every 50th evaluation had a
    # checkpoint, and a short run could finish without any, leaving nothing for
    # load_best_model_at_end to restore.
    save_strategy="steps",
    save_steps=evalEverySteps,
    save_total_limit=2,  # keep disk usage bounded; the best checkpoint is retained
    load_best_model_at_end=True,  # Load the best model
    report_to='wandb',
    # NOTE(review): 500 warmup steps may exceed the total number of optimizer
    # steps for a small dataset and 3 epochs - confirm against the dataset size.
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.001,
)

# Stop early once 10 consecutive evaluations bring no improvement.
earlyStopping = EarlyStoppingCallback(early_stopping_patience=10)

# Build the adapter-aware trainer; the validation split is scored during training.
trainer = AdapterTrainer(
    model=model,
    args=trainingArgs,
    train_dataset=trainDataset,
    eval_dataset=valDataset,
    compute_metrics=compute_metrics,
    callbacks=[earlyStopping],
)

# Run the training loop.
trainer.train()

In [None]:
# Score the trained model on the validation split (the trainer's own eval dataset)
# and print the resulting metric dictionary.
metrics = trainer.evaluate(eval_dataset=valDataset)
print(metrics)

In [None]:
# Final check: score the trained model on the held-out test split and print the metrics.
testEvalResult = trainer.evaluate(testDataset)
print(testEvalResult)