# Import packages

In [1]:
import argparse
import os

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
)

from evaluate import load
from torch.utils.data import DataLoader, Dataset
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from tqdm import tqdm

# Read files

In [2]:
import pandas as pd
#load train data

def get_file(path):
    cols = ['id', 'text', 'label', 'intensity']
    anger = pd.read_csv(path + 'anger.txt', header=None, sep='\t', names=cols, index_col=0)
    fear = pd.read_csv(path + 'fear.txt', header=None, sep='\t', names=cols, index_col=0)
    sad = pd.read_csv(path + 'sadness.txt', header=None, sep='\t', names=cols, index_col=0)
    joy = pd.read_csv(path + 'joy.txt', header=None, sep='\t', names=cols, index_col=0)
    # Combine the DataFrames
    combined_df = pd.concat([anger, fear, sad, joy])
    # Shuffle the combined DataFrame
    shuffled = combined_df.sample(frac=1).reset_index(drop=True)
    return shuffled

train=get_file("train/")
val=get_file("Dev/")

In [3]:
train

Unnamed: 0,text,label,intensity
0,@lebara - worst possible decision I could have...,sadness,0.479
1,Has anyone noticed that @npr stories in recent...,sadness,0.500
2,@___margs @juliana_f_reyes Hey Michael! Thanks...,fear,0.250
3,"Trying to loveee somebody, just wanna love som...",joy,0.354
4,#picoftheday : How...why... Really... !!\n ...,fear,0.362
...,...,...,...
3608,Transitioning to a new job is hard when you ha...,joy,0.521
3609,@ily_geuly call me now I'm laying in my bed mo...,sadness,0.740
3610,All the 'juniors' are now wearing purple at ol...,sadness,0.646
3611,Wearing all black tomorrow as I continue to mo...,sadness,0.779


# Prepare the dataset

In [4]:
class EmotionDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.dataframe = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sentence = self.dataframe.iloc[idx]['text']
        label = self.dataframe.iloc[idx]['label']

        # Tokenize the sentence
        inputs = self.tokenizer(sentence, truncation=True, padding='max_length', max_length=512, return_tensors="pt")
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}

        # Convert label to a numeric format if necessary
        label_to_id = {'joy': 0, 'anger': 1, 'fear': 2, 'sadness': 3}
        label_id = label_to_id[label]

        inputs['labels'] = torch.tensor(label_id, dtype=torch.long)

        return inputs


# Define the prefix tuning configurations

In [5]:
batch_size = 16
model_name_or_path = "cardiffnlp/twitter-roberta-base-emotion"
peft_type = PeftType.PREFIX_TUNING
device = "cuda"
num_epochs = 4

peft_config = PrefixTuningConfig(task_type="SEQ_CLS", num_virtual_tokens=10)
lr = 0.008





# Define the pre-trained LLM

In [6]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,padding_side='left')

# Create the dataset
train_dataset = EmotionDataset(train, tokenizer)
val_dataset = EmotionDataset(val, tokenizer)

# DataLoader
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
label_to_id = {'joy': 0, 'anger': 1, 'fear': 2, 'sadness': 3}
# Model
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels=len(label_to_id))

## Print the model and peft details

In [7]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model

trainable params: 777,988 || all params: 125,426,696 || trainable%: 0.6202730557456444


PeftModelForSequenceClassification(
  (base_model): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(

# Define the optimizer

In [8]:
optimizer = AdamW(params=model.parameters(), lr=lr)

# Instantiate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)

# Fine-tune!

In [9]:
model.to(device)
f1_metric = load('f1', config_name='multiclass', average='weighted')
accuracy_metric = load('accuracy')
for epoch in range(num_epochs):
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    for step, batch in enumerate(tqdm(val_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        predictions, references = predictions, batch["labels"]
        accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])
        f1_metric.add_batch(predictions=predictions, references=batch["labels"])

    # Compute final metric values
    final_accuracy = accuracy_metric.compute()
    final_f1 = f1_metric.compute(average='weighted')
    print(f"Accuracy: {final_accuracy['accuracy']}")
    print(f"F1 Score: {final_f1['f1']}")

100%|████████████████████████████████████████████████████████████████████████████████| 226/226 [00:45<00:00,  4.98it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:02<00:00,  9.66it/s]


Accuracy: 0.6570605187319885
F1 Score: 0.6567599975141224


100%|████████████████████████████████████████████████████████████████████████████████| 226/226 [00:44<00:00,  5.04it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:02<00:00,  9.94it/s]


Accuracy: 0.7319884726224783
F1 Score: 0.729596788945238


100%|████████████████████████████████████████████████████████████████████████████████| 226/226 [00:44<00:00,  5.06it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:02<00:00,  9.70it/s]


Accuracy: 0.6829971181556196
F1 Score: 0.6758584106853177


100%|████████████████████████████████████████████████████████████████████████████████| 226/226 [00:45<00:00,  5.00it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:02<00:00,  9.64it/s]

Accuracy: 0.7809798270893372
F1 Score: 0.779587184263078



