# Import packages

In [1]:
import argparse
import os

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptEncoder,
    PromptEncoderConfig
)
from evaluate import load
from torch.utils.data import DataLoader, Dataset
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from tqdm import tqdm

# Read files

In [2]:
import pandas as pd
#load train data

def get_file(path):
    cols = ['id', 'text', 'label', 'intensity']
    anger = pd.read_csv(path + 'anger.txt', header=None, sep='\t', names=cols, index_col=0)
    fear = pd.read_csv(path + 'fear.txt', header=None, sep='\t', names=cols, index_col=0)
    sad = pd.read_csv(path + 'sadness.txt', header=None, sep='\t', names=cols, index_col=0)
    joy = pd.read_csv(path + 'joy.txt', header=None, sep='\t', names=cols, index_col=0)
    # Combine the DataFrames
    combined_df = pd.concat([anger, fear, sad, joy])
    # Shuffle the combined DataFrame
    shuffled = combined_df.sample(frac=1).reset_index(drop=True)
    return shuffled

train=get_file("train/")
val=get_file("Dev/")
train

Unnamed: 0,text,label,intensity
0,Look forward to the detours because they bring...,joy,0.667
1,"GKN so lively as well, mad quick",joy,0.320
2,@siomo @NEWSTALK1010 20 says he gets reelected...,fear,0.271
3,Hope was an instinct only the reasoning human ...,fear,0.521
4,"sure, ohio state is terrible, ohio is awful, e...",fear,0.500
...,...,...,...
3608,@British_Airways #lost #bag #stillwaiting go h...,sadness,0.542
3609,If my luck the rest of Fall goes anything like...,anger,0.271
3610,@Onision @Eugenia_Cooney annoyed by the good l...,joy,0.104
3611,Unbelievable takes 10 minutes to get through t...,anger,0.604


# Prepare the dataset

In [3]:
class EmotionDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.dataframe = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sentence = self.dataframe.iloc[idx]['text']
        label = self.dataframe.iloc[idx]['label']

        # Tokenize the sentence
        inputs = self.tokenizer(sentence, truncation=True, max_length=None, return_tensors="pt")
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}

        # Convert label to a numeric format if necessary
        label_to_id = {'joy': 0, 'anger': 1, 'fear': 2, 'sadness': 3}
        label_id = label_to_id[label]

        inputs['labels'] = torch.tensor(label_id, dtype=torch.long)

        return inputs

def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")


# Define the p tuning configurations

In [4]:
batch_size = 16
model_name_or_path = "cardiffnlp/twitter-roberta-base-emotion"
peft_type = PeftType.P_TUNING
device = "cuda"
num_epochs = 4
peft_config = PromptEncoderConfig(task_type="SEQ_CLS", 
                                 num_virtual_tokens=20,
                                 encoder_reparameterization_type="MLP", #note the options are MLP, LSTM, or EMB
                                 encoder_hidden_size=128) # hidden state size of MLP
lr = 0.005





# Define the pre-trained LLM

In [5]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,padding_side='left')
label_to_id = {'joy': 0, 'anger': 1, 'fear': 2, 'sadness': 3}
# Model
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels=len(label_to_id))

train_dataset = EmotionDataset(train, tokenizer)
val_dataset = EmotionDataset(val, tokenizer)
# Use the collate_fn in your DataLoaders
train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, shuffle=True, collate_fn=collate_fn, batch_size=batch_size)

### Print the model and peft details

In [7]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model

trainable params: 823,044 || all params: 125,471,752 || trainable%: 0.6559595979818629


PeftModelForSequenceClassification(
  (base_model): PeftModelForSequenceClassification(
    (base_model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)
                  (value): Linear(in_features=768, out_features=768, bias=True)
                  (dropout): 

# Define the optimizer

In [8]:
optimizer = AdamW(params=model.parameters(), lr=lr)

# Instantiate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)

# Fine-tune!

In [9]:
model.to(device)
f1_metric = load('f1', config_name='multiclass', average='weighted')
accuracy_metric = load('accuracy')
for epoch in range(num_epochs):
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    for step, batch in enumerate(tqdm(val_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        predictions, references = predictions, batch["labels"]
        accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])
        f1_metric.add_batch(predictions=predictions, references=batch["labels"])

    # Compute final metric values
    final_accuracy = accuracy_metric.compute()
    final_f1 = f1_metric.compute(average='weighted')
    print(f"Accuracy: {final_accuracy['accuracy']}")
    print(f"F1 Score: {final_f1['f1']}")

  0%|                                                                                          | 0/226 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|████████████████████████████████████████████████████████████████████████████████| 226/226 [00:06<00:00, 34.73it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 57.93it/s]


Accuracy: 0.6484149855907781
F1 Score: 0.6357025344250339


100%|████████████████████████████████████████████████████████████████████████████████| 226/226 [00:05<00:00, 37.85it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 60.83it/s]


Accuracy: 0.659942363112392
F1 Score: 0.656506507834601


100%|████████████████████████████████████████████████████████████████████████████████| 226/226 [00:06<00:00, 37.59it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 61.48it/s]


Accuracy: 0.6829971181556196
F1 Score: 0.6873436206358451


100%|████████████████████████████████████████████████████████████████████████████████| 226/226 [00:05<00:00, 38.67it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 60.22it/s]

Accuracy: 0.723342939481268
F1 Score: 0.7226385163084428



