# ASR-Based Speech Emotion Recognition on IEMOCAP

## Import Libraries

In [49]:
import os
import json
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW

## Preprocess Dataset

In [18]:
# function to help construct dictionary of sentence id to text using transcriptions files
def id_2_text_construct(id_2_text, transcriptions_dir):
    """Add the utterance transcripts found in ``transcriptions_dir`` to ``id_2_text``.

    Every ``.txt`` file in the directory is read line by line. A line whose
    first whitespace-separated token starts with ``'Ses'`` is treated as an
    utterance line: that token becomes the key, the second token is dropped
    (presumably the timestamp field -- confirm against the transcript format),
    and the remaining tokens joined by single spaces become the text.

    Args:
        id_2_text: dict mapping sentence id -> text; it is updated in place.
        transcriptions_dir: path of the directory holding the transcription files.

    Returns:
        The same ``id_2_text`` dict that was passed in.
    """
    # sorted() keeps the traversal order independent of the filesystem
    for filename in sorted(os.listdir(transcriptions_dir)):
        if not filename.endswith('.txt'):
            continue

        with open(os.path.join(transcriptions_dir, filename), 'r') as file:
            for line in file:
                line_split = line.split()

                # a blank line splits to [] and would raise IndexError on [0]
                if not line_split:
                    continue

                if line_split[0].startswith('Ses'):
                    id_2_text[line_split[0]] = ' '.join(line_split[2:])

    return id_2_text

In [26]:
# function to help construct dictionary of sentence id to label using evaluation files
def id_2_label_construct(id_2_label, evaluation_dir):
    """Add the emotion labels found in ``evaluation_dir`` to ``id_2_label``.

    Every ``.txt`` file in the directory is read line by line. A line is used
    when it has at least five whitespace-separated tokens and the fourth token
    starts with ``'Ses'``; the fourth token is the sentence id and the fifth
    is its label. Sentences labelled ``'xxx'`` or ``'oth'`` are left out.

    Args:
        id_2_label: dict mapping sentence id -> label; it is updated in place.
        evaluation_dir: path of the directory holding the evaluation files.

    Returns:
        The same ``id_2_label`` dict that was passed in.
    """
    excluded_labels = ('xxx', 'oth')

    # sorted() keeps the traversal order independent of the filesystem
    for filename in sorted(os.listdir(evaluation_dir)):
        if not filename.endswith('.txt'):
            continue

        with open(os.path.join(evaluation_dir, filename), 'r') as file:
            for line in file:
                line_split = line.split()

                # index 4 (the label) is read below, so at least 5 tokens are
                # required; checking only for 4 allowed an IndexError
                if len(line_split) < 5 or not line_split[3].startswith('Ses'):
                    continue

                sentence_id, label = line_split[3], line_split[4]
                if label not in excluded_labels:
                    id_2_label[sentence_id] = label

    return id_2_label

In [30]:
# initialize dictionaries
id_2_text = {}
id_2_label = {}

# gather transcripts and emotion labels from the five session directories
for session_num in range(1, 6):
    session_dir = 'Dataset/IEMOCAP/Session{}'.format(session_num)
    transcriptions_dir = os.path.join(session_dir, 'dialog/transcriptions')
    evaluation_dir = os.path.join(session_dir, 'dialog/EmoEvaluation')

    id_2_text = id_2_text_construct(id_2_text, transcriptions_dir)
    id_2_label = id_2_label_construct(id_2_label, evaluation_dir)

# assign class indices from the sorted label names so the mapping is the same
# on every run; first-seen order would depend on the arbitrary os.listdir order
label_2_idx = {
    label: idx for idx, label in enumerate(sorted(set(id_2_label.values())))
}
idx_2_label = {idx: label for label, idx in label_2_idx.items()}

# pair each labelled sentence with its text, in sorted id order for the same
# reason; a labelled id without a transcript raises KeyError here
text_and_label = [
    (id_2_text[sentence_id], label_2_idx[id_2_label[sentence_id]])
    for sentence_id in sorted(id_2_label)
]

# save the dictionaries and lists to dataset directory
dataset_dir = 'Dataset/IEMOCAP'

# note: json stores the integer keys of idx_2_label as strings
with open(os.path.join(dataset_dir, 'idx_2_label.json'), 'w') as json_file:
    json.dump(idx_2_label, json_file)

with open(os.path.join(dataset_dir, 'label_2_idx.json'), 'w') as json_file:
    json.dump(label_2_idx, json_file)

with open(os.path.join(dataset_dir, 'text_and_label.json'), 'w') as json_file:
    json.dump(text_and_label, json_file)

## Generate Train and Validation Dataset

In [37]:
# read the saved (text, label index) pairs back from the dataset directory
text_and_label_path = os.path.join(dataset_dir, 'text_and_label.json')
with open(text_and_label_path, 'r') as json_file:
    text_and_label = json.load(json_file)

# split the pairs into two parallel lists: texts and their label indices
text_data = [text for text, _ in text_and_label]
label_data = [label for _, label in text_and_label]

In [38]:
# hold out 12% of the examples for validation; the split is stratified on the
# labels and uses a fixed random_state
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    text_data,
    label_data,
    test_size=0.12,
    random_state=42,
    stratify=label_data,
)

## Define Dataset

In [64]:
# define dataset for IEMOCAP text emotion classification
class IEMOCAP_Text_Dataset(Dataset):
    """Dataset that pairs tokenized texts with their class labels.

    All texts are tokenized once in the constructor; indexing returns a single
    example as a dict of tensors.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        """Tokenize ``texts`` and keep ``labels`` as a tensor.

        Args:
            texts: the input texts, passed to ``tokenizer`` as-is.
            labels: the class labels, converted with ``torch.tensor``.
            tokenizer: callable invoked with ``texts`` plus ``truncation=True``,
                ``padding=True`` and ``max_length``; its result must offer
                ``.items()`` yielding per-example indexable values.
            max_length: forwarded to the tokenizer.
        """
        self.encodings = tokenizer(
            texts,
            max_length=max_length,
            padding=True,
            truncation=True,
        )
        self.labels = torch.tensor(labels)

    def __len__(self):
        """Return the number of labels held by the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = self.labels[idx]
        return item

In [78]:
# read the saved index -> label mapping; only its size is needed below
idx_2_label_path = os.path.join(dataset_dir, 'idx_2_label.json')
with open(idx_2_label_path, 'r') as json_file:
    idx_2_label = json.load(json_file)

# pretrained tokenizer, plus a sequence classifier with one output per label
pretrained_name = "roberta-base"
num_labels = len(idx_2_label)
tokenizer = RobertaTokenizer.from_pretrained(pretrained_name)
model = RobertaForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=num_labels,
)

# wrap the train and validation splits (tokenization happens in the constructor)
train_dataset = IEMOCAP_Text_Dataset(
    train_texts,
    train_labels,
    tokenizer,
)
valid_dataset = IEMOCAP_Text_Dataset(
    valid_texts,
    valid_labels,
    tokenizer,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Finetune Model

In [79]:
# hyperparameters used by the fine-tuning cells below
num_epochs = 5         # number of passes of the training loop
batch_size = 8         # examples per batch for both data loaders
learning_rate = 1e-5   # learning rate given to the optimizer

In [80]:
# batch both splits; only the training split is shuffled
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
valid_dataloader = DataLoader(
    dataset=valid_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [81]:
# set up optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# choose the device at runtime instead of hard-coding "mps", which fails on
# machines without Apple MPS: prefer CUDA, then MPS, then fall back to CPU
mps_backend = getattr(torch.backends, "mps", None)  # absent in older torch builds
if torch.cuda.is_available():
    device = torch.device("cuda")
elif mps_backend is not None and mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model.to(device)

# training loop: one training pass and one validation pass per epoch
for epoch in range(num_epochs):
    # train
    print("Start Train for Epoch {}".format(epoch+1))
    model.train()
    train_loss = 0
    train_correct_preds = 0
    for batch in tqdm(train_dataloader):
        # every tensor in the batch (including 'labels') is moved to the device
        inputs = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        labels = inputs['labels']  # already on the device
        loss = loss_fn(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        # calculate train accuracy
        _, train_predictions = torch.max(logits, dim=1)
        train_correct_preds += torch.sum(train_predictions == labels).item()

    # validation
    print("Start Validation for Epoch {}".format(epoch+1))
    model.eval()
    valid_loss = 0
    valid_correct_preds = 0
    with torch.no_grad():
        for batch in tqdm(valid_dataloader):
            inputs = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**inputs)
            logits = outputs.logits
            labels = inputs['labels']  # already on the device
            loss = loss_fn(logits, labels)
            valid_loss += loss.item()

            # calculate validation accuracy
            _, valid_predictions = torch.max(logits, dim=1)
            valid_correct_preds += torch.sum(valid_predictions == labels).item()

    # losses are averaged per batch, accuracies per example
    avg_train_loss = train_loss / len(train_dataloader)
    train_accuracy = train_correct_preds / len(train_dataset)
    avg_valid_loss = valid_loss / len(valid_dataloader)
    valid_accuracy = valid_correct_preds / len(valid_dataset)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Valid Loss: {avg_valid_loss:.4f}, Valid Accuracy: {valid_accuracy:.4f}\n")

Start Train for Epoch 1


100%|██████████| 829/829 [05:13<00:00,  2.65it/s]


Start Validation for Epoch 1


100%|██████████| 113/113 [00:08<00:00, 13.23it/s]


Epoch 1/5, Train Loss: 1.5546, Train Accuracy: 0.4066, Valid Loss: 1.3347, Valid Accuracy: 0.4978

Start Train for Epoch 2


100%|██████████| 829/829 [05:09<00:00,  2.68it/s]


Start Validation for Epoch 2


100%|██████████| 113/113 [00:08<00:00, 13.29it/s]


Epoch 2/5, Train Loss: 1.1506, Train Accuracy: 0.5707, Valid Loss: 1.2335, Valid Accuracy: 0.5531

Start Train for Epoch 3


100%|██████████| 829/829 [04:40<00:00,  2.96it/s]


Start Validation for Epoch 3


100%|██████████| 113/113 [00:08<00:00, 13.45it/s]


Epoch 3/5, Train Loss: 0.9412, Train Accuracy: 0.6509, Valid Loss: 1.3171, Valid Accuracy: 0.5254

Start Train for Epoch 4


100%|██████████| 829/829 [05:18<00:00,  2.60it/s]


Start Validation for Epoch 4


100%|██████████| 113/113 [00:08<00:00, 13.33it/s]


Epoch 4/5, Train Loss: 0.7995, Train Accuracy: 0.7008, Valid Loss: 1.2818, Valid Accuracy: 0.5642

Start Train for Epoch 5


100%|██████████| 829/829 [05:11<00:00,  2.66it/s]


Start Validation for Epoch 5


100%|██████████| 113/113 [00:08<00:00, 13.29it/s]

Epoch 5/5, Train Loss: 0.6819, Train Accuracy: 0.7544, Valid Loss: 1.3970, Valid Accuracy: 0.5465






## Save the Model

In [82]:
# specify the directory to save the models
model_dir = 'Models'

# exist_ok=True creates the directory only when it is missing, without the
# separate check-then-create step
os.makedirs(model_dir, exist_ok=True)

finetuned_model_dir = os.path.join(model_dir, 'finetuned_roberta_IEMOCAP')

# save model and tokenizer to specified directory
model.save_pretrained(finetuned_model_dir)
tokenizer.save_pretrained(finetuned_model_dir)

('Models/finetuned_roberta_IEMOCAP/tokenizer_config.json',
 'Models/finetuned_roberta_IEMOCAP/special_tokens_map.json',
 'Models/finetuned_roberta_IEMOCAP/vocab.json',
 'Models/finetuned_roberta_IEMOCAP/merges.txt',
 'Models/finetuned_roberta_IEMOCAP/added_tokens.json')