In [1]:
import json
import random
import torch
import os
import csv

import pandas as pd
import numpy as np
import torch.nn as nn

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set to GPU if available.
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('Using GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using CPU instead.')

Using GPU: NVIDIA GeForce GTX 1070 Ti


In [3]:
# Load the dataset.
df = pd.read_json("Task 1-20231125T063955Z-001\Task 1\MaSaC_train_erc.json")
# df = pd.read_json("Task 3-20231125T064001Z-001\Task 3\MELD_train_efr.json")

with open("translated_task1.json", "r") as f:
    translation = json.load(f)

# Load the bert tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Get the input ids (bert embedding index) and attention masks (mask of whether a token is padding).
# Note that this makes all utterances seperate examples, as bert does not support paragraphs (<SEP> only support a sentence pair).
X_input_ids = []
X_attention_mask = []

for utterances in df["utterances"]:
    for utterance in utterances:
        encoded_dict = tokenizer.encode_plus(translation[utterance], add_special_tokens = True, max_length = 160, pad_to_max_length = True,
                                             return_attention_mask = True, return_tensors = 'pt', truncation = True)
        X_input_ids.append(encoded_dict['input_ids'])
        X_attention_mask.append(encoded_dict['attention_mask'])

# Convert emotions to labels.
label_to_index = {'contempt':0, 'anger':1, 'surprise':2, 'fear':3, 'disgust':4, 'sadness':5, 'joy':6, 'neutral':7}
Y = []
for emotions in df["emotions"]: 
    Y.extend([label_to_index[emotion] for emotion in emotions])

X_input_ids = torch.cat(X_input_ids, dim=0)
X_attention_mask = torch.cat(X_attention_mask, dim=0)
Y = torch.tensor(Y)



In [4]:
# Prepare the dataset and data loaders.
dataset = TensorDataset(X_input_ids, X_attention_mask, Y)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
batch_size = 32

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size)
validation_dataloader = DataLoader(val_dataset, sampler = SequentialSampler(val_dataset), batch_size = batch_size)

In [5]:
# Prepare the model.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels = 8, output_attentions = False, output_hidden_states = False,
)
model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
# Prepare the optimizer and learning rate scheduler.
epochs = 4
total_steps = len(train_dataloader) * epochs
optimizer = AdamW(model.parameters(), lr = 5e-5, eps = 1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)



In [7]:
# Evaluation.
def evaluation(model):
    print("Running Validation...")
    index_to_label = {0:'contempt', 1:'anger', 2:'surprise', 3:'fear', 4:'disgust', 5:'sadness', 6:'joy', 7:'neutral'}
    model.eval()
    total_eval_loss = 0
    predictions = []
    labels = []
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
    
        with torch.no_grad():
            result = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=True)
    
        logits = result.logits
        total_eval_loss += loss.item()
    
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
    
        predictions.extend([index_to_label[i] for i in np.argmax(logits, axis=1).flatten()])
        labels.extend([index_to_label[i] for i in label_ids.flatten()])

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print("Validation Loss: {0:.2f}".format(avg_val_loss))
    print(classification_report(labels, predictions))

In [8]:
seed_val = 10086
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Training loop.
for epoch_i in range(0, epochs):
    print()
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Running Training...')

    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 100 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()

        result = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=True)
        loss = result.loss
        logits = result.logits
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print()
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print()

    # After each epoch, run validation once.
    evaluation(model)

print()
print("Training complete!")


Running Training...
  Batch   100  of    240.
  Batch   200  of    240.

  Average training loss: 1.56

Running Validation...
Validation Loss: 1.60
              precision    recall  f1-score   support

       anger       0.50      0.01      0.02        95
    contempt       0.00      0.00      0.00        49
     disgust       0.00      0.00      0.00        14
        fear       0.33      0.05      0.08        43
         joy       0.63      0.26      0.37       170
     neutral       0.46      0.97      0.62       365
     sadness       0.00      0.00      0.00        62
    surprise       0.00      0.00      0.00        53

    accuracy                           0.47       851
   macro avg       0.24      0.16      0.14       851
weighted avg       0.39      0.47      0.35       851


Running Training...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  Batch   100  of    240.
  Batch   200  of    240.

  Average training loss: 1.34

Running Validation...
Validation Loss: 1.66
              precision    recall  f1-score   support

       anger       0.52      0.15      0.23        95
    contempt       0.28      0.22      0.25        49
     disgust       1.00      0.07      0.13        14
        fear       0.33      0.02      0.04        43
         joy       0.53      0.31      0.39       170
     neutral       0.50      0.88      0.64       365
     sadness       0.14      0.03      0.05        62
    surprise       0.55      0.30      0.39        53

    accuracy                           0.49       851
   macro avg       0.48      0.25      0.27       851
weighted avg       0.47      0.49      0.42       851


Running Training...
  Batch   100  of    240.
  Batch   200  of    240.

  Average training loss: 1.01

Running Validation...
Validation Loss: 1.40
              precision    recall  f1-score   support

       anger     

In [9]:
# Save the model.
output_dir = '\model_save_translated_hinglish_task1'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('\\model_save\\tokenizer_config.json',
 '\\model_save\\special_tokens_map.json',
 '\\model_save\\vocab.txt',
 '\\model_save\\added_tokens.json')

In [6]:
# Load the saved model.
output_dir = '\model_save_translated_hinglish_task1'
saved_model = BertForSequenceClassification.from_pretrained(output_dir, output_hidden_states = True)
saved_tokenizer = BertTokenizer.from_pretrained(output_dir)
saved_model.to(device)
# evaluation(saved_model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [17]:
# Extract sentence embeddings.
def extract_embedding(model):
    all_dataloader = DataLoader(dataset, sampler = SequentialSampler(dataset), batch_size = batch_size)
    model.eval()
    cnt = 0
    with open('task1_sentence_embedding.csv', 'w', newline='') as csvfile:
        embedding_writer = csv.writer(csvfile, delimiter=' ')
        for batch in all_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            cnt += 1
            if cnt % 100 == 0:
                print("Processed", cnt, "batches")
        
            with torch.no_grad():
                result = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=True)
                # Sentence embedding is the states of the first token of the last hidden layer for BertForSequenceClassification.
                embedding_batch = result.hidden_states[-1][:,0,:]
                for embedding in embedding_batch:
                    embedding_writer.writerow(embedding.cpu().numpy())

extract_embedding(saved_model)

Processed 100 batches
Processed 200 batches
Processed 300 batches
Processed 400 batches
Processed 500 batches
Processed 600 batches
Processed 700 batches
Processed 800 batches
Processed 900 batches
Processed 1000 batches
