In [1]:
import json
import random
import torch
import os

import pandas as pd
import numpy as np
import torch.nn as nn

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split
from sklearn.metrics import classification_report

In [2]:
# Set to GPU if available.
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('Using GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using CPU instead.')

Using GPU: NVIDIA GeForce RTX 3090


In [4]:
# Load the dataset.
#df = pd.read_json("Task 1-20231125T063955Z-001\Task 1\MaSaC_train_erc.json")
df = pd.read_json('data/T1_train_erc.json')
# Load the hing bert tokenizer.
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/hing-bert")

# Get the input ids (bert embedding index) and attention masks (mask of whether a token is padding).
# Note that this makes all utterances seperate examples, as bert does not support paragraphs (<SEP> only support a sentence pair).
X_input_ids = []
X_attention_mask = []

for utterances in df["utterances"]:
    for utterance in utterances:
        encoded_dict = tokenizer.encode_plus(utterance, add_special_tokens = True, max_length = 160, pad_to_max_length = True,
                                             return_attention_mask = True, return_tensors = 'pt', truncation = True)
        X_input_ids.append(encoded_dict['input_ids'])
        X_attention_mask.append(encoded_dict['attention_mask'])

# Convert emotions to labels.
label_to_index = {'contempt':0, 'anger':1, 'surprise':2, 'fear':3, 'disgust':4, 'sadness':5, 'joy':6, 'neutral':7}
Y = []
for emotions in df["emotions"]: 
    Y.extend([label_to_index[emotion] for emotion in emotions])

X_input_ids = torch.cat(X_input_ids, dim=0)
X_attention_mask = torch.cat(X_attention_mask, dim=0)
Y = torch.tensor(Y)

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [5]:
# Prepare the dataset and data loaders.
dataset = TensorDataset(X_input_ids, X_attention_mask, Y)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
batch_size = 32

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size)
validation_dataloader = DataLoader(val_dataset, sampler = SequentialSampler(val_dataset), batch_size = batch_size)

In [6]:
# Prepare the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "l3cube-pune/hing-bert", num_labels = 8, output_attentions = False, output_hidden_states = False
)
model.cuda()

config.json:   0%|          | 0.00/716 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/hing-bert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
# Prepare the optimizer and learning rate scheduler.
epochs = 4
total_steps = len(train_dataloader) * epochs
optimizer = AdamW(model.parameters(), lr = 5e-5, eps = 1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)



In [8]:
# Evaluation.
def evaluation(model):
    print("Running Validation...")
    index_to_label = {0:'contempt', 1:'anger', 2:'surprise', 3:'fear', 4:'disgust', 5:'sadness', 6:'joy', 7:'neutral'}
    model.eval()
    total_eval_loss = 0
    predictions = []
    labels = []
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
    
        with torch.no_grad():
            result = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=True)
    
        logits = result.logits
        total_eval_loss += loss.item()
    
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
    
        predictions.extend([index_to_label[i] for i in np.argmax(logits, axis=1).flatten()])
        labels.extend([index_to_label[i] for i in label_ids.flatten()])

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print("Validation Loss: {0:.2f}".format(avg_val_loss))
    print(classification_report(labels, predictions))

In [9]:
seed_val = 10086
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Training loop.
for epoch_i in range(0, epochs):
    print()
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Running Training...')

    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 100 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()

        result = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=True)
        loss = result.loss
        logits = result.logits
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print()
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print()

    # After each epoch, run validation once.
    evaluation(model)

print()
print("Training complete!")


Running Training...
  Batch   100  of    240.
  Batch   200  of    240.

  Average training loss: 1.51

Running Validation...
Validation Loss: 1.36
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        68
    contempt       1.00      0.04      0.07        52
     disgust       0.00      0.00      0.00         9
        fear       0.00      0.00      0.00        58
         joy       0.56      0.37      0.45       162
     neutral       0.51      0.92      0.65       396
     sadness       0.00      0.00      0.00        62
    surprise       0.52      0.25      0.34        44

    accuracy                           0.51       851
   macro avg       0.32      0.20      0.19       851
weighted avg       0.43      0.51      0.41       851


Running Training...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  Batch   100  of    240.
  Batch   200  of    240.

  Average training loss: 1.22

Running Validation...
Validation Loss: 0.92
              precision    recall  f1-score   support

       anger       0.38      0.25      0.30        68
    contempt       0.44      0.13      0.21        52
     disgust       0.20      0.11      0.14         9
        fear       0.18      0.09      0.12        58
         joy       0.49      0.48      0.48       162
     neutral       0.55      0.74      0.63       396
     sadness       0.23      0.18      0.20        62
    surprise       0.53      0.20      0.30        44

    accuracy                           0.49       851
   macro avg       0.37      0.27      0.30       851
weighted avg       0.46      0.49      0.46       851


Running Training...
  Batch   100  of    240.
  Batch   200  of    240.

  Average training loss: 0.88

Running Validation...
Validation Loss: 1.00
              precision    recall  f1-score   support

       anger     

In [None]:
# Save the model.
output_dir = '\model_save_hinglish'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Load the saved model.
saved_model = BertForSequenceClassification.from_pretrained(output_dir)
saved_tokenizer = BertTokenizer.from_pretrained(output_dir)
saved_model.to(device)
evaluation(saved_model)

In [10]:
# Load the new data
new_df = pd.read_json('data/T1_test_erc.json')

new_input_ids = []
new_attention_masks = []

for utterances in new_df["utterances"]:
    for utterance in utterances:
        encoded_dict = tokenizer.encode_plus(
            utterance, 
            add_special_tokens = True, 
            max_length = 160, 
            pad_to_max_length = True,
            return_attention_mask = True, 
            return_tensors = 'pt', 
            truncation = True
        )
        new_input_ids.append(encoded_dict['input_ids'])
        new_attention_masks.append(encoded_dict['attention_mask'])

new_input_ids = torch.cat(new_input_ids, dim=0)
new_attention_masks = torch.cat(new_attention_masks, dim=0)

# Create a DataLoader for the new data
new_dataset = TensorDataset(new_input_ids, new_attention_masks)
new_dataloader = DataLoader(new_dataset, batch_size = 32)

# Predict
model.eval()
predictions = []

with torch.no_grad():
    for batch in new_dataloader:
        b_input_ids, b_input_mask = batch
        b_input_ids = b_input_ids.to('cuda')
        b_input_mask = b_input_mask.to('cuda')
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        predictions.extend(np.argmax(logits, axis=1))

print(predictions[:10])



[5, 7, 3, 5, 1, 7, 7, 7, 7, 7]


In [11]:
index_to_label = {v: k for k, v in label_to_index.items()}

# Map each prediction to its corresponding emotion name
predicted_emotions = [index_to_label[label] for label in predictions]

# Write the emotions to a .txt file
with open('predictions2.txt', 'w') as file:
    for emotion in predicted_emotions:
        file.write(emotion + '\n')