In [1]:
import json
import random
import torch
import os

import pandas as pd
import numpy as np
import torch.nn as nn

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set to GPU if available.
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('Using GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using CPU instead.')

Using GPU: NVIDIA GeForce GTX 1070 Ti


In [3]:
# Load the dataset.
def load_dataset(fname, train=True):
    df = pd.read_json(fname)
    # df = pd.read_json("Task 1-20231125T063955Z-001\Task 1\MaSaC_train_erc.json")
    
    # Load the hing bert tokenizer.
    tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/hing-bert")
    
    # Get the input ids (bert embedding index) and attention masks (mask of whether a token is padding).
    # Note that this makes all utterances seperate examples, as bert does not support paragraphs (<SEP> only support a sentence pair).
    X_input_ids = []
    X_attention_mask = []
    
    for utterances in df["utterances"]:
        for utterance in utterances:
            encoded_dict = tokenizer.encode_plus(utterance, add_special_tokens = True, max_length = 160, pad_to_max_length = True,
                                                 return_attention_mask = True, return_tensors = 'pt', truncation = True)
            X_input_ids.append(encoded_dict['input_ids'])
            X_attention_mask.append(encoded_dict['attention_mask'])
    
    # Convert emotions to labels.
    label_to_index = {'contempt':0, 'anger':1, 'surprise':2, 'fear':3, 'disgust':4, 'sadness':5, 'joy':6, 'neutral':7}
    Y = []
    if train:
        for emotions in df["emotions"]: 
            Y.extend([label_to_index[emotion] for emotion in emotions])
    else:
        for utterances in df["utterances"]: 
            Y.extend([0 for utterance in utterances])
    
    X_input_ids = torch.cat(X_input_ids, dim=0)
    X_attention_mask = torch.cat(X_attention_mask, dim=0)
    Y = torch.tensor(Y)
    return TensorDataset(X_input_ids, X_attention_mask, Y)

In [4]:
# Prepare the dataset and data loaders.
# dataset = TensorDataset(X_input_ids, X_attention_mask, Y)

# train_size = int(0.9 * len(dataset))
# val_size = len(dataset) - train_size
# batch_size = 32

# train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 32
# train_dataset = load_dataset("Task 1-20231125T063955Z-001\Task 1\MaSaC_train_erc.json")
# val_dataset = load_dataset("MaSaC_val_erc.json")
train_dataset = load_dataset("Task 2-20231125T064000Z-001\Task 2\MaSaC_train_efr.json")
val_dataset = load_dataset("MaSaC_val_efr.json")

train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size)
validation_dataloader = DataLoader(val_dataset, sampler = SequentialSampler(val_dataset), batch_size = batch_size)



In [5]:
# Prepare the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "l3cube-pune/hing-bert", num_labels = 8, output_attentions = False, output_hidden_states = False
)
model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/hing-bert and are newly initialized: ['bert.pooler.dense.weight', 'classifier.weight', 'bert.pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
# Prepare the optimizer and learning rate scheduler.
epochs = 4
total_steps = len(train_dataloader) * epochs
optimizer = AdamW(model.parameters(), lr = 5e-5, eps = 1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)



In [7]:
# Evaluation.
def evaluation(model):
    print("Running Validation...")
    index_to_label = {0:'contempt', 1:'anger', 2:'surprise', 3:'fear', 4:'disgust', 5:'sadness', 6:'joy', 7:'neutral'}
    model.eval()
    total_eval_loss = 0
    predictions = []
    labels = []
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
    
        with torch.no_grad():
            result = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=True)
    
        logits = result.logits
        total_eval_loss += loss.item()
    
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
    
        predictions.extend([index_to_label[i] for i in np.argmax(logits, axis=1).flatten()])
        labels.extend([index_to_label[i] for i in label_ids.flatten()])

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print("Validation Loss: {0:.2f}".format(avg_val_loss))
    print(classification_report(labels, predictions))

In [8]:
seed_val = 10086
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Training loop.
for epoch_i in range(0, epochs):
    print()
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Running Training...')

    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 100 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()

        result = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=True)
        loss = result.loss
        logits = result.logits
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print()
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print()

    # After each epoch, run validation once.
    evaluation(model)

print()
print("Training complete!")


Running Training...
  Batch   100  of  3,087.
  Batch   200  of  3,087.
  Batch   300  of  3,087.
  Batch   400  of  3,087.
  Batch   500  of  3,087.
  Batch   600  of  3,087.
  Batch   700  of  3,087.
  Batch   800  of  3,087.
  Batch   900  of  3,087.
  Batch 1,000  of  3,087.
  Batch 1,100  of  3,087.
  Batch 1,200  of  3,087.
  Batch 1,300  of  3,087.
  Batch 1,400  of  3,087.
  Batch 1,500  of  3,087.
  Batch 1,600  of  3,087.
  Batch 1,700  of  3,087.
  Batch 1,800  of  3,087.
  Batch 1,900  of  3,087.
  Batch 2,000  of  3,087.
  Batch 2,100  of  3,087.
  Batch 2,200  of  3,087.
  Batch 2,300  of  3,087.
  Batch 2,400  of  3,087.
  Batch 2,500  of  3,087.
  Batch 2,600  of  3,087.
  Batch 2,700  of  3,087.
  Batch 2,800  of  3,087.
  Batch 2,900  of  3,087.
  Batch 3,000  of  3,087.

  Average training loss: 0.55

Running Validation...
Validation Loss: 0.39
              precision    recall  f1-score   support

       anger       0.91      0.93      0.92       639
    contempt  

In [9]:
# Save the model.
output_dir = '\model_save_hinglish_2'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/hing-bert")
tokenizer.save_pretrained(output_dir)

('\\model_save_hinglish_2\\tokenizer_config.json',
 '\\model_save_hinglish_2\\special_tokens_map.json',
 '\\model_save_hinglish_2\\vocab.txt',
 '\\model_save_hinglish_2\\added_tokens.json',
 '\\model_save_hinglish_2\\tokenizer.json')

In [14]:
# Load the saved model.
output_dir = '\model_save_hinglish_2'
saved_model = AutoModelForSequenceClassification.from_pretrained(output_dir, output_hidden_states = True)
saved_tokenizer = AutoTokenizer.from_pretrained(output_dir)
saved_model.to(device)
# evaluation(saved_model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [17]:
# Extract sentence embeddings.
import csv
def extract_embedding(model):
    test_dataset = load_dataset("MaSaC_test_efr.json", train=False)
    test_dataloader = DataLoader(test_dataset, sampler = SequentialSampler(test_dataset), batch_size = batch_size)
    model.eval()
    cnt = 0
    fnames = ['task2_sentence_embedding_train.csv', 'task2_sentence_embedding_val.csv', 'task2_sentence_embedding_test.csv']
    idx = 0
    for data_loader in [train_dataloader, validation_dataloader, test_dataloader]:
        with open(fnames[idx], 'w', newline='') as csvfile:
            embedding_writer = csv.writer(csvfile, delimiter=' ')
            for batch in data_loader:
                b_input_ids = batch[0].to(device)
                b_input_mask = batch[1].to(device)
                b_labels = batch[2].to(device)
                cnt += 1
                if cnt % 100 == 0:
                    print("Processed", cnt, "batches")
            
                with torch.no_grad():
                    result = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=True)
                    # Sentence embedding is the states of the first token of the last hidden layer for BertForSequenceClassification.
                    embedding_batch = result.hidden_states[-1][:,0,:]
                    for embedding in embedding_batch:
                        embedding_writer.writerow(embedding.cpu().numpy())
        idx += 1

extract_embedding(saved_model)



Processed 100 batches
Processed 200 batches
Processed 300 batches
Processed 400 batches
Processed 500 batches
Processed 600 batches
Processed 700 batches
Processed 800 batches
Processed 900 batches
Processed 1000 batches
Processed 1100 batches
Processed 1200 batches
Processed 1300 batches
Processed 1400 batches
Processed 1500 batches
Processed 1600 batches
Processed 1700 batches
Processed 1800 batches
Processed 1900 batches
Processed 2000 batches
Processed 2100 batches
Processed 2200 batches
Processed 2300 batches
Processed 2400 batches
Processed 2500 batches
Processed 2600 batches
Processed 2700 batches
Processed 2800 batches
Processed 2900 batches
Processed 3000 batches
Processed 3100 batches
Processed 3200 batches
Processed 3300 batches
Processed 3400 batches
Processed 3500 batches


In [None]:
def predict_emotion(model):
    test_dataset = load_dataset("MaSaC_test_erc.json", train=False)
    test_dataloader = DataLoader(test_dataset, sampler = SequentialSampler(test_dataset), batch_size = batch_size)
    index_to_label = {0:'contempt', 1:'anger', 2:'surprise', 3:'fear', 4:'disgust', 5:'sadness', 6:'joy', 7:'neutral'}
    model.eval()
    cnt = 0

    predictions = []
    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        cnt += 1
        if cnt % 100 == 0:
            print("Processed", cnt, "batches")
    
        with torch.no_grad():
            result = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=True)
            logits = result.logits.detach().cpu().numpy()
            predictions.extend([index_to_label[i] + "\n" for i in np.argmax(logits, axis=1).flatten()])
    # print(predictions)
    
    with open("answer1.txt", "w") as f:
        f.writelines(predictions)

# predict_emotion(saved_model)