In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

from tqdm import trange
from tabulate import tabulate

import pandas as pd
import numpy as np
import torch
import random

In [None]:
def get_device():
    """Select the torch device to run on and report the choice on stdout.

    Returns:
        torch.device: ``cuda`` when ``torch.cuda.is_available()`` is true,
        otherwise ``cpu``.
    """
    if not torch.cuda.is_available():
        print('No GPU available, using the CPU instead.')
        return torch.device("cpu")

    gpu = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    return gpu


# Device shared by the model and by every batch moved in the later cells.
device = get_device()

Loading our dataset:

In [None]:
# Read the tagged-utterance CSV into a DataFrame; later cells use its
# "tag" and "tagged_utterance" columns.
df = pd.read_csv("../input/rudabank/data_tagged.csv")

# Display the loaded frame as the cell output.
df

In [None]:
# Count non-null values per column for each distinct tag (class distribution check).
df.groupby(["tag"]).count()

In [None]:
# Fix the misspelled value "appreciatiom" wherever it appears as a whole cell value.
df.replace(to_replace = "appreciatiom", value = "appreciation", inplace = True)

# Drop every row tagged "apology_response"; that tag has no entry in the class mapping below.
apology_response_rows = df.index[df["tag"] == "apology_response"]
df.drop(index = apology_response_rows, inplace = True)

# Re-check the per-tag counts after the cleanup.
df.groupby(by = ["tag"]).count()

In [None]:
# Tag names listed in id order: the position in this list is the integer class id.
tag_names = [
    "apology",
    "appreciation",
    "avoiding",
    "back-channeling",
    "closing",
    "command",
    "disapproval",
    "neg_answer",
    "open_question",
    "opening",
    "other_answers",
    "pos_answer",
    "statement",
    "thanking",
    "yes_no_question",
]

# Mapping tag name -> integer class id (0..14).
classes = {name: idx for idx, name in enumerate(tag_names)}

# Replace tag names in the "tag" column by their integer ids, in place.
df.replace(to_replace = {"tag": classes}, inplace = True)

# Per-class counts, now keyed by integer id.
df.groupby(by = ["tag"]).count()

In [None]:
# Renumber the rows from 0 after the earlier row drop; drop=True discards the old index
# instead of keeping it as a column.
df.reset_index(drop=True, inplace=True)

In [None]:
# Single-column frame holding the model inputs (the utterance text).
utterances_df = df[["tagged_utterance"]]

# Display it as the cell output.
utterances_df

In [None]:
# Single-column frame holding the targets (the "tag" column, mapped to class ids earlier).
labels_df = df[["tag"]]

# Display it as the cell output.
labels_df

Fine-tuning RuBERT base cased conversational to classify dialog utterances, starting with tokenization:

In [None]:
# Load the tokenizer from the same checkpoint name that the classification model is loaded from below.
tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-conversational")

In [None]:
# Plain arrays of the inputs and targets; row i of `text` pairs with row i of `labels`.
text = utterances_df["tagged_utterance"].values
labels = labels_df["tag"].values

In [None]:
def print_rand_sentence():
    '''Displays the tokens and token IDs of a random text sample.

    Picks a random entry of the module-level `text` array, tokenizes it once
    with the module-level `tokenizer`, and prints a two-column table.
    '''
    index = random.randint(0, len(text)-1)
    # Tokenize once and reuse the result for both columns.
    tokens = tokenizer.tokenize(text[index])
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    table = np.array([tokens, token_ids]).T
    print(tabulate(table,
                   headers = ['Tokens', 'Token IDs'],
                   tablefmt = 'fancy_grid'))

print_rand_sentence()

In [None]:
def preprocessing(input_text, tokenizer, max_length = 32):
    '''
    Encodes one text into fixed-length model inputs.

    Every sample is padded AND truncated to exactly `max_length` tokens, so the
    per-sample tensors can be concatenated into one batch tensor. Without
    truncation an utterance longer than `max_length` would produce a longer
    tensor and break the concatenation.

    Args:
      - input_text: the text to encode.
      - tokenizer: object exposing `encode_plus` (e.g. a transformers tokenizer).
      - max_length: target sequence length, special tokens included (default 32).

    Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
      - input_ids: list of token ids
      - token_type_ids: list of token type ids
      - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
    '''
    return tokenizer.encode_plus(
                          input_text,
                          add_special_tokens = True,
                          max_length = max_length,
                          # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
                          padding = 'max_length',
                          truncation = True,
                          return_attention_mask = True,
                          return_tensors = 'pt'
                     )

In [None]:
# Encode every utterance; each encoding holds (1, seq_len) tensors.
encodings = [preprocessing(sample, tokenizer) for sample in text]

# Stack the per-sample tensors along the batch dimension.
token_id = torch.cat([enc['input_ids'] for enc in encodings], dim = 0)
attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim = 0)

# Targets as a tensor so they can be indexed and batched together with the inputs.
labels = torch.tensor(labels)

In [None]:
def print_rand_sentence_encoding():
    '''Displays tokens, token IDs and attention mask of a random text sample.

    Reads the module-level `text`, `tokenizer`, `token_id` and `attention_masks`.
    '''
    index = random.randint(0, len(text) - 1)
    token_ids = token_id[index].tolist()
    attention = attention_masks[index].tolist()
    # Map each id straight back to its token. Decoding to a string and
    # re-tokenizing is not guaranteed to give one token per id, which would
    # leave the three columns with different lengths.
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    table = np.array([tokens, token_ids, attention]).T
    print(tabulate(table, 
                   headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                   tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

In [None]:
# Fraction of the samples held out for validation.
val_ratio = 0.2

batch_size = 16

# Fixed seed so the train/validation split and the training shuffle order are
# the same on every Restart & Run All.
seed = 42

# Split on indices (not on the tensors) so the same index sets select ids,
# masks and labels consistently; stratify keeps the class proportions.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    random_state = seed,
    stratify = labels)

train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Training batches are drawn in random order from a seeded generator.
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set,
                                    generator = torch.Generator().manual_seed(seed)),
            batch_size = batch_size
        )

# Validation batches are read in order.
validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [None]:
def b_tp(preds, labels, positive = 1):
    '''True positives: samples predicted as `positive` whose label is `positive`.'''
    preds, labels = np.asarray(preds), np.asarray(labels)
    return int(np.sum((preds == positive) & (labels == positive)))

def b_fp(preds, labels, positive = 1):
    '''False positives: samples predicted as `positive` whose label is another class.'''
    preds, labels = np.asarray(preds), np.asarray(labels)
    return int(np.sum((preds == positive) & (labels != positive)))

def b_tn(preds, labels, positive = 1):
    '''True negatives: samples neither predicted nor labelled as `positive`.'''
    preds, labels = np.asarray(preds), np.asarray(labels)
    return int(np.sum((preds != positive) & (labels != positive)))

def b_fn(preds, labels, positive = 1):
    '''False negatives: samples labelled `positive` but predicted as another class.'''
    preds, labels = np.asarray(preds), np.asarray(labels)
    return int(np.sum((preds != positive) & (labels == positive)))

def b_metrics(preds, labels):
    '''
    Computes accuracy, precision, recall and specificity for a batch.

    Args:
      - preds: 2-D array of per-class scores, one row per sample; the predicted
        class of a row is its argmax.
      - labels: array of integer class ids, one per sample.

    Returns (accuracy, precision, recall, specificity):
      - accuracy is the fraction of samples whose predicted class equals the label,
        for any number of classes.
      - with at most two score columns, precision/recall/specificity are the usual
        binary metrics with class 1 as the positive class.
      - with more than two columns they are one-vs-rest values macro-averaged over
        the classes for which the metric is defined (non-zero denominator).
      - a metric that is undefined for every scored class is returned as the string 'nan'.
    '''
    scores = np.asarray(preds)
    n_classes = scores.shape[1]
    preds = np.argmax(scores, axis = 1).flatten()
    labels = np.asarray(labels).flatten()

    # Exact-match accuracy; counting only "both 0" / "both 1" would ignore
    # correct predictions of every other class.
    b_accuracy = int(np.sum(preds == labels)) / len(labels)

    # Binary input keeps the classic "class 1 is positive" definition.
    scored_classes = [1] if n_classes <= 2 else range(n_classes)

    precisions, recalls, specificities = [], [], []
    for cls in scored_classes:
        tp = b_tp(preds, labels, cls)
        fp = b_fp(preds, labels, cls)
        tn = b_tn(preds, labels, cls)
        fn = b_fn(preds, labels, cls)
        if (tp + fp) > 0:
            precisions.append(tp / (tp + fp))
        if (tp + fn) > 0:
            recalls.append(tp / (tp + fn))
        if (tn + fp) > 0:
            specificities.append(tn / (tn + fp))

    b_precision = sum(precisions) / len(precisions) if precisions else 'nan'
    b_recall = sum(recalls) / len(recalls) if recalls else 'nan'
    b_specificity = sum(specificities) / len(specificities) if specificities else 'nan'
    return b_accuracy, b_precision, b_recall, b_specificity

In [None]:
# Load the pretrained encoder with a fresh classification head sized to the
# tag mapping, so the head always matches the number of classes.
model = BertForSequenceClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased-conversational",
    num_labels = len(classes),
    output_attentions = False,
    output_hidden_states = False,
)

optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )


# Move the model to the device chosen by get_device(); an unconditional
# model.cuda() would fail on a machine without a GPU.
model.to(device)

In [None]:
epochs = 4

for _ in trange(epochs, desc = 'Epoch'):
    # ---------- Training pass ----------
    model.train()
    
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        
        # Passing labels makes the model return the loss alongside the logits.
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        
        train_output.loss.backward()
        optimizer.step()
        
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ---------- Validation pass ----------
    model.eval()

    # Collect every batch's logits and labels and score them once per epoch.
    # Averaging per-batch metrics overweights the shorter last batch and drops
    # batches in which precision or recall is undefined.
    all_logits = []
    all_label_ids = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            eval_output = model(b_input_ids, 
                                token_type_ids = None, 
                                attention_mask = b_input_mask)
        all_logits.append(eval_output.logits.detach().cpu().numpy())
        all_label_ids.append(b_labels.to('cpu').numpy())

    val_accuracy, val_precision, val_recall, val_specificity = b_metrics(
        np.concatenate(all_logits, axis = 0),
        np.concatenate(all_label_ids, axis = 0))

    # b_metrics reports an undefined metric with the string 'nan'.
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(val_accuracy))
    print('\t - Validation Precision: {:.4f}'.format(val_precision) if val_precision != 'nan' else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(val_recall) if val_recall != 'nan' else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(val_specificity) if val_specificity != 'nan' else '\t - Validation Specificity: NaN')

Final metrics are as follows:
- Train loss: 0.2083
- Validation Accuracy: 0.1228
- Validation Precision: 0.8235
- Validation Recall: 0.9062
- Validation Specificity: 0.8708

In [None]:
new_sentence = "Откуда ты?"

# Encode the sentence; the encoding already carries a leading batch dimension of 1.
encoding = preprocessing(new_sentence, tokenizer)
test_ids = encoding['input_ids']
test_attention_mask = encoding['attention_mask']

# Forward pass without gradient tracking.
with torch.no_grad():
    output = model(test_ids.to(device),
                   token_type_ids = None,
                   attention_mask = test_attention_mask.to(device))

# Index of the largest logit, as a plain Python int.
logits = output.logits.cpu().numpy()
prediction = np.argmax(logits).flatten().item()

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

In [None]:
# Show the tag-name -> class-id mapping so the predicted id printed above can be looked up.
classes

As we can see, the model predicts the correct label for this utterance. Let's test it on a whole dialog:

In [None]:
# Hand-labelled dialog: utterance i pairs with tag i.
dialog_utterances = [
    "Извините, это место занято?",
    "Нет, пожалуйста, не стесняйтесь.",
    "Большое спасибо.",
    "Вы работаете в Шанхае?",
    "да я делаю.",
    "А как насчет тебя?",
    "Нет, я турист.",
    "Это потрясающее место!",
    "Это намного больше, чем я себе представлял, и гораздо более захватывающе!",
    "Здесь так много интересного.",
    "Ты можешь сказать это еще раз!",
    "Это гораздо современнее, чем люди себе представляют.",
    "Откуда ты?",
    "Хм, что ж, давайте посмотрим...",
    "Я родом из Канзаса.",
    "Гораздо более тихое и умиротворенное место , чем здесь , это точно!",
    "Ага...",
]
dialog_tags = [
    "yes_no_question",
    "neg_answer",
    "thanking",
    "yes_no_question",
    "pos_answer",
    "open_question",
    "neg_answer",
    "appreciation",
    "statement",
    "statement",
    "back-channeling",
    "statement",
    "open_question",
    "back-channeling",
    "other_answers",
    "statement",
    "back-channeling",
]

# Same 17 x 2 array the frame is built from: one row per utterance, columns (text, tag).
utters = np.array([dialog_utterances, dialog_tags]).T
test_df = pd.DataFrame({"utterances": utters[:, 0], "tags": utters[:, 1]})

test_df

In [None]:
def get_preds(new_sentence: str):
    """Classify a single utterance with the module-level model.

    Args:
        new_sentence: the utterance text to classify.

    Returns:
        The index of the largest logit as a plain Python int.
    """
    # The encoding already carries a leading batch dimension of 1.
    encoded = preprocessing(new_sentence, tokenizer)
    input_ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        result = model(input_ids, token_type_ids = None, attention_mask = mask)

    scores = result.logits.cpu().numpy()
    return np.argmax(scores).flatten().item()

In [None]:
# Predict a class id for every utterance of the test dialog.
test_df["preds"] = test_df["utterances"].apply(get_preds)

In [None]:
# Display the dialog with the new "preds" column (integer class ids at this point).
test_df

In [None]:
# Reverse lookup: integer class id -> tag name.
inv_classes = dict(zip(classes.values(), classes.keys()))

inv_classes

In [None]:
# Translate the predicted class ids in the "preds" column back to tag names, in place,
# so they can be compared with the string values in "tags".
test_df.replace({"preds": inv_classes}, inplace=True)

test_df

In [None]:
# 1 where the predicted tag equals the hand-assigned tag, 0 otherwise.
tags_match = test_df["tags"] == test_df["preds"]
test_df["agreement"] = tags_match.astype("int64")

test_df

In [None]:
# Share of dialog utterances whose predicted tag matches the hand-assigned one, in percent.
agreement_pct = test_df["agreement"].sum() / len(test_df) * 100
print(f"Agreement % between actual and predicted tags: {agreement_pct}")

70.6% of the model's predictions match the actual labels. While this can be considered rather accurate, this metric can definitely be improved towards better classification.