**!!! Edit -- Notebook settings -- GPU !!!**

# MultiWOZ dataset import

In [1]:
!wget 'https://www.repository.cam.ac.uk/bitstream/handle/1810/294507/MULTIWOZ2.1.zip'

--2020-10-20 08:58:47--  https://www.repository.cam.ac.uk/bitstream/handle/1810/294507/MULTIWOZ2.1.zip
Resolving www.repository.cam.ac.uk (www.repository.cam.ac.uk)... 131.111.98.67
Connecting to www.repository.cam.ac.uk (www.repository.cam.ac.uk)|131.111.98.67|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13794372 (13M) [application/zip]
Saving to: ‘MULTIWOZ2.1.zip’


2020-10-20 08:58:50 (4.56 MB/s) - ‘MULTIWOZ2.1.zip’ saved [13794372/13794372]



In [2]:
!unzip 'MULTIWOZ2.1.zip'

Archive:  MULTIWOZ2.1.zip
   creating: MULTIWOZ2.1/
  inflating: MULTIWOZ2.1/train_db.json  
   creating: __MACOSX/
   creating: __MACOSX/MULTIWOZ2.1/
  inflating: __MACOSX/MULTIWOZ2.1/._train_db.json  
  inflating: MULTIWOZ2.1/testListFile.json  
  inflating: __MACOSX/MULTIWOZ2.1/._testListFile.json  
  inflating: MULTIWOZ2.1/.DS_Store   
  inflating: __MACOSX/MULTIWOZ2.1/._.DS_Store  
  inflating: MULTIWOZ2.1/police_db.json  
  inflating: __MACOSX/MULTIWOZ2.1/._police_db.json  
  inflating: MULTIWOZ2.1/ontology.json  
  inflating: __MACOSX/MULTIWOZ2.1/._ontology.json  
  inflating: MULTIWOZ2.1/dialogue_acts.json  
  inflating: __MACOSX/MULTIWOZ2.1/._dialogue_acts.json  
  inflating: MULTIWOZ2.1/data.json   
  inflating: __MACOSX/MULTIWOZ2.1/._data.json  
  inflating: MULTIWOZ2.1/taxi_db.json  
  inflating: __MACOSX/MULTIWOZ2.1/._taxi_db.json  
  inflating: MULTIWOZ2.1/README.json  
  inflating: __MACOSX/MULTIWOZ2.1/._README.json  
  inflating: MULTIWOZ2.1/restaurant_db.json  
  infla

In [3]:
# reading downloaded json files and extracting the relevant data into pandas DataFrame

import os
import json
import re
import pandas as pd
import random

def preload_data():
    # from MultiWOZ-Parser; reading the names of the files for training, testing, validation datasets
    # https://github.com/jojonki/MultiWOZ-Parser/blob/master/parser.py

    def load_json(data_file):
        if os.path.isfile(data_file):
            with open(data_file, 'r') as read_file:
                data = json.load(read_file)
                return data

    def load_list_file(list_file):
        with open(list_file, 'r') as read_file:
            dialog_id_list = read_file.readlines()
            dialog_id_list = [l.strip('\n') for l in dialog_id_list]
            return dialog_id_list
        return
    
    # extracts the utterances from the MultiWOZ dataset
    def get_utterances(data):
        utterances = []

        for block in data:
            data = block['log']

            for ut in data:
                # replacing whitespace characters with spaces
                text = re.sub("\\s", " ", ut['text'])
                text = re.sub("[^a-zA-Z0-9 ]+", "", ut['text'])

                utterances.append(text)

        return utterances
    
    def split_data(data):
        X = []
        Y = []
        for i in range(len(data)):
            tokens = data[i].split()

            if (i <= len(data)/2) and (len(tokens) > 4):
                # picking random point for splitting the conversation turn
                l = random.randrange(1, len(tokens) - 3)
                # splitting data
                X.append(' '.join(tokens[:l]))
                # adding 0 to the target list -> 0 -- interrupted turn 
                Y.append('interrupted')

            # second section of the dataset is made out of full utterances
            else:
                # adding the full uninterrupted conversation turn
                X.append(data[i])
                # adding 1 to the target list -> 1 -- uninterrupted turn 
                Y.append('finished')

        # shuffling the dataset
        c = list(zip(X, Y, data))
        random.shuffle(c)
        X, Y, data = zip(*c)

        return X,Y,data

    # extracting data
    dialog_data_file = './MULTIWOZ2.1/data.json'
    dialog_data = load_json(dialog_data_file)
    dialog_id_list = list(set(dialog_data.keys()))

    valid_list_file = './MULTIWOZ2.1/valListFile.json'
    test_list_file = './MULTIWOZ2.1/testListFile.json'

    valid_id_list = list(set(load_list_file(valid_list_file)))
    test_id_list = load_list_file(test_list_file)
    train_id_list = [did for did in dialog_id_list if did not in (valid_id_list + test_id_list)]

    train_data = [v for k, v in dialog_data.items() if k in train_id_list]
    valid_data = [v for k, v in dialog_data.items() if k in valid_id_list]
    test_data = [v for k, v in dialog_data.items() if k in test_id_list]
    
    # merging all datasets together
    data = train_data + valid_data + test_data
    utterances = get_utterances(data)
    
    X, Y, data_clean = split_data(utterances)
    
    return pd.DataFrame(data={'text': X, 'intent': Y}).sample(20000)

In [4]:
# getting the dataset as dataframe
df = preload_data()

In [5]:
# making the 'intent' coulmn categorical
df['intent'] = pd.Categorical(df['intent'])
# adding a new column with category codes instead of strings
df['codes'] = df.intent.cat.codes

In [6]:
df.head()

Unnamed: 0,text,intent,codes
34131,Maybe how much does it cost,finished,0
90379,I would like an expensive restaurant,finished,0
22098,Okay I recommend the Acorn Guest,interrupted,1
112656,Im confused you said your arriving and leaving...,finished,0
72583,Howdy Im looking for a moderately priced place...,finished,0


In [7]:
# transformes (BERT models) installation
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |▎                               | 10kB 22.1MB/s eta 0:00:01[K     |▋                               | 20kB 6.4MB/s eta 0:00:01[K     |█                               | 30kB 7.8MB/s eta 0:00:01[K     |█▎                              | 40kB 8.5MB/s eta 0:00:01[K     |█▌                              | 51kB 7.3MB/s eta 0:00:01[K     |█▉                              | 61kB 8.2MB/s eta 0:00:01[K     |██▏                             | 71kB 8.5MB/s eta 0:00:01[K     |██▌                             | 81kB 8.8MB/s eta 0:00:01[K     |██▉                             | 92kB 9.0MB/s eta 0:00:01[K     |███                             | 102kB 9.6MB/s eta 0:00:01[K     |███▍                            | 112kB 9.6MB/s eta 0:00:01[K     |███▊                            | 122kB 9.6M

In [8]:
import numpy as np
import time
from sklearn import metrics

from transformers import BertTokenizer, BertModel, BertForSequenceClassification

import tensorflow as tf
import torch
from torch.utils.data import Dataset, DataLoader

# Tokenizer building

Selection based on the Hugging Face models: https://huggingface.co/models

In [9]:
# smallest version of BERT
MODEL_TYPE='google/bert_uncased_L-2_H-128_A-2'

# standard version of BERT
# MODEL_TYPE='bert-base-uncased'

In [10]:
# maximal length (in tokens) of a sentence
MAX_LEN = 128
# training batch size
TRAIN_BATCH_SIZE = 32
# valiadtion batch size
VALID_BATCH_SIZE = TRAIN_BATCH_SIZE * 2
# number of training epochs
EPOCHS = 10
# learning rate
LEARNING_RATE = 2e-5

# tokenizer initialization
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [11]:
# the class handles data extraction
class CustomDataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
    # tokenizer to tokenize the sentences
    self.tokenizer = tokenizer
    # dataset as dataframe
    self.data = dataframe
    # sentences
    self.text = dataframe.text
    # intent targets
    self.targets = self.data.codes
    # max length for tokenizer
    self.max_len = max_len

  # returns the length of the dataset
  def __len__(self):
    return len(self.text)

  # returns information about a sentence indexed by the index parameter
  def __getitem__(self, index):
    text = str(self.text[index])
    text = " ".join(text.split())

    # tokenization of the sentence
    inputs = self.tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=self.max_len,
        truncation=True,
        padding='max_length',
        return_token_type_ids=True
    )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    token_type_ids = inputs["token_type_ids"]

    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(mask, dtype=torch.long),
        'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        'targets': torch.tensor(self.targets[index], dtype=torch.float)
    }

In [12]:
# ratio of the training set and validation set
train_size = 0.8
train_dataset = df.sample(frac=train_size)

# make testing dataset by dropping the "training rows"
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
# make training dataset
train_dataset = train_dataset.reset_index(drop=True)

# prints the sizes of the datasets
print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# make classes that handle the data
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (20000, 3)
TRAIN Dataset: (16000, 3)
TEST Dataset: (4000, 3)


In [13]:
# training parameters
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0}

# validation parameters
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0}

# torch loaders for data handling
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [14]:
# set device to GPU
device = 'cuda'

In [15]:
# load the pretrained BERT classifier for sequences -> attentions need to be outputted for later visualizations
model = BertForSequenceClassification.from_pretrained(MODEL_TYPE, output_attentions=True, num_labels=len(np.unique(np.array(df.intent))))
# making use of the GPU
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=382.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=17743809.0, style=ProgressStyle(descrip…




Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, element

In [16]:
# loss function defition
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets.long())

In [17]:
# optimizer for training
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [18]:
# training epoch definition
def train(epoch):
  model.train()

  # for counting the average accuracy on epoch data
  accuracy = 0
  # for counting the average loss on epoch data
  loss_cnt = 0

  for _, data in enumerate(training_loader, 0):
    # extracting batch data
    ids = data['ids'].to(device, dtype = torch.long)
    mask = data['mask'].to(device, dtype = torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.float)

    # extracting predicted logits
    outputs = model(ids, mask, token_type_ids)[0]

    # selecting the predicted sentiment from the logits
    _outputs = [np.argmax(i) for i in outputs.cpu().detach().numpy()]
    # formatting the batch targets = true sentiment
    _targets = np.array(targets.cpu().detach().numpy().tolist())

    # adding batch accuracy
    accuracy += metrics.accuracy_score(_targets, _outputs)

    optimizer.zero_grad()
    # calculating the batch loss
    loss = loss_fn(outputs, targets)
    # adding batch loss
    loss_cnt += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # print epoch results
  print(f'Epoch: {epoch + 1}, Loss: {loss_cnt/len(training_loader)}, Accuracy: {accuracy/len(training_loader)}')

In [19]:
# training loop
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 1, Loss: 0.5285614928603173, Accuracy: 0.7524375
Epoch: 2, Loss: 0.40983015298843384, Accuracy: 0.819
Epoch: 3, Loss: 0.3672740350663662, Accuracy: 0.8420625
Epoch: 4, Loss: 0.33792868642508983, Accuracy: 0.8558125
Epoch: 5, Loss: 0.31984183594584464, Accuracy: 0.8670625
Epoch: 6, Loss: 0.2995242453664541, Accuracy: 0.874
Epoch: 7, Loss: 0.2888459212630987, Accuracy: 0.88025
Epoch: 8, Loss: 0.2790718381404877, Accuracy: 0.8840625
Epoch: 9, Loss: 0.2661203009486198, Accuracy: 0.89225
Epoch: 10, Loss: 0.25876606957614423, Accuracy: 0.8940625


In [20]:
def validation(epoch):
  model.eval()
  
  # for batch accuracy calculation
  accuracy = 0

  with torch.no_grad():
    for _, data in enumerate(testing_loader, 0):
      # extracting batch data
      ids = data['ids'].to(device, dtype = torch.long)
      mask = data['mask'].to(device, dtype = torch.long)
      token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
      targets = data['targets'].to(device, dtype = torch.float)

      # extracting predicted logits
      outputs = model(ids, mask, token_type_ids)[0]
      
      # selecting the predicted sentiment from the logits
      _outputs = [np.argmax(i) for i in outputs.cpu().detach().numpy()]
      # formatting the batch targets = true sentiment
      _targets = np.array(targets.cpu().detach().numpy().tolist())
      
      # adding batch accuracy
      accuracy += metrics.accuracy_score(_targets, _outputs)

  return accuracy/len(testing_loader)

In [21]:
# for counting the overall average validation accuracy
accuracy = 0

# validation loop
for epoch in range(EPOCHS):
    accuracy += validation(epoch)

# print validation accuracy
print(f'Validation accuracy: {accuracy/EPOCHS}')

Validation accuracy: 0.8979910714285714


# Attention visualization

In [22]:
# download the bertviz module
import sys

!rm -r bertviz_repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path: sys.path += ['bertviz_repo']

rm: cannot remove 'bertviz_repo': No such file or directory
Cloning into 'bertviz_repo'...
remote: Enumerating objects: 1074, done.[K
remote: Total 1074 (delta 0), reused 0 (delta 0), pack-reused 1074[K
Receiving objects: 100% (1074/1074), 99.41 MiB | 22.64 MiB/s, done.
Resolving deltas: 100% (687/687), done.


In [23]:
# function for attention visualization in the cell
def call_html():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

In [24]:
# import bertviz
from bertviz import head_view

In [25]:
# tokenizes selected sentence
def process_sentence(sentence):
  # tokenize sentence
  inputs = tokenizer.encode_plus(
            sentence,
            None,
            add_special_tokens=True,
            return_token_type_ids=True
        )

  # extract data from tokenization
  ids = torch.tensor([inputs['input_ids']], dtype=torch.long)
  mask = torch.tensor([inputs['attention_mask']], dtype=torch.long)
  token_type_ids = torch.tensor([inputs['token_type_ids']], dtype=torch.long)

  # calculate BERT output
  output = model(ids, mask, token_type_ids)

  return output, ids

In [26]:
# the visualization might not appear during the first exection, but it will with the following cell execution
# this handles the error message
first_run = True

In [39]:
# ####################################################################################
#   THE FOLLOWING CODE SELECTS RANDOM SENTIMENT AND RANDOM SENTENCE FROM THE DATASET 
# ####################################################################################


# model needs to run on CPU to visualize
model = model.to('cpu')

# array of all available intents in the dataset
intent_categ = np.unique(np.array(df.intent))
# selecting random intent for visualization
intent_index = np.random.randint(0, len(intent_categ))

# extracting the senetences with the selected intent
text_extract = df[df.intent == intent_categ[intent_index]].text
# selecting random sentence index from the extracted set
text_index = np.random.randint(0, len(text_extract))

# selecting the random sentence
sentence = text_extract.iloc[text_index]

# processing the sentence (tokenization mainly)
output, ids = process_sentence(sentence)

# selecting the result of the model (predicted intent)
result = np.argmax(np.array(output[0].detach().numpy())[0])

# print results (intent frequency - occurencies in the dataset)
print('Selected intent:', intent_categ[intent_index], ', intent frequency:', len(text_extract),'/',len(df))
print('Predicted intent:', np.array(intent_categ)[result])
print()
print('Selected sentence:', sentence)
print()

# ------------------------------------------------------------
# VISUALIZATION SECTION

# saving the attention output
attention = output[-1]

input_id_list = ids[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
call_html()

# visualization module
head_view(attention, tokens)

if first_run:
  print('Reload the cell if the visualization didn\'t appear.')
  first_run = False

Selected intent: finished , intent frequency: 10402 / 20000
Predicted intent: finished

Selected sentence: There are number of moderately priced options and one that falls in the cheap price range Do you have a price range in mind



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>