# Slot Filling

In [38]:
#Mount Google drive
#from google.colab import drive
#drive.mount('/content/drive')

#change current working directory
#%cd "/content/drive/Shareddrives/NLI"

In [39]:
#!pip install transformers
#!pip install seqeval
#!pip install datasets
#!python3 -m spacy download en_core_web_lg

In [40]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig
from transformers import AlbertForSequenceClassification, AlbertTokenizer, AlbertConfig
from seqeval.metrics import classification_report
from datasets import load_dataset

import nltk
import matplotlib.pyplot as plt

from tqdm import tqdm
import re

nltk.download('punkt')

import spacy

nlp = spacy.load("en_core_web_lg")

[nltk_data] Downloading package punkt to /home/mister/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [41]:
DATASET_DIRECTORY = "./../dataset.hf/"
SAVE_PARAMETERS_TO = "./saved_parameters.json"
SAVE_MODEL_TO_PATH = "./saved_model/"

PRETRAINED_MODELS = {
    'bert': 'bert-large-uncased',
    'roberta': 'roberta-large',
    'xlnet': 'xlnet-large-cased',
    'xlm': 'xlm-mlm-en-2048',
    'distilbert': 'distilbert-base-uncased',
    'albert':'albert-base-v2'
}

MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig),
    'albert':(AlbertForSequenceClassification,AlbertTokenizer, AlbertConfig)
}

MODEL_TYPE = 'roberta'
PRETRAINED_MODEL_NAME = PRETRAINED_MODELS[MODEL_TYPE]

model_class, tokenizer_class, config_class = MODEL_CLASSES[MODEL_TYPE]


TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 1e-05

In [42]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cpu


In [43]:
# Run it only once
#from datasets import load_dataset
#dataset = load_dataset("multi_woz_v22")

# Run it only once
#dataset.save_to_disk(DATASET_DIRECTORY)

from datasets import DatasetDict
data = DatasetDict.load_from_disk(DATASET_DIRECTORY)

In [44]:
# Extracting only hotel and restaurant features from the dataset
def preprocess_split(dataset, split):
    df = dataset[split].to_pandas()
    new_df = pd.DataFrame(columns = df.columns)
    for i in range(len(df)):
        # Taken from notebook, to know which lines to skip
        row = df.loc[i]
        if not any(set(row.turns['frames'][turn_id]['service']).intersection(['hotel', 'restaurant']) for turn_id,utt in enumerate(row.turns['utterance'])):
            continue

        new_df.loc[len(new_df)] = row
    return new_df

In [45]:
def process_service_list(service_list):
    services = set()
    if len(service_list) == 0:
        services.add('other')
    for service in service_list:
        if service == 'restaurant':
            services.add('restaurant')
        elif service == 'hotel':
            services.add('hotel')
        else:
            services.add('other')
        if len(services) == 3:
            break
    return list(services)

In [46]:
train = preprocess_split(data, 'train')
val = preprocess_split(data, 'validation')
test = preprocess_split(data, 'test')

In [47]:
ignored_tags = []

In [48]:
def extract_token_bio_tags(dataset):
    utterances = [[]]
    count = 0

    for i in tqdm(range(len(dataset))):
        turns = dataset.loc[i].turns
        for utterance, speaker, dialogue_act, frames in zip(turns['utterance'], turns['speaker'], turns['dialogue_acts'], turns['frames']):
            if speaker != 0:
                continue
            if 'other' in process_service_list(frames['service']):
                continue

            span_info = dialogue_act['span_info']
            act_slot_names = span_info['act_slot_name']
            act_slot_values = span_info['act_slot_value']
            span_starts = span_info['span_start']
            span_ends = span_info['span_end']
            slots = {slot_name : {'start': start, 'end': end} for slot_name, start, end in zip(act_slot_names, span_starts, span_ends)}

            #Preprocess

            last_tag = 0
            utterances.append([])
            for slot_name in slots:
                slot_start, slot_end = slots[slot_name]['start'], slots[slot_name]['end']
                tokens = nlp(" ".join(utterance[slot_start:slot_end].lower().split()))
                #tokens = re.sub("[^a-zA-Z0-9]", " ", utterance[slot_start:slot_end]).lower().split()
                for word in nlp(" ".join(utterance[last_tag:slot_start].lower().split())):
                #for word in re.sub("[^a-zA-Z0-9]", " ", utterance[last_tag:slot_start]).lower().split():
                  utterances[count].append((str(word), "O"))

                for j, token in enumerate(tokens):
                    bio_type = 'B-' if j == 0 else 'I-'
                    if bio_type + slot_name not in ignored_tags:
                        utterances[count].append((str(token), bio_type + slot_name))
                last_tag = slot_end

            #for word in re.sub("[^a-zA-Z0-9]", " ", utterance[last_tag:]).lower().split():
            for word in nlp(" ".join(utterance[last_tag:].lower().split())):
                  utterances[count].append((str(word), "O"))
            count = count+1


    return utterances

In [49]:
# building X, y
train_word_tag = extract_token_bio_tags(train)
valid_word_tag = extract_token_bio_tags(val)
test_word_tag = extract_token_bio_tags(test)

print("\nSize train_sentencs: ", len(train_word_tag))
print("Size valid_sentences: ", len(valid_word_tag))
print("Size test_sentences: ", len(test_word_tag))



100%|██████████| 6321/6321 [07:08<00:00, 14.76it/s]
100%|██████████| 762/762 [00:56<00:00, 13.57it/s]
100%|██████████| 745/745 [00:50<00:00, 14.68it/s]


Size train_sentencs:  23542
Size valid_sentences:  2668
Size test_sentences:  2619





In [50]:
from collections import Counter
print("Number of utterances:", len(train_word_tag))
print("Frequency")
counter = sorted(Counter([tag for sentence in train_word_tag for _, tag in sentence]).items(), key=lambda x: x[1], reverse=True)
for tag, val in counter:
    print(f'{tag:20} {val}')

print("\nEach tag appears in number of documents")
docs = {}
for i, sentence in enumerate(train_word_tag):
    for _, tag in sentence:
        if tag not in docs:
            docs[(i, tag)] = 1
        else:
            docs[(i, tag)] += 1
counter = Counter([tag for i, tag in docs])
for tag in counter:
  print(f'{tag:20} {counter[tag]}')


Number of utterances: 23542
Frequency
O                    296492
B-area               4324
B-pricerange         4301
B-food               3805
B-bookpeople         3223
B-bookday            3208
B-bookstay           2306
I-name               2301
B-type               2150
B-stars              1841
B-name               1787
B-booktime           1561
I-area               826
I-food               649
I-pricerange         443
I-bookpeople         226
I-bookday            150
I-type               60
I-booktime           50
I-bookstay           2

Each tag appears in number of documents
O                    23532
B-area               4324
B-pricerange         4301
B-type               2150
B-bookstay           2306
B-bookpeople         3223
B-bookday            3208
I-pricerange         219
B-name               1787
I-name               1406
B-food               3805
B-booktime           1561
I-food               492
B-stars              1841
I-area               517
I-bookday            15

In [27]:
all_words = set([word for sentence in train_word_tag for word, _ in sentence])

max_seq_len = max([len(x) for x in train_word_tag])
print("max_seq_leng:", max_seq_len)

n_words = len(all_words)
print("Number of words:", n_words)
print("Words: ", all_words)

# Define the max_length for BERT and the tokenizer based on the longest sentence in the training set
max_length = 128

max_seq_leng: 42
Number of words: 2424
Words:  {'noontime', '%', 'traditional', 'property', 'able', 'pub', 'otherwise', '9:45', 'quite', '7:30', 'zmzlmlr9', 'ratings', 'interested', 'turn', 'wifi.can', 'almost', 'eats', '8:15', 'cuisine', 'fins', 'proceed', 'mid', 'apologize', '0', 'belgian', 'lodege', 'lankan', 'nothing', 'nah', 'satruday', 'rated', 'fails', 'star', 'trouble', 'o', 'confusing', '18th', 'walking', 'seriously', 'panahar', 'asap', '14:30', 'confusion', 'resaurant', 'w', 'glutton', 'error', 'switching', 'seating', 'aylesbray', '19:45', 'concerned', 'wednesday', 'don;t', 'vary', 'bar', 'rooms', 'their', 'restaurant.is', 'named,"the', 'yelp', 'decently', 'scratch', 'definitely', 'fi', 'appealing', 'kitchen', 'many', 'locations', 'ashely', 'surely', 'caribbean', 'children', 'join', 'specifcally', 'totally', 'you-', 'east', 'ranked', 'arms', 'problem', 'hours', 'ideal', 'locals', 'apologies', 'fun', '16:00', 'taste', 'teusday', 'birthday', 'eclectic', 'anytime', 'taking', 'pa

In [28]:
all_tags = set([tag for sentence in train_word_tag for _, tag in sentence])
num_tags = len(all_tags)
print("Number of tags:", num_tags)
print("Tags:", all_tags)

Number of tags: 20
Tags: {'B-bookpeople', 'B-name', 'B-bookstay', 'I-booktime', 'I-food', 'B-type', 'I-name', 'I-bookpeople', 'B-area', 'B-booktime', 'I-bookday', 'I-area', 'B-pricerange', 'B-bookday', 'I-bookstay', 'O', 'B-food', 'B-stars', 'I-type', 'I-pricerange'}


In [29]:
tag2index = {tag: i for i, tag in enumerate(all_tags)}
index2tag = {i: tag for tag, i in tag2index.items()}

In [30]:
print("tag2index:", tag2index)
print("index2tag:", index2tag)

tag2index: {'B-bookpeople': 0, 'B-name': 1, 'B-bookstay': 2, 'I-booktime': 3, 'I-food': 4, 'B-type': 5, 'I-name': 6, 'I-bookpeople': 7, 'B-area': 8, 'B-booktime': 9, 'I-bookday': 10, 'I-area': 11, 'B-pricerange': 12, 'B-bookday': 13, 'I-bookstay': 14, 'O': 15, 'B-food': 16, 'B-stars': 17, 'I-type': 18, 'I-pricerange': 19}
index2tag: {0: 'B-bookpeople', 1: 'B-name', 2: 'B-bookstay', 3: 'I-booktime', 4: 'I-food', 5: 'B-type', 6: 'I-name', 7: 'I-bookpeople', 8: 'B-area', 9: 'B-booktime', 10: 'I-bookday', 11: 'I-area', 12: 'B-pricerange', 13: 'B-bookday', 14: 'I-bookstay', 15: 'O', 16: 'B-food', 17: 'B-stars', 18: 'I-type', 19: 'I-pricerange'}


In [31]:
# Initialize Bert tokenizer
tokenizer = tokenizer_class.from_pretrained(PRETRAINED_MODEL_NAME, do_lower_case=True)

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [32]:
class SlotFillingDataset(Dataset):
  def __init__(self, X, y, tokenizer, max_len):
        self.len = len(X)
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels
        sentence = self.X[index]
        word_labels = self.y[index]

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             is_split_into_words=True,
                             return_offsets_mapping=True,
                             padding='max_length',
                             truncation=True,
                             max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [tag2index[label] for label in word_labels]
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

  def __len__(self):
        return self.len

In [33]:
X_train = [[word for word, _ in sentence] for sentence in train_word_tag]
y_train = [[tag for _, tag in sentence] for sentence in train_word_tag]

X_valid = [[word for word, _ in sentence] for sentence in valid_word_tag]
y_valid = [[tag for _, tag in sentence] for sentence in valid_word_tag]

X_test = [[word for word, _ in sentence] for sentence in test_word_tag]
y_test = [[tag for _, tag in sentence] for sentence in test_word_tag]

training_set = SlotFillingDataset(X_train, y_train, tokenizer, max_length)
validation_set = SlotFillingDataset(X_valid, y_valid, tokenizer, max_length)
testing_set = SlotFillingDataset(X_test, y_test, tokenizer, max_length)

In [37]:
index = 3
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[index]["input_ids"]), training_set[index]["labels"]):
  print(f'{token:10} {label:3} ')

[CLS]      -100 
yes         15 
.           15 
can         15 
you         15 
book        15 
it          15 
for         15 
me          15 
?           15 
[SEP]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD]      -100 
[PAD] 

In [26]:
training_loader = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=VALID_BATCH_SIZE, shuffle=False)
testing_loader = DataLoader(testing_set, batch_size=VALID_BATCH_SIZE, shuffle=False)

In [27]:
model = model_class.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=len(tag2index))
model.to(device)

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), 

In [28]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(2.9810, device='cuda:0', grad_fn=<NllLossBackward0>)

In [29]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 20])

In [30]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [31]:
def valid(model, data_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(data_loader):

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]
            eval_logits = outputs[1]

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            # if idx % 100==0:
            #     loss_step = eval_loss/nb_eval_steps
            #     print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)

            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(labels)
            eval_preds.extend(predictions)

            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [index2tag[id.item()] for id in eval_labels]
    predictions = [index2tag[id.item()] for id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps

    # print(f"Validation : loss {eval_loss:.2f} accuracy {eval_accuracy:.2f}")

    return labels, predictions, eval_loss, eval_accuracy

In [32]:
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    with tqdm(training_loader, 'batch') as tepoch:
        for idx, batch in enumerate(tepoch):
            tepoch.set_description(f'Epoch {epoch}')

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss, tr_logits = outputs[0], outputs[1]
            tr_loss += loss.item()

            nb_tr_steps += 1
            nb_tr_examples += labels.size(0)

            # if idx % 100==0:
            #     loss_step = tr_loss/nb_tr_steps
            #     print(f"Training loss per 100 training steps: {loss_step:.2f}")

            # compute training accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
            #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            tr_labels.extend(labels)
            tr_preds.extend(predictions)

            tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            tr_accuracy += tmp_tr_accuracy

            # gradient clipping
            torch.nn.utils.clip_grad_norm_(
                parameters=model.parameters(), max_norm=10
            )

            # backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            tepoch.set_postfix(loss=tr_loss / nb_tr_steps, accuracy=tr_accuracy / nb_tr_steps)

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps

    _, _ , val_loss, val_acc = valid(model, validation_loader)

    return epoch_loss, tr_accuracy, val_loss, val_acc


In [None]:
history = {'train_loss':[], 'train_accuracy':[], 'val_loss':[], 'val_accuracy':[], 'epoch':[]}
for epoch in range(EPOCHS):
    train_loss, train_acc, val_loss, val_acc = train(epoch)

    history['train_loss'].append(train_loss)
    history['train_accuracy'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_accuracy'].append(val_acc)
    history['epoch'].append(epoch+1)


Epoch 0: 100%|██████████| 2943/2943 [32:22<00:00,  1.52it/s, accuracy=0.973, loss=0.0957]
Epoch 1: 100%|██████████| 2943/2943 [32:28<00:00,  1.51it/s, accuracy=0.981, loss=0.0528]
Epoch 2:  39%|███▉      | 1152/2943 [12:41<19:46,  1.51it/s, accuracy=0.984, loss=0.0459]

In [None]:
labels, predictions, loss, accuracy = valid(model, testing_loader)
print(f"Validation : loss {loss:.2f} accuracy {accuracy:.2f}")

In [None]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

# for source, pred in zip(labels, predictions):
#   print(f'{source:20} {pred}')

In [None]:
model.save_pretrained('./model/task2_nlp')