# Non-Categorical Slot Filling

In [1]:
from google.colab import drive
# Mount Google Drive at /content/gdrive.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 15.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 47.7MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 43.5MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [3]:
import numpy as np
import torch
#import pulp
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, BertForQuestionAnswering, BertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [5]:
# Directory prefix (a relative path) for every data file opened in this notebook.
# NOTE(review): relative to the working directory, while the model save/load
# cells use an absolute /content/... path -- confirm the working directory.
path = "gdrive/MyDrive/COLX563_lab4/data/"

### Prepare training data

In [14]:
from collections import defaultdict
import re

# regex_aspect matches from the first hyphen through the last equals sign of a
# field, regex_answer from the first equals sign to the end; the surrounding
# "-" / "=" characters are stripped off after matching.
regex_aspect = r"-.*="
regex_answer = r"=.*"
# Slots treated as categorical; no training example is built for them here.
categorical = {"pricerange", "area", "bookday", "bookpeople", "parking", "internet", "stars", "type", "bookstay"}
utt_train, aspect_train, answer_train = [], [], []

with open(path+"WOZ_train_utt.txt") as u, open(path+"WOZ_train_ans.txt") as a:
    utterances = u.readlines()
    answers = a.readlines()

# One example per (utterance, non-categorical slot) pair, in file order.
for line_no, raw_utt in enumerate(utterances):
    text = raw_utt.strip()
    # The first "|"-separated field is dropped; every later field is parsed as a slot.
    fields = answers[line_no].strip().split("|")[1:]
    slot_names = [re.findall(regex_aspect, field)[0].strip("-=") for field in fields]
    slot_values = [re.findall(regex_answer, field)[0].strip("-=") for field in fields]
    for slot_name, slot_value in zip(slot_names, slot_values):
        if slot_name in categorical:
            continue
        utt_train.append(text)
        aspect_train.append(slot_name)
        answer_train.append(slot_value)

In [123]:
# Show the first five training utterances.
utt_train[:5]

['Hi there! Can you give me some info on Cityroomz?',
 'I am looking for a hotel named alyesbray lodge guest house.',
 'I am looking for a restaurant. I would like something cheap that has Chinese food.',
 'Yeah, could you recommend a good gastropub?',
 'I want to find an expensive restaurant and serves european food. Can i also have the address, phone number and its area. ?']

In [124]:
# Show the slot names paired with those first five utterances.
aspect_train[:5]

['name', 'name', 'food', 'food', 'food']

In [125]:
# Show the gold answer strings for those first five examples.
answer_train[:5]

['cityroomz',
 'alyesbray lodge guest house',
 'chinese',
 'gastropub',
 'european']

### Adapted from Lab 3

In [11]:
# Tokenizer loaded from the same checkpoint name as the QA model used below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad') ## for Kaggle

def convert_to_BERT_tensors(aspect, utterance):
    '''Encode parallel lists of aspects and utterances as sentence pairs.

    Each aspect is passed to the tokenizer as the first text and the utterance
    at the same position as the second text. Every pair is padded or truncated
    to 512 tokens, and the encoded batch is moved to ``device``.

    Returns a tuple ``(input_ids, attention_mask)``.
    '''
    encoding_options = dict(
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
        return_attention_mask=True,
    )
    encoded = tokenizer(text=aspect, text_pair=utterance, **encoding_options)
    encoded.to(device)
    return encoded["input_ids"], encoded["attention_mask"]

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [10]:
def get_answer_span_tensor(aspect, utterance, answer, max_length=512):
    '''Locate the answer inside the tokenized "[CLS] aspect [SEP] utterance [SEP]" input.

    Args:
        aspect: slot name used as the question.
        utterance: text in which the answer is searched.
        answer: gold answer string.
        max_length: number of positions in the encoded model input. A span is
            only reported when it lies entirely before the final position of
            that input, which is taken up by the closing [SEP].

    Returns:
        A tensor ``[start_idx, end_idx]`` (inclusive token positions) on
        ``device`` for the first occurrence of the answer tokens, or ``[0, 0]``
        when the answer is empty, absent, or does not fit within max_length.
    '''
    no_answer = torch.tensor([0, 0]).to(device)

    answer_tokens = tokenizer.tokenize(answer)
    if not answer_tokens:
        # An empty token list would "match" at position 0 and give end index -1.
        return no_answer

    input_tokens = tokenizer.tokenize('[CLS]' + aspect + '[SEP]' + utterance + '[SEP]')
    span_len = len(answer_tokens)
    # Index of the closing [SEP] in the (possibly truncated) input; the answer
    # has to end strictly before it, so valid positions are 0..max_length-2.
    closing_idx = min(len(input_tokens), max_length) - 1

    for start_idx in range(closing_idx - span_len + 1):
        if input_tokens[start_idx:start_idx + span_len] == answer_tokens:
            return torch.tensor([start_idx, start_idx + span_len - 1]).to(device)

    return no_answer

In [9]:
# Number of examples per batch for the DataLoaders built in this notebook.
batch_size = 16

class QAdataset(Dataset):
    '''Index-aligned container for QA inputs, target spans and padding masks.'''

    def __init__(self, input_data, output_data, mask):
        # The three sequences are stored as given and indexed in parallel.
        self.input_data, self.output_data, self.mask = input_data, output_data, mask

    def __len__(self):
        # The dataset size is defined by the number of inputs.
        return len(self.input_data)

    def __getitem__(self, index):
        # Items come back in (input, target, mask) order.
        return self.input_data[index], self.output_data[index], self.mask[index]

In [8]:
def prepare_QA_dataset(aspect, utterance, answer):
    '''Build a QAdataset from parallel lists of aspects, utterances and answers.

    The aspect/utterance pairs are encoded in one call, and one answer span is
    computed per entry of ``answer``.
    '''
    input_ids, attention_masks = convert_to_BERT_tensors(aspect, utterance)
    answer_spans = [
        get_answer_span_tensor(aspect[pos], utterance[pos], gold_answer)
        for pos, gold_answer in enumerate(answer)
    ]
    return QAdataset(input_ids, answer_spans, attention_masks)

In [None]:
%%time
# Encode the training examples; batches are served in file order (shuffle=False).
train_dataset = prepare_QA_dataset(aspect_train, utt_train, answer_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

CPU times: user 1.85 s, sys: 30.2 ms, total: 1.88 s
Wall time: 1.87 s


In [7]:
def select_best_answer_span(start_probs, end_probs, distance):
    '''Pick, per example, the best-scoring span whose end is at most `distance` after its start.

    Args:
        start_probs: per-example sequences of start scores, one score per position.
        end_probs: per-example sequences of end scores, parallel to start_probs.
        distance: maximum allowed value of ``end - start``; the end is never
            before the start.

    Returns:
        A list with one ``(start, end)`` tuple of inclusive positions per
        example, maximising ``start_score + end_score`` over all spans that
        satisfy the restriction. An entry is ``None`` when no span qualifies
        (negative distance or empty scores).
    '''
    output_spans = []
    for i, start in enumerate(start_probs):
        start = np.asarray(start)
        end = np.asarray(end_probs[i])
        best_span = None
        best_prob = -np.inf
        if distance >= 0:
            for start_id in range(len(start)):
                # Every end position allowed for this start; looking at the whole
                # window means a strong start is kept even when the globally
                # best end lies too far away.
                window = end[start_id:start_id + distance + 1]
                if window.size == 0:
                    continue
                offset = int(np.argmax(window))
                prob = start[start_id] + window[offset]
                if prob > best_prob:
                    best_prob = prob
                    best_span = (start_id, start_id + offset)
        output_spans.append(best_span)

    return output_spans

### Training DistilBERTQA Model

In [6]:
## Using a SQUAD fine-tuned DistilBertforQA
model = DistilBertForQuestionAnswering.from_pretrained(
    'distilbert-base-uncased-distilled-squad'
).to(device)

# Fine-tuning configuration: seed, number of passes over the data, loss and optimizer.
manual_seed = 123
torch.manual_seed(manual_seed)
epochs = 1
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-5)
print(model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=451.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=265481570.0, style=ProgressStyle(descri…


DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
           

In [None]:
%%time
for epoch in range(epochs):
    for input, span, mask in train_dataloader:
        model.zero_grad()

        #forward pass
        gold_start = torch.tensor([i[0].item() for i in span]).to(device)
        gold_end = torch.tensor([i[1].item() for i in span]).to(device)
        output = model(input, attention_mask=mask)
        start_logits = output.start_logits#.type(torch.LongTensor)
        end_logits = output.end_logits#.type(torch.LongTensor)
        loss = loss_function(start_logits, gold_start)
        loss += loss_function(end_logits, gold_end)
        loss.backward()
        optimizer.step()

CPU times: user 1h 12min 34s, sys: 1min 21s, total: 1h 13min 56s
Wall time: 1h 13min 42s


In [None]:
# Write the model to Google Drive so the evaluation cells can reload it.
model.save_pretrained('/content/gdrive/MyDrive/COLX563_lab4/model_noncat')

### Evaluate on Dev Set

In [12]:
# Load through the class instead of the in-memory `model` instance, so this
# cell also works in a fresh kernel where the training cells were not run.
model_noncat = DistilBertForQuestionAnswering.from_pretrained('/content/gdrive/MyDrive/COLX563_lab4/model_noncat')
model_noncat = model_noncat.to(device)
# Evaluation mode for inference; as the last expression this also displays the model.
model_noncat.eval()

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            

In [15]:
# Dev examples are built with the same patterns and categorical set as the training data.
utt_dev, aspect_dev, answer_dev = [], [], []

with open(path+"WOZ_dev_utt.txt") as u, open(path+"WOZ_dev_ans.txt") as a:
    utterances = u.readlines()
    answers = a.readlines()

# One example per (utterance, non-categorical slot) pair, in file order.
for line_no, raw_utt in enumerate(utterances):
    text = raw_utt.strip()
    # The first "|"-separated field is dropped; every later field is parsed as a slot.
    fields = answers[line_no].strip().split("|")[1:]
    slot_names = [re.findall(regex_aspect, field)[0].strip("-=") for field in fields]
    slot_values = [re.findall(regex_answer, field)[0].strip("-=") for field in fields]
    for slot_name, slot_value in zip(slot_names, slot_values):
        if slot_name in categorical:
            continue
        utt_dev.append(text)
        aspect_dev.append(slot_name)
        answer_dev.append(slot_value)

In [16]:
# Encode the dev examples; batches keep file order (shuffle=False).
dev_dataset = prepare_QA_dataset(aspect_dev, utt_dev, answer_dev)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

In [17]:
# Collect gold spans and argmax predictions batch by batch, then score start
# and end positions separately.
gold_spans = []
sys_starts = []
sys_ends = []
with torch.no_grad():
    # `input_ids` rather than `input`, so the builtin is not rebound for the rest of the notebook.
    for input_ids, span, mask in dev_dataloader:
        ## gold: each row of `span` is (start, end)
        gold_spans.append(span)

        ## sys: most likely start and end position for every example
        output = model_noncat(input_ids, attention_mask=mask)
        sys_starts.append(torch.argmax(output.start_logits, dim=1))
        sys_ends.append(torch.argmax(output.end_logits, dim=1))

# One concatenation per quantity instead of converting tensors element by element.
np_gold_spans = torch.cat(gold_spans).cpu().numpy()
np_sys_starts = torch.cat(sys_starts).cpu().numpy()
np_sys_ends = torch.cat(sys_ends).cpu().numpy()

start_accuracy = accuracy_score(np_gold_spans[:, 0], np_sys_starts)
end_accuracy = accuracy_score(np_gold_spans[:, 1], np_sys_ends)
print(f"start_accuracy: {start_accuracy}")
print(f"end_accuracy: {end_accuracy}")

start_accuracy: 0.9653465346534653
end_accuracy: 0.9504950495049505


### Predict on Test Set

In [90]:
# The slot-prediction file is parsed with a pattern that has no trailing "=".
# It gets its own name so the train/dev pattern `regex_aspect` is not
# overwritten, which would break a later re-run of the dev-set cell.
regex_pred_aspect = r"-.*"
categorical = {"pricerange", "area", "bookday", "bookpeople", "parking", "internet", "stars", "type", "bookstay"}
utt_test = []
aspect_test = []
idx2utt = {}    # line number -> utterance, only for lines with a non-categorical slot
utt2idx = {}    # utterance text -> line number (identical texts overwrite each other)
idx2slot = {}   # line number -> stripped slot line

with open(path+"WOZ_test_utt.txt") as u, open(path+"domain_aspect_pred.txt") as s:
    utterances = u.readlines()
    slots = s.readlines()
    for i, utt in enumerate(utterances):
        utt = utt.strip()
        # The first "|"-separated field is dropped; every later field is parsed as a slot.
        aspects = slots[i].strip().split("|")[1:]
        match_aspect = [re.findall(regex_pred_aspect, aspect)[0].strip("-=") for aspect in aspects]
        for aspect in match_aspect:
            if aspect not in categorical:
                utt_test.append(utt)
                aspect_test.append(aspect)
                idx2utt[i] = utt
                idx2slot[i] = slots[i].strip()
                utt2idx[utt] = i

# There are no gold answers for the test set; empty placeholders keep the
# dataset builder's three lists parallel.
answer_test = [""] * len(utt_test)

In [43]:
# Encode the test examples. answer_test only holds empty placeholders, and the
# prediction loop does not use the span entries of this dataset.
test_dataset = prepare_QA_dataset(aspect_test, utt_test, answer_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [44]:
# Per-batch start/end logits for the test set, kept as numpy arrays for span selection.
test_starts = []
test_ends = []
with torch.no_grad():
    # `input_ids` rather than `input`, so the builtin is not rebound; the span
    # entry of each batch is not needed here.
    for input_ids, _, mask in test_dataloader:
        ## sys
        output = model_noncat(input_ids, attention_mask=mask)
        test_starts.append(output.start_logits.detach().cpu().numpy())
        test_ends.append(output.end_logits.detach().cpu().numpy())

In [45]:
# Largest allowed gap between the start and end position of a predicted span.
distance = 20

# Choose one span per example, batch by batch.
test_output = []
for batch_start_scores, batch_end_scores in zip(test_starts, test_ends):
    test_output.extend(select_best_answer_span(batch_start_scores, batch_end_scores, distance))

# Turn each span back into text; the end position is inclusive, hence the +1.
pred_list = []
for example_idx in range(len(test_dataset)):
    span_start, span_end = test_output[example_idx]
    token_ids = test_dataset[example_idx][0]
    pred_list.append(tokenizer.decode(token_ids[span_start:span_end + 1]))

In [105]:
# Map a source line number to a predicted answer string. Line numbers are looked
# up through utt2idx, so examples resolving to the same line number overwrite
# one another and the last one wins.
idx2pred = {utt2idx[utt]: pred_list[i] for i, utt in enumerate(utt_test)}

### Create Prediction file

In [122]:
# Write the prediction file: lines with a non-categorical slot are rebuilt with
# the predicted value attached to their name/food slots, all other lines are
# copied through unchanged.
#
# Test examples (and therefore pred_list) were created in file order, one per
# non-categorical slot, so walking the slot file in the same order and taking
# the next prediction for each such slot gives every slot its own answer, even
# when a line has several of them or an utterance text occurs more than once.
pending = iter(zip(aspect_test, pred_list))
with open('noncat_lab4.txt', "w", newline="", encoding='utf-8') as f, open(path+"domain_aspect_pred.txt") as s:
    for line_no, line in enumerate(s):
        intent, *slots = line.strip().split("|")
        aspects = [re.findall(r"-.*", slot)[0].strip("-=") for slot in slots]

        if all(aspect in categorical for aspect in aspects):
            # Nothing was predicted for this line.
            f.write(line)
            continue

        fields = [intent]
        for slot, aspect in zip(slots, aspects):
            if aspect not in categorical:
                expected_aspect, pred = next(pending, (None, None))
                if expected_aspect != aspect:
                    raise ValueError(
                        f"line {line_no}: slot '{aspect}' does not line up with the next "
                        f"prediction (made for '{expected_aspect}')"
                    )
                # Only name/food slots are written with a value.
                if "name" in slot or "food" in slot:
                    slot = slot + "=" + pred
            fields.append(slot)
        f.write("|".join(fields) + "\n")