In [1]:
# imports and PAD_TOKEN
PAD_TOKEN = 0
import os
import json

import torch
print(torch.cuda.is_available())
# device = 'cuda:0' # cuda:0 means we are using the GPU with id 0, if you have multiple GPU
device = 'cpu'
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # Used to report errors on CUDA side
import torch.utils.data as data

from torch.utils.data import DataLoader

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# import bert
from transformers import BertTokenizer, BertModel, BertForMaskedLM

True


In [2]:
# datasets
def word2id(raw_dataset):
# returns a dictionary of words and their ids
    words = []
    for entry in raw_dataset:
       words.extend(entry['utterance'].split())
    words = list(set(words))
    words_dict = {'[PAD]': PAD_TOKEN}
    words_dict.update({w:i+1 for i, w in enumerate(words)})
    words_dict['[UNK]'] = len(words_dict)
    return words_dict

def slot2id(raw_dataset):
# returns a dictionary of slots and their ids
    slots = ['[PAD]']
    for entry in raw_dataset:
       slots.extend(entry['slots'].split())
    slots = list(set(slots))
    slots_dict = {s:i for i, s in enumerate(slots)}
    return slots_dict

def intent2id(raw_dataset):
# returns a dictionary of intents and their ids
    intents = [entry['intent'] for entry in raw_dataset]
    intents = list(set(intents))
    intents_dict = {inte:i for i, inte in enumerate(intents)}
    return intents_dict
def get_vocab(raw_dataset):
    vocab = set()
    for entry in raw_dataset:
        vocab = vocab.union(set(entry['utterance'].split()))
    return ['[PAD]'] + list(vocab) + ['[UNK]']


class Lang():
    def __init__(self, train_raw, dev_raw, test_raw):
        self.word2id = word2id(train_raw)
        self.slot2id = slot2id(train_raw + dev_raw + test_raw)
        self.intent2id = intent2id(train_raw + dev_raw + test_raw)
        self.vocab = get_vocab(train_raw + dev_raw + test_raw)
        self.id2word = {v:k for k, v in self.word2id.items()}
        self.id2slot = {v:k for k, v in self.slot2id.items()}
        self.id2intent = {v:k for k, v in self.intent2id.items()}
        # self.intent_list = list(set(list(self.intent2id.keys())))
        # self.slot_list = list(set(list(self.slot2id.keys())))
def load_data(path):
    dataset = []
    with open(path) as f:
        dataset = json.loads(f.read())
    return dataset

In [3]:
class IntentsAndSlots (data.Dataset):
    # Mandatory methods are __init__, __len__ and __getitem__
    def __init__(self, dataset, lang, unk='[UNK]'):
        self.utterances = []
        self.intents = []
        self.slots = []
        self.unk = unk
        
        for x in dataset:
            self.utterances.append(x['utterance'])
            self.slots.append(x['slots'])
            self.intents.append(x['intent'])

        self.utt_ids = self.mapping_seq(self.utterances, lang.word2id)
        self.slot_ids = self.mapping_seq(self.slots, lang.slot2id)
        self.intent_ids = self.mapping_lab(self.intents, lang.intent2id)

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, idx):
        utt = torch.Tensor(self.utt_ids[idx])
        slots = torch.Tensor(self.slot_ids[idx])
        intent = self.intent_ids[idx]
        sample = {'utterance': utt, 'slots': slots, 'intent': intent}
        return sample
    
    # Auxiliary methods
    
    def mapping_lab(self, data, mapper):
        return [mapper[x] if x in mapper else mapper[self.unk] for x in data]
    
    def mapping_seq(self, data, mapper): # Map sequences to number
        res = []
        for seq in data:
            tmp_seq = []
            for x in seq.split():
                if x in mapper:
                    tmp_seq.append(mapper[x])
                else:
                    tmp_seq.append(mapper[self.unk])
            res.append(tmp_seq)
        return res

def collate_fn(data):
    def merge(sequences):
        '''
        merge from batch * sent_len to batch * max_len 
        '''
        lengths = [len(seq) for seq in sequences]
        max_len = 1 if max(lengths)==0 else max(lengths)
        # Pad token is zero in our case
        # So we create a matrix full of PAD_TOKEN (i.e. 0) with the shape 
        # batch_size X maximum length of a sequence
        padded_seqs = torch.LongTensor(len(sequences),max_len).fill_(PAD_TOKEN)
        for i, seq in enumerate(sequences):
            end = lengths[i]
            padded_seqs[i, :end] = seq # We copy each sequence into the matrix
        # print(padded_seqs)
        padded_seqs = padded_seqs.detach()  # We remove these tensors from the computational graph
        return padded_seqs, lengths
    # Sort data by seq lengths
    data.sort(key=lambda x: len(x['utterance']), reverse=True) 
    new_item = {}
    for key in data[0].keys():
        new_item[key] = [d[key] for d in data]
    # We just need one length for packed pad seq, since len(utt) == len(slots)
    src_utt, _ = merge(new_item['utterance'])
    y_slots, y_lengths = merge(new_item["slots"])
    intent = torch.LongTensor(new_item["intent"])
    
    src_utt = src_utt.to(device) # We load the Tensor on our seleceted device
    y_slots = y_slots.to(device)
    intent = intent.to(device)
    y_lengths = torch.LongTensor(y_lengths).to(device)
    
    new_item["utterances"] = src_utt
    new_item["intents"] = intent
    new_item["y_slots"] = y_slots
    new_item["slots_len"] = y_lengths
    return new_item

In [4]:
def prepare_data(dataset):
    train_raw = load_data(os.path.join('data', dataset, 'train.json'))
    test_raw = load_data(os.path.join('data', dataset, 'test.json'))
    dev_raw = load_data(os.path.join('data', dataset, 'valid.json'))
    
    lang = Lang(train_raw, dev_raw, test_raw)

    ##############################
    train_dataset = IntentsAndSlots(train_raw, lang)
    dev_dataset = IntentsAndSlots(dev_raw, lang)
    test_dataset = IntentsAndSlots(test_raw, lang)
    
    ##############################
    train_loader = DataLoader(train_dataset, batch_size=128, collate_fn=collate_fn,  shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=64, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=collate_fn)
    
    return train_loader, dev_loader, test_loader, lang

In [5]:
dataset = "SNIPS"
train_loader, dev_loader, test_loader, lang = prepare_data(dataset)
hid_size = 200
emb_size = 300

lr = 0.0001 # learning rate
clip = 5 # Clip the gradient

out_slot = len(lang.slot2id)
out_int = len(lang.intent2id)
print("OUT_INT", out_int)
vocab_len = len(lang.word2id)

train_raw = load_data(os.path.join('data', dataset, 'train.json'))
test_raw = load_data(os.path.join('data', dataset, 'test.json'))
dev_raw = load_data(os.path.join('data', dataset, 'valid.json'))

OUT_INT 7


In [6]:
class JISBERT(nn.Module):
    def __init__ (self, out_int, out_slot, lang):
        super(JISBERT, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer.add_tokens(lang.vocab)
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert.resize_token_embeddings(len(self.tokenizer))
        self.bert_drop = nn.Dropout(0.4)
        self.intent_classifier = nn.Linear(768, out_int)
        self.slot_classifier = nn.Linear(768, out_slot)
        
    def forward(self, input, lang):
        # get back the input sentence using lang
        utterance = []
        for element in input:
            utterance.append(' '.join(lang.id2word[i] for i in element if i > 0))

        tokenized = self.token.encode_plus(utterance, return_tensors='pt', add_special_tokens=True,padding=True, truncation=True)
        bert_out = self.bert(**tokenized)

        intent = bert_out.pooler_output
        slots = bert_out.last_hidden_state[:,:input.size(1),:]
        
        intent = self.intent_classifier(intent)
        slots = self.slot_classifier(slots).permute(0, 2, 1)

        return intent, slots

In [7]:
train_utterances = []
train_slots = []
train_intents = []
for element in train_raw:
    train_utterances.append(element['utterance'])
    train_slots.append(element['slots'])
    train_intents.append(element['intent'])
max_len_utt = max([len(x) for x in train_utterances])

valid_utterances = []
valid_slots = []
valid_intents = []
for element in dev_raw:
    valid_utterances.append(element['utterance'])
    valid_slots.append(element['slots'])
    valid_intents.append(element['intent'])
max_len_valid = max([len(x) for x in valid_utterances])

test_utterances = []
test_slots = []
test_intents = []
for element in test_raw:
    test_utterances.append(element['utterance'])
    test_slots.append(element['slots'])
    test_intents.append(element['intent'])
max_len_test = max([len(x) for x in test_utterances])




In [8]:
from conll import evaluate
from sklearn.metrics import classification_report
import torch.nn.functional as F

def train_loop(data, optimizer, criterion_slots, criterion_intents, model):
    model.train()
    loss_array = []
    for sample in data:
        optimizer.zero_grad() # Zeroing the gradient        
        utterance = sample['utterances']
        slots, intent = model(utterance)
        loss_slot = criterion_slots(slots, sample['y_slots'])
        loss_intent = criterion_intents(intent, sample['intents'])
        
        loss = loss_intent + loss_slot # In joint training we sum the losses. 
                                       # Is there another way to do that?
        # weights, _ = torch.sort(F.softmax(torch.randn(2), dim=-1)) 
        # loss = max(loss_intent, loss_slot) * max(weights[0], weights[1]) + min(loss_intent, loss_slot) * min(weights[0], weights[1])
        loss_array.append(loss.item())
        loss.backward() # Compute the gradient, deleting the computational graph
        # clip the gradient to avoid explosioning gradients
        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  
        optimizer.step() # Update the weights
    return loss_array

def eval_loop(data, criterion_slots, criterion_intents, model, lang):
    model.eval()
    loss_array = []
    
    ref_intents = []
    hyp_intents = []
    
    ref_slots = []
    hyp_slots = []
    #softmax = nn.Softmax(dim=1) # Use Softmax if you need the actual probability
    with torch.no_grad(): # It used to avoid the creation of computational graph
        for sample in data:
            slots, intents = model(sample['utterances'], sample['slots_len'])
            loss_slot = criterion_slots(slots, sample['y_slots'])
            loss_intent = criterion_intents(intents, sample['intents'])
            loss = loss_intent + loss_slot 
            loss_array.append(loss.item())
            # Intent inference
            # Get the highest probable class
            out_intents = [lang.id2intent[x] 
                           for x in torch.argmax(intents, dim=1).tolist()] 
            gt_intents = [lang.id2intent[x] for x in sample['intents'].tolist()]
            ref_intents.extend(gt_intents)
            hyp_intents.extend(out_intents)
            
            # Slot inference 
            output_slots = torch.argmax(slots, dim=1)
            for id_seq, seq in enumerate(output_slots):
                length = sample['slots_len'].tolist()[id_seq]
                utt_ids = sample['utterance'][id_seq][:length].tolist()
                gt_ids = sample['y_slots'][id_seq].tolist()
                gt_slots = [lang.id2slot[elem] for elem in gt_ids[:length]]
                utterance = [lang.id2word[elem] for elem in utt_ids]
                to_decode = seq[:length].tolist()
                ref_slots.append([(utterance[id_el], elem) for id_el, elem in enumerate(gt_slots)])
                tmp_seq = []
                for id_el, elem in enumerate(to_decode):
                    tmp_seq.append((utterance[id_el], lang.id2slot[elem]))
                hyp_slots.append(tmp_seq)
    try:            
        results = evaluate(ref_slots, hyp_slots)
    except Exception as ex:
        # Sometimes the model predics a class that is not in REF
        print(ex)
        ref_s = set([x[1] for x in ref_slots])
        hyp_s = set([x[1] for x in hyp_slots])
        print(hyp_s.difference(ref_s))
        
    report_intent = classification_report(ref_intents, hyp_intents, 
                                          zero_division=False, output_dict=True)
    return results, report_intent, loss_array

In [9]:
import numpy as np
def encode_dataset(tokenizer, text_sequences, max_length):
    token_ids = np.zeros(shape=(len(text_sequences), max_length),
                         dtype=np.int32)
    for i, text_sequence in enumerate(text_sequences):
        encoded = tokenizer.encode(text_sequence)
        token_ids[i, 0:len(encoded)] = encoded
    attention_masks = (token_ids != 0).astype(np.int32)
    
    return {'input_ids': token_ids, 'attention_masks': attention_masks}

In [11]:
import torch.optim as optim

def training_loop(data, optimizer, criterion_slots, criterion_intents, model):
    model.train()
    loss_array = []
    for sample in data:
        optimizer.zero_grad() # Zeroing the gradient
        slots, intent = model(sample['utterances'], lang)
        loss_intent = criterion_intents(intent, sample['intents'])
        loss_slot = criterion_slots(slots, sample['y_slots'])
        loss = loss_intent + loss_slot # In joint training we sum the losses. 
                                       # Is there another way to do that?
        loss_array.append(loss.item())
        loss.backward() # Compute the gradient, deleting the computational graph
        # clip the gradient to avoid explosioning gradients
        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  
        optimizer.step() # Update the weights
    return loss_array

model = JISBERT(out_int, out_slot, lang)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_train = encode_dataset(tokenizer, train_utterances, max_len_utt)
# get intents in trains mapping from lang.intent2id
# train_intents = [lang.intent2id[x] for x in train_intents]
# get slots in trains mapping from lang.slot2id
# train_slots = [[lang.slot2id[x] for x in y] for y in train_slots]

optimizer = optim.Adam(model.parameters(), lr=lr)
criterion_slots = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
criterion_intents = nn.CrossEntropyLoss()

loss = training_loop(train_loader, optimizer, criterion_slots, criterion_intents, model)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TypeError: JISBERT.forward() missing 1 required positional argument: 'lang'

In [None]:
import torch.optim as optim
import numpy as np
from tqdm import tqdm
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion_slots = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
criterion_intents = nn.CrossEntropyLoss() # Because we do not have the pad token

n_epochs = 20
patience = 5

losses_train = []
losses_dev = []
sampled_epochs = []
best_f1 = 0

for x in tqdm(range(1,n_epochs)):
    loss = train_loop(train_raw, optimizer, criterion_slots, 
                    criterion_intents, model)
    if x % 5 == 0:
        sampled_epochs.append(x)
        losses_train.append(np.asarray(loss).mean())
        results_dev, intent_res, loss_dev = eval_loop(dev_loader, criterion_slots, 
                                                    criterion_intents, model, lang)
        losses_dev.append(np.asarray(loss_dev).mean())
        f1 = results_dev['total']['f']
        
        if f1 > best_f1:
            best_f1 = f1
        else:
            patience -= 1
        if patience <= 0: # Early stopping with patience
            break # Not nice but it keeps the code clean

results_test, intent_test, _ = eval_loop(test_loader, criterion_slots, 
                                        criterion_intents, model, lang)

print('Slot F1: ', results_test['total']['f'])
print('Intent Accuracy:', intent_test['accuracy'])

In [None]:
slots, intent = model('I am going to kill myself')

In [None]:
print(lang.intent2id)