Modolo Davide

**NLU Project - Joint Intent Classification and Slot Filling Sentence Level**

In [12]:
# imports and PAD_TOKEN
PAD_TOKEN = 0
import os
import json

import torch
print(torch.cuda.is_available())
device = 'cuda:0' # cuda:0 means we are using the GPU with id 0, if you have multiple GPU
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # Used to report errors on CUDA side
import torch.utils.data as data

from torch.utils.data import DataLoader

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

True


In [13]:
# Lang class
def word2id(raw_dataset):
# returns a dictionary of words and their ids
    words = []
    for entry in raw_dataset:
       words.extend(entry['utterance'].split())
    words = list(set(words))
    words_dict = {'pad': PAD_TOKEN}
    words_dict.update({w:i+1 for i, w in enumerate(words)})
    words_dict['unk'] = len(words_dict)
    return words_dict

def slot2id(raw_dataset):
# returns a dictionary of slots and their ids
    slots = ['pad']
    for entry in raw_dataset:
       slots.extend(entry['slots'].split())
    slots = list(set(slots))
    slots_dict = {s:i for i, s in enumerate(slots)}
    return slots_dict

def intent2id(raw_dataset):
# returns a dictionary of intents and their ids
    intents = [entry['intent'] for entry in raw_dataset]
    intents = list(set(intents))
    intents_dict = {inte:i for i, inte in enumerate(intents)}
    return intents_dict

class Lang():
    def __init__(self, train_raw, dev_raw, test_raw):
        self.word2id = word2id(train_raw)
        self.slot2id = slot2id(train_raw + dev_raw + test_raw)
        self.intent2id = intent2id(train_raw + dev_raw + test_raw)
        self.id2word = {v:k for k, v in self.word2id.items()}
        self.id2slot = {v:k for k, v in self.slot2id.items()}
        self.id2intent = {v:k for k, v in self.intent2id.items()}
        # self.intent_list = list(set(list(self.intent2id.keys())))
        # self.slot_list = list(set(list(self.slot2id.keys())))

In [14]:
# datasets
def load_data(path):
    dataset = []
    with open(path) as f:
        dataset = json.loads(f.read())
    return dataset


From Lab10

In [15]:
class IntentsAndSlots (data.Dataset):
    # Mandatory methods are __init__, __len__ and __getitem__
    def __init__(self, dataset, lang, unk='unk'):
        self.utterances = []
        self.intents = []
        self.slots = []
        self.unk = unk
        
        for x in dataset:
            self.utterances.append(x['utterance'])
            self.slots.append(x['slots'])
            self.intents.append(x['intent'])

        self.utt_ids = self.mapping_seq(self.utterances, lang.word2id)
        self.slot_ids = self.mapping_seq(self.slots, lang.slot2id)
        self.intent_ids = self.mapping_lab(self.intents, lang.intent2id)

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, idx):
        utt = torch.Tensor(self.utt_ids[idx])
        slots = torch.Tensor(self.slot_ids[idx])
        intent = self.intent_ids[idx]
        sample = {'utterance': utt, 'slots': slots, 'intent': intent}
        return sample
    
    # Auxiliary methods
    
    def mapping_lab(self, data, mapper):
        return [mapper[x] if x in mapper else mapper[self.unk] for x in data]
    
    def mapping_seq(self, data, mapper): # Map sequences to number
        res = []
        for seq in data:
            tmp_seq = []
            for x in seq.split():
                if x in mapper:
                    tmp_seq.append(mapper[x])
                else:
                    tmp_seq.append(mapper[self.unk])
            res.append(tmp_seq)
        return res

def collate_fn(data):
    def merge(sequences):
        '''
        merge from batch * sent_len to batch * max_len 
        '''
        lengths = [len(seq) for seq in sequences]
        max_len = 1 if max(lengths)==0 else max(lengths)
        # Pad token is zero in our case
        # So we create a matrix full of PAD_TOKEN (i.e. 0) with the shape 
        # batch_size X maximum length of a sequence
        padded_seqs = torch.LongTensor(len(sequences),max_len).fill_(PAD_TOKEN)
        for i, seq in enumerate(sequences):
            end = lengths[i]
            padded_seqs[i, :end] = seq # We copy each sequence into the matrix
        # print(padded_seqs)
        padded_seqs = padded_seqs.detach()  # We remove these tensors from the computational graph
        return padded_seqs, lengths
    # Sort data by seq lengths
    data.sort(key=lambda x: len(x['utterance']), reverse=True) 
    new_item = {}
    for key in data[0].keys():
        new_item[key] = [d[key] for d in data]
    # We just need one length for packed pad seq, since len(utt) == len(slots)
    src_utt, _ = merge(new_item['utterance'])
    y_slots, y_lengths = merge(new_item["slots"])
    intent = torch.LongTensor(new_item["intent"])
    
    src_utt = src_utt.to(device) # We load the Tensor on our seleceted device
    y_slots = y_slots.to(device)
    intent = intent.to(device)
    y_lengths = torch.LongTensor(y_lengths).to(device)
    
    new_item["utterances"] = src_utt
    new_item["intents"] = intent
    new_item["y_slots"] = y_slots
    new_item["slots_len"] = y_lengths
    return new_item

In [16]:
# data loader
def prepare_data(dataset):
    train_raw = load_data(os.path.join('data', dataset, 'train.json'))
    test_raw = load_data(os.path.join('data', dataset, 'test.json'))
    dev_raw = load_data(os.path.join('data', dataset, 'valid.json'))
    
    lang = Lang(train_raw, dev_raw, test_raw)

    ##############################
    train_dataset = IntentsAndSlots(train_raw, lang)
    dev_dataset = IntentsAndSlots(dev_raw, lang)
    test_dataset = IntentsAndSlots(test_raw, lang)
    
    ##############################
    train_loader = DataLoader(train_dataset, batch_size=128, collate_fn=collate_fn,  shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=64, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=collate_fn)
    
    return train_loader, dev_loader, test_loader, lang

In [17]:
# baseline model to improve
class ModelIAS(nn.Module):

    def __init__(self, hid_size, out_slot, out_int, emb_size, vocab_len, n_layer=1, pad_index=0):
        super(ModelIAS, self).__init__()
        # hid_size = Hidden size
        # out_slot = number of slots (output size for slot filling)
        # out_int = number of intents (ouput size for intent class)
        # emb_size = word embedding size
        
        self.embedding = nn.Embedding(vocab_len, emb_size, padding_idx=pad_index)
        
        self.utt_encoder = nn.LSTM(emb_size, hid_size, n_layer, bidirectional=False)    
        self.slot_out = nn.Linear(hid_size, out_slot)
        self.intent_out = nn.Linear(hid_size, out_int)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, utterance, seq_lengths):
        # utterance.size() = batch_size X seq_len
        utt_emb = self.embedding(utterance) # utt_emb.size() = batch_size X seq_len X emb_size
        utt_emb = utt_emb.permute(1,0,2) # we need seq len first -> seq_len X batch_size X emb_size
        
        # pack_padded_sequence avoid computation over pad tokens reducing the computational cost
        
        packed_input = pack_padded_sequence(utt_emb, seq_lengths.cpu().numpy())
        # Process the batch
        packed_output, (last_hidden, cell) = self.utt_encoder(packed_input) 
        # Unpack the sequence
        utt_encoded, input_sizes = pad_packed_sequence(packed_output)
        # Get the last hidden state
        last_hidden = last_hidden[-1,:,:]
        # Compute slot logits
        slots = self.slot_out(utt_encoded)
        # Compute intent logits
        intent = self.intent_out(last_hidden)
        
        # Slot size: seq_len, batch size, classes 
        slots = slots.permute(1,2,0) # We need this for computing the loss
        # Slot size: batch_size, classes, seq_len
        return slots, intent


In [18]:
def init_weights(mat):
    for m in mat.modules():
        if type(m) in [nn.GRU, nn.LSTM, nn.RNN]:
            for name, param in m.named_parameters():
                if 'weight_ih' in name:
                    for idx in range(4):
                        mul = param.shape[0]//4
                        torch.nn.init.xavier_uniform_(param[idx*mul:(idx+1)*mul])
                elif 'weight_hh' in name:
                    for idx in range(4):
                        mul = param.shape[0]//4
                        torch.nn.init.orthogonal_(param[idx*mul:(idx+1)*mul])
                elif 'bias' in name:
                    param.data.fill_(0)
        else:
            if type(m) in [nn.Linear]:
                torch.nn.init.uniform_(m.weight, -0.01, 0.01)
                if m.bias != None:
                    m.bias.data.fill_(0.01)

In [19]:
from conll import evaluate
from sklearn.metrics import classification_report
import torch.nn.functional as F

def train_loop(data, optimizer, criterion_slots, criterion_intents, model):
    model.train()
    loss_array = []
    for sample in data:
        optimizer.zero_grad() # Zeroing the gradient
        slots, intent = model(sample['utterances'], sample['slots_len'])
        loss_intent = criterion_intents(intent, sample['intents'])
        loss_slot = criterion_slots(slots, sample['y_slots'])
        loss = loss_intent + loss_slot # In joint training we sum the losses. 
                                       # Is there another way to do that?
        # weights, _ = torch.sort(F.softmax(torch.randn(2), dim=-1)) 
        # loss = max(loss_intent, loss_slot) * max(weights[0], weights[1]) + min(loss_intent, loss_slot) * min(weights[0], weights[1])
        loss_array.append(loss.item())
        loss.backward() # Compute the gradient, deleting the computational graph
        # clip the gradient to avoid explosioning gradients
        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  
        optimizer.step() # Update the weights
    return loss_array

def eval_loop(data, criterion_slots, criterion_intents, model, lang):
    model.eval()
    loss_array = []
    
    ref_intents = []
    hyp_intents = []
    
    ref_slots = []
    hyp_slots = []
    #softmax = nn.Softmax(dim=1) # Use Softmax if you need the actual probability
    with torch.no_grad(): # It used to avoid the creation of computational graph
        for sample in data:
            slots, intents = model(sample['utterances'], sample['slots_len'])
            loss_intent = criterion_intents(intents, sample['intents'])
            loss_slot = criterion_slots(slots, sample['y_slots'])
            loss = loss_intent + loss_slot 
            loss_array.append(loss.item())
            # Intent inference
            # Get the highest probable class
            out_intents = [lang.id2intent[x] 
                           for x in torch.argmax(intents, dim=1).tolist()] 
            gt_intents = [lang.id2intent[x] for x in sample['intents'].tolist()]
            ref_intents.extend(gt_intents)
            hyp_intents.extend(out_intents)
            
            # Slot inference 
            output_slots = torch.argmax(slots, dim=1)
            for id_seq, seq in enumerate(output_slots):
                length = sample['slots_len'].tolist()[id_seq]
                utt_ids = sample['utterance'][id_seq][:length].tolist()
                gt_ids = sample['y_slots'][id_seq].tolist()
                gt_slots = [lang.id2slot[elem] for elem in gt_ids[:length]]
                utterance = [lang.id2word[elem] for elem in utt_ids]
                to_decode = seq[:length].tolist()
                ref_slots.append([(utterance[id_el], elem) for id_el, elem in enumerate(gt_slots)])
                tmp_seq = []
                for id_el, elem in enumerate(to_decode):
                    tmp_seq.append((utterance[id_el], lang.id2slot[elem]))
                hyp_slots.append(tmp_seq)
    try:            
        results = evaluate(ref_slots, hyp_slots)
    except Exception as ex:
        # Sometimes the model predics a class that is not in REF
        print(ex)
        ref_s = set([x[1] for x in ref_slots])
        hyp_s = set([x[1] for x in hyp_slots])
        print(hyp_s.difference(ref_s))
        
    report_intent = classification_report(ref_intents, hyp_intents, 
                                          zero_division=False, output_dict=True)
    return results, report_intent, loss_array


In [20]:
# Seq2Seq architecture
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
import torch.nn.functional as F
class Seq2Seq(nn.Module):
    def __init__(self, hid_size, out_slot, out_int, emb_size, vocab_len, n_layer=1, pad_index=0,  dropout=0.1):
        super(Seq2Seq, self).__init__()

        self.hidden = None

        self.embedding = nn.Embedding(vocab_len, emb_size, padding_idx=pad_index)
        self.bidirectionality = True
        # self.gru = nn.GRU(emb_size, hid_size, n_layer, bidirectional=self.bidirectionality)   
        self.lstm = nn.LSTM(emb_size, hid_size, n_layer, bidirectional=self.bidirectionality)
        slot_input_size = hid_size * 2 if self.bidirectionality else hid_size 
        self.slot_out = nn.Linear(slot_input_size, out_slot)
        self.intent_out = nn.Linear(hid_size, out_int)
        self.dropout = 0.1
        
    def forward(self, utterance, seq_lengths):
        utt_emb = self.embedding(utterance) # utt_emb.size() = batch_size X seq_len X emb_size

        utt_emb = F.dropout(utt_emb, self.dropout)
        utt_emb = utt_emb.permute(1,0,2) # we need seq len first -> seq_len X batch_size X emb_size
        # Process the batch

        packed_input = pack_padded_sequence(utt_emb, seq_lengths.cpu().numpy())

        packed_output, (self.hidden, _) = self.lstm(packed_input) 
        
        packed_output, input_sizes = pad_packed_sequence(packed_output)

        packed_output = F.dropout(packed_output, self.dropout)
        self.hidden = F.dropout(self.hidden, self.dropout)

        last_hidden = self.hidden[-1,:,:]

        # Compute slot logits
        slots = self.slot_out(packed_output)
        # Compute intent logits
        intent = self.intent_out(last_hidden)
        
        # Slot size: seq_len, batch size, calsses 
        slots = slots.permute(1,2,0) # We need this for computing the loss
        # Slot size: batch_size, classes, seq_len
        return slots, intent

In [21]:
from tqdm import tqdm
import numpy as np
import torch.optim as optim
def do_model(dataset="ATIS", model="Seq2Seq"):
    train_loader, dev_loader, test_loader, lang = prepare_data(dataset)
    hid_size = 200
    emb_size = 300

    lr = 0.0001 # learning rate
    clip = 5 # Clip the gradient

    out_slot = len(lang.slot2id)
    out_int = len(lang.intent2id)
    vocab_len = len(lang.word2id)
    if model == 'ModelIAS':
        model = ModelIAS(hid_size, out_slot, out_int, emb_size, vocab_len, pad_index=PAD_TOKEN).to(device)
    elif model == 'Seq2Seq':
        model = Seq2Seq(hid_size, out_slot, out_int, emb_size, vocab_len, pad_index=PAD_TOKEN).to(device)
    model.apply(init_weights)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion_slots = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
    criterion_intents = nn.CrossEntropyLoss() # Because we do not have the pad token

    n_epochs = 20
    patience = 5

    losses_train = []
    losses_dev = []
    sampled_epochs = []
    best_f1 = 0

    for x in tqdm(range(1,n_epochs)):
        loss = train_loop(train_loader, optimizer, criterion_slots, 
                        criterion_intents, model)
        if x % 5 == 0:
            sampled_epochs.append(x)
            losses_train.append(np.asarray(loss).mean())
            results_dev, intent_res, loss_dev = eval_loop(dev_loader, criterion_slots, 
                                                        criterion_intents, model, lang)
            losses_dev.append(np.asarray(loss_dev).mean())
            f1 = results_dev['total']['f']
            
            if f1 > best_f1:
                best_f1 = f1
            else:
                patience -= 1
            if patience <= 0: # Early stopping with patience
                break # Not nice but it keeps the code clean

    results_test, intent_test, _ = eval_loop(test_loader, criterion_slots, 
                                            criterion_intents, model, lang)

    print('Slot F1: ', results_test['total']['f'])
    print('Intent Accuracy:', intent_test['accuracy'])
    return results_test['total']['f'], intent_test['accuracy']

In [22]:
trials = 5
results = []
dataset = "ATIS"
for x in range(trials):
    results.append(do_model(dataset))

print (f"#################### {dataset} ####################")
print("Average Slot F1: ", np.asarray(results).mean(axis=0)[0])
print("Average Intent Accuracy: ", np.asarray(results).mean(axis=0)[1])
print(results)

OUT_INT 26


  0%|          | 0/19 [00:00<?, ?it/s]

128
tensor([[ 0.0502, -0.0164,  0.0162,  ...,  0.0121,  0.0141, -0.0057],
        [ 0.0250, -0.0062, -0.0029,  ...,  0.0070, -0.0023,  0.0204],
        [ 0.0259,  0.0205,  0.0529,  ..., -0.0140,  0.0168,  0.0111],
        ...,
        [ 0.0378,  0.0400, -0.0034,  ...,  0.0307,  0.0279,  0.0049],
        [ 0.0076,  0.0426,  0.0164,  ...,  0.0311, -0.0050,  0.0132],
        [-0.0201,  0.0040,  0.0208,  ...,  0.0178, -0.0121,  0.0074]],
       device='cuda:0', grad_fn=<AddmmBackward0>) 
 tensor([ 4,  4,  6,  4,  6,  4,  4,  4,  4,  4,  4,  4,  4,  4, 10, 14,  4,  4,
         6,  4, 23,  4,  4,  6,  4,  4,  4,  4,  4,  6,  4,  6,  4,  6,  4,  4,
         4,  6,  4,  4,  4,  4,  4,  4, 17,  4,  4,  4,  4, 13,  4, 23,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  6,  4,  4,  6, 13, 23, 10,  6,  4,  4,
         4,  4,  4,  4,  4,  4,  4, 13,  4,  4,  4,  4,  4, 10,  4,  4, 10,  4,
        13,  4,  4,  4,  4,  4, 13,  4,  4,  4,  6,  4,  4,  4,  4,  4,  4, 23,
         4,  4, 15,  4,  4, 16

  5%|▌         | 1/19 [00:00<00:17,  1.00it/s]

128
tensor([[-0.0026, -0.0526, -0.0290,  ...,  0.0101,  0.0170,  0.0116],
        [-0.0388, -0.0513, -0.0754,  ..., -0.0088, -0.0648, -0.0356],
        [-0.0475, -0.0410, -0.0675,  ...,  0.0269, -0.0956, -0.0911],
        ...,
        [-0.0084,  0.0003,  0.0034,  ...,  0.0202,  0.0089, -0.0055],
        [-0.0172, -0.0114,  0.0091,  ...,  0.0176, -0.0126, -0.0099],
        [ 0.0100,  0.0050,  0.0473,  ..., -0.0031,  0.0090,  0.0376]],
       device='cuda:0', grad_fn=<AddmmBackward0>) 
 tensor([ 4,  4,  4,  4,  4,  6,  4,  4,  4,  4,  4,  4,  4,  4,  6,  4,  4, 10,
         4, 23,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4, 23,  4,  4,  4,  4,  4,
         4,  4,  4,  4, 13,  4,  4,  4,  7,  4,  4,  6,  6,  4, 13,  4,  4,  4,
         4,  6, 10, 13,  4,  6,  4,  4,  4,  4,  4, 13,  4,  4,  4,  4,  4,  4,
         4,  4, 13,  4, 14,  4,  4,  6,  4,  4,  4, 13,  6,  4,  6,  4,  4,  4,
        13,  4,  4,  4,  4,  4, 13,  4,  4,  6,  4,  4, 23,  4,  4, 23,  4,  4,
         4, 13, 13, 11,  4,  4

 11%|█         | 2/19 [00:01<00:16,  1.02it/s]

tensor([[-0.3628, -0.3768, -0.4197, -0.4069,  0.5715, -0.3576,  0.1792, -0.2632,
         -0.3746, -0.3554, -0.2706, -0.3018, -0.3747, -0.0599, -0.3455, -0.1986,
         -0.3252, -0.2377, -0.3682, -0.3563, -0.3509, -0.4250, -0.3974, -0.1469,
         -0.4116, -0.3310],
        [-0.3927, -0.4303, -0.4289, -0.4440,  0.5883, -0.3783,  0.2523, -0.2881,
         -0.4153, -0.3856, -0.3003, -0.2993, -0.4318, -0.0366, -0.3608, -0.1282,
         -0.3567, -0.2517, -0.3822, -0.4089, -0.3572, -0.5033, -0.4208, -0.1335,
         -0.4292, -0.3756],
        [-0.4511, -0.4582, -0.4991, -0.4824,  0.6853, -0.4411,  0.3496, -0.3189,
         -0.4655, -0.4334, -0.2962, -0.3746, -0.4719,  0.0385, -0.4349,  0.0162,
         -0.4520, -0.2666, -0.4329, -0.4695, -0.4028, -0.5303, -0.4748, -0.1066,
         -0.4985, -0.5073],
        [-0.0757, -0.0973, -0.0719, -0.0985,  0.1276, -0.0710,  0.0052, -0.0371,
         -0.0553, -0.0454, -0.0498, -0.0806, -0.0973,  0.0143, -0.0250, -0.0013,
         -0.0830, -0.0641

 16%|█▌        | 3/19 [00:02<00:15,  1.01it/s]

128
tensor([[-1.6284, -1.6928, -1.6739,  ..., -0.8429, -1.6790, -1.5732],
        [-1.4931, -1.6179, -1.6251,  ..., -0.7710, -1.5576, -1.5133],
        [-1.5289, -1.6038, -1.5841,  ..., -0.7888, -1.6060, -1.5316],
        ...,
        [-0.5845, -0.6009, -0.6830,  ..., -0.2568, -0.6649, -0.6586],
        [-0.2040, -0.2238, -0.2028,  ..., -0.0864, -0.2170, -0.2007],
        [-0.4805, -0.5054, -0.4678,  ..., -0.2067, -0.4899, -0.4930]],
       device='cuda:0', grad_fn=<AddmmBackward0>) 
 tensor([ 4,  4,  4,  4, 10,  6,  4,  4,  6,  4,  4,  4,  4,  4,  4,  6,  4,  4,
         4,  4,  4,  4,  4, 13,  6, 10,  4,  4,  6,  4,  4, 13,  6, 17, 10,  4,
         4,  4,  4,  6,  4, 23,  4,  4,  4,  6,  4,  4,  4,  4,  4, 13,  4,  4,
         6,  4,  4,  4,  4,  4,  6,  4,  4,  4,  4,  7,  4,  6,  4,  4, 13,  4,
         4,  4,  4,  4, 17,  4,  4,  4,  4,  4,  4,  6,  4,  6,  4,  4,  4,  4,
         4,  4,  4,  4, 15,  4,  4, 13,  4,  4,  4,  4,  4, 13,  4,  4,  4,  4,
         4,  4,  6, 10,  4,  4

 21%|██        | 4/19 [00:04<00:15,  1.02s/it]

128
tensor([[-2.4922, -2.5873, -2.5994,  ..., -0.9379, -2.5941, -2.5583],
        [-2.5831, -2.6384, -2.6401,  ..., -0.9858, -2.6414, -2.5615],
        [-2.4514, -2.5991, -2.5529,  ..., -0.8625, -2.4795, -2.5104],
        ...,
        [-1.6427, -1.7276, -1.6962,  ..., -0.5640, -1.6543, -1.6342],
        [-1.6189, -1.6613, -1.6308,  ..., -0.5131, -1.6211, -1.6046],
        [-1.9146, -1.9668, -1.9348,  ..., -0.7116, -1.9083, -1.9018]],
       device='cuda:0', grad_fn=<AddmmBackward0>) 
 tensor([ 9,  4,  6,  4,  4,  4,  4, 10,  4,  4,  6,  4,  4,  4,  4,  4,  4,  4,
         6,  4, 23,  4,  3,  4,  4,  6,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  6,  4,  4,  4,  6,  4, 13,  4,  4,  4,  4,  7,  4, 17,  4,
        13,  4,  4,  6,  4,  4, 13,  4,  4,  4,  4,  4, 17, 20,  4,  4,  4,  4,
         4,  6,  6,  4,  6,  4,  6,  4,  4,  4,  4,  4,  6,  4,  4,  4,  4,  4,
         4,  4,  4,  7,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4, 13,  4,  4,
         4,  4,  4,  4, 15, 13

 26%|██▋       | 5/19 [00:05<00:15,  1.07s/it]

64
64
64
64
64
64
64
64
64
21
128
tensor([[-2.9130, -2.9457, -3.0147,  ..., -0.5873, -2.9337, -2.9570],
        [-2.9606, -3.0527, -3.0565,  ..., -0.6578, -3.0512, -3.0321],
        [-2.8765, -2.8794, -2.9586,  ..., -0.6131, -2.9083, -2.8766],
        ...,
        [-0.4010, -0.3660, -0.3755,  ..., -0.0339, -0.3767, -0.3850],
        [-0.4297, -0.4083, -0.3781,  ..., -0.0475, -0.4040, -0.4236],
        [ 0.0335,  0.0196,  0.0631,  ...,  0.0104,  0.0306,  0.0490]],
       device='cuda:0', grad_fn=<AddmmBackward0>) 
 tensor([ 4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  6,
         4,  4,  4,  6,  4,  4,  4,  4,  4,  4,  6,  4,  4,  4, 13,  4,  4,  4,
         4,  4,  0,  4,  4,  4,  4,  4,  4, 17,  4,  6,  4,  4,  4,  4,  4,  4,
         4,  6,  4,  6,  4, 17,  4,  4,  4, 20,  4,  8,  4,  4,  4,  6,  4,  4,
        13,  4,  4,  4, 14,  4,  4,  4,  4,  4,  6,  4,  4,  4,  4, 23,  4,  4,
         4,  4, 23,  4,  4,  4,  4, 25,  4, 23,  4,  4,  4, 13, 13,  4,  4,  4,


 26%|██▋       | 5/19 [00:06<00:17,  1.23s/it]


KeyboardInterrupt: 

In [None]:
def train_loop(data, optimizer, criterion_slots, criterion_intents, model):
    model.train()
    loss_array = []
    for sample in data:
        optimizer.zero_grad() # Zeroing the gradient
        
        slots, intent = model(sample['utterances'], sample['slots_len'])
        loss_intent = criterion_intents(intent, sample['intents'])
        loss_slot = criterion_slots(slots, sample['y_slots'])
        loss = loss_intent + loss_slot # In joint training we sum the losses. 
                                       # Is there another way to do that?
        # weights, _ = torch.sort(F.softmax(torch.randn(2), dim=-1)) 
        # loss = max(loss_intent, loss_slot) * max(weights[0], weights[1]) + min(loss_intent, loss_slot) * min(weights[0], weights[1])
        loss_array.append(loss.item())
        loss.backward() # Compute the gradient, deleting the computational graph
        # clip the gradient to avoid explosioning gradients
        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  
        optimizer.step() # Update the weights
    return loss_array

Using pre-trained model BERT with TensorFlow