In [1]:
import json
from pprint import pprint
# Global variables
import os
device = 'cuda:0' # cuda:0 means we are using the GPU with id 0, if you have multiple GPU
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # Used to report errors on CUDA side
PAD_TOKEN = 0
from collections import Counter

def load_data(path):
    '''
        input: path/to/data
        output: json
    '''
    dataset = []
    with open(path) as f:
        dataset = json.loads(f.read())
    return dataset

tmp_train_raw = load_data(os.path.join('dataset/ATIS/train.json'))
test_raw = load_data(os.path.join('dataset/ATIS/test.json'))
print('Train samples:', len(tmp_train_raw))
print('Test samples:', len(test_raw))

pprint(test_raw[0])

Train samples: 4978
Test samples: 893
{'intent': 'flight',
 'slots': 'O O O O O O O O B-fromloc.city_name O B-toloc.city_name '
          'I-toloc.city_name O O O O O B-stoploc.city_name I-stoploc.city_name',
 'utterance': 'i would like to find a flight from charlotte to las vegas that '
              'makes a stop in st. louis'}


In [2]:
import random
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter

# Firt we get the 10% of dataset, then we compute the percentage of these examples
# on the training set which is around 11%
portion = round(((len(tmp_train_raw) + len(test_raw)) * 0.10)/(len(tmp_train_raw)),2)


intents = [x['intent'] for x in tmp_train_raw] # We stratify on intents
count_y = Counter(intents)

Y = []
X = []
mini_Train = []

for id_y, y in enumerate(intents):
    if count_y[y] > 1: # If some intents occure once only, we put them in training
        X.append(tmp_train_raw[id_y])
        Y.append(y)
    else:
        mini_Train.append(tmp_train_raw[id_y])
# Random Stratify
X_train, X_dev, y_train, y_dev = train_test_split(X, Y, test_size=portion,
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=Y)
X_train.extend(mini_Train)
train_raw = X_train
dev_raw = X_dev

y_test = [x['intent'] for x in test_raw]

# Intent distribution
print('Train:')
pprint({k:round(v/len(y_train),3)*100 for k, v in sorted(Counter(y_train).items())})
print('Dev:'),
pprint({k:round(v/len(y_dev),3)*100 for k, v in sorted(Counter(y_dev).items())})
print('Test:')
pprint({k:round(v/len(y_test),3)*100 for k, v in sorted(Counter(y_test).items())})
print('='*89)
# Dataset size
print('TRAIN size:', len(train_raw))
print('DEV size:', len(dev_raw))
print('TEST size:', len(test_raw))


Train:
{'abbreviation': 2.9000000000000004,
 'aircraft': 1.6,
 'airfare': 8.5,
 'airline': 3.2,
 'airline+flight_no': 0.0,
 'airport': 0.4,
 'capacity': 0.3,
 'city': 0.4,
 'distance': 0.4,
 'flight': 73.7,
 'flight+airfare': 0.4,
 'flight_no': 0.3,
 'flight_time': 1.0999999999999999,
 'ground_fare': 0.4,
 'ground_service': 5.1,
 'meal': 0.1,
 'quantity': 1.0,
 'restriction': 0.1}
Dev:
{'abbreviation': 3.0,
 'aircraft': 1.7000000000000002,
 'airfare': 8.5,
 'airline': 3.2,
 'airport': 0.3,
 'capacity': 0.3,
 'city': 0.3,
 'distance': 0.3,
 'flight': 73.7,
 'flight+airfare': 0.5,
 'flight_no': 0.2,
 'flight_time': 1.0,
 'ground_fare': 0.3,
 'ground_service': 5.2,
 'meal': 0.2,
 'quantity': 1.0,
 'restriction': 0.2}
Test:
{'abbreviation': 3.6999999999999997,
 'aircraft': 1.0,
 'airfare': 5.4,
 'airfare+flight': 0.1,
 'airline': 4.3,
 'airport': 2.0,
 'capacity': 2.4,
 'city': 0.7000000000000001,
 'day_name': 0.2,
 'distance': 1.0999999999999999,
 'flight': 70.8,
 'flight+airfare': 1.3,
 

In [3]:
from collections import Counter
class Lang():
    def __init__(self, words, intents, slots, cutoff=0):
        self.word2id = self.w2id(words, cutoff=cutoff, unk=True)
        self.slot2id = self.lab2id(slots)
        self.intent2id = self.lab2id(intents, pad=False)
        self.id2word = {v:k for k, v in self.word2id.items()}
        self.id2slot = {v:k for k, v in self.slot2id.items()}
        self.id2intent = {v:k for k, v in self.intent2id.items()}

    def w2id(self, elements, cutoff=None, unk=True):
        vocab = {'pad': PAD_TOKEN}
        if unk:
            vocab['unk'] = len(vocab)
        count = Counter(elements)
        for k, v in count.items():
            if v > cutoff:
                vocab[k] = len(vocab)
        return vocab

    def lab2id(self, elements, pad=True):
        vocab = {}
        if pad:
            vocab['pad'] = PAD_TOKEN
        for elem in elements:
                vocab[elem] = len(vocab)
        return vocab
words = sum([x['utterance'].split() for x in train_raw], []) # No set() since we want to compute
                                                            # the cutoff
corpus = train_raw + dev_raw + test_raw # We do not wat unk labels,
                                        # however this depends on the research purpose
slots = set(sum([line['slots'].split() for line in corpus],[]))
intents = set([line['intent'] for line in corpus])

lang = Lang(words, intents, slots, cutoff=0)

In [4]:
!pip install sklearn
!pip install scikit-learn
!pip install tqdm
!pip install transformers



In [5]:
w2id = {'pad':PAD_TOKEN, 'unk': 1}
slot2id = {'pad':PAD_TOKEN}
intent2id = {}
# Map the words only from the train set
# Map slot and intent labels of train, dev and test set. 'unk' is not needed.
for example in train_raw:
    for w in example['utterance'].split():
        if w not in w2id:
            w2id[w] = len(w2id)
    for slot in example['slots'].split():
        if slot not in slot2id:
            slot2id[slot] = len(slot2id)
    if example['intent'] not in intent2id:
        intent2id[example['intent']] = len(intent2id)
print('# Vocab:', len(w2id)-2) # we remove pad and unk from the count
print('# Slots:', len(slot2id)-1)
print('# Intent:', len(intent2id))
for example in dev_raw:
    for slot in example['slots'].split():
        if slot not in slot2id:
            slot2id[slot] = len(slot2id)
    if example['intent'] not in intent2id:
        intent2id[example['intent']] = len(intent2id)

print('# Vocab:', len(w2id)-2) # we remove pad and unk from the count
print('# Slots:', len(slot2id)-1)
print('# Intent:', len(intent2id))

for example in test_raw:
    for slot in example['slots'].split():
        if slot not in slot2id:
            slot2id[slot] = len(slot2id)
    if example['intent'] not in intent2id:
        intent2id[example['intent']] = len(intent2id)

print('# Vocab:', len(w2id)-2) # we remove pad and unk from the count
print('# Slots:', len(slot2id)-1)
print('# Intent:', len(intent2id))

# Vocab: 861
# Slots: 120
# Intent: 22
# Vocab: 861
# Slots: 123
# Intent: 22
# Vocab: 861
# Slots: 129
# Intent: 26


In [6]:
import torch
import torch.utils.data as data

class IntentsAndSlots (data.Dataset):
    # Mandatory methods are __init__, __len__ and __getitem__
    def __init__(self, dataset, lang, unk='unk'):
        self.utterances = []
        self.intents = []
        self.slots = []
        self.unk = unk

        for x in dataset:
            self.utterances.append(x['utterance'])
            self.slots.append(x['slots'])
            self.intents.append(x['intent'])

        self.utt_ids = self.mapping_seq(self.utterances, lang.word2id)
        self.slot_ids = self.mapping_seq(self.slots, lang.slot2id)
        self.intent_ids = self.mapping_lab(self.intents, lang.intent2id)

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, idx):
        utt = torch.Tensor(self.utt_ids[idx])
        slots = torch.Tensor(self.slot_ids[idx])
        intent = self.intent_ids[idx]
        sample = {'utterance': utt, 'slots': slots, 'intent': intent}
        return sample

    # Auxiliary methods

    def mapping_lab(self, data, mapper):
        return [mapper[x] if x in mapper else mapper[self.unk] for x in data]

    def mapping_seq(self, data, mapper): # Map sequences to number
        res = []
        for seq in data:
            tmp_seq = []
            for x in seq.split():
                if x in mapper:
                    tmp_seq.append(mapper[x])
                else:
                    tmp_seq.append(mapper[self.unk])
            res.append(tmp_seq)
        return res
# Create our datasets
train_dataset = IntentsAndSlots(train_raw, lang)
dev_dataset = IntentsAndSlots(dev_raw, lang)
test_dataset = IntentsAndSlots(test_raw, lang)

In [7]:
from torch.utils.data import DataLoader

def collate_fn(data):
    def merge(sequences):
        '''
        merge from batch * sent_len to batch * max_len
        '''
        lengths = [len(seq) for seq in sequences]
        max_len = 1 if max(lengths)==0 else max(lengths)
        # Pad token is zero in our case
        # So we create a matrix full of PAD_TOKEN (i.e. 0) with the shape
        # batch_size X maximum length of a sequence
        padded_seqs = torch.LongTensor(len(sequences),max_len).fill_(PAD_TOKEN)
        for i, seq in enumerate(sequences):
            end = lengths[i]
            padded_seqs[i, :end] = seq # We copy each sequence into the matrix
        # print(padded_seqs)
        padded_seqs = padded_seqs.detach()  # We remove these tensors from the computational graph
        return padded_seqs, lengths
    # Sort data by seq lengths
    data.sort(key=lambda x: len(x['utterance']), reverse=True)
    new_item = {}
    for key in data[0].keys():
        new_item[key] = [d[key] for d in data]
    # We just need one length for packed pad seq, since len(utt) == len(slots)
    src_utt, _ = merge(new_item['utterance'])
    y_slots, y_lengths = merge(new_item["slots"])
    intent = torch.LongTensor(new_item["intent"])

    src_utt = src_utt.to(device) # We load the Tensor on our seleceted device
    y_slots = y_slots.to(device)
    intent = intent.to(device)
    y_lengths = torch.LongTensor(y_lengths).to(device)

    new_item["utterances"] = src_utt
    new_item["intents"] = intent
    new_item["y_slots"] = y_slots
    new_item["slots_len"] = y_lengths
    return new_item

# Dataloader instantiation
train_loader = DataLoader(train_dataset, batch_size=128, collate_fn=collate_fn,  shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=64, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=collate_fn)

In [8]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class ModelIAS(nn.Module):

    def __init__(self, hid_size, out_slot, out_int, emb_size, vocab_len, n_layer=2, pad_index=0):
        super(ModelIAS, self).__init__()
        # hid_size = Hidden size
        # out_slot = number of slots (output size for slot filling)
        # out_int = number of intents (ouput size for intent class)
        # emb_size = word embedding size

        self.embedding = nn.Embedding(vocab_len, emb_size, padding_idx=pad_index)

        self.utt_encoder = nn.LSTM(emb_size, hid_size, n_layer, bidirectional=True)
        self.slot_out = nn.Linear(hid_size*2, out_slot)
        self.intent_out = nn.Linear(hid_size, out_int)
        # Dropout layer How do we apply it?
        self.dropout = nn.Dropout(0.1)

    def forward(self, utterance, seq_lengths):
        # utterance.size() = batch_size X seq_len
        utt_emb = self.embedding(utterance) # utt_emb.size() = batch_size X seq_len X emb_size
        utt_emb = utt_emb.permute(1,0,2) # we need seq len first -> seq_len X batch_size X emb_size

        # pack_padded_sequence avoid computation over pad tokens reducing the computational cost

        packed_input = pack_padded_sequence(utt_emb, seq_lengths.cpu().numpy())
        # Process the batch
        packed_output, (last_hidden, cell) = self.utt_encoder(packed_input)
        # Unpack the sequence
        utt_encoded, input_sizes = pad_packed_sequence(packed_output)
        # Get the last hidden state
        last_hidden = last_hidden[-1,:,:]
        # Compute slot logits
        slots = self.slot_out(utt_encoded)
        slots = self.dropout(slots)
        # Compute intent logits
        intent = self.intent_out(last_hidden)
        intent = self.dropout(intent)

        # Slot size: seq_len, batch size, calsses
        slots = slots.permute(1,2,0) # We need this for computing the loss
        # Slot size: batch_size, classes, seq_len
        return slots, intent

In [9]:
def init_weights(mat):
    for m in mat.modules():
        if type(m) in [nn.GRU, nn.LSTM, nn.RNN]:
            for name, param in m.named_parameters():
                if 'weight_ih' in name:
                    for idx in range(4):
                        mul = param.shape[0]//4
                        torch.nn.init.xavier_uniform_(param[idx*mul:(idx+1)*mul])
                elif 'weight_hh' in name:
                    for idx in range(4):
                        mul = param.shape[0]//4
                        torch.nn.init.orthogonal_(param[idx*mul:(idx+1)*mul])
                elif 'bias' in name:
                    param.data.fill_(0)
        else:
            if type(m) in [nn.Linear]:
                torch.nn.init.uniform_(m.weight, -0.01, 0.01)
                if m.bias != None:
                    m.bias.data.fill_(0.01)

In [10]:
import torch.optim as optim

hid_size = 200
emb_size = 300

lr = 0.0001 # learning rate
clip = 5 # Clip the gradient

out_slot = len(lang.slot2id)
out_int = len(lang.intent2id)
vocab_len = len(lang.word2id)

model = ModelIAS(hid_size, out_slot, out_int, emb_size, vocab_len, pad_index=PAD_TOKEN).to(device)
model.apply(init_weights)

optimizer = optim.Adam(model.parameters(), lr=lr)
criterion_slots = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
criterion_intents = nn.CrossEntropyLoss() # Because we do not have the pad token

In [11]:
from conll import evaluate
from sklearn.metrics import classification_report

def train_loop(data, optimizer, criterion_slots, critenrion_intents, model):
    model.train()
    loss_array = []
    for sample in data:
        optimizer.zero_grad() # Zeroing the gradient
        slots, intent = model(sample['utterances'], sample['slots_len'])
        loss_intent = criterion_intents(intent, sample['intents'])
        loss_slot = criterion_slots(slots, sample['y_slots'])
        loss = loss_intent + loss_slot # In joint training we sum the losses.
                                       # Is there another way to do that?
        loss_array.append(loss.item())
        loss.backward() # Compute the gradient, deleting the computational graph
        # clip the gradient to avoid explosioning gradients
        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step() # Update the weights
    return loss_array

def eval_loop(data, criterion_slots, criterion_intents, model, lang):
    model.eval()
    loss_array = []

    ref_intents = []
    hyp_intents = []

    ref_slots = []
    hyp_slots = []
    #softmax = nn.Softmax(dim=1) # Use Softmax if you need the actual probability
    with torch.no_grad(): # It used to avoid the creation of computational graph
        for sample in data:
            slots, intents = model(sample['utterances'], sample['slots_len'])
            loss_intent = criterion_intents(intents, sample['intents'])
            loss_slot = criterion_slots(slots, sample['y_slots'])
            loss = loss_intent + loss_slot
            loss_array.append(loss.item())
            # Intent inference
            # Get the highest probable class
            out_intents = [lang.id2intent[x]
                           for x in torch.argmax(intents, dim=1).tolist()]
            gt_intents = [lang.id2intent[x] for x in sample['intents'].tolist()]
            ref_intents.extend(gt_intents)
            hyp_intents.extend(out_intents)

            # Slot inference
            output_slots = torch.argmax(slots, dim=1)
            for id_seq, seq in enumerate(output_slots):
                length = sample['slots_len'].tolist()[id_seq]
                utt_ids = sample['utterance'][id_seq][:length].tolist()
                gt_ids = sample['y_slots'][id_seq].tolist()
                gt_slots = [lang.id2slot[elem] for elem in gt_ids[:length]]
                utterance = [lang.id2word[elem] for elem in utt_ids]
                to_decode = seq[:length].tolist()
                ref_slots.append([(utterance[id_el], elem) for id_el, elem in enumerate(gt_slots)])
                tmp_seq = []
                for id_el, elem in enumerate(to_decode):
                    tmp_seq.append((utterance[id_el], lang.id2slot[elem]))
                hyp_slots.append(tmp_seq)
    try:
        results = evaluate(ref_slots, hyp_slots)
    except Exception as ex:
        # Sometimes the model predics a class that is not in REF
        print(ex)
        ref_s = set([x[1] for x in ref_slots])
        hyp_s = set([x[1] for x in hyp_slots])
        print(hyp_s.difference(ref_s))

    report_intent = classification_report(ref_intents, hyp_intents,
                                          zero_division=False, output_dict=True)
    return results, report_intent, loss_array

In [12]:
import matplotlib.pyplot as plt
from tqdm import tqdm
n_epochs = 200
patience = 3
losses_train = []
losses_dev = []
sampled_epochs = []
best_f1 = 0
for x in tqdm(range(1,n_epochs)):
    loss = train_loop(train_loader, optimizer, criterion_slots,
                      criterion_intents, model)
    if x % 5 == 0:
        sampled_epochs.append(x)
        losses_train.append(np.asarray(loss).mean())
        results_dev, intent_res, loss_dev = eval_loop(dev_loader, criterion_slots,
                                                      criterion_intents, model, lang)
        losses_dev.append(np.asarray(loss_dev).mean())
        f1 = results_dev['total']['f']

        if f1 > best_f1:
            best_f1 = f1
            patience = 3
        else:
            patience -= 1
        if patience <= 0: # Early stopping with patience
            break # Not nice but it keeps the code clean

results_test, intent_test, _ = eval_loop(test_loader, criterion_slots,
                                         criterion_intents, model, lang)
print("\033[1mResults of Modified model MODELIAS:\033[0m")
print('Slot F1: ', results_test['total']['f'])
print('Intent Accuracy:', intent_test['accuracy'])

  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_cuda and torch.cuda.is_available():
  if not is_compiling() and torch.has_

[1mResults of Modified model MODELIAS:[0m
Slot F1:  0.9356435643564356
Intent Accuracy: 0.9507278835386338


In [13]:
!pip install --upgrade ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Requirement already up-to-date: ipywidgets in /home/disi/.local/lib/python3.8/site-packages (8.1.1)
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [14]:
import torch
import torch.utils.data as data
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from transformers import BertTokenizer, BertModel
import os
import sys
import tensorflow as tf
sys.path.insert(0, os.path.abspath('../src/'))
#from conll import evaluate
from sklearn.metrics import classification_report
from transformers import Trainer, TrainingArguments
from transformers import default_data_collator, TFBertModel
import keras
import keras.utils
from keras import utils as np_utils
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler, ModelCheckpoint, TensorBoard
import matplotlib.pyplot as plt


class ModelIAS(tf.keras.Model):

    def __init__(self, max_len, total_intent_no=None, total_slot_no=None, dropout_prob=0.1):
        super().__init__()

        self.max_len = max_len
        self.dropout_prob = dropout_prob
        self.total_intent_no = total_intent_no
        self.total_slot_no = total_slot_no
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = TFBertModel.from_pretrained("bert-base-uncased")
        self.slot_out = Dense(self.total_slot_no, activation='softmax')
        self.intent_out = Dense(self.total_intent_no, activation='softmax')
        # Dropout layer How do we apply it?
        self.dropout = Dropout(self.dropout_prob)



    def tokenize(self, tokenizer, text_sequence, max_length):
        encoded = tokenizer(text_sequence, return_tensors='pt', is_split_into_words=True)
        input_ids = encoded['input_ids'].unsqueeze(0)
        attention_mask = encoded['attention_mask'].unsqueeze(0)
        token_type_ids = encoded['token_type_ids'].unsqueeze(0)
        return input_ids, attention_mask, token_type_ids


    def call(self, inputs, **kwargs):
        outputs = self.model(inputs)
        slots = self.dropout(outputs[0])
        slots = self.slot_out(slots)
        intent = self.dropout(outputs[1])
        intent = self.intent_out(intent)

        return slots, intent

In [15]:
import torch.optim as optim



lr = 3e-5 # learning rate
e = 1e-08

out_slot = len(lang.slot2id)
out_int = len(lang.intent2id)
vocab_len = len(lang.word2id)

model = ModelIAS(vocab_len, out_int, out_slot)

optimizer = Adam(learning_rate=lr, epsilon=e)
losses = [SparseCategoricalCrossentropy(from_logits=False),
          SparseCategoricalCrossentropy(from_logits=False)]
metrics = [SparseCategoricalAccuracy('accuracy')]

model.compile(optimizer=optimizer, loss=losses, metrics=metrics)


def encode_dataset(tokenizer, text_sequences, max_length):
    token_ids = np.zeros(shape=(len(text_sequences), max_length),
                         dtype=np.int32)
    for i, text_sequence in enumerate(text_sequences):
        encoded = tokenizer.encode(text_sequence)
        token_ids[i, 0:len(encoded)] = encoded
    attention_masks = (token_ids != 0).astype(np.int32)
    return {"input_ids": token_ids, "attention_masks": attention_masks}


def prepare_dataset(data, model):
    input_ids_all = []
    attention_masks_all = []
    token_type_ids_all = []
    labels = []
    slots = []



    for sample in data:
        input_ids, attention_masks, token_type_ids = model.tokenize(model.tokenizer, sample['utterance'], model.max_len)

        input_ids_all.append(input_ids)
        attention_masks_all.append(attention_masks)
        token_type_ids_all.append(token_type_ids)
        labels.append(sample['intent'])
        slots.append(sample['slots'])


    input_ids_all = input_ids_all.extend([0] * (4 - len(input_ids_all)))
    input_ids_all = [input_ids_all]
    input_ids_ = np.asarray(input_ids_all)
    input_ids_ = np.expand_dims(input_ids_, axis=0)
    input_ids_ = np.nan_to_num(input_ids_)
    input_ids_[input_ids_ == None] = 0
    input_ids = tf.convert_to_tensor(input_ids_, dtype=tf.int32)


    attention_masks_all = attention_masks_all.extend([0] * (4 - len(attention_masks_all)))
    attention_masks_all = [attention_masks_all]
    attention_masks_ = np.asarray(attention_masks_all)
    attention_masks_ = np.expand_dims(attention_masks_, axis=0)
    attention_masks_ = np.nan_to_num(attention_masks_)
    attention_masks_[attention_masks_ == None] = 0
    attention_masks = tf.convert_to_tensor(attention_masks_, dtype=tf.int32)

    token_type_ids_all = token_type_ids_all.extend([0] * (4 - len(token_type_ids_all)))
    token_type_ids_all = [token_type_ids_all]
    token_type_ids_ = np.asarray(token_type_ids_all)
    token_type_ids_ = np.expand_dims(token_type_ids_, axis=0)
    token_type_ids_ = np.nan_to_num(token_type_ids_)
    token_type_ids_[token_type_ids_ == None] = 0
    token_type_ids = tf.convert_to_tensor(token_type_ids_, dtype=tf.int32)

    labels = labels.extend([0] * (4 - len(labels)))
    labels = [labels]
    labels_ = np.asarray(labels)
    labels_ = np.expand_dims(labels_, axis=0)
    labels_ = np.nan_to_num(labels_)
    labels_[labels_ == None] = 0
    labels = tf.convert_to_tensor(labels_, dtype=tf.int32)

    slots = slots.extend([0] * (4 - len(slots)))
    slots = [slots]
    slots_ = np.asarray(slots)
    slots_ = np.expand_dims(slots_, axis=0)
    slots_ = np.nan_to_num(slots_)
    slots_[slots_ == None] = 0
    slots = tf.convert_to_tensor(slots_, dtype=tf.int32)


    return input_ids, attention_masks, token_type_ids, labels, slots


train_input_ids, train_attention_masks, train_token_type_ids, train_labels, train_slots = prepare_dataset(train_raw, model)
dev_input_ids, dev_attention_masks, dev_token_type_ids, dev_labels, dev_slots = prepare_dataset(dev_raw, model)
test_input_ids, test_attention_masks, test_token_type_ids, test_labels, test_slots = prepare_dataset(test_raw, model)



history = model.fit(
    [train_input_ids,train_attention_masks], (train_slots, train_labels),
    validation_data=([dev_input_ids,dev_attention_masks], (dev_slots, dev_labels)),
    epochs=15, batch_size=128)


result = model.evaluate([test_input_ids,test_attention_masks], (test_slots, test_labels))
print("\033[1mResults of BERT MODEL:\033[0m")
print(f'Loss: {result[0]}')
print(f'Slot Loss: {result[1]}')
print(f'Intent Loss: {result[2]}')
print(f'Slot Accuracy: {result[3]}')
print(f'Intent Accuracy: {result[4]}')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
[1mResults of BERT MODEL:[0m
Loss: 0.7094132900238037
Slot Loss: 0.19924859702587128
Intent Loss: 0.5101646780967712
Slot Accuracy: 1.0
Intent Accuracy: 1.0
