In [1]:
import json
import pandas as pd
import os
dataset = "SNIPS"

In [2]:
BERT_MODEL = 'bert-base-uncased'
MAX_SEQ_LEN = 50

In [3]:
def load_data(path):
    dataset = []
    with open(path) as f:
        dataset = json.loads(f.read())
    return dataset

In [4]:
train_raw = load_data(os.path.join('data', dataset, 'train.json'))
train_seq_in = []
train_seq_out = []
train_labels = []
for element in train_raw:
    train_seq_in.append(element['utterance'])
    train_seq_out.append(element['slots'])
    train_labels.append(element['intent'])

# same for valid
valid_raw = load_data(os.path.join('data', dataset, 'valid.json'))
val_seq_in = []
val_seq_out = []
val_labels = []
for element in valid_raw:
    val_seq_in.append(element['utterance'])
    val_seq_out.append(element['slots'])
    val_labels.append(element['intent'])

# same for test
test_raw = load_data(os.path.join('data', dataset, 'test.json'))
test_seq_in = []
test_seq_out = []
test_labels = []
for element in test_raw:
    test_seq_in.append(element['utterance'])
    test_seq_out.append(element['slots'])
    test_labels.append(element['intent'])

columns_name = ['seq_in','seq_out','label']
df_train = pd.DataFrame(list(zip(train_seq_in,train_seq_out,train_labels)), columns=columns_name)
df_val = pd.DataFrame(list(zip(val_seq_in,val_seq_out,val_labels)), columns=columns_name)
df_test = pd.DataFrame(list(zip(test_seq_in,test_seq_out,test_labels)),columns=columns_name)

In [5]:
total_intent_num = len(df_train['label'].unique())

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df_train['label'])

train_labels_encoded = le.transform(df_train['label'].values)
val_labels_encoded = le.transform(df_val['label'].values)
test_labels_encoded = le.transform(df_test['label'].values)

In [7]:
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

In [8]:
import numpy as np
def encode_dataset(tokenizer, text_sequences, max_length):
    token_ids = np.zeros(shape=(len(text_sequences), max_length),
                         dtype=np.int32)
    for i, text_sequence in enumerate(text_sequences):
        encoded = tokenizer.encode(text_sequence)
        token_ids[i, 0:len(encoded)] = encoded
    attention_masks = (token_ids != 0).astype(np.int32)
    return {"input_ids": token_ids, "attention_masks": attention_masks}


train_bert_input = encode_dataset(tokenizer, df_train["seq_in"], MAX_SEQ_LEN)
valid_bert_input = encode_dataset(tokenizer, df_val["seq_in"], MAX_SEQ_LEN)
test_bert_input = encode_dataset(tokenizer, df_test["seq_in"], MAX_SEQ_LEN)

In [9]:
tf_bert_model = TFBertModel.from_pretrained(BERT_MODEL)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [10]:
def encode_tokens(text_seq, label_seq, word_to_index, tokenizer):

  encoded_seq_labels = []
  extra_keys = 0
  extra_key_names = []
  
  for i, (text, labels) in enumerate(zip(text_seq, label_seq)):
    sent_level_label_encoding=[]
    for word, word_label in zip(text.split(), labels.split()):
      word_level_label_encoding = []
      word_tokens = tokenizer.tokenize(word)
      for w in word_tokens:
        #handling when word has word level tokenizatation, and label start with B-
        if w.startswith("#") and word_label.startswith("B-"):
          word_label = word_label.replace("B-","I-")
          if word_label not in word_to_index.keys():
            extra_keys +=1
            extra_key_names.append(word_label)
            word_to_index[word_label] = len(word_to_index)
        word_level_label_encoding.append(word_label) 
      sent_level_label_encoding.extend(word_level_label_encoding)
    #assert to check weather we have same number of token and labels
    assert(len(sent_level_label_encoding) == len(tokenizer.tokenize(text)))
    encoded_seq_labels.append(" ".join(sent_level_label_encoding))

  print("Total number of keys added\n",extra_keys)
  print("Extra keys are,\n ", extra_key_names)
  return encoded_seq_labels

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
seq_out_tokenizer = Tokenizer(filters='!"#$%&()*+,./:;<=>?@[\\]^`{|}~\t\n', oov_token="UNK",lower=False)
seq_out_tokenizer.fit_on_texts(df_train["seq_out"].tolist())
seq_out_word_to_index = seq_out_tokenizer.word_index


In [12]:
train_slots_encoded = encode_tokens(df_train['seq_in'].tolist(), df_train['seq_out'].tolist(), seq_out_word_to_index, tokenizer)
val_slots_encoded = encode_tokens(df_val['seq_in'].tolist(), df_val['seq_out'].tolist(), seq_out_word_to_index, tokenizer)
test_slots_encoded = encode_tokens(df_test['seq_in'].tolist(), df_test['seq_out'].tolist(), seq_out_word_to_index, tokenizer)

Total number of keys added
 3
Extra keys are,
  ['I-year', 'I-condition_description', 'I-condition_temperature']
Total number of keys added
 0
Extra keys are,
  []
Total number of keys added
 0
Extra keys are,
  []


In [13]:
seq_out_tokenizer = Tokenizer(oov_token="UNK",lower=False,filters='!"#$%&()*+,./:;<=>?@\\^`{|}~\t\n')
seq_out_tokenizer.fit_on_texts(train_slots_encoded)
seq_out_tokenizer.word_index['PAD'] = 0
seq_out_tokenizer.index_word[0] = 'PAD'

In [14]:
train_slots_tokenized = seq_out_tokenizer.texts_to_sequences(train_slots_encoded)
val_slots_tokenized = seq_out_tokenizer.texts_to_sequences(val_slots_encoded)
test_slots_tokenized = seq_out_tokenizer.texts_to_sequences(test_slots_encoded)

In [15]:
def prepare_slots_for_bert(tokenized_slots, max_len):

  final = np.zeros(shape=(len(tokenized_slots),max_len), dtype='int32')

  for i,slot in enumerate(tokenized_slots):
    final[i, 1:len(slot)+1] = slot
  return final

In [16]:
train_slots = prepare_slots_for_bert(train_slots_tokenized, MAX_SEQ_LEN)
valid_slots = prepare_slots_for_bert(val_slots_tokenized, MAX_SEQ_LEN)
test_slots = prepare_slots_for_bert(test_slots_tokenized, MAX_SEQ_LEN)

In [17]:
import tqdm
def get_slots_with_pad(text_seq, maxlen):
  final = np.full((len(text_seq), maxlen), "PAD", dtype='U32')

  for i,text in enumerate(text_seq):
    splitted_text = text.split()
    final[i, 1:len(splitted_text)+1] = splitted_text
  return final
train_slots_encoed_with_pad_token = get_slots_with_pad(train_slots_encoded, MAX_SEQ_LEN)
val_slots_encoded_with_pad_token =  get_slots_with_pad(val_slots_encoded, MAX_SEQ_LEN)
test_slots_encoded_with_pad_token = get_slots_with_pad(test_slots_encoded, MAX_SEQ_LEN)

In [18]:
from tensorflow.keras.layers import Dropout, Dense
import tensorflow as tf
class JointIntentAndSlotFillingModel(tf.keras.Model):

    def __init__(self, total_intent_no=None, total_slot_no=None,
                 model_name=BERT_MODEL, dropout_prob=0.1):
        super().__init__()
        self.bert = TFBertModel.from_pretrained(model_name)
        self.dropout = Dropout(dropout_prob)
        self.intent_classifier = Dense(total_intent_no, activation='softmax')
        self.slot_classifier = Dense(total_slot_no, activation='softmax')

    def call(self, inputs, **kwargs):
        bert_output = self.bert(inputs)

        sequence_output = self.dropout(bert_output[0])
        slots_predicted = self.slot_classifier(sequence_output)

        pooled_output = self.dropout(bert_output[1])
        intent_predicted = self.intent_classifier(pooled_output)

        return slots_predicted, intent_predicted

In [19]:
joint_model = JointIntentAndSlotFillingModel(
    total_intent_no=7, total_slot_no=77,dropout_prob=0.1)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [20]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
opt = Adam(learning_rate=3e-5, epsilon=1e-08)
losses = [SparseCategoricalCrossentropy(from_logits=False),
          SparseCategoricalCrossentropy(from_logits=False)]
metrics = [SparseCategoricalAccuracy('accuracy')]

joint_model.compile(optimizer=opt, loss=losses, metrics=metrics)

In [21]:
import tensorflow as tf
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertModel
import warnings
warnings.filterwarnings('ignore')
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler, ModelCheckpoint, TensorBoard
from seqeval.metrics import classification_report
import shutil
import pickle
from seqeval.metrics import classification_report

In [23]:
import shutil
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler, ModelCheckpoint, TensorBoard
model_path = 'joint_model/'
model_name = "joint_model_weights_{val_loss:.2f}.ckpt"
if os.path.exists(model_path):
  print("Model path exist, clearing all files under model_path")
  shutil.rmtree(model_path)
else:
  print("Creating model_path")
  os.makedirs(model_path)

model_chk_point = ModelCheckpoint(filepath=os.path.join(model_path,model_name),monitor="val_loss",save_best_only=True,save_weights_only=True)
early_stopping = EarlyStopping(monitor="val_loss",min_delta=0.0001,patience=4,verbose=1)

tensorboard_path = "joint_model/Tensorboard/logs/"
if os.path.exists(tensorboard_path):
  print("Tensorboard path exists, clearing all files under tensorboard_path")
  shutil.rmtree(tensorboard_path)
else:
  print("Creating tensorboard path")
  os.makedirs(tensorboard_path)

tensorboard_cb = TensorBoard(log_dir=tensorboard_path)

callback_list = [model_chk_point, tensorboard_cb, early_stopping]


Model path exist, clearing all files under model_path


PermissionError: [WinError 5] Access is denied: 'joint_model/Tensorboard\\logs\\train'

In [None]:
history = joint_model.fit(
    train_bert_input, (train_slots,train_labels_encoded),
    validation_data=(valid_bert_input, (valid_slots, val_labels_encoded)),
    epochs=15, batch_size=128, callbacks=callback_list)