# Joint Intent Classification and Slot filling with BERT
This notebook is based on the paper __BERT for Joint Intent Classification and Slot Filling__ by Chen et al. (2019), https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project.

Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of the paper with the original dataset.


Original JointBERT notebook (Colab):
https://colab.research.google.com/github/ShawonAshraf/nlu-jointbert-dl2021/blob/main/notebooks/nlu_jointbert_dl21.ipynb

In [None]:
import os
#import shutil
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import seaborn as sns
from pylab import rcParams

import matplotlib.pyplot as plt
# Only let TensorFlow log records of level ERROR through.
tf.get_logger().setLevel('ERROR')

# Plot styling: seaborn whitegrid theme, a custom six-colour palette and a
# default figure size of 12 x 8.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
import warnings
# NOTE(review): this hides every Python warning for the rest of the session,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
import json

# Folder that holds one sub-folder per dataset (ATIS, SNIPS), each containing
# <split>.json files. Change this single constant to relocate the data.
DATASET_ROOT = '/content/drive/MyDrive/NLU/IntentSlotDatasets'


def load_split(dataset, split):
    """Read ``<DATASET_ROOT>/<dataset>/<split>.json`` into a DataFrame.

    Args:
        dataset: name of the dataset sub-folder, e.g. 'ATIS' or 'SNIPS'.
        split: name of the JSON file without extension, e.g. 'train'.

    Returns:
        pandas.DataFrame built from the parsed JSON content.
    """
    path = f'{DATASET_ROOT}/{dataset}/{split}.json'
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        return pd.DataFrame(json.load(f))


ATIS_test = load_split('ATIS', 'test')
ATIS_train = load_split('ATIS', 'train')
SNIPS_test = load_split('SNIPS', 'test')
SNIPS_train = load_split('SNIPS', 'train')
SNIPS_valid = load_split('SNIPS', 'valid')

In [None]:
!wget https://github.com/ShawonAshraf/nlu-jointbert-dl2021/raw/main/data/nlu_traindev/dev.json

## Read data from json files

Data is of the following format
````json5
{
  "text": "",
  "positions": [{}],
  "slots": [{}],
  "intent": ""
}

[
  {
    "utterance": "on april first i need a flight going from phoenix to san diego", 
    "slots": "O B-depart_date.month_name B-depart_date.day_number O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name", 
    "intent": "flight"
   },
   "..."
]
````

We will be using the utterance (`text` in the first format, `utterance` in the second) as the input and `slots` and `intent` as labels.

In [None]:
# Pull the three ATIS columns out as separate Series. Despite the "test"
# prefix in the names, all three are taken from ATIS_train.
testFeatures, testIntents, testSlots = (
    ATIS_train[column] for column in ('utterance', 'intent', 'slots')
)

In [None]:
import json
import os

class RawData(object):
    """One labelled example: an id, its intent, its slot-label string and its text."""

    def __init__(self, id, intent, slots, text):
        # Assignment order is also the key order of the JSON produced by __repr__.
        self.id, self.intent = id, intent
        self.slots, self.text = slots, text

    def __repr__(self):
        """Return the instance attributes as a JSON object indented by 2 spaces."""
        return str(json.dumps(vars(self), indent=2))


"""
builds the training examples from the loaded ATIS training data
returns a list containing RawData objects
"""


def read_train_json_file(file):
    """Convert a loaded intent/slot table into a list of RawData examples.

    The table passed in is the one that is read; earlier the argument was
    ignored and the global ATIS_train was always used.

    Args:
        file: table with 'utterance', 'intent' and 'slots' columns (the
            notebook passes the ATIS_train DataFrame).

    Returns:
        list of RawData, one per row in row order, with ids 0..n-1.
    """
    # Iterate the columns positionally so the result does not depend on the
    # table having a default 0..n-1 index.
    rows = zip(file['intent'], file['slots'], file['utterance'])
    return [
        RawData(idx, intent, slots, text)
        for idx, (intent, slots, text) in enumerate(rows)
    ]


# build the training examples from the ATIS training split
train_data = read_train_json_file(ATIS_train)


In [None]:
# Display the first training example (RawData.__repr__ renders it as JSON).
example = train_data[0]
example

## Load Tokenizer from transformers

We will use a pretrained bert model `bert-base-cased` for both Tokenizer and our classifier.

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer

# Checkpoint name used for the tokenizer here and as the default `model_name`
# of JointIntentAndSlotFillingModel further down.
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode texts from the dataset

We have to encode the texts using the tokenizer to create tensors for training the classifier.

In [None]:
# https://huggingface.co/transformers/preprocessing.html

def encode_texts(tokenizer, texts):
    """Run `tokenizer` on `texts` with padding and truncation enabled, asking for TensorFlow tensors."""
    tokenizer_options = dict(padding=True, truncation=True, return_tensors="tf")
    return tokenizer(texts, **tokenizer_options)

texts = [d.text for d in train_data]
tds = encode_texts(tokenizer, texts)
# Show only the names of the encoded tensors as the cell output; printing
# `tds` itself dumps the padded ids of the whole training set.
tds.keys()

In [None]:
# Alias for the tokenizer output; later cells read it under this name.
encoded_texts = tds

## Encode labels
### Intents

In [None]:

# Intent label of every training example, in dataset order.
intents = [d.intent for d in train_data]
# Sort the unique labels: plain set iteration order for strings can change
# between kernel sessions, which would silently re-number the intent classes.
intent_names = sorted(set(intents))
intent_names

In [None]:
# intent name -> integer class id (its position in intent_names)
intent_map = {name: position for position, name in enumerate(intent_names)}
intent_map

In [None]:
def encode_intents(intents, intent_map):
    """Translate intent names to their class ids and return them as an int32 tensor.

    Raises KeyError for an intent name that is missing from `intent_map`.
    """
    class_ids = [intent_map[name] for name in intents]
    return tf.convert_to_tensor(class_ids, dtype="int32")

encoded_intents = encode_intents(intents, intent_map)
print (encoded_intents)

### Slots

To pad all the texts to the same length, the tokenizer adds special padding tokens. To handle those we need to add `<PAD>` to `slot_names`. It could be some other symbol as well.

In [None]:
# Collect every slot label that occurs in the training data. 'O' (word
# outside any slot) gets no entry of its own; encode_slots writes id 0 for it.
unique_slot_labels = set()
for td in train_data:
    unique_slot_labels.update(
        label for label in td.slots.split() if label != 'O')

# Index 0 is reserved for "<PAD>". The remaining labels are sorted so the
# label -> id assignment is the same in every kernel session (set iteration
# order for strings is not stable across runs).
slot_names = ["<PAD>"] + sorted(unique_slot_labels)

In [None]:
# slot label -> integer class id (its position in slot_names)
slot_map = {label: position for position, label in enumerate(slot_names)}
slot_map

In [None]:
def get_slot_from_word(word, utterance, slot_dict):
    """Return the slot label of `word` inside `utterance`, or None.

    `utterance` and `slot_dict` are whitespace-separated strings that are
    matched up position by position. The label of the first position whose
    word equals `word` and whose label is not 'O' is returned; None when no
    such position exists.
    """
    labels = slot_dict.split()
    for position, token in enumerate(utterance.split()):
        if token != word:
            continue
        if labels[position] != 'O':
            return labels[position]
    return None

# Sanity check of the lookup on the first training example.
print(train_data[0].text)
print(train_data[0].slots)
print("slot_name for boston is : ", get_slot_from_word("boston", train_data[0].text, train_data[0].slots))

In [None]:
import numpy as np

# find the max encoded text length
# the tokenizer was called with padding=True, so every row of input_ids
# has the same length; reading the first row is enough
max_len = len(encoded_texts["input_ids"][0])
print (max_len)

def encode_slots(all_slots, all_texts, tokenizer, slot_map, max_len=max_len):
    """Build the matrix of slot-label ids, one id per wordpiece position.

    Each utterance is split on whitespace and paired position by position
    with its slot labels. Every word is expanded to as many entries as
    `tokenizer.tokenize` yields wordpieces for it, so labels stay lined up
    with the pieces whether or not the word carries a slot.

    Args:
        all_slots: list of whitespace-separated slot-label strings.
        all_texts: list of utterances, parallel to `all_slots`.
        tokenizer: object whose `tokenize(word)` returns the word's pieces.
        slot_map: dict mapping slot label -> integer id. 'O' does not need
            an entry; it is written as 0.
        max_len: number of columns of the result.

    Returns:
        numpy int32 array of shape (len(all_texts), max_len). Column 0 is
        left at 0 for the leading special token; unused columns stay 0.

    Raises:
        KeyError: if a label other than 'O' is missing from `slot_map`.
    """
    encoded_slots = np.zeros(shape=(len(all_texts), max_len), dtype=np.int32)
    # Columns available for real tokens once the first and last positions
    # are set aside for the special tokens around the sequence.
    room = max(max_len - 2, 0)

    for idx, text in enumerate(all_texts):
        words = text.split()
        labels = all_slots[idx].split()
        # Best effort for a row with fewer labels than words: treat the
        # unlabeled tail as 'O' instead of failing.
        labels.extend(['O'] * (len(words) - len(labels)))

        enc = []  # slot id per wordpiece of this utterance
        for word, label in zip(words, labels):
            piece_count = len(tokenizer.tokenize(word))
            slot_id = 0 if label == 'O' else slot_map[label]
            # Repeat the id for every piece of the word. This applies to
            # non-slot words too; otherwise later labels shift left.
            enc.extend([slot_id] * piece_count)

        # Never write past the space reserved for real tokens.
        enc = enc[:room]
        encoded_slots[idx, 1:len(enc) + 1] = enc

    return encoded_slots
    

In [None]:
# Parallel lists: slot-label string and utterance text of every training example.
all_slots = [td.slots for td in train_data]
all_texts = [td.text for td in train_data]

In [None]:
# Slot-label id matrix for the training set (max_len keeps its default).
encoded_slots = encode_slots(all_slots, all_texts, tokenizer, slot_map)

In [None]:
# Inspect the encoded slot ids of the first training example.
encoded_slots[0]

## Classifier Model

### Definition

In [None]:
from transformers import TFBertModel
from tensorflow.keras.layers import Dropout, Dense, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

class JointIntentAndSlotFillingModel(tf.keras.Model):
    """Pretrained BERT encoder with two dense heads: slot labels and intent."""

    def __init__(self, intent_num_labels=None, slot_num_labels=None,
                 model_name=model_name, dropout_prob=0.1):
        """Create the encoder, one shared dropout layer and the two classifiers.

        Args:
            intent_num_labels: number of output units of the intent head.
            slot_num_labels: number of output units of the slot head.
            model_name: checkpoint passed to TFBertModel.from_pretrained;
                defaults to the notebook-level `model_name`.
            dropout_prob: rate of the dropout layer applied before both heads.
        """
        super().__init__(name="joint_intent_slot")
        self.bert = TFBertModel.from_pretrained(model_name)
        self.dropout = Dropout(dropout_prob)
        self.intent_classifier = Dense(intent_num_labels,
                                       name="intent_classifier")
        self.slot_classifier = Dense(slot_num_labels,
                                     name="slot_classifier")

    def call(self, inputs, **kwargs):
        """Run BERT on `inputs` and return `(slot_logits, intent_logits)`.

        All keyword arguments are forwarded to the BERT model. Dropout is
        active only when `training` is present in `kwargs` and truthy.
        """
        # two outputs from BERT
        trained_bert = self.bert(inputs, **kwargs)
        pooled_output = trained_bert.pooler_output
        sequence_output = trained_bert.last_hidden_state
        
        # sequence_output will be used for slot_filling / classification
        # (presumably one vector per token position — TODO confirm)
        sequence_output = self.dropout(sequence_output,
                                       training=kwargs.get("training", False))
        slot_logits = self.slot_classifier(sequence_output)

        # pooled_output for intent classification
        pooled_output = self.dropout(pooled_output,
                                     training=kwargs.get("training", False))
        intent_logits = self.intent_classifier(pooled_output)

        # Order matters: the losses and training targets are listed slots first.
        return slot_logits, intent_logits

In [None]:
# One output unit per known intent and per entry of slot_map
# (slot_map includes the "<PAD>" entry at id 0).
joint_model = JointIntentAndSlotFillingModel(
    intent_num_labels=len(intent_map), slot_num_labels=len(slot_map))

### Hyperparams, Optimizer and Loss function

In [None]:
opt = Adam(learning_rate=3e-5, epsilon=1e-08)

# two outputs, one for slots, another for intents
# we have to fine tune for both
# The list order follows the model's return order: slot logits first, then
# intent logits. Both heads emit raw logits, hence from_logits=True.
losses = [SparseCategoricalCrossentropy(from_logits=True),
          SparseCategoricalCrossentropy(from_logits=True)]

# NOTE(review): no mask or sample weights are passed when fitting, so padded
# positions (label 0) presumably count towards the slot loss and accuracy —
# confirm this is intended.
metrics = [SparseCategoricalAccuracy("accuracy")]
# compile model
joint_model.compile(optimizer=opt, loss=losses, metrics=metrics)

### Train

In [None]:
# Model inputs: the three tensors produced by the tokenizer.
x = {"input_ids": encoded_texts["input_ids"], "token_type_ids": encoded_texts["token_type_ids"],  "attention_mask": encoded_texts["attention_mask"]}

# Targets are given in the same order as the model outputs: slots, then intents.
history = joint_model.fit(
    x, (encoded_slots, encoded_intents), epochs=2, batch_size=32, shuffle=True)

## Inference

In [None]:
def nlu(text, tokenizer, model, intent_names, slot_names):
    """Predict the intent and the slot values of a single utterance.

    Args:
        text: the utterance to analyse (may be empty).
        tokenizer: tokenizer providing `encode`, `convert_ids_to_tokens`,
            `all_special_tokens` and `convert_tokens_to_string`.
        model: callable that returns `(slot_logits, intent_logits)` for a
            batch of token ids.
        intent_names: list mapping intent class id -> intent name.
        slot_names: list mapping slot class id -> slot name; positions
            predicted as "<PAD>" are treated as "no slot".

    Returns:
        dict of the form {"intent": <name>, "slots": {<slot name>: <text>}}.
    """
    # Encode once and derive the token strings from those same ids, so that
    # token i and slot prediction i always refer to the same position.
    # Tokenizing the text a second time without special tokens shifts every
    # token by one against the predictions because of the leading [CLS].
    input_ids = tokenizer.encode(text)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    inputs = tf.constant(input_ids)[None, :]  # batch_size = 1

    slot_logits, intent_logits = model(inputs)
    slot_ids = slot_logits.numpy().argmax(axis=-1)[0, :]
    intent_id = intent_logits.numpy().argmax(axis=-1)[0]

    info = {"intent": intent_names[intent_id], "slots": {}}

    # Special tokens ([CLS], [SEP], ...) never belong to a slot value.
    special_tokens = set(tokenizer.all_special_tokens)

    out_dict = {}  # slot name -> wordpieces collected for it, in text order
    for position, (token, slot_id) in enumerate(zip(tokens, slot_ids)):
        slot_name = slot_names[slot_id]
        if slot_name == "<PAD>" or token in special_tokens:
            continue

        collected = out_dict.setdefault(slot_name, [])

        # A "##" piece continues the previous token. If that token was not
        # collected for this slot, add it first so the word is complete.
        # `position` is used rather than tokens.index(token), which would
        # find the first occurrence of a repeated piece.
        if token.startswith("##") and position > 0:
            previous = tokens[position - 1]
            if previous not in special_tokens and previous not in collected:
                collected.append(previous)

        collected.append(token)

    # Join the pieces of every slot back into plain text.
    for slot_name, slot_tokens in out_dict.items():
        slot_value = tokenizer.convert_tokens_to_string(slot_tokens)
        info["slots"][slot_name] = slot_value.strip()

    return info


In [None]:
# NOTE(review): this looks like a playlist-style request, while the model
# above was fit on the ATIS training examples — confirm the intended dataset.
nlu("add Madchild to Electro Latino", tokenizer, joint_model, 
    intent_names, slot_names)

In [None]:
# Second sample utterance run through the trained joint model.
nlu("add Brian May to my Reggae Infusions list", tokenizer, joint_model, 
    intent_names, slot_names)

In [None]:
import calendar
import time

def get_time_stamp():
    """Return the current UTC time as whole seconds since the epoch.

    Meant for stamping prediction files.
    """
    return calendar.timegm(time.gmtime())

get_time_stamp()

## Generate prediction.json

This section creates a file containing all the prediction results for inputs from dev.json

In [None]:
def read_dev_data(file="dev.json"):
    """Return the "text" field of every entry of a dev JSON file, in file order.

    The file is expected to hold one JSON object whose values are objects
    with a "text" key.
    """
    with open(file, "r", encoding="utf-8") as json_file:
        entries = json.load(json_file)
    return [entries[key]["text"] for key in entries.keys()]
# Load the dev utterances from the default file, "dev.json".
dev_texts = read_dev_data()

In [None]:
from tqdm import tqdm

# Run the joint model over every dev utterance, with a progress bar.
results = [
    nlu(dev_text, tokenizer, joint_model, intent_names, slot_names)
    for dev_text in tqdm(dev_texts)
]

In [None]:
# Key every prediction by its position in `results`, written as a string.
results_dict = {
    str(position): prediction for position, prediction in enumerate(results)
}

In [None]:
# Save all predictions as JSON indented by 2 spaces.
with open("prediction.json", "w") as out_file:
    json.dump(results_dict, out_file, indent=2)

In [None]:
!head prediction.json