In [77]:
import os
from pathlib import Path
from typing import List

import tensorflow as tf
from keras.layers import (Dense, Conv1D)
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertModel

# Read the natural language understanding dataset and BERT model

Clone both repositories into the `intent-detection` directory:
```
git clone https://github.com/tilde-nlp/NLU-datasets.git
git clone https://huggingface.co/bert-base-multilingual-cased
```

The directory tree should then look as follows:
```
/intent-detection
├── NLU-datasets
├── bert-base-multilingual-cased
├── run-on-windows.ipynb
```

In [78]:
# Enter the cloned dataset repository so that the relative "chatbot/..." paths
# read further down resolve. The substring test on the working directory skips
# the chdir when the path already contains "NLU-datasets", so re-running this
# cell does not try to descend a second time.
if "NLU-datasets" not in os.getcwd():
    os.chdir("./NLU-datasets")

In [79]:
def get_data(path: str) -> List[str]:
    """Load a text file as a list of its lines, each without the line break.

    The file is opened as UTF-8 so that non-ASCII content (e.g. the Russian
    files) is decoded correctly.
    Example output: ['FindConnection', 'FindConnection', ..., 'FindConnection']
    """
    with open(path, encoding='utf-8') as handle:
        # Keep only the text in front of the newline on every line.
        return [raw_line.partition('\n')[0] for raw_line in handle]

In [80]:
# Collect every .txt file below the "chatbot" dataset directory. The path is
# relative, so the current working directory must be the NLU-datasets checkout.
path_list = Path("chatbot").glob("**/*.txt")

for path in path_list:
    # get_data() is given a plain string rather than the Path object
    path_in_str = str(path)
    # Compare against the "/"-separated form of the path: it is the same on
    # every operating system, and it avoids backslash string literals in which
    # sequences such as "\r" have to be escaped by hand.
    path_key = path.as_posix()
    if path_key == "chatbot/chatbot_train_ans.txt":
        train_answers = get_data(path_in_str)
    elif path_key == "chatbot/chatbot_test_ans.txt":
        test_answers = get_data(path_in_str)
    elif path_key == "chatbot/en/chatbot_test_q.txt":
        en_test = get_data(path_in_str)
    elif path_key == "chatbot/en/chatbot_train_q.txt":
        en_train = get_data(path_in_str)
    elif path_key == "chatbot/lv/chatbot_test_q.txt":
        lv_test = get_data(path_in_str)
    elif path_key == "chatbot/lv/chatbot_train_q.txt":
        lv_train = get_data(path_in_str)
    elif path_key == "chatbot/ru/chatbot_test_q.txt":
        ru_test = get_data(path_in_str)
    elif path_key == "chatbot/ru/chatbot_train_q.txt":
        ru_train = get_data(path_in_str)
    elif path_key == "chatbot/et/chatbot_test_q.txt":
        et_test = get_data(path_in_str)
    elif path_key == "chatbot/et/chatbot_train_q.txt":
        et_train = get_data(path_in_str)
    elif path_key == "chatbot/lt/chatbot_test_q.txt":
        lt_test = get_data(path_in_str)
    elif path_key == "chatbot/lt/chatbot_train_q.txt":
        lt_train = get_data(path_in_str)


# Print the training labels as a quick check that the files were found.
print(train_answers)

['FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'DepartureTime', 'FindConnection', 'FindConnection', 'DepartureTime', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'FindConnection', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', 'DepartureTime', '

In [81]:
# Step back out of the dataset repository once the files are read, so that the
# relative model path "./bert-base-multilingual-cased" used below resolves from
# the project directory. Guarded by the same substring test as the chdir into
# the dataset, which keeps the cell safe to re-run.
if "NLU-datasets" in os.getcwd():
    os.chdir("..")

# Show the resulting working directory for verification.
print(os.getcwd())

C:\Users\Zver\ws\intent-detection


In [83]:
# define model and tokenizer
# Two alternative sources for the checkpoint are listed; the second assignment
# overwrites the first, so the local clone is the one that is actually loaded.
model_name = "bert-base-multilingual-cased" # loading from huggingface
model_name = "./bert-base-multilingual-cased" # loading from local path

# Both the tokenizer and the TensorFlow BERT encoder are loaded from `model_name`.
tokenizer = BertTokenizer.from_pretrained(model_name)
model_bert = TFBertModel.from_pretrained(model_name)

Some layers from the model checkpoint at ./bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at ./bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


# Testing word embeddings on a small example

In [84]:
# test the tokenizer
# Three hand-written example queries of different lengths.
multiple_lines = [
'i want to go marienplatz',
'when is the next train in muncher freiheit?',
'when does the next u-bahn leaves from garching forschungszentrum?'
]
# padding=True pads the shorter sentences (with id 0 in the recorded output) up
# to the longest one in the batch; return_tensors='tf' yields TensorFlow tensors.
ids_for_test = tokenizer(multiple_lines, padding=True, return_tensors='tf')
# Bare expression: shows input_ids, token_type_ids and attention_mask as the cell output.
ids_for_test

{'input_ids': <tf.Tensor: shape=(3, 19), dtype=int32, numpy=
array([[   101,    177,  21528,  10114,  11783,  24538,  10136,  20732,
           102,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0],
       [   101,  10841,  10124,  10105,  13451,  17767,  10106, 101833,
         13396,  42109,  15543,    136,    102,      0,      0,      0,
             0,      0,      0],
       [   101,  10841,  15107,  10105,  13451,    189,    118,  15688,
         15797,  24516,  10188,  47243,  41247,  10142,  12044,  10716,
         72100,    136,    102]])>, 'token_type_ids': <tf.Tensor: shape=(3, 19), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(3, 19), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [

In [85]:
# test the model
# Run the encoded example batch through BERT and keep the returned outputs
# for inspection in the next cell.
model_bert_output = model_bert(ids_for_test)

In [86]:
# List the named outputs BERT returned for the example batch.
print(model_bert_output.keys())

# Shape of the per-token embeddings. The recorded output is (3, 19, 768):
# 3 example sentences, 19 token positions after padding, and 768 features per
# token (presumably BERT's hidden size).
input_dimensions = model_bert_output['last_hidden_state'].shape
input_dimensions

odict_keys(['last_hidden_state', 'pooler_output'])


TensorShape([3, 19, 768])

In [9]:
# Intent names for the three example sentences. The list is kept for reference
# only: nothing in this cell reads it.
multiple_labels = [
 'FindConnection\n',
 'DepartureTime\n',
 'DepartureTime\n'
]
# The same three labels written by hand as integer class ids
# (presumably 0 = FindConnection, 1 = DepartureTime, matching the list above).
multiple_labels_for_training = tf.convert_to_tensor([0, 1, 1])
print(multiple_labels_for_training)

tf.Tensor([0 1 1], shape=(3,), dtype=int32)


# Classifier

In [14]:
# Sanity check: there must be exactly one answer label per English test question.
assert len(test_answers) == len(en_test)

In [71]:
# Number of sentences taken per batch and the fixed token length per sentence.
# create_model_one_layer() reads both values as notebook globals.
batch_size = 4
sentence_length = 20

# The first `batch_size` English test questions.
text = en_test[0:batch_size]
# padding='max_length' together with truncation=True pads or cuts every
# sentence to exactly `sentence_length` tokens, giving the batch a fixed shape.
encoded_input = tokenizer(text, padding='max_length', max_length=sentence_length, truncation=True, return_tensors='tf')
# Per-token BERT embeddings for the batch; the recorded shape is (4, 20, 768).
inputs = model_bert(encoded_input)["last_hidden_state"]

In [72]:
# Display the embedding tensor that will be fed to the classifier.
inputs

<tf.Tensor: shape=(4, 20, 768), dtype=float32, numpy=
array([[[-0.08018287,  0.03810783,  0.27046824, ...,  0.39124072,
         -0.11388035, -0.04561067],
        [-0.13535435, -0.6108816 ,  0.83754337, ...,  0.60649425,
         -0.3622976 ,  0.06997238],
        [-0.9340399 , -0.27897757,  0.34910378, ...,  1.0008135 ,
         -0.27053413,  0.39220405],
        ...,
        [-0.12981133, -0.3453001 ,  0.6919307 , ...,  0.47795454,
         -0.35215393,  0.29598516],
        [-0.20455204, -0.32787555,  0.87851584, ...,  0.687666  ,
         -0.5359596 ,  0.26696217],
        [-0.5047288 , -0.10888059,  0.11342178, ...,  0.54780823,
         -0.33753088,  0.34957582]],

       [[-0.06291284,  0.3134947 ,  0.05111048, ...,  0.65492725,
          0.06485228, -0.05769763],
        [-0.33032477,  0.5593583 ,  0.24920447, ...,  0.6845547 ,
          0.11658497, -0.09617171],
        [-0.27714643,  0.38922876, -0.20803127, ...,  0.6321562 ,
          0.16640294,  0.09471501],
        ...,


In [75]:
def create_model_one_layer(units, **kwargs):
    """Build the small classification head applied to the BERT token embeddings.

    The model takes one sample of shape (batch_size, sentence_length,
    embedding_dim), i.e. the whole batch of sentences is part of the sample
    shape. A Dense layer maps every token vector to `units` values, then a
    Conv1D whose kernel spans the full sentence ("valid" padding) collapses the
    token axis to a single position. Both layers use a softmax activation.

    Args:
        units: Number of output scores per sentence (one per class).
        **kwargs: Optional dimension overrides. Unknown keys are ignored.
            batch_size: Sentences per sample; defaults to the notebook-level
                `batch_size` variable.
            sentence_length: Tokens per sentence, also used as the Conv1D
                kernel size; defaults to the notebook-level `sentence_length`.
            embedding_dim: Features per token; defaults to 768.

    Returns:
        An uncompiled Sequential model.
    """
    # Look the globals up only when no override is given, so the function can
    # also be used before (or without) the notebook-level variables existing.
    n_sentences = kwargs["batch_size"] if "batch_size" in kwargs else batch_size
    n_tokens = kwargs["sentence_length"] if "sentence_length" in kwargs else sentence_length
    embedding_dim = kwargs.get("embedding_dim", 768)

    model = Sequential()
    model.add(tf.keras.Input(shape=(n_sentences, n_tokens, embedding_dim)))
    model.add(Dense(units, activation='softmax'))
    # Kernel size equals the sentence length, so exactly one output position remains.
    model.add(Conv1D(units, n_tokens, padding="valid", activation="softmax"))
    return model

def create_adam_optimizer(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, amsgrad=False):
    """Create an Adam optimizer from the given hyperparameters.

    Args:
        lr: Learning rate, passed to Adam as `learning_rate`.
        beta_1: Decay rate for the first-moment estimate.
        beta_2: Decay rate for the second-moment estimate.
        epsilon: Numerical-stability constant. None (the default) means
            "use Adam's own default" and is therefore not forwarded.
        amsgrad: Whether to use the AMSGrad variant.

    Returns:
        The configured Adam optimizer instance.
    """
    adam_kwargs = dict(learning_rate=lr, beta_1=beta_1, beta_2=beta_2, amsgrad=amsgrad)
    # Forward epsilon only when it was set explicitly: an optimizer that stores
    # the value as given would otherwise fail once None is used in arithmetic.
    if epsilon is not None:
        adam_kwargs["epsilon"] = epsilon
    return Adam(**adam_kwargs)

# Learning rate handed to Adam for training the classification head.
learning_rate = 0.03
optimizer = create_adam_optimizer(lr=learning_rate)
# create_model_one_layer() reads `batch_size` and `sentence_length` from the
# notebook globals, so the batch-encoding cell must have been run first.
classification_model = create_model_one_layer(units=2) # units = 2 because we want to get scores for two classes

In [74]:
# Configure training. from_logits=False because the model's last layer already
# applies a softmax, so its outputs are probabilities rather than raw logits.
classification_model.compile(optimizer=optimizer,
              #loss='categorical_crossentropy',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])

# expand dimensions
# The model's Input shape includes the batch of sentences, so a leading axis is
# added here: (4, 20, 768) -> (1, 4, 20, 768).
classifiaction_input = tf.expand_dims(inputs, axis=0)

# view the output of the classification_model
# probabilities for labels
# The recorded output has shape (1, 4, 1, 2): two scores for each of the four
# sentences. The model is still untrained at this point.
classification_model(classifiaction_input)

<tf.Tensor: shape=(1, 4, 1, 2), dtype=float32, numpy=
array([[[[0.21387655, 0.78612345]],

        [[0.1751532 , 0.8248468 ]],

        [[0.13610135, 0.86389863]],

        [[0.20202312, 0.7979769 ]]]], dtype=float32)>