In [None]:
import os
from pathlib import Path
from typing import List

import tensorflow as tf
from keras.layers import (Dense, Conv1D)
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertModel

# Read the natural language understanding dataset and BERT model

Clone both repositories inside the `intent-detection` directory (cloning the model repository requires Git LFS to fetch the weight files):
```
git clone https://github.com/tilde-nlp/NLU-datasets.git
git clone https://huggingface.co/bert-base-multilingual-cased
```

The directory tree should then look as follows:
```
/intent-detection
├── NLU-datasets
├── bert-base-multilingual-cased
├── run-on-windows.ipynb
```

In [None]:
# Step into the dataset checkout so the relative "chatbot/..." paths used below resolve.
# The last path component is compared (instead of searching the whole cwd string) so that
# a parent folder whose name merely contains "NLU-datasets" cannot suppress the chdir,
# and re-running the cell stays idempotent.
if Path.cwd().name != "NLU-datasets":
    os.chdir("./NLU-datasets")

In [None]:
def get_data(path: str) -> List[str]:
    """Load a UTF-8 text file as a list of its lines.

    Each element is one line of the file with the trailing newline character
    removed. The encoding is given explicitly so that non-ASCII content
    (for example the Russian files) is decoded correctly.

    Example output: ['FindConnection', 'FindConnection', ..., 'FindConnection']
    """
    with open(path, encoding='utf-8') as handle:
        # Keep only the text in front of the first newline of every line.
        return [raw_line.partition('\n')[0] for raw_line in handle]

In [None]:
# Load the answer (intent label) files and the per-language question files.
# Paths are built with pathlib instead of being compared against hard-coded
# "chatbot\..." strings, so the cell works with any OS path separator and no
# longer relies on invalid escape sequences such as "\c", "\e" or "\l".
# A missing file now fails right here with FileNotFoundError instead of
# leaving the corresponding variable undefined until it is first used.
chatbot_dir = Path("chatbot")

# Intent labels, shared by all languages.
train_answers = get_data(str(chatbot_dir / "chatbot_train_ans.txt"))
test_answers = get_data(str(chatbot_dir / "chatbot_test_ans.txt"))

# Questions, one train/test pair per language directory.
en_train = get_data(str(chatbot_dir / "en" / "chatbot_train_q.txt"))
en_test = get_data(str(chatbot_dir / "en" / "chatbot_test_q.txt"))
lv_train = get_data(str(chatbot_dir / "lv" / "chatbot_train_q.txt"))
lv_test = get_data(str(chatbot_dir / "lv" / "chatbot_test_q.txt"))
ru_train = get_data(str(chatbot_dir / "ru" / "chatbot_train_q.txt"))
ru_test = get_data(str(chatbot_dir / "ru" / "chatbot_test_q.txt"))
et_train = get_data(str(chatbot_dir / "et" / "chatbot_train_q.txt"))
et_test = get_data(str(chatbot_dir / "et" / "chatbot_test_q.txt"))
lt_train = get_data(str(chatbot_dir / "lt" / "chatbot_train_q.txt"))
lt_test = get_data(str(chatbot_dir / "lt" / "chatbot_test_q.txt"))

# Show the size and a short preview rather than dumping the whole label list.
print(len(train_answers), train_answers[:5])

In [None]:
# Return to the project root so the local BERT checkout can be found by relative path.
# Only step up when the current directory itself is the dataset checkout; matching the
# substring anywhere in the cwd string would also trigger on unrelated parent folders.
if Path.cwd().name == "NLU-datasets":
    os.chdir("..")

print(os.getcwd())

In [None]:
# Tokenizer and encoder are loaded from the same checkpoint.
# The value below points at the local clone of the model repository; the Hugging Face
# hub id "bert-base-multilingual-cased" (without "./") can be used instead to load
# the checkpoint from huggingface.
model_name = "./bert-base-multilingual-cased"

tokenizer = BertTokenizer.from_pretrained(model_name)
model_bert = TFBertModel.from_pretrained(model_name)

# Testing word embeddings on a small example

In [None]:
# Tokenizer smoke test: three sentences of different length, padded to the longest
# one (padding=True) and returned as TensorFlow tensors.
multiple_lines = [
    'i want to go marienplatz',
    'when is the next train in muncher freiheit?',
    'when does the next u-bahn leaves from garching forschungszentrum?',
]
ids_for_test = tokenizer(
    multiple_lines,
    padding=True,
    return_tensors='tf',
)
ids_for_test

In [None]:
# Model smoke test: run the BERT encoder on the tokenized example sentences.
model_bert_output = model_bert(ids_for_test)

In [None]:
# List the entries the encoder returned.
print(model_bert_output.keys())

# Shape of the 'last_hidden_state' entry, i.e. the tensor later fed to the classifier.
input_dimensions = model_bert_output['last_hidden_state'].shape
input_dimensions

# Classifier

In [None]:
# Every English training question needs exactly one intent label; report both
# lengths on failure so a mismatch can be diagnosed without re-running cells.
assert len(train_answers) == len(en_train), (
    f"label/question count mismatch: {len(train_answers)} answers "
    f"vs {len(en_train)} English training questions"
)

In [None]:
def create_model_one_layer(units: int, batch_size: int, sentence_length: int, *, hidden_size: int = 768, **kwargs):
    """Build the (uncompiled) classification head that sits on top of the embeddings.

    The returned object is a Keras ``Sequential`` model, not a tensor. Its input
    is declared with shape ``(batch_size, sentence_length, hidden_size)`` *excluding*
    the implicit Keras batch axis, so a whole mini-batch is passed as one sample
    of shape ``(1, batch_size, sentence_length, hidden_size)``.

    Calling the model on such an input yields a tensor of shape
    ``(1, batch_size, 1, units)``,
    e.g. ``<tf.Tensor: shape=(1, 4, 1, 2), dtype=float32>``
    where 4 = batch_size and 2 = units.

    Args:
        units: Number of output scores per sentence (one per class).
        batch_size: Number of sentences contained in one input sample.
        sentence_length: Number of token positions per sentence.
        hidden_size: Size of each token embedding. Defaults to 768, the value
            that was previously hard-coded (the width of the embeddings used
            in this notebook).
        **kwargs: Accepted for call-site compatibility and ignored.

    Returns:
        The assembled ``Sequential`` model; the caller is responsible for
        compiling it.
    """
    model = Sequential()
    model.add(tf.keras.Input(shape=(batch_size, sentence_length, hidden_size)))
    # Per-token projection to `units` values with a softmax over the last axis.
    model.add(Dense(units, activation='softmax'))
    # A kernel spanning the whole sentence with "valid" padding collapses the
    # token axis to length 1, leaving one `units`-sized score vector per sentence.
    model.add(Conv1D(units, sentence_length, padding="valid", activation="softmax"))
    return model


def create_adam_optimizer(lr=0.001, beta_1=0.9, beta_2=0.999, decay=0, epsilon=None, amsgrad=False):
    """Create a legacy Keras Adam optimizer from the given hyper-parameters.

    ``lr`` is forwarded as ``learning_rate``; every other argument is passed
    through to ``tf.keras.optimizers.legacy.Adam`` under its own name.
    """
    adam_settings = dict(
        learning_rate=lr,
        beta_1=beta_1,
        beta_2=beta_2,
        epsilon=epsilon,
        decay=decay,
        amsgrad=amsgrad,
    )
    return tf.keras.optimizers.legacy.Adam(**adam_settings)


def encode_labels(answers: List) -> List:
    """One-hot encode intent labels.

    'FindConnection' is encoded as [[1, 0]].
    Every other value (in this dataset: 'DepartureTime') is encoded as [[0, 1]].

    Returns a list with one nested [[a, b]] entry per input answer, in order.
    """
    return [
        [[1, 0]] if answer == 'FindConnection' else [[0, 1]]
        for answer in answers
    ]


def expand_dimensions(y: List):
    """Convert nested label lists to a float tensor and prepend an axis of size 1."""
    return tf.expand_dims(tf.convert_to_tensor(y, dtype=float), axis=0)

## Test on small example

In [None]:
# Settings for the small example: a handful of sentences, each padded or
# truncated to a fixed number of tokens.
batch_size = 4
sentence_length = 20

# Embed the first `batch_size` English training questions with BERT.
text = en_train[:batch_size]
encoded_input = tokenizer(
    text,
    padding='max_length',
    max_length=sentence_length,
    truncation=True,
    return_tensors='tf',
)
inputs = model_bert(encoded_input)["last_hidden_state"]
inputs

In [None]:
learning_rate = 0.03
optimizer = create_adam_optimizer(lr=learning_rate)

# units = 2 because we want one score for each of the two classes
classification_model = create_model_one_layer(
    units=2,
    batch_size=batch_size,
    sentence_length=sentence_length,
)
classification_model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Why the extra axis: the model's Input is declared as
# (batch_size, sentence_length, 768) without the implicit Keras batch axis, so the
# whole mini-batch must be wrapped as a single sample with a leading axis of size 1.
classification_input = tf.expand_dims(inputs, axis=0)

# Output of the still untrained model: probabilities for the labels.
classification_model(classification_input)

In [None]:
# One-hot encode all training answers, then keep the labels of the first batch
# and give them the same leading axis as the model input.
labels = encode_labels(train_answers)
labels_expanded = expand_dimensions(labels[:batch_size])

labels_expanded

In [None]:
# Fit the classifier on the single example batch for a few epochs.
number_of_epochs = 5
classification_model.fit(classification_input, y=labels_expanded, epochs=number_of_epochs)

In [None]:
# Run the model on the same batch again to inspect its output after fitting.
classification_model(classification_input)

## Run on all inputs

In [None]:
# Hyper-parameters for the run over the whole training set.
batch_size = 32
sentence_length = 20
learning_rate = 0.03
number_of_epochs = 5

# Fresh optimizer and model, so nothing learned on the small example carries over.
optimizer = create_adam_optimizer(lr=learning_rate)
classification_model = create_model_one_layer(
    units=2,
    batch_size=batch_size,
    sentence_length=sentence_length,
)
classification_model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# The one-hot labels are the same for every batch, so encode them once up front
# instead of re-encoding the full answer list on each iteration.
labels = encode_labels(train_answers)

# Integer division: only complete batches are used, because the model's Input shape
# is fixed to `batch_size` sentences; trailing samples that do not fill a batch are skipped.
for index in range(len(train_answers) // batch_size):
    batch_slice = slice(index * batch_size, (index + 1) * batch_size)

    # Embed this batch of English questions and wrap it as a single model sample.
    text = en_train[batch_slice]
    encoded_input = tokenizer(text, padding='max_length', max_length=sentence_length, truncation=True, return_tensors='tf')
    classification_input = tf.expand_dims(model_bert(encoded_input)["last_hidden_state"], axis=0)

    labels_expanded = expand_dimensions(labels[batch_slice])
    # NOTE(review): each batch is fitted for `number_of_epochs` epochs before the next
    # batch is seen, rather than iterating epochs over the whole dataset — confirm
    # this schedule is intended.
    classification_model.fit(classification_input, y=labels_expanded, epochs=number_of_epochs)