In [None]:
import os
from typing import List

import matplotlib.pyplot as plt
import tensorflow as tf
from keras.layers import (Dense, Conv1D)
from keras.models import Sequential
from transformers import BertTokenizer, TFBertModel

# Read the natural language understanding dataset and BERT model

Clone the repositories inside the `intent-detection` directory:
```
git clone https://github.com/tilde-nlp/NLU-datasets.git
git clone https://huggingface.co/bert-base-multilingual-cased
```

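The directory tree should look as follows (the notebook also reads pre-translated files from a `machine-translated-datasets` directory, relative to the working directory):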
```
/intent-detection
├── NLU-datasets
├── bert-base-multilingual-cased
├── run-on-windows.ipynb
```

In [None]:
def read_file(path: str) -> List[str]:
    """Return the lines of the text file at ``path`` as a list of strings.

    Each element is one line with its trailing newline removed. The file is
    opened as UTF-8 explicitly so that non-ASCII content (e.g. Russian) is
    read correctly.
    Example output: ['FindConnection', 'FindConnection', ..., 'FindConnection']
    """
    with open(path, encoding='utf-8') as handle:
        # Keep only the part of each line before the newline character.
        return [raw_line.split('\n')[0] for raw_line in handle]


def get_source_text(dataset_type: str, source_language: str = None, labels: bool = False, machine_translated: bool = False) -> List[str]:
    """Read one dataset file and return its lines (wrapper around read_file).

    Prompts in all languages are in the same order, so every language shares
    the same label files. Be careful with ``labels``: ``labels=True`` returns
    the labels regardless of ``source_language`` and ``machine_translated``.

    All paths are relative to the current working directory and are built with
    ``os.path.join`` so they resolve on every operating system.

    Usage examples:
        prompts: get_source_text("test", "et")
        labels: get_source_text("test", labels=True)
        machine-translated prompts: get_source_text("test", "et", machine_translated=True)

    :param dataset_type: "test" or "train"
    :param source_language: "en", "lv", "ru", "et", "lt"; required unless labels=True
    :param labels: does the file being read contain labels
    :param machine_translated: read the prompts pre-translated to English from
        the machine-translated-datasets directory instead of the originals
    :return: array of file contents for specified file
    :raises ValueError: if prompts are requested without a source_language
    """
    if labels:
        path = os.path.join("NLU-datasets", "chatbot", f"chatbot_{dataset_type}_ans.txt")
    else:
        if source_language is None:
            # Fail early instead of looking for a file or directory literally named "None".
            raise ValueError("source_language is required when labels=False")
        if machine_translated:
            path = os.path.join("machine-translated-datasets", f"{source_language}_{dataset_type}.txt")
        else:
            path = os.path.join("NLU-datasets", "chatbot", source_language, f"chatbot_{dataset_type}_q.txt")
    return read_file(path)

In [None]:
# Load the NLU-datasets in their original source languages:
# first every test file, then every train file, one list of prompts per language.

en_test, lv_test, ru_test, et_test, lt_test = (
    get_source_text("test", language) for language in ("en", "lv", "ru", "et", "lt")
)

en_train, lv_train, ru_train, et_train, lt_train = (
    get_source_text("train", language) for language in ("en", "lv", "ru", "et", "lt")
)

# The label files are shared by all languages, so they are read only once.
train_answers = get_source_text(dataset_type="train", labels=True)
test_answers = get_source_text(dataset_type="test", labels=True)

In [None]:
# Every language reuses the same label files, so each prompt list must line up
# one-to-one with its labels. Check all languages and both splits, not only the
# English train set, and report which dataset is off.
train_length_mismatches = {
    name: len(prompts)
    for name, prompts in {
        "en_train": en_train, "lv_train": lv_train, "ru_train": ru_train,
        "et_train": et_train, "lt_train": lt_train,
    }.items()
    if len(prompts) != len(train_answers)
}
assert not train_length_mismatches, (
    f"train prompts do not match the {len(train_answers)} train labels: {train_length_mismatches}"
)

test_length_mismatches = {
    name: len(prompts)
    for name, prompts in {
        "en_test": en_test, "lv_test": lv_test, "ru_test": ru_test,
        "et_test": et_test, "lt_test": lt_test,
    }.items()
    if len(prompts) != len(test_answers)
}
assert not test_length_mismatches, (
    f"test prompts do not match the {len(test_answers)} test labels: {test_length_mismatches}"
)

In [None]:
# Load the non-English NLU-datasets that were machine-translated to English
# beforehand: first every test file, then every train file.

lv_test_en, ru_test_en, et_test_en, lt_test_en = (
    get_source_text("test", language, machine_translated=True)
    for language in ("lv", "ru", "et", "lt")
)

lv_train_en, ru_train_en, et_train_en, lt_train_en = (
    get_source_text("train", language, machine_translated=True)
    for language in ("lv", "ru", "et", "lt")
)

In [None]:
# Show the first Latvian test prompt next to its machine translation.
print(lv_test[0], lv_test_en[0], sep="\n")

# Definitions
## Model and tokenizer

In [None]:
# Two ways to obtain the checkpoint; the local clone is the one actually used.
hub_model_name = "bert-base-multilingual-cased"  # loading from huggingface
local_model_path = "./bert-base-multilingual-cased"  # loading from local path
model_name = local_model_path

tokenizer = BertTokenizer.from_pretrained(model_name)
model_bert = TFBertModel.from_pretrained(model_name)

## Labels

In [None]:
# use keras.to_categorical() instead
def encode_labels(answers: List) -> List:
    """One-hot encode intent labels.

    'FindConnection' becomes [[1, 0]]; every other label (in this dataset:
    'DepartureTime') becomes [[0, 1]]. The result has one entry per answer,
    in the same order.
    """
    return [
        [[1, 0]] if answer == 'FindConnection' else [[0, 1]]
        for answer in answers
    ]

In [None]:
# One-hot encode the shared train/test labels and turn them into tensors.
encoded_train_labels = tf.convert_to_tensor(encode_labels(train_answers))

encoded_test_labels = tf.convert_to_tensor(encode_labels(test_answers))

## Training

In [None]:
def create_model_one_layer(sentence_length: int, units: int = 2, hidden_size: int = 768):
    """Build the (uncompiled) Keras classification head.

    The model takes inputs of shape (sentence_length, hidden_size) per sample.
    The Conv1D kernel spans the whole sentence with "valid" padding, so the
    sequence axis collapses to length 1 and the model output has shape
    (batch, 1, units), e.g. <tf.Tensor: shape=(1, 1, 2), dtype=float32>
    for a single sentence and units = 2.
    """
    model = Sequential()
    layer_stack = (
        tf.keras.Input(shape=(sentence_length, hidden_size)),
        Dense(units, activation='softmax'),
        Conv1D(units, sentence_length, padding="valid", activation="softmax"),
        Dense(units, activation='softmax'),
    )
    for layer in layer_stack:
        model.add(layer)
    return model


def create_adam_optimizer(lr=0.001, beta_1=0.9, beta_2=0.999, decay=0, epsilon=None, amsgrad=False):
    """Create a legacy Keras Adam optimizer; ``lr`` is passed on as ``learning_rate``."""
    # TODO: Replace legacy optimizer with current version of Adam
    adam_settings = dict(
        learning_rate=lr,
        beta_1=beta_1,
        beta_2=beta_2,
        epsilon=epsilon,
        decay=decay,
        amsgrad=amsgrad,
    )
    return tf.keras.optimizers.legacy.Adam(**adam_settings)



def get_classification_model(learning_rate: float, sentence_length: int):
    """Build the classification head and compile it for training.

    :param learning_rate: Adam learning rate (a float such as 0.0003)
    :param sentence_length: number of tokens per sentence the model expects
    :return: model from create_model_one_layer, compiled with an Adam
        optimizer, categorical cross-entropy loss and the accuracy metric
    """
    classification_model = create_model_one_layer(sentence_length=sentence_length)
    classification_model.compile(
        optimizer=create_adam_optimizer(lr=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classification_model


def plot_performance(data, dataset: str, x_label: str = 'accuracy'):
    """Plot one training metric per epoch, save it as a PNG and display it.

    :param data: sequence of metric values, one per epoch
    :param dataset: dataset name, used in the title and the file name
    :param x_label: name of the plotted metric; despite the parameter name it
        labels the y axis (the x axis is always 'epochs')

    The figure is written to "{dataset}-{x_label}.png" in the working directory.
    """
    # Use a dedicated figure instead of pyplot's implicit current one, so that
    # consecutive calls never draw onto each other's axes.
    fig, ax = plt.subplots()
    ax.plot(data)
    ax.set_xlabel('epochs')
    ax.set_ylabel(x_label)
    ax.set_title(f"{dataset} model {x_label}")
    fig.savefig(f"{dataset}-{x_label}.png")
    # fig.savefig(f"{dataset}{x_label}.pdf", dpi=150) # pdf for LaTeX
    plt.show()
    # Release the figure; not every backend closes it on show().
    plt.close(fig)


def training(train_dataset, dataset_name: str, learning_rate: float, sentence_length: int, labels, batch_size: int = None, number_of_epochs: int = None):
    """Train a fresh classification model on BERT embeddings of ``train_dataset``.

    The sentences are tokenized (padded/truncated to ``sentence_length``),
    passed through ``model_bert`` and its ``last_hidden_state`` is used as the
    input of a newly created classification model. The accuracy and loss
    curves are plotted and saved via plot_performance.

    :param train_dataset: list of sentences to train on
    :param dataset_name: name used in plot titles and file names
    :param learning_rate: Adam learning rate
    :param sentence_length: number of tokens per sentence
    :param labels: one-hot encoded labels, aligned with ``train_dataset``
    :param batch_size: batch size for ``fit``; defaults to the notebook-level
        ``batch_size`` variable
    :param number_of_epochs: epochs for ``fit``; defaults to the notebook-level
        ``number_of_epochs`` variable
    :return: the trained classification model
    """
    # Backward compatibility: existing calls rely on the notebook-level
    # hyperparameters, so fall back to them when no explicit value is given.
    if batch_size is None:
        batch_size = globals()["batch_size"]
    if number_of_epochs is None:
        number_of_epochs = globals()["number_of_epochs"]

    classification_model = get_classification_model(learning_rate, sentence_length)

    encoded_input = tokenizer(train_dataset, padding='max_length', max_length=sentence_length, truncation=True, return_tensors='tf')
    classification_input = model_bert(encoded_input)["last_hidden_state"]

    history = classification_model.fit(classification_input, y=labels, batch_size=batch_size, epochs=number_of_epochs)

    plot_performance(history.history['accuracy'], dataset=dataset_name, x_label='accuracy')
    plot_performance(history.history['loss'], dataset=dataset_name, x_label='loss')

    return classification_model

## Test

In [None]:
def test_classification_model(classification_model, en_test, encoded_test_labels, sentence_length: int = None, batch_size: int = None):
    """Evaluate a trained classification model and print its loss and accuracy.

    :param classification_model: compiled model returned by training()
    :param en_test: list of test sentences (any language, despite the name)
    :param encoded_test_labels: one-hot encoded labels aligned with ``en_test``
    :param sentence_length: number of tokens per sentence; defaults to the
        notebook-level ``sentence_length`` variable
    :param batch_size: batch size for ``evaluate``; defaults to the
        notebook-level ``batch_size`` variable
    """
    # Backward compatibility: existing calls rely on the notebook-level
    # hyperparameters, so fall back to them when no explicit value is given.
    if sentence_length is None:
        sentence_length = globals()["sentence_length"]
    if batch_size is None:
        batch_size = globals()["batch_size"]

    encoded_input = tokenizer(en_test, padding='max_length', max_length=sentence_length, truncation=True, return_tensors='tf')
    classification_input = model_bert(encoded_input)["last_hidden_state"]

    test_loss, test_accuracy = classification_model.evaluate(classification_input, encoded_test_labels, batch_size=batch_size)
    print('Test Loss: {}'.format(test_loss))
    print('Test Accuracy: {}'.format(test_accuracy))

# A small example
## Sentence -> word embedding

In [None]:
# Small example: tokenize the first few English training sentences.
batch_size, sentence_length = 4, 20

text = en_train[:batch_size]
encoded_input = tokenizer(
    text,
    padding='max_length',
    max_length=sentence_length,
    truncation=True,
    return_tensors='tf',
)
encoded_input

In [None]:
# The BERT output has the keys 'last_hidden_state' and 'pooler_output';
# the per-token hidden states are used as classifier input.
bert_output = model_bert(encoded_input)
inputs = bert_output["last_hidden_state"]
inputs.shape

## Word embedding -> classification

In [None]:
# Build and compile a small classification model for the example batch.
optimizer = create_adam_optimizer(lr=0.03)
classification_model = create_model_one_layer(sentence_length=sentence_length)

compile_settings = dict(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
classification_model.compile(**compile_settings)

# initial probabilities (before any training)
classification_model(inputs)

In [None]:
# Fit the example model on the labels that belong to the example sentences.
classification_model.fit(
    inputs,
    y=encoded_train_labels[:batch_size],
    epochs=5,
)

In [None]:
# View the output of classification_model after fitting: the predicted
# probabilities for the two labels, one row per example sentence.

classification_model(inputs)

# Methods

## Hyperparameters

In [None]:
# Hyperparameters shared by all experiments below.
# NOTE: batch_size and sentence_length overwrite the values set in the small example above.
batch_size = 25  # read as a global by training() and test_classification_model()
sentence_length = 20  # tokens per sentence (padding / truncation length)
learning_rate = 0.0003  # passed to training() for the Adam optimizer
number_of_epochs = 150  # read as a global by training()

## Each language has its own model
### Using the original NLU-datasets
#### Train

In [None]:
# Train a model on the original English prompts.
classification_model_en = training(
    train_dataset=en_train,
    dataset_name="en_train",
    learning_rate=learning_rate,
    sentence_length=sentence_length,
    labels=encoded_train_labels,
)

In [None]:
# Train a model on the original Latvian prompts.
classification_model_lv = training(
    train_dataset=lv_train,
    dataset_name="lv_train",
    learning_rate=learning_rate,
    sentence_length=sentence_length,
    labels=encoded_train_labels,
)

In [None]:
# Train a model on the original Russian prompts.
classification_model_ru = training(
    train_dataset=ru_train,
    dataset_name="ru_train",
    learning_rate=learning_rate,
    sentence_length=sentence_length,
    labels=encoded_train_labels,
)

In [None]:
# Train a model on the original Estonian prompts.
classification_model_et = training(
    train_dataset=et_train,
    dataset_name="et_train",
    learning_rate=learning_rate,
    sentence_length=sentence_length,
    labels=encoded_train_labels,
)

In [None]:
# Train a model on the original Lithuanian prompts.
classification_model_lt = training(
    train_dataset=lt_train,
    dataset_name="lt_train",
    learning_rate=learning_rate,
    sentence_length=sentence_length,
    labels=encoded_train_labels,
)

#### Test

In [None]:
# Evaluate the English model on the English test prompts.
test_classification_model(
    classification_model_en,
    en_test,
    encoded_test_labels,
)

In [None]:
# Evaluate the Latvian model on the Latvian test prompts.
test_classification_model(
    classification_model_lv,
    lv_test,
    encoded_test_labels,
)

In [None]:
# Evaluate the Russian model on the Russian test prompts.
test_classification_model(
    classification_model_ru,
    ru_test,
    encoded_test_labels,
)

In [None]:
# Evaluate the Estonian model on the Estonian test prompts.
test_classification_model(
    classification_model_et,
    et_test,
    encoded_test_labels,
)

In [None]:
# Evaluate the Lithuanian model on the Lithuanian test prompts.
test_classification_model(
    classification_model_lt,
    lt_test,
    encoded_test_labels,
)

### Using machine translated non-English datasets
#### Train

In [None]:
# Train a model on the Latvian prompts machine-translated to English.
classification_model_lv_en = training(
    train_dataset=lv_train_en,
    dataset_name="lv_train_en",
    learning_rate=learning_rate,
    sentence_length=sentence_length,
    labels=encoded_train_labels,
)

In [None]:
# Train a model on the Russian prompts machine-translated to English.
classification_model_ru_en = training(
    train_dataset=ru_train_en,
    dataset_name="ru_train_en",
    learning_rate=learning_rate,
    sentence_length=sentence_length,
    labels=encoded_train_labels,
)

In [None]:
# Train a model on the Estonian prompts machine-translated to English.
classification_model_et_en = training(
    train_dataset=et_train_en,
    dataset_name="et_train_en",
    learning_rate=learning_rate,
    sentence_length=sentence_length,
    labels=encoded_train_labels,
)

In [None]:
# Train a model on the Lithuanian prompts machine-translated to English.
classification_model_lt_en = training(
    train_dataset=lt_train_en,
    dataset_name="lt_train_en",
    learning_rate=learning_rate,
    sentence_length=sentence_length,
    labels=encoded_train_labels,
)

#### Test

In [None]:
# Evaluate the machine-translated Latvian model on the machine-translated Latvian test prompts.
test_classification_model(
    classification_model_lv_en,
    lv_test_en,
    encoded_test_labels,
)

In [None]:
# Evaluate the machine-translated Russian model on the machine-translated Russian test prompts.
test_classification_model(
    classification_model_ru_en,
    ru_test_en,
    encoded_test_labels,
)

In [None]:
# Evaluate the machine-translated Estonian model on the machine-translated Estonian test prompts.
test_classification_model(
    classification_model_et_en,
    et_test_en,
    encoded_test_labels,
)

In [None]:
# Evaluate the machine-translated Lithuanian model on the machine-translated Lithuanian test prompts.
test_classification_model(
    classification_model_lt_en,
    lt_test_en,
    encoded_test_labels,
)

## One model trained on all languages

In [None]:
# one big train label dataset: the shared train labels repeated 5 times,
# once for each of the five language datasets that get concatenated
all_train_labels = train_answers * 5

In [None]:
# one big test label dataset: the shared test labels repeated 5 times,
# once for each of the five language datasets that get concatenated
all_test_labels = test_answers * 5

In [None]:
# One-hot encode the concatenated labels and turn them into tensors.
all_train_labels = tf.convert_to_tensor(encode_labels(all_train_labels))

all_test_labels = tf.convert_to_tensor(encode_labels(all_test_labels))

### Using the original NLU-datasets

In [None]:
# one big training dataset: all five languages, concatenated in a fixed order
all_train = [*en_train, *lv_train, *ru_train, *et_train, *lt_train]

In [None]:
# one big test dataset: all five languages, concatenated in a fixed order
all_test = [*en_test, *lv_test, *ru_test, *et_test, *lt_test]

In [None]:
# Train a single model on the prompts of all five languages.
classification_model_all = training(
    train_dataset=all_train,
    dataset_name="all_train",
    learning_rate=learning_rate,
    sentence_length=sentence_length,
    labels=all_train_labels,
)

In [None]:
# Evaluate the all-languages model on the concatenated test prompts.
test_classification_model(
    classification_model_all,
    all_test,
    all_test_labels,
)

### Using machine translated non-English datasets

In [None]:
# one big training dataset: English plus the four machine-translated datasets
all_train_en = [*en_train, *lv_train_en, *ru_train_en, *et_train_en, *lt_train_en]

In [None]:
# one big test dataset: English plus the four machine-translated datasets
all_test_en = [*en_test, *lv_test_en, *ru_test_en, *et_test_en, *lt_test_en]

In [None]:
# Train a single model on English plus the machine-translated prompts.
# NOTE: this rebinds classification_model_all from the original-language experiment.
classification_model_all = training(
    train_dataset=all_train_en,
    dataset_name="all_train_en",
    learning_rate=learning_rate,
    sentence_length=sentence_length,
    labels=all_train_labels,
)

In [None]:
# Evaluate the model on the concatenated English and machine-translated test prompts.
test_classification_model(
    classification_model_all,
    all_test_en,
    all_test_labels,
)

## Trained only on English data

In [None]:
# Train (again) a model on the English prompts only; it is evaluated on the other languages below.
classification_model_en = training(
    train_dataset=en_train,
    dataset_name="en_train",
    learning_rate=learning_rate,
    sentence_length=sentence_length,
    labels=encoded_train_labels,
)

### Test on non-English data

In [None]:
# Evaluate the English-only model on the original Latvian test prompts.
test_classification_model(
    classification_model_en,
    lv_test,
    encoded_test_labels,
)

In [None]:
# Evaluate the English-only model on the original Russian test prompts.
test_classification_model(
    classification_model_en,
    ru_test,
    encoded_test_labels,
)

In [None]:
# Evaluate the English-only model on the original Estonian test prompts.
test_classification_model(
    classification_model_en,
    et_test,
    encoded_test_labels,
)

In [None]:
# Evaluate the English-only model on the original Lithuanian test prompts.
test_classification_model(
    classification_model_en,
    lt_test,
    encoded_test_labels,
)

### Test on non-English data machine-translated to English

In [None]:
# Evaluate the English-only model on the machine-translated Latvian test prompts.
test_classification_model(
    classification_model_en,
    lv_test_en,
    encoded_test_labels,
)

In [None]:
# Evaluate the English-only model on the machine-translated Russian test prompts.
test_classification_model(
    classification_model_en,
    ru_test_en,
    encoded_test_labels,
)

In [None]:
# Evaluate the English-only model on the machine-translated Estonian test prompts.
test_classification_model(
    classification_model_en,
    et_test_en,
    encoded_test_labels,
)

In [None]:
# Evaluate the English-only model on the machine-translated Lithuanian test prompts.
test_classification_model(
    classification_model_en,
    lt_test_en,
    encoded_test_labels,
)