In [1]:
import tensorflow as tf

# from bilstm_crf import build_bilstm
import numpy as np

# Load material

In [2]:
import json

# load embedding
# embedding_matrix = np.load('embedding/embedding_matrix.npy')

# load vocab
# with open('data/vocab.txt', 'r') as f:
#     vocab = f.read().split('\n')

# load tag_to_id
# JSON text is UTF-8; the encoding is passed explicitly so the files are
# never decoded with the platform's default locale encoding.
with open('data/tag_to_id.json', 'r', encoding='utf-8') as f:
    tag_to_id = json.load(f)

# load train and dev data
TRAIN_PATH = 'data/span_detection_datasets_IOB/train.json'
DEV_PATH = 'data/span_detection_datasets_IOB/dev.json'

with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open(DEV_PATH, 'r', encoding='utf-8') as f:
    dev_data = json.load(f)

# Each split holds a 'text' mapping and a 'labels' mapping; only the values
# are kept, in the order they appear in the file.
# NOTE(review): sentences and labels are paired purely by position, which
# assumes both mappings list their keys in the same order - confirm.
train_sentences = list(train_data['text'].values())
dev_sentences = list(dev_data['text'].values())

train_labels = list(train_data['labels'].values())
dev_labels = list(dev_data['labels'].values())

# Convert data

## Sentences

In [3]:
from keras.layers import TextVectorization

# Integer tokenizer: every sentence becomes a fixed-length row of 256 token
# ids drawn from a vocabulary capped at 10000 entries.
# NOTE(review): per the Keras docs, pad_to_max_tokens appears to apply only to
# the multi_hot / count / tf_idf output modes - confirm it is needed here.
tokenizer = TextVectorization(
    max_tokens=10000,
    output_mode='int',
    output_sequence_length=256,
    pad_to_max_tokens=True,
)

# The vocabulary is learned from the train and dev sentences together.
tokenizer.adapt(train_sentences + dev_sentences)

In [6]:
# get vocab and save it to file vocab.txt
vocab = tokenizer.get_vocabulary()

# The first two vocabulary entries are the tokenizer's reserved tokens. In
# 'int' output mode Keras assigns id 0 to padding and id 1 to out-of-vocabulary
# words, so '<PAD>' must come first for the line numbers in vocab.txt to match
# the token ids the tokenizer emits.
vocab = ['<PAD>', '<UNK>'] + vocab[2:]

# Explicit UTF-8 so non-ASCII tokens are written identically on every platform.
with open('data/vocab.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(vocab))

In [7]:
# train_tokenized and dev_tokenized are numpy arrays of padded token ids.
# Each sentence is wrapped in its own one-element list, so the tokenizer
# receives one string per row.
train_batch = np.array([[sentence] for sentence in train_sentences])
dev_batch = np.array([[sentence] for sentence in dev_sentences])

train_tokenized = tokenizer(train_batch).numpy()
dev_tokenized = tokenizer(dev_batch).numpy()

## Labels

In [8]:
# Convert labels to ids
# label = [tag, tag, ...] - every item is looked up directly in tag_to_id
def convert_labels_to_ids(label, tag_to_id, max_len=256):
    """Map a sequence of tags to a fixed-length array of integer tag ids.

    Args:
        label: iterable of tags; each item must be a key of ``tag_to_id``.
        tag_to_id: mapping from tag to an id (anything ``int()`` accepts).
            Must contain ``'<PAD>'`` whenever padding is required.
        max_len: exact length of the returned array.

    Returns:
        ``np.ndarray`` of dtype ``int32`` and length ``max_len``. Shorter
        inputs are right-padded with the ``'<PAD>'`` id; longer inputs are
        truncated so every row has the same length.

    Raises:
        KeyError: if a tag (or ``'<PAD>'`` when padding is needed) is missing
            from ``tag_to_id``.
    """
    ids = [int(tag_to_id[tag]) for tag in label]

    # Truncate over-long sequences; without this the rows would be ragged and
    # could not be stacked into a single (n_samples, max_len) array.
    ids = ids[:max_len]

    missing = max_len - len(ids)
    if missing > 0:
        # '<PAD>' is only looked up when padding is actually needed.
        ids += [int(tag_to_id['<PAD>'])] * missing

    return np.array(ids, dtype=np.int32)

# Encode every label sequence, then stack the rows into one int32 matrix per split.
train_label_rows = [convert_labels_to_ids(tags, tag_to_id) for tags in train_labels]
dev_label_rows = [convert_labels_to_ids(tags, tag_to_id) for tags in dev_labels]

train_labels_encoding = np.array(train_label_rows, dtype=np.int32)
dev_labels_encoding = np.array(dev_label_rows, dtype=np.int32)

In [9]:
# one hot encoding
train_labels_one_hot_encoding = tf.one_hot(train_labels_encoding, len(tag_to_id))
dev_labels_one_hotencoding = tf.one_hot(dev_labels_encoding, len(tag_to_id))

## Loader

In [10]:
# create data loader tensorflow
BATCH_SIZE = 4
BUFFER_SIZE = 1000

# Training data: shuffled, with the incomplete last batch dropped.
train_dataset = tf.data.Dataset.from_tensor_slices((train_tokenized, train_labels_encoding))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Validation data: no shuffling (order does not affect aggregated metrics) and
# the final partial batch is kept so every dev example is evaluated.
dev_dataset = tf.data.Dataset.from_tensor_slices((dev_tokenized, dev_labels_encoding))
dev_dataset = dev_dataset.batch(BATCH_SIZE)

# Model

In [11]:
import pickle
import operator
import re
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf


from keras import layers
from keras import optimizers

from tensorflow_addons.layers import CRF

from keras.models import Model, Sequential
from keras.metrics import F1Score


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [12]:
class BiLSTM_CRF(Model):
    """Sequence tagger: embedding -> bidirectional recurrent layer -> dropout
    -> per-timestep dense layer -> CRF.

    Args:
        vocab_size: number of rows of the embedding table.
        max_len: input sequence length; passed to the embedding layer and used
            by ``summary()`` to build its placeholder input.
        n_tags: number of tags (dense layer width and CRF units).
        embedding_matrix: optional pre-trained embedding weights; when given,
            the embedding layer is initialised from it and frozen.
        embedding_dim: embedding width for a trainable, uniformly initialised
            embedding. Exactly one of ``embedding_matrix`` / ``embedding_dim``
            must be provided.
        unit: recurrent cell type, one of ``'lstm'``, ``'gru'`` or ``'rnn'``.
        num_units: units per direction of the recurrent layer.
        dropout: rate of the dropout layer applied after the recurrent layer.
        recurrent_dropout: rate forwarded to the recurrent layer (see the note
            in ``__init__``).

    Raises:
        ValueError: if both or neither of the embedding arguments are given,
            or if ``unit`` is not a supported type.
    """

    def __init__(self, vocab_size, max_len, n_tags, embedding_matrix=None, embedding_dim=None, unit='lstm', num_units=100, dropout=0.1, recurrent_dropout=0.1):
        super(BiLSTM_CRF, self).__init__()

        # Remembered so summary() can build an input of the configured length.
        self.max_len = max_len

        if embedding_matrix is not None and embedding_dim is not None:
            raise ValueError('Cannot provide both an embedding matrix and an embedding dimension.')

        # mask_zero=True makes token id 0 a padding value for downstream layers.
        if embedding_matrix is not None:
            self.embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_matrix.shape[-1], input_length=max_len, mask_zero=True, weights=[embedding_matrix], trainable=False)
        elif embedding_dim is not None:
            self.embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, mask_zero=True, embeddings_initializer='uniform')
        else:
            raise ValueError('Must provide either an embedding matrix or an embedding dimension.')

        recurrent_layers = {'lstm': layers.LSTM, 'gru': layers.GRU, 'rnn': layers.SimpleRNN}
        if unit not in recurrent_layers:
            raise ValueError('Invalid unit type. Must be one of lstm, gru, or rnn.')

        # NOTE(review): the `recurrent_dropout` argument is passed as the
        # layer's `dropout=` keyword rather than `recurrent_dropout=`.
        # Behaviour is kept as-is; confirm which was intended.
        self.lstm = layers.Bidirectional(recurrent_layers[unit](units=num_units, return_sequences=True, dropout=recurrent_dropout))

        self.dropout = layers.Dropout(dropout)
        self.time_distributed = layers.TimeDistributed(layers.Dense(n_tags, activation="relu"))

        # use fully connected layer instead of CRF (batch_size, max_len) softmax
        # self.fc = layers.Dense(n_tags, activation="softmax")
        self.crf = CRF(units=n_tags)

    def call(self, inputs, training=False):
        """Run the forward pass and return the CRF layer's output unchanged.

        Args:
            inputs: batch of token-id sequences fed to the embedding layer.
            training: forwarded to the dropout layer.

        Returns:
            Whatever ``self.crf`` returns.
            NOTE(review): the TFA CRF layer appears to return several tensors
            rather than a single one - confirm before attaching a loss.
        """
        x = self.embedding(inputs)
        x = self.lstm(x)
        x = self.dropout(x, training=training)
        x = self.time_distributed(x)
        # x = self.fc(x)
        x = self.crf(x)

        return x

    def summary(self, **kwargs):
        """Print a layer-by-layer summary.

        A throwaway functional model is traced over a symbolic input of length
        ``max_len`` and its summary is printed. Keyword arguments are forwarded
        to that model's ``summary``.
        """
        x = layers.Input(shape=(self.max_len,))
        model = Model(inputs=[x], outputs=self.call(x))
        return model.summary(**kwargs)

In [13]:
# Hyperparameters
# Fixed settings
MAX_LEN = 256
UNITS = 100
EMBEDDING_DIM = 300

# Sizes derived from the vocabulary and tag mapping prepared above
VOCAB_SIZE = len(vocab)
TAG_SIZE = len(tag_to_id)

In [14]:
# Build the tagger with a trainable embedding of width EMBEDDING_DIM.
model = BiLSTM_CRF(
    vocab_size=VOCAB_SIZE,
    max_len=MAX_LEN,
    n_tags=TAG_SIZE,
    embedding_dim=EMBEDDING_DIM,
    num_units=UNITS,
)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 256)]             0         
                                                                 
 embedding (Embedding)       (None, 256, 300)          2420400   
                                                                 
 bidirectional (Bidirection  (None, 256, 200)          320800    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 256, 200)          0         
                                                                 
 time_distributed (TimeDist  (None, 256, 22)           4422      
 ributed)                                                        
                                                                 
 crf (CRF)                   [(None, 256),             1034  

In [None]:
# compile model
# NOTE(review): this cell has no execution count, i.e. it has not been run
# successfully in the saved notebook. Two things to check before relying on it:
#   1. train_dataset / dev_dataset yield integer tag ids (built from
#      *_labels_encoding), while CategoricalCrossentropy and CategoricalAccuracy
#      expect one-hot targets.
#   2. The model summary shows the CRF layer producing a list of outputs, so a
#      single loss cannot be applied to "the" model output as written. A CRF
#      model is normally trained on the CRF negative log-likelihood - confirm
#      against the TensorFlow Addons documentation.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
metric_acc = tf.keras.metrics.CategoricalAccuracy()
# metric_f1 = tf.keras.metrics.F1Score(average='micro')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric_acc])

# train model
# `history` is consumed by the plotting cell, which reads the 'loss',
# 'val_loss', 'categorical_accuracy' and 'val_categorical_accuracy' series.
EPOCHS = 10
history = model.fit(train_dataset, epochs=EPOCHS, validation_data=dev_dataset)

# Plot results

In [None]:
# plot loss and accuracy of train and dev in one figure
def plot_history(history, metric='categorical_accuracy'):
    """Plot training curves side by side: loss on the left, a metric on the right.

    Args:
        history: object whose ``history`` attribute maps series names to
            per-epoch values (as returned by ``model.fit``).
        metric: key of the metric series to draw in the right-hand panel. The
            matching validation series is looked up as ``'val_' + metric``.

    Raises:
        KeyError: if ``'loss'`` or ``metric`` is missing from the history.
    """
    series = history.history
    panels = [
        ('loss', 'Model loss', 'Loss'),
        (metric, 'Model accuracy', 'Accuracy'),
    ]

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    for ax, (key, title, ylabel) in zip(axs, panels):
        ax.plot(series[key], label='Train')

        # Validation curves exist only when fit() was given validation data.
        val_key = 'val_' + key
        if val_key in series:
            ax.plot(series[val_key], label='Val')

        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Epoch')
        ax.legend(loc='upper right')

    plt.show()

plot_history(history)

In [None]:
# save model
# BiLSTM_CRF is a subclassed Keras model. Keras cannot serialise subclassed
# models with Model.save() to the HDF5 (.h5) format, so only the weights are
# written. To restore, rebuild BiLSTM_CRF with the same hyperparameters, call
# it once on a batch to create the variables, then use load_weights().
model.save_weights('model/span_detection_model.h5')

# End