In [1]:
import tensorflow as tf

# from bilstm_crf import build_bilstm
import numpy as np

# Load material

In [2]:
import json

# load embedding
# embedding_maxtrix = np.load('embedding/embedding_matrix.npy')

# load vocab
# with open('data/vocab.txt', 'r') as f:
#     vocab = f.read().split('\n')

# load tag_to_id
with open('data/tag_to_id.json', 'r') as f:
    tag_to_id = json.load((f))

# load train and dev data
TRAIN_PATH = 'data/span_detection_datasets_IOB/train.json'
DEV_PATH = 'data/span_detection_datasets_IOB/dev.json'

with open(TRAIN_PATH, 'r') as f:
    train_data = json.load(f)

with open(DEV_PATH, 'r') as f:
    dev_data = json.load(f)

train_sentences = list(train_data['text'].values())
dev_sentences = list(dev_data['text'].values())

train_labels = list(train_data['labels'].values())
dev_labels = list(dev_data['labels'].values())

# Convert data

## Sentences

In [32]:
from keras.layers import TextVectorization

tokenizer = TextVectorization(pad_to_max_tokens=True,
                              output_sequence_length=256,
                              output_mode='int',
                              max_tokens=10000)
tokenizer.adapt(train_sentences + dev_sentences)

In [45]:
# get vocab and save it to file vocab.txt
vocab = tokenizer.get_vocabulary()
with open('data/vocab.txt', 'w') as f:
    f.write('\n'.join(vocab))

In [46]:
# train_tokenized and dev_tokenized are numpy array with padding
train_tokenized = tokenizer(np.array([[s] for s in train_sentences])).numpy()
dev_tokenized = tokenizer(np.array([[s] for s in dev_sentences])).numpy()

## Labels

In [123]:
def convert_labels_to_ids(label, tag_to_id, max_len=256):
    ids = [int(tag_to_id[tag]) for tag in label]

    if len(ids) < max_len:
        ids += [int(tag_to_id['<PAD>'])] * (max_len - len(ids))
        
    return np.array(ids, dtype=np.int32)

train_labels_encoding = np.array([convert_labels_to_ids(label, tag_to_id) for label in train_labels], dtype=np.int32)
dev_labels_encoding = np.array([convert_labels_to_ids(label, tag_to_id) for label in dev_labels], dtype=np.int32)

# change label (batch_size, max_len) to (batch_size, max_len, 1)
# train_labels_encoding = np.expand_dims(train_labels_encoding, axis=-1)
# dev_labels_encoding = np.expand_dims(dev_labels_encoding, axis=-1)

In [126]:
# one hot encoding
train_labels_one_hot_encoding = tf.one_hot(train_labels_encoding, len(tag_to_id))
dev_labels_one_hot_encoding = tf.one_hot(dev_labels_encoding, len(tag_to_id))

## Loader

In [127]:
# create data loader tensorflow
BATCH_SIZE = 2
BUFFER_SIZE = 1000

train_dataset = tf.data.Dataset.from_tensor_slices((train_tokenized, train_labels_one_hot_encoding))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dev_dataset = tf.data.Dataset.from_tensor_slices((dev_tokenized, dev_labels_one_hot_encoding))
dev_dataset = dev_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Model

## Embedding model

In [None]:
# import fasttext

# # Load the pre-trained model
# embedding_model = fasttext.load_model('pretrained-weights/cc.vi.300.bin')

# vocabulary = tokenizer.get_vocabulary()
# vector_dim = embedding_model.get_dimension()

# embedding_matrix = np.zeros((len(vocabulary), vector_dim))
# for i, word in enumerate(vocabulary):
#         embedding_matrix[i] = embedding_model.get_word_vector(word)

# embedding_matrix_file = 'embedding/embedding_matrix.npy'

# np.save(embedding_matrix_file, embedding_matrix)

In [None]:
# load embedding
# embedding_maxtrix = np.load('embedding/embedding_matrix.npy')

In [None]:
# class BiLSTM_CRF(Model):
#     def __init__(self, vocab_size, max_len, n_tags, embedding_matrix=None, embedding_dim=None, unit='lstm', num_units=100, dropout=0.1, recurrent_dropout=0.1):
#         super(BiLSTM_CRF, self).__init__()

#         self.max_len = max_len

#         if embedding_matrix is not None and embedding_dim is not None:
#             raise ValueError('Cannot provide both an embedding matrix and an embedding dimension.')

#         if embedding_matrix is not None:
#             self.embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_matrix.shape[-1], input_length=max_len, mask_zero=True, weights=[embedding_matrix], trainable=False, name='embedding')
#         elif embedding_dim is not None:
#             self.embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, mask_zero=True, embeddings_initializer='uniform', name='embedding')
#         else:
#             raise ValueError('Must provide either an embedding matrix or an embedding dimension.')

#         if unit == 'lstm':
#             self.lstm = layers.Bidirectional(layers.LSTM(units=num_units, return_sequences=True, dropout=recurrent_dropout), name='bilstm')
#         elif unit == 'gru':
#             self.lstm = layers.Bidirectional(layers.GRU(units=num_units, return_sequences=True, dropout=recurrent_dropout), name='bigru')
#         elif unit == 'rnn':
#             self.lstm = layers.Bidirectional(layers.SimpleRNN(units=num_units, return_sequences=True, dropout=recurrent_dropout), name='birnn')
#         else:
#             raise ValueError('Invalid unit type. Must be one of lstm, gru, or rnn.')
        
#         self.dropout = layers.Dropout(dropout, name='dropout')
#         self.time_distributed = layers.TimeDistributed(layers.Dense(n_tags, activation="relu"), name='time_distributed')
        
#         self.crf = CRF(units=n_tags, name='crf')

#     def call(self, inputs, training=False):

#         x = self.embedding(inputs)
#         x = self.lstm(x)
#         x = self.dropout(x, training=training)
#         x = self.time_distributed(x)
#         decoded_sequence, potentials, sequence_length, chain_kernel = self.crf(x)

#         return potentials
    
#     def summary(self):
#         x = layers.Input(shape=(self.max_len,), name='input')
#         model = Model(inputs=[x], outputs=self.call(x))
#         return model.summary()        

## Span detection model

In [118]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_addons as tfa

from keras import layers
from keras import optimizers

from tensorflow_addons.layers import CRF

from keras.models import Model

In [119]:
def build_model_bilstm_crf(vocab_size, max_len, n_tags, embedding_matrix=None, embedding_dim=None, unit='lstm', num_units=100, dropout=0.1, recurrent_dropout=0.1):
    
    inputs = layers.Input(shape=(max_len,), name='input')
    
    if embedding_matrix is not None and embedding_dim is not None:
        raise ValueError('Cannot provide both an embedding matrix and an embedding dimension.')

    if embedding_matrix is not None:
        embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_matrix.shape[-1], input_length=max_len, mask_zero=True, weights=[embedding_matrix], trainable=False, name='embedding')
    elif embedding_dim is not None:
        embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, mask_zero=True, embeddings_initializer='uniform', name='embedding')
    else:
        raise ValueError('Must provide either an embedding matrix or an embedding dimension.')

    if unit == 'lstm':
        lstm = layers.Bidirectional(layers.LSTM(units=num_units, return_sequences=True, dropout=recurrent_dropout), name='bilstm')
    elif unit == 'gru':
        lstm = layers.Bidirectional(layers.GRU(units=num_units, return_sequences=True, dropout=recurrent_dropout), name='bigru')
    elif unit == 'rnn':
        lstm = layers.Bidirectional(layers.SimpleRNN(units=num_units, return_sequences=True, dropout=recurrent_dropout), name='birnn')
    else:
        raise ValueError('Invalid unit type. Must be one of lstm, gru, or rnn.')
        
    dropout_model = layers.Dropout(dropout, name='dropout')
    time_distributed = layers.TimeDistributed(layers.Dense(n_tags, activation="relu"), name='time_distributed')
        
    crf = CRF(units=n_tags, name='crf')

    x = embedding(inputs)
    x = lstm(x)
    x = dropout_model(x)
    x = time_distributed(x)
    decoded_sequence, potentials, sequence_length, chain_kernel = crf(x)

    model = Model([inputs], potentials, name='bilstm_crf')

    model.add_loss(tf.abs(tf.reduce_mean(chain_kernel)))

    model.summary()

    return model

In [128]:
# Hyperparameters
MAX_LEN = 256
VOCAB_SIZE = len(vocab)
TAG_SIZE = len(tag_to_id)
UNITS = 100
EMBEDDING_DIM = 300

# Build model
model = build_model_bilstm_crf(vocab_size=VOCAB_SIZE, max_len=MAX_LEN, n_tags=TAG_SIZE, embedding_dim=EMBEDDING_DIM, unit='lstm', num_units=UNITS, dropout=0.1, recurrent_dropout=0.1)

Model: "bilstm_crf"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 256)]             0         
                                                                 
 embedding (Embedding)       (None, 256, 300)          2420400   
                                                                 
 bilstm (Bidirectional)      (None, 256, 200)          320800    
                                                                 
 dropout (Dropout)           (None, 256, 200)          0         
                                                                 
 time_distributed (TimeDist  (None, 256, 22)           4422      
 ributed)                                                        
                                                                 
 crf (CRF)                   [(None, 256),             1034      
                              (None, 256, 22),          

In [129]:
# compile model
# loss = tf.keras.losses.CategoricalCrossentropy()
# loss = tf.keras.losses.SparseCategoricalCrossentropy()
loss = tfa.losses.SigmoidFocalCrossEntropy()

optimizer = tf.keras.optimizers.AdamW(learning_rate=0.01)

metric_acc = tf.keras.metrics.SparseCategoricalAccuracy()
# metric_f1 = tf.keras.metrics.F1Score(average='micro')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric_acc])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# train model
EPOCHS = 10
history = model.fit(train_dataset,
                    epochs=EPOCHS,
                    validation_data=dev_dataset,
                    callbacks=[early_stopping]
                    )

Epoch 1/10


ValueError: in user code:

    File "C:\Users\Hii\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Hii\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Hii\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Hii\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\training.py", line 1081, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\Hii\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\training.py", line 1139, in compute_loss
        return self.compiled_loss(
    File "C:\Users\Hii\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\compile_utils.py", line 316, in __call__
        total_total_loss_mean_value = tf.add_n(total_loss_mean_values)

    ValueError: Shapes must be equal rank, but are 2 and 0
    	From merging shape 0 with other shapes. for '{{node AddN}} = AddN[N=2, T=DT_FLOAT](sigmoid_focal_crossentropy/weighted_loss/Mul, bilstm_crf/tf.math.abs_6/Abs)' with input shapes: [2,256], [].


# Plot results

In [None]:
# plot loss and accuracy of train and dev in one figure
def plot_history(history):
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    axs[0].plot(history.history['loss'])
    axs[0].plot(history.history['val_loss'])
    axs[0].set_title('Model loss')
    axs[0].set_ylabel('Loss')
    axs[0].set_xlabel('Epoch')
    axs[0].legend(['Train', 'Val'], loc='upper right')

    axs[1].plot(history.history['categorical_accuracy'])
    axs[1].plot(history.history['val_categorical_accuracy'])
    axs[1].set_title('Model accuracy')
    axs[1].set_ylabel('Accuracy')
    axs[1].set_xlabel('Epoch')
    axs[1].legend(['Train', 'Val'], loc='upper right')

    plt.show()

plot_history(history)

In [None]:
# save model
model.save('model/span_detection_model.h5')

# End