<a id="Bi-LSTM CRF"></a>

# Bi-LSTM CRF Model

We will leverage a Bi-LSTM CRF model as the baseline tagger.

This notebook explores the construction of the tagger.

In [2]:
from pathlib import Path
import pandas as pd

In [3]:
ROOT_DIR = Path('notebooks/eda.ipynb').resolve().parents[2]
DATA_DIR = ROOT_DIR / "data"
PREPARED_DIR = DATA_DIR / "prepared"

In [4]:
from ast import literal_eval


df = pd.read_csv(PREPARED_DIR / "master.csv")
df["tags"] = df["tags"].apply(literal_eval)
df["single_tag"] = df["tags"].apply(lambda x: x[0])
df.head()

Unnamed: 0,sentence_num,word,start_idx,end_idx,tags,single_tag
0,0,This,0,4,[B-Temporal],B-Temporal
1,0,week,5,9,[I-Temporal],I-Temporal
2,0,sees,10,14,[O],O
3,0,the,15,18,[O],O
4,0,start,19,24,[O],O


In [5]:
words = set(list(df['word'].values))
words.add('PADword')
n_words = len(words)
print(f"There are {n_words} unique words")

There are 4332 unique words


In [6]:
tags = list(set(df["single_tag"].values))
n_tags = len(tags)
print(f"There are {n_tags} unique tags")

There are 29 unique tags


In [7]:
class SentenceGetter:
    """Iterator to get a sentence sequence and its BIO tags"""
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["word"].values.tolist(),s["single_tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence_num").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s
        except:
            return None

In [8]:
getter = SentenceGetter(df)
sentences = getter.sentences
print(f"There are {len(sentences)} total sentence")

There are 1013 total sentence


In [9]:
# Make indices for ML modeling
words2index = {w:i for i,w in enumerate(words)}
tags2index = {t:i for i,t in enumerate(tags)}

In [10]:
from tensorflow.keras.utils import pad_sequences

max_len = 50
y = [[tags2index[w[1]] for w in s] for s in sentences]

y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tags2index["O"])

In [11]:
y.shape

(1013, 50)

In [12]:
max_len = 50
X = [[words2index[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words-1)

In [13]:
import tensorflow as tf

x_tensor = tf.convert_to_tensor(X)
y_tensor = tf.convert_to_tensor(y)

ds = tf.data.Dataset.from_tensor_slices((x_tensor, y_tensor))

BATCH_SIZE = 64

2022-11-11 07:45:59.228034: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: system has unsupported display driver / cuda driver combination
2022-11-11 07:45:59.228082: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: enyquist-X399-DESIGNARE-EX
2022-11-11 07:45:59.228092: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: enyquist-X399-DESIGNARE-EX
2022-11-11 07:45:59.228256: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 510.85.2
2022-11-11 07:45:59.228287: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 520.61.5
2022-11-11 07:45:59.228296: E tensorflow/stream_executor/cuda/cuda_diagnostics.cc:313] kernel version 520.61.5 does not match DSO version 510.85.2 -- cannot find working devices in this configuration
2022-11-11 07:45:59.229325: I tensorflow/core/platform/cpu_f

In [14]:
def get_dataset_partitions_tf(
    ds, 
    ds_size, 
    train_split=0.8, 
    val_split=0.1, 
    test_split=0.1, 
    shuffle=True, 
    shuffle_size=1000
):
    assert (train_split + test_split + val_split) == 1

    if shuffle:
        ds = ds.shuffle(shuffle_size, seed=42)

    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)
    
    train_ds = ds.take(train_size)    
    val_ds = ds.skip(train_size).take(val_size)
    test_ds = ds.skip(train_size).skip(val_size)
    
    return train_ds.batch(BATCH_SIZE), val_ds.batch(BATCH_SIZE), test_ds.batch(BATCH_SIZE)

In [15]:
train_ds, val_ds, test_ds = get_dataset_partitions_tf(
    ds=ds,
    ds_size=X.shape[0],
)

In [16]:
import io
from pathlib import Path
import numpy as np
import tensorflow as tf
import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Input, Dropout
from tensorflow.keras.optimizers import Adam

In [17]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
from tensorflow_addons.text import crf_log_likelihood, crf_decode

"""
Credit:
https://github.com/ngoquanghuy99/POS-Tagging-BiLSTM-CRF
"""

class CRF(L.Layer):
    def __init__(self,
                 output_dim,
                 sparse_target=True,
                 **kwargs):
        """    
        Args:
            output_dim (int): the number of labels to tag each temporal input.
            sparse_target (bool): whether the the ground-truth label represented in one-hot.
        Input shape:
            (batch_size, sentence length, output_dim)
        Output shape:
            (batch_size, sentence length, output_dim)
        """
        super(CRF, self).__init__(**kwargs)
        self.output_dim = int(output_dim) 
        self.sparse_target = sparse_target
        self.input_spec = L.InputSpec(min_ndim=3)
        self.supports_masking = False
        self.sequence_lengths = None
        self.transitions = None

    def build(self, input_shape):
        assert len(input_shape) == 3
        f_shape = tf.TensorShape(input_shape)
        input_spec = L.InputSpec(min_ndim=3, axes={-1: f_shape[-1]})

        if f_shape[-1] is None:
            raise ValueError('The last dimension of the inputs to `CRF` '
                             'should be defined. Found `None`.')
        if f_shape[-1] != self.output_dim:
            raise ValueError('The last dimension of the input shape must be equal to output'
                             ' shape. Use a linear layer if needed.')
        self.input_spec = input_spec
        self.transitions = self.add_weight(name='transitions',
                                           shape=[self.output_dim, self.output_dim],
                                           initializer='glorot_uniform',
                                           trainable=True)
        self.built = True

    def compute_mask(self, inputs, mask=None):
        # Just pass the received mask from previous layer, to the next layer or
        # manipulate it if this layer changes the shape of the input
        return mask

    def call(self, inputs, sequence_lengths=None, training=None, **kwargs):
        sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
        if sequence_lengths is not None:
            assert len(sequence_lengths.shape) == 2
            assert tf.convert_to_tensor(sequence_lengths).dtype == 'int32'
            seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
            assert seq_len_shape[1] == 1
            self.sequence_lengths = K.flatten(sequence_lengths)
        else:
            self.sequence_lengths = tf.ones(tf.shape(inputs)[0], dtype=tf.int32) * (
                tf.shape(inputs)[1]
            )

        viterbi_sequence, _ = crf_decode(sequences,
                                         self.transitions,
                                         self.sequence_lengths)
        output = K.one_hot(viterbi_sequence, self.output_dim)
        return K.in_train_phase(sequences, output)

    @property
    def loss(self):
        def crf_loss(y_true, y_pred):
            y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
            log_likelihood, self.transitions = crf_log_likelihood(
                y_pred,
                tf.cast(K.argmax(y_true), dtype=tf.int32) if self.sparse_target else y_true,
                self.sequence_lengths,
                transition_params=self.transitions,
            )
            return tf.reduce_mean(-log_likelihood)
        return crf_loss

    @property
    def accuracy(self):
        def viterbi_accuracy(y_true, y_pred):
            # -1e10 to avoid zero at sum(mask)
            mask = K.cast(
                K.all(K.greater(y_pred, -1e10), axis=2), K.floatx())
            shape = tf.shape(y_pred)
            sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
            y_pred, _ = crf_decode(y_pred, self.transitions, sequence_lengths)
            if self.sparse_target:
                y_true = K.argmax(y_true, 2)
            y_pred = K.cast(y_pred, 'int32')
            y_true = K.cast(y_true, 'int32')
            corrects = K.cast(K.equal(y_true, y_pred), K.floatx())
            return K.sum(corrects * mask) / K.sum(mask)
        return viterbi_accuracy

    @property
    def f1(self):
        def crf_f1(y_true, y_pred):
            shape = tf.shape(y_pred)
            sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
            y_pred, _ = crf_decode(y_pred, self.transitions, sequence_lengths)
            true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
            possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
            predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
            precision = true_positives / (predicted_positives + K.epsilon())
            recall = true_positives / (possible_positives + K.epsilon())
            f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
            return f1_val
        return crf_f1

    def compute_output_shape(self, input_shape):
        tf.TensorShape(input_shape).assert_has_rank(3)
        return input_shape[:2] + (self.output_dim,)

    def get_config(self):
        config = {
            'output_dim': self.output_dim,
            'sparse_target': self.sparse_target,
            'supports_masking': self.supports_masking,
            'transitions': K.eval(self.transitions)
        }
        base_config = super(CRF, self).get_config()
        return dict(base_config, **config)


In [18]:
EMBEDDING_DIR = DATA_DIR / "embeddings"

In [19]:
# standard libaries
import io
from pathlib import Path
from typing import Dict, Tuple

# third party libraries
import numpy as np
from crf import CRF
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models


def create_model(
    vocab_size: int, max_length: int, embedding_dim: int, word_index: Dict[str, int], tag_index: Dict[str, int]
) -> Tuple[models.Model]:
    """Create Bi-LSTM CRF model in tensorflow.

    Model1 is the trainable model. Model2 is for predictions and returns:
    [predicted labels, LSTM hidden state (Forward and backward), LSTM cell state (forward and backward), embeddings]

    This is leveraged to build the REINFORCE states.

    Adapted from:
    https://github.com/ngoquanghuy99/POS-Tagging-BiLSTM-CRF

    Args:
        vocab_size (int): Size of vocabulary
        max_length (int): Max sequence length
        embedding_dim (int): Size of embedding. Make sure to match size of GloVe embedding.
        word_index (Dict[str, int]): Index mapping words to ints
        tag_index (Dict[str, int]): Index mapping tokens to ints

    Returns:
        Tuple[Model]: Compiled Model and Non-compiled Model
        with exposed LSTM and embedding layers
    """

    embeddings_index = {}
    with io.open(EMBEDDING_DIR / "glove.6B.100d.txt", "r", encoding="utf-8") as f:
        for line in f:
            values = line.strip().split()
            curr_word = values[0]
            coefs = np.asarray(values[1:], dtype="float64")
            embeddings_index[curr_word] = coefs
        embeddings_matrix = np.zeros((vocab_size, embedding_dim))
        for word, i in word_index.items():
            if i > vocab_size:
                continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embeddings_matrix[i] = embedding_vector

    inputs = layers.Input(shape=(max_length, ))

    embeddings = layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
        weights=[embeddings_matrix],
        mask_zero=True
    )(inputs)

    lstm_out, sh_fw, sc_fw, sh_bw, sc_bw = layers.Bidirectional(
        layers.LSTM(
            units=embedding_dim, return_sequences=True, return_state=True, recurrent_dropout=0.01
        )
    )(embeddings)

    time_dist = layers.TimeDistributed(layers.Dense(len(tag_index)))(lstm_out)
    
    crf = CRF(len(tag_index), sparse_target=False)
    pred = crf(time_dist)

    model1 = models.Model(inputs=[inputs], outputs=[pred])
    model2 = models.Model(inputs=[inputs], outputs=[pred, sh_fw, sc_fw, sh_bw, sc_bw, embeddings])

    model1.compile(optimizer="adam", loss=crf.loss, metrics=[crf.accuracy])
    model1.summary()

    return model1, model2

In [21]:
import shutil

log_dir = ROOT_DIR / "models/logs/"

if any(log_dir.iterdir()):
    for i in log_dir.glob("**/*"):
        if i.is_dir():
            shutil.rmtree(i)
        else:
            i.unlink()

In [72]:
model1, model2 = create_model(
    vocab_size=len(words2index),
    max_length=50,
    embedding_dim=100,
    word_index=words2index,
    tag_index=tags2index
)

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding_7 (Embedding)     (None, 50, 100)           433200    
                                                                 
 bidirectional_7 (Bidirectio  [(None, 50, 200),        160800    
 nal)                         (None, 100),                       
                              (None, 100),                       
                              (None, 100),                       
                              (None, 100)]                       
                                                                 
 time_distributed_6 (TimeDis  (None, 50, 29)           5829      
 tributed)                                                       
                                                           

In [73]:
callback = tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir, 
    histogram_freq=1,
    profile_batch="1,20"
)


history = model1.fit(
    train_ds,
    epochs=100, 
    verbose=1,
    validation_data=val_ds,
    callbacks=[callback, tensorboard_callback],
)

ERROR:tensorflow:Failed to start profiler: Another profiler is running.
Epoch 1/100
ERROR:tensorflow:Failed to start profiler: Another profiler is running.
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 6

The baseline Bi-LSTM CRF model has compiled, trained, and tested on the Re3d dataset successfully! Achieving ~99.00% Accuracy on the test set, which is pretty good, as only ~68% of the tags are "O" tags.

In [74]:
model1.evaluate(test_ds)



[106.7297134399414, 0.9930180907249451]

`create_model` creates two models, one which is trainable and one which is used for prediction. This model outputs:
* predictions
* LSTM hidden state forward pass
* LSTM cell state forward pass
* LSTM hidden state backward pass
* LSTM cell state backward pass
* Embeddings

This is leveraged in construction of the REINFORCE states.

In [75]:
preds = model2.predict(test_ds)



In [76]:
len(preds)

6

In [83]:
# The LSTM hidden state forward pass of the first test sequence
preds[1][0]

array([ 9.54754055e-01,  9.72771227e-01,  6.33437157e-01, -6.93008006e-01,
       -9.18714166e-01,  9.78925109e-01,  6.07711911e-01,  9.23140645e-01,
       -4.55931574e-03, -9.10419464e-01, -3.90474796e-01, -6.82343185e-01,
       -9.59207773e-01,  9.63804483e-01, -3.37456353e-02, -7.90837646e-01,
       -9.65304255e-01, -9.09259856e-01,  8.73449922e-01,  8.20084631e-01,
        9.25438404e-01, -6.42064333e-01, -9.67434347e-01,  9.83933330e-01,
        8.76466393e-01,  9.71220791e-01, -9.86461580e-01,  3.70038413e-02,
       -9.91436124e-01, -4.81567651e-01, -5.11627853e-01, -1.45667776e-01,
        2.39765011e-02, -9.89327073e-01,  9.89802063e-01, -9.76399183e-01,
       -6.76109552e-01,  9.95943606e-01, -9.79098320e-01, -6.36848986e-01,
       -8.99715304e-01, -9.18038607e-01,  9.90229487e-01, -3.58470827e-01,
        9.99914169e-01,  9.82747495e-01, -7.12321937e-01, -1.45849541e-01,
       -9.28535044e-01,  8.50757241e-01,  1.02656238e-01,  9.85507429e-01,
        9.70421255e-01, -