# CoNLL 2003

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from future.utils import iteritems
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint

from tensorflow.keras import Model, Input

 The versions of TensorFlow you are currently using is 2.5.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [8]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
from tensorflow_addons.text import crf_log_likelihood, crf_decode


class CRF(L.Layer):
    def __init__(self,
                 output_dim,
                 sparse_target=True,
                 **kwargs):
        """    
        Args:
            output_dim (int): the number of labels to tag each temporal input.
            sparse_target (bool): whether the the ground-truth label represented in one-hot.
        Input shape:
            (batch_size, sentence length, output_dim)
        Output shape:
            (batch_size, sentence length, output_dim)
        """
        super(CRF, self).__init__(**kwargs)
        self.output_dim = int(output_dim) 
        self.sparse_target = sparse_target
        self.input_spec = L.InputSpec(min_ndim=3)
        self.supports_masking = False
        self.sequence_lengths = None
        self.transitions = None

    def build(self, input_shape):
        assert len(input_shape) == 3
        f_shape = tf.TensorShape(input_shape)
        input_spec = L.InputSpec(min_ndim=3, axes={-1: f_shape[-1]})

        if f_shape[-1] is None:
            raise ValueError('The last dimension of the inputs to `CRF` '
                             'should be defined. Found `None`.')
        if f_shape[-1] != self.output_dim:
            raise ValueError('The last dimension of the input shape must be equal to output'
                             ' shape. Use a linear layer if needed.')
        self.input_spec = input_spec
        self.transitions = self.add_weight(name='transitions',
                                           shape=[self.output_dim, self.output_dim],
                                           initializer='glorot_uniform',
                                           trainable=True)
        self.built = True

    def compute_mask(self, inputs, mask=None):
        # Just pass the received mask from previous layer, to the next layer or
        # manipulate it if this layer changes the shape of the input
        return mask

    def call(self, inputs, sequence_lengths=None, training=None, **kwargs):
        sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
        if sequence_lengths is not None:
            assert len(sequence_lengths.shape) == 2
            assert tf.convert_to_tensor(sequence_lengths).dtype == 'int32'
            seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
            assert seq_len_shape[1] == 1
            self.sequence_lengths = K.flatten(sequence_lengths)
        else:
            self.sequence_lengths = tf.ones(tf.shape(inputs)[0], dtype=tf.int32) * (
                tf.shape(inputs)[1]
            )

        viterbi_sequence, _ = crf_decode(sequences,
                                         self.transitions,
                                         self.sequence_lengths)
        output = K.one_hot(viterbi_sequence, self.output_dim)
        return K.in_train_phase(sequences, output)

    @property
    def loss(self):
        def crf_loss(y_true, y_pred):
            y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
            log_likelihood, self.transitions = crf_log_likelihood(
                y_pred,
                tf.cast(K.argmax(y_true), dtype=tf.int32) if self.sparse_target else y_true,
                self.sequence_lengths,
                transition_params=self.transitions,
            )
            return tf.reduce_mean(-log_likelihood)
        return crf_loss

    @property
    def accuracy(self):
        def viterbi_accuracy(y_true, y_pred):
            # -1e10 to avoid zero at sum(mask)
            mask = K.cast(
                K.all(K.greater(y_pred, -1e10), axis=2), K.floatx())
            shape = tf.shape(y_pred)
            sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
            y_pred, _ = crf_decode(y_pred, self.transitions, sequence_lengths)
            if self.sparse_target:
                y_true = K.argmax(y_true, 2)
            y_pred = K.cast(y_pred, 'int32')
            y_true = K.cast(y_true, 'int32')
            corrects = K.cast(K.equal(y_true, y_pred), K.floatx())
            return K.sum(corrects * mask) / K.sum(mask)
        return viterbi_accuracy

    def compute_output_shape(self, input_shape):
        tf.TensorShape(input_shape).assert_has_rank(3)
        return input_shape[:2] + (self.output_dim,)

    def get_config(self):
        config = {
            'output_dim': self.output_dim,
            'sparse_target': self.sparse_target,
            'supports_masking': self.supports_masking,
            'transitions': K.eval(self.transitions)
        }
        base_config = super(CRF, self).get_config()
        return dict(base_config, **config)

In [9]:
! ls ../data/ner/conll/2003/

metadata  test.txt  train.txt  valid.txt


In [10]:
class CoNLLSentenceDataLoader(object):
    def __init__(self, 
                 train_file_path, 
                 val_file_path, 
                 test_file_path,
                 unknown_word = 'unk',
                 text_col=0,
                 ner_tag_col=3):
        
        self._text_col = text_col
        self._ner_tag_col = ner_tag_col
        self._unknown_word = unknown_word
        self._train_df = self._read_txt(train_file_path)
        self._test_df = self._read_txt(test_file_path)
        self._val_df = self._read_txt(val_file_path)
        
        self.train_sentences = self._get_sentences(self._train_df)
        self.test_sentences = self._get_sentences(self._test_df)
        self.val_sentences = self._get_sentences(self._val_df)
        
        # Collect word and tag bags based on trian data
        self.words = self._get_words(self.train_sentences)
        self.ner_tags = self._get_tags(self.train_sentences)
        self.max_length = self._get_max_length(self.train_sentences)
        
    def _read_txt(self, file_path):
        df = pd.read_csv(file_path, 
                           sep=' ', 
                           skip_blank_lines=False, 
                           header=None).fillna(self._unknown_word)
        # Filter out the DOCSTART lines
        df = df[~df[0].str.contains("DOCSTART")]
        return df
    
    def _get_sentences(self, df):
        current_rows = []
        # list of list of tuples
        sentences = []
        
        for i in tqdm(range(len(df))):
            row = df.values[i]
            if row[0] != self._unknown_word:
                current_rows.append(row)
            else:
                if len(current_rows) > 2:
                    _temp_df = pd.DataFrame(current_rows)
                    sentences.append(list(zip(_temp_df[self._text_col].values, _temp_df[self._ner_tag_col].values)))
                    current_rows = []
        return sentences    
    
    def _get_words(self, sentences):
        words = set()
        for s in sentences:
            [words.add(t[0]) for t in s]
        return list(words)
    
    def _get_tags(self, sentences):
        tags = set()
        for s in sentences:
            [tags.add(t[1]) for t in s]
        return list(tags)
    
    def _get_max_length(self, sentences):
        return max([len(s) for s in sentences])
    
    

In [11]:
class SentencePreprocessor(object):
    def __init__(self, words, tags, maxlen):
        self._words = words
        self._tags = tags
        
        self.n_words = len(words)
        self.n_tags = len(tags)
        self.maxlen = maxlen
        
        self._word2idx = {w: i for i, w in enumerate(self._words)}
        self._tag2idx = {t: i for i, t in enumerate(self._tags)}
        self._idx2tag = {v: k for k, v in iteritems(self._tag2idx)}
        
    def sentences_2_data(self, sentences):
        X = [[self._word2idx.get(w[0], self.n_words - 1) for w in s] for s in sentences]
        X = tf.keras.preprocessing.sequence.pad_sequences(maxlen=self.maxlen, sequences=X, padding="post", value=self.n_words - 1)

        y = [[self._tag2idx.get(w[1], 'unk') for w in s] for s in sentences]
        y = tf.keras.preprocessing.sequence.pad_sequences(maxlen=self.maxlen, sequences=y, padding="post", value=self._tag2idx["O"])
        y = [tf.keras.utils.to_categorical(i, num_classes=self.n_tags) for i in y]
        return X, y

In [19]:
class BiLSTMCRF(object):
    def __init__(self,
                 num_words,
                 num_tags,
                 sentence_max_length,
                 word_embeddings_size=64,
                 checkpoint_dir=os.path.join(str(Path.home()), '.mozhi')):
        self._word_embeddings_size = word_embeddings_size
        self._checkpoint_dir = checkpoint_dir
        self._filepath = "vf-bi-lstm-td-model-{val_accuracy:.2f}.hdf5"


        inputs = Input(shape=(sentence_max_length,))

        # Embedding Layer
        model = tf.keras.layers.Embedding(input_dim=num_words,
                                          output_dim=word_embeddings_size,
                                          input_length=sentence_max_length)(inputs)

        # BI-LSTM Layer
        model = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=word_embeddings_size,
                                                                   return_sequences=True,
                                                                   dropout=0.5,
                                                                   recurrent_dropout=0.5,
                                                                   kernel_initializer=tf.keras.initializers.HeNormal()))(model)

        model = tf.keras.layers.LSTM(units=word_embeddings_size * 2,
                                     return_sequences=True,
                                     dropout=0.5,
                                     recurrent_dropout=0.5,
                                     kernel_initializer=tf.keras.initializers.HeNormal())(model)

        # TimeDistributed Layer
        model = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_tags,
                                                                      activation="relu"))(model)

        self._crf = CRF(num_tags, sparse_target=True)
        outputs = self._crf(model)  # output

        self.model = Model(inputs, outputs)
        self.model.summary()

    def _get_optimizer(self):
        return tf.keras.optimizers.Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999)

    def compile(self):
        self.model.compile(optimizer=self._get_optimizer(),
                           loss=self._crf.loss,
                           metrics=[self._crf.accuracy, 'accuracy'])

    def fit(self,
            X,
            y,
            batch_size=256,
            epochs=20,
            validation_split=0.1,
            verbose=1):
        # Saving the best model only
        checkpoint = ModelCheckpoint(self._checkpoint_dir + "/" + self._filepath,
                                     monitor='val_accuracy',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        callbacks_list = [checkpoint]

        # Fit the best model
        history = self.model.fit(X,
                                 np.array(y),
                                 batch_size=batch_size,
                                 epochs=epochs,
                                 validation_split=validation_split,
                                 verbose=verbose,
                                 callbacks=callbacks_list)
        return history

In [20]:
conll_loader = CoNLLSentenceDataLoader(train_file_path="../data/ner/conll/2003/train.txt", 
                                      val_file_path="../data/ner/conll/2003/valid.txt", 
                                      test_file_path="../data/ner/conll/2003/test.txt")

100%|██████████| 218608/218608 [00:08<00:00, 26154.97it/s]
100%|██████████| 50119/50119 [00:02<00:00, 23705.52it/s]
100%|██████████| 54828/54828 [00:01<00:00, 28146.74it/s]


In [21]:
preprocessor = SentencePreprocessor(conll_loader.words, conll_loader.ner_tags, conll_loader.max_length)

In [22]:
trainX, trainY = preprocessor.sentences_2_data(conll_loader.train_sentences)

In [27]:
model = BiLSTMCRF(num_words=preprocessor.n_words,
                 num_tags=preprocessor.n_tags,
                 sentence_max_length=conll_loader.max_length,
                 word_embeddings_size=64)
model.compile()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 113)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 113, 64)           1511808   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 113, 128)          66048     
_________________________________________________________________
lstm_7 (LSTM)                (None, 113, 128)          131584    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 113, 10)           1290      
_________________________________________________________________
crf_2 (CRF)                  (None, 113, 10)           100       
Total params: 1,710,830
Trainable params: 1,710,830
Non-trainable params: 0
_________________________________________________

In [28]:
model.fit(X=trainX,
        y=trainY,
        batch_size=256,
        epochs=20,
        validation_split=0.1,
        verbose=1)

Epoch 1/20
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.

Epoch 00001: val_accuracy improved from -inf to 0.97644, saving model to /home/mageswarand/.mozhi/vf-bi-lstm-td-model-0.98.hdf5
Epoch 2/20

Epoch 00002: val_accuracy did not improve from 0.97644
Epoch 3/20

Epoch 00003: val_accuracy did not improve from 0.97644
Epoch 4/20
10/46 [=====>........................] - ETA: 21s - loss: 13.5601 - viterbi_accuracy: 0.9752 - accuracy: 0.9752

KeyboardInterrupt: 

In [None]:
# Plot the graph 
plt.style.use('ggplot')

def plot_history(history):
    accuracy = history.history['accuracy']
    val_accuracy = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(accuracy) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, accuracy, 'b', label='Training acc')
    plt.plot(x, val_accuracy, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

plot_history(history)

In [None]:
testX, testY = preprocessor.sentences_2_data(conll_loader.test_sentences)

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(preprocessor._idx2tag[p_i])
        out.append(out_i)
    return out
test_pred = model.predict(testX, verbose=1)   
pred_labels = pred2label(test_pred)
test_labels = pred2label(testY)

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))

In [None]:
from  sklearn_crfsuite.metrics import flat_classification_report  
report = flat_classification_report(y_pred=pred_labels, y_true=test_labels)
print(report)