In [None]:

!pip install tensorflow==1.15.0
!pip install transformers scikit-learn


In [None]:

# Fetch the PhoNER_COVID19 dataset only when it is not already present, so
# that re-running this cell does not fail with "destination path already
# exists and is not an empty directory".
import os
import subprocess

if not os.path.isdir("PhoNER_COVID19"):
    subprocess.run(
        ["git", "clone", "https://github.com/VinAIResearch/PhoNER_COVID19"],
        check=True,
    )


fatal: destination path 'PhoNER_COVID19' already exists and is not an empty directory.


In [None]:

def read_data(file_path):
    """Read a CoNLL-style file into parallel lists of sentences.

    Every non-blank line must consist of exactly two whitespace-separated
    fields: a token followed by its tag. Blank lines separate sentences.

    Args:
        file_path: path of a UTF-8 encoded text file.

    Returns:
        A ``(tokens, tags)`` tuple. ``tokens[i]`` is the list of tokens of
        the i-th sentence and ``tags[i]`` the list of tags aligned with it.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            fields.
    """
    tokens = []
    tags = []

    tokens_temp = []
    tags_temp = []
    # The context manager closes the file even if a malformed line raises.
    with open(file_path, encoding='utf-8') as conll_file:
        for line in conll_file:
            line = line.strip()
            if not line:
                # Blank line: finish the current sentence (if any).
                if tokens_temp:
                    tokens.append(tokens_temp)
                    tags.append(tags_temp)
                tokens_temp = []
                tags_temp = []
            else:
                token, tag = line.split()
                tokens_temp.append(token)
                tags_temp.append(tag)

    # Flush the final sentence when the file does not end with a blank line;
    # without this it would be silently dropped.
    if tokens_temp:
        tokens.append(tokens_temp)
        tags.append(tags_temp)

    return tokens, tags


In [None]:

# Load the syllable-level train / dev / test files of the cloned
# PhoNER_COVID19 repository. Each call returns a (tokens, tags) pair of
# per-sentence lists.
train_tokens, train_tags = read_data("PhoNER_COVID19/data/syllable/train_syllable.conll")
validation_tokens, validation_tags = read_data("PhoNER_COVID19/data/syllable/dev_syllable.conll")
test_tokens, test_tags = read_data("PhoNER_COVID19/data/syllable/test_syllable.conll")


In [None]:

# Number of sentences in the train, validation and test splits, one per line.
print(len(train_tokens), len(validation_tokens), len(test_tokens), sep='\n')


5027
2000
3000


In [None]:

# Show the first training sentence as tab-separated "token<TAB>tag" lines,
# followed by an empty line.
for i in range(1):
    for token, tag in zip(train_tokens[i], train_tags[i]):
        print(token, tag, sep='\t')
    print()


Đồng	O
thời	O
,	O
bệnh	O
viện	O
tiếp	O
tục	O
thực	O
hiện	O
các	O
biện	O
pháp	O
phòng	O
chống	O
dịch	O
bệnh	O
COVID	O
-	O
19	O
theo	O
hướng	O
dẫn	O
của	O
Bộ	B-ORGANIZATION
Y	I-ORGANIZATION
tế	I-ORGANIZATION
.	O



In [None]:

from collections import defaultdict

def build_dict(tokens_or_tags, special_tokens):
    """Build item -> index and index -> item mappings.

    Special tokens receive the first indices in the order given; every other
    distinct item follows in order of first appearance, so the mapping is the
    same on every run. Each distinct item gets exactly one index, even when a
    special token also occurs in the data, so ``len(tok2idx)`` equals
    ``len(idx2tok)`` right after construction and all indices are in
    ``range(len(idx2tok))``.

    Args:
        tokens_or_tags: iterable of sequences of hashable items (tokens or
            tags), one sequence per sentence.
        special_tokens: items that must occupy the first indices.

    Returns:
        A ``(tok2idx, idx2tok)`` tuple. ``tok2idx`` is a ``defaultdict`` that
        yields index 0 for unknown items; note that looking up an unknown
        item stores it in the dict, so ``len(tok2idx)`` can grow later.
        ``idx2tok`` is a list such that ``idx2tok[tok2idx[item]] == item``
        for every known item.
    """
    tok2idx = defaultdict(lambda: 0)
    idx2tok = []

    def register(item):
        # `in` does not trigger the defaultdict's default factory.
        if item not in tok2idx:
            tok2idx[item] = len(idx2tok)
            idx2tok.append(item)

    for token in special_tokens:
        register(token)

    for sequence in tokens_or_tags:
        for token in sequence:
            register(token)

    return tok2idx, idx2tok


In [None]:

# '<UNK>' comes first because the mapping returned by build_dict yields
# index 0 for items it has never seen. '<PAD>' is the key looked up when the
# model is created, so it must be a real, distinct vocabulary entry.
special_tokens = ['<UNK>', '<PAD>']
special_tags = ['O']

token2idx, idx2token = build_dict(train_tokens + validation_tokens, special_tokens)
tag2idx, idx2tag = build_dict(train_tags, special_tags)


In [None]:

# Display the tag -> index mapping built from the training tags.
tag2idx


defaultdict(<function __main__.build_dict.<locals>.<lambda>>,
            {'B-AGE': 4,
             'B-DATE': 9,
             'B-GENDER': 18,
             'B-JOB': 7,
             'B-LOCATION': 17,
             'B-NAME': 10,
             'B-ORGANIZATION': 11,
             'B-PATIENT_ID': 12,
             'B-SYMPTOM_AND_DISEASE': 3,
             'B-TRANSPORTATION': 6,
             'I-AGE': 20,
             'I-DATE': 2,
             'I-GENDER': 14,
             'I-JOB': 15,
             'I-LOCATION': 19,
             'I-NAME': 21,
             'I-ORGANIZATION': 16,
             'I-PATIENT_ID': 5,
             'I-SYMPTOM_AND_DISEASE': 1,
             'I-TRANSPORTATION': 13,
             'O': 8})

In [None]:

def words2idxs(tokens_list):
    """Return the ``token2idx`` index of every word in ``tokens_list``."""
    word_ids = []
    for word in tokens_list:
        word_ids.append(token2idx[word])
    return word_ids
 
def tags2idxs(tags_list):
    """Return the ``tag2idx`` index of every tag in ``tags_list``."""
    tag_ids = []
    for tag in tags_list:
        tag_ids.append(tag2idx[tag])
    return tag_ids
 
def idxs2words(idxs):
    """Return the ``idx2token`` entry for every index in ``idxs``."""
    words = []
    for idx in idxs:
        words.append(idx2token[idx])
    return words
 
def idxs2tags(idxs):
    """Return the ``idx2tag`` entry for every index in ``idxs``."""
    tag_names = []
    for idx in idxs:
        tag_names.append(idx2tag[idx])
    return tag_names


In [None]:

def batches_generator(batch_size, tokens, tags, shuffle=True, allow_smaller_last_batch=True):
    """Yield padded mini-batches of token indices, tag indices and lengths.

    Args:
        batch_size: number of sentences per batch.
        tokens: list of sentences, each a list of tokens.
        tags: list of tag sequences aligned with ``tokens``.
        shuffle: when true, sentences are visited in a random permutation;
            otherwise in their original order.
        allow_smaller_last_batch: when true, a final batch with fewer than
            ``batch_size`` sentences is also produced.

    Yields:
        ``(x, y, lengths)`` where ``x`` and ``y`` are int32 arrays of shape
        ``[current_batch_size, max_len]`` and ``lengths`` is an int32 vector
        with the unpadded length of every sentence. ``x`` is padded with
        ``token2idx['<PAD>']`` and ``y`` with ``tag2idx['O']``.
    """
    n_samples = len(tokens)
    if shuffle:
        order = np.random.permutation(n_samples)
    else:
        order = np.arange(n_samples)

    n_batches = n_samples // batch_size
    if allow_smaller_last_batch and n_samples % batch_size:
        n_batches += 1

    for k in range(n_batches):
        batch_start = k * batch_size
        batch_end = min((k + 1) * batch_size, n_samples)
        current_batch_size = batch_end - batch_start
        x_list = []
        y_list = []
        max_len_token = 0
        for idx in order[batch_start: batch_end]:
            x_list.append(words2idxs(tokens[idx]))
            y_list.append(tags2idxs(tags[idx]))
            max_len_token = max(max_len_token, len(tags[idx]))

        # Pad with the plain '<PAD>' key. The previous key was an
        # HTML-escaped garble of it, which is never a vocabulary entry, so
        # the lookup silently fell back to the default index.
        x = np.full([current_batch_size, max_len_token], token2idx['<PAD>'], dtype=np.int32)
        y = np.full([current_batch_size, max_len_token], tag2idx['O'], dtype=np.int32)
        lengths = np.zeros(current_batch_size, dtype=np.int32)
        for n in range(current_batch_size):
            utt_len = len(x_list[n])
            x[n, :utt_len] = x_list[n]
            lengths[n] = utt_len
            y[n, :utt_len] = y_list[n]
        yield x, y, lengths


In [None]:

import tensorflow as tf
import numpy as np
tf.compat.v1.disable_eager_execution()
from sklearn.metrics import accuracy_score, classification_report, f1_score
class BiLSTMModel:
    """Container class for the bi-LSTM tagger.

    The class body is intentionally empty: its methods are defined as plain
    functions and attached to the class afterwards.
    """


In [None]:

def declare_placeholders(self):
    """Create the graph inputs and store them as attributes of ``self``."""
    make_placeholder = tf.compat.v1.placeholder

    # Token indices and ground-truth tag indices; both dimensions are left
    # unspecified.
    self.input_batch = make_placeholder(dtype=tf.int32, shape=[None, None], name='input_batch')
    self.ground_truth_tags = make_placeholder(dtype=tf.int32, shape=[None, None], name='ground_truth_tags')

    # One sequence length per row of the batch.
    self.lengths = make_placeholder(dtype=tf.int32, shape=[None], name='lengths')

    # Dropout keep probability; evaluates to 1.0 whenever no value is fed.
    self.dropout_ph = tf.compat.v1.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])

    # Scalar learning rate, fed on every training step.
    self.learning_rate_ph = make_placeholder(dtype=tf.float32, shape=[])
 
BiLSTMModel.__declare_placeholders = classmethod(declare_placeholders)


In [None]:

def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):
    """Build the embedding, bi-LSTM and dense layers; store ``self.logits``."""
    rnn_cell = tf.compat.v1.nn.rnn_cell

    # Trainable float32 embedding table, initialised from a scaled normal.
    initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)
    embedding_matrix_variable = tf.Variable(initial_value=initial_embedding_matrix, dtype=tf.float32)

    def make_cell():
        # LSTM cell whose input and output dropout both use the dropout
        # placeholder as keep probability.
        return rnn_cell.DropoutWrapper(
            rnn_cell.LSTMCell(n_hidden_rnn),
            input_keep_prob=self.dropout_ph,
            output_keep_prob=self.dropout_ph,
        )

    forward_cell = make_cell()
    backward_cell = make_cell()

    # Embedded inputs: [batch_size, sequence_len, embedding_dim].
    embeddings = tf.nn.embedding_lookup(embedding_matrix_variable, self.input_batch)

    # Run both directions, then join their outputs on the feature axis:
    # [batch_size, sequence_len, 2 * n_hidden_rnn].
    rnn_outputs, _ = tf.compat.v1.nn.bidirectional_dynamic_rnn(
        forward_cell,
        backward_cell,
        embeddings,
        sequence_length=self.lengths,
        dtype=tf.float32,
    )
    rnn_output_fw, rnn_output_bw = rnn_outputs
    rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)

    # Linear projection to per-tag scores: [batch_size, sequence_len, n_tags].
    self.logits = tf.compat.v1.layers.dense(rnn_output, n_tags, activation=None)
 
BiLSTMModel.__build_layers = classmethod(build_layers)


In [None]:

def compute_predictions(self):
    """Store the softmax of the logits and the arg-max tag per position."""
    # Softmax over the logits (default axis: the last one).
    probabilities = tf.nn.softmax(self.logits)
    self.softmax_output = probabilities

    # Index of the most probable tag at every position.
    self.predictions = tf.argmax(probabilities, axis=-1)

BiLSTMModel.__compute_predictions = classmethod(compute_predictions)


In [None]:

def compute_loss(self, n_tags, PAD_index):
    """Compute the masked cross-entropy loss from the logits.

    Args:
        n_tags: depth of the one-hot encoding of the ground-truth tags.
        PAD_index: token index whose positions are excluded from the loss.
    """
    # Per-position cross-entropy against one-hot ground-truth tags.
    one_hot_tags = tf.one_hot(self.ground_truth_tags, n_tags)
    per_token_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=one_hot_tags,
        logits=self.logits,
    )

    # 1.0 where the input token is a real token, 0.0 where it is <PAD>.
    is_real_token = tf.not_equal(self.input_batch, PAD_index)
    mask = tf.cast(is_real_token, tf.float32)

    # Padding positions contribute zero to the sum.
    # NOTE(review): tf.reduce_mean divides by the number of all positions,
    # padded ones included, not by the number of real tokens. Confirm that
    # this scaling is intended.
    self.loss = tf.reduce_mean(mask * per_token_loss)
 
BiLSTMModel.__compute_loss = classmethod(compute_loss)



In [None]:

def perform_optimization(self):
    """Create the optimizer, clip the gradients and define ``self.train_op``.

    Uses Adam with the learning rate taken from ``self.learning_rate_ph``.
    Every gradient is clipped to an L2 norm of at most 1.0 before being
    applied. The clipped pairs are kept in ``self.grads_and_vars``.
    """
    self.optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
    self.grads_and_vars = self.optimizer.compute_gradients(self.loss)

    clip_norm = tf.cast(1.0, tf.float32)
    # compute_gradients pairs a variable with None when the loss does not
    # depend on it; tf.clip_by_norm cannot handle None, so such pairs are
    # skipped instead of crashing graph construction.
    self.grads_and_vars = [
        (tf.clip_by_norm(g, clip_norm), v)
        for (g, v) in self.grads_and_vars
        if g is not None
    ]

    self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)
 
BiLSTMModel.__perform_optimization = classmethod(perform_optimization)



In [None]:

def init_model(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):
    """Build the complete graph by calling the construction steps in order.

    Args:
        vocabulary_size: forwarded to the layer-building step.
        n_tags: forwarded to the layer-building and loss steps.
        embedding_dim: forwarded to the layer-building step.
        n_hidden_rnn: forwarded to the layer-building step.
        PAD_index: forwarded to the loss step.
    """
    # The order matters: each step reads attributes set by earlier ones
    # (placeholders -> logits -> predictions / loss -> train op).
    self.__declare_placeholders()
    self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)
    self.__compute_predictions()
    self.__compute_loss(n_tags, PAD_index)
    self.__perform_optimization()


In [None]:

BiLSTMModel.__init__ = classmethod(init_model)


In [None]:

def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):
    """Run a single optimisation step on one batch.

    Feeds the batch, its lengths, the learning rate and the dropout keep
    probability to the graph and executes ``self.train_op`` in ``session``.
    """
    feed_dict = {}
    feed_dict[self.input_batch] = x_batch
    feed_dict[self.ground_truth_tags] = y_batch
    feed_dict[self.learning_rate_ph] = learning_rate
    feed_dict[self.dropout_ph] = dropout_keep_probability
    feed_dict[self.lengths] = lengths

    session.run(self.train_op, feed_dict=feed_dict)
 
BiLSTMModel.train_on_batch = classmethod(train_on_batch)


In [None]:

def predict_for_batch(self, session, x_batch, lengths):
    """Return predicted tag indices and softmax outputs for one batch.

    Args:
        session: session in which the graph is executed.
        x_batch: value fed to ``self.input_batch``.
        lengths: value fed to ``self.lengths``.

    Returns:
        A ``(predictions, softmax_output)`` tuple holding the evaluated
        ``self.predictions`` and ``self.softmax_output``.
    """
    feed_dict = {self.input_batch: x_batch,
                 self.lengths: lengths}

    # Fetch both tensors in one run so the network is evaluated once
    # instead of once per fetched tensor.
    predictions, softmax_output = session.run(
        [self.predictions, self.softmax_output], feed_dict=feed_dict)

    return predictions, softmax_output
 
BiLSTMModel.predict_for_batch = classmethod(predict_for_batch)


In [None]:

from evaluation import precision_recall_f1

def predict_tags(model, session, token_idxs_batch, lengths):
    """Predict a batch and convert indices back to tag and token strings.

    Returns:
        ``(tags_batch, tokens_batch, probs_batch)``: per-sentence lists of
        predicted tags, input tokens and per-position softmax outputs.
    """
    tag_idxs_batch, softmax_batch = model.predict_for_batch(session, token_idxs_batch, lengths)

    tags_batch, tokens_batch, probs_batch = [], [], []
    for tag_idxs, token_idxs, softmax_probs in zip(tag_idxs_batch, token_idxs_batch, softmax_batch):
        # Decode position by position, then split the triples into lists.
        decoded = [
            (idx2tag[tag_idx], idx2token[token_idx], softmax_prob)
            for tag_idx, token_idx, softmax_prob in zip(tag_idxs, token_idxs, softmax_probs)
        ]
        tags_batch.append([tag for tag, _, _ in decoded])
        tokens_batch.append([token for _, token, _ in decoded])
        probs_batch.append([prob for _, _, prob in decoded])
    return tags_batch, tokens_batch, probs_batch
 
def eval_conll(model, session, tokens, tags, short_report=True):
    """Compute NER quality measures with ``precision_recall_f1``.

    Sentences are predicted one at a time. The ground-truth and predicted
    tag sequences of all sentences are concatenated, each followed by an
    extra 'O' tag, and passed to ``precision_recall_f1``.

    Args:
        model: object providing ``predict_for_batch``.
        session: session in which the model is evaluated.
        tokens: list of sentences, each a list of tokens.
        tags: list of ground-truth tag sequences aligned with ``tokens``.
        short_report: forwarded to ``precision_recall_f1``.

    Returns:
        Whatever ``precision_recall_f1`` returns.

    Raises:
        Exception: if the prediction length differs from the input length.
    """
    y_true, y_pred = [], []
    for x_batch, y_batch, lengths in batches_generator(1, tokens, tags):
        tags_batch, tokens_batch, probs_batch = predict_tags(model, session, x_batch, lengths)
        if len(x_batch[0]) != len(tags_batch[0]):
            raise Exception("Incorrect length of prediction for the input, "
                            "expected length: %i, got: %i" % (len(x_batch[0]), len(tags_batch[0])))

        # Keep only the real positions. The true sequence length is used
        # rather than comparing tokens with a padding string: the literal
        # previously compared here was HTML-escaped and could never match.
        seq_len = int(lengths[0])
        ground_truth_tags = [idx2tag[gt_tag_idx] for gt_tag_idx in y_batch[0][:seq_len]]
        predicted_tags = list(tags_batch[0][:seq_len])

        # We extend every prediction and ground truth sequence with 'O' tag
        # to indicate a possible end of entity.
        y_true.extend(ground_truth_tags + ['O'])
        y_pred.extend(predicted_tags + ['O'])

    results = precision_recall_f1(y_true, y_pred, print_results=True, short_report=short_report)
    return results


In [None]:

tf.compat.v1.reset_default_graph()

# Size the embedding table and the output layer from the index -> item
# lists, which hold one slot per assigned index. The item -> index mappings
# are defaultdicts: their length can be smaller than the largest index and
# grows whenever an unknown key is looked up, so it is not a reliable size.
model = BiLSTMModel(vocabulary_size=len(idx2token), n_tags=len(idx2tag), embedding_dim=200, n_hidden_rnn=200, PAD_index=token2idx['<PAD>'])

# Training hyper-parameters.
batch_size = 32
n_epochs = 4
learning_rate = 0.005
learning_rate_decay = np.sqrt(2)  # the learning rate is divided by this after each epoch
dropout_keep_probability = 0.5


In [None]:

sess = tf.compat.v1.Session()
sess.run(tf.compat.v1.global_variables_initializer())

print('Start training... \n')
for epoch in range(n_epochs):
    # Report quality on the train and validation data before this epoch's
    # parameter updates.
    print('{0} Epoch {1} of {2} {0}'.format('-' * 20, epoch + 1, n_epochs))
    print('Train data evaluation:')
    eval_conll(model, sess, train_tokens, train_tags, short_report=True)
    print('Validation data evaluation:')
    eval_conll(model, sess, validation_tokens, validation_tags, short_report=True)

    # One pass over the training data.
    for x_batch, y_batch, lengths in batches_generator(batch_size, train_tokens, train_tags):
        model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability)

    # Shrink the learning rate for the next epoch.
    learning_rate /= learning_rate_decay

print('...training finished.')


Start training... 

-------------------- Epoch 1 of 4 --------------------
Train data evaluation:
processed 172568 tokens with 15767 phrases; found: 113308 phrases; correct: 592.

precision:  0.52%; recall:  3.75%; F1:  0.92

Validation data evaluation:
processed 73325 tokens with 7478 phrases; found: 48628 phrases; correct: 270.

precision:  0.56%; recall:  3.61%; F1:  0.96

-------------------- Epoch 2 of 4 --------------------
Train data evaluation:
processed 172568 tokens with 15767 phrases; found: 17528 phrases; correct: 14383.

precision:  82.06%; recall:  91.22%; F1:  86.40

Validation data evaluation:
processed 73325 tokens with 7478 phrases; found: 8311 phrases; correct: 6383.

precision:  76.80%; recall:  85.36%; F1:  80.85

-------------------- Epoch 3 of 4 --------------------
Train data evaluation:
processed 172568 tokens with 15767 phrases; found: 15757 phrases; correct: 14513.

precision:  92.11%; recall:  92.05%; F1:  92.08

Validation data evaluation:
processed 73325 t

In [None]:

# Full (per-entity) reports for the three splits.
print('{0} Train set quality: {0}'.format('-' * 20))
train_results = eval_conll(model, sess, train_tokens, train_tags, short_report=False)

print('{0} Validation set quality: {0}'.format('-' * 20))
validation_results = eval_conll(model, sess, validation_tokens, validation_tags, short_report=False)

print('{0} Test set quality: {0}'.format('-' * 20))
test_results = eval_conll(model, sess, test_tokens, test_tags, short_report=False)


-------------------- Train set quality: --------------------
processed 172568 tokens with 15767 phrases; found: 15941 phrases; correct: 15235.

precision:  95.57%; recall:  96.63%; F1:  96.10

	         AGE: precision:   96.19%; recall:   96.19%; F1:   96.19; predicted:   682

	        DATE: precision:   99.02%; recall:   99.33%; F1:   99.18; predicted:  2557

	      GENDER: precision:   93.85%; recall:   90.04%; F1:   91.90; predicted:   520

	         JOB: precision:   69.39%; recall:   66.34%; F1:   67.83; predicted:   196

	    LOCATION: precision:   96.09%; recall:   97.96%; F1:   97.02; predicted:  5503

	        NAME: precision:   83.43%; recall:   80.80%; F1:   82.10; predicted:   338

	ORGANIZATION: precision:   91.15%; recall:   93.32%; F1:   92.22; predicted:  1164

	  PATIENT_ID: precision:   98.59%; recall:   99.35%; F1:   98.97; predicted:  3265

	SYMPTOM_AND_DISEASE: precision:   90.74%; recall:   94.02%; F1:   92.35; predicted:  1491

	TRANSPORTATION: precision:   97.78