In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from __future__ import absolute_import, division, print_function


import tensorflow_hub as hub
import tensorflow_text as hub_text  # A dependency of the preprocessing model
from official import nlp
import official.nlp.optimization

import os
import numpy as np
import tensorflow as tf
import pandas as pd
import time
from fastprogress import master_bar, progress_bar
from sklearn.metrics import classification_report
import math
import official.nlp.bert.tokenization
from official.nlp import bert


def _fit_labels(labels, max_len):
    '''Truncates or zero-pads a label list so that it is exactly max_len entries long.'''
    return labels[:max_len] + [0] * max(0, max_len - len(labels))


def load_ris_file(file_ds, max_len=128):
    '''
    Loads one split of the NER dataset and replaces the categorical tags by integers:
    O = 1   B-GENE = 2    I-GENE = 3    padding = 0

    The file DatasetNER/{file_ds}.tsv holds one "word<TAB>tag" pair per line; a blank
    line closes the current sentence. A final sentence that is not followed by a blank
    line is kept as well (it used to be dropped silently).

    Input:
        1. file_ds: name of the split, i.e. the file name without the .tsv extension.
        2. max_len: every label list is truncated / zero-padded to this length (default 128).
           The sentence text itself is never truncated here.
    Output:
        1. text_test: a list of sentences, each one a single string of space-joined words.
        2. label_test: a list of lists containing one integer label per word, padded to max_len.
    Raises:
        IndexError if a non-blank line contains no tab character.
    '''
    tag_to_id = {'O': 1, 'B-GENE': 2, 'I-GENE': 3}

    with open(f'DatasetNER/{file_ds}.tsv','r') as file:
        test_file = file.readlines()

    text_test = []
    label_test = []
    temp_ds = []
    temp_label = []
    for line in test_file:
        if line == '\n':
            # Blank line: the sentence collected so far is complete.
            text_test.append(' '.join(temp_ds))
            label_test.append(_fit_labels(temp_label, max_len))
            temp_ds = []
            temp_label = []
            continue

        fields = line.rstrip('\n').split('\t')
        temp_ds.append(fields[0])
        # NOTE(review): a tag outside the mapping adds the word but no label, which shifts
        # every following label of the sentence by one. Confirm the dataset only uses
        # O / B-GENE / I-GENE.
        label = tag_to_id.get(fields[1])
        if label is not None:
            temp_label.append(label)

    # The file may end without a trailing blank line; do not lose the last sentence.
    if temp_ds:
        text_test.append(' '.join(temp_ds))
        label_test.append(_fit_labels(temp_label, max_len))

    return text_test,label_test



def build_classifier_model(num_classes):
    '''
    Builds the (still untrained) token classifier: BERT encoder -> dropout -> softmax layer.

    The encoder is loaded from the module-level handle `tfhub_handle_encoder` and is
    created with trainable=True, so its weights are updated together with the classifier.

    Input:
        1. num_classes: number of units of the final softmax layer.
    Output:
        A tf.keras.Model named 'prediction'. It takes a dict with the keys
        input_word_ids, input_mask and input_type_ids (the format produced by a BERT
        preprocessor, e.g. "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3")
        and returns the softmax output computed on the encoder's 'sequence_output'.
    '''
    # Created in this exact order so that Keras assigns the same automatic layer names.
    feature_names = ('input_word_ids', 'input_mask', 'input_type_ids')
    inputs = {
        feature: tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
        for feature in feature_names
    }

    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='encoder')
    sequence_output = encoder(inputs)['sequence_output']

    regularized = tf.keras.layers.Dropout(rate=0.1)(sequence_output)
    class_probs = tf.keras.layers.Dense(
        num_classes, activation='softmax', name='classifier'
    )(regularized)
    return tf.keras.Model(inputs, class_probs, name='prediction')



@tf.function
def train_step(x, y):
    '''
    Performs one optimisation step on a single batch.

    Uses the module-level `bert_classifier`, `loss_fn`, `optimizer` and
    `train_acc_metric`. The model ends in a softmax layer, so its output is a
    probability distribution per position (the loss is built with from_logits=False).

    Input:
        1. x: batch of model inputs.
        2. y: batch of integer labels.
    Output: the loss value of this batch.
    '''
    with tf.GradientTape() as tape:
        probs = bert_classifier(x, training=True)
        batch_loss = loss_fn(y, probs)

    weights = bert_classifier.trainable_weights
    gradients = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))

    # Accumulates the running training accuracy; the caller reads and resets it.
    train_acc_metric.update_state(y, probs)
    return batch_loss


@tf.function
def test_step(x, y):
    '''
    Feeds one validation batch through `bert_classifier` (training=False) and adds the
    result to the running `val_acc_metric`. Returns nothing; the caller reads the
    metric once all batches have been processed.
    '''
    val_acc_metric.update_state(y, bert_classifier(x, training=False))

    
    
def sentence_token_tagging(test_sentence_tags, tokenized_sentences):
    '''
    Rewrites each sentence with the tagged gene tokens masked and collects the entities.
    Input:
        1. test_sentence_tags: list (one entry per sentence) of lists of string tags.
           Tags starting with 'B' or 'I' mark entity tokens, anything else is kept as is.
        2. tokenized_sentences: list of tokenized sentences (BERT tokenizer), aligned
           token by token with the tags.

    Output: the list [NerSentence, FinalEntities]
        1. NerSentence: one string per sentence in which every tagged token was replaced
           by the word GENE (an entity made of several tokens yields several GENE words)
            [The GENE protein has two activation domains , one of which is an GENE ...]

        2. FinalEntities: one string per sentence with the text of all tagged tokens.
           A '[SEP]' marker is written before the first token of every run of B tags;
           wordpiece markers ('##') are removed. '' when the sentence has no entity.
            [[SEP]AraC [SEP]arac xyls family domain ...]

    Also prints the number of B-tagged tokens that were found.
    '''
    num_entities = 0
    NerSentence, FinalEntities = [], []
    for num in range(len(test_sentence_tags)):
        entity = ''            # text of every entity token seen in this sentence
        consecutive_b = 0      # length of the current run of B tags; reset per sentence
        masked_tokens = []     # the sentence with entity tokens replaced by 'GENE'
        for entity_tags, words in zip(test_sentence_tags[num], tokenized_sentences[num]):
            if entity_tags.startswith('B'):
                consecutive_b += 1
                # Only the first B of a run opens a new entity.
                if consecutive_b == 1:
                    entity += '[SEP]'
                entity += str(words) + ' '
                masked_tokens.append('GENE')
                num_entities += 1
            elif entity_tags.startswith('I'):
                entity += str(words) + ' '
                if masked_tokens:
                    masked_tokens[-1] += ' GENE'
                else:
                    # Sentence starts with an I tag: there is no previous token to extend.
                    masked_tokens.append('GENE')
                consecutive_b = 0
            else:
                masked_tokens.append(words)
                consecutive_b = 0

        # Cleaning once per sentence gives the same string as cleaning after every token.
        FinalEntities.append(entity.replace(' ##','').replace('##',''))
        temp_str = " ".join(masked_tokens).replace(' ##','').replace('##','').replace('  ',' ')
        NerSentence.append(temp_str)

    print(f'Completed. Found {num_entities} genes.')
    return [NerSentence,FinalEntities]


# Loads the BERT preprocessing model from TensorFlow Hub and stores the handle of the
# BERT encoder to fine-tune; build_classifier_model() reads `tfhub_handle_encoder`
# when it creates its hub.KerasLayer.
preprocessor = hub.load( "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3")
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4"


In [4]:

text_train, label_train = load_ris_file('train')
train_x = preprocessor(text_train)
tokens = preprocessor.tokenize(text_train)

# Since the BERT tokenizer splits single words (interestingly -> interest, ##ing, ##ly),
# every additional wordpiece needs a label of its own: one 0 is added per extra piece.
#           data                       labels
# Before:   interest ##ing ##ly     -> O
# After:    interest ##ing ##ly     -> 0, 0, O
# NOTE(review): list.insert(i, 0) places the 0 *before* the word's tag, so the tag ends up
# on the LAST wordpiece of the word. If the tag is meant to sit on the first wordpiece,
# this loop and the identical one for the test split have to be changed together.
for num_s, sentence in enumerate(tokens):
    n = 0  # number of extra wordpieces seen so far in this sentence
    for num_w, word in enumerate(sentence):
        if num_w < 128:
            old_tag = label_train[num_s][num_w+n]
        if len(word) > 1:
            # Only words that carry a real tag (O, B-GENE, I-GENE) get the extra labels.
            if old_tag in (1, 2, 3):
                for subtoken in range(len(word)-1):
                    label_train[num_s].insert(num_w + n ,0)
            n += len(word) -1
    label_train[num_s].insert(0,0) # Adds a 0 for the [CLS] token added by the BERT preprocessor.
    label_train[num_s] = tf.constant(label_train[num_s][0:128]) # Controls max length
train_dataset = tf.data.Dataset.from_tensor_slices((train_x,label_train)) # Builds dataset in a format that can be fed to the model.


# Optimizer, metrics and loss function.
# Set up epochs and steps
epochs = 2
batch_size = 32
train_data_size = len(label_train)
# The dataset is batched without dropping the last partial batch, so one epoch runs
# ceil(size / batch_size) steps. Rounding down made the learning-rate schedule end
# before the final steps of training.
steps_per_epoch = math.ceil(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)  # roughly 10% of all steps

# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(3e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)
# 4 classes: 0 (padding / special tokens / extra wordpieces), 1 = O, 2 = B-GENE, 3 = I-GENE
bert_classifier = build_classifier_model(4)
train_ds = train_dataset.shuffle(len(label_train),reshuffle_each_iteration=True)
train_ds = train_ds.batch(batch_size)

# Metrics and loss that can be used by a multiclass classifier.
# from_logits=False because the model already ends in a softmax layer.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()


In [5]:
# Validation data: the test split goes through the same preparation as the training split.
text_test,label_test = load_ris_file('test')
test_x = preprocessor(text_test)
tokens = preprocessor.tokenize(text_test)
for num_s, sentence in enumerate(tokens):
    sentence_labels = label_test[num_s]  # same list object, modified in place below
    n = 0  # number of extra wordpieces seen so far in this sentence
    for num_w, word in enumerate(sentence):
        if num_w < 128:
            old_tag = sentence_labels[num_w+n]
        extra_pieces = len(word) - 1
        if extra_pieces > 0:
            # Words carrying a real tag (1, 2 or 3) get one 0 per extra wordpiece,
            # inserted in front of the tag.
            if old_tag in (1, 2, 3):
                sentence_labels[num_w+n:num_w+n] = [0] * extra_pieces
            n += extra_pieces
    # Leading 0 for the [CLS] token, then cut to the maximum length of 128.
    label_test[num_s] = tf.constant(([0] + sentence_labels)[0:128])
val_dataset = tf.data.Dataset.from_tensor_slices((test_x,label_test))
val_ds = val_dataset.batch(batch_size)

In [6]:

# Custom training loop. https://www.tensorflow.org/guide/keras/writing_a_training_loop_from_scratch
# Each epoch: run train_step on every training batch, report the running training
# accuracy, then run test_step on every validation batch and report validation accuracy.
epoch_bar = master_bar(range(epochs))
# Number of batches per epoch, used only as the length of the progress bar.
pb_max_len = math.ceil(float(len(text_train))/float(batch_size))

for epoch in epoch_bar:
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()
    for step, (x_batch_train, y_batch_train) in progress_bar(enumerate(train_ds),total=pb_max_len, parent=epoch_bar):
        loss_value = train_step(x_batch_train, y_batch_train)
        
        # Log the loss of the current batch every 50 steps.
        if step % 50 == 0:
            print(f"Training loss (for one batch) at step {step}: {loss_value}")

    # The metric accumulated over all batches of the epoch; reset it so the next
    # epoch starts from zero.
    train_acc = train_acc_metric.result()
    print(f"Training accuracy over epoch {epoch}: {float(train_acc)}")
    train_acc_metric.reset_states()
    
    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in val_ds:
        test_step(x_batch_val, y_batch_val)

    # NOTE(review): no mask is passed to the metrics, so positions labelled 0
    # (padding, [CLS], extra wordpieces) count towards the accuracy as well.
    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print(f"Validation acc: {float(val_acc)}")
    print(f"Time taken: {time.time() - start_time}")


Start of epoch 0
Training loss (for one batch) at step 0: 2.1518588066101074
Training loss (for one batch) at step 50: 0.12621662020683289
Training loss (for one batch) at step 100: 0.03737030178308487
Training loss (for one batch) at step 150: 0.04553067311644554
Training loss (for one batch) at step 200: 0.028643585741519928
Training loss (for one batch) at step 250: 0.026299865916371346
Training loss (for one batch) at step 300: 0.023588906973600388
Training loss (for one batch) at step 350: 0.03769940137863159
Training loss (for one batch) at step 400: 0.018895652145147324
Training loss (for one batch) at step 450: 0.019236121326684952
Training accuracy over epoch 0: 0.945479154586792
Validation acc: 0.9918921589851379
Time taken: 12532.133661985397

Start of epoch 1
Training loss (for one batch) at step 0: 0.023592986166477203
Training loss (for one batch) at step 50: 0.02015260048210621
Training loss (for one batch) at step 100: 0.012883375398814678
Training loss (for one batch)

In [7]:
# Prints a layer-by-layer summary of the model and saves it to the NERModel/ directory.
# include_optimizer=False: the optimizer is not stored with the model.
bert_classifier.summary()
bert_classifier.save('NERModel/', include_optimizer=False)

Model: "prediction"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 encoder (KerasLayer)           {'encoder_outputs':  108310273   ['input_2[0][0]',                
                                 [(None, None, 768)               'input_3[0][0]',       



INFO:tensorflow:Assets written to: NERModel/assets


INFO:tensorflow:Assets written to: NERModel/assets
