In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'


from __future__ import absolute_import, division, print_function


import tensorflow_hub as hub
import tensorflow_text as tf_text  # A dependency of the preprocessing model
from official import nlp
import official.nlp.optimization

import numpy as np
import tensorflow as tf
import pandas as pd
import time
from fastprogress import master_bar, progress_bar
import math
import official.nlp.bert.tokenization
from official.nlp import bert




def build_classifier_model(num_classes):
    '''
    Fine tunes BERT.
    Input: Dataset made by a bert preprocessor e.i "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3"
        1. input_word_ids, input_mask, input_type_ids
        2. Labels for each word. 
    Output: Trained model.
    '''

    inputs = dict(
        input_word_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32),
        input_mask=tf.keras.layers.Input(shape=(None,), dtype=tf.int32),
        input_type_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
    )
  
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='encoder')
    net = encoder(inputs)['sequence_output']
    net = tf.keras.layers.Dropout(rate=0.1)(net)
    net = tf.keras.layers.Dense(num_classes, activation=None, name='classifier')(net)
    return tf.keras.Model(inputs, net, name='prediction')




@tf.function
def train_step(x, y):
    '''
    Controls what is happening during training.
    '''
    with tf.GradientTape() as tape:
        logits = bert_classifier(x, training=True)
        loss_value = loss_fn(y, logits)
    grads = tape.gradient(loss_value, bert_classifier.trainable_weights)
    optimizer.apply_gradients(zip(grads, bert_classifier.trainable_weights))
    train_acc_metric.update_state(y, logits)
    return loss_value


@tf.function
def test_step(x, y):
    '''
    Computes the validation accuracy for each epoch during training.
    '''
    val_logits = bert_classifier(x, training=True)
    val_acc_metric.update_state(y, val_logits)
    

def remove_subtoken(text_var,label_var):
    '''

    '''
    tokens = preprocessor.tokenize(text_var)
    for num_s, sentence in enumerate(tokens):
        n = 0
        for num_w, word in enumerate(sentence):
            if num_w < 128:
                old_tag = label_var[num_s][num_w+n]
            if len(word) > 1:
                for subtoken in range(len(word)-1):
                    if old_tag == 1:
                        label_var[num_s].insert(num_w + n ,0)
                    elif old_tag == 2:
                        label_var[num_s].insert(num_w + n ,0)
                    if n < len(label_var[num_s]):
                        n += 1
        label_var[num_s].insert(0,0)
        label_var[num_s] = tf.constant(label_var[num_s][0:128])

    
    
def encode_sentence(s, tokenizer):
    '''
    Tokenizes pair of sentences and adds a [SEP] token to join them. This token is labeled as 0
    '''
    tokens = list(tokenizer.tokenize(s))
    tokens.append('[SEP]')
    return tokenizer.convert_tokens_to_ids(tokens)

def bert_encode(sentence_dict, tokenizer):
    '''
    Preprocess the data to be on the format expected by BERT. Does the same
    as the BERT preprocessor function.
    Input:
        1. Dict containing:
            'sentence1':
                ['These results indicate that the GeneReg'], 
            'sentence2':
                ['and acrD drug efflux genes are directly regulated by RegProtein protein ( BaeR protein ) .']
        2. Labels:
            [''O O O O O O 0 I-Rel I-Rel I-Rel I-Rel I-Rel I-Rel I-Rel I-Rel I-Rel O O O O O O O O]
    '''
    num_examples = len(sentence_dict["gene1"])

    sentence1 = tf.ragged.constant([
        encode_sentence(s, tokenizer)
        for s in np.array(sentence_dict["gene1"])])
    sentence2 = tf.ragged.constant([
        encode_sentence(s, tokenizer)
        for s in np.array(sentence_dict["gene2"])])

    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    input_mask = tf.ones_like(input_word_ids).to_tensor()

    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1).to_tensor()

    inputs = {
        'input_word_ids': input_word_ids.to_tensor()[0:, :128],
        'input_mask': input_mask[0:, :128],
        'input_type_ids': input_type_ids[0:, :128]}

    return inputs


    
preprocessor = hub.load( "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3")
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4"


In [34]:
with open(f'DatasetRE/train_mnli.txt','r') as file:
    test_file = file.readlines()
    
sentence_train = {}
train_labels = []
gene1_train = []
gene2_train = []
full_train_sentence = []

for x in test_file:
    if x != '\n':
        # Each variable has 1 of the 2 input sentences.
        gene1_train.append(x.split('\t')[0])
        gene2_train.append(x.split('\t')[1])
        
        # Replaces categotical variables for numerical variables:'
        #    O = 1   I-Rel = 2     [CLS],[SEP],[PAD] = 0
        temp_label = []
        for tag in x.split('\t')[2].split(' '):
            if tag == 'O':
                temp_label.append(1)
            if tag == 'I-Rel':
                temp_label.append(2)
        if len(temp_label) > 128:
            temp_label = temp_label[:128]
        else:
            if len(temp_label) < 128:
                temp_label.extend([0]*(128-len(temp_label)))
        train_labels.append(temp_label)
        
        # Single list containing the 2 concatenated input sentences  
        full_train_sentence.append(x.split('\t')[0] + x.split('\t')[1])
sentence_train['gene1'] = gene1_train
sentence_train['gene2'] = gene2_train


# Loads the tokenizer and calls the preprocessing function.
tokenizer = bert.tokenization.FullTokenizer(vocab_file="vocabNER.txt",do_lower_case=False)
train_ds = bert_encode(sentence_train, tokenizer)


# Tokenizes each sentence
pre_txt = []
for indx in range(len(full_train_sentence)):
    pre_txt.append(' '.join(tokenizer.tokenize(full_train_sentence[indx])).split(' '))
    
    
# Since BERT tokenizer splits single words (interestingly -> interest, ##ing, ##ly), we need to add labels for each additional token
#                     data                        labels
# Input example:      interest ##ing #ly          -> O
# Output example:     interest ##ing #ly          -> O, 0, 0
for index in range(len(pre_txt)):
    for x in range(len(pre_txt[index])):
        if '##' in pre_txt[index][x]:
            train_labels[index].insert(x,0)
    train_labels[index] = tf.constant(train_labels[index][:128])


# Builds dataset in a format that can be fed to the model.
train_ds = tf.data.Dataset.from_tensor_slices((train_ds,train_labels))

In [38]:
# Same as above but fot the test dataset.
with open(f'DatasetRE/test_mnli.txt','r') as file:
    test_file = file.readlines()
    
sentence_test = {}
test_label = []
gene1_test = []
gene2_test = []
full_test_sentence = []
for x in test_file:
    if x != '\n':
        gene1_test.append(x.split('\t')[0])
        gene2_test.append(x.split('\t')[1])
        temp_label = []
        for tag in x.split('\t')[2].split(' '):
            if tag == 'O':
                temp_label.append(1)
            if tag == 'I-Rel':
                temp_label.append(2)
        if len(temp_label) > 128:
            temp_label = temp_label[:128]
        else:
            if len(temp_label) < 128:
                temp_label.extend([0]*(128-len(temp_label)))
        test_label.append(temp_label)
        full_test_sentence.append(x.split('\t')[0] + x.split('\t')[1])
sentence_test['gene1'] = gene1_test
sentence_test['gene2'] = gene2_test

tokenizer = bert.tokenization.FullTokenizer(vocab_file="vocabNER.txt",do_lower_case=False)
test_ds = bert_encode(sentence_test, tokenizer)

pre_txt = []
for indx in range(len(full_test_sentence)):
    pre_txt.append(' '.join(tokenizer.tokenize(full_test_sentence[indx])).split(' '))
    
for index in range(len(pre_txt)):
    for x in range(len(pre_txt[index])):
        if '##' in pre_txt[index][x]:
            test_label[index].insert(x,0)
    test_label[index] = tf.constant(test_label[index][:128])
    
test_ds = tf.data.Dataset.from_tensor_slices((test_ds,test_label))


In [39]:
# Optimizador, metricas y loss function. 
# Set up epochs and steps
epochs = 2
batch_size = 32
train_data_size = len(train_labels)
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)


# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(init_lr=2e-5,num_train_steps=num_train_steps,num_warmup_steps=warmup_steps)
bert_classifier = build_classifier_model(3)
train_ds = train_ds.shuffle(len(train_labels),reshuffle_each_iteration=True)
train_ds = train_ds.batch(batch_size)


# Metrics and loss that can be used by a multiclass classificator
val_ds = test_ds.batch(batch_size)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()

#bert_classifier.compile(optimizer=optimizer)

In [40]:
# Custom training loop. https://www.tensorflow.org/guide/keras/writing_a_training_loop_from_scratch
epoch_bar = master_bar(range(epochs))
pb_max_len = math.ceil(float(len(train_labels))/float(batch_size))

for epoch in epoch_bar:
    print("\nStart of epoch %d" % (epoch + 1,))
    start_time = time.time()
    for step, (x_batch_train, y_batch_train) in progress_bar(enumerate(train_ds),total=pb_max_len, parent=epoch_bar):
        loss_value = train_step(x_batch_train, y_batch_train)
        
        if step % 25 == 0:
            print(f"Training loss (for one batch) at step {step}: {loss_value}")

    train_acc = train_acc_metric.result()
    print(f"Training accuracy over epoch {epoch}: {float(train_acc)}")
    train_acc_metric.reset_states()
    
    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in val_ds:
        test_step(x_batch_val, y_batch_val)

    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print(f"Validation acc: {float(val_acc)}")
    print(f"Time taken: {time.time() - start_time}")


Start of epoch 1
Training loss (for one batch) at step 0: 1.1509202718734741
Training loss (for one batch) at step 25: 0.4241214990615845
Training accuracy over epoch 0: 0.7838122844696045
Validation acc: 0.883280336856842
Time taken: 1538.9019479751587

Start of epoch 2
Training loss (for one batch) at step 0: 0.22367316484451294
Training loss (for one batch) at step 25: 0.1800626963376999
Training accuracy over epoch 1: 0.9231259822845459
Validation acc: 0.9237743020057678
Time taken: 1523.4775159358978


In [41]:
# Prints a model summary and saves the model.
bert_classifier.summary()
# #tf.keras.utils.plot_model(bert_classifier, "RE_Interactions.png", show_shapes=True)
bert_classifier.save('reModel/', include_optimizer=False)

Model: "prediction"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 encoder (KerasLayer)           {'default': (None,   108310273   ['input_2[0][0]',                
                                768),                             'input_3[0][0]',       



INFO:tensorflow:Assets written to: reModel/assets


INFO:tensorflow:Assets written to: reModel/assets
