In [4]:

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from __future__ import absolute_import, division, print_function
import tensorflow_hub as hub
import tensorflow_text as tf_text  # A dependency of the preprocessing model
from official import nlp
import official.nlp.optimization
import numpy as np
import tensorflow as tf
import pandas as pd
import time
from fastprogress import master_bar, progress_bar
import math
import official.nlp.bert.tokenization
from official.nlp import bert
from nltk.tokenize import wordpunct_tokenize

def load_file(file_ds):
    with open(f'dataset/{file_ds}.tsv','r') as file:
        test_file = file.readlines()

    function_sentence = []
    label_test = []
    temp_ds = []
    temp_label = []
    for line in test_file:
        if line == '\n':  
            if len(temp_label) > 128:
                temp_label = temp_label[:128]
            if len(temp_label) < 128:
                temp_label.extend([0]*(128-len(temp_label)))
            if ' '.join(temp_ds) not in function_sentence:
                if temp_label and temp_ds:
                    function_sentence.append(' '.join(temp_ds))
                    label_test.append(temp_label)
            temp_ds = []
            temp_label = []
        else:
            temp_ds.append(line.split('\t')[0].replace('\n',''))
            #temp_label.append(int(line.split('\t')[1].replace('\n','')))
            if line.split('\t')[1].replace('\n','') == 'O':
                temp_label.append(1)
            elif line.split('\t')[1].replace('\n','') == 'B-OD':
                temp_label.append(2)
            elif line.split('\t')[1].replace('\n','') == 'I-OD':
                temp_label.append(2)
            elif line.split('\t')[1].replace('\n','') == 'I-Med':
                temp_label.append(3)
            elif line.split('\t')[1].replace('\n','') == 'I-Med':
                temp_label.append(3)
            elif line.split('\t')[1].replace('\n','') == 'B-Air':
                temp_label.append(4)
            elif line.split('\t')[1].replace('\n','') == 'B-Supp':
                temp_label.append(5)
            elif line.split('\t')[1].replace('\n','') == 'I-Supp':
                temp_label.append(5)
            elif line.split('\t')[1].replace('\n','') == 'B-Anti':
                temp_label.append(6)
            if line.split('\t')[1].replace('\n','') == 'B-Temp':
                temp_label.append(7)
            elif line.split('\t')[1].replace('\n','') == 'B-Orgn':
                temp_label.append(8)
            elif line.split('\t')[1].replace('\n','') == 'I-Orgn':
                temp_label.append(9)
            elif line.split('\t')[1].replace('\n','') == 'B-Gtype':
                temp_label.append(10)
            elif line.split('\t')[1].replace('\n','') == 'I-Gtype':
                temp_label.append(10)
            elif line.split('\t')[1].replace('\n','') == 'B-Phase':
                temp_label.append(11)
            elif line.split('\t')[1].replace('\n','') == 'I-Phase':
                temp_label.append(11)
            elif line.split('\t')[1].replace('\n','') == 'B-Gversion':
                temp_label.append(12)
            elif line.split('\t')[1].replace('\n','') == 'B-Technique':
                temp_label.append(13)
            elif line.split('\t')[1].replace('\n','') == 'I-Technique':
                temp_label.append(13)
    
    return function_sentence,label_test



def build_classifier_model(num_classes):
    '''
    Fine tunes BERT.
    Input: Dataset made by a bert preprocessor e.i "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3"
        1. input_word_ids, input_mask, input_type_ids
        2. Labels for each word. 
    Output: Trained model.
    '''

    inputs = dict(
        input_word_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32),
        input_mask=tf.keras.layers.Input(shape=(None,), dtype=tf.int32),
        input_type_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
    )
  
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='encoder')
    net = encoder(inputs)['sequence_output']
    net = tf.keras.layers.Dropout(rate=0.1)(net)
    net = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')(net)
    return tf.keras.Model(inputs, net, name='prediction')


@tf.function
def train_step(x, y):
    '''
    Controls what is happening during training.
    '''
    with tf.GradientTape() as tape:
        logits = bert_classifier(x, training=True)
        loss_value = loss_fn(y, logits)
    grads = tape.gradient(loss_value, bert_classifier.trainable_weights)
    optimizer.apply_gradients(zip(grads, bert_classifier.trainable_weights))
    train_acc_metric.update_state(y, logits)
    return loss_value


@tf.function
def test_step(x, y):
    '''
    Computes the validation accuracy for each epoch during training.
    '''
    val_logits = bert_classifier(x, training=False)
    val_acc_metric.update_state(y, val_logits)
    
    

def sentence_token_tagging(test_sentence_tags, tokenized_sentences):
    '''
    Rewrite the genes found by the NER model. 
    Input: 
        1. List of categorical labels asigned by the model during prediction. 
        2. List of tokenized sentences. (BERT tokenizer)
        
    Output:
        1. NerSentence: Sentences in which each found entity was replaced by the word GENE
            [The GENE protein has two activation domains , one of which is an GENE ...]
            
        2. FinalEntities: All of the entities in the sentence that were replaced by the word GENE
            [AraC, arac xyls family domain...]
        
    '''
    entity = ''
    num_entities = 0
    n = 0
    TF_Regulator, RegulatedGene = [],[]
    FinalEntities, temp,temp_s,NerSentence = [],[],[],[]
    for num in (range(len(test_sentence_tags))): 
        for num_word, (entity_tags, words) in enumerate(zip(test_sentence_tags[num], tokenized_sentences[num])):
            if entity_tags.startswith('B'):
                entity += '[SEP] ' + str(words) + ' '
                num_entities += 1
                result = entity_tags.split('-')[1] + ' [' + str(words) + '] '  + entity_tags.split('-')[1]
                temp_s.append(result)
                
            if entity_tags.startswith('I'):
                if test_sentence_tags[num][num_word-1].startswith('O'):
                    entity += '[SEP] ' + str(words) + ' '
                    num_entities += 1
                    result = entity_tags.split('-')[1] + ' [' + str(words) + ' ]'  + entity_tags.split('-')[1]
                    temp_s.append(entity_tags.split('-')[1])
                       
                else:
                    entity += str(words) + ' '
            
            if entity_tags.startswith('B') == False and entity_tags.startswith('I') == False:
                temp_s.append(words)
    
        if entity != '':
            temp.append(entity.split('[SEP] ')[1:])        
        FinalEntities.append(temp)
        entity = ''
        temp_str = " ".join(temp_s).replace('  ',' ')
        NerSentence.append(temp_str)
        temp_s = []
        temp = []


        
    print(f'Completed. Found {num_entities} genes.')
    return [NerSentence,FinalEntities]


def test_result(file_ds):
    with open(f'dataset/{file_ds}.tsv','r') as file:
        test_file = file.readlines()

    function_sentence = []
    label_test = []
    temp_ds = []
    temp_label = []
    for line in test_file:
        if line == '\n':  
            if len(temp_label) > 128:
                temp_label = temp_label[:128]
            if ' '.join(temp_ds) not in function_sentence:
                if temp_label and temp_ds:
                    function_sentence.append(' '.join(temp_ds))
                    label_test.append(temp_label)
            temp_ds = []
            temp_label = []
        else:
            temp_ds.append(line.split('\t')[0].replace('\n',''))
            temp_label.append(line.split('\t')[1].replace('\n',''))
    return function_sentence,label_test


preprocessor = hub.load("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
tfhub_handle_encoder = "https://tfhub.dev/google/experts/bert/pubmed/2"

In [108]:
# Set up model, epochs and steps
epochs = 2
batch_size = 8


# test_text, test_tags, unique_test_labels = load_file('test')
# test_sentences = preprocessor(test_text)
train_text, tags = load_file('train')
train_sentences = preprocessor(train_text)
tokens = preprocessor.tokenize(train_text)

# Since BERT tokenizer splits single words (interestingly -> interest, ##ing, ##ly), we need to add labels for each additional token
              #data                 #labels
# Before:   interest ##ing #ly      -> O
# After:    interest ##ing #ly      -> O, 0, 0
train_labels = []
for num_s, sentence in enumerate(tokens):
    temp = []
    for num_w, fword in enumerate(sentence):
        word = list(fword.numpy())
        if num_w <= len(tags[num_s]):
            old_tag = tags[num_s][num_w]
            temp.append(old_tag)
            for times in range(len(word)-1):
                temp.append(old_tag)
    l = len(temp)
    temp.extend([0]*(128-l))
    temp.insert(0,0)
    train_labels.append(tf.constant(temp[0:128]))
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences,train_labels)) # Builds dataset in a format that can be fed to the model.
train_ds = train_dataset.shuffle(len(tags),reshuffle_each_iteration=True)
train_ds = train_ds.batch(batch_size)


# Same as above but fot the test dataset.
test_text, test_tags = load_file('test')
test_sentences = preprocessor(test_text)
tokens = preprocessor.tokenize(test_text)
test_labels = []
for num_s, sentence in enumerate(tokens):
    temp = []
    for num_w, fword in enumerate(sentence):
        word = list(fword.numpy())
        if num_w <= len(test_tags[num_s]):
            old_tag = test_tags[num_s][num_w]
            temp.append(old_tag)
            for times in range(len(word)-1):
                temp.append(old_tag)
    l = len(temp)
    temp.extend([0]*(128-l))
    temp.insert(0,0)
    test_labels.append(tf.constant(temp[0:128]))
val_dataset = tf.data.Dataset.from_tensor_slices((test_sentences,test_labels))
val_ds = val_dataset.batch(batch_size)


In [109]:

# creates an optimizer with learning rate schedule
train_data_size = len(tags)
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)
optimizer = nlp.optimization.create_optimizer(5e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)


# Metrics and loss that can be used by a multiclass classificator
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
bert_classifier = build_classifier_model(14)


# Custom training loop. https://www.tensorflow.org/guide/keras/writing_a_training_loop_from_scratch
epoch_bar = master_bar(range(epochs))
pb_max_len = math.ceil(float(len(train_text))/float(batch_size))

for epoch in epoch_bar:
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()
    for step, (x_batch_train, y_batch_train) in progress_bar(enumerate(train_ds),total=pb_max_len, parent=epoch_bar):
        loss_value = train_step(x_batch_train, y_batch_train)
        
        if step % 50 == 0:
            print(f"Training loss (for one batch) at step {step}: {loss_value}")

    train_acc = train_acc_metric.result()
    print(f"Training accuracy over epoch {epoch}: {float(train_acc)}")
    train_acc_metric.reset_states()
    
    # Run a validationghjl loop at the end of each epoch.
    for x_batch_val, y_batch_val in val_ds:
        test_step(x_batch_val, y_batch_val)

    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print(f"Validation acc: {float(val_acc)}")
    print(f"Time taken: {time.time() - start_time}")


Start of epoch 0
Training loss (for one batch) at step 0: 2.052732467651367
Training accuracy over epoch 0: 0.8168797492980957
Validation acc: 0.9435221552848816
Time taken: 235.25398802757263

Start of epoch 1
Training loss (for one batch) at step 0: 0.14749933779239655
Training accuracy over epoch 1: 0.9466145634651184
Validation acc: 0.956787109375
Time taken: 189.85518217086792


In [110]:
# Prints a model summary and saves the model.
bert_classifier.summary()
bert_classifier.save('TrainedModel/', include_optimizer=False)

Model: "prediction"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 encoder (KerasLayer)           {'pooled_output': (  109482241   ['input_2[0][0]',                
                                None, 768),                       'input_3[0][0]',       



INFO:tensorflow:Assets written to: TrainedModel/assets


INFO:tensorflow:Assets written to: TrainedModel/assets


In [8]:

test_text, test_tags = test_result('test')
test_x = preprocessor(test_text)
bert_classifier = tf.keras.models.load_model('TrainedModel')  # Loads the NER model trained with the Ner_Training.ipynb notebook
prediction = bert_classifier.predict(test_x) # Predicts the label for each token of the preprocessed sentences
tokens = preprocessor.tokenize(test_text)
test_labels = []
for num_s, sentence in enumerate(tokens):
    temp = []
    for num_w, fword in enumerate(sentence):
        word = list(fword.numpy())
        if num_w < len(test_tags[num_s]):
            old_tag = test_tags[num_s][num_w]
            temp.append(old_tag)
            for times in range(len(word)-1):
                temp.append(old_tag)
    l = len(temp)
    temp.extend([0]*(128-l))
    temp.insert(0,0)
    test_labels.append(temp)

# The "prediction" variable has a score for each of the possible categories for a token'
#    ([CLS],[SEP],[PAD])        O          B-GENE      I-GENE    -> Index (0:3)
#           0.01              0.90         0.05      0.04        -> Predicted score for a single token
# THE SIZE OF PREDICTION IS:     (4) x (Num of tokens in a sentence) x (Total num of sentences)
# This part finds in which index of "prediction[sentence x][token x]" has the bigest number, then saves the asociated label for that index
sentence_tags = []
raw_sentences = []
for i, sentence in enumerate(prediction):
    temp_rel = []
    for n_wor, pred_word in enumerate(sentence):
        val = list(pred_word)
        if val[1] == max(val):
            temp_rel.append('O')
        elif val[2] == max(val):
            temp_rel.append('OD')
        elif val[3] == max(val) or val[5] == max(val):
            temp_rel.append('Med')
        elif val[4] == max(val):
            temp_rel.append('B-Air')
        elif val[5] == max(val):
            temp_rel.append('Supp')
        elif val[6] == max(val):
            temp_rel.append('Anti')
        elif val[7] == max(val):
            temp_rel.append('Temp')
        elif val[8] == max(val):
            temp_rel.append('B-Orgn')
        elif val[9] == max(val):
            temp_rel.append('I-Orgn')
        elif val[10] == max(val):
            temp_rel.append('B-Gtype')
        elif val[10] == max(val):
            temp_rel.append('I-Gtype')
        elif val[11] == max(val):
            temp_rel.append('I-Phase')
        elif val[11] == max(val):
            temp_rel.append('B-Phase')
        elif val[12] == max(val):
            temp_rel.append('I-Gversion')
        elif val[13] == max(val):
            temp_rel.append('B-Technique')
        elif val[13] == max(val):
            temp_rel.append('I-Technique')
    #raw_sentences.append(pre_txt[i])
    sentence_tags.append(temp_rel)
    
tokenizer = bert.tokenization.FullTokenizer('vocab.txt', do_lower_case=True)
pre_txt = []
for indx in range(len(test_text)):
    pre_txt.append(tokenizer.tokenize(test_text[indx]))
    





In [11]:
for sen_idx, pred_sentence in enumerate(sentence_tags):
    for word_idx, pred_tok in enumerate(pred_sentence):
        print(test_labels[sen_idx][word_idx+1],end='\t')
        print((pre_txt[sen_idx][word_idx]),end='\t')
        print(sentence_tags[sen_idx][word_idx])
    print('\n')

O	r	O
O	##po	O
O	##b	O
O	with	O
B-Supp	d	Med
B-Supp	##pd	O
O	2	O
B-Technique	(	B-Technique
I-Technique	chip	O
I-Technique	-	O
I-Technique	ex	B-Technique


O	gen	O
O	##otype	O
O	/	O
O	variation	O
O	:	O
B-Gtype	delta	B-Gtype


O	l	O
O	##rp	O
B-Supp	_	O
O	nh	O
O	##4	O
O	##cl	O


O	treatment	O
O	:	O
B-Med	glucose	O
I-Med	(	O
I-Med	2	O
I-Med	g	O
I-Med	/	O
I-Med	l	O
I-Med	)	O
I-Med	minimal	O
O	m	Med
O	##9	O
O	medium	Med
B-Supp	supplemented	Med
I-Supp	without	O


O	gen	O
O	##otype	O
O	:	O
B-Gtype	ga	B-Gtype
B-Gtype	##d	Med
B-Gtype	##x	O
0	-	Med
0	8	O
0	##my	B-Gtype


B-Gtype	wild	B-Gtype
B-Air	-	O
O	type	B-Gtype


O	cr	B-Gtype
O	##a	O
B-Supp	ace	O
B-Supp	##tate	O


O	culture	O
O	condition	O
O	:	O
B-Air	aero	B-Gtype
B-Air	##bic	B-Gtype


B-Gtype	w	B-Gtype
B-Gtype	##t	B-Gtype
O	_	O
B-Technique	rna	B-Gtype
B-Technique	##se	O


O	culture	O
O	##d	O
O	in	O
O	:	O
B-Med	m	Med
B-Med	##9	O
I-Med	minimal	Med
I-Med	media	O
O	with	O
B-Supp	0	Med
I-Supp	.	Med


O	chip	O
O	antibody	O
O	:	O
B-Anti	anti	O
O	-