In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)

from __future__ import absolute_import, division, print_function
import tensorflow_hub as hub
import tensorflow_text as tf_text  # A dependency of the preprocessing model
from official import nlp
import official.nlp.optimization
import numpy as np
import tensorflow as tf
import pandas as pd
import time
from fastprogress import master_bar, progress_bar
import math
import official.nlp.bert.tokenization
from official.nlp import bert

def load_file(file_ds, max_len=128):
    """Read ``dataset/<file_ds>.tsv`` and return sentences with numeric labels.

    The file is read as one ``token<TAB>label`` pair per line, with a blank
    line closing each sentence. Labels are converted to integer ids with
    ``number_category``.

    Args:
        file_ds: Base name (no extension) of the TSV file inside ``dataset/``.
        max_len: Fixed length of every returned label list. Longer lists are
            truncated and shorter ones are right-padded with 0.

    Returns:
        A ``(function_sentence, label_test)`` tuple of parallel lists:
        the sentences (tokens joined by a single space, duplicates dropped,
        first occurrence kept) and, for each sentence, a list of exactly
        ``max_len`` integer label ids.

    Raises:
        IndexError: If a non-blank line has no tab-separated label column.
        KeyError: Propagated from ``number_category`` for an unknown label.
    """
    function_sentence = []
    label_test = []
    seen_sentences = set()  # O(1) duplicate lookup instead of scanning the list

    def _store(words, labels):
        # Keep the sentence only if it is non-empty and not seen before.
        if not words:
            return
        sentence = ' '.join(words)
        if sentence in seen_sentences:
            return
        seen_sentences.add(sentence)
        fixed = labels[:max_len]
        fixed.extend([0] * (max_len - len(fixed)))
        function_sentence.append(sentence)
        label_test.append(fixed)

    temp_ds = []
    temp_label = []
    with open(f'dataset/{file_ds}.tsv', 'r') as file:
        for line in file:
            if not line.strip():
                # Blank (or whitespace-only) line: sentence boundary.
                _store(temp_ds, temp_label)
                temp_ds = []
                temp_label = []
            else:
                fields = line.rstrip('\n').split('\t')
                temp_ds.append(fields[0])
                temp_label.append(number_category(fields[1]))

    # The file may end without a trailing blank line; do not lose that sentence.
    _store(temp_ds, temp_label)

    return function_sentence, label_test

def number_category(name_category):
    """Translate a BIO tag string into the integer class id used for training.

    "O" maps to 1 and every entity tag maps to an id in 2..26; id 0 is never
    returned by this function. Several tags share an id: every Strain,
    Substrain and Vess tag is 12, both pH tags are 26 and both Temp tags
    are 25.

    Args:
        name_category: Tag exactly as written in the dataset, e.g. "B-Med".

    Returns:
        The integer class id for the tag.

    Raises:
        KeyError: If the tag is not one of the known categories.
    """
    begin_ids = {
        "B-Gversion": 2,
        "B-Technique": 4,
        "B-Med": 6,
        "B-Gtype": 8,
        "B-Orgn": 10,
        "B-Strain": 12,
        "B-Air": 13,
        "B-Substrain": 12,
        "B-pH": 26,
        "B-Supp": 15,
        "B-Vess": 12,
        "B-Agit": 17,
        "B-Anti": 19,
        "B-OD": 21,
        "B-Phase": 23,
        "B-Temp": 25,
    }
    inside_ids = {
        "I-Gversion": 3,
        "I-Technique": 5,
        "I-Med": 7,
        "I-Gtype": 9,
        "I-Orgn": 11,
        "I-Strain": 12,
        "I-Air": 14,
        "I-Substrain": 12,
        "I-pH": 26,
        "I-Supp": 16,
        "I-Vess": 12,
        "I-Agit": 18,
        "I-Anti": 20,
        "I-OD": 22,
        "I-Phase": 24,
        "I-Temp": 25,
    }
    tag_to_id = {"O": 1, **begin_ids, **inside_ids}
    return tag_to_id[name_category]


def build_classifier_model(num_classes):
    '''
    Builds (but does not train) a per-token classifier on top of a BERT encoder.

    The encoder is loaded from the module-level `tfhub_handle_encoder` handle
    with trainable weights, so training the returned model also fine-tunes it.

    Input of the returned model: a dict with the keys `input_word_ids`,
        `input_mask` and `input_type_ids` (int32 tensors), i.e. the kind of
        output a TF-Hub BERT preprocessing model produces.
    Args:
        num_classes: number of units of the final softmax layer.
    Output: an untrained `tf.keras.Model` named 'prediction' whose output is
        the softmax of a Dense layer applied to the encoder's
        `sequence_output` (after dropout).
    '''
    feature_names = ('input_word_ids', 'input_mask', 'input_type_ids')
    # Created in this order so the auto-generated input layer names stay stable.
    inputs = {
        feature: tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
        for feature in feature_names
    }

    bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='encoder')
    token_features = bert_encoder(inputs)['sequence_output']
    token_features = tf.keras.layers.Dropout(rate=0.1)(token_features)
    classifier_head = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')
    token_probs = classifier_head(token_features)
    return tf.keras.Model(inputs, token_probs, name='prediction')


@tf.function
def train_step(x, y):
    '''
    Performs one optimisation step on a single batch and returns its loss.

    Uses the module-level `bert_classifier`, `loss_fn`, `optimizer` and
    `train_acc_metric`; the metric is updated with this batch as a side effect.
    '''
    with tf.GradientTape() as tape:
        # The model ends in a softmax, so these are probabilities, not raw logits.
        batch_probs = bert_classifier(x, training=True)
        batch_loss = loss_fn(y, batch_probs)
    weights = bert_classifier.trainable_weights
    gradients = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))
    train_acc_metric.update_state(y, batch_probs)
    return batch_loss


@tf.function
def test_step(x, y):
    '''
    Runs the model in inference mode on one validation batch and folds the
    result into the module-level `val_acc_metric`. Returns nothing.
    '''
    val_probs = bert_classifier(x, training=False)
    val_acc_metric.update_state(y, val_probs)
    

def test_result(file_ds):
    with open(f'dataset/{file_ds}.tsv','r') as file:
        test_file = file.readlines()

    function_sentence = []
    label_test = []
    temp_ds = []
    temp_label = []
    for line in test_file:
        if line == '\n':  
            if len(temp_label) > 128:
                temp_label = temp_label[:128]
            if ' '.join(temp_ds) not in function_sentence:
                if temp_label and temp_ds:
                    function_sentence.append(' '.join(temp_ds))
                    label_test.append(temp_label)
            temp_ds = []
            temp_label = []
        else:
            temp_ds.append(line.split('\t')[0].replace('\n',''))
            temp_label.append(line.split('\t')[1].replace('\n',''))
    return function_sentence,label_test


# BERT preprocessing model (uncased English, version 3). It is called directly
# on lists of sentences and also exposes `.tokenize(...)`, both used below.
preprocessor = hub.load("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
# TF-Hub handle of the PubMed "experts" BERT encoder; read by
# build_classifier_model() when it creates the encoder layer.
tfhub_handle_encoder = "https://tfhub.dev/google/experts/bert/pubmed/2"

In [2]:
# Set up model, epochs and steps
epochs = 4
batch_size = 8
# Length of every label vector. load_file() pads/truncates word tags to the
# same length. NOTE(review): presumably this matches the preprocessor's
# sequence length -- confirm.
MAX_SEQ_LEN = 128


def expand_tags_to_wordpieces(word_tokens, word_tags, max_len=MAX_SEQ_LEN):
    """Repeat each word-level tag once per word piece and pad to `max_len`.

    The BERT tokenizer may split one word into several pieces
    (interestingly -> interest, ##ing, ##ly), so each piece needs a label:

        #data                      #labels
        Before:  interest ##ing ##ly  -> O
        After:   interest ##ing ##ly  -> O, O, O

    A leading 0 is inserted (position of the first special token) and the
    result is right-padded with 0 and cut to `max_len` entries.

    Args:
        word_tokens: output of `preprocessor.tokenize(texts)`; iterated as
            sentences -> words, where each word yields its word pieces via
            `.numpy()`.
        word_tags: one list of tags per sentence, indexed by word position.
        max_len: length of every returned label tensor.

    Returns:
        A list with one `tf.constant` of `max_len` labels per sentence.

    NOTE(review): this assumes word positions produced by the tokenizer line
    up one-to-one with the tokens of the TSV file -- confirm for tokens that
    contain punctuation.
    """
    all_labels = []
    for sent_idx, sentence in enumerate(word_tokens):
        sent_tags = word_tags[sent_idx]
        piece_labels = []
        for word_idx, word in enumerate(sentence):
            # Strict `<`: with `<=` the index word_idx == len(sent_tags) was
            # let through and raised IndexError on the lookup below.
            if word_idx < len(sent_tags):
                piece_labels.extend([sent_tags[word_idx]] * len(word.numpy()))
        piece_labels.extend([0] * (max_len - len(piece_labels)))
        piece_labels.insert(0, 0)
        all_labels.append(tf.constant(piece_labels[0:max_len]))
    return all_labels


# Training data.
train_text, tags = load_file('train')
train_sentences = preprocessor(train_text)
tokens = preprocessor.tokenize(train_text)
train_labels = expand_tags_to_wordpieces(tokens, tags)
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences,train_labels)) # Builds dataset in a format that can be fed to the model.
# Shuffled once; reshuffle_each_iteration=False keeps the same order in every epoch.
train_ds = train_dataset.shuffle(len(tags),reshuffle_each_iteration=False)
train_ds = train_ds.batch(batch_size)


# Same as above but for the test dataset (used for validation, not shuffled).
test_text, test_tags = load_file('test')
test_sentences = preprocessor(test_text)
tokens = preprocessor.tokenize(test_text)
test_labels = expand_tags_to_wordpieces(tokens, test_tags)
val_dataset = tf.data.Dataset.from_tensor_slices((test_sentences,test_labels))
val_ds = val_dataset.batch(batch_size)


In [3]:

# Hyper-parameters of the optimizer and of the classification head.
INIT_LR = 5e-5
WARMUP_FRACTION = 0.1   # share of all training steps spent warming up the learning rate
NUM_CLASSES = 27        # ids 0..26: 0 is padding, 1..26 come from number_category()

# Creates an optimizer with a learning rate schedule.
# train_ds is batched without dropping the last partial batch, so an epoch has
# ceil(n / batch_size) steps. Flooring here made the schedule finish before
# training did.
train_data_size = len(tags)
steps_per_epoch = math.ceil(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(WARMUP_FRACTION * num_train_steps)
optimizer = nlp.optimization.create_optimizer(INIT_LR, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)


# Metrics and loss that can be used by a multiclass classifier.
# from_logits=False because the model's last layer already applies a softmax.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
bert_classifier = build_classifier_model(NUM_CLASSES)


In [4]:
# Attach the optimizer to the Keras model; no loss or metrics are passed here.
# NOTE(review): the custom loop below calls `optimizer`, `loss_fn` and the
# metrics directly, so this compile() looks optional -- confirm before removing.
bert_classifier.compile(optimizer=optimizer)

In [5]:


# Custom training loop. https://www.tensorflow.org/guide/keras/writing_a_training_loop_from_scratch
epoch_bar = master_bar(range(epochs))
# Number of batches per epoch, used only to size the inner progress bar.
pb_max_len = math.ceil(float(len(train_text))/float(batch_size))

for epoch in epoch_bar:
    # Report epochs 1-based everywhere (the accuracy line used to be 0-based
    # while the "Start of epoch" line was 1-based).
    epoch_number = epoch + 1
    print("\nStart of epoch %d" % (epoch_number,))
    start_time = time.time()
    for step, (x_batch_train, y_batch_train) in progress_bar(enumerate(train_ds),total=pb_max_len, parent=epoch_bar):
        loss_value = train_step(x_batch_train, y_batch_train)

        if step % 50 == 0:
            print(f"Training loss (for one batch) at step {step}: {loss_value}")

    # Accuracy accumulated over all batches of this epoch, then reset.
    train_acc = train_acc_metric.result()
    print(f"Training accuracy over epoch {epoch_number}: {float(train_acc)}")
    train_acc_metric.reset_states()

    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in val_ds:
        test_step(x_batch_val, y_batch_val)

    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print(f"Validation acc: {float(val_acc)}")
    print(f"Time taken: {time.time() - start_time}")


Start of epoch 1
Training loss (for one batch) at step 0: 6.2161455154418945
Training accuracy over epoch 0: 0.7471411824226379
Time taken: 269.8068301677704

Start of epoch 2
Training loss (for one batch) at step 0: 0.19663934409618378
Training accuracy over epoch 1: 0.9581032991409302
Time taken: 270.4076099395752

Start of epoch 3
Training loss (for one batch) at step 0: 0.12945497035980225
Training accuracy over epoch 2: 0.9689717888832092
Time taken: 279.6406960487366

Start of epoch 4
Training loss (for one batch) at step 0: 0.08624684810638428
Training accuracy over epoch 3: 0.9768335819244385
Time taken: 240.59756016731262


In [6]:
# Prints a layer-by-layer model summary and saves the model to the
# TrainedModel/ directory. The optimizer state is left out
# (include_optimizer=False); the next cell reloads the model from that
# directory and only calls predict() on it.
bert_classifier.summary()
bert_classifier.save('TrainedModel/', include_optimizer=False)

Model: "prediction"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 encoder (KerasLayer)           {'sequence_output':  109482241   ['input_2[0][0]',                
                                 (None, None, 768),               'input_3[0][0]',       



INFO:tensorflow:Assets written to: TrainedModel/assets


INFO:tensorflow:Assets written to: TrainedModel/assets


In [7]:

test_text, exp_test = test_result('test')
test_x = preprocessor(test_text)
bert_classifier = tf.keras.models.load_model('TrainedModel')  # Loads the NER model saved to TrainedModel/ by the previous cell
prediction = bert_classifier.predict(test_x) # Predicts the label for each token of the preprocessed sentences
tokens = preprocessor.tokenize(test_text)

# Expand the expected (gold) word-level tags to one tag per word piece, put a
# 0 in front (first special token) and pad with 0 up to 128 entries.
# Unlike the training labels these lists are not truncated.
exp_labels = []
for num_s, sentence in enumerate(tokens):
    temp = []
    for num_w, fword in enumerate(sentence):
        if num_w < len(exp_test[num_s]):
            temp.extend([exp_test[num_s][num_w]] * len(fword.numpy()))
    temp.extend([0]*(128-len(temp)))
    temp.insert(0,0)
    exp_labels.append(temp)

# "prediction" holds, for every token position, one score per class id
# (0..26, the ids produced by number_category(); 0 is the padding id).
# For each position we look for the class with the highest score and store the
# entity name that belongs to it, without the B-/I- prefix.
#
# The table is checked top to bottom and the first group containing the
# maximum wins. Strain, Substrain and Vess all share class id 12 in
# number_category(), so id 12 can only be reported as 'Strain'.
# Positions whose best class is 0 add NOTHING to the sentence, so
# sentence_tags[i] can be shorter than the number of token positions.
LABEL_BY_CLASS_IDS = (
    ((1,), 'O'),
    ((2, 3), 'Gversion'),
    ((4, 5), 'Technique'),
    ((6, 7), 'Med'),
    ((8, 9), 'Gtype'),
    ((10, 11), 'Orgn'),
    ((12,), 'Strain'),
    ((13, 14), 'Air'),
    ((26,), 'pH'),
    ((15, 16), 'Supp'),
    ((17, 18), 'Agit'),
    ((19, 20), 'Anti'),
    ((21, 22), 'OD'),
    ((23, 24), 'Phase'),
    ((25,), 'Temp'),
)

sentence_tags = []
raw_sentences = []
for sentence in prediction:
    temp_rel = []
    for pred_word in sentence:
        scores = list(pred_word)
        best_score = max(scores)  # computed once per token instead of once per branch
        for class_ids, label in LABEL_BY_CLASS_IDS:
            if any(scores[class_id] == best_score for class_id in class_ids):
                temp_rel.append(label)
                break
    sentence_tags.append(temp_rel)

# Word pieces of every test sentence, used to print them next to the tags.
tokenizer = bert.tokenization.FullTokenizer('vocab.txt', do_lower_case=True)
pre_txt = [tokenizer.tokenize(text) for text in test_text]
    





In [8]:
# Count distinct sentences in test, train and their union. If the first number
# printed is smaller than the last one, some sentences occur in both splits.
all_text = test_text + train_text
single_test, single_train, single_all = (
    list(set(texts)) for texts in (test_text, train_text, all_text)
)

print(len(single_all), len(single_test), len(single_train), len(single_train + single_test))

354 39 317 356


In [9]:
# One line per predicted tag: expected tag, word piece, predicted tag
# (tab-separated). exp_labels carries a leading 0, hence the +1 offset.
for sen_idx, pred_sentence in enumerate(sentence_tags):
    for word_idx, pred_tok in enumerate(pred_sentence):
        print(exp_labels[sen_idx][word_idx+1], pre_txt[sen_idx][word_idx], pred_tok, sep='\t')
    print('\n')

O	chip	O
B-Gtype	-	Gtype
O	f	O
O	##nr	O
B-Supp	_	Supp
O	f	O
O	##nr	O
O	##8	O
O	##my	O
O	##c	O
B-Supp	_	Supp
O	glucose	O
B-Air	_	Supp
O	nh	O
O	##4	O
O	##cl	O


O	e	O
O	.	O
O	coli	O
O	k	O
O	-	O
O	12	O
O	mg	O
O	##16	O
O	##55	O
O	w	O
O	##t	O
O	,	O
O	and	O
O	δ	O
O	##cr	O
O	##a	O
B-Phase	were	O
I-Phase	grown	O
B-Air	to	Air
O	mid	O
B-Temp	-	Air
O	log	O
B-Med	phase	O
I-Med	aero	Med
I-Med	##bic	O
I-Med	##ally	Med
I-Med	at	O
O	37	O
O	##°	O
O	##c	O
O	in	O
B-Supp	m	O
B-Supp	##9	O
I-Supp	minimal	O
O	media	Supp
O	supplemented	O


B-Air	aero	Air
B-Air	##bic	Air
O	cultures	O


O	medium	O
O	:	O
B-Med	lb	Supp


O	cells	O
O	were	O
O	grown	O
O	aero	O
O	##bic	O
O	##ally	O
O	(	Air
O	70	Air
O	%	Air
O	n	Air
O	##2	Air
O	,	Air
O	25	Air
O	%	Air
O	o	Air
O	##2	Air
O	,	Air
O	and	Air
O	5	Air
O	%	Air
O	co	Air
O	##2	Air
O	)	O
O	or	O
B-Air	ana	O
B-Air	##ero	O
B-Air	##bic	O
B-Air	##ally	O
I-Air	(	Air
I-Air	95	O
I-Air	%	Air
I-Air	n	Air
I-Air	##2	Air
I-Air	and	O
I-Air	5	Air
I-Air	%	Air
I-Air	co	Air
I-Air	##2	Air
I-Air	)	O