In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from __future__ import absolute_import, division, print_function

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as hub_text
from official import nlp
import official.nlp.optimization
import itertools
import numpy as np
import pandas as pd
import time
from fastprogress import master_bar, progress_bar
import math
import official.nlp.bert.tokenization
from official.nlp import bert
import string
tf.get_logger().setLevel('ERROR')



def encode_sentence(s, tokenizer):
    '''
    Turn a single sentence into a list of vocabulary ids ending in [SEP].

    The sentence is split with tokenizer.tokenize, the literal token '[SEP]'
    is placed after the last piece, and the whole sequence is mapped through
    tokenizer.convert_tokens_to_ids.

    Input:
        1. s: the sentence to encode.
        2. tokenizer: object providing tokenize() and convert_tokens_to_ids().
    Output:
        Whatever tokenizer.convert_tokens_to_ids returns for the token list.
    '''
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)



def bert_encode(sentence_dict, tokenizer):
    '''
    Preprocess sentence pairs into the input format expected by BERT.

    Every example is laid out as
        [CLS] <tokens of gene1> [SEP] <tokens of gene2> [SEP]
    where both [SEP] markers are added by encode_sentence.

    Input:
        1. sentence_dict: dict with one entry per example under each key:
            'gene1': first segment of every pair, e.g.
                ['These results indicate that the GeneReg']
            'gene2': second segment of every pair, e.g.
                ['and acrD drug efflux genes are directly regulated by RegProtein protein ( BaeR protein ) .']
        2. tokenizer: object providing tokenize() and convert_tokens_to_ids().

    Output:
        Dict of dense tensors (ragged rows are converted with to_tensor()):
            'input_word_ids': vocabulary ids of each example
            'input_mask': 1 for every token that belongs to the example
            'input_type_ids': 0 for [CLS] and the first segment, 1 for the second
        NOTE(review): positions added by to_tensor() are expected to be 0
        (its documented default fill value) - confirm against the TF version in use.
    '''
    sentence1 = tf.ragged.constant([
        encode_sentence(s, tokenizer)
        for s in np.array(sentence_dict["gene1"])])
    sentence2 = tf.ragged.constant([
        encode_sentence(s, tokenizer)
        for s in np.array(sentence_dict["gene2"])])

    # One single-id [CLS] row per example, so it can be concatenated in front of each sentence.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    # The mask is built while the rows are still ragged, so only real tokens get a 1.
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    # Segment ids: [CLS] and the first sentence are segment 0, the second sentence is segment 1.
    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1).to_tensor()

    inputs = {
        'input_word_ids': input_word_ids.to_tensor(),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids}

    return inputs



def sentence_token_tagging(test_sentence_tags, tokenized_sentences):
    '''
    Rewrite the genes found by the NER model.

    Input:
        1. test_sentence_tags: one list of tag strings per sentence, as assigned
           by the model during prediction. Tags starting with 'B' or 'I' mark
           entity tokens; every other tag is treated as a non-entity token.
        2. tokenized_sentences: one list of tokens per sentence, parallel to the tags.

    Output (a two-element list):
        1. NerSentence: per sentence, the token list in which each found entity
           was replaced by the single word GENE
            [The GENE protein has two activation domains , one of which is an GENE ...]

        2. FinalEntities: per sentence, [] when no entity was found, otherwise a
           one-element list holding the list of entity strings. Each entity string
           is its tokens joined by spaces and keeps a trailing space.
            [['AraC ', 'arac xyls family domain ', ...]]

    Also prints the total number of entities found over all sentences.
    '''
    num_entities = 0
    FinalEntities, NerSentence = [], []
    for num, tags in enumerate(test_sentence_tags):
        entity = ''
        masked_words = []
        for num_word, (entity_tags, words) in enumerate(zip(tags, tokenized_sentences[num])):
            is_begin = entity_tags.startswith('B')
            is_inside = entity_tags.startswith('I')
            # An I- tag opens a new entity when it follows an O tag or when it is
            # the first token of the sentence. The explicit position check matters:
            # tags[num_word - 1] at position 0 would read the LAST tag of the
            # sentence, and the token could be silently dropped from both outputs.
            starts_entity = is_begin or (
                is_inside and (num_word == 0 or tags[num_word - 1].startswith('O')))

            if starts_entity:
                # '[SEP] ' is only an internal delimiter used to split entities below.
                entity += '[SEP] ' + str(words) + ' '
                num_entities += 1
                masked_words.append('GENE')
            elif is_inside:
                # Continuation of the current entity: no extra GENE placeholder.
                entity += str(words) + ' '
            else:
                masked_words.append(words)

        # The text before the first '[SEP] ' is not an entity, hence the [1:].
        FinalEntities.append([entity.split('[SEP] ')[1:]] if entity != '' else [])
        NerSentence.append(" ".join(masked_words).replace('  ', ' ').split(' '))

    print(f'Completed. Found {num_entities} genes.')
    return [NerSentence, FinalEntities]


TensorFlow Addons offers no support for the nightly versions of TensorFlow. Some things might work, some other might not. 
If you encounter a bug, do not file an issue on GitHub.


In [2]:
# Load the tokenizer, the TensorFlow Hub preprocessor and the example sentences.
tokenizer = bert.tokenization.FullTokenizer('vocabNER.txt', do_lower_case=False)
preprocessor = hub.load("https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3")
with open('../Tools/ris-sentences-ECO.txt', 'r') as handle:
    eco_sentence = handle.readlines()


raw_text = preprocessor(eco_sentence)  # Preprocess the sentences
bert_classifier = tf.keras.models.load_model('NERModel')  # NER model trained with the Ner_Training.ipynb notebook
prediction = bert_classifier.predict(raw_text)  # One score vector per token of every preprocessed sentence

# Tokenize every sentence and glue word pieces ('##' continuations) back together,
# so each entry of pre_txt is a list of whole words.
pre_txt = [
    ' '.join(tokenizer.tokenize(line)).replace(' ##', '').replace('##', '').replace('  ', ' ').split(' ')
    for line in eco_sentence
]


# "prediction" is indexed as prediction[sentence][token][category]; per the author's notes
# the four categories are:
#    ([CLS],[SEP],[PAD])        O          B-GENE      I-GENE    -> Index (0:3)
#           0.01              0.90         0.05      0.04        -> Predicted score for a single token
# For every token the first of columns 1, 2, 3 holding the maximum score decides the label.
# A token whose maximum is only in column 0 gets no label at all.
ner_label_by_column = ((1, 'O'), (2, 'B-GENE'), (3, 'I-GENE'))
sentence_tags = []
raw_sentences = []
TF_Regulator_, RegulatedGene_ = [], []
for sent_idx, token_scores in enumerate(prediction):
    tags = []
    for scores in token_scores:
        scores = list(scores)
        top = max(scores)
        for column, label in ner_label_by_column:
            if scores[column] == top:
                tags.append(label)
                break
    # Keep only the sentences that ended up with exactly one label per whole word.
    if len(pre_txt[sent_idx]) == len(tags):
        raw_sentences.append(pre_txt[sent_idx])
        sentence_tags.append(tags)


# Replace the found entities by the GENE placeholder and collect them.
ner_sentence, final_entities = sentence_token_tagging(sentence_tags, raw_sentences)

Completed. Found 4533 genes.


In [3]:

# Mark the first two entities of every sentence with special tokens: the first entity is put
# between [E1]-[/E1] and the second between [E2]-[/E2]; any further entity is written back as [entity].
# The approach is explained at https://medium.com/e-bot7-tech/matching-the-blanks-78e8063794b5
# but it is not implemented in this code.
# Only sentences that end up containing both [/E1] and [/E2] are kept in output_sentence.
output_sentence = []
for line in range(len(ner_sentence)):
    new_line = ' '.join(ner_sentence[line])
    genes_in_sentence = 0
    prducts_in_sentence = 0
    # sentence_token_tagging stores [] for a sentence without any entity, so indexing [0]
    # unconditionally would raise IndexError; such a sentence simply has nothing to mark.
    entities_in_line = final_entities[line][0] if final_entities[line] else []
    for x in range(len(entities_in_line)):
        # Each step substitutes the left-most remaining GENE placeholder (count=1).
        # NOTE(review): ' [ ' only matches a bracket preceded by a space, so an entity at the very
        # start of the line keeps its literal '[ ' instead of the opening marker - confirm this is intended.
        if genes_in_sentence == 0:
            new_line = new_line.replace('GENE',f'[ {entities_in_line[x]} ]',1).replace('  ',' ').replace(' [ ','[E1]').replace(' ] ','[/E1]')
            genes_in_sentence += 1
        elif prducts_in_sentence == 0:
            new_line = new_line.replace('GENE',f'[ {entities_in_line[x]} ]',1).replace('  ',' ').replace(' [ ','[E2]').replace(' ] ','[/E2]')
            prducts_in_sentence += 1

        else:
            new_line = new_line.replace('GENE',f' [{entities_in_line[x]}] ',1).replace('  ',' ')

    if '[/E2]' in new_line and '[/E1]' in new_line:
        output_sentence.append(new_line.replace('  ',' '))



# At this point every entry of output_sentence contains both a [/E1] and a [/E2] marker,
# i.e. two entities delimited by [E1]...[/E1] and [E2]...[/E2].
# Example:
##  [E1]AraC[/E1] seems to slightly repress [E2]arac[/E2] ( i . e . , below our cut - off level of 2 . 5 - fold ) .
# Each sentence is cut right after the word holding [/E1] and the two halves are re-joined with '[SEP] '.
input_sentences = []
for e,x in enumerate(output_sentence):
    # NOTE(review): this strips the leading space from the stored entry only; the loop below
    # still iterates over x, the unstripped string - confirm whether x was meant to be updated.
    if x[0] == ' ':
        output_sentence[e] = output_sentence[e][1:]
    temp = ''      # words accumulated since the last reset
    list_rel = []  # collected segments: [text up to [/E1], text after it]
    n = 0          # number of closing markers ([/E1] / [/E2]) seen so far
    for i, word in enumerate(x.split(' ')):
        temp += word + ' '
        # Once both closing markers were seen, the remaining words extend the last segment.
        if n == 2:
            list_rel[-1] += word + ' '
            
        if '[/E1]' in word : # Everything up to and including the word holding [/E1] makes sentence 1:     i.e [AraC]
            list_rel.append(temp)
            # Start accumulating afresh only after the first closing marker.
            if n < 1:
                temp = ''
            n += 1
            
        if '[/E2]' in word: # Everything accumulated since [/E1], up to and including the word holding [/E2], starts sentence 2:      i.e [seems to slightly repress arac ( i . e . , below our cut - off level of 2 . 5 - fold ) . ]
            list_rel.append(temp)
            # Only true when [/E2] is met before any [/E1].
            if n < 1:
                temp = ''
            n += 1
    # The result is sentence 1 and sentence 2 joined with a [SEP] token between them; the
    # [E1]/[E2] markers are turned back into plain brackets and double spaces are collapsed.
    # [[AraC [SEP] seems to slightly repress arac ( i . e . , below our cut - off level of 2 . 5 - fold ) . ]]
    input_sentences.append('[SEP] '.join(list_rel).replace('[/E1]','] ').replace('[/E2]','] ').replace('[E1]',' [').replace('[E2]',' [').replace('  ',' '))
    

# Build one label list and one [sentence 1, sentence 2] split for every entry of input_sentences.
labels = []
sentences = []
for joined_sentence in input_sentences:
    word_labels = []
    after_sep = False  # becomes True once a word containing [SEP] has been seen
    for token in joined_sentence.split(' '):
        # Words following the [SEP] word are labeled I-Rel; words before it are labeled O.
        if after_sep:
            word_labels.append('I-Rel')
        if '[SEP]' in token:
            after_sep = True
        if token in ('GeneReg', 'RegProtein'):
            # A placeholder word ends the I-Rel run and always contributes an O label.
            after_sep = False
            word_labels.append('O')
        elif not after_sep:
            word_labels.append('O')
    # Drop the brackets and cut the sentence into sentence 1 and sentence 2 at the separator.
    without_brackets = joined_sentence.replace('[', '').replace(']', '').replace('  ', ' ')
    sentences.append(without_brackets.split('SEP '))
    labels.append(word_labels)

In [6]:

# Load the relation-extraction model and the tokenizer.
bert_classifier = tf.keras.models.load_model('REModel/')
tokenizer = bert.tokenization.FullTokenizer(vocab_file="vocabNER.txt", do_lower_case=False)


# Model input: segment 'gene1' is sentence 1, segment 'gene2' is sentence 1 followed by sentence 2.
gene1_test = [pair[0] for pair in sentences]
gene2_test = [pair[0] + pair[1] for pair in sentences]
full_sentence = [pair[0] + pair[1] for pair in sentences]
sentence_test = {'gene1': gene1_test, 'gene2': gene2_test}


# Word pieces of every full sentence (the '##' continuation prefixes are kept here).
pre_txt = [
    ' '.join(tokenizer.tokenize(text)).replace('  ', ' ').split(' ')
    for text in full_sentence
]

test_text = bert_encode(sentence_test, tokenizer)
prediction = bert_classifier.predict(test_text)
# for x in pre_txt:
#     x.extend(['PAD']*(128-len(x)))


# For every token, each of the checked columns that holds the maximum score adds its label
# (1 -> O, 2 -> I-Rel, 0 -> PAD, tested in that order and independently of each other).
# NOTE(review): bert_encode puts [CLS] in front and a [SEP] after each segment, while pre_txt
# has neither, so position k of a prediction row may not describe pre_txt[i][k] - confirm alignment.
re_label_checks = ((1, 'O'), (2, 'I-Rel'), (0, 'PAD'))
example_tags = []
raw_example = []
for sent_idx, token_scores in enumerate(prediction):
    word_pieces = pre_txt[sent_idx]
    tags = []
    for scores in token_scores[:len(word_pieces)]:
        scores = list(scores)
        top = max(scores)
        tags.extend(label for column, label in re_label_checks if scores[column] == top)
    # Keep only sentences that received exactly one label per word piece.
    if len(word_pieces) == len(tags):
        raw_example.append(word_pieces)
        example_tags.append(tags)


# Show the first kept sentence, one word piece and its label per line.
for piece, tag in zip(raw_example[0], example_tags[0]):
    print(piece.replace(' ##', '').replace('##', ''), tag)

The O
results O
of O
the O
present O
study O
extend O
these O
observations O
and O
show O
that O
the O
Met O
R PAD
protein O
also O
stimulate PAD
s PAD
the I-Rel
in O
v O
it I-Rel
ro I-Rel
expression O
of I-Rel
met I-Rel
H I-Rel
and I-Rel
that I-Rel
both I-Rel
the PAD
Met I-Rel
E I-Rel
and I-Rel
Met I-Rel
H I-Rel
proteins O
synthesized PAD
in PAD
v PAD
it O
ro O
are O
en PAD
zy O
matical O
ly I-Rel
active O
. O


In [7]:

# Rebuild the sentence pairs: 'gene1' is sentence 1 and 'gene2' is sentence 2 of every entry.
gene1_test = [x[0] for x in sentences]
gene2_test = [x[1] for x in sentences]
sentence_test = {'gene1': gene1_test, 'gene2': gene2_test}


# For every sentence, collect the positions whose I-Rel score (column 2) is the maximum.
# This rebinds sentence_tags, which previously held the NER tags.
sentence_tags = []
for sentence in prediction:
    temp_rel = []
    for n_wor, pred_word in enumerate(sentence):
        val = list(pred_word)
        if val[2] == max(val):
            temp_rel.append(n_wor)
    sentence_tags.append(temp_rel)


# Whole-word version of every sentence pair ('##' word pieces are glued back together).
complete_sentence = []
for x in range(len(sentence_tags)):
    complete_sentence.append(' '.join(tokenizer.tokenize(sentence_test['gene1'][x]) + tokenizer.tokenize(sentence_test['gene2'][x])).replace(' ##','').replace('##','').split(' '))


# Render every sentence with the predicted relation span wrapped in ** **.
# NOTE(review): the positions come from the model output while complete_sentence holds
# whole words, so the two index spaces may not line up exactly - confirm.
predicted_relations = []
for sent_num, word_index in enumerate(sentence_tags):
    if len(word_index) > 1:
        relationship_end = max(word_index)
        relationship_start = min(word_index)
        # The slice end is exclusive: the word at relationship_end opens sentence_end.
        sentence_relation = ' '.join(complete_sentence[sent_num][relationship_start:relationship_end])
        sentence_start = ' '.join(complete_sentence[sent_num][:relationship_start]).replace(' ##','').replace('##','')
        sentence_end = ' '.join(complete_sentence[sent_num][relationship_end:]).replace(' ##','').replace('##','')
        predicted_relations.append(sentence_start + ' **' + sentence_relation + '** ' + sentence_end)
    else:
        # Fewer than two tagged positions: no span can be delimited. Emit this sentence
        # unmarked instead of reusing the strings computed for a previous sentence
        # (which would also be undefined for the very first sentence).
        predicted_relations.append(' '.join(complete_sentence[sent_num]))
    #predicted_relations.append(sentence_relation)

for x in predicted_relations[0:20]:
    print(x.replace('[ ','[').replace(' ]',']'))
    #print(x.replace('[ ','').replace(' ]',''))

The results of the present study extend these observations and show that the MetR protein also stimulates the in **vitro expression of metH and that both the MetE and MetH proteins synthesized in vitro are enzymatically active .** 
Operons observed to be differentially expressed include not only all four of the transcripts , hmpA , ytfE , ygbA , and hcp - hcr , known or predicted to be **NsrR - regulated ( 4 , 40 ) , but also other transcripts predicted to be regulated by nitrite or RNS generated from nitrite , for example , nitric oxide ( 11 ) .** 
It appears that binding of the metJ gene product may prevent binding **of RNA polymerase to the metB and Jl promoters and , under strongly repressing conditions , may also decrease the binding to the J2 promoter .** 
Both inaA and marRAB are known Rob - regulated genes ( 5 , 21 **) .** 
Identification of the DNAbinding domain of the OmpR protein required for transcriptional activation of the ompF and ompC **genes of Escherichia coli by in v