In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from __future__ import absolute_import, division, print_function

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as hub_text
from official import nlp
import official.nlp.optimization
import itertools
import numpy as np
import pandas as pd
import time
from fastprogress import master_bar, progress_bar
import math
import official.nlp.bert.tokenization
from official.nlp import bert
import string

# Raise the threshold of TensorFlow's Python logger to ERROR so that
# lower-severity messages are not emitted while the notebook runs.
tf.get_logger().setLevel('ERROR')



def encode_sentence(s, tokenizer):
    '''
    Turn a single piece of text into vocabulary ids terminated by [SEP].

    Input:
        1. s: the text to encode (one sentence or sentence fragment).
        2. tokenizer: object providing tokenize(text) and
           convert_tokens_to_ids(tokens).

    Output:
        The result of tokenizer.convert_tokens_to_ids for the tokens of `s`
        followed by the literal '[SEP]' token.
    '''
    word_pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(word_pieces)



def bert_encode(sentence_dict, tokenizer):
    '''
    Build the three BERT input tensors for a batch of text pairs.

    Every row is laid out as
        [CLS] <gene1 tokens> [SEP] <gene2 tokens> [SEP]
    where each [SEP] is the one appended by encode_sentence.

    Input:
        1. sentence_dict: dict with two sequences of strings, read pairwise
           (presumably of equal length - the rows are concatenated one to one):
            'gene1':
                ['These results indicate that the GeneReg', ...]
            'gene2':
                ['and acrD drug efflux genes are directly regulated by ...', ...]
        2. tokenizer: forwarded to encode_sentence and also used to look up
           the id of '[CLS]'.

    Output:
        Dict of dense tensors; rows shorter than the longest one are padded
        by RaggedTensor.to_tensor() with its default fill value (0):
            'input_word_ids': token ids of the whole pair,
            'input_mask':     1 on every real token position,
            'input_type_ids': 0 for [CLS] and the first segment,
                              1 for the second segment.
    '''
    first_segment = tf.ragged.constant([
        encode_sentence(s, tokenizer)
        for s in np.array(sentence_dict["gene1"])])
    second_segment = tf.ragged.constant([
        encode_sentence(s, tokenizer)
        for s in np.array(sentence_dict["gene2"])])

    # One single-element [CLS] row per example so it can be concatenated
    # in front of every ragged row.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * first_segment.shape[0]
    input_word_ids = tf.concat([cls, first_segment, second_segment], axis=-1)

    # The mask is built while the ids are still ragged, so only real token
    # positions receive a 1; padding is introduced afterwards by to_tensor().
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    type_cls = tf.zeros_like(cls)
    type_first = tf.zeros_like(first_segment)
    type_second = tf.ones_like(second_segment)
    input_type_ids = tf.concat(
        [type_cls, type_first, type_second], axis=-1).to_tensor()

    return {
        'input_word_ids': input_word_ids.to_tensor(),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids}



def sentence_token_tagging(test_sentence_tags, tokenized_sentences):
    '''
    Replace the entities found by the NER model with the placeholder GENE.

    Input:
        1. test_sentence_tags: one list of tag strings per sentence. A tag
           starting with 'B' opens an entity, a tag starting with 'I'
           continues the current entity (or opens one when nothing can be
           continued), any other tag marks an ordinary token.
        2. tokenized_sentences: one list of tokens per sentence, aligned
           position by position with the tags.

    Output (a two-element list):
        1. NerSentence: per sentence, the token list in which every entity
           was replaced by the single word GENE
            [The GENE protein has two activation domains , one of which is an GENE ...]

        2. FinalEntities: per sentence, [] when no entity was found,
           otherwise a one-element list holding the list of entity strings.
           Each entity is its tokens joined by spaces and keeps a trailing
           space
            [['AraC ', 'arac xyls family domain ', ...]]

    Side effect: prints the total number of entities found.
    '''
    num_entities = 0
    NerSentence, FinalEntities = [], []

    for sent_idx, tags in enumerate(test_sentence_tags):
        tokens = tokenized_sentences[sent_idx]
        entities = []   # entity strings of this sentence, in order
        masked = []     # sentence tokens with entities collapsed to 'GENE'

        for pos, (tag, word) in enumerate(zip(tags, tokens)):
            is_begin = tag.startswith('B')
            is_inside = tag.startswith('I')

            if not (is_begin or is_inside):
                masked.append(word)
                continue

            # An 'I' tag opens a new entity when there is nothing to
            # continue: at the start of the sentence (looking at index -1
            # would wrap around to the LAST tag), right after an 'O' tag,
            # or when no entity has been opened in this sentence yet.
            opens_entity = (
                is_begin
                or pos == 0
                or tags[pos - 1].startswith('O')
                or not entities
            )
            if opens_entity:
                entities.append(str(word) + ' ')
                num_entities += 1
                masked.append('GENE')
            else:
                entities[-1] += str(word) + ' '

        FinalEntities.append([entities] if entities else [])
        NerSentence.append(' '.join(masked).replace('  ', ' ').split(' '))

    print(f'Completed. Found {num_entities} genes.')
    return [NerSentence, FinalEntities]


def consecutive(data, stepsize=1):
    '''
    Split `data` into runs in which neighbouring values differ by `stepsize`.

    Input:
        1. data: 1-D sequence of numbers.
        2. stepsize: required difference between neighbours of one run.

    Output:
        The list returned by np.split: one array per run, in order.
            consecutive([1, 2, 3, 7, 8, 10]) -> [1 2 3], [7 8], [10]
    '''
    is_break = np.diff(data) != stepsize
    # np.diff position k compares elements k and k + 1, so a new run
    # starts one position after every break.
    cut_points = np.where(is_break)[0] + 1
    return np.split(data, cut_points)


In [3]:
# Load the cased BERT tokenizer (vocabulary file vocabNER.txt), the TF-Hub
# cased-BERT preprocessor and the example data, one text line per entry.
tokenizer = bert.tokenization.FullTokenizer('vocabNER.txt', do_lower_case=False)
preprocessor = hub.load( "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3")
with open('Tools/ris-sentences-ECO.txt','r') as file:
    eco_sentence = file.readlines()


raw_text = preprocessor(eco_sentence) # Preprocess the sentences into model inputs
bert_classifier = tf.keras.models.load_model('NERModel')  # Loads the NER model trained with the Ner_Training.ipynb notebook
prediction = bert_classifier.predict(raw_text) # Predicts the label scores for each token of the preprocessed sentences

# Tokenize every sentence and glue the continuation pieces (those prefixed
# with "##") back onto the preceding piece, so that pre_txt[i] is the list of
# whole words of sentence i.
pre_txt = []
for indx in range(len(eco_sentence)):
    pre_txt.append(' '.join(tokenizer.tokenize(eco_sentence[indx])).replace(' ##','').replace('##','').replace('  ',' ').split(' '))


# "prediction" holds one score per possible category for every token:
#    ([CLS],[SEP],[PAD])        O          B-GENE      I-GENE    -> Index (0:3)
#           0.01              0.90         0.05      0.04        -> Predicted score for a single token
# It is indexed below as prediction[sentence][token][category].
# For every token the category with the biggest score is looked up and the
# associated label is stored. Ties are resolved in the order O, B-GENE, I-GENE.
# A token whose biggest score belongs only to category 0 adds NO label, so a
# sentence is kept only when the number of labels equals the number of whole
# words in pre_txt[i]; all other sentences are silently dropped.
sentence_tags = []
raw_sentences = []
TF_Regulator_, RegulatedGene_ = [],[]  # not used in this cell
for i, sentence in enumerate(prediction):
    temp_rel = []
    for n_wor, pred_word in enumerate(sentence):
        val = list(pred_word)
        if val[1] == max(val):
            temp_rel.append('O')
        elif val[2] == max(val):
            temp_rel.append('B-GENE')
        elif val[3] == max(val):
            temp_rel.append('I-GENE')
    if len(pre_txt[i]) == len(temp_rel):
        raw_sentences.append(pre_txt[i])
        sentence_tags.append(temp_rel)


# Replace every found entity by the word GENE and collect the entity texts.
ner_sentence, final_entities  = sentence_token_tagging(sentence_tags,raw_sentences)

Completed. Found 4533 genes.


In [185]:

# Put the entities back into every masked sentence. The first GENE placeholder
# is wrapped in [E1]...[/E1], the second one in [E2]...[/E2] and every further
# placeholder is simply restored as "[entity]". Only sentences that end up
# with both an [/E1] and an [/E2] marker are kept.
# The marker idea comes from
# https://medium.com/e-bot7-tech/matching-the-blanks-78e8063794b5
# but that approach itself is not implemented in this notebook.
output_sentence = []
entities_to_print = []
for line in range(len(ner_sentence)):
    new_line = ' '.join(ner_sentence[line])
    genes_in_sentence = 0
    prducts_in_sentence = 0
    # sentence_token_tagging stores [] for a sentence without entities and
    # [[entity, ...]] otherwise; indexing [0] unconditionally would raise
    # IndexError on the first entity-free sentence.
    entities_in_line = final_entities[line][0] if final_entities[line] else []
    for entity_text in entities_in_line:
        if genes_in_sentence == 0:
            new_line = new_line.replace('GENE',f'[ {entity_text} ]',1).replace('  ',' ').replace(' [ ','[E1]').replace(' ] ','[/E1]')
            genes_in_sentence += 1
        elif prducts_in_sentence == 0:
            new_line = new_line.replace('GENE',f'[ {entity_text} ]',1).replace('  ',' ').replace(' [ ','[E2]').replace(' ] ','[/E2]')
            prducts_in_sentence += 1

        else:
            new_line = new_line.replace('GENE',f' [{entity_text}] ',1).replace('  ',' ')

    if '[/E2]' in new_line and '[/E1]' in new_line:
        output_sentence.append(new_line.replace('  ',' '))



# At this point every kept sentence has two entities delimited by [E1]/[/E1]
# and [E2]/[/E2].
# Example (approximate spacing):
##  [E1]AraC[/E1] seems to slightly repress [E2]arac[/E2] ( i . e . , below our cut - off level of 2 . 5 - fold ) .
input_sentences = []
for e,x in enumerate(output_sentence):
    # NOTE(review): this trims the stored sentence only; `x` keeps its leading
    # space, so the processing below still sees the untrimmed text.
    if x[0] == ' ':
        output_sentence[e] = output_sentence[e][1:]
    temp = ''
    list_rel = []
    n = 0   # number of closing entity markers seen so far
    for word in x.split(' '):
        temp += word + ' '
        # Once both markers were seen, the rest of the sentence is appended
        # to the second part.
        if n == 2:
            list_rel[-1] += word + ' '

        if '[/E1]' in word : # Everything up to the end of the first entity ([/E1]) makes sentence 1:     i.e [AraC]
            list_rel.append(temp)
            if n < 1:
                temp = ''
            n += 1

        if '[/E2]' in word: # Everything after the end of the first entity ([/E1]) makes sentence 2:      i.e [seems to slightly repress arac ( i . e . , below our cut - off level of 2 . 5 - fold ) . ]
            list_rel.append(temp)
            if n < 1:
                temp = ''
            n += 1
    # The parts are joined with a [SEP] token, all markers and brackets are
    # removed, and the text is split again at the separator:
    # [[AraC , seems to slightly repress arac ( i . e . , below our cut - off level of 2 . 5 - fold ) . ]]
    input_sentences.append('[SEP] '.join(list_rel).replace('[/E1]','] ').replace('[/E2]','] ').replace('[E1]',' [').replace('[E2]',' [').replace('  ',' ').replace('[','').replace(']','').replace('  ',' ').split('SEP '))
    # Same text as one string, keeping the entities in square brackets for display.
    entities_to_print.append(' '.join(list_rel).replace('[/E1]','] ').replace('[/E2]','] ').replace('[E1]',' [').replace('[E2]',' [').replace('  ',' ').replace('[ ','[').replace(' ]',']').replace('  ',' '))

In [35]:
# Decode whatever is currently stored in `prediction` into relation tags.
# Only the first len(pre_txt[i]) token rows of every sentence are inspected.
example_tags = []  # per kept sentence: 'O' / 'I-Rel' / 'PAD' for every inspected token
raw_example = []   # per kept sentence: its token list taken from pre_txt
all_tags = []      # per kept sentence: positions of the tokens tagged 'I-Rel'
for i, sentence in enumerate(prediction):
    temp_rel = []
    temp_tag = []
    for n_wor, pred_word in enumerate(sentence[:len(pre_txt[i])]):
        val = list(pred_word)
        # Category 1 wins ties over category 2; anything else is labelled PAD.
        if val[1] == max(val):
            temp_rel.append('O')
        elif val[2] == max(val):
            temp_rel.append('I-Rel')
            temp_tag.append(n_wor)
        else:
            temp_rel.append('PAD')
    # Every inspected token yields exactly one label, so this only rejects
    # sentences whose prediction has fewer rows than pre_txt[i] has tokens.
    if len(pre_txt[i]) == len(temp_rel):
        raw_example.append(pre_txt[i])
        example_tags.append(temp_rel)
        all_tags.append(temp_tag)


# for i,x in enumerate(raw_example[0]):
#         print(x.replace(' ##','').replace('##',''), example_tags[0][i])

# For every sentence with more than one 'I-Rel' token, wrap the span between
# the shifted first and last tagged positions in ** ** for display.
# NOTE(review): the -2 / -1 shifts are not explained anywhere in this file.
# When min(word_index) < 2 the start becomes negative and the slices below
# count from the END of the sentence - confirm this is intended.
predicted_relations = []
for sent_num, word_index in enumerate(all_tags):
    if len(word_index) > 1:
        relationship_end = max(word_index) -1
        relationship_start = min(word_index) -2
        sentence_relation = ' '.join(raw_example[sent_num][relationship_start:relationship_end])
        sentence_start = ' '.join(raw_example[sent_num][:relationship_start]).replace(' ##','').replace('##','')
        sentence_end = ' '.join(raw_example[sent_num][relationship_end:]).replace(' ##','').replace('##','')
        predicted_relations.append(sentence_start + ' **' + sentence_relation.replace('[ ','').replace(' ]','') + '** ' + sentence_end)
    #predicted_relations.append(sentence_relation)

# Print a sample of the highlighted sentences.
for x in predicted_relations[180:200]:
    print(x,'\n')

[NarL ] **and NarP proteins compete for the 44 . 5 binding site ; binding of NarP further induces aeg - 46 . 5 operon expression , whereas binding of NarL has no significant effect on the basal anaerobic** ( [ Fnr ] - dependent ) level of expression . 

[Dna] **A protein appeared to be proportional to the inhibition of RNA polymerase binding to the promoters and the inhibition of transcription from the** promoters . 

[Nag] **C activates the fimB** promoter ] , but two general scenarios seem most plausible . 

[A] **rgP directly regulates lysP transcription , we tested binding of ArgP to the lys** P promoter ] / control [ region ] . 

[DnaA protein] **] and RNA polymerase can coexist at the dnaA promoter by gel - shift and footprinting analyses and whether a direct protein - protein interaction between DnaA protein and RNA polymerase** is the mechanism of inhibition of transcription . 

[yeaR ] **ogt promoters are also repressed by Fis , we suggest that rapidly growing cells may opt ou

In [240]:
# Relation-extraction step. NOTE: this rebinds bert_classifier, tokenizer,
# pre_txt and prediction, which earlier cells used for the NER step.
bert_classifier = tf.keras.models.load_model('REModel/')

tokenizer = bert.tokenization.FullTokenizer(vocab_file="vocabNER.txt",do_lower_case=False)


# Split every entry of input_sentences into its first part (gene1) and its
# second part (gene2); full_sentence is their plain concatenation.
sentence_test = {}
gene1_test = []
gene2_test = []
full_sentence = []

for x in input_sentences:    
    gene1_test.append(x[0])
    gene2_test.append(x[1])
    full_sentence.append(x[0] + x[1])
sentence_test['gene1'] = gene1_test
sentence_test['gene2'] = gene2_test


# Token list of every full sentence (pieces prefixed with "##" are kept here).
pre_txt = []
for indx in range(len(full_sentence)):
    pre_txt.append(' '.join(tokenizer.tokenize(full_sentence[indx])).split(' '))
    
test_text = bert_encode(sentence_test, tokenizer)
prediction = bert_classifier.predict(test_text)

# Decode the scores: per sentence, collect the positions of the tokens whose
# best category is 2 ('I-Rel'). Only the first len(pre_txt[i]) rows are read.
visualize_relations = []  # display string (entities in brackets) of every kept sentence
raw_example = []          # token list of every kept sentence
all_tags = []             # 'I-Rel' positions of every kept sentence
for i, sentence in enumerate(prediction):
    temp_rel = []
    temp_tag = []
    for n_wor, pred_word in enumerate(sentence[:len(pre_txt[i])]):
        val = list(pred_word)
        if val[1] == max(val):
            temp_rel.append('O')
        elif val[2] == max(val):
            temp_rel.append('I-Rel')
            temp_tag.append(n_wor)
        else:
            temp_rel.append('PAD')
    # Rejects only sentences whose prediction has fewer rows than tokens.
    if len(pre_txt[i]) == len(temp_rel):
        raw_example.append(pre_txt[i])
        visualize_relations.append(entities_to_print[i])
        all_tags.append(temp_tag)

# Reduce every position list to one run of consecutive positions. Each run
# with more than one element overwrites the previous choice, so the LAST such
# run wins; when no run qualifies the list is left unchanged.
for index in range(len(all_tags)):
    a = all_tags[index]
    a = consecutive(a)
    for x in a:
        if len(list(x)) > 1:
            all_tags[index] = list(x)
            

# Highlight the selected span with ** ** inside the token sequence.
# NOTE(review): the slice stops before relationship_end, so the last tagged
# token lands in sentence_end rather than inside the ** ** span - confirm.
relation_pairs = []
predicted_relations = []
for sent_num, word_index in enumerate(all_tags):
    if len(word_index) > 1:
        relationship_end = max(word_index)
        relationship_start = min(word_index)
            
        sentence_relation = ' '.join(raw_example[sent_num][relationship_start:relationship_end]).replace(' ##','').replace('##','')
        sentence_start = ' '.join(raw_example[sent_num][:relationship_start]).replace(' ##','').replace('##','')
        sentence_end = ' '.join(raw_example[sent_num][relationship_end:]).replace(' ##','').replace('##','')
        predicted_relations.append(sentence_start + ' **' + sentence_relation + '** ' + sentence_end)
        relation_pairs.append(visualize_relations[sent_num])


# Project the highlighted span onto the bracketed display string
# (relation_pairs) and print: text before, first entity, **relation**,
# second entity, text after. Word counts of the text before / after the
# ** ** span serve as starting positions; each while loop then walks LEFT
# until it reaches a word carrying the wanted bracket.
# NOTE(review): the walks have no lower bound. An empty word (two adjacent
# spaces) or a missing bracket ends in IndexError, and a negative position
# silently indexes from the end of the sentence.
for indx in range(len(predicted_relations)):
    # Last word of the first entity: nearest word ending in ']'.
    relation_start_limit = len(predicted_relations[indx].split('**')[0].split(' '))
    while relation_pairs[indx].split(' ')[relation_start_limit][-1] != ']':
        relation_start_limit -= 1

    # First word of the second entity: nearest word starting with '['.
    relation_end_limit = len(relation_pairs[indx].split(' ')) - len(predicted_relations[indx].split('**')[2].split(' '))
    while relation_pairs[indx].split(' ')[relation_end_limit][0] != '[':
        relation_end_limit -= 1
    
    # First word of the first entity: nearest word starting with '['.
    first_unrelared_entities = len(predicted_relations[indx].split('**')[0].split(' '))
    while relation_pairs[indx].split(' ')[first_unrelared_entities][0] != '[':
        first_unrelared_entities -= 1

    # Last word of the second entity: nearest word ending in ']'.
    second_unrelated_entities = len(relation_pairs[indx].split(' ')) - len(predicted_relations[indx].split('**')[2].split(' '))
    while relation_pairs[indx].split(' ')[second_unrelated_entities][-1] != ']':
        second_unrelated_entities -= 1

    # Multi-word entities are joined with '_' and keep their brackets; all
    # other brackets are stripped from the surrounding text.
    relation = ' '.join(relation_pairs[indx].split(' ')[relation_start_limit + 1:relation_end_limit])
    pre_entities_sentence = ' '.join(relation_pairs[indx].split(' ')[:first_unrelared_entities]).replace('[','').replace(']','')
    first_entity = '_'.join(relation_pairs[indx].split(' ')[first_unrelared_entities:relation_start_limit + 1])
    
    pos_entities_sentence = ' '.join(relation_pairs[indx].split(' ')[second_unrelated_entities + 1:]).replace('[','').replace(']','')
    second_entity = '_'.join(relation_pairs[indx].split(' ')[relation_end_limit:second_unrelated_entities + 1])
    print(pre_entities_sentence + ' ' +  first_entity + ' **' + relation.replace('[','').replace(']','') + '** ' + second_entity + ' ' + pos_entities_sentence,'\n')

The results of the present study extend these observations and show that the [MetR_protein] **also stimulates the in vitro expression of metH and that both the MetE and** [MetH_proteins] synthesized in vitro are enzymatically active .  

Operons observed to be differentially expressed include not only all four of the transcripts , hmpA , ytfE , [ygbA] **, and hcp - hcr , known or predicted to be** [NsrR] - regulated ( 4 , 40 ) , but also other transcripts predicted to be regulated by nitrite or RNS generated from nitrite , for example , nitric oxide ( 11 ) .  

Identification of the DNAbinding domain of the [OmpR_protein] **required for transcriptional activation of the ompF and** [ompC_genes] of Escherichia coli by in vivo DNA footprinting .  

 [MelR] **is a member of the AraC - XylS family of bacterial gene regulatory proteins ( 6 ) and our previous studies have shown that MelR , together with the cyclic AMP receptor protein , CRP , regulates expression of the** [melAB_operon] that encodes products essential for melibiose metabolism ( 7 ) .  

For example , the nitrate reductase genes [narGHJI] **are regulated exclusively by NarL and are not responsive to NarP as observed here for cydD , while nitrate induction of the fdnGHI operon is regulated predominantly by** [NarL] but is also responsive in part to NarP ( 33 , 39 ) .  

Under identical experimental conditions as for [argO] **, the regulatory regions of the other ArgP - regulated genes ( asd , dapB , dapD , gdhA , lysA , lysC , and lysP ) were also bound by ArgP , with apparent K d s ranging from 55 nM to 170 nM ; in all these cases ( unlike the situation with argO ) , the addition of** [Lys] was associated with an increase in the apparent K d , indicating that ArgP binding in these instances is Lys sensitive .  

 [MelR] **- dependent repression of the** [melR_promoter] in each of the new constructs was measured as above and the results are illustrated in Figure 6B .  

Given that the activation by full - length , chromosomally expressed [RhaS] **at rhaBAD is approximately 33 - fold higher than that of chromosomally expressed RhaR at rhaSR , comparable efficiencies of activation by the CTDs to their full - length counterparts would have resulted in His 6 - RhaR -** [CTD] activating rhaSR by approximately 30 - fold .  

This observation not only allows us to understand how a modest mutation in O NC2 can affect [fimB] **expression whereas the D3 mutation does not , but it also supports our prior assertion that NanR activates fimB expression without** [NagC] ( Sohanpal et al . , 2004 ) .  

These latter results , together with the experiments described here showing that [MetJ] **binds to this region of DNA , strongly suggest that the - 8 to + 27 region is involved in** [metE] repression .  

Note that in a control experiment , the activity of the MBP - [NarL] **protein was confirmed by its ability to activate transcription of the NarL - dependent** [fdnG_promoter] in vitro ( data not shown ) .  

Thus , it appears that NarL and [NarP] **adopt overlapping mechanisms to inhibit**  adopt overlapping mechanisms to inhibit ydhY – T expression .  

In relation to pacsP1 , the DNA site for [Fis] **is centred at position – 61 , and its distal nature suggests that** [pacsP1] repression operates via a different mechanism .  

Our results indicate that yggA encodes an [ArgP] **- regulated** [Arg_exporter] in E . coli .  

Organization and regulation of the [D_-_xylose_operons] **in Escherichia coli K - 12 :** [XylR] acts as a transcriptional activator .  

Some of these operons are regulated by the [NarL_protein] **alone , such as the narG and frdA operons , whereas expression of the nirB ( encoding NADH - dependent nitrite reductase ) , nrfA and fdnG operons is controlled by both the** [NarL] and NarP proteins .  

The [ArgP_protein] **enhances the expression of the argK gene To test the biological activity of the** [ArgP_protein] on the arginine transport system , the amounts of mRNA synthesized by a 502 bp KpnI - EcoRV DNA fragment containing the N terminus and the control region of the argK gene was investigated .  

However , the inactivation of DcuS ( [IMW553] **) led to a decrease of citC - lacZ expression by a factor of 1 . 5 , which could be due to interaction between CitA and** [DcuS] .  

RESULTS Oligomerization of DnaA Protein to the [dnaA_Promoter] **— Sequence - specific DNA binding of DnaA protein to the** [dnaA_promoter_region] ( Fig . 1 ) was studied in detail using a combined gel - shift and chemical footprinting assay to determine the extent of DNA binding to the region flanking the consensus DNA - binding site , the DnaA box .  
