In [1]:
from model import ner_bio
import torch
import pandas as pd
import torch.nn as nn
from pytorch_pretrained_bert import BertModel,BertConfig, BertForPreTraining
import tensorflow as tf
import re
import torch
import numpy as np
from tqdm import tqdm, trange
import os
import csv
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import torch.nn as nn
# Presumably a maximum sequence length; no cell visible in this notebook reads it.
MAX_LEN = 75
# Prefer the GPU when CUDA is available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Tokenizer built from the BioBERT vocabulary file; input text is not lower-cased.
tokenizer = BertTokenizer(
    vocab_file='biobert_v1.0_pubmed_pmc/vocab.txt',
    do_lower_case=False,
)

Using TensorFlow backend.


In [2]:
def sentence_retriver(path):
    """Read a tab-separated token/tag file and group its rows into sentences.

    Every non-empty row contributes its first column as a token and its
    second column as that token's tag. An empty row closes the sentence that
    is currently being collected. A sentence still open when the file ends
    (i.e. the file has no trailing blank row) is returned as well instead of
    being dropped.

    Args:
        path: Location of the TSV file to read.

    Returns:
        A ``(sentences, tags)`` pair of parallel lists: ``sentences[i]`` is the
        list of tokens of the i-th sentence and ``tags[i]`` the matching tags.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    sentences = []
    tags = []
    sent = []
    tag = []

    # newline='' hands line-ending handling to the csv module, as its docs recommend.
    with open(path, newline='') as tsvfile:
        for row in csv.reader(tsvfile, delimiter='\t'):
            if len(row) == 0:
                # Blank row: sentence boundary.
                sentences.append(sent)
                tags.append(tag)
                sent = []
                tag = []
            else:
                # Token and tag are always appended together, so the two
                # lists cannot drift apart in length.
                sent.append(row[0])
                tag.append(row[1])

    # Flush the final sentence when the file does not end with a blank row.
    if sent:
        sentences.append(sent)
        tags.append(tag)

    return sentences, tags

def tokenize_and_preserve_labels(sentence, text_labels):
    """Split each word into tokenizer pieces and repeat its label per piece.

    Uses the notebook-level ``tokenizer``. Words and labels are paired with
    ``zip``, so iteration stops at the shorter of the two inputs.

    Args:
        sentence: Iterable of words.
        text_labels: Iterable of labels, one per word.

    Returns:
        A ``(pieces, piece_labels)`` tuple of two equally long lists.
    """
    pieces = []
    piece_labels = []

    for word, label in zip(sentence, text_labels):
        word_pieces = tokenizer.tokenize(word)
        pieces += word_pieces
        # One copy of the word's label for every piece it was split into.
        piece_labels += [label for _ in word_pieces]

    return pieces, piece_labels


In [3]:
# Read the tag table; its 'tags' column holds the label values that predicted
# indices are mapped back to in the inference cell.
data = pd.read_csv('tags_small.csv')
tag_values = data['tags'].values
# Number of entries in the tag list; passed to ner_bio as its first argument.
vocab_len = len(tag_values)

In [34]:
# Load the BERT configuration from the BioBERT directory.
config = BertConfig.from_json_file('biobert_v1.0_pubmed_pmc/bert_config.json')
# ner_bio comes from the project's `model` module; presumably state_dict=None
# skips loading pretrained BERT weights at construction time — TODO confirm.
model = ner_bio(vocab_len,config,state_dict=None)
# Restore the saved weights, remapping stored tensors onto `device`. As the
# last expression of the cell, the result of load_state_dict is displayed.
model.load_state_dict(torch.load('app/BIONER_classifier_small.pt', map_location=device))

<All keys matched successfully>

In [35]:
# Place the model on the device chosen in the setup cell instead of calling
# .cuda() unconditionally, so the notebook also runs on CPU-only machines.
model.to(device)
# Switch to evaluation mode; eval() returns the module, so the cell displays it.
model.eval()

ner_bio(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )

In [4]:
# Sample passage used as input for the sentence-splitting and tagging cells.
text = """SmpB regulates histidine biosynthesis in AMM - 1 , affecting HisDGCBHAF. Nuclear export of S6K1 II is regulated by protein kinase CK2 phosphorylation at Ser - 17.
RopB mutation in this particular strain does not result in the kind of global phenotypic change found with covR / S mutation where differential expression was observed of multiple genes encoding virulence determinants , including sic , ska , slo , speA , speJ , scpC and the hyaluronic acid synthesis operon [ 37 ] which were all unaffected in the ropB mutant examined in this work .
SmpB regulates histidine biosynthesis in AMM - 1 , affecting HisDGCBHAF."""

In [5]:
import nltk
# Split the raw passage into a list of sentence strings.
# NOTE(review): sent_tokenize presumably needs NLTK's Punkt data installed — confirm in a fresh environment.
sent_text = nltk.sent_tokenize(text)

In [6]:
# Display the list of sentences produced by nltk.sent_tokenize.
sent_text

['SmpB regulates histidine biosynthesis in AMM - 1 , affecting HisDGCBHAF.',
 'Nuclear export of S6K1 II is regulated by protein kinase CK2 phosphorylation at Ser - 17.',
 'RopB mutation in this particular strain does not result in the kind of global phenotypic change found with covR / S mutation where differential expression was observed of multiple genes encoding virulence determinants , including sic , ska , slo , speA , speJ , scpC and the hyaluronic acid synthesis operon [ 37 ] which were all unaffected in the ropB mutant examined in this work .',
 'SmpB regulates histidine biosynthesis in AMM - 1 , affecting HisDGCBHAF .']

In [8]:
# Word-level tokenization: one list of word tokens per sentence.
tokenized_text = [nltk.word_tokenize(sentence) for sentence in sent_text]

In [11]:
# Print the per-sentence word lists to check the word tokenization.
print(tokenized_text)

[['SmpB', 'regulates', 'histidine', 'biosynthesis', 'in', 'AMM', '-', '1', ',', 'affecting', 'HisDGCBHAF', '.'], ['Nuclear', 'export', 'of', 'S6K1', 'II', 'is', 'regulated', 'by', 'protein', 'kinase', 'CK2', 'phosphorylation', 'at', 'Ser', '-', '17', '.'], ['RopB', 'mutation', 'in', 'this', 'particular', 'strain', 'does', 'not', 'result', 'in', 'the', 'kind', 'of', 'global', 'phenotypic', 'change', 'found', 'with', 'covR', '/', 'S', 'mutation', 'where', 'differential', 'expression', 'was', 'observed', 'of', 'multiple', 'genes', 'encoding', 'virulence', 'determinants', ',', 'including', 'sic', ',', 'ska', ',', 'slo', ',', 'speA', ',', 'speJ', ',', 'scpC', 'and', 'the', 'hyaluronic', 'acid', 'synthesis', 'operon', '[', '37', ']', 'which', 'were', 'all', 'unaffected', 'in', 'the', 'ropB', 'mutant', 'examined', 'in', 'this', 'work', '.'], ['SmpB', 'regulates', 'histidine', 'biosynthesis', 'in', 'AMM', '-', '1', ',', 'affecting', 'HisDGCBHAF', '.']]


In [12]:
def tokenize_and_preserve(sentence):
    """Tokenize every word of a sentence and concatenate the resulting pieces.

    Uses the notebook-level ``tokenizer``.

    Args:
        sentence: Iterable of words.

    Returns:
        A flat list with the tokenizer pieces of all words, in word order.
    """
    return [
        piece
        for word in sentence
        for piece in tokenizer.tokenize(word)
    ]

In [13]:
# Tokenizer pieces for every sentence, one flat list per sentence.
tokenized_texts_ = list(map(tokenize_and_preserve, tokenized_text))

In [26]:
# Vocabulary ids for each sentence's pieces.
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts_))
# Sequences are not padded, so each attention mask is all ones.
input_attentions = [[1 for _ in ids] for ids in input_ids]

In [15]:
# Scratch check on the second sentence: glue "##" continuation pieces back
# onto the preceding token to recover whole words.
tokens = tokenizer.convert_ids_to_tokens(input_ids[1])
new_tokens, new_labels = [], []
for token in tokens:
    if not token.startswith("##"):
        new_tokens.append(token)
        continue
    # Continuation piece: drop the "##" marker and extend the previous word.
    new_tokens[-1] += token[2:]

In [30]:
# Number of tokenizer pieces in the first sentence.
print(len(tokenized_texts_[0]))

26


In [27]:
# Sanity check: every id sequence should be as long as its attention mask.
for x,y in zip(input_ids,input_attentions):
    print(len(x),len(y))

26 26
26 26
97 97
26 26


In [41]:
# Tag every sentence with the model and rebuild word-level tokens and labels.
# NOTE(review): sequences are fed without [CLS]/[SEP] markers and without any
# MAX_LEN truncation — confirm this matches how the model was trained.
actual_sentences = []
pred_labels = []
for ids, mask in zip(input_ids, input_attentions):
    # Single-sentence batches of shape (1, seq_len), created on `device`
    # rather than via .cuda() so the cell also works without a GPU.
    x = torch.tensor(ids, device=device).unsqueeze(0)
    y = torch.tensor(mask, device=device).unsqueeze(0)
    with torch.no_grad():
        # assumes the second output holds predicted tag indices with a
        # leading batch dimension — TODO confirm against model.ner_bio
        _, y_hat = model(x, y)
    label_indices = y_hat.to('cpu').numpy()

    # The ids are already plain Python ints; no device/numpy round trip needed.
    tokens = tokenizer.convert_ids_to_tokens(ids)
    new_tokens, new_labels = [], []
    for token, label_idx in zip(tokens, label_indices[0]):
        if token.startswith("##") and new_tokens:
            # Continuation piece: merge into the previous word; the word
            # keeps the label predicted for its first piece.
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_labels.append(tag_values[label_idx])
            new_tokens.append(token)
    actual_sentences.append(new_tokens)
    pred_labels.append(new_labels)

In [44]:
# Print one "word<TAB>label" line per reconstructed word, sentence after sentence.
for sent_tokens, sent_labels in zip(actual_sentences, pred_labels):
    for tok, lab in zip(sent_tokens, sent_labels):
        print("{}\t{}".format(tok, lab))

SmpB	S-Protein
regulates	O
histidine	O
biosynthesis	O
in	O
AMM	O
-	O
1	O
,	O
affecting	O
HisDGCBHAF	O
.	O
Nuclear	O
export	O
of	O
S6K1	O
II	O
is	O
regulated	O
by	O
protein	O
kinase	O
CK2	O
phosphorylation	O
at	O
Ser	O
-	O
17	O
.	O
RopB	B-Protein
mutation	O
in	O
this	O
particular	O
strain	O
does	O
not	O
result	O
in	O
the	O
kind	O
of	O
global	O
phenotypic	O
change	O
found	O
with	O
covR	O
/	O
S	O
mutation	O
where	O
differential	O
expression	O
was	O
observed	O
of	O
multiple	O
genes	O
encoding	O
virulence	O
determinants	O
,	O
including	O
sic	O
,	O
ska	O
,	O
slo	O
,	O
speA	B-Protein
,	O
speJ	B-Protein
,	O
scpC	B-Protein
and	O
the	O
hyaluronic	O
acid	O
synthesis	O
operon	O
[	O
37	O
]	O
which	O
were	O
all	O
unaffected	O
in	O
the	O
ropB	B-Protein
mutant	O
examined	O
in	O
this	O
work	O
.	O
SmpB	S-Protein
regulates	O
histidine	O
biosynthesis	O
in	O
AMM	O
-	O
1	O
,	O
affecting	O
HisDGCBHAF	O
.	O
