# Using existing BERT NER model for tokenization

In [33]:
from transformers import AutoConfig, AutoTokenizer,TFBertTokenizer, BertTokenizer,BertForTokenClassification #pytorch package
import torch

In [34]:
#folder path
model_path = "/Users/u6022257/Documents/Transformers/models/biobert_genetic_ner/"
tokenizer_config = model_path+"tokenizer_config.json"
model_config = model_path+"config.json"
vocab_file = model_path+"vocab.txt"

In [35]:
config_model = AutoConfig.from_pretrained(model_config)
#tokenizer = AutoTokenizer.from_pretrained(model_path,use_fast=True)#AutoTokenizer does not work for bert genetic detection

#tokenizer = BertTokenizer(vocab_file,do_lower_case = True)
Model = BertForTokenClassification.from_pretrained(model_path, local_files_only=True)


## label dictionary

In [36]:
id2label = {
    "0": "B-GENETIC",
    "1": "I-GENETIC",
    "2": "O"
  }
label2id = {
    "B-GENETIC": 0,
    "I-GENETIC": 1,
    "O": 2
  }

In [37]:
sampleText = "This patient is heterozygous in the PPP2R5D gene for a de novo variant designated c.598G>A, which is predicted to result in the amino acid substitution p.Glu200Lys. This is a recurrent de novo variant that has been reported to be causative for autosomal dominant intellectual disability (Table 1, Houge et al. 2015. PubMed ID: 26168268; Table 1, Loveday et al. 2015. PubMed ID: 25972378; Supp. Tables 6 and 10, Lelieveld et al. 2017. PubMed ID: 28867141; Table 1, Reijnders et al. 2017. PubMed ID: 29051493). In summary, we interpret this variant as pathogenic. Pathogenic variants in PPP2R5D are associated with autosomal dominant intellectual disability 35 (OMIM: #616355). Clinical features include increased height (variable), macrocephaly (variable), absent speech, dysmorphic facial features, delayed psychomotor development, and seizures. Variants are reported to occur de novo. Clinical correlation is recommended."

In [70]:
encode_dic = tokenizer.encode_plus(
                    sampleText,                      # input text to tokenize
                    return_tensors="pt",
                    truncation=True,
                    max_length=512,            # maximum sequence length
                    pad_to_max_length=True,    # pad shorter sequences
                    return_offsets_mapping=True,  # return offsets
)
inputs = tokenizer.encode(
    sampleText,
    return_tensors="pt",
    max_length=512,
    truncation=True,
    pad_to_max_length=True    # pad shorter sequences
)

#inputs = tokenizer.encode_plus(
#                    sampleText,                      # input text to tokenize
#                    return_tensors="pt",
#                    truncation=True,
#                    max_length=512,            # maximum sequence length
#                    pad_to_max_length=True    # pad shorter sequences
#                    #return_offsets_mapping=True,  # return offsets
#)
tokens = tokenizer.tokenize(
    tokenizer.decode(
        tokenizer.encode(
            sampleText,
            #return_tensors="pt",
            max_length=512,
            truncation=True,
            pad_to_max_length=True
        )
    ))


#tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sampleText, max_length=512,truncation=True)))
#inputs = tokenizer.encode(sampleText,return_tensors="pt",max_length=512,truncation=True)

# encode text using the tokenizer
#encoded_dict = tokenizer.encode_plus(
#                    sampleText,                      # input text to tokenize
#                    add_special_tokens=True,   # add special tokens for BERT
#                    max_length=512,            # maximum sequence length
#                    pad_to_max_length=True,    # pad shorter sequences
#                    return_offsets_mapping=True,  # return offsets
#                    return_attention_mask=False,  # don't return attention mask
#                    return_token_type_ids=False    # don't return token type IDs
#               )

# extract the offsets from the encoded dictionary
offsets = encode_dic['offset_mapping']
#print(offsets.tolist())
offsetsList = offsets.tolist()
print(len(offsetsList[0]),len(tokens))
print(tokenizer.is_fast)
for i in range(len(tokens)):
    print(tokens[i],offsetsList[0][i])
## print the original text and the offsets for each token
#print(f"Original Text: {sampleText}\n")
#for i, token in enumerate(tokenizer.convert_ids_to_tokens(inputs['input_ids'])):
#    print(f"Token: {token}, Start Offset: {offsets[i][0]}, End Offset: {offsets[i][1]}")

##print(len(tokens))
##print(tokens)
#print(input)
#tokenizer.is_fast
#tokens

512 512
True
[CLS] [0, 0]
This [0, 4]
patient [5, 12]
is [13, 15]
he [16, 18]
##tero [18, 22]
##zy [22, 24]
##go [24, 26]
##us [26, 28]
in [29, 31]
the [32, 35]
PP [36, 38]
##P [38, 39]
##2 [39, 40]
##R [40, 41]
##5 [41, 42]
##D [42, 43]
gene [44, 48]
for [49, 52]
a [53, 54]
de [55, 57]
no [58, 60]
##vo [60, 62]
variant [63, 70]
designated [71, 81]
c [82, 83]
. [83, 84]
59 [84, 86]
##8 [86, 87]
##G [87, 88]
> [88, 89]
A [89, 90]
, [90, 91]
which [92, 97]
is [98, 100]
predicted [101, 110]
to [111, 113]
result [114, 120]
in [121, 123]
the [124, 127]
amino [128, 133]
acid [134, 138]
substitution [139, 151]
p [152, 153]
. [153, 154]
G [154, 155]
##lu [155, 157]
##20 [157, 159]
##0 [159, 160]
##L [160, 161]
##ys [161, 163]
. [163, 164]
This [165, 169]
is [170, 172]
a [173, 174]
re [175, 177]
##current [177, 184]
de [185, 187]
no [188, 190]
##vo [190, 192]
variant [193, 200]
that [201, 205]
has [206, 209]
been [210, 214]
reported [215, 223]
to [224, 226]
be [227, 229]
ca [230, 232]
##usa [23

In [71]:
outputs = Model(inputs)[0]
predictions = torch.argmax(outputs,dim=2)
print(predictions[0].tolist())
print(len(tokens),len(predictions[0].tolist()))
labels = predictions[0].tolist()
tOut = []
lOut = []
for i in range(len(predictions[0].tolist())):
    lOut.append(id2label[str(labels[i])])
    tOut.append(str(tokens[i]))
    print(str(tokens[i]), id2label[str(labels[i])])


[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [25]:
import re
specialTag = ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

# convert tokens back to string
clean_text = tokenizer.convert_tokens_to_string(tokens)

# remove unwanted characters
#clean_text = re.sub(r'[^\w\s]', '', clean_text)
#clean_text = re.sub(r'[^\w\s]', '', clean_text)

# normalize text
clean_text = clean_text.lower()

## remove numbers
#clean_text = re.sub(r'\d+', '', clean_text)

# remove excess whitespace
clean_text = re.sub(r'\s+', ' ', clean_text).strip()
clean_text

'cls this patient is heterozygous in the ppp2r5d gene for a de novo variant designated c 598g a which is predicted to result in the amino acid substitution p glu200lys this is a recurrent de novo variant that has been reported to be causative for autosomal dominant intellectual disability table 1 houge et al 2015 pubmed id 26168268 table 1 loveday et al 2015 pubmed id 25972378 supp tables 6 and 10 lelieveld et al 2017 pubmed id 28867141 table 1 reijnders et al 2017 pubmed id 29051493 in summary we interpret this variant as pathogenic pathogenic variants in ppp2r5d are associated with autosomal dominant intellectual disability 35 omim 616355 clinical features include increased height variable macrocephaly variable absent speech dysmorphic facial features delayed psychomotor development and seizures variants are reported to occur de novo clinical correlation is recommended sep'

# spacy does not align the tokens

In [8]:
import spacy

In [9]:
nlp = spacy.blank("en") # empty English pipeline


In [11]:
config = {
    "model": {
        #"@architectures": "BertForTokenClassification",
        "name": "/Users/u6022257/Documents/Transformers/models/biobert_genetic_ner/" # XXX customize this bit
    }
}
nlp.add_pipe("transformer", config=config)
nlp.initialize() # XXX don't forget this step!

Some weights of the model checkpoint at /Users/u6022257/Documents/Transformers/models/biobert_genetic_ner/ were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at /Users/u6022257/Documents/Transformers/models/biobert_genetic_ner/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<thinc.optimizers.Optimizer at 0x7fba47854cc0>

In [12]:
doc = nlp("He is detected with CHEK2.")
print(doc._.trf_data)

TransformerData(wordpieces=WordpieceBatch(strings=[['[CLS]', 'He', 'is', 'detected', 'with', 'CH', '##E', '##K', '##2', '.', '[SEP]']], input_ids=array([[  101,  1124,  1110, 11168,  1114, 24890,  2036,  2428,  1477,
          119,   102]], dtype=int32), attention_mask=array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], dtype=float32), lengths=[11], token_type_ids=array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)), model_output=ModelOutput([('last_hidden_state', array([[[-0.15534765, -1.3265778 ,  0.8279202 , ..., -0.09503871,
         -0.40406936, -0.38022342],
        [-0.5819348 , -1.1693225 ,  0.7354249 , ...,  0.40297902,
         -0.4020602 , -0.3657639 ],
        [-0.34660724, -1.248121  ,  0.7437485 , ...,  0.28507286,
         -0.59440875, -0.3216462 ],
        ...,
        [-0.24346054, -0.7579181 ,  0.59318686, ..., -0.4000535 ,
          0.31812215, -0.547424  ],
        [-0.07892587, -1.0033358 ,  0.65443486, ..., -0.07997692,
         -0.12574786, -0.31520274],
  

# spacy-transformers

In [14]:
import spacy_transformers