In [None]:
!pip install transformers
!pip install torch torchvision torchaudio

In [2]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from xml.dom import minidom

from transformers import BertModel, BertTokenizer
import torch
import numpy as np

#Timing
from IPython.display import clear_output
import timeit

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#Function to parse xml
def parseXML(filename, isInclude):
    """Collect the abstract texts from an XML file and label each of them.

    Every ``<abstract>`` element in the document is inspected. Its text is
    taken from the text nodes directly inside its ``<style>`` descendants,
    concatenated in the order they are found, so an abstract that is split
    into several styled runs is kept whole instead of being cut off after
    the first run. Abstracts that carry no such text (no ``<style>`` element
    or an empty one) are skipped instead of raising.

    Args:
        filename: Path (or file object) accepted by ``minidom.parse``.
        isInclude: Label stored once for every abstract that is returned.

    Returns:
        A tuple ``(abstracts, tags)`` of two lists of equal length:
        the abstract strings and the ``isInclude`` label repeated.
    """
    abstracts = []
    tags = []
    xmldoc = minidom.parse(filename)
    text_node_types = (minidom.Node.TEXT_NODE, minidom.Node.CDATA_SECTION_NODE)

    for node in xmldoc.getElementsByTagName('abstract'):
        # Gather the text of every styled run, not just the first one.
        pieces = [
            child.data
            for style in node.getElementsByTagName('style')
            for child in style.childNodes
            if child.nodeType in text_node_types
        ]
        abstract = ''.join(pieces)
        if not abstract:
            # Nothing to embed for this record; keep abstracts/tags aligned.
            continue
        abstracts.append(abstract)
        tags.append(isInclude)
    return abstracts, tags

In [4]:
# Function to get document embedding
def get_embedding(model, tokenizer, text):
    """Return a single vector that represents ``text``.

    The text is encoded with ``tokenizer.encode_plus`` (special tokens added,
    truncated to at most 512 tokens, PyTorch tensors returned), passed through
    ``model`` in evaluation mode without gradient tracking, and the token
    vectors of the second-to-last entry of the model's third output are
    averaged.

    Args:
        model: Callable model whose third output (``outputs[2]``) is a
            sequence of per-layer hidden states -- presumably a BERT model
            loaded with ``output_hidden_states=True``; verify against caller.
        tokenizer: Object providing ``encode_plus`` that returns a mapping
            with an ``'input_ids'`` entry.
        text: The string to embed.

    Returns:
        numpy array holding the mean over the first axis of
        ``hidden_states[-2][0]`` (assumed to be the token axis -- TODO confirm).
    """
    # Tokenise once; only the ids are fed to the model.
    encode_options = dict(
        truncation=True,
        max_length=512,
        add_special_tokens=True,
        return_tensors='pt',
    )
    input_ids = tokenizer.encode_plus(text, **encode_options)['input_ids']

    # Evaluation mode + no_grad: pure inference, no autograd bookkeeping.
    model.eval()
    with torch.no_grad():
        hidden_states = model(input_ids)[2]

    # Second-to-last hidden state, first (only) item of the batch.
    token_vecs = hidden_states[-2][0]
    return token_vecs.mean(dim=0).detach().numpy()

In [5]:
# Load SciBERT: the model (asked to also return its hidden states) and the
# tokenizer that belongs to the same checkpoint.
SCIBERT_CHECKPOINT = "allenai/scibert_scivocab_uncased"
scibert_model = BertModel.from_pretrained(SCIBERT_CHECKPOINT, output_hidden_states=True)
scibert_tokenizer = BertTokenizer.from_pretrained(SCIBERT_CHECKPOINT)

# Report which classes were actually instantiated.
print('scibert_tokenizer is type:', type(scibert_tokenizer))
print('    scibert_model is type:', type(scibert_model))


def calculate_embeddings(name):
    """Embed every abstract of one dataset with SciBERT and pickle the result.

    Reads ``<name>/<name>Include.xml`` (labelled 1) and
    ``<name>/<name>Exclude.xml`` (labelled 0) through ``parseXML``, computes
    one embedding per abstract with ``get_embedding`` using the module-level
    ``scibert_model`` and ``scibert_tokenizer``, and writes a DataFrame with
    the columns ``code``, ``abstract`` and ``scibert`` to
    ``./<name>/<name>Train.pkl``.

    While running, the cell output is cleared and re-printed for every
    abstract: first ``<done> <total>``, then the projected total runtime in
    minutes (``"Calculating..."`` while less than 1% of the work is done).

    Args:
        name: Dataset name; used as directory name and as file-name prefix.

    Returns:
        None. The DataFrame is only written to disk.
    """
    # Parse XML
    abstractsInclude, tagsInclude = parseXML(name + '/' + name + 'Include.xml', 1)
    abstractsExclude, tagsExclude = parseXML(name + '/' + name + 'Exclude.xml', 0)
    df = pd.DataFrame(
        list(zip(tagsInclude + tagsExclude, abstractsInclude + abstractsExclude)),
        columns=['code', 'abstract'],
    )

    # Materialise the abstracts once; used for both the total and the loop.
    sentences = df['abstract'].tolist()
    length = len(sentences)
    embeddings = []

    start = timeit.default_timer()
    for index, sentence in enumerate(sentences, start=1):
        clear_output(wait=True)
        embeddings.append(get_embedding(scibert_model, scibert_tokenizer, sentence))

        fraction_done = index / length
        if fraction_done * 100 < 1:
            # Too little progress for a meaningful extrapolation.
            expected_time = "Calculating..."
        else:
            # Extrapolate elapsed time to the full workload, in minutes.
            elapsed = timeit.default_timer() - start
            expected_time = np.round(elapsed / fraction_done / 60, 2)

        print(index, length)
        print(expected_time)

    # Append to dataframe
    df['scibert'] = embeddings
    # Save dataframe to prevent recalculation
    df.to_pickle("./" + name + "/" + name + "Train.pkl")

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


scibert_tokenizer is type: <class 'transformers.models.bert.tokenization_bert.BertTokenizer'>
    scibert_model is type: <class 'transformers.models.bert.modeling_bert.BertModel'>


In [12]:
# Embed the "cellulitis" dataset; the result is pickled to ./cellulitis/cellulitisTrain.pkl
calculate_embeddings("cellulitis")

602 602
2.98


In [13]:
# Embed the "overdiagnosis" dataset; the result is pickled to ./overdiagnosis/overdiagnosisTrain.pkl
calculate_embeddings("overdiagnosis")

1972 1972
9.36


In [6]:
# Embed the "copper" dataset; the result is pickled to ./copper/copperTrain.pkl
calculate_embeddings("copper")

348 348
1.62
