In [1]:
import pysolr, os

# Solr core holding the MeSH descriptors; the host name comes from the
# SOLR_HOST environment variable so no machine name is hard-coded here.
MESH_URL = f"http://{os.environ['SOLR_HOST']}:8983/solr/mesh3"

# always_commit=False: this notebook only reads from the core.
mesh_solr = pysolr.Solr(MESH_URL, always_commit=False)

# Left as the last expression so the cell displays the ping response.
mesh_solr.ping()

'{\n  "responseHeader":{\n    "zkConnected":null,\n    "status":0,\n    "QTime":160,\n    "params":{\n      "q":"{!lucene}*:*",\n      "distrib":"false",\n      "df":"_text_",\n      "rows":"10",\n      "echoParams":"all"}},\n  "status":"OK"}\n'

In [2]:
from transformers import BertTokenizer, BertModel

# Name of the pretrained checkpoint shared by the tokenizer and the model.
BERT_MODEL = 'bert-base-cased'

# Cased checkpoint, so lower-casing is explicitly disabled.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=False)

# output_hidden_states=True makes the model return the hidden states of every
# layer; get_hidden_states() reads them from the model output.
model = BertModel.from_pretrained(BERT_MODEL, output_hidden_states=True)

# Switch to evaluation mode. Left as the last expression so the cell displays
# the model summary.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [3]:
import torch

def get_hidden_states(marked_text):
    """Run BERT on `marked_text` and return the hidden states of every layer.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        marked_text: Input string. Callers in this notebook pass text that is
            already wrapped in "[CLS] ... [SEP]" markers.

    Returns:
        The third element of the model output (`outputs[2]`). The model is
        loaded with `output_hidden_states=True`, so this holds the per-layer
        hidden states.
    """
    tokens = tokenizer.tokenize(marked_text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # One id per token, all set to 1.
    segment_ids = [1 for _ in tokens]

    # The outer list adds the batch dimension (a batch of one sequence).
    ids_batch = torch.tensor([token_ids])
    segments_batch = torch.tensor([segment_ids])

    # NOTE(review): in Hugging Face `transformers`, the second positional
    # parameter of BertModel's forward pass is `attention_mask`, not
    # `token_type_ids` -- confirm whether segment ids were really intended here.
    # The positional call is kept as-is so results do not change.
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(ids_batch, segments_batch)

    return outputs[2]

In [None]:
def get_hidden_states_refactored(marked_text):
    """Return the per-layer BERT hidden states for `marked_text`.

    This function used to be a line-for-line copy of `get_hidden_states`.
    It now delegates to that function so there is a single implementation to
    maintain and the two can never drift apart.

    Args:
        marked_text: Input string, already wrapped in "[CLS] ... [SEP]".

    Returns:
        Whatever `get_hidden_states(marked_text)` returns.
    """
    return get_hidden_states(marked_text)

In [4]:
def sum_four_layers(token_embeddings):
    """Sum the last four layer vectors of every token.

    Args:
        token_embeddings: Tensor iterated token by token; each item is indexed
            as [layer, feature] (e.g. [22 x 13 x 768] for a 22-token input
            with 13 hidden-state layers).

    Returns:
        A Python list with one tensor per token, each being the element-wise
        sum of that token's last four layer vectors.
    """
    # `layers[-4:]` selects the last four layers of one token; summing over
    # dim 0 collapses them into a single vector for that token.
    return [torch.sum(layers[-4:], dim=0) for layers in token_embeddings]

In [5]:
def avg_twelve_layers(hidden_states):
    """Build a sentence vector by averaging the token vectors of one layer.

    Despite its name, this function does not average over layers: it takes
    the second-to-last entry of `hidden_states`, selects the first batch item
    and averages over the tokens.

    Args:
        hidden_states: Sequence of per-layer tensors indexed as
            [batch, token, feature] (e.g. 13 tensors of [1 x 22 x 768]).

    Returns:
        A 1-D tensor: the mean of all token vectors in that layer.

    Side effects:
        Prints the shape of the selected token matrix.
    """
    second_to_last_layer = hidden_states[-2]

    # First (and, in this notebook, only) item of the batch.
    token_vecs = second_to_last_layer[0]

    # Mean over the token axis.
    sentence_embedding = token_vecs.mean(dim=0)

    n_tokens = len(token_vecs)
    n_features = len(token_vecs[0])
    print ('token_vecs Shape is: %d x %d' % (n_tokens, n_features))

    return sentence_embedding

In [6]:
def get_embeddings(text):
    """Return the hidden states of `text`, arranged token by token.

    Args:
        text: Raw text without special markers; "[CLS]" and "[SEP]" are added
            here before the text is passed to BERT.

    Returns:
        A tensor obtained by stacking all per-layer hidden states, dropping
        the batch dimension and swapping the first two axes, so that it is
        indexed as [token, layer, feature].
    """
    marked_text = "[CLS] " + text + " [SEP]"

    # get_hidden_states() takes only the marked text; it uses the notebook's
    # global `model` and `tokenizer`. Passing them as extra arguments raised
    # a TypeError.
    hidden_states = get_hidden_states(marked_text)

    # Stack the per-layer tensors -> [layer, batch, token, feature].
    token_embeddings = torch.stack(hidden_states, dim=0)
    # Drop the batch dimension (batch size is 1) -> [layer, token, feature].
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # Put tokens first -> [token, layer, feature].
    token_embeddings = token_embeddings.permute(1, 0, 2)

    return token_embeddings

In [31]:
def get_embeddings_for_double(marked_text):
    """Return the hidden states of an already marked-up text, token by token.

    Unlike `get_embeddings`, no "[CLS]"/"[SEP]" markers are added here; the
    caller supplies the fully marked text.

    Args:
        marked_text: Input string including its special-token markers.

    Returns:
        A tensor built by stacking the per-layer hidden states, removing
        dimension 1 and swapping the first two axes, so that it is indexed as
        [token, layer, feature].

    Side effects:
        Prints the number of hidden-state entries returned by the model.
    """
    layer_states = get_hidden_states(marked_text)
    print(len(layer_states))

    stacked = torch.stack(layer_states, dim=0)
    return stacked.squeeze(dim=1).permute(1, 0, 2)

## Get all MeSH concepts

In [7]:
# Page through the whole MeSH core and collect, for every descriptor, its id
# and the list of its entry terms.
i = 0            # index of the next page to request
nrows = 1000     # page size; used for both `rows` and the `start` offset
numFound = 0     # total number of documents, reported by Solr on each response

concepts = []

while True:
    # `rows` must match the stride used for `start`; otherwise changing
    # `nrows` would silently skip or duplicate documents.
    results = mesh_solr.search('*:*', rows=nrows, start=i * nrows)

    numFound = results.raw_response['response']['numFound']
    i = i + 1

    for result in results:
        concepts.append({
            'concept_id': result['DescriptorUI'],
            'entry_terms': list(result['EntryTerm']),
        })

    # Stop as soon as the pages requested so far cover every document. This
    # avoids the extra empty request the old `<=` condition issued when
    # numFound was an exact multiple of the page size.
    if i * nrows >= numFound:
        break

# print(len(concepts))
# print(concepts)

In [8]:
# Concepts that have at least one entry term BERT keeps as a single token.
bert_concepts = []

for concept in concepts:
    for term in concept['entry_terms']:
        tokenized_text = tokenizer.tokenize(term)

        if len(tokenized_text) == 1:
            # Store a copy instead of writing 'bert_token' into the shared
            # dict from `concepts`. Other cells attach their own 'bert_token'
            # to those same dicts, which would silently overwrite the value
            # recorded here depending on cell execution order.
            bert_concepts.append({**concept, 'bert_token': tokenized_text})

            # Keep only the first matching term. Useful in cases like
            # 'institute' and 'academy', which are entry terms of the same
            # concept.
            break

In [None]:
# One label per line: the single BERT token chosen for each concept, in the
# same order as `bert_concepts`.
with open("data/embeddings/label.tsv", "w") as label_file:
    for c in bert_concepts:
        label_file.write(f"{c['bert_token'][0]}\n")

In [None]:
def _to_tsv_row(vector):
    """Serialize a 1-D tensor as one TSV row.

    Every value is followed by a tab, including the last one, so the row
    format is exactly what this cell has always produced.
    """
    return ''.join(f'{value}\t' for value in vector.tolist())


# One TSV row per concept, for three different ways of deriving an embedding.
embeddings_last = []
embeddings_four_last = []
embeddings_avg = []

for concept in bert_concepts:
    marked_text = "[CLS] " + concept['bert_token'][0] + " [SEP]"

    # Run BERT once per concept. The token-wise view below is derived from the
    # same hidden states rather than from a second forward pass over the same
    # text.
    hidden_states = get_hidden_states(marked_text)

    # [layer, batch, token, feature] -> [layer, token, feature]
    # -> [token, layer, feature]
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1, 0, 2)

    # Token index 1 is the concept token (index 0 is "[CLS]"); layer index 12
    # is the entry at position 12 of the hidden states.
    last_embedding = token_embeddings[1][12]
    four_last = sum_four_layers(token_embeddings)
    avg_embedding = avg_twelve_layers(hidden_states)

    embeddings_last.append(_to_tsv_row(last_embedding))
    embeddings_four_last.append(_to_tsv_row(four_last[1]))
    embeddings_avg.append(_to_tsv_row(avg_embedding))

In [None]:
# Dump the pre-formatted rows, one per line, into their TSV files.
for tsv_path, tsv_rows in (
    ("data/embeddings/last_embedding.tsv", embeddings_last),
    ("data/embeddings/four_last_embedding.tsv", embeddings_four_last),
):
    with open(tsv_path, "w") as tsv_file:
        for row in tsv_rows:
            tsv_file.write(f"{row}\n")

In [None]:
# Dump the averaged-embedding rows, one per line.
with open("data/embeddings/avg_embedding.tsv", "w") as avg_file:
    for row in embeddings_avg:
        avg_file.write(f"{row}\n")

## Embeddings for concepts with two-token entry terms

In [15]:
# Concepts that have at least one entry term BERT splits into exactly two
# tokens.
bert_double_concepts = []

for concept in concepts:
    for term in concept['entry_terms']:
        tokenized_text = tokenizer.tokenize(term)

        if len(tokenized_text) == 2:
            # Store a copy instead of writing 'bert_token' into the shared
            # dict from `concepts`. Mutating the shared dict would overwrite a
            # 'bert_token' that another cell attached to the same concept.
            bert_double_concepts.append({**concept, 'bert_token': tokenized_text})

            # Keep only the first matching term of each concept.
            break

In [19]:
# Spot-check one record (index 1) of the two-token concept list.
print(bert_double_concepts[1])

{'concept_id': ['D000043'], 'entry_terms': ['Abstracting and Indexing', 'Indexing and Abstracting as Topic', 'Abstracting and Indexing as Topic', 'Indexing and Abstracting', 'Indexing', 'Indexing as Topic', 'Indexes as Topic', 'Abstracting', 'Abstracting as Topic'], 'bert_token': ['Index', '##ing']}


In [32]:
# Probe the first ten two-token concepts: print the marked text, then the
# number of token positions in the resulting embedding tensor.
#
# NOTE(review): the recorded output shows 7-9 token positions for word-piece
# pairs such as "A" / "##bate" but 5 for whole-word pairs, which suggests the
# "##" prefix is being re-tokenized -- confirm before using these embeddings.
j = 0
for j, concept in enumerate(bert_double_concepts, start=1):
    first_piece = concept['bert_token'][0]
    second_piece = concept['bert_token'][1]
    marked_text = f"[CLS] {first_piece} [SEP] {second_piece} [SEP] "

    print(marked_text)

    token_embeddings = get_embeddings_for_double(marked_text)
    print(len(token_embeddings))

    if j == 10:
        break
#     last_embedding = token_embeddings[1][12]

#     tsv_row_last = ''

#     print(len(last_embedding))
#     print(len(four_last[1]))

#     for e in last_embedding:
#         tsv_row_last = tsv_row_last + str(e.item()) + '\t'
#     embeddings_last.append(tsv_row_last)


[CLS] A [SEP] ##bate [SEP] 
13
8
[CLS] Index [SEP] ##ing [SEP] 
13
7
[CLS] A [SEP] ##cacia [SEP] 
13
8
[CLS] Research [SEP] Institutes [SEP] 
13
5
[CLS] Accounting [SEP] ##s [SEP] 
13
7
[CLS] Petroleum [SEP] Industry [SEP] 
13
5
[CLS] Coal [SEP] Industry [SEP] 
13
5
[CLS] Work [SEP] Performance [SEP] 
13
5
[CLS] Grandpa [SEP] ##rent [SEP] 
13
7
[CLS] He [SEP] ##uristic [SEP] 
13
9


## king - man + woman = ?

In [None]:
from scipy.spatial.distance import cosine

def calculate_cosine_distance(token_vecs_sum_1, token_vecs_sum_2):
    """Print and return the cosine similarity of two vectors.

    Despite the function name, the reported value is a similarity:
    1 minus SciPy's cosine distance.

    Args:
        token_vecs_sum_1: First 1-D vector.
        token_vecs_sum_2: Second 1-D vector, same length as the first.

    Returns:
        The cosine similarity as a float. Earlier versions only printed the
        value and returned None; the printed line is unchanged.
    """
    similarity = float(1 - cosine(token_vecs_sum_1, token_vecs_sum_2))
    print('Vector similarity:  %.2f' % similarity)
    return similarity

In [None]:
# Word-analogy check: is (king - man + woman) close to queen?
king_token = 'king'
man_token = 'man'
woman_token = 'woman'
queen_token = 'queen'

# get_embeddings() takes only the text. Each word gets its own embedding;
# previously all four were computed from `king_token`.
king_embeddings = get_embeddings(king_token)
man_embeddings = get_embeddings(man_token)
woman_embeddings = get_embeddings(woman_token)
queen_embeddings = get_embeddings(queen_token)

four_last_king = sum_four_layers(king_embeddings)
four_last_man = sum_four_layers(man_embeddings)
four_last_woman = sum_four_layers(woman_embeddings)
four_last_queen = sum_four_layers(queen_embeddings)

# Index 1 is the word's own token; index 0 is "[CLS]".
result_embeddings = four_last_king[1] - four_last_man[1] + four_last_woman[1]
print('result:', result_embeddings[:5])

# Compare full vectors. The old call compared a 5-element slice with the whole
# result vector and also had an unbalanced closing parenthesis.
calculate_cosine_distance(four_last_queen[1], result_embeddings)