In [None]:
import torch
from transformers import BertTokenizer, BertModel

import sys, traceback
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Name (or path) of the pretrained BERT checkpoint. It is handed to
# `init_model`, which forwards it to both `from_pretrained` calls.
BERT_MODEL='bert-base-cased'

In [3]:
def init_model(model_path):
    """Load a BERT tokenizer/model pair from `model_path`.

    The tokenizer is created with lower-casing disabled and the model is
    created with `output_hidden_states=True`. `eval()` is called on the model
    before it is handed back.

    Args:
        model_path: Value forwarded unchanged to both `from_pretrained` calls.

    Returns:
        tuple: `(model, tokenizer)`, in that order.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=False)
    bert_model = BertModel.from_pretrained(model_path, output_hidden_states=True)

    # The notebook only runs inference, so switch the module to eval mode.
    bert_model.eval()

    return bert_model, bert_tokenizer

In [74]:
def get_hidden_states(model,tokenizer,marked_text):
    """Tokenize `marked_text`, run it through `model` and return its hidden states.

    Args:
        model: Callable invoked as `model(token_ids, segment_ids)`. The item at
            index 2 of its result is returned. NOTE(review): presumably the
            `BertModel` built by `init_model` with
            `output_hidden_states=True` -- confirm.
        tokenizer: Object providing `tokenize` and `convert_tokens_to_ids`.
        marked_text: Input string. This function adds no special markers, so
            the caller must already have inserted `[CLS]` / `[SEP]` if wanted.

    Returns:
        `outputs[2]` of the model call.

    Side effects:
        Prints every token next to its vocabulary id.
    """
    tokens = tokenizer.tokenize(marked_text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Show each token alongside its vocabulary index.
    for token, token_id in zip(tokens, token_ids):
        print('{:<12} {:>6,}'.format(token, token_id))

    # One segment id per token; every token is assigned to segment 1.
    segment_ids = [1] * len(tokens)

    # Wrapping in an outer list gives both tensors a leading batch dimension.
    tokens_tensor = torch.tensor([token_ids])
    segments_tensors = torch.tensor([segment_ids])

    # Pure inference: gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        # NOTE(review): the second argument is passed positionally, so which
        # parameter it binds to (attention mask vs. token type ids) depends on
        # the installed `transformers` version -- confirm, and consider
        # passing it by keyword.
        outputs = model(tokens_tensor, segments_tensors)

    return outputs[2]

In [12]:
def print_and_plot(hidden_states):
    """Print the nesting sizes of `hidden_states` and plot one token vector.

    `hidden_states` is indexed as `[layer][batch][token][unit]`. The size of
    each level is printed using the first element of the level above it, and
    the values of token 1 in layer 1 (batch 0) are drawn as a 200-bin
    histogram.

    Args:
        hidden_states: Nested, indexable sequence with at least two layers
            and at least two tokens in batch 0.
    """
    print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")

    first_layer = hidden_states[0]
    print ("Number of batches:", len(first_layer))

    first_batch = first_layer[0]
    print ("Number of tokens:", len(first_batch))

    print ("Number of hidden units:", len(first_batch[0]))

    # Select the feature values of the token at index 1 from the layer at
    # index 1 (always batch 0).
    plotted_layer, plotted_token = 1, 1
    values = hidden_states[plotted_layer][0][plotted_token]

    # Histogram of the selected vector's values to show their distribution.
    plt.figure(figsize=(10, 10))
    plt.hist(values, bins=200)
    plt.show()

In [31]:
def concatenate_four_layers(token_embeddings):
    """Represent each token by the concatenation of its last four layer vectors.

    Args:
        token_embeddings: Iterable of per-token tensors indexed as
            `[layer, feature]`. Each token must have at least four layers,
            otherwise indexing `token[-4]` raises `IndexError`.

    Returns:
        list: One 1-D tensor per token, ordered last layer first
        (`[-1], [-2], [-3], [-4]`), so each has four times the per-layer
        feature length. An empty input yields an empty list.

    Side effects:
        Prints the resulting "tokens x features" shape.
    """
    token_vecs_cat = [
        torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
        for token in token_embeddings
    ]

    # Guard the width lookup so an empty input reports "0 x 0" instead of
    # failing on `token_vecs_cat[0]`.
    n_features = len(token_vecs_cat[0]) if token_vecs_cat else 0
    print ('token_vecs_cat Shape is: %d x %d' % (len(token_vecs_cat), n_features))

    # Hand the vectors back to the caller, mirroring `sum_four_layers`.
    return token_vecs_cat

In [36]:
def sum_four_layers(token_embeddings):
    """Represent each token by the element-wise sum of its last four layers.

    Args:
        token_embeddings: Iterable of per-token tensors indexed as
            `[layer, feature]`.

    Returns:
        list: One 1-D tensor per token, each with the per-layer feature
        length.

    Side effects:
        Prints the resulting "tokens x features" shape.
    """
    # Slice the trailing four layers of every token and collapse the layer
    # dimension by summation.
    token_vecs_sum = [
        torch.sum(layers[-4:], dim=0)
        for layers in token_embeddings
    ]

    n_tokens = len(token_vecs_sum)
    n_features = len(token_vecs_sum[0])
    print ('token_vecs_sum Shape is: %d x %d' % (n_tokens, n_features))

    return token_vecs_sum

In [33]:
def avg_twelve_layers(hidden_states):
    """Mean-pool the token vectors of the second-to-last layer.

    Despite the name, no averaging across layers happens: only
    `hidden_states[-2]` is read, its first batch entry is taken, and the
    token vectors in it are averaged along the token dimension.

    Args:
        hidden_states: Sequence of per-layer tensors indexed as
            `[batch, token, feature]`.

    Returns:
        1-D tensor with one value per feature.

    Side effects:
        Prints the "tokens x features" shape of the pooled layer.
    """
    # Second-to-last layer, first (and presumably only) batch entry.
    penultimate_layer = hidden_states[-2]
    token_vecs = penultimate_layer[0]

    # Collapse the token dimension by averaging.
    sentence_embedding = torch.mean(token_vecs, dim=0)

    n_tokens, n_features = len(token_vecs), len(token_vecs[0])
    print ('token_vecs Shape is: %d x %d' % (n_tokens, n_features))

    return sentence_embedding

In [56]:
def get_embeddings_given_token(text, token_vecs_sum, i):
    """Print the first five values of the `i`-th vector, labelled with `text`.

    Args:
        text: Label printed in the header line and before the values.
        token_vecs_sum: Indexable collection of sliceable vectors.
        i: Index of the vector to show.
    """
    print('First 5 vector values for each instance of', text)
    print('')

    # Only the leading five entries are shown to keep the output short.
    leading_values = token_vecs_sum[i][:5]
    print(text, str(leading_values))

In [110]:
from scipy.spatial.distance import cosine

def calculate_cosine_distance(token_vecs_sum_1, token_vecs_sum_2):
    """Print the cosine similarity between two vectors.

    Despite the name, the printed value is a similarity: one minus the
    cosine distance returned by `scipy.spatial.distance.cosine`. Nothing is
    returned.

    Args:
        token_vecs_sum_1: First 1-D vector.
        token_vecs_sum_2: Second 1-D vector.
    """
    distance = cosine(token_vecs_sum_1, token_vecs_sum_2)

    # Convert distance to similarity before reporting it.
    similarity = 1 - distance
    print('Vector similarity:  %.2f' % similarity)

In [1]:
# Two words to compare. They are encoded together as a single marked-up
# input so both are embedded in one forward pass.
text1 = "man" 
text2 =   "woman"

marked_text_compound = "[CLS] " + text1 + " [SEP]" + text2 + " [SEP]"
# Single-word variants, kept for encoding each word separately.
marked_text1 = "[CLS] " + text1 + " [SEP]"
marked_text2 = "[CLS] " + text2 + " [SEP]"


model,tokenizer = init_model(BERT_MODEL)
hidden_states_1 = get_hidden_states(model, tokenizer, marked_text_compound)
# hidden_states_2 = get_hidden_states(model, tokenizer, marked_text2)

# Inspect the container returned by the model. This must reference
# `hidden_states_1`, the name bound above; a bare `hidden_states` is not
# defined at notebook level.
print('      Type of hidden_states: ', type(hidden_states_1))

# Each entry of the container is one layer's tensor.
print('Tensor shape for each layer: ', hidden_states_1[0].size())

NameError: name 'init_model' is not defined

In [166]:
# Reshape the per-layer hidden states into a per-token view:
#   1. `stack` adds a new leading "layer" dimension,
#   2. `squeeze(dim=1)` drops the batch dimension,
#   3. `permute(1, 0, 2)` swaps layers and tokens,
# so the result is indexed as [token, layer, feature].
token_embeddings_1 = (
    torch.stack(hidden_states_1, dim=0)
    .squeeze(dim=1)
    .permute(1, 0, 2)
)

# One vector per token: the sum of its last four layers.
token_vecs_sum_1 = sum_four_layers(token_embeddings_1)

# Sentence-level vector from the second-to-last layer (also prints its shape).
sentence_embedding_1 = avg_twelve_layers(hidden_states_1)

# Positions 1 and 3 are `text1` and `text2` in "[CLS] text1 [SEP] text2 [SEP]".
# This assumes each word maps to a single token, which matches the 5-token
# shape printed for this input.
calculate_cosine_distance(token_vecs_sum_1[1], token_vecs_sum_1[3])

token_vecs_sum Shape is: 5 x 768
token_vecs Shape is: 5 x 768
Vector similarity:  0.94


In [None]:
import torch

# NOTE(security): `torch.load` unpickles the file, which can execute arbitrary
# code. Only load checkpoints from a source you trust.
md = torch.load('./pytorch_model.bin', map_location='cpu')

# Key of the word-embedding matrix inside the checkpoint's state dict.
# NOTE(review): the `bert.` prefix depends on which model class saved the
# checkpoint -- confirm against `md.keys()`.
embedding_key = 'bert.embeddings.word_embeddings.weight'
if embedding_key not in md:
    raise KeyError(
        "%r not found in checkpoint; available keys start with: %s"
        % (embedding_key, list(md)[:5])
    )
embeds = md[embedding_key]

# Print one embedding vector per line, values rounded to 6 decimals and
# separated by spaces.
for vector in embeds:
    print(' '.join(str(round(value, 6)) for value in vector.tolist()))

In [None]:
import torch

# Encode a short sentence. Wrapping the id list in an outer list yields a
# tensor with a leading batch dimension of size 1.
input_ids = torch.tensor([tokenizer.encode("Hello sir")])

outputs = model(input_ids)

# Item 0 of the model output is the last layer's hidden state.
last_hidden_states = outputs[0]

In [None]:
# Fetch the model's input-embedding module; the bare name on the last line
# displays it as the cell output.
embeddings = model.get_input_embeddings()
embeddings

In [None]:
# Look up the input embedding of a single vocabulary id. The tensor is named
# `token_id` rather than `input` so the builtin `input()` is not shadowed for
# the rest of the kernel session.
# NOTE(review): 28995 is presumably the last id of the cased vocabulary --
# confirm against `tokenizer.vocab_size`.
token_id = torch.LongTensor([28995])
embeddings(token_id)

# trying ontology alignment

In [None]:
pip install xmltodict

import xmltodict

with open('../documents/ontologies/desc2020.xml') as fd:
    doc = xmltodict.parse(fd.read())