In [18]:
import sys
import os
from timeit import default_timer as timer
import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM, RobertaConfig
import matplotlib.pyplot as plt
# % matplotlib inline
from scipy.spatial.distance import cosine

In [19]:
# Make sure we're in the transformers directory with fine-tuned model output.
os.chdir('/home/jupyter/Notebooks/crystal/NLP/transformers/examples/')
os.getcwd()

'/home/jupyter/Notebooks/crystal/NLP/transformers/examples'

In [20]:
# Adapted from the tutorial at https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
# and Transformers documentation: https://huggingface.co/transformers/model_doc/roberta.html#robertaformaskedlm

In [26]:
tokenizer = RobertaTokenizer.from_pretrained('./output_wiki-103_base/')
config = RobertaConfig.from_pretrained('./output_wiki-103_base/')
model = RobertaForMaskedLM.from_pretrained('./output_wiki-103_base/', config=config)
# Outputting hidden states allows direct access to hidden layers of the model.
# Outputting hidden states must be set to "true" in the config file during fine-tuning.
# config.output_hidden_states = True
model.eval()

context_file = "/home/jupyter/Notebooks/crystal/NLP/transformers/examples/CC_WET_test_ab"
output_file = '/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/'
count_file = '/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/'
vocab_file = '/home/jupyter/Notebooks/crystal/NLP/MiFace/Python/data/vocab_files/FE_vocab_study.txt'
vocab = make_vocab(vocab_file)

FEATURE_COUNT = 768
LAYER_COUNT = 13
MAX_LINES = 100

In [27]:
# Test all layers of the model in the outer loop.
for l in range(0, LAYER_COUNT):
    l_output_file = ''
    l_count_file = ''
    l_output_file = os.path.join(output_file, 'roberta_layer' + str(l) + '_wiki-103_base_FEvocab.txt')
    l_count_file = os.path.join(count_file, 'roberta_layer'+ str(l) + '_wiki-103_base_counts.txt')
    print(l_output_file, l_count_file)
    # Process vocabulary words in the middle loop.
    for v in vocab:
        start = timer()
        with open(context_file, 'r') as lines:
            v_sum = torch.zeros([1, FEATURE_COUNT])
            v_tokens = tokenizer.encode(v)
            print(f'\nThere are {len(v_tokens) - 2} tokens in tokenized vocabulary word:')
            for t in v_tokens[1:-1]:
                print(tokenizer.decode(t).strip())
            count_sentence = 0
            count_tensor = 0

            # Process all lines in the context file in the inner loop.
            for line in lines:
                # Check for this vocab word in this line; if found, split the line into individual sentences.
                if v in line.lower().split():
                    for sentence in line.split('.'):
                        if v in sentence.lower():
                            line = sentence
                            count_sentence += 1
                            break
                    # Split the new sentence-based line into tokens.
                    # Use max_length to avoid overflowing the maximum sequence length for the model.
                    tokenized_text = tokenizer.encode(line, add_special_tokens=True, max_length=512)
                    indices = []              

                    # Check to see whether the vocab word is found in this particular line.
                    # Initially, some lines may have comprised multiple sentences, which were
                    # broken out individually above.
                    for t in v_tokens[1:-1]:
                        for i, token_str in enumerate(tokenized_text):
                            if tokenizer.decode(token_str).strip() == tokenizer.decode(t).strip():
                                indices.append(i)               

                    ###################################################################################
                    # If the vocabulary word was found, process the containing line.
                    if indices:

                        # The vocab word was found in this line/sentence, at the locations in indices.
                        # Get the feature vectors for all tokens in the line/sentence.
                        token_embeddings = create_token_embeddings(tokenized_text)
                        token_vecs_layer = get_layer_token_vecs(token_embeddings, l)

                        # Get the vocab word's contextual embedding for this line.
                        tensor_layer = torch.zeros([1, FEATURE_COUNT])
                        for i in range(len(indices)):
                            v_index = i % len(v_tokens[1:-1])
                            tensor_layer += token_vecs_layer[indices[i]]

                        # If our vocab word is broken into more than one token, we need to get the mean of the token embeddings.
                        tensor_layer /= len(indices)

                        # Add the embedding distilled from this line to the sum of embeddings for all lines.
                        v_sum += tensor_layer
                        count_tensor += 1
                    ###################################################################################
                # Stop processing lines once we've found 2000 instances of our vocab word.
                if count_tensor >= MAX_LINES:
                    break

            # We're done processing all lines of 512 tokens or less containing our vocab word.
            # Get the mean embedding for the word.
            v_mean = v_sum / count_tensor
            print(f'Mean of {count_tensor} tensors is: {v_mean[0][:5]} ({len(v_mean[0])} features in tensor)')
            write_embedding(l_output_file, v, v_mean)
            try:
                with open(l_count_file, 'a') as counts:
                    counts.write(v + ', ' + str(count_tensor) + '\n')
            except:
                print('Wha?! Could not write the sentence count.')
        end = timer()
        print(f'Run time for {v} was {end - start} seconds.')

/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/roberta_layer2_wiki-103_base_FEvocab.txt /home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/roberta_layer2_wiki-103_base_counts.txt

There are 4 tokens in tokenized vocabulary word:
st
upe
f
ied


IndexError: tuple index out of range

In [21]:
def make_vocab(vocab_file):
    vocab = []
    with open(vocab_file, 'r') as v:
        vocab = v.read().splitlines()
    return vocab

In [22]:
def create_token_embeddings(tokenized_text):
    input_ids = torch.tensor(tokenized_text).unsqueeze(0)  # Batch size 1
    with torch.no_grad():
        outputs = model(input_ids, masked_lm_labels=input_ids)
        encoded_layers = outputs[2]
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)
#         print(f'Size of token embeddings is {token_embeddings.size()}')
        return token_embeddings

In [23]:
# Sum the last 4 layers' features
def sum_last_four_token_vecs(token_embeddings):
    token_vecs_sum_last_four = []

    # For each token in the sentence...
    for token in token_embeddings:
        # `token` is a [13 x 768] tensor
        # Sum the vectors from the last 4 layers.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `sum_vec` to represent `token`.
        token_vecs_sum_last_four.append(sum_vec)

#     print ('Shape of summed layers is: %d x %d' % (len(token_vecs_sum_last_four), len(token_vecs_sum_last_four[0])))
    # Shape is: <token count> x 768
    return token_vecs_sum_last_four

In [24]:
# Return a single layer of the model.
def get_layer_token_vecs(token_embeddings, layer_number):
    token_vecs_layer = []

    # For each token in the sentence...
    for token in token_embeddings:
        # `token` is a [13 x 768] tensor
        # Sum the vectors from the last 4 layers.
        layer_vec = token[layer_number]

        # Use `sum_vec` to represent `token`.
        token_vecs_layer.append(layer_vec)

#     print ('Shape of summed layers is: %d x %d' % (len(token_vecs_layer), len(token_vecs_layer[0])))
    # Shape is: <token count> x 768
    return token_vecs_layer

In [25]:
def write_embedding(embeddings_file, vocab_word, contextual_embedding):
    try:
        with open(embeddings_file, 'a') as f:
            f.write(vocab_word)
            for value in contextual_embedding[0]:
                f.write(' ' + str(value.item()))
            f.write('\n')
#         print(f'Saved the embedding for {vocab_word}.')
    except:
        print('Oh no! Unable to write to the embeddings file.')