In [1]:
import sys
import os
from timeit import default_timer as timer
import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM, RobertaConfig

In [2]:
# Make sure we're in the transformers directory with fine-tuned model output.
os.chdir('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/')
os.getcwd()

'/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings'

In [3]:
# Adapted from the tutorial at https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
# and Transformers documentation: https://huggingface.co/transformers/model_doc/roberta.html#robertaformaskedlm

In [26]:
tokenizer = RobertaTokenizer.from_pretrained('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/output_wiki-103_filtered')
config = RobertaConfig.from_pretrained('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/output_wiki-103_filtered')
model = RobertaForMaskedLM.from_pretrained('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/output_wiki-103_filtered', config=config)
model.eval()

context_file = "/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/wiki.test.raw.out"
output_file = '/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/roberta_test.txt'
count_file = '/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/roberta_test_counts.txt'
vocab_file = '/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/vocab_checked.txt'
vocab = make_vocab(vocab_file)
FEATURE_COUNT = 768

In [27]:
# Process vocabulary words in the outer loop.
for v in vocab:
    start = timer()
    with open(context_file, 'r') as lines:
        v_sum = torch.zeros([1, FEATURE_COUNT])
        v_tokens = tokenize_text(v, tokenizer)
        print_tokenized_text(v_tokens, tokenizer)
        count_sentence = 0
        count_tensor = 0
        
        # Process all lines in the context file in the inner loop.
        for line in lines:
            # Check for this vocab word in this line; if found, split the line into individual sentences.
            if v in line.lower().split():
                for sentence in line.split('.'):
                    if v in sentence.lower():
                        line = sentence
                        count_sentence += 1
                        print(f'\nInstance {count_sentence} of {tokenizer.decode(v_tokens[1:-1]).strip()}.')
                        break  # We'll take the first instance of the word and discard the rest of the line.
                # Split the new sentence-based line into tokens.
                line_tokens = tokenize_text(line, tokenizer)               
                # Get the indices of the line at which our vocabulary word tokens are located.
                indices = get_vocab_indices(v_tokens, line_tokens, tokenizer)                             

                # If the vocabulary word was found, process the containing line.
                if indices:
                    # Get the feature vectors for all tokens in the line/sentence.
                    token_embeddings = create_token_embeddings(line_tokens)
                    # Select a method for extracting specific layers of the model.
                    token_vecs_layer = get_layer_token_vecs(token_embeddings, 12)
                    # Get the vocab word's contextual embedding for this line.
                    tensor_layer = torch.zeros([1, FEATURE_COUNT])
                    for i in range(len(indices)):
                        v_index = i % len(v_tokens[1:-1])
                        print(f'{tokenizer.decode(v_tokens[v_index + 1]).strip()} at index {indices[i]}: {token_vecs_layer[indices[i]][:5].tolist()}')
                        tensor_layer += token_vecs_layer[indices[i]]
#                         print(f'Sum of tensors is: {tensor_layer[0][:5].tolist()} before taking the mean.')

                    # If our vocab word is broken into more than one token, we need to get the mean of the token embeddings.
                    tensor_layer /= len(indices)
#                     print(f'Sum of tensors is: {tensor_layer[0][:5].tolist()} after taking the mean.')

                    # Add the embedding distilled from this line to the sum of embeddings for all lines.
                    v_sum += tensor_layer
                    count_tensor += 1
                    print(f'Grand sum of {count_tensor} tensor sets is: {v_sum[0][:5].tolist()}')
                ###################################################################################
            # Stop processing lines once we've found 2000 instances of our vocab word.
            if count_tensor >= 2000:
                break
        
        # We're done processing all lines of 512 tokens or less containing our vocab word.
        # Get the mean embedding for the word.
        v_mean = v_sum / count_tensor
        print(f'Mean of tensors is: {v_mean[0][:5]} ({len(v_mean[0])} features in tensor)')
        write_embedding(output_file, v, v_mean)
        try:
            with open(count_file, 'a') as counts:
                counts.write(v + ', ' + str(count_tensor) + '\n')
            print(f'Saved the count of sentences used to create {v} embedding')
        except:
            print('Wha?! Could not write the sentence count.')
    end = timer()
    print(f'Run time for {v} was {end - start} seconds.')


There are 1 tokens in tokenized text:
aback
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for aback.
Saved the count of sentences used to create aback embedding
Run time for aback was 0.034120993000215094 seconds.

There are 2 tokens in tokenized text:
ab
ashed
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for abashed.
Saved the count of sentences used to create abashed embedding
Run time for abashed was 0.03028911399997014 seconds.

There are 1 tokens in tokenized text:
abhor
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for abhor.
Saved the count of sentences used to create abhor embedding
Run time for abhor was 0.02822808500013707 seconds.

There are 2 tokens in tokenized text:
abhor
red
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for abhorred.
Saved the count of sentence

Size of token embeddings is torch.Size([37, 13, 768])
Shape of summed layers is: 37 x 768
accepted at index 22: [-0.005839262157678604, 0.0065529122948646545, 0.007053704932332039, 0.2620042860507965, 0.30201423168182373]
Grand sum of 5 tensor sets is: [0.3896675705909729, 0.37197333574295044, -0.18983963131904602, 1.792115330696106, 1.7109131813049316]

Instance 6 of accepted.
Size of token embeddings is torch.Size([21, 13, 768])
Shape of summed layers is: 21 x 768
accepted at index 19: [0.055956169962882996, 0.10654482245445251, -0.1882839798927307, 0.127366304397583, 0.40761083364486694]
Grand sum of 6 tensor sets is: [0.4456237554550171, 0.47851815819740295, -0.37812361121177673, 1.919481635093689, 2.1185240745544434]

Instance 7 of accepted.
Size of token embeddings is torch.Size([28, 13, 768])
Shape of summed layers is: 28 x 768
accepted at index 3: [0.04771412909030914, -0.06445986032485962, -0.07482258975505829, 0.20206670463085175, 0.9173025488853455]
Grand sum of 7 tensor set

KeyboardInterrupt: 

In [12]:
def make_vocab(vocab_file):
    """Convert a file of newline separated words into a Python list and return it."""
    vocab = []
    with open(vocab_file, 'r') as v:
        vocab = v.read().splitlines()
    return vocab

In [20]:
def tokenize_text(text, tokenizer):
    """Break the input text into tokens the model can use, and return them.
    Use max_length to avoid overflowing the maximum sequence length for the model."""
    tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=512)
    return tokenized_text

In [18]:
def print_tokenized_text(tokens, tokenizer):
    """Print the number of tokens in some tokenized text, not counting the leading and trailing separators.
    Print each token without any leading or trailing whitespace."""
    print(f'\nThere are {len(tokens) - 2} tokens in tokenized text:')
    for t in tokens[1:-1]:
        print(tokenizer.decode(t).strip())

In [21]:
def get_vocab_indices(v_tokens, line_tokens, tokenizer):
    """Search a line for all tokens of a vocabulary word, and return the indices of their locations."""
    indices = []              
    for t in v_tokens[1:-1]:
        for i, token_str in enumerate(line_tokens):
            if tokenizer.decode(token_str).strip() == tokenizer.decode(t).strip():
                indices.append(i)
    return indices

In [29]:
def create_token_embeddings(tokenized_text):
    """Convert the model into a more usable format: a tensor of size [<token_count>, <layer_count>, <feature_count>]."""
    input_ids = torch.tensor(tokenized_text).unsqueeze(0)  # Batch size 1
    with torch.no_grad():
        outputs = model(input_ids, masked_lm_labels=input_ids)
        encoded_layers = outputs[2]
        token_embeddings = torch.stack(encoded_layers, dim=0)  # Concatenate the tensors for all layers.
        token_embeddings = torch.squeeze(token_embeddings, dim=1)  # Remove the "batches" dimension
        token_embeddings = token_embeddings.permute(1,0,2)  # Rearrange the model dimensions.
        print(f'Size of token embeddings is {token_embeddings.size()}')
        return token_embeddings

In [23]:
def sum_last_four_token_vecs(token_embeddings):
    """Sum the last 4 layers' features and return the resulting vector."""
    token_vecs_sum_last_four = []

    # For each token in the sentence, sum the last 4 layers of the model.
    for token in token_embeddings:
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `sum_vec` to represent `token`.
        token_vecs_sum_last_four.append(sum_vec)

    print ('Shape of summed layers is: %d x %d' % (len(token_vecs_sum_last_four), len(token_vecs_sum_last_four[0])))
    return token_vecs_sum_last_four

In [7]:
# Return a single layer of the model.
def get_layer_token_vecs(token_embeddings, layer_number):
    token_vecs_layer = []

    # For each token in the sentence...
    for token in token_embeddings:
        # `token` is a [13 x 768] tensor
        # Sum the vectors from the last 4 layers.
        layer_vec = token[layer_number]

        # Use `sum_vec` to represent `token`.
        token_vecs_layer.append(layer_vec)

    print ('Shape of summed layers is: %d x %d' % (len(token_vecs_layer), len(token_vecs_layer[0])))
    # Shape is: <token count> x 768
    return token_vecs_layer

In [8]:
def write_embedding(embeddings_file, vocab_word, contextual_embedding):
    try:
        with open(embeddings_file, 'a') as f:
            f.write(vocab_word)
            for value in contextual_embedding[0]:
                f.write(' ' + str(value.item()))
            f.write('\n')
        print(f'Saved the embedding for {vocab_word}.')
    except:
        print('Oh no! Unable to write to the embeddings file.')