In [1]:
import sys
import os
from timeit import default_timer as timer
import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM, RobertaConfig
import matplotlib.pyplot as plt
# % matplotlib inline
from scipy.spatial.distance import cosine

In [2]:
# Make sure we're in the transformers `examples` directory with the fine-tuned model output:
# the tokenizer, config and model are loaded via the relative path './roBERTa_base/',
# which is resolved against the current working directory.
# NOTE(review): this is a hard-coded absolute path, so the cell only works on the original machine.
os.chdir('/home/jupyter/Notebooks/crystal/NLP/transformers/examples/')
os.getcwd()

'/home/jupyter/Notebooks/crystal/NLP/transformers/examples'

In [3]:
# Adapted from the tutorial at https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
# and Transformers documentation: https://huggingface.co/transformers/model_doc/roberta.html#robertaformaskedlm

In [3]:
# Load the tokenizer, config and masked-LM model from the local './roBERTa_base/' directory.
# The path is relative, so the current working directory matters.
tokenizer = RobertaTokenizer.from_pretrained('./roBERTa_base/')
config = RobertaConfig.from_pretrained('./roBERTa_base/')
model = RobertaForMaskedLM.from_pretrained('./roBERTa_base/', config=config)
# Outputting hidden states allows direct access to hidden layers of the model.
# Outputting hidden states must be set to "true" in the config file during fine-tuning.
# NOTE(review): the override below is disabled, so the notebook relies on the saved config
# already enabling hidden states — confirm.
# config.output_hidden_states = True
# Put the model in evaluation mode.
model.eval()

# Corpus that is scanned line by line for sentences containing each vocabulary word.
context_file = "/home/jupyter/Notebooks/crystal/NLP/transformers/examples/CC_WET_test_ab"
# NOTE(review): despite their names, output_file and count_file are directories; the
# per-layer file names are joined onto them with os.path.join in the processing loop.
output_file = '/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/'
count_file = '/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/'
# Vocabulary file; make_vocab turns each of its lines into one vocabulary entry.
vocab_file = '/home/jupyter/Notebooks/crystal/NLP/MiFace/Python/data/vocab_files/FE_vocab_study.txt'
# NOTE(review): make_vocab is defined in a later cell (In [6]). On a fresh kernel this line
# raises NameError unless that cell has been run first.
vocab = make_vocab(vocab_file)

# Width of the zero tensors used to accumulate embeddings
# (presumably the model's hidden size — confirm against the config).
FEATURE_COUNT = 768
# The processing loop runs over layer indices 0 .. LAYER_COUNT - 1.
LAYER_COUNT = 2
# Per vocabulary word, stop scanning the corpus once this many embeddings have been collected.
MAX_LINES = 100

NameError: name 'make_vocab' is not defined

In [29]:
# For every layer index in range(LAYER_COUNT), compute one mean contextual embedding per
# vocabulary word and append it to that layer's embeddings file, plus a "<word>, <count>"
# row to that layer's counts file.
#
# Relies on names defined in other cells: tokenizer, vocab, context_file, output_file,
# count_file, FEATURE_COUNT, LAYER_COUNT, MAX_LINES, create_token_embeddings,
# get_layer_token_vecs and write_embedding.
#
# NOTE(review): both output files are opened in append mode, so re-running this cell adds
# duplicate rows. The corpus is also re-read and the model re-run for every layer.
for layer in range(LAYER_COUNT):
    l_output_file = os.path.join(output_file, 'roberta_layer' + str(layer) + '_roberta_base_FEvocab.txt')
    l_count_file = os.path.join(count_file, 'roberta_layer' + str(layer) + '_roberta_base_counts.txt')
    print(l_output_file, l_count_file)

    # Process vocabulary words in the middle loop.
    for v in vocab:
        start = timer()
        v_sum = torch.zeros([1, FEATURE_COUNT])
        v_tokens = tokenizer.encode(v)
        # Drop the first and last ids that encode() adds around the word, and decode the
        # remaining sub-tokens once; these strings are compared against every context token.
        v_token_strs = [tokenizer.decode(t).strip() for t in v_tokens[1:-1]]
        print(f'\nThere are {len(v_tokens) - 2} tokens in tokenized vocabulary word:')
        for v_token_str in v_token_strs:
            print(v_token_str)
        count_tensor = 0

        # Process all lines in the context file in the inner loop.
        with open(context_file, 'r') as lines:
            for line in lines:
                # Only lines that contain the word as a lower-cased, whitespace-delimited token qualify.
                if v in line.lower().split():
                    # Narrow the line to the first '.'-delimited piece containing the word
                    # (substring match); fall back to the whole line if no piece matches.
                    text = next((piece for piece in line.split('.') if v in piece.lower()), line)

                    # Use max_length to avoid overflowing the maximum sequence length for the model.
                    tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=512)
                    text_token_strs = [tokenizer.decode(tok).strip() for tok in tokenized_text]

                    # Positions whose decoded text equals any one of the word's sub-tokens,
                    # grouped by sub-token in the order the sub-tokens appear in the word.
                    # NOTE(review): sub-tokens are matched independently, not as a contiguous
                    # run, so an unrelated word sharing a piece (e.g. 'ful') also contributes.
                    # Left as-is because changing it would alter the produced embeddings.
                    indices = [
                        position
                        for v_token_str in v_token_strs
                        for position, text_token_str in enumerate(text_token_strs)
                        if text_token_str == v_token_str
                    ]

                    # If any sub-token was found, process the containing sentence.
                    if indices:
                        # Get the feature vectors for all tokens, then keep only this layer.
                        token_embeddings = create_token_embeddings(tokenized_text)
                        token_vecs_layer = get_layer_token_vecs(token_embeddings, layer)

                        # Average the vectors at the matched positions to get the word's
                        # contextual embedding for this sentence.
                        tensor_layer = torch.zeros([1, FEATURE_COUNT])
                        for position in indices:
                            tensor_layer += token_vecs_layer[position]
                        tensor_layer /= len(indices)

                        # Add this sentence's embedding to the running sum for the word.
                        v_sum += tensor_layer
                        count_tensor += 1

                # Stop processing lines once MAX_LINES embeddings have been collected for this word.
                if count_tensor >= MAX_LINES:
                    break

        if count_tensor > 0:
            # Mean embedding for the word over all sentences that contributed.
            v_mean = v_sum / count_tensor
            print(f'Mean of {count_tensor} tensors is: {v_mean[0][:5]} ({len(v_mean[0])} features in tensor)')
            write_embedding(l_output_file, v, v_mean)
        else:
            # Without this guard the mean is 0/0 and a row of NaNs is written to the embeddings file.
            print(f'No usable sentences found for {v}; skipping its embedding.')

        # The count is recorded even when it is zero, so missing words stay visible.
        try:
            with open(l_count_file, 'a') as counts:
                counts.write(v + ', ' + str(count_tensor) + '\n')
        except OSError as err:
            # Best effort: report I/O failures and carry on; anything else propagates.
            print('Wha?! Could not write the sentence count.', err)

        end = timer()
        print(f'Run time for {v} was {end - start} seconds.')

/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/roberta_layer0_roberta_base_FEvocab.txt /home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/roberta_layer0_roberta_base_counts.txt

There are 4 tokens in tokenized vocabulary word:
st
upe
f
ied
Mean of 10 tensors is: tensor([-0.1499,  0.3891,  0.2168, -0.3009, -0.4857]) (768 features in tensor)
Run time for stupefied was 25.81966659601312 seconds.

There are 2 tokens in tokenized vocabulary word:
scorn
ful
Mean of 18 tensors is: tensor([ 0.3809, -0.0835, -0.2398, -0.0859, -0.1096]) (768 features in tensor)
Run time for scornful was 25.50855671998579 seconds.

There are 2 tokens in tokenized vocabulary word:
disbel
ieving
Mean of 26 tensors is: tensor([ 0.1109,  0.1336, -0.2724,  0.5597,  0.0137]) (768 features in tensor)
Run time for disbelieving was 25.88151633902453 seconds.

There are 2 tokens in tokenized vocabulary word:
disdain
ful
Mean of 20 tensors is: tensor([ 0.3525, -0.1842, -0.1626

Mean of 100 tensors is: tensor([ 0.2524, -0.1718,  0.8378,  1.0775, -0.3389]) (768 features in tensor)
Run time for discouraged was 12.18205466598738 seconds.

There are 1 tokens in tokenized vocabulary word:
stunned
Mean of 100 tensors is: tensor([-0.0347, -0.1143, -0.1919,  0.3307, -0.2458]) (768 features in tensor)
Run time for stunned was 10.906711356015876 seconds.

There are 1 tokens in tokenized vocabulary word:
fearful
Mean of 100 tensors is: tensor([0.0310, 0.3575, 0.1188, 0.2435, 0.4270]) (768 features in tensor)
Run time for fearful was 12.896710635977797 seconds.

There are 1 tokens in tokenized vocabulary word:
pissed
Mean of 100 tensors is: tensor([ 0.3167, -0.1452, -0.1645,  0.0293,  0.3432]) (768 features in tensor)
Run time for pissed was 10.822908477042802 seconds.

There are 1 tokens in tokenized vocabulary word:
terrified
Mean of 100 tensors is: tensor([-0.0674,  0.0385,  0.1501,  0.1075,  0.4392]) (768 features in tensor)
Run time for terrified was 11.5451788430800

Mean of 100 tensors is: tensor([-0.0427,  0.1519, -0.0349,  0.7986, -0.2600]) (768 features in tensor)
Run time for interested was 6.771487821009941 seconds.

There are 1 tokens in tokenized vocabulary word:
happy
Mean of 100 tensors is: tensor([ 0.2117,  0.2532,  0.4332,  0.4256, -0.0266]) (768 features in tensor)
Run time for happy was 7.2554441310931 seconds.
/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/roberta_layer1_roberta_base_FEvocab.txt /home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/roberta_layer1_roberta_base_counts.txt

There are 4 tokens in tokenized vocabulary word:
st
upe
f
ied
Mean of 10 tensors is: tensor([-0.1586,  0.6347,  0.4507, -0.5318, -0.8145]) (768 features in tensor)
Run time for stupefied was 17.747816493967548 seconds.

There are 2 tokens in tokenized vocabulary word:
scorn
ful
Mean of 18 tensors is: tensor([ 0.3017, -0.2832,  0.0431, -0.0180, -0.1338]) (768 features in tensor)
Run time for scornful was 18

Mean of 100 tensors is: tensor([-0.0162,  0.6408,  0.6147,  0.2643, -0.8166]) (768 features in tensor)
Run time for amused was 16.65588416403625 seconds.

There are 1 tokens in tokenized vocabulary word:
frightened
Mean of 100 tensors is: tensor([-0.1136,  0.1248,  0.3792,  0.6913,  0.8156]) (768 features in tensor)
Run time for frightened was 13.816430102917366 seconds.

There are 1 tokens in tokenized vocabulary word:
discouraged
Mean of 100 tensors is: tensor([ 0.1375,  0.0506,  1.1548,  0.9737, -0.1447]) (768 features in tensor)
Run time for discouraged was 11.912536459043622 seconds.

There are 1 tokens in tokenized vocabulary word:
stunned
Mean of 100 tensors is: tensor([-0.2764,  0.1920,  0.0638,  0.3221,  0.1024]) (768 features in tensor)
Run time for stunned was 10.866584729054011 seconds.

There are 1 tokens in tokenized vocabulary word:
fearful
Mean of 100 tensors is: tensor([-0.3102,  0.4526,  0.3339,  0.7427,  0.7396]) (768 features in tensor)
Run time for fearful was 12.8

Mean of 100 tensors is: tensor([-0.0188,  0.7806,  0.5708,  0.3402, -0.5025]) (768 features in tensor)
Run time for excited was 7.4258560690796 seconds.

There are 1 tokens in tokenized vocabulary word:
sad
Mean of 100 tensors is: tensor([0.4739, 0.5487, 0.3463, 0.4823, 1.3078]) (768 features in tensor)
Run time for sad was 7.526232686010189 seconds.

There are 1 tokens in tokenized vocabulary word:
interested
Mean of 100 tensors is: tensor([-0.2166,  0.3488,  0.1476,  1.2446, -0.4377]) (768 features in tensor)
Run time for interested was 6.498790754005313 seconds.

There are 1 tokens in tokenized vocabulary word:
happy
Mean of 100 tensors is: tensor([-0.2037,  0.6813,  0.6732,  0.8802,  0.0304]) (768 features in tensor)
Run time for happy was 6.763832855038345 seconds.


In [6]:
def make_vocab(vocab_file):
    """Read a vocabulary file and return its entries as a list of strings.

    Each line of the file yields one entry with surrounding whitespace removed.
    Blank (or whitespace-only) lines are skipped: an empty or padded entry can
    never equal a whitespace-delimited token of a context line, so it would
    only ever produce a zero-count, 0/0 mean embedding.

    Args:
        vocab_file: path of the text file to read.

    Returns:
        list of non-empty, stripped vocabulary strings in file order.
    """
    with open(vocab_file, 'r') as v:
        stripped_lines = (raw_line.strip() for raw_line in v.read().splitlines())
        return [word for word in stripped_lines if word]

In [7]:
def create_token_embeddings(tokenized_text):
    """Run the notebook-level `model` on one token-id sequence and return its stacked outputs.

    The ids are wrapped as a batch of one and passed both as the input and as
    `masked_lm_labels`. Element 2 of the model output is stacked along a new
    leading axis, the axis at position 1 is squeezed, and the first two
    remaining axes are swapped. Everything runs under `torch.no_grad()`.

    NOTE(review): presumably `outputs[2]` is the tuple of per-layer hidden states,
    which would make the result (tokens, layers, features). Confirm against the
    installed transformers version and that hidden-state output is enabled.

    Args:
        tokenized_text: sequence of token ids for one sentence.

    Returns:
        the stacked, squeezed and permuted tensor described above.
    """
    batch_of_one = torch.tensor(tokenized_text).unsqueeze(0)
    with torch.no_grad():
        model_outputs = model(batch_of_one, masked_lm_labels=batch_of_one)
        stacked = torch.stack(model_outputs[2], dim=0)
        return stacked.squeeze(dim=1).permute(1, 0, 2)

In [8]:
# Sum the last 4 layers' features
def sum_last_four_token_vecs(token_embeddings):
    """Collapse each token's stack of vectors into one vector by summing its last four rows.

    Args:
        token_embeddings: iterable of per-token tensors. Each one is sliced with
            `[-4:]` along its first axis (assumed to be the layer axis,
            e.g. [13 x 768] — TODO confirm with the caller).

    Returns:
        list with one tensor per token: the sum over dim 0 of that token's last
        four rows (or of all rows when there are fewer than four).
    """
    return [torch.sum(layer_stack[-4:], dim=0) for layer_stack in token_embeddings]

In [9]:
# Return a single layer of the model.
def get_layer_token_vecs(token_embeddings, layer_number):
    """Pick out a single layer's vector for every token.

    Args:
        token_embeddings: iterable of per-token tensors, each indexed along its
            first axis (assumed to be the layer axis, e.g. [13 x 768] — TODO confirm).
        layer_number: index applied to each per-token tensor.

    Returns:
        list with one entry per token, holding `token[layer_number]`.
    """
    return [per_token[layer_number] for per_token in token_embeddings]

In [10]:
def write_embedding(embeddings_file, vocab_word, contextual_embedding):
    """Append one embedding to a text file as a single space-separated line.

    The line written is `<vocab_word> <v0> <v1> ...` followed by a newline,
    where the values are the elements of `contextual_embedding[0]`, each
    converted with `.item()` and `str()`.

    Args:
        embeddings_file: path of the file to append to.
        vocab_word: string written at the start of the line.
        contextual_embedding: indexable whose element 0 is iterated for values
            (the caller passes a [1, FEATURE_COUNT] tensor).

    I/O failures (OSError) are reported with a printed message instead of being
    raised, so a long run is not aborted by one failed write. Any other
    exception propagates to the caller.
    """
    # Format the whole row before touching the file, so a formatting failure can
    # never leave a partial line behind.
    row = ' '.join([vocab_word] + [str(value.item()) for value in contextual_embedding[0]])
    try:
        with open(embeddings_file, 'a') as f:
            f.write(row + '\n')
    except OSError as err:
        # Catch only I/O errors: a bare `except` would also hide KeyboardInterrupt
        # and programming errors behind this message.
        print('Oh no! Unable to write to the embeddings file.', err)