In [4]:
import sys
import os
from timeit import default_timer as timer
import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM, RobertaConfig
# import matplotlib.pyplot as plt
# % matplotlib inline
# from scipy.spatial.distance import cosine

In [5]:
# Make sure we're in the transformers directory with fine-tuned model output.
os.chdir('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/')
os.getcwd()

'/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings'

In [3]:
# Adapted from the tutorial at https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
# and Transformers documentation: https://huggingface.co/transformers/model_doc/roberta.html#robertaformaskedlm

In [20]:
tokenizer = RobertaTokenizer.from_pretrained('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/output_wiki-103_filtered')
config = RobertaConfig.from_pretrained('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/output_wiki-103_filtered')
model = RobertaForMaskedLM.from_pretrained('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/output_wiki-103_filtered', config=config)
# Outputting hidden states allows direct access to hidden layers of the model.
# config.output_hidden_states = True
model.eval()

context_file = "/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/wiki.test.raw.out"
output_file = '/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/roberta_test.txt'
count_file = '/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/roberta_test_counts.txt'
vocab_file = '/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/vocab_checked.txt'
vocab = make_vocab(vocab_file)

In [22]:
# Process vocabulary words in the outer loop.
for v in vocab:
    start = timer()
    with open(context_file, 'r') as lines:
        v_sum = torch.zeros([1, 768])
        v_tokens = tokenizer.encode(v)
        print(f'\nThere are {len(v_tokens) - 2} tokens in tokenized vocabulary word:')
        for t in v_tokens[1:-1]:
            print(tokenizer.decode(t).strip())
        count_sentence = 0
        count_tensor = 0
        
        # Process all lines in the context file in the inner loop.
        for line in lines:
            # Check for this vocab word in this line; if found, split the line into individual sentences.
            if v in line.lower().split():
                for sentence in line.split('.'):
                    if v in sentence.lower():
                        line = sentence
                        count_sentence += 1
                        print(f'\nInstance {count_sentence} of {tokenizer.decode(v_tokens[1:-1]).strip()}.')
                        break
                # Split the new sentence-based line into tokens.
                # Use max_length to avoid overflowing the maximum sequence length for the model.
                tokenized_text = tokenizer.encode(line, add_special_tokens=True, max_length=512)
#                 print(f'The decoded sentence has {len(tokenized_text)} tokens and is: {tokenizer.decode(tokenized_text)}')
                indices = []              

                # Check to see whether the vocab word is found in this particular line.
                # Initially, some lines may have comprised multiple sentences, which were
                # broken out individually above.
                for t in v_tokens[1:-1]:
                    print(f'Looking for vocab token: {tokenizer.decode(t).strip()}')
                    for i, token_str in enumerate(tokenized_text):
#                         print(f'Next sentence token: {tokenizer.decode(token_str).strip()}')
#                         print(tokenizer.decode(token_str).strip() == tokenizer.decode(t).strip())
                        if tokenizer.decode(token_str).strip() == tokenizer.decode(t).strip():
                            indices.append(i)               

                ###################################################################################
                # If the vocabulary word was found, process the containing line.
                if indices:

                    # The vocab word was found in this line/sentence, at the locations in indices.
                    print(f'Indices are {indices}')
                    # Get the feature vectors for all tokens in the line/sentence.
                    token_embeddings = create_token_embeddings(tokenized_text)
                    # Sum the last four layers to get embeddings for the line/sentence.
#                         for t in v_tokens[1:-1]:
#                             for i, token_str in enumerate(tokenized_text):
#                                 if (tokenizer.decode(token_str).strip() == tokenizer.decode(t).strip()):
#                                     print(f'{tokenizer.decode(token_str).strip()} is index {i} in the sentence and {token_str} in the vocabulary.')
                    token_vecs_layer = get_layer_token_vecs(token_embeddings, 12)

                    # Get the vocab word's contextual embedding for this line.
                    tensor_layer = torch.zeros([1, 768])
                    for i in range(len(indices)):
                        v_index = i % len(v_tokens[1:-1])
                        print(f'{tokenizer.decode(v_tokens[v_index + 1]).strip()} at index {indices[i]}: {token_vecs_layer[indices[i]][:5].tolist()}')
                        tensor_layer += token_vecs_layer[indices[i]]
#                         print(f'Sum of tensors is: {tensor_layer[0][:5].tolist()} before taking the mean.')

                    # If our vocab word is broken into more than one token, we need to get the mean of the token embeddings.
                    tensor_layer /= len(indices)
#                     print(f'Sum of tensors is: {tensor_layer[0][:5].tolist()} after taking the mean.')

                    # Add the embedding distilled from this line to the sum of embeddings for all lines.
                    v_sum += tensor_layer
                    count_tensor += 1
                    print(f'Grand sum of {count_tensor} tensor sets is: {v_sum[0][:5].tolist()}')
                ###################################################################################
            # Stop processing lines once we've found 2000 instances of our vocab word.
            if count_tensor >= 2000:
                break
        
        # We're done processing all lines of 512 tokens or less containing our vocab word.
        # Get the mean embedding for the word.
        v_mean = v_sum / count_tensor
        print(f'Mean of tensors is: {v_mean[0][:5]} ({len(v_mean[0])} features in tensor)')
        write_embedding(output_file, v, v_mean)
        try:
            with open(count_file, 'a') as counts:
                counts.write(v + ', ' + str(count_tensor) + '\n')
            print(f'Saved the count of sentences used to create {v} embedding')
        except:
            print('Wha?! Could not write the sentence count.')
    end = timer()
    print(f'Run time for {v} was {end - start} seconds.')


There are 1 tokens in tokenized vocabulary word:
aback
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for aback.
Saved the count of sentences used to create aback embedding
Run time for aback was 0.06877287099996465 seconds.

There are 2 tokens in tokenized vocabulary word:
ab
ashed
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for abashed.
Saved the count of sentences used to create abashed embedding
Run time for abashed was 0.02842349699994884 seconds.

There are 1 tokens in tokenized vocabulary word:
abhor
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for abhor.
Saved the count of sentences used to create abhor embedding
Run time for abhor was 0.024280316999920615 seconds.

There are 2 tokens in tokenized vocabulary word:
abhor
red
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embeddi

Size of token embeddings is torch.Size([54, 13, 768])
Shape of summed layers is: 54 x 768
accepted at index 26: [0.1258816421031952, -0.08449521660804749, -0.12492911517620087, 0.5103059411048889, 0.30571746826171875]
Grand sum of 4 tensor sets is: [0.3955068290233612, 0.3654204308986664, -0.1968933343887329, 1.5301110744476318, 1.408898949623108]

Instance 5 of accepted.
Looking for vocab token: accepted
Indices are [22]
Size of token embeddings is torch.Size([37, 13, 768])
Shape of summed layers is: 37 x 768
accepted at index 22: [-0.005839262157678604, 0.0065529122948646545, 0.007053704932332039, 0.2620042860507965, 0.30201423168182373]
Grand sum of 5 tensor sets is: [0.3896675705909729, 0.37197333574295044, -0.18983963131904602, 1.792115330696106, 1.7109131813049316]

Instance 6 of accepted.
Looking for vocab token: accepted
Indices are [19]
Size of token embeddings is torch.Size([21, 13, 768])
Shape of summed layers is: 21 x 768
accepted at index 19: [0.055956169962882996, 0.10654

Size of token embeddings is torch.Size([74, 13, 768])
Shape of summed layers is: 74 x 768
accommodating at index 40: [0.16071467101573944, 0.1875777542591095, -0.02297905646264553, 0.18755941092967987, 0.7809291481971741]
Grand sum of 1 tensor sets is: [0.16071467101573944, 0.1875777542591095, -0.02297905646264553, 0.18755941092967987, 0.7809291481971741]
Mean of tensors is: tensor([ 0.1607,  0.1876, -0.0230,  0.1876,  0.7809]) (768 features in tensor)
Saved the embedding for accommodating.
Saved the count of sentences used to create accommodating embedding
Run time for accommodating was 0.15933335800013992 seconds.

There are 1 tokens in tokenized vocabulary word:
accomplished

Instance 1 of accomplished.
Looking for vocab token: accomplished
Indices are [18]
Size of token embeddings is torch.Size([20, 13, 768])
Shape of summed layers is: 20 x 768
accomplished at index 18: [-0.04389871656894684, -0.006582438945770264, 0.22913262248039246, -0.1600954830646515, 0.07970038801431656]
Gran

Size of token embeddings is torch.Size([33, 13, 768])
Shape of summed layers is: 33 x 768
active at index 26: [0.0973002165555954, 0.0032387971878051758, -0.00901215709745884, 0.3552281856536865, -0.35985302925109863]
Grand sum of 4 tensor sets is: [0.10118819773197174, 0.36111515760421753, 0.5169610977172852, 1.2217259407043457, -1.5802624225616455]

Instance 5 of active.
Looking for vocab token: active
Indices are [9]
Size of token embeddings is torch.Size([15, 13, 768])
Shape of summed layers is: 15 x 768
active at index 9: [0.18956197798252106, 0.22880248725414276, 0.11391574889421463, 0.5409116744995117, -0.6028144359588623]
Grand sum of 5 tensor sets is: [0.2907501757144928, 0.5899176597595215, 0.6308768391609192, 1.7626376152038574, -2.183076858520508]

Instance 6 of active.
Looking for vocab token: active
Indices are [27]
Size of token embeddings is torch.Size([30, 13, 768])
Shape of summed layers is: 30 x 768
active at index 27: [-0.025129973888397217, -0.08917112648487091, -0

Mean of tensors is: tensor([0.3981, 0.0796, 0.2133, 0.7717, 0.8014]) (768 features in tensor)
Saved the embedding for acute.
Saved the count of sentences used to create acute embedding
Run time for acute was 0.12699194299989358 seconds.

There are 1 tokens in tokenized vocabulary word:
adamant
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for adamant.
Saved the count of sentences used to create adamant embedding
Run time for adamant was 0.028390168999976595 seconds.

There are 2 tokens in tokenized vocabulary word:
add
led
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for addled.
Saved the count of sentences used to create addled embedding
Run time for addled was 0.029982392999954754 seconds.

There are 1 tokens in tokenized vocabulary word:
admiration

Instance 1 of admiration.
Looking for vocab token: admiration
Indices are [9]
Size of token embeddings is torch.Size([20, 13, 768]

Size of token embeddings is torch.Size([18, 13, 768])
Shape of summed layers is: 18 x 768
affected at index 7: [0.057363126426935196, 0.5202317833900452, 0.11155638098716736, 0.5733896493911743, 0.09103856980800629]
Grand sum of 10 tensor sets is: [0.8750120997428894, 4.759311676025391, 1.291002869606018, 5.455968379974365, -0.38060587644577026]

Instance 11 of affected.
Looking for vocab token: affected
Indices are [25]
Size of token embeddings is torch.Size([30, 13, 768])
Shape of summed layers is: 30 x 768
affected at index 25: [0.03514000028371811, 0.14788757264614105, 0.07729656994342804, 0.39655354619026184, -0.3930742144584656]
Grand sum of 11 tensor sets is: [0.9101520776748657, 4.907199382781982, 1.3682994842529297, 5.852521896362305, -0.7736800909042358]

Instance 12 of affected.
Looking for vocab token: affected
Indices are [16]
Size of token embeddings is torch.Size([28, 13, 768])
Shape of summed layers is: 28 x 768
affected at index 16: [0.14838993549346924, 0.592944502830

Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for agog.
Saved the count of sentences used to create agog embedding
Run time for agog was 0.028895566000073813 seconds.

There are 2 tokens in tokenized vocabulary word:
agon
ized
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for agonized.
Saved the count of sentences used to create agonized embedding
Run time for agonized was 0.02781303899996601 seconds.

There are 1 tokens in tokenized vocabulary word:
agreeable
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for agreeable.
Saved the count of sentences used to create agreeable embedding
Run time for agreeable was 0.028221860000030574 seconds.

There are 2 tokens in tokenized vocabulary word:
ag
ressive
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for agressive.
Saved the count of 

Size of token embeddings is torch.Size([29, 13, 768])
Shape of summed layers is: 29 x 768
ambition at index 8: [0.05712300166487694, -0.4759107828140259, 0.3213956952095032, 0.015220381319522858, -0.09168340265750885]
Grand sum of 2 tensor sets is: [-0.0037985332310199738, -0.7670956254005432, 0.44811779260635376, 0.23437514901161194, -0.4927740693092346]

Instance 3 of ambition.
Looking for vocab token: ambition
Indices are [7]
Size of token embeddings is torch.Size([29, 13, 768])
Shape of summed layers is: 29 x 768
ambition at index 7: [-0.003787245601415634, -0.1687094122171402, 0.19934116303920746, 0.19455231726169586, -0.7462382316589355]
Grand sum of 3 tensor sets is: [-0.007585778832435608, -0.9358050227165222, 0.6474589705467224, 0.428927481174469, -1.2390122413635254]
Mean of tensors is: tensor([-0.0025, -0.3119,  0.2158,  0.1430, -0.4130]) (768 features in tensor)
Saved the embedding for ambition.
Saved the count of sentences used to create ambition embedding
Run time for amb

Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for angered.
Saved the count of sentences used to create angered embedding
Run time for angered was 0.03172099100015657 seconds.

There are 1 tokens in tokenized vocabulary word:
angrily

Instance 1 of angrily.
Looking for vocab token: angrily
Indices are [4]
Size of token embeddings is torch.Size([21, 13, 768])
Shape of summed layers is: 21 x 768
angrily at index 4: [-0.032908640801906586, 0.06928518414497375, -0.04905584454536438, 0.032097309827804565, 0.5097794532775879]
Grand sum of 1 tensor sets is: [-0.032908640801906586, 0.06928518414497375, -0.04905584454536438, 0.032097309827804565, 0.5097794532775879]

Instance 2 of angrily.
Looking for vocab token: angrily
Indices are [20]
Size of token embeddings is torch.Size([38, 13, 768])
Shape of summed layers is: 38 x 768
angrily at index 20: [0.03822272643446922, -0.06449541449546814, -0.0017080157995224, -0.18574970960617065, 0.69610357


Instance 1 of annoying.
Looking for vocab token: annoying
Indices are [11]
Size of token embeddings is torch.Size([21, 13, 768])
Shape of summed layers is: 21 x 768
annoying at index 11: [0.10794611275196075, 0.3627992272377014, 0.12529073655605316, 0.10958881676197052, 0.2889607846736908]
Grand sum of 1 tensor sets is: [0.10794611275196075, 0.3627992272377014, 0.12529073655605316, 0.10958881676197052, 0.2889607846736908]
Mean of tensors is: tensor([0.1079, 0.3628, 0.1253, 0.1096, 0.2890]) (768 features in tensor)
Saved the embedding for annoying.
Saved the count of sentences used to create annoying embedding
Run time for annoying was 0.07568186500020602 seconds.

There are 2 tokens in tokenized vocabulary word:
antagon
istic
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for antagonistic.
Saved the count of sentences used to create antagonistic embedding
Run time for antagonistic was 0.032699912999987646 seconds.

There are 2 tokens

Mean of tensors is: tensor([ 0.0992,  0.0727,  0.2822, -0.0251,  0.1242]) (768 features in tensor)
Saved the embedding for anxiety.
Saved the count of sentences used to create anxiety embedding
Run time for anxiety was 0.15094343700002355 seconds.

There are 1 tokens in tokenized vocabulary word:
anxious
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for anxious.
Saved the count of sentences used to create anxious embedding
Run time for anxious was 0.0297782999998617 seconds.

There are 2 tokens in tokenized vocabulary word:
anx
iously
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for anxiously.
Saved the count of sentences used to create anxiously embedding
Run time for anxiously was 0.026087830000051326 seconds.

There are 2 tokens in tokenized vocabulary word:
ap
athetic
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for apathet

Size of token embeddings is torch.Size([32, 13, 768])
Shape of summed layers is: 32 x 768
argue at index 6: [0.1798577755689621, 0.13027141988277435, 0.09999781847000122, 0.09113790094852448, -1.0571374893188477]
Grand sum of 3 tensor sets is: [0.2246711254119873, 0.45280516147613525, 0.46912407875061035, 0.27913662791252136, -3.77339506149292]
Mean of tensors is: tensor([ 0.0749,  0.1509,  0.1564,  0.0930, -1.2578]) (768 features in tensor)
Saved the embedding for argue.
Saved the count of sentences used to create argue embedding
Run time for argue was 0.21144664999997076 seconds.

There are 2 tokens in tokenized vocabulary word:
argument
ative
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for argumentative.
Saved the count of sentences used to create argumentative embedding
Run time for argumentative was 0.027810067000018535 seconds.

There are 1 tokens in tokenized vocabulary word:
aroused
Mean of tensors is: tensor([nan, nan, nan

Size of token embeddings is torch.Size([25, 13, 768])
Shape of summed layers is: 25 x 768
attempting at index 11: [-0.004661872982978821, -0.012229569256305695, 0.20318102836608887, 0.1792888194322586, 0.581027626991272]
Grand sum of 3 tensor sets is: [0.5386844277381897, 0.17191699147224426, 0.5534408092498779, 0.49088001251220703, 0.8163760900497437]

Instance 4 of attempting.
Looking for vocab token: attempting
Indices are [27]
Size of token embeddings is torch.Size([42, 13, 768])
Shape of summed layers is: 42 x 768
attempting at index 27: [0.11186040937900543, 0.13188433647155762, 0.07992716133594513, 0.07227587699890137, 0.47849926352500916]
Grand sum of 4 tensor sets is: [0.6505448222160339, 0.3038013279438019, 0.6333679556846619, 0.5631558895111084, 1.2948753833770752]

Instance 5 of attempting.
Looking for vocab token: attempting
Indices are [15]
Size of token embeddings is torch.Size([35, 13, 768])
Shape of summed layers is: 35 x 768
attempting at index 15: [0.2207575887441635

Size of token embeddings is torch.Size([27, 13, 768])
Shape of summed layers is: 27 x 768
aversion at index 23: [0.08816449344158173, 0.0816800445318222, 0.1390906423330307, 0.12265820056200027, -0.12753167748451233]
Grand sum of 1 tensor sets is: [0.08816449344158173, 0.0816800445318222, 0.1390906423330307, 0.12265820056200027, -0.12753167748451233]
Mean of tensors is: tensor([ 0.0882,  0.0817,  0.1391,  0.1227, -0.1275]) (768 features in tensor)
Saved the embedding for aversion.
Saved the count of sentences used to create aversion embedding
Run time for aversion was 0.08849502299995038 seconds.

There are 2 tokens in tokenized vocabulary word:
a
versive
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for aversive.
Saved the count of sentences used to create aversive embedding
Run time for aversive was 0.028951457999937702 seconds.

There are 1 tokens in tokenized vocabulary word:
avid
Mean of tensors is: tensor([nan, nan, nan, nan, n

Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for axed.
Saved the count of sentences used to create axed embedding
Run time for axed was 0.028360937999877933 seconds.

There are 2 tokens in tokenized vocabulary word:
back
handed
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for backhanded.
Saved the count of sentences used to create backhanded embedding
Run time for backhanded was 0.02738111700000445 seconds.

There are 1 tokens in tokenized vocabulary word:
badly

Instance 1 of badly.
Looking for vocab token: badly
Indices are [3]
Size of token embeddings is torch.Size([22, 13, 768])
Shape of summed layers is: 22 x 768
badly at index 3: [-0.04932598024606705, 0.20432643592357635, 0.14524024724960327, 0.2621002197265625, 1.0760900974273682]
Grand sum of 1 tensor sets is: [-0.04932598024606705, 0.20432643592357635, 0.14524024724960327, 0.2621002197265625, 1.0760900974273682]

Instan

Mean of tensors is: tensor([0.1063, 0.6146, 0.1141, 0.1567, 0.5055]) (768 features in tensor)
Saved the embedding for banal.
Saved the count of sentences used to create banal embedding
Run time for banal was 0.1496934419999434 seconds.

There are 1 tokens in tokenized vocabulary word:
barking
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for barking.
Saved the count of sentences used to create barking embedding
Run time for barking was 0.03098377299988897 seconds.

There are 2 tokens in tokenized vocabulary word:
bash
ful
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for bashful.
Saved the count of sentences used to create bashful embedding
Run time for bashful was 0.027605446000052325 seconds.

There are 2 tokens in tokenized vocabulary word:
be
aming
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for beaming.
Saved the count of 

Size of token embeddings is torch.Size([22, 13, 768])
Shape of summed layers is: 22 x 768
begging at index 16: [-0.084084153175354, 0.19248074293136597, 0.0530625618994236, 0.0920068770647049, 1.6260147094726562]
Grand sum of 1 tensor sets is: [-0.084084153175354, 0.19248074293136597, 0.0530625618994236, 0.0920068770647049, 1.6260147094726562]

Instance 2 of begging.
Looking for vocab token: begging
Indices are [4]
Size of token embeddings is torch.Size([31, 13, 768])
Shape of summed layers is: 31 x 768
begging at index 4: [0.22923703491687775, -0.2411428540945053, -0.01716305874288082, 0.3000563979148865, 1.1454803943634033]
Grand sum of 2 tensor sets is: [0.14515288174152374, -0.04866211116313934, 0.03589950501918793, 0.3920632600784302, 2.7714951038360596]
Mean of tensors is: tensor([ 0.0726, -0.0243,  0.0179,  0.1960,  1.3857]) (768 features in tensor)
Saved the embedding for begging.
Saved the count of sentences used to create begging embedding
Run time for begging was 0.147936787

Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for bested.
Saved the count of sentences used to create bested embedding
Run time for bested was 0.031252151999979105 seconds.

There are 1 tokens in tokenized vocabulary word:
betrayal
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for betrayal.
Saved the count of sentences used to create betrayal embedding
Run time for betrayal was 0.02735987699998077 seconds.

There are 1 tokens in tokenized vocabulary word:
betrayed
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for betrayed.
Saved the count of sentences used to create betrayed embedding
Run time for betrayed was 0.028704961000130425 seconds.

There are 2 tokens in tokenized vocabulary word:
bewild
ered
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for bewildered.
Saved the count 

Size of token embeddings is torch.Size([56, 13, 768])
Shape of summed layers is: 56 x 768
bleak at index 52: [0.007422048598527908, 0.0860687643289566, 0.16305164992809296, -0.27366897463798523, 1.9297645092010498]
Grand sum of 1 tensor sets is: [0.007422048598527908, 0.0860687643289566, 0.16305164992809296, -0.27366897463798523, 1.9297645092010498]

Instance 2 of bleak.
Looking for vocab token: bleak
Indices are [24]
Size of token embeddings is torch.Size([32, 13, 768])
Shape of summed layers is: 32 x 768
bleak at index 24: [-0.14343465864658356, 0.23452793061733246, 0.2352324277162552, -0.3386574685573578, 1.7515597343444824]
Grand sum of 2 tensor sets is: [-0.13601261377334595, 0.32059669494628906, 0.39828407764434814, -0.612326443195343, 3.6813242435455322]

Instance 3 of bleak.
Looking for vocab token: bleak
Indices are [34]
Size of token embeddings is torch.Size([42, 13, 768])
Shape of summed layers is: 42 x 768
bleak at index 34: [-0.08131209015846252, 0.18792158365249634, 0.203


Instance 5 of blown.
Looking for vocab token: blown
Indices are [8]
Size of token embeddings is torch.Size([17, 13, 768])
Shape of summed layers is: 17 x 768
blown at index 8: [0.22923651337623596, 0.20967547595500946, 0.1278967410326004, -0.33471596240997314, -0.14833146333694458]
Grand sum of 5 tensor sets is: [0.2851558327674866, 0.29876187443733215, 0.23627346754074097, -1.0278327465057373, 1.4010519981384277]
Mean of tensors is: tensor([ 0.0570,  0.0598,  0.0473, -0.2056,  0.2802]) (768 features in tensor)
Saved the embedding for blown.
Saved the count of sentences used to create blown embedding
Run time for blown was 0.33448945600002844 seconds.

There are 1 tokens in tokenized vocabulary word:
blue

Instance 1 of blue.
Looking for vocab token: blue

Instance 2 of blue.
Looking for vocab token: blue

Instance 3 of blue.
Looking for vocab token: blue

Instance 4 of blue.
Looking for vocab token: blue
Indices are [8]
Size of token embeddings is torch.Size([23, 13, 768])
Shape of s

boring at index 15: [0.060351304709911346, 0.17118412256240845, 0.14410527050495148, 0.5044066309928894, 0.6101423501968384]
Grand sum of 3 tensor sets is: [-0.14576411247253418, 0.3940320611000061, 0.2971527576446533, 1.253767728805542, 2.361644744873047]

Instance 4 of boring.
Looking for vocab token: boring
Indices are [48]
Size of token embeddings is torch.Size([66, 13, 768])
Shape of summed layers is: 66 x 768
boring at index 48: [0.18499086797237396, 0.16811926662921906, -0.08184072375297546, 0.5814017057418823, 1.1990149021148682]
Grand sum of 4 tensor sets is: [0.03922675549983978, 0.562151312828064, 0.21531203389167786, 1.8351694345474243, 3.560659646987915]
Mean of tensors is: tensor([0.0098, 0.1405, 0.0538, 0.4588, 0.8902]) (768 features in tensor)
Saved the embedding for boring.
Saved the count of sentences used to create boring embedding
Run time for boring was 0.2755873319999864 seconds.

There are 1 tokens in tokenized vocabulary word:
bothered
Mean of tensors is: tensor

broken at index 4: [0.0061073265969753265, 0.12184877693653107, -0.06926628202199936, 0.059637293219566345, 0.47531670331954956]
Grand sum of 12 tensor sets is: [0.062326014041900635, 0.32679346203804016, 0.18953701853752136, -0.6162763237953186, 8.149691581726074]

Instance 13 of broken.
Looking for vocab token: broken
Indices are [9]
Size of token embeddings is torch.Size([16, 13, 768])
Shape of summed layers is: 16 x 768
broken at index 9: [-0.022339805960655212, -0.021274246275424957, -0.06735842674970627, 0.007990519516170025, 1.0947480201721191]
Grand sum of 13 tensor sets is: [0.03998620808124542, 0.3055192232131958, 0.1221785917878151, -0.6082857847213745, 9.244440078735352]

Instance 14 of broken.
Looking for vocab token: broken
Indices are [16]
Size of token embeddings is torch.Size([39, 13, 768])
Shape of summed layers is: 39 x 768
broken at index 16: [-0.031853094696998596, 0.20837463438510895, 0.13104000687599182, -0.20576705038547516, 0.6488054990768433]
Grand sum of 14 t

Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for cagey.
Saved the count of sentences used to create cagey embedding
Run time for cagey was 0.025913127999956487 seconds.

There are 2 tokens in tokenized vocabulary word:
c
agy
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for cagy.
Saved the count of sentences used to create cagy embedding
Run time for cagy was 0.02729970699988371 seconds.

There are 1 tokens in tokenized vocabulary word:
calculating
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for calculating.
Saved the count of sentences used to create calculating embedding
Run time for calculating was 0.026354245999982595 seconds.

There are 2 tokens in tokenized vocabulary word:
call
ous
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for callous.
Saved the count of sentences

Size of token embeddings is torch.Size([78, 13, 768])
Shape of summed layers is: 78 x 768
capable at index 25: [-0.14691928029060364, 0.198392853140831, 0.10621216148138046, 0.4685290455818176, 0.49979764223098755]
Grand sum of 8 tensor sets is: [-1.4950034618377686, 1.2606920003890991, 1.29542076587677, 2.595224380493164, 4.928339004516602]

Instance 9 of capable.
Looking for vocab token: capable
Indices are [18]
Size of token embeddings is torch.Size([34, 13, 768])
Shape of summed layers is: 34 x 768
capable at index 18: [-0.06258904933929443, 0.27273035049438477, 0.1656576693058014, 0.29195722937583923, 0.5255934000015259]
Grand sum of 9 tensor sets is: [-1.557592511177063, 1.5334223508834839, 1.461078405380249, 2.887181520462036, 5.453932285308838]

Instance 10 of capable.
Looking for vocab token: capable
Indices are [40]
Size of token embeddings is torch.Size([45, 13, 768])
Shape of summed layers is: 45 x 768
capable at index 40: [0.051228754222393036, 0.29192084074020386, 0.24030

Size of token embeddings is torch.Size([36, 13, 768])
Shape of summed layers is: 36 x 768
careless at index 32: [0.06383327394723892, 0.40585237741470337, -0.0006874389946460724, -0.04816702380776405, 0.45477619767189026]
Grand sum of 1 tensor sets is: [0.06383327394723892, 0.40585237741470337, -0.0006874389946460724, -0.04816702380776405, 0.45477619767189026]
Mean of tensors is: tensor([ 0.0638,  0.4059, -0.0007, -0.0482,  0.4548]) (768 features in tensor)
Saved the embedding for careless.
Saved the count of sentences used to create careless embedding
Run time for careless was 0.11446805099990343 seconds.

There are 1 tokens in tokenized vocabulary word:
caring
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for caring.
Saved the count of sentences used to create caring embedding
Run time for caring was 0.03808503599998403 seconds.

There are 2 tokens in tokenized vocabulary word:
cat
ty
Mean of tensors is: tensor([nan, nan, nan, nan,

KeyboardInterrupt: 

In [8]:
def make_vocab(vocab_file):
    vocab = []
    with open(vocab_file, 'r') as v:
        vocab = v.read().splitlines()
    return vocab

In [9]:
def create_token_embeddings(tokenized_text):
    input_ids = torch.tensor(tokenized_text).unsqueeze(0)  # Batch size 1
    with torch.no_grad():
        outputs = model(input_ids, masked_lm_labels=input_ids)
        encoded_layers = outputs[2]
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)
        print(f'Size of token embeddings is {token_embeddings.size()}')
        return token_embeddings

In [10]:
# Sum the last 4 layers' features
def sum_last_four_token_vecs(token_embeddings):
    token_vecs_sum_last_four = []

    # For each token in the sentence...
    for token in token_embeddings:
        # `token` is a [13 x 768] tensor
        # Sum the vectors from the last 4 layers.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `sum_vec` to represent `token`.
        token_vecs_sum_last_four.append(sum_vec)

    print ('Shape of summed layers is: %d x %d' % (len(token_vecs_sum_last_four), len(token_vecs_sum_last_four[0])))
    # Shape is: <token count> x 768
    return token_vecs_sum_last_four

In [11]:
# Return a single layer of the model.
def get_layer_token_vecs(token_embeddings, layer_number):
    token_vecs_layer = []

    # For each token in the sentence...
    for token in token_embeddings:
        # `token` is a [13 x 768] tensor
        # Sum the vectors from the last 4 layers.
        layer_vec = token[layer_number]

        # Use `sum_vec` to represent `token`.
        token_vecs_layer.append(layer_vec)

    print ('Shape of summed layers is: %d x %d' % (len(token_vecs_layer), len(token_vecs_layer[0])))
    # Shape is: <token count> x 768
    return token_vecs_layer

In [12]:
def write_embedding(embeddings_file, vocab_word, contextual_embedding):
    try:
        with open(embeddings_file, 'a') as f:
            f.write(vocab_word)
            for value in contextual_embedding[0]:
                f.write(' ' + str(value.item()))
            f.write('\n')
        print(f'Saved the embedding for {vocab_word}.')
    except:
        print('Oh no! Unable to write to the embeddings file.')