In [1]:
import sys
import os
from timeit import default_timer as timer
import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM, RobertaConfig
import matplotlib.pyplot as plt
# % matplotlib inline
from scipy.spatial.distance import cosine

In [2]:
# Make sure we're in the transformers directory with fine-tuned model output.
os.chdir('/home/jupyter/Notebooks/crystal/NLP/transformers/examples/')
os.getcwd()

'/home/jupyter/Notebooks/crystal/NLP/transformers/examples'

In [3]:
# Adapted from the tutorial at https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
# and Transformers documentation: https://huggingface.co/transformers/model_doc/roberta.html#robertaformaskedlm

In [11]:
tokenizer = RobertaTokenizer.from_pretrained('./roBERTa_base')
config = RobertaConfig.from_pretrained('./roBERTa_base')
model = RobertaForMaskedLM.from_pretrained('./roBERTa_base')
model.eval()

context_file = "/home/jupyter/Notebooks/crystal/NLP/transformers/examples/CC_WET_train_aefg"
output_file = '/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/'
count_file = '/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/'
vocab_file = '/home/jupyter/Notebooks/crystal/NLP/MiFace/Python/data/vocab_files/SimLex_MC_MiFace_vocab.txt'
vocab = make_vocab(vocab_file)

FEATURE_COUNT = 768
LAYER = 8
MAX_LINES = 50

In [12]:
l_output_file = ''
l_count_file = ''
l_output_file = os.path.join(output_file, 'roberta_base_layer-' + str(LAYER) + '_maxlines-' + str(MAX_LINES) + '_aefg.txt')
l_count_file = os.path.join(count_file, 'roberta_base_layer-'+ str(LAYER) + '_maxlines-' + str(MAX_LINES) + '_aefg_counts.txt')
print(l_output_file, l_count_file)
# Process vocabulary words in the outer loop.
for v in vocab:
    start = timer()
    with open(context_file, 'r') as lines:
        v_sum = torch.zeros([1, FEATURE_COUNT])
        v_tokens = tokenizer.encode(v)
        print(f'\nThere are {len(v_tokens) - 2} tokens in tokenized vocabulary word:')
        for t in v_tokens[1:-1]:
            print(tokenizer.decode(t).strip())
        count_sentence = 0
        count_tensor = 0

        # Process all lines in the context file in the inner loop.
        for line in lines:
            # Check for this vocab word in this line; if found, split the line into individual sentences.
            if v in line.lower().split():
                for sentence in line.split('.'):
                    if v in sentence.lower():
                        line = sentence
                        count_sentence += 1
                        break
                # Split the new sentence-based line into tokens.
                # Use max_length to avoid overflowing the maximum sequence length for the model.
                tokenized_text = tokenizer.encode(line, add_special_tokens=True, max_length=512)
                indices = []              

                # Check to see whether the vocab word is found in this particular line.
                # Initially, some lines may have comprised multiple sentences, which were
                # broken out individually above.
                for t in v_tokens[1:-1]:
                    for i, token_str in enumerate(tokenized_text):
                        if tokenizer.decode(token_str).strip() == tokenizer.decode(t).strip():
                            indices.append(i)               

                ###################################################################################
                # If the vocabulary word was found, process the containing line.
                if indices:

                    # The vocab word was found in this line/sentence, at the locations in indices.
                    # Get the feature vectors for all tokens in the line/sentence.
                    token_embeddings = create_token_embeddings(tokenized_text)
                    token_vecs_layer = get_layer_token_vecs(token_embeddings, LAYER)

                    # Get the vocab word's contextual embedding for this line.
                    tensor_layer = torch.zeros([1, FEATURE_COUNT])
                    for i in range(len(indices)):
                        v_index = i % len(v_tokens[1:-1])
                        tensor_layer += token_vecs_layer[indices[i]]

                    # If our vocab word is broken into more than one token, we need to get the mean of the token embeddings.
                    tensor_layer /= len(indices)

                    # Add the embedding distilled from this line to the sum of embeddings for all lines.
                    v_sum += tensor_layer
                    count_tensor += 1
                ###################################################################################
            # Stop processing lines once we've found 2000 instances of our vocab word.
            if count_tensor >= MAX_LINES:
                break

        # We're done processing all lines of 512 tokens or less containing our vocab word.
        # Get the mean embedding for the word.
        v_mean = v_sum / count_tensor
        print(f'Mean of {count_tensor} tensors is: {v_mean[0][:5]} ({len(v_mean[0])} features in tensor)')
        write_embedding(l_output_file, v, v_mean)
        try:
            with open(l_count_file, 'a') as counts:
                counts.write(v + ', ' + str(count_tensor) + '\n')
        except:
            print('Wha?! Could not write the sentence count.')
    end = timer()
    print(f'Run time for {v} was {end - start} seconds.')

/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/roberta_base_layer-8_maxlines-50_aefg.txt /home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/roberta_base_layer-8_maxlines-50_aefg_counts.txt

There are 1 tokens in tokenized vocabulary word:
aback
Mean of 50 tensors is: tensor([0.0678, 0.2718, 0.1138, 0.4885, 0.0466]) (768 features in tensor)
Run time for aback was 14.45582143124193 seconds.

There are 2 tokens in tokenized vocabulary word:
ab
ashed
Mean of 24 tensors is: tensor([ 0.1307,  0.3149, -0.0467,  0.1518,  0.4097]) (768 features in tensor)
Run time for abashed was 102.83370731212199 seconds.

There are 1 tokens in tokenized vocabulary word:
abdomen
Mean of 50 tensors is: tensor([ 0.4638, -0.1044,  0.0831,  0.3742, -1.0691]) (768 features in tensor)
Run time for abdomen was 9.522613850422204 seconds.

There are 1 tokens in tokenized vocabulary word:
abhor
Mean of 50 tensors is: tensor([0.1734, 0.3307, 0.1492, 0.3795, 0.1214]) (768 f

Mean of 50 tensors is: tensor([ 0.1961, -0.2198,  0.0692, -0.6464, -0.5064]) (768 features in tensor)
Run time for actress was 10.233919771388173 seconds.

There are 1 tokens in tokenized vocabulary word:
acute
Mean of 50 tensors is: tensor([ 0.1484,  0.1825, -0.2688,  0.3861,  0.5295]) (768 features in tensor)
Run time for acute was 4.487680409103632 seconds.

There are 1 tokens in tokenized vocabulary word:
adamant
Mean of 50 tensors is: tensor([-0.0148,  0.3897, -0.0857, -0.2592, -0.3916]) (768 features in tensor)
Run time for adamant was 9.49912191182375 seconds.

There are 1 tokens in tokenized vocabulary word:
add
Mean of 50 tensors is: tensor([ 0.3383,  0.5262,  0.1888, -0.1093, -1.2622]) (768 features in tensor)
Run time for add was 3.2420229418203235 seconds.

There are 2 tokens in tokenized vocabulary word:
add
led
Mean of 50 tensors is: tensor([0.1653, 0.3888, 0.0771, 0.2757, 0.0293]) (768 features in tensor)
Run time for addled was 40.88772522006184 seconds.

There are 1 to

Mean of 50 tensors is: tensor([ 0.3920, -0.0249,  0.0183, -0.9810, -0.3431]) (768 features in tensor)
Run time for air was 3.8291665110737085 seconds.

There are 2 tokens in tokenized vocabulary word:
air
head
Mean of 50 tensors is: tensor([ 0.1502, -0.1447, -0.0410, -0.6845,  0.1570]) (768 features in tensor)
Run time for airhead was 54.85662148613483 seconds.

There are 1 tokens in tokenized vocabulary word:
airport
Mean of 50 tensors is: tensor([-0.0867,  0.8159,  0.0223, -0.7761, -0.6384]) (768 features in tensor)
Run time for airport was 4.5908777276054025 seconds.

There are 1 tokens in tokenized vocabulary word:
aisle
Mean of 50 tensors is: tensor([ 0.1025,  0.5465, -0.0319, -0.4098, -0.8816]) (768 features in tensor)
Run time for aisle was 7.818837974220514 seconds.

There are 1 tokens in tokenized vocabulary word:
alarm
Mean of 50 tensors is: tensor([ 0.1356,  0.2249, -0.1112, -0.5541, -0.3347]) (768 features in tensor)
Run time for alarm was 4.900718811899424 seconds.

There 

Mean of 50 tensors is: tensor([0.1501, 0.2755, 0.2214, 0.2111, 0.1267]) (768 features in tensor)
Run time for angered was 12.460219966247678 seconds.

There are 1 tokens in tokenized vocabulary word:
angle
Mean of 50 tensors is: tensor([ 0.1606,  0.7239,  0.1291,  0.5848, -0.1953]) (768 features in tensor)
Run time for angle was 5.274487386457622 seconds.

There are 1 tokens in tokenized vocabulary word:
angrily
Mean of 50 tensors is: tensor([ 0.4645,  0.3924, -0.1596,  0.0620,  0.2278]) (768 features in tensor)
Run time for angrily was 14.52509154099971 seconds.

There are 1 tokens in tokenized vocabulary word:
angry
Mean of 50 tensors is: tensor([0.0321, 0.1823, 0.0149, 0.6952, 0.1702]) (768 features in tensor)
Run time for angry was 4.628210744820535 seconds.

There are 1 tokens in tokenized vocabulary word:
angst
Mean of 50 tensors is: tensor([ 0.2315,  0.1470,  0.0293, -0.0009, -0.0442]) (768 features in tensor)
Run time for angst was 9.81400201190263 seconds.

There are 1 tokens 

Mean of 50 tensors is: tensor([ 0.2321,  0.3407, -0.0770, -0.8135, -0.8890]) (768 features in tensor)
Run time for appliance was 10.992344048805535 seconds.

There are 1 tokens in tokenized vocabulary word:
appoint
Mean of 50 tensors is: tensor([-0.0145,  0.2525,  0.0441,  0.1105, -1.5270]) (768 features in tensor)
Run time for appoint was 8.04832282755524 seconds.

There are 1 tokens in tokenized vocabulary word:
appointment
Mean of 50 tensors is: tensor([ 0.2094,  0.4981,  0.2017,  0.0540, -0.6496]) (768 features in tensor)
Run time for appointment was 4.962004272267222 seconds.

There are 2 tokens in tokenized vocabulary word:
appreci
ative
Mean of 50 tensors is: tensor([-0.0136,  0.2793, -0.0094,  0.7884, -1.0356]) (768 features in tensor)
Run time for appreciative was 5.797259575687349 seconds.

There are 1 tokens in tokenized vocabulary word:
apprehension
Mean of 50 tensors is: tensor([ 0.1369,  0.0151,  0.0641,  0.0508, -0.7101]) (768 features in tensor)
Run time for apprehensio

Mean of 50 tensors is: tensor([ 0.1165,  0.1672,  0.2404, -0.5144,  0.3720]) (768 features in tensor)
Run time for atmosphere was 5.114715863950551 seconds.

There are 1 tokens in tokenized vocabulary word:
atom
Mean of 50 tensors is: tensor([ 0.1494,  0.2631,  0.3219, -0.4035,  0.0687]) (768 features in tensor)
Run time for atom was 103.2143241846934 seconds.

There are 1 tokens in tokenized vocabulary word:
attach
Mean of 50 tensors is: tensor([ 0.2901,  0.4358,  0.0806,  0.0979, -1.4896]) (768 features in tensor)
Run time for attach was 4.342141373082995 seconds.

There are 1 tokens in tokenized vocabulary word:
attempting
Mean of 50 tensors is: tensor([0.0810, 0.3266, 0.0580, 0.5391, 0.1896]) (768 features in tensor)
Run time for attempting was 4.456236680969596 seconds.

There are 1 tokens in tokenized vocabulary word:
attend
Mean of 50 tensors is: tensor([ 0.1576, -0.0490, -0.2086,  0.1007, -0.9227]) (768 features in tensor)
Run time for attend was 4.0578291565179825 seconds.

Th

Mean of 50 tensors is: tensor([ 0.1315,  0.2180, -0.0339, -0.2252, -0.5190]) (768 features in tensor)
Run time for baffle was 18.051189319230616 seconds.

There are 1 tokens in tokenized vocabulary word:
baffled
Mean of 50 tensors is: tensor([-0.0488,  0.3646,  0.0945,  0.3340,  0.2618]) (768 features in tensor)
Run time for baffled was 9.600227010436356 seconds.

There are 2 tokens in tokenized vocabulary word:
baff
ling
Mean of 50 tensors is: tensor([ 0.0548,  0.3666, -0.0071,  0.3269,  0.1301]) (768 features in tensor)
Run time for baffling was 12.391212930902839 seconds.

There are 1 tokens in tokenized vocabulary word:
bag
Mean of 50 tensors is: tensor([ 0.1554, -0.1191,  0.1651,  0.3748,  0.4960]) (768 features in tensor)
Run time for bag was 4.7575295977294445 seconds.

There are 1 tokens in tokenized vocabulary word:
baked
Mean of 50 tensors is: tensor([ 0.2239, -0.0821,  0.1297,  0.3686,  0.7931]) (768 features in tensor)
Run time for baked was 5.849575004540384 seconds.

Ther

Mean of 50 tensors is: tensor([-0.1983, -0.3551, -0.0834,  0.1298,  0.8533]) (768 features in tensor)
Run time for begging was 6.114422778598964 seconds.

There are 1 tokens in tokenized vocabulary word:
begin
Mean of 50 tensors is: tensor([ 0.1446,  0.2559, -0.1971,  0.2509, -1.6240]) (768 features in tensor)
Run time for begin was 3.8584720101207495 seconds.

There are 3 tokens in tokenized vocabulary word:
be
gr
udge
Mean of 50 tensors is: tensor([-0.0452, -0.3495, -0.1547,  0.2001, -0.5327]) (768 features in tensor)
Run time for begrudge was 18.475448158569634 seconds.

There are 3 tokens in tokenized vocabulary word:
be
gr
udging
Mean of 43 tensors is: tensor([ 0.0250, -0.1415, -0.2545,  0.2014,  0.2047]) (768 features in tensor)
Run time for begrudging was 108.53741673659533 seconds.

There are 4 tokens in tokenized vocabulary word:
be
gr
udging
ly
Mean of 50 tensors is: tensor([ 0.1700,  0.1237, -0.2849,  0.2099, -0.1499]) (768 features in tensor)
Run time for begrudgingly was 2

Mean of 50 tensors is: tensor([ 0.1424,  0.5495, -0.1097, -0.2243,  0.2043]) (768 features in tensor)
Run time for bilious was 77.83089735079557 seconds.

There are 1 tokens in tokenized vocabulary word:
biography
Mean of 50 tensors is: tensor([ 0.2731,  0.0879,  0.2423,  0.2050, -0.3211]) (768 features in tensor)
Run time for biography was 6.592969001270831 seconds.

There are 1 tokens in tokenized vocabulary word:
biology
Mean of 50 tensors is: tensor([ 0.1209,  0.5662, -0.0900,  1.2592, -0.4936]) (768 features in tensor)
Run time for biology was 8.478996598161757 seconds.

There are 1 tokens in tokenized vocabulary word:
bird
Mean of 50 tensors is: tensor([ 0.0285,  0.2871,  0.2494, -0.6381,  0.0720]) (768 features in tensor)
Run time for bird was 9.549430299550295 seconds.

There are 1 tokens in tokenized vocabulary word:
birthday
Mean of 50 tensors is: tensor([-0.0208,  0.3260, -0.1618,  0.3403,  0.9125]) (768 features in tensor)
Run time for birthday was 4.806927410885692 seconds

Mean of 50 tensors is: tensor([ 0.1851,  0.3703, -0.2834,  0.5476,  0.3055]) (768 features in tensor)
Run time for boggled was 70.85265291947871 seconds.

There are 1 tokens in tokenized vocabulary word:
boiling
Mean of 50 tensors is: tensor([ 0.2103, -0.4597,  0.1178, -0.1645,  0.1718]) (768 features in tensor)
Run time for boiling was 5.008052486926317 seconds.

There are 3 tokens in tokenized vocabulary word:
bo
ister
ous
Mean of 50 tensors is: tensor([ 0.2388,  0.5260, -0.1681,  0.4812, -0.2737]) (768 features in tensor)
Run time for boisterous was 16.29658151604235 seconds.

There are 1 tokens in tokenized vocabulary word:
bold
Mean of 50 tensors is: tensor([ 0.2837, -0.2842, -0.1205,  0.0040,  0.8247]) (768 features in tensor)
Run time for bold was 5.847631849348545 seconds.

There are 1 tokens in tokenized vocabulary word:
bone
Mean of 50 tensors is: tensor([ 0.0698,  0.0814,  0.1916, -0.5118,  0.7937]) (768 features in tensor)
Run time for bone was 6.149317551404238 seconds.

T

Mean of 50 tensors is: tensor([ 0.0853,  0.1068, -0.0993,  0.1551,  0.0593]) (768 features in tensor)
Run time for bus was 8.648526129312813 seconds.

There are 1 tokens in tokenized vocabulary word:
bush
Mean of 40 tensors is: tensor([ 0.1429,  0.1505,  0.0081,  0.0785, -0.5852]) (768 features in tensor)
Run time for repugnance was 105.28884508647025 seconds.

There are 3 tokens in tokenized vocabulary word:
rep
ug
nant
Mean of 50 tensors is: tensor([ 0.3886,  0.3943,  0.0998, -0.0360, -0.4274]) (768 features in tensor)
Run time for repugnant was 14.951754060573876 seconds.

There are 3 tokens in tokenized vocabulary word:
rep
uls
ed
Mean of 50 tensors is: tensor([ 0.1177,  0.4046,  0.0569,  0.1201, -0.4265]) (768 features in tensor)
Run time for repulsed was 18.596346581354737 seconds.

There are 2 tokens in tokenized vocabulary word:
rep
ulsion
Mean of 50 tensors is: tensor([ 0.2312,  0.1610,  0.0622,  0.5016, -0.4384]) (768 features in tensor)
Run time for repulsion was 36.42898953

Mean of 50 tensors is: tensor([ 0.2175,  0.4354,  0.0855,  0.3084, -0.5854]) (768 features in tensor)
Run time for rhythm was 6.9810764994472265 seconds.

There are 1 tokens in tokenized vocabulary word:
rice
Mean of 50 tensors is: tensor([ 0.0912, -0.3714,  0.1003,  0.6848,  0.2731]) (768 features in tensor)
Run time for rice was 7.67865617480129 seconds.

There are 1 tokens in tokenized vocabulary word:
right
Mean of 50 tensors is: tensor([ 0.0421,  0.1954,  0.0175, -0.2625, -0.3330]) (768 features in tensor)
Run time for right was 4.57079956587404 seconds.

There are 1 tokens in tokenized vocabulary word:
righteous
Mean of 50 tensors is: tensor([ 0.1618,  0.3963,  0.2928, -0.2005,  0.1304]) (768 features in tensor)
Run time for righteous was 6.002139080315828 seconds.

There are 1 tokens in tokenized vocabulary word:
rigid
Mean of 50 tensors is: tensor([ 0.1494,  0.1172, -0.3146,  0.1428,  0.1857]) (768 features in tensor)
Run time for rigid was 5.06232781149447 seconds.

There are 

Mean of 50 tensors is: tensor([ 0.0157,  0.5711, -0.0568, -0.1375, -0.2635]) (768 features in tensor)
Run time for sanctimonious was 18.637435520999134 seconds.

There are 1 tokens in tokenized vocabulary word:
sandwich
Mean of 50 tensors is: tensor([-0.0485, -0.1481,  0.0167,  0.0277,  0.8861]) (768 features in tensor)
Run time for sandwich was 8.469160439446568 seconds.

There are 1 tokens in tokenized vocabulary word:
sane
Mean of 50 tensors is: tensor([0.1689, 0.7080, 0.1967, 0.4002, 0.1618]) (768 features in tensor)
Run time for sane was 5.4593188324943185 seconds.

There are 3 tokens in tokenized vocabulary word:
s
angu
ine
Mean of 50 tensors is: tensor([ 0.1058,  0.3473, -0.0336,  0.7217, -0.4541]) (768 features in tensor)
Run time for sanguine was 12.013386879116297 seconds.

There are 2 tokens in tokenized vocabulary word:
sa
ppy
Mean of 50 tensors is: tensor([ 0.3160,  0.9188, -0.0802,  0.1658,  0.2669]) (768 features in tensor)
Run time for sappy was 17.991487846709788 secon

Mean of 50 tensors is: tensor([ 0.1527, -0.1380, -0.3502,  0.1324, -0.0775]) (768 features in tensor)
Run time for scowling was 64.24277968332171 seconds.

There are 1 tokens in tokenized vocabulary word:
scream
Mean of 50 tensors is: tensor([ 0.1531, -0.1513, -0.2262, -0.2417,  0.4294]) (768 features in tensor)
Run time for scream was 8.497401289641857 seconds.

There are 1 tokens in tokenized vocabulary word:
screaming
Mean of 50 tensors is: tensor([ 0.0225, -0.2171, -0.3071,  0.2097,  1.2949]) (768 features in tensor)
Run time for screaming was 5.271194022148848 seconds.

There are 2 tokens in tokenized vocabulary word:
scrutin
izing
Mean of 50 tensors is: tensor([0.2034, 0.1565, 0.1596, 0.5626, 0.0885]) (768 features in tensor)
Run time for scrutinizing was 19.929841240867972 seconds.

There are 1 tokens in tokenized vocabulary word:
sea
Mean of 50 tensors is: tensor([-0.1055,  0.3430,  0.4930, -0.4630,  0.3602]) (768 features in tensor)
Run time for sea was 5.406393333338201 secon

Mean of 14 tensors is: tensor([ 0.2143, -0.0037,  0.0717, -0.4472,  0.5075]) (768 features in tensor)
Run time for shamefaced was 105.05533922091126 seconds.

There are 1 tokens in tokenized vocabulary word:
shameful
Mean of 50 tensors is: tensor([ 0.2490,  0.0084,  0.1173, -0.2750,  0.5903]) (768 features in tensor)
Run time for shameful was 8.545755396597087 seconds.

There are 1 tokens in tokenized vocabulary word:
shameless
Mean of 50 tensors is: tensor([ 0.1259, -0.0997, -0.0236, -0.6157,  0.6918]) (768 features in tensor)
Run time for shameless was 9.617279299534857 seconds.

There are 1 tokens in tokenized vocabulary word:
sharp
Mean of 50 tensors is: tensor([ 0.0149,  0.3286, -0.1919,  0.8434,  0.5373]) (768 features in tensor)
Run time for sharp was 4.540487539023161 seconds.

There are 1 tokens in tokenized vocabulary word:
sheep
Mean of 50 tensors is: tensor([ 0.0132,  0.1504,  0.0558, -0.9046,  0.3543]) (768 features in tensor)
Run time for sheep was 5.626982742920518 secon

Mean of 50 tensors is: tensor([0.1737, 0.1365, 0.3535, 0.3430, 0.1890]) (768 features in tensor)
Run time for simplicity was 5.44970319699496 seconds.

There are 1 tokens in tokenized vocabulary word:
sincere
Mean of 50 tensors is: tensor([ 0.1817,  0.4411, -0.3436, -0.1294,  0.8024]) (768 features in tensor)
Run time for sincere was 4.417014051228762 seconds.

There are 1 tokens in tokenized vocabulary word:
sinful
Mean of 50 tensors is: tensor([ 0.1617,  0.2393, -0.0562,  0.3180,  0.7245]) (768 features in tensor)
Run time for sinful was 8.497060235589743 seconds.

There are 1 tokens in tokenized vocabulary word:
singer
Mean of 50 tensors is: tensor([ 0.2616,  0.2236, -0.0300, -0.3807,  0.3566]) (768 features in tensor)
Run time for singer was 6.667961156927049 seconds.

There are 1 tokens in tokenized vocabulary word:
singing
Mean of 50 tensors is: tensor([-0.0092,  0.0089, -0.1888, -0.0768,  0.5551]) (768 features in tensor)
Run time for singing was 4.427406111732125 seconds.

Ther

Mean of 50 tensors is: tensor([ 0.1289, -0.0844, -0.0584, -0.6081, -0.2166]) (768 features in tensor)
Run time for smug was 12.498377126641572 seconds.

There are 2 tokens in tokenized vocabulary word:
smug
ness
Mean of 50 tensors is: tensor([ 0.1386,  0.0237,  0.1457, -0.5125, -0.5792]) (768 features in tensor)
Run time for smugness was 49.525522761046886 seconds.

There are 1 tokens in tokenized vocabulary word:
snake
Mean of 50 tensors is: tensor([ 0.0660, -0.1634,  0.1250, -0.7206,  0.1538]) (768 features in tensor)
Run time for snake was 6.011407991871238 seconds.

There are 2 tokens in tokenized vocabulary word:
sn
appy
Mean of 50 tensors is: tensor([ 0.2709,  1.0989, -0.0300, -0.1659,  0.2115]) (768 features in tensor)
Run time for snappy was 11.014679787680507 seconds.

There are 3 tokens in tokenized vocabulary word:
sn
ark
y
Mean of 50 tensors is: tensor([ 0.1449,  0.4240, -0.0725,  0.1665,  0.3464]) (768 features in tensor)
Run time for snarky was 11.118055501952767 seconds.

Mean of 50 tensors is: tensor([ 0.0762, -0.0891,  0.1242,  0.2336,  0.7677]) (768 features in tensor)
Run time for sore was 4.288731937296689 seconds.

There are 1 tokens in tokenized vocabulary word:
sorrow
Mean of 50 tensors is: tensor([ 0.3339,  0.2837,  0.2275,  0.4591, -0.0664]) (768 features in tensor)
Run time for sorrow was 5.836686476133764 seconds.

There are 2 tokens in tokenized vocabulary word:
sorrow
ful
Mean of 50 tensors is: tensor([0.2469, 0.4835, 0.1118, 0.4147, 0.4776]) (768 features in tensor)
Run time for sorrowful was 11.334347808733582 seconds.

There are 1 tokens in tokenized vocabulary word:
sorry
Mean of 50 tensors is: tensor([ 0.2184, -0.0899, -0.1186,  0.9474,  0.8494]) (768 features in tensor)
Run time for sorry was 3.6227898309007287 seconds.

There are 1 tokens in tokenized vocabulary word:
soul
Mean of 50 tensors is: tensor([ 0.1336,  0.2078,  0.3663,  0.0318, -0.0995]) (768 features in tensor)
Run time for soul was 4.515641861595213 seconds.

There are 

Mean of 50 tensors is: tensor([-0.1370,  0.4947, -0.1143, -0.0831,  0.0868]) (768 features in tensor)
Run time for steady was 5.422767129726708 seconds.

There are 1 tokens in tokenized vocabulary word:
steak
Mean of 50 tensors is: tensor([ 0.2017, -0.5837,  0.0153, -0.5228, -0.3745]) (768 features in tensor)
Run time for steak was 7.276515596546233 seconds.

There are 1 tokens in tokenized vocabulary word:
steal
Mean of 50 tensors is: tensor([ 0.2128,  0.2417, -0.0409, -0.3770, -0.0080]) (768 features in tensor)
Run time for steal was 5.143182043917477 seconds.

There are 2 tokens in tokenized vocabulary word:
stealth
y
Mean of 50 tensors is: tensor([ 0.0380,  0.3387, -0.0052,  0.2261,  0.3141]) (768 features in tensor)
Run time for stealthy was 14.22631679289043 seconds.

There are 2 tokens in tokenized vocabulary word:
ste
amed
Mean of 50 tensors is: tensor([ 0.0976, -0.1244,  0.0402, -0.3474,  0.3430]) (768 features in tensor)
Run time for steamed was 6.460868627764285 seconds.

Th

Mean of 50 tensors is: tensor([ 0.3335, -0.1825, -0.0976,  0.0525, -0.6514]) (768 features in tensor)
Run time for string was 10.707802216522396 seconds.

There are 1 tokens in tokenized vocabulary word:
strong
Mean of 50 tensors is: tensor([-0.0348,  0.2866, -0.2739,  0.4294,  0.5991]) (768 features in tensor)
Run time for strong was 3.7875961316749454 seconds.

There are 1 tokens in tokenized vocabulary word:
struck
Mean of 50 tensors is: tensor([0.2079, 0.1348, 0.0569, 0.4265, 0.4203]) (768 features in tensor)
Run time for struck was 3.945714076049626 seconds.

There are 1 tokens in tokenized vocabulary word:
stubborn
Mean of 50 tensors is: tensor([ 1.6569e-01,  1.4984e-01, -9.3765e-02, -2.0025e-04,  5.6999e-01]) (768 features in tensor)
Run time for stubborn was 5.981166071258485 seconds.

There are 2 tokens in tokenized vocabulary word:
stubborn
ness
Mean of 50 tensors is: tensor([ 0.2102, -0.0504,  0.0045, -0.0398, -0.2409]) (768 features in tensor)
Run time for stubbornness was 

Mean of 50 tensors is: tensor([ 0.1629, -0.2952,  0.0308, -0.0048,  0.8970]) (768 features in tensor)
Run time for supper was 10.417173785157502 seconds.

There are 1 tokens in tokenized vocabulary word:
suppressed
Mean of 50 tensors is: tensor([ 0.2438,  0.0693,  0.0747,  0.6441, -0.5797]) (768 features in tensor)
Run time for suppressed was 6.306872768327594 seconds.

There are 1 tokens in tokenized vocabulary word:
suppressing
Mean of 50 tensors is: tensor([ 0.3257,  0.1040,  0.1186,  0.5082, -0.1200]) (768 features in tensor)
Run time for suppressing was 9.284252973273396 seconds.

There are 1 tokens in tokenized vocabulary word:
suppression
Mean of 50 tensors is: tensor([ 0.2387,  0.1842,  0.0550,  0.3170, -0.9246]) (768 features in tensor)
Run time for suppression was 7.2574780732393265 seconds.

There are 1 tokens in tokenized vocabulary word:
sure
Mean of 50 tensors is: tensor([ 0.1414, -0.2865,  0.0411,  0.6315,  0.1842]) (768 features in tensor)
Run time for sure was 3.725906

Mean of 50 tensors is: tensor([-0.0525, -0.1593, -0.3610,  0.1299, -0.2902]) (768 features in tensor)
Run time for taunting was 13.644758977927268 seconds.

There are 2 tokens in tokenized vocabulary word:
t
aut
Mean of 50 tensors is: tensor([ 0.2620,  0.0971, -0.1219,  0.4308, -0.3476]) (768 features in tensor)
Run time for taut was 13.306375736370683 seconds.

There are 1 tokens in tokenized vocabulary word:
tax
Mean of 50 tensors is: tensor([ 0.0496,  0.5874, -0.2282, -1.0033,  0.5259]) (768 features in tensor)
Run time for tax was 4.367366285994649 seconds.

There are 1 tokens in tokenized vocabulary word:
taxi
Mean of 50 tensors is: tensor([-0.0059,  0.1544,  0.0726, -0.1829, -0.3423]) (768 features in tensor)
Run time for taxi was 14.359437874518335 seconds.

There are 1 tokens in tokenized vocabulary word:
tea
Mean of 50 tensors is: tensor([ 0.0755, -0.1876, -0.0544,  0.0487,  0.9660]) (768 features in tensor)
Run time for tea was 4.896377225406468 seconds.

There are 1 tokens i

Mean of 50 tensors is: tensor([ 0.1990, -0.1766,  0.0606,  0.8303,  0.5146]) (768 features in tensor)
Run time for theory was 4.538937377743423 seconds.

There are 1 tokens in tokenized vocabulary word:
think
Mean of 50 tensors is: tensor([0.1301, 0.0977, 0.2782, 0.7909, 0.2500]) (768 features in tensor)
Run time for think was 3.715042868629098 seconds.

There are 1 tokens in tokenized vocabulary word:
thinking
Mean of 50 tensors is: tensor([ 0.1177, -0.1317,  0.1013,  0.9410,  0.0174]) (768 features in tensor)
Run time for thinking was 3.5086542312055826 seconds.

There are 1 tokens in tokenized vocabulary word:
thought
Mean of 50 tensors is: tensor([0.0670, 0.2716, 0.2611, 0.7132, 0.2315]) (768 features in tensor)
Run time for thought was 3.390676049515605 seconds.

There are 1 tokens in tokenized vocabulary word:
thoughtful
Mean of 50 tensors is: tensor([ 0.1204,  0.2914,  0.0328,  0.5192, -0.1390]) (768 features in tensor)
Run time for thoughtful was 4.125216426327825 seconds.

The

Mean of 50 tensors is: tensor([ 0.2336,  0.4131, -0.0780,  0.6265,  0.5361]) (768 features in tensor)
Run time for tormented was 8.370955761522055 seconds.

There are 1 tokens in tokenized vocabulary word:
touched
Mean of 50 tensors is: tensor([0.3166, 0.6957, 0.1809, 0.0959, 0.1699]) (768 features in tensor)
Run time for touched was 3.6102425707504153 seconds.

There are 1 tokens in tokenized vocabulary word:
tough
Mean of 50 tensors is: tensor([0.1593, 0.2854, 0.0238, 0.7162, 0.5009]) (768 features in tensor)
Run time for tough was 4.078823951072991 seconds.

There are 1 tokens in tokenized vocabulary word:
tower
Mean of 50 tensors is: tensor([ 0.0403,  0.1603,  0.1568, -0.8576, -0.4872]) (768 features in tensor)
Run time for tower was 9.139184662140906 seconds.

There are 2 tokens in tokenized vocabulary word:
to
ying
Mean of 50 tensors is: tensor([ 0.0418,  0.1908, -0.2636,  0.4079, -0.6861]) (768 features in tensor)
Run time for toying was 9.52711639739573 seconds.

There are 1 to

Mean of 50 tensors is: tensor([ 0.0909,  0.5968, -0.0062,  0.2303, -0.8255]) (768 features in tensor)
Run time for unapproachable was 75.59699642751366 seconds.

There are 3 tokens in tokenized vocabulary word:
un
assert
ive
Mean of 9 tensors is: tensor([ 0.2600,  0.2168,  0.0571, -0.0399, -0.8956]) (768 features in tensor)
Run time for unassertive was 104.04399540554732 seconds.

There are 2 tokens in tokenized vocabulary word:
un
assuming
Mean of 50 tensors is: tensor([ 0.1450,  0.4886,  0.0984, -0.0343, -0.4233]) (768 features in tensor)
Run time for unassuming was 12.556199456565082 seconds.

There are 1 tokens in tokenized vocabulary word:
unaware
Mean of 50 tensors is: tensor([ 0.0178, -0.1809, -0.0598, -0.0148, -0.2604]) (768 features in tensor)
Run time for unaware was 5.066211238503456 seconds.

There are 2 tokens in tokenized vocabulary word:
unbel
ief
Mean of 50 tensors is: tensor([ 0.2750, -0.1620,  0.0725,  0.1076, -0.8738]) (768 features in tensor)
Run time for unbelief w

Mean of 50 tensors is: tensor([ 0.2710, -0.0633, -0.0418,  0.2245, -0.8050]) (768 features in tensor)
Run time for unfathomable was 12.82919054850936 seconds.

There are 2 tokens in tokenized vocabulary word:
unf
azed
Mean of 50 tensors is: tensor([-0.1625,  0.4347, -0.0863,  0.4552, -0.8524]) (768 features in tensor)
Run time for unfazed was 43.455021288245916 seconds.

There are 3 tokens in tokenized vocabulary word:
unf
e
eling
Mean of 50 tensors is: tensor([ 0.1025,  0.2263,  0.0785, -0.1535, -0.3278]) (768 features in tensor)
Run time for unfeeling was 48.62709602154791 seconds.

There are 1 tokens in tokenized vocabulary word:
unfocused
Mean of 50 tensors is: tensor([ 0.0889,  0.0582, -0.0279,  0.2229,  0.1918]) (768 features in tensor)
Run time for unfocused was 33.14468964654952 seconds.

There are 1 tokens in tokenized vocabulary word:
unforeseen
Mean of 50 tensors is: tensor([ 0.1651,  0.5512, -0.0823,  0.3107, -0.3689]) (768 features in tensor)
Run time for unforeseen was 7.

Mean of 50 tensors is: tensor([-0.0319,  0.3065,  0.2233, -0.1792, -0.3809]) (768 features in tensor)
Run time for unsuspecting was 8.170786353759468 seconds.

There are 3 tokens in tokenized vocabulary word:
uns
way
ed
Mean of 7 tensors is: tensor([-0.0750,  0.2926,  0.0680, -0.0693, -0.2748]) (768 features in tensor)
Run time for unswayed was 107.37240052036941 seconds.

There are 4 tokens in tokenized vocabulary word:
un
sy
mp
athetic
Mean of 50 tensors is: tensor([ 0.1180,  0.4604,  0.0340, -0.1777, -0.3811]) (768 features in tensor)
Run time for unsympathetic was 20.451607468537986 seconds.

There are 1 tokens in tokenized vocabulary word:
untouched
Mean of 50 tensors is: tensor([ 0.1229,  0.0123,  0.0778, -0.3082, -0.5517]) (768 features in tensor)
Run time for untouched was 7.1679805014282465 seconds.

There are 3 tokens in tokenized vocabulary word:
unt
rou
bled
Mean of 50 tensors is: tensor([ 0.1530,  0.2844,  0.1118,  0.9100, -0.1374]) (768 features in tensor)
Run time for un

Mean of 50 tensors is: tensor([ 0.2368,  0.4744,  0.0737, -0.1311, -1.0963]) (768 features in tensor)
Run time for victory was 8.255805813707411 seconds.

There are 1 tokens in tokenized vocabulary word:
vigilant
Mean of 50 tensors is: tensor([0.0120, 0.0240, 0.1904, 0.4588, 0.1117]) (768 features in tensor)
Run time for vigilant was 8.708358452655375 seconds.

There are 1 tokens in tokenized vocabulary word:
vile
Mean of 50 tensors is: tensor([ 0.1226, -0.0963, -0.3048, -0.1104,  0.3570]) (768 features in tensor)
Run time for vile was 7.957913567312062 seconds.

There are 2 tokens in tokenized vocabulary word:
villain
ous
Mean of 50 tensors is: tensor([ 0.0360,  0.3529, -0.0610,  0.5068,  0.5855]) (768 features in tensor)
Run time for villainous was 15.106521492823958 seconds.

There are 2 tokens in tokenized vocabulary word:
vind
ictive
Mean of 50 tensors is: tensor([ 0.2095,  0.4280,  0.0255,  0.5463, -0.1792]) (768 features in tensor)
Run time for vindictive was 16.90362366102636 s

Mean of 50 tensors is: tensor([ 0.3081, -0.3787,  0.2929, -1.0552, -0.2114]) (768 features in tensor)
Run time for water was 3.9721388472244143 seconds.

There are 3 tokens in tokenized vocabulary word:
wa
ver
ing
Mean of 50 tensors is: tensor([ 0.0835,  0.3740, -0.4752,  0.0160, -0.1899]) (768 features in tensor)
Run time for wavering was 28.041507499292493 seconds.

There are 1 tokens in tokenized vocabulary word:
way
Mean of 50 tensors is: tensor([ 0.3791,  0.7630, -0.2124,  0.1575, -0.4916]) (768 features in tensor)
Run time for way was 3.9884781166911125 seconds.

There are 1 tokens in tokenized vocabulary word:
wealth
Mean of 50 tensors is: tensor([ 0.1325,  0.2591,  0.2326, -0.8832,  0.0700]) (768 features in tensor)
Run time for wealth was 4.4340178379788995 seconds.

There are 2 tokens in tokenized vocabulary word:
wear
iness
Mean of 50 tensors is: tensor([ 0.0666,  0.2888,  0.2462, -0.0723,  0.1059]) (768 features in tensor)
Run time for weariness was 31.870356270112097 secon

Mean of 50 tensors is: tensor([ 0.2103,  0.4982, -0.0742, -0.0582,  0.0650]) (768 features in tensor)
Run time for wistfully was 31.265798576176167 seconds.

There are 1 tokens in tokenized vocabulary word:
withdraw
Mean of 50 tensors is: tensor([ 0.1159,  0.3406,  0.0944,  0.2547, -0.6918]) (768 features in tensor)
Run time for withdraw was 4.274333788082004 seconds.

There are 1 tokens in tokenized vocabulary word:
withdrawn
Mean of 50 tensors is: tensor([ 0.0320,  0.1449,  0.1124,  0.2429, -0.1145]) (768 features in tensor)
Run time for withdrawn was 5.612368171103299 seconds.

There are 1 tokens in tokenized vocabulary word:
withheld
Mean of 50 tensors is: tensor([ 0.1312,  0.0270, -0.2623, -0.2566, -0.2409]) (768 features in tensor)
Run time for withheld was 7.029346066527069 seconds.

There are 1 tokens in tokenized vocabulary word:
withholding
Mean of 50 tensors is: tensor([ 0.1257,  0.0233, -0.2398, -0.2796,  0.0730]) (768 features in tensor)
Run time for withholding was 7.0876

Mean of 50 tensors is: tensor([ 0.1381, -0.0495, -0.2294, -0.7276,  0.3529]) (768 features in tensor)
Run time for yawning was 15.613852648064494 seconds.

There are 1 tokens in tokenized vocabulary word:
year
Mean of 50 tensors is: tensor([-0.0807,  0.4071,  0.0468, -0.2169, -0.4440]) (768 features in tensor)
Run time for year was 3.5877525759860873 seconds.

There are 2 tokens in tokenized vocabulary word:
year
ning
Mean of 50 tensors is: tensor([ 0.1150, -0.1523,  0.0150,  0.2728,  0.1582]) (768 features in tensor)
Run time for yearning was 7.917504573240876 seconds.

There are 1 tokens in tokenized vocabulary word:
yell
Mean of 50 tensors is: tensor([ 0.2217, -0.1585, -0.2781,  0.0770, -0.4935]) (768 features in tensor)
Run time for yell was 7.113133622333407 seconds.

There are 1 tokens in tokenized vocabulary word:
yelling
Mean of 50 tensors is: tensor([ 0.0806, -0.3010, -0.4002,  0.4312, -0.0537]) (768 features in tensor)
Run time for yelling was 6.7533686356619 seconds.

There 

In [5]:
def make_vocab(vocab_file):
    vocab = []
    with open(vocab_file, 'r') as v:
        vocab = v.read().splitlines()
    return vocab

In [6]:
def create_token_embeddings(tokenized_text):
    input_ids = torch.tensor(tokenized_text).unsqueeze(0)  # Batch size 1
    with torch.no_grad():
        outputs = model(input_ids, masked_lm_labels=input_ids)
        encoded_layers = outputs[2]
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)
#         print(f'Size of token embeddings is {token_embeddings.size()}')
        return token_embeddings

In [7]:
# Sum the last 4 layers' features
def sum_last_four_token_vecs(token_embeddings):
    token_vecs_sum_last_four = []

    # For each token in the sentence...
    for token in token_embeddings:
        # `token` is a [13 x 768] tensor
        # Sum the vectors from the last 4 layers.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `sum_vec` to represent `token`.
        token_vecs_sum_last_four.append(sum_vec)

#     print ('Shape of summed layers is: %d x %d' % (len(token_vecs_sum_last_four), len(token_vecs_sum_last_four[0])))
    # Shape is: <token count> x 768
    return token_vecs_sum_last_four

In [8]:
# Return a single layer of the model.
def get_layer_token_vecs(token_embeddings, layer_number):
    token_vecs_layer = []

    # For each token in the sentence...
    for token in token_embeddings:
        # `token` is a [13 x 768] tensor
        # Sum the vectors from the last 4 layers.
        layer_vec = token[layer_number]

        # Use `sum_vec` to represent `token`.
        token_vecs_layer.append(layer_vec)

#     print ('Shape of summed layers is: %d x %d' % (len(token_vecs_layer), len(token_vecs_layer[0])))
    # Shape is: <token count> x 768
    return token_vecs_layer

In [9]:
def write_embedding(embeddings_file, vocab_word, contextual_embedding):
    try:
        with open(embeddings_file, 'a') as f:
            f.write(vocab_word)
            for value in contextual_embedding[0]:
                f.write(' ' + str(value.item()))
            f.write('\n')
#         print(f'Saved the embedding for {vocab_word}.')
    except:
        print('Oh no! Unable to write to the embeddings file.')