In [1]:
import sys
import os
from timeit import default_timer as timer
import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM, RobertaConfig

In [2]:
# Make sure we're in the transformers directory with fine-tuned model output.
os.chdir('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/')
os.getcwd()

'/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings'

In [3]:
# Adapted from the tutorial at https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
# and Transformers documentation: https://huggingface.co/transformers/model_doc/roberta.html#robertaformaskedlm

In [37]:
tokenizer = RobertaTokenizer.from_pretrained('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/output_wiki-103_filtered')
config = RobertaConfig.from_pretrained('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/output_wiki-103_filtered')
model = RobertaForMaskedLM.from_pretrained('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/output_wiki-103_filtered', config=config)
model.eval()

context_file = "/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/wiki.test.raw.out"
output_file = '/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/roberta_test.txt'
count_file = '/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/roberta_test_counts.txt'
vocab_file = '/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/vocab_checked.txt'
vocab = make_vocab(vocab_file)
FEATURE_COUNT = 768
MAX_LINES = 2000

In [43]:
# Process vocabulary words in the outer loop.
for v in vocab:
    start = timer()
    with open(context_file, 'r') as lines:
        v_sum = torch.zeros([1, FEATURE_COUNT])
        v_tokens = tokenize_text(v, tokenizer)
        print_tokenized_text(v_tokens, tokenizer)
        count_sentence = 0
        count_tensor = 0
        
        # Process all lines in the context file in the inner loop.
        for line in lines:
            # Check for this vocab word in this line; if found, split the line into individual sentences.
            if v in line.lower().split():
                for sentence in line.split('.'):
                    if v in sentence.lower():
                        line = sentence
                        count_sentence += 1
                        print(f'\nInstance {count_sentence} of {tokenizer.decode(v_tokens[1:-1]).strip()}.')
                        break  # We'll take the first instance of the word and discard the rest of the line.
                # Split the new sentence-based line into tokens.
                line_tokens = tokenize_text(line, tokenizer)               
                # Get the indices of the line at which our vocabulary word tokens are located.
                indices = get_vocab_indices(v_tokens, line_tokens, tokenizer)                             

                # If the vocabulary word was found, process the containing line.
                if indices:
                    # Get the feature vectors for all tokens in the line/sentence.
                    token_embeddings = create_token_embeddings(line_tokens)
                    # Select a method for extracting specific layers of the model.
                    token_vecs_layer = get_layer_token_vecs(token_embeddings, 12)
                    # Sum the individual token contextual embeddings for the whole vocab word, for this line.
                    tensor_layer = torch.zeros([1, FEATURE_COUNT])
                    for i in range(len(indices)):
                        preview_token_embedding(v_tokens, token_vecs_layer, i, indices, tokenizer)
                        tensor_layer += token_vecs_layer[indices[i]]
                    # If our vocab word is broken into more than one token, we need to get the mean of the token embeddings.
                    tensor_layer /= len(indices)

                    # Add the embedding distilled from this line to the sum of embeddings for all lines.
                    v_sum += tensor_layer
                    count_tensor += 1
                    print(f'Grand sum of {count_tensor} tensor sets is: {v_sum[0][:5].tolist()}')

            # Stop processing lines once we've found 2000 instances of our vocab word.
            if count_tensor >= MAX_LINES:
                break
        
        # We're done processing all lines of 512 tokens or less containing our vocab word.
        # Get the mean embedding for the word.
        v_mean = v_sum / count_tensor
        print(f'Mean of {count_tensor} tensors is: {v_mean[0][:5]} (first 5 of {len(v_mean[0])} features in tensor)')
        write_embedding(output_file, v, v_mean)
        write_line_count(count_file, v, count_tensor)
    end = timer()
    print(f'Run time for {v} was {end - start} seconds.')


There are 1 tokens in tokenized text:
aback
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for aback.
Saved the count of sentences used to create aback embedding
Run time for aback was 0.030610576999606565 seconds.

There are 2 tokens in tokenized text:
ab
ashed
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for abashed.
Saved the count of sentences used to create abashed embedding
Run time for abashed was 0.030524277000040456 seconds.

There are 1 tokens in tokenized text:
abhor
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for abhor.
Saved the count of sentences used to create abhor embedding
Run time for abhor was 0.028823298999668623 seconds.

There are 2 tokens in tokenized text:
abhor
red
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved t

Size of token embeddings is torch.Size([37, 13, 768])
Shape of summed layers is: 37 x 768
accepted at index 22:  [-0.005839262157678604, 0.0065529122948646545, 0.007053704932332039, 0.2620042860507965, 0.30201423168182373]
Grand sum of 5 tensor sets is: [0.3896675705909729, 0.37197333574295044, -0.18983963131904602, 1.792115330696106, 1.7109131813049316]

Instance 6 of accepted.
Size of token embeddings is torch.Size([21, 13, 768])
Shape of summed layers is: 21 x 768
accepted at index 19:  [0.055956169962882996, 0.10654482245445251, -0.1882839798927307, 0.127366304397583, 0.40761083364486694]
Grand sum of 6 tensor sets is: [0.4456237554550171, 0.47851815819740295, -0.37812361121177673, 1.919481635093689, 2.1185240745544434]

Instance 7 of accepted.
Size of token embeddings is torch.Size([28, 13, 768])
Shape of summed layers is: 28 x 768
accepted at index 3:  [0.04771412909030914, -0.06445986032485962, -0.07482258975505829, 0.20206670463085175, 0.9173025488853455]
Grand sum of 7 tensor 

Size of token embeddings is torch.Size([27, 13, 768])
Shape of summed layers is: 27 x 768
accomplished at index 2:  [0.13993659615516663, 0.07892926037311554, 0.40447723865509033, -0.07857934385538101, 0.22581759095191956]
Grand sum of 2 tensor sets is: [0.09603787958621979, 0.07234682142734528, 0.6336098909378052, -0.2386748194694519, 0.3055179715156555]

Instance 3 of accomplished.
Size of token embeddings is torch.Size([19, 13, 768])
Shape of summed layers is: 19 x 768
accomplished at index 11:  [0.040347304195165634, 0.3676615357398987, 0.15343107283115387, -0.23540496826171875, -0.05082383751869202]
Grand sum of 3 tensor sets is: [0.13638518750667572, 0.44000834226608276, 0.7870409488677979, -0.47407978773117065, 0.2546941339969635]
Mean of 3 tensors is: tensor([ 0.0455,  0.1467,  0.2623, -0.1580,  0.0849]) (first 5 of 768 features in tensor)
Saved the embedding for accomplished.
Saved the count of sentences used to create accomplished embedding
Run time for accomplished was 0.190

Size of token embeddings is torch.Size([37, 13, 768])
Shape of summed layers is: 37 x 768
active at index 29:  [0.04733189195394516, -0.12996122241020203, 0.03889086842536926, -0.04028075188398361, 0.663524866104126]
Grand sum of 8 tensor sets is: [0.5281392335891724, 0.3307981491088867, 0.7075533866882324, 2.372034788131714, -2.9380874633789062]

Instance 9 of active.
Size of token embeddings is torch.Size([20, 13, 768])
Shape of summed layers is: 20 x 768
active at index 17:  [-0.12019650638103485, -0.2308669537305832, 0.02476239576935768, 0.20746958255767822, -0.7091408371925354]
Grand sum of 9 tensor sets is: [0.4079427123069763, 0.09993119537830353, 0.7323157787322998, 2.5795044898986816, -3.647228240966797]

Instance 10 of active.
Size of token embeddings is torch.Size([35, 13, 768])
Shape of summed layers is: 35 x 768
active at index 5:  [0.005576286464929581, 0.0764036551117897, 0.0006837770342826843, 0.5913102626800537, -0.4659793972969055]
Grand sum of 10 tensor sets is: [0.4

Mean of 3 tensors is: tensor([-0.0403, -0.1685,  0.2886, -0.4222, -0.2433]) (first 5 of 768 features in tensor)
Saved the embedding for admiration.
Saved the count of sentences used to create admiration embedding
Run time for admiration was 0.18663836900032038 seconds.

There are 1 tokens in tokenized text:
admit
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for admit.
Saved the count of sentences used to create admit embedding
Run time for admit was 0.026911353999821586 seconds.

There are 2 tokens in tokenized text:
ad
oration
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for adoration.
Saved the count of sentences used to create adoration embedding
Run time for adoration was 0.026020743000117363 seconds.

There are 2 tokens in tokenized text:
ad
oring
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the e

Size of token embeddings is torch.Size([17, 13, 768])
Shape of summed layers is: 17 x 768
afraid at index 5:  [-0.08513058722019196, -0.29803192615509033, 0.13244567811489105, 0.2129269391298294, 0.9306025505065918]
afraid at index 13:  [-0.11178436875343323, -0.33336079120635986, 0.18283161520957947, 0.18674224615097046, 1.1694145202636719]
Grand sum of 2 tensor sets is: [-0.11958599090576172, -0.4616743326187134, 0.5883044004440308, 0.6268924474716187, 2.1473727226257324]
Mean of 2 tensors is: tensor([-0.0598, -0.2308,  0.2942,  0.3134,  1.0737]) (first 5 of 768 features in tensor)
Saved the embedding for afraid.
Saved the count of sentences used to create afraid embedding
Run time for afraid was 0.1250886179996087 seconds.

There are 2 tokens in tokenized text:
ag
ape
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for agape.
Saved the count of sentences used to create agape embedding
Run time for agape was 0.0285231979

Size of token embeddings is torch.Size([58, 13, 768])
Shape of summed layers is: 58 x 768
alerted at index 2:  [0.2878829836845398, -0.1626962423324585, 0.041646890342235565, -0.04731352999806404, 0.6773348450660706]
Grand sum of 2 tensor sets is: [0.35486283898353577, -0.19470226764678955, 0.08794689178466797, -0.07517844438552856, 0.9054733514785767]
Mean of 2 tensors is: tensor([ 0.1774, -0.0974,  0.0440, -0.0376,  0.4527]) (first 5 of 768 features in tensor)
Saved the embedding for alerted.
Saved the count of sentences used to create alerted embedding
Run time for alerted was 0.21074934200078133 seconds.

There are 1 tokens in tokenized text:
alienated
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for alienated.
Saved the count of sentences used to create alienated embedding
Run time for alienated was 0.028490952000538527 seconds.

There are 1 tokens in tokenized text:
allergic
Mean of 0 tensors is: tensor([nan, nan,

Size of token embeddings is torch.Size([25, 13, 768])
Shape of summed layers is: 25 x 768
analyzing at index 7:  [0.27910715341567993, 0.2255050092935562, 0.10954922437667847, 0.12491613626480103, 0.6000325679779053]
Grand sum of 1 tensor sets is: [0.27910715341567993, 0.2255050092935562, 0.10954922437667847, 0.12491613626480103, 0.6000325679779053]

Instance 2 of analyzing.
Size of token embeddings is torch.Size([27, 13, 768])
Shape of summed layers is: 27 x 768
analyzing at index 19:  [0.17776751518249512, 0.25048187375068665, 0.07400267571210861, 0.2684782147407532, 0.7525385618209839]
Grand sum of 2 tensor sets is: [0.45687466859817505, 0.47598689794540405, 0.18355190753936768, 0.3933943510055542, 1.3525711297988892]
Mean of 2 tensors is: tensor([0.2284, 0.2380, 0.0918, 0.1967, 0.6763]) (first 5 of 768 features in tensor)
Saved the embedding for analyzing.
Saved the count of sentences used to create analyzing embedding
Run time for analyzing was 0.13607573100034642 seconds.

There 


Instance 1 of animated.
Size of token embeddings is torch.Size([36, 13, 768])
Shape of summed layers is: 36 x 768
animated at index 9:  [-0.13959121704101562, 0.1737341582775116, 0.024441640824079514, -0.11185142397880554, -0.11011719703674316]
Grand sum of 1 tensor sets is: [-0.13959121704101562, 0.1737341582775116, 0.024441640824079514, -0.11185142397880554, -0.11011719703674316]

Instance 2 of animated.
Size of token embeddings is torch.Size([37, 13, 768])
Shape of summed layers is: 37 x 768
animated at index 33:  [-0.1319926530122757, 0.2044759839773178, 0.10342278331518173, -0.150216743350029, 0.12877792119979858]
Grand sum of 2 tensor sets is: [-0.2715838551521301, 0.3782101273536682, 0.12786442041397095, -0.26206815242767334, 0.01866072416305542]

Instance 3 of animated.
Size of token embeddings is torch.Size([70, 13, 768])
Shape of summed layers is: 70 x 768
animated at index 44:  [0.037575654685497284, 0.059396836906671524, 0.2357286512851715, 0.03179680183529854, -0.04932625

Size of token embeddings is torch.Size([15, 13, 768])
Shape of summed layers is: 15 x 768
anxiety at index 3:  [0.16283810138702393, -0.16517367959022522, 0.15017762780189514, 0.06005309894680977, -0.16442930698394775]
Grand sum of 1 tensor sets is: [0.16283810138702393, -0.16517367959022522, 0.15017762780189514, 0.06005309894680977, -0.16442930698394775]

Instance 2 of anxiety.
Size of token embeddings is torch.Size([45, 13, 768])
Shape of summed layers is: 45 x 768
anxiety at index 36:  [0.035467375069856644, 0.31055915355682373, 0.4142957627773285, -0.11021118611097336, 0.41283679008483887]
Grand sum of 2 tensor sets is: [0.19830547273159027, 0.1453854739665985, 0.5644733905792236, -0.05015808716416359, 0.2484074831008911]
Mean of 2 tensors is: tensor([ 0.0992,  0.0727,  0.2822, -0.0251,  0.1242]) (first 5 of 768 features in tensor)
Saved the embedding for anxiety.
Saved the count of sentences used to create anxiety embedding
Run time for anxiety was 0.14313321099962195 seconds.

Th

Size of token embeddings is torch.Size([32, 13, 768])
Shape of summed layers is: 32 x 768
argue at index 6:  [0.1798577755689621, 0.13027141988277435, 0.09999781847000122, 0.09113790094852448, -1.0571374893188477]
Grand sum of 3 tensor sets is: [0.2246711254119873, 0.45280516147613525, 0.46912407875061035, 0.27913662791252136, -3.77339506149292]
Mean of 3 tensors is: tensor([ 0.0749,  0.1509,  0.1564,  0.0930, -1.2578]) (first 5 of 768 features in tensor)
Saved the embedding for argue.
Saved the count of sentences used to create argue embedding
Run time for argue was 0.24159302600037336 seconds.

There are 2 tokens in tokenized text:
argument
ative
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for argumentative.
Saved the count of sentences used to create argumentative embedding
Run time for argumentative was 0.03183600699958333 seconds.

There are 1 tokens in tokenized text:
aroused
Mean of 0 tensors is: tensor([nan, na

Size of token embeddings is torch.Size([25, 13, 768])
Shape of summed layers is: 25 x 768
attempting at index 11:  [-0.004661872982978821, -0.012229569256305695, 0.20318102836608887, 0.1792888194322586, 0.581027626991272]
Grand sum of 3 tensor sets is: [0.5386844277381897, 0.17191699147224426, 0.5534408092498779, 0.49088001251220703, 0.8163760900497437]

Instance 4 of attempting.
Size of token embeddings is torch.Size([42, 13, 768])
Shape of summed layers is: 42 x 768
attempting at index 27:  [0.11186040937900543, 0.13188433647155762, 0.07992716133594513, 0.07227587699890137, 0.47849926352500916]
Grand sum of 4 tensor sets is: [0.6505448222160339, 0.3038013279438019, 0.6333679556846619, 0.5631558895111084, 1.2948753833770752]

Instance 5 of attempting.
Size of token embeddings is torch.Size([35, 13, 768])
Shape of summed layers is: 35 x 768
attempting at index 15:  [0.2207575887441635, 0.07597984373569489, 0.10291691869497299, 0.04342744126915932, 0.2601925730705261]
Grand sum of 5 ten

Size of token embeddings is torch.Size([58, 13, 768])
Shape of summed layers is: 58 x 768
avoiding at index 28:  [0.2064252644777298, -0.0027953237295150757, -0.09289681166410446, -0.17962346971035004, 0.4430348873138428]
Grand sum of 1 tensor sets is: [0.2064252644777298, -0.0027953237295150757, -0.09289681166410446, -0.17962346971035004, 0.4430348873138428]

Instance 2 of avoiding.
Size of token embeddings is torch.Size([28, 13, 768])
Shape of summed layers is: 28 x 768
avoiding at index 16:  [0.35135895013809204, 0.3332180976867676, 0.05663122236728668, -0.2134239226579666, 0.5697475671768188]
Grand sum of 2 tensor sets is: [0.5577841997146606, 0.3304227590560913, -0.03626558929681778, -0.39304739236831665, 1.0127824544906616]
Mean of 2 tensors is: tensor([ 0.2789,  0.1652, -0.0181, -0.1965,  0.5064]) (first 5 of 768 features in tensor)
Saved the embedding for avoiding.
Saved the count of sentences used to create avoiding embedding
Run time for avoiding was 0.17433244699986972 secon

Saved the embedding for backhanded.
Saved the count of sentences used to create backhanded embedding
Run time for backhanded was 0.024996329000714468 seconds.

There are 1 tokens in tokenized text:
badly

Instance 1 of badly.
Size of token embeddings is torch.Size([22, 13, 768])
Shape of summed layers is: 22 x 768
badly at index 3:  [-0.04932598024606705, 0.20432643592357635, 0.14524024724960327, 0.2621002197265625, 1.0760900974273682]
Grand sum of 1 tensor sets is: [-0.04932598024606705, 0.20432643592357635, 0.14524024724960327, 0.2621002197265625, 1.0760900974273682]

Instance 2 of badly.
Size of token embeddings is torch.Size([17, 13, 768])
Shape of summed layers is: 17 x 768
badly at index 7:  [-0.14191602170467377, 0.14972223341464996, 0.05356631055474281, 0.3544180989265442, 0.6540783643722534]
Grand sum of 2 tensor sets is: [-0.1912420094013214, 0.3540486693382263, 0.1988065540790558, 0.6165183186531067, 1.7301684617996216]

Instance 3 of badly.

Instance 4 of badly.
Size of tok

Size of token embeddings is torch.Size([41, 13, 768])
Shape of summed layers is: 41 x 768
beat at index 18:  [0.18999524414539337, -0.04766558110713959, -0.07772503793239594, -0.35929590463638306, -0.7232933044433594]
Grand sum of 3 tensor sets is: [0.5597817897796631, -0.3978429436683655, -0.22885087132453918, -0.8262243270874023, -1.9954460859298706]

Instance 8 of beat.
Size of token embeddings is torch.Size([28, 13, 768])
Shape of summed layers is: 28 x 768
beat at index 4:  [0.22316795587539673, -0.15126532316207886, -0.03902377933263779, -0.278055340051651, -1.4533255100250244]
Grand sum of 4 tensor sets is: [0.7829497456550598, -0.5491082668304443, -0.26787465810775757, -1.104279637336731, -3.4487714767456055]

Instance 9 of beat.
Size of token embeddings is torch.Size([36, 13, 768])
Shape of summed layers is: 36 x 768
beat at index 3:  [0.1869942992925644, 0.09900295734405518, -0.1742640882730484, -0.4189644753932953, -0.06966196000576019]
Grand sum of 5 tensor sets is: [0.9699

Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for belittling.
Saved the count of sentences used to create belittling embedding
Run time for belittling was 0.029511843000364024 seconds.

There are 2 tokens in tokenized text:
bellig
erence
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for belligerence.
Saved the count of sentences used to create belligerence embedding
Run time for belligerence was 0.0270203930003845 seconds.

There are 2 tokens in tokenized text:
bellig
erent
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for belligerent.
Saved the count of sentences used to create belligerent embedding
Run time for belligerent was 0.024398563999966427 seconds.

There are 1 tokens in tokenized text:
belonging

Instance 1 of belonging.
Size of token embeddings is torch.Size([40, 13, 768])
Shape 

Size of token embeddings is torch.Size([45, 13, 768])
Shape of summed layers is: 45 x 768
bit at index 30:  [0.017276005819439888, 0.2702726125717163, 0.29617807269096375, 0.4095306396484375, 0.7606801390647888]
Grand sum of 3 tensor sets is: [0.47539055347442627, 1.217105746269226, 0.9564892053604126, 1.2225737571716309, 2.8246138095855713]

Instance 4 of bit.
Size of token embeddings is torch.Size([35, 13, 768])
Shape of summed layers is: 35 x 768
bit at index 25:  [0.024170154705643654, 0.33883804082870483, 0.44729408621788025, 0.1355164796113968, 0.9544157385826111]
Grand sum of 4 tensor sets is: [0.49956071376800537, 1.5559437274932861, 1.4037833213806152, 1.3580902814865112, 3.779029607772827]

Instance 5 of bit.
Size of token embeddings is torch.Size([34, 13, 768])
Shape of summed layers is: 34 x 768
bit at index 19:  [0.22912615537643433, 0.4870338439941406, 0.26886531710624695, 0.6389355659484863, 1.3444633483886719]
Grand sum of 5 tensor sets is: [0.7286868691444397, 2.042977

Size of token embeddings is torch.Size([25, 13, 768])
Shape of summed layers is: 25 x 768
blew at index 18:  [0.12807415425777435, 0.20435526967048645, 0.01333533227443695, -0.09176340699195862, -0.11317366361618042]
Grand sum of 3 tensor sets is: [0.0450330525636673, 0.881382942199707, 0.04537232965230942, 0.05777516961097717, 0.5484972596168518]
Mean of 3 tensors is: tensor([0.0150, 0.2938, 0.0151, 0.0193, 0.1828]) (first 5 of 768 features in tensor)
Saved the embedding for blew.
Saved the count of sentences used to create blew embedding
Run time for blew was 0.19750137699975312 seconds.

There are 1 tokens in tokenized text:
blinded

Instance 1 of blinded.
Size of token embeddings is torch.Size([35, 13, 768])
Shape of summed layers is: 35 x 768
blinded at index 12:  [0.01805512048304081, -0.18283189833164215, -0.1390848457813263, -0.42292219400405884, 1.0427896976470947]
Grand sum of 1 tensor sets is: [0.01805512048304081, -0.18283189833164215, -0.1390848457813263, -0.42292219400405

Size of token embeddings is torch.Size([52, 13, 768])
Shape of summed layers is: 52 x 768
blunt at index 36:  [0.07376561313867569, -0.02279198169708252, 0.07784267514944077, 0.09228937327861786, 0.10606687515974045]
Grand sum of 2 tensor sets is: [0.11321815848350525, 0.07897111028432846, 0.08108515292406082, 0.2735885679721832, -0.06836313754320145]
Mean of 2 tensors is: tensor([ 0.0566,  0.0395,  0.0405,  0.1368, -0.0342]) (first 5 of 768 features in tensor)
Saved the embedding for blunt.
Saved the count of sentences used to create blunt embedding
Run time for blunt was 0.18581948900009593 seconds.

There are 2 tokens in tokenized text:
bl
ushing
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for blushing.
Saved the count of sentences used to create blushing embedding
Run time for blushing was 0.025769230000150856 seconds.

There are 3 tokens in tokenized text:
bl
ust
ering
Mean of 0 tensors is: tensor([nan, nan, nan, 

Size of token embeddings is torch.Size([39, 13, 768])
Shape of summed layers is: 39 x 768
broken at index 32:  [-0.2137746512889862, -0.1756058931350708, 0.014334514737129211, -0.26710745692253113, 0.4574378430843353]
Grand sum of 1 tensor sets is: [-0.2137746512889862, -0.1756058931350708, 0.014334514737129211, -0.26710745692253113, 0.4574378430843353]

Instance 2 of broken.
Size of token embeddings is torch.Size([22, 13, 768])
Shape of summed layers is: 22 x 768
broken at index 18:  [-0.0535789430141449, 0.18640366196632385, 0.14584055542945862, 0.09627316892147064, 0.7576018571853638]
Grand sum of 2 tensor sets is: [-0.2673535943031311, 0.010797768831253052, 0.16017507016658783, -0.17083428800106049, 1.2150397300720215]

Instance 3 of broken.
Size of token embeddings is torch.Size([17, 13, 768])
Shape of summed layers is: 17 x 768
broken at index 15:  [0.03398871794342995, 0.12251338362693787, -0.1088220477104187, 0.022681929171085358, 0.7945266366004944]
Grand sum of 3 tensor sets 

Saved the embedding for brooding.
Saved the count of sentences used to create brooding embedding
Run time for brooding was 0.17362301999946794 seconds.

There are 2 tokens in tokenized text:
bro
ody
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for broody.
Saved the count of sentences used to create broody embedding
Run time for broody was 0.025785452999116387 seconds.

There are 1 tokens in tokenized text:
bruised

Instance 1 of bruised.
Size of token embeddings is torch.Size([35, 13, 768])
Shape of summed layers is: 35 x 768
bruised at index 26:  [-0.07277945429086685, -0.016109921038150787, 0.17099255323410034, 0.04874351620674133, 0.6105177402496338]
Grand sum of 1 tensor sets is: [-0.07277945429086685, -0.016109921038150787, 0.17099255323410034, 0.04874351620674133, 0.6105177402496338]
Mean of 1 tensors is: tensor([-0.0728, -0.0161,  0.1710,  0.0487,  0.6105]) (first 5 of 768 features in tensor)
Saved the embedding 

Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for calmness.
Saved the count of sentences used to create calmness embedding
Run time for calmness was 0.029014046000156668 seconds.

There are 2 tokens in tokenized text:
can
ny
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for canny.
Saved the count of sentences used to create canny embedding
Run time for canny was 0.024623020999570144 seconds.

There are 3 tokens in tokenized text:
cant
ank
erous
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for cantankerous.
Saved the count of sentences used to create cantankerous embedding
Run time for cantankerous was 0.024657506999574252 seconds.

There are 1 tokens in tokenized text:
capable

Instance 1 of capable.
Size of token embeddings is torch.Size([63, 13, 768])
Shape of summed layers is: 63 x 768
c

Size of token embeddings is torch.Size([14, 13, 768])
Shape of summed layers is: 14 x 768
careful at index 6:  [-0.02814478427171707, -0.09562359750270844, -0.06201883405447006, 0.5605047941207886, 0.8142984509468079]
Grand sum of 1 tensor sets is: [-0.02814478427171707, -0.09562359750270844, -0.06201883405447006, 0.5605047941207886, 0.8142984509468079]

Instance 2 of careful.
Size of token embeddings is torch.Size([31, 13, 768])
Shape of summed layers is: 31 x 768
careful at index 23:  [-0.032322950661182404, 0.14917027950286865, 0.1564461588859558, 0.3692052364349365, 0.5934814810752869]
Grand sum of 2 tensor sets is: [-0.060467734932899475, 0.05354668200016022, 0.09442732483148575, 0.9297100305557251, 1.4077799320220947]

Instance 3 of careful.
Size of token embeddings is torch.Size([16, 13, 768])
Shape of summed layers is: 16 x 768
careful at index 7:  [0.4356710910797119, -0.0032262951135635376, 0.07549695670604706, 0.5789555311203003, 0.768084704875946]
Grand sum of 3 tensor sets

Size of token embeddings is torch.Size([43, 13, 768])
Shape of summed layers is: 43 x 768
certain at index 23:  [0.08014874160289764, 0.20246955752372742, 0.16360749304294586, -0.15099219977855682, 0.542007565498352]
Grand sum of 3 tensor sets is: [0.4104467034339905, 0.7836649417877197, 0.23979482054710388, 0.021361589431762695, 0.9238044619560242]

Instance 4 of certain.
Size of token embeddings is torch.Size([76, 13, 768])
Shape of summed layers is: 76 x 768
certain at index 74:  [0.329013854265213, 0.11649209260940552, 0.011987544596195221, 0.2407076507806778, -0.110133096575737]
Grand sum of 4 tensor sets is: [0.7394605875015259, 0.9001570343971252, 0.2517823576927185, 0.2620692253112793, 0.813671350479126]

Instance 5 of certain.
Size of token embeddings is torch.Size([53, 13, 768])
Shape of summed layers is: 53 x 768
certain at index 40:  [0.12302755564451218, 0.6831939220428467, 0.35198304057121277, -0.26052817702293396, 1.6595549583435059]
Grand sum of 5 tensor sets is: [0.862

Size of token embeddings is torch.Size([39, 13, 768])
Shape of summed layers is: 39 x 768
challenge at index 26:  [-0.1769009828567505, 0.206762433052063, -0.03942039608955383, -0.030211595818400383, 0.04174128174781799]
Grand sum of 8 tensor sets is: [-0.9991161823272705, -0.47883135080337524, 0.11006081104278564, 0.701418936252594, 3.206609010696411]

Instance 12 of challenge.
Size of token embeddings is torch.Size([21, 13, 768])
Shape of summed layers is: 21 x 768
challenge at index 6:  [0.008087854832410812, -0.04668854922056198, 0.0518520325422287, -0.08244245499372482, 0.26313671469688416]
Grand sum of 9 tensor sets is: [-0.9910283088684082, -0.5255199074745178, 0.16191284358501434, 0.6189764738082886, 3.469745635986328]

Instance 13 of challenge.
Size of token embeddings is torch.Size([36, 13, 768])
Shape of summed layers is: 36 x 768
challenge at index 26:  [-0.0661962479352951, -0.19050191342830658, -0.06354019045829773, -0.09178368747234344, 0.16393277049064636]
Grand sum of 

Size of token embeddings is torch.Size([57, 13, 768])
Shape of summed layers is: 57 x 768
charged at index 17:  [-0.04074686020612717, 0.06542021036148071, -0.07458929717540741, 0.27974164485931396, 0.06783347576856613]
Grand sum of 4 tensor sets is: [-0.15246132016181946, 0.3552277088165283, -0.14645852148532867, 0.6768467426300049, 1.1177630424499512]

Instance 5 of charged.
Size of token embeddings is torch.Size([22, 13, 768])
Shape of summed layers is: 22 x 768
charged at index 18:  [-0.12716367840766907, 0.0030889660120010376, -0.14080283045768738, -0.0530497282743454, 0.2771533131599426]
Grand sum of 5 tensor sets is: [-0.2796249985694885, 0.35831665992736816, -0.28726136684417725, 0.6237969994544983, 1.394916296005249]

Instance 6 of charged.
Size of token embeddings is torch.Size([22, 13, 768])
Shape of summed layers is: 22 x 768
charged at index 2:  [-0.14574426412582397, 0.1371181309223175, -0.048487864434719086, 0.0754559189081192, 0.24070662260055542]
Grand sum of 6 tensor 

Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for chortling.
Saved the count of sentences used to create chortling embedding
Run time for chortling was 0.026725345999693673 seconds.

There are 1 tokens in tokenized text:
chuckle
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for chuckle.
Saved the count of sentences used to create chuckle embedding
Run time for chuckle was 0.029205514000750554 seconds.

There are 2 tokens in tokenized text:
chuck
ling
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for chuckling.
Saved the count of sentences used to create chuckling embedding
Run time for chuckling was 0.027583215000049677 seconds.

There are 3 tokens in tokenized text:
ch
ur
lish
Mean of 0 tensors is: tensor([nan, nan, nan, nan, nan]) (first 5 of 768 features in tensor)
Saved the embedding for

KeyboardInterrupt: 

In [12]:
def make_vocab(vocab_file):
    """Convert a file of newline separated words into a Python list and return it."""
    vocab = []
    with open(vocab_file, 'r') as v:
        vocab = v.read().splitlines()
    return vocab

In [20]:
def tokenize_text(text, tokenizer):
    """Break the input text into tokens the model can use, and return them.
    Use max_length to avoid overflowing the maximum sequence length for the model."""
    tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=512)
    return tokenized_text

In [18]:
def print_tokenized_text(tokens, tokenizer):
    """Print the number of tokens in some tokenized text, not counting the leading and trailing separators.
    Print each token without any leading or trailing whitespace."""
    print(f'\nThere are {len(tokens) - 2} tokens in tokenized text:')
    for t in tokens[1:-1]:
        print(tokenizer.decode(t).strip())

In [21]:
def get_vocab_indices(v_tokens, line_tokens, tokenizer):
    """Search a line for all tokens of a vocabulary word, and return the indices of their locations."""
    indices = []              
    for t in v_tokens[1:-1]:
        for i, token_str in enumerate(line_tokens):
            if tokenizer.decode(token_str).strip() == tokenizer.decode(t).strip():
                indices.append(i)
    return indices

In [29]:
def create_token_embeddings(tokenized_text):
    """Convert the model into a more usable format: a tensor of size [<token_count>, <layer_count>, <feature_count>]."""
    input_ids = torch.tensor(tokenized_text).unsqueeze(0)  # Batch size 1
    with torch.no_grad():
        outputs = model(input_ids, masked_lm_labels=input_ids)
        encoded_layers = outputs[2]
        token_embeddings = torch.stack(encoded_layers, dim=0)  # Concatenate the tensors for all layers.
        token_embeddings = torch.squeeze(token_embeddings, dim=1)  # Remove the "batches" dimension
        token_embeddings = token_embeddings.permute(1,0,2)  # Rearrange the model dimensions.
        print(f'Size of token embeddings is {token_embeddings.size()}')
        return token_embeddings

In [40]:
def preview_token_embedding(tokenized_text, layer, index, index_list, tokenizer):
    """Print the first 5 feature values from a model layer for tokens at specific line indices."""
    v_index = index % len(tokenized_text[1:-1])
    print(f'{tokenizer.decode(tokenized_text[v_index + 1]).strip()} at index {index_list[index]}: ', \
          f'{layer[index_list[index]][:5].tolist()}')

In [23]:
def sum_last_four_token_vecs(token_embeddings):
    """Sum the last 4 layers' features and return the resulting vector."""
    token_vecs_sum_last_four = []

    # For each token in the sentence, sum the last 4 layers of the model.
    for token in token_embeddings:
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `sum_vec` to represent `token`.
        token_vecs_sum_last_four.append(sum_vec)

    print ('Shape of summed layers is: %d x %d' % (len(token_vecs_sum_last_four), len(token_vecs_sum_last_four[0])))
    return token_vecs_sum_last_four

In [38]:
def get_layer_token_vecs(token_embeddings, layer_number):
    """Return a single layer of the model."""
    token_vecs_layer = []

    # For each token in the sentence...
    for token in token_embeddings:
        layer_vec = token[layer_number]

        # Use `layer_vec` to represent `token`.
        token_vecs_layer.append(layer_vec)

    print ('Shape of summed layers is: %d x %d' % (len(token_vecs_layer), len(token_vecs_layer[0])))
    return token_vecs_layer

In [8]:
def write_embedding(embeddings_file, vocab_word, contextual_embedding):
    """Save an embedding to an output file."""
    try:
        with open(embeddings_file, 'a') as f:
            f.write(vocab_word)
            for value in contextual_embedding[0]:
                f.write(' ' + str(value.item()))
            f.write('\n')
        print(f'Saved the embedding for {vocab_word}.')
    except:
        print('Oh no! Unable to write to the embeddings file.')

In [44]:
def write_line_count(count_file, vocab_word, line_count):
    try:
        with open(count_file, 'a') as counts:
            counts.write(vocab_word + ', ' + str(line_count) + '\n')
        print(f'Saved the count of sentences used to create {vocab_word} embedding.')
    except:
        print('Wha?! Could not write the sentence count.')