In [1]:
import sys
import os
from timeit import default_timer as timer
import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM, RobertaConfig

In [2]:
# Make sure we're in the transformers directory with fine-tuned model output.
os.chdir('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/')
os.getcwd()

'/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings'

In [3]:
# Adapted from the tutorial at https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
# and Transformers documentation: https://huggingface.co/transformers/model_doc/roberta.html#robertaformaskedlm

In [9]:
tokenizer = RobertaTokenizer.from_pretrained('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/output_wiki-103_filtered')
config = RobertaConfig.from_pretrained('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/output_wiki-103_filtered')
model = RobertaForMaskedLM.from_pretrained('/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/output_wiki-103_filtered', config=config)
model.eval()

context_file = "/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/wiki.test.raw.out"
output_file = '/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/roberta_test.txt'
count_file = '/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/roberta_test_counts.txt'
vocab_file = '/Users/crystal.butler/Documents/Code_Projects/RoBERTa_Embeddings/RoBERTa_embeddings/data/vocab_checked.txt'
vocab = make_vocab(vocab_file)

In [None]:
# Process vocabulary words in the outer loop.
for v in vocab:
    start = timer()
    with open(context_file, 'r') as lines:
        v_sum = torch.zeros([1, 768])
        v_tokens = tokenize_text(v, tokenizer)
        print_tokenized_text(v_tokens, tokenizer)
        count_sentence = 0
        count_tensor = 0
        
        # Process all lines in the context file in the inner loop.
        for line in lines:
            # Check for this vocab word in this line; if found, split the line into individual sentences.
            if v in line.lower().split():
                for sentence in line.split('.'):
                    if v in sentence.lower():
                        line = sentence
                        count_sentence += 1
                        print(f'\nInstance {count_sentence} of {tokenizer.decode(v_tokens[1:-1]).strip()}.')
                        break
                # Split the new sentence-based line into tokens.
                # Use max_length to avoid overflowing the maximum sequence length for the model.
                tokenized_text = tokenize_text(line, tokenizer)
#                 print(f'The decoded sentence has {len(tokenized_text)} tokens and is: {tokenizer.decode(tokenized_text)}')
                indices = []              

                # Check to see whether the vocab word is found in this particular line.
                # Initially, some lines may have comprised multiple sentences, which were
                # broken out individually above.
                for t in v_tokens[1:-1]:
                    print(f'Looking for vocab token: {tokenizer.decode(t).strip()}')
                    for i, token_str in enumerate(tokenized_text):
#                         print(f'Next sentence token: {tokenizer.decode(token_str).strip()}')
#                         print(tokenizer.decode(token_str).strip() == tokenizer.decode(t).strip())
                        if tokenizer.decode(token_str).strip() == tokenizer.decode(t).strip():
                            indices.append(i)               

                ###################################################################################
                # If the vocabulary word was found, process the containing line.
                if indices:

                    # The vocab word was found in this line/sentence, at the locations in indices.
                    print(f'Indices are {indices}')
                    # Get the feature vectors for all tokens in the line/sentence.
                    token_embeddings = create_token_embeddings(tokenized_text)
                    # Sum the last four layers to get embeddings for the line/sentence.
#                         for t in v_tokens[1:-1]:ik
#                             for i, token_str in enumerate(tokenized_text):
#                                 if (tokenizer.decode(token_str).strip() == tokenizer.decode(t).strip()):
#                                     print(f'{tokenizer.decode(token_str).strip()} is index {i} in the sentence and {token_str} in the vocabulary.')
                    token_vecs_layer = get_layer_token_vecs(token_embeddings, 12)

                    # Get the vocab word's contextual embedding for this line.
                    tensor_layer = torch.zeros([1, 768])
                    for i in range(len(indices)):
                        v_index = i % len(v_tokens[1:-1])
                        print(f'{tokenizer.decode(v_tokens[v_index + 1]).strip()} at index {indices[i]}: {token_vecs_layer[indices[i]][:5].tolist()}')
                        tensor_layer += token_vecs_layer[indices[i]]
#                         print(f'Sum of tensors is: {tensor_layer[0][:5].tolist()} before taking the mean.')

                    # If our vocab word is broken into more than one token, we need to get the mean of the token embeddings.
                    tensor_layer /= len(indices)
#                     print(f'Sum of tensors is: {tensor_layer[0][:5].tolist()} after taking the mean.')

                    # Add the embedding distilled from this line to the sum of embeddings for all lines.
                    v_sum += tensor_layer
                    count_tensor += 1
                    print(f'Grand sum of {count_tensor} tensor sets is: {v_sum[0][:5].tolist()}')
                ###################################################################################
            # Stop processing lines once we've found 2000 instances of our vocab word.
            if count_tensor >= 2000:
                break
        
        # We're done processing all lines of 512 tokens or less containing our vocab word.
        # Get the mean embedding for the word.
        v_mean = v_sum / count_tensor
        print(f'Mean of tensors is: {v_mean[0][:5]} ({len(v_mean[0])} features in tensor)')
        write_embedding(output_file, v, v_mean)
        try:
            with open(count_file, 'a') as counts:
                counts.write(v + ', ' + str(count_tensor) + '\n')
            print(f'Saved the count of sentences used to create {v} embedding')
        except:
            print('Wha?! Could not write the sentence count.')
    end = timer()
    print(f'Run time for {v} was {end - start} seconds.')


There are 1 tokens in tokenized text:
aback
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for aback.
Saved the count of sentences used to create aback embedding
Run time for aback was 0.03236524099975213 seconds.

There are 2 tokens in tokenized text:
ab
ashed
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for abashed.
Saved the count of sentences used to create abashed embedding
Run time for abashed was 0.030247872000018106 seconds.

There are 1 tokens in tokenized text:
abhor
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for abhor.
Saved the count of sentences used to create abhor embedding
Run time for abhor was 0.027797740000096383 seconds.

There are 2 tokens in tokenized text:
abhor
red
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for abhorred.
Saved the count of sentenc

Size of token embeddings is torch.Size([43, 13, 768])
Shape of summed layers is: 43 x 768
accepted at index 27: [-0.11929482221603394, 0.02767271175980568, -0.13306935131549835, 0.4885920286178589, 0.16952836513519287]
Grand sum of 3 tensor sets is: [0.269625186920166, 0.44991564750671387, -0.07196421176195145, 1.0198051929473877, 1.1031814813613892]

Instance 4 of accepted.
Looking for vocab token: accepted
Indices are [26]
Size of token embeddings is torch.Size([54, 13, 768])
Shape of summed layers is: 54 x 768
accepted at index 26: [0.1258816421031952, -0.08449521660804749, -0.12492911517620087, 0.5103059411048889, 0.30571746826171875]
Grand sum of 4 tensor sets is: [0.3955068290233612, 0.3654204308986664, -0.1968933343887329, 1.5301110744476318, 1.408898949623108]

Instance 5 of accepted.
Looking for vocab token: accepted
Indices are [22]
Size of token embeddings is torch.Size([37, 13, 768])
Shape of summed layers is: 37 x 768
accepted at index 22: [-0.005839262157678604, 0.0065529

Size of token embeddings is torch.Size([74, 13, 768])
Shape of summed layers is: 74 x 768
accommodating at index 40: [0.16071467101573944, 0.1875777542591095, -0.02297905646264553, 0.18755941092967987, 0.7809291481971741]
Grand sum of 1 tensor sets is: [0.16071467101573944, 0.1875777542591095, -0.02297905646264553, 0.18755941092967987, 0.7809291481971741]
Mean of tensors is: tensor([ 0.1607,  0.1876, -0.0230,  0.1876,  0.7809]) (768 features in tensor)
Saved the embedding for accommodating.
Saved the count of sentences used to create accommodating embedding
Run time for accommodating was 0.1806203529999948 seconds.

There are 1 tokens in tokenized text:
accomplished

Instance 1 of accomplished.
Looking for vocab token: accomplished
Indices are [18]
Size of token embeddings is torch.Size([20, 13, 768])
Shape of summed layers is: 20 x 768
accomplished at index 18: [-0.04389871656894684, -0.006582438945770264, 0.22913262248039246, -0.1600954830646515, 0.07970038801431656]
Grand sum of 1 t

Size of token embeddings is torch.Size([30, 13, 768])
Shape of summed layers is: 30 x 768
active at index 24: [-0.10148297250270844, 0.06392201781272888, 0.0965723916888237, 0.3678937256336212, -0.11603179574012756]
Grand sum of 1 tensor sets is: [-0.10148297250270844, 0.06392201781272888, 0.0965723916888237, 0.3678937256336212, -0.11603179574012756]

Instance 2 of active.
Looking for vocab token: active
Indices are [19]
Size of token embeddings is torch.Size([21, 13, 768])
Shape of summed layers is: 21 x 768
active at index 19: [0.08776158094406128, -0.0004821270704269409, 0.06824815273284912, 0.191207155585289, -0.31917595863342285]
Grand sum of 2 tensor sets is: [-0.013721391558647156, 0.06343989074230194, 0.16482055187225342, 0.559100866317749, -0.4352077543735504]

Instance 3 of active.
Looking for vocab token: active
Indices are [17]
Size of token embeddings is torch.Size([24, 13, 768])
Shape of summed layers is: 24 x 768
active at index 17: [0.017609374597668648, 0.2944364547729

Size of token embeddings is torch.Size([52, 13, 768])
Shape of summed layers is: 52 x 768
active at index 32: [-0.029190607368946075, 0.25667625665664673, 0.18565872311592102, 0.2579237222671509, -0.016910135746002197]
Grand sum of 22 tensor sets is: [1.0008060932159424, 0.16872045397758484, 1.9363590478897095, 7.464029788970947, -7.373256206512451]

Instance 23 of active.
Looking for vocab token: active
Indices are [22]
Size of token embeddings is torch.Size([45, 13, 768])
Shape of summed layers is: 45 x 768
active at index 22: [-0.04695665091276169, 0.01645827293395996, 0.19837507605552673, 0.4608666002750397, -0.49026280641555786]
Grand sum of 23 tensor sets is: [0.9538494348526001, 0.1851787269115448, 2.1347341537475586, 7.924896240234375, -7.863519191741943]
Mean of tensors is: tensor([ 0.0415,  0.0081,  0.0928,  0.3446, -0.3419]) (768 features in tensor)
Saved the embedding for active.
Saved the count of sentences used to create active embedding
Run time for active was 1.44892211

Size of token embeddings is torch.Size([14, 13, 768])
Shape of summed layers is: 14 x 768
affected at index 5: [0.19279122352600098, 0.5335580706596375, 0.212822824716568, 0.3287631571292877, 0.3169293999671936]
Grand sum of 6 tensor sets is: [0.504017174243927, 3.1432406902313232, 0.8452674150466919, 3.196693181991577, -1.0401606559753418]

Instance 7 of affected.
Looking for vocab token: affected
Indices are [5]
Size of token embeddings is torch.Size([28, 13, 768])
Shape of summed layers is: 28 x 768
affected at index 5: [0.2380753457546234, 0.4090193510055542, 0.17236706614494324, 0.48851925134658813, 0.1720593273639679]
Grand sum of 7 tensor sets is: [0.742092490196228, 3.552259922027588, 1.0176345109939575, 3.6852123737335205, -0.8681013584136963]

Instance 8 of affected.
Looking for vocab token: affected
Indices are [23]
Size of token embeddings is torch.Size([28, 13, 768])
Shape of summed layers is: 28 x 768
affected at index 23: [0.12588655948638916, 0.47192567586898804, 0.0719

Size of token embeddings is torch.Size([47, 13, 768])
Shape of summed layers is: 47 x 768
aggressive at index 31: [0.1439601480960846, 0.08833093196153641, 0.06896568089723587, 0.20601718127727509, 0.21589598059654236]
Grand sum of 2 tensor sets is: [0.23927970230579376, -0.0802399292588234, -0.07627927511930466, 0.917029619216919, 0.20368370413780212]

Instance 3 of aggressive.
Looking for vocab token: aggressive
Indices are [4]
Size of token embeddings is torch.Size([15, 13, 768])
Shape of summed layers is: 15 x 768
aggressive at index 4: [0.07398677617311478, 0.07459574192762375, 0.0619354322552681, 0.48943108320236206, 0.09881015866994858]
Grand sum of 3 tensor sets is: [0.31326648592948914, -0.005644187331199646, -0.01434384286403656, 1.4064607620239258, 0.3024938702583313]

Instance 4 of aggressive.
Looking for vocab token: aggressive
Indices are [4]
Size of token embeddings is torch.Size([22, 13, 768])
Shape of summed layers is: 22 x 768
aggressive at index 4: [0.058629550039768

Size of token embeddings is torch.Size([68, 13, 768])
Shape of summed layers is: 68 x 768
amazing at index 40: [-0.008208934217691422, 0.3720661401748657, -0.14578741788864136, -0.4101196825504303, 0.34287261962890625]
Grand sum of 2 tensor sets is: [0.11175903677940369, 0.5161386728286743, -0.11818461120128632, -0.7820503115653992, 0.6686673760414124]

Instance 3 of amazing.
Looking for vocab token: amazing
Indices are [13]
Size of token embeddings is torch.Size([17, 13, 768])
Shape of summed layers is: 17 x 768
amazing at index 13: [-0.07336670905351639, 0.16041766107082367, -0.08369830250740051, -0.24965515732765198, -0.48866844177246094]
Grand sum of 3 tensor sets is: [0.0383923277258873, 0.6765563488006592, -0.20188291370868683, -1.0317054986953735, 0.17999893426895142]
Mean of tensors is: tensor([ 0.0128,  0.2255, -0.0673, -0.3439,  0.0600]) (768 features in tensor)
Saved the embedding for amazing.
Saved the count of sentences used to create amazing embedding
Run time for amazing

Size of token embeddings is torch.Size([48, 13, 768])
Shape of summed layers is: 48 x 768
anger at index 32: [0.11743023246526718, -0.08298443257808685, 0.04215235635638237, 0.1773572415113449, 0.10809770226478577]
Grand sum of 5 tensor sets is: [0.19062533974647522, -0.7124022245407104, 0.07959017157554626, 1.6372673511505127, 0.4095071256160736]

Instance 6 of anger.
Looking for vocab token: anger
Indices are [12]
Size of token embeddings is torch.Size([17, 13, 768])
Shape of summed layers is: 17 x 768
anger at index 12: [0.1809227019548416, -0.1769171953201294, 0.10280376672744751, 0.189121812582016, -0.22888308763504028]
Grand sum of 6 tensor sets is: [0.371548056602478, -0.8893194198608398, 0.18239393830299377, 1.826389193534851, 0.18062403798103333]

Instance 7 of anger.
Looking for vocab token: anger
Indices are [5]
Size of token embeddings is torch.Size([24, 13, 768])
Shape of summed layers is: 24 x 768
anger at index 5: [0.003400493413209915, -0.24470971524715424, -0.002891832

Size of token embeddings is torch.Size([70, 13, 768])
Shape of summed layers is: 70 x 768
animated at index 44: [0.037575654685497284, 0.059396836906671524, 0.2357286512851715, 0.03179680183529854, -0.04932625591754913]
Grand sum of 3 tensor sets is: [-0.23400819301605225, 0.43760696053504944, 0.36359307169914246, -0.2302713543176651, -0.030665531754493713]
Mean of tensors is: tensor([-0.0780,  0.1459,  0.1212, -0.0768, -0.0102]) (768 features in tensor)
Saved the embedding for animated.
Saved the count of sentences used to create animated embedding
Run time for animated was 0.30842976500025543 seconds.

There are 1 tokens in tokenized text:
animosity
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for animosity.
Saved the count of sentences used to create animosity embedding
Run time for animosity was 0.031915009999920585 seconds.

There are 1 tokens in tokenized text:
annoyance
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (7

Size of token embeddings is torch.Size([15, 13, 768])
Shape of summed layers is: 15 x 768
anxiety at index 3: [0.16283810138702393, -0.16517367959022522, 0.15017762780189514, 0.06005309894680977, -0.16442930698394775]
Grand sum of 1 tensor sets is: [0.16283810138702393, -0.16517367959022522, 0.15017762780189514, 0.06005309894680977, -0.16442930698394775]

Instance 2 of anxiety.
Looking for vocab token: anxiety
Indices are [36]
Size of token embeddings is torch.Size([45, 13, 768])
Shape of summed layers is: 45 x 768
anxiety at index 36: [0.035467375069856644, 0.31055915355682373, 0.4142957627773285, -0.11021118611097336, 0.41283679008483887]
Grand sum of 2 tensor sets is: [0.19830547273159027, 0.1453854739665985, 0.5644733905792236, -0.05015808716416359, 0.2484074831008911]
Mean of tensors is: tensor([ 0.0992,  0.0727,  0.2822, -0.0251,  0.1242]) (768 features in tensor)
Saved the embedding for anxiety.
Saved the count of sentences used to create anxiety embedding
Run time for anxiety w

Size of token embeddings is torch.Size([32, 13, 768])
Shape of summed layers is: 32 x 768
argue at index 6: [0.1798577755689621, 0.13027141988277435, 0.09999781847000122, 0.09113790094852448, -1.0571374893188477]
Grand sum of 3 tensor sets is: [0.2246711254119873, 0.45280516147613525, 0.46912407875061035, 0.27913662791252136, -3.77339506149292]
Mean of tensors is: tensor([ 0.0749,  0.1509,  0.1564,  0.0930, -1.2578]) (768 features in tensor)
Saved the embedding for argue.
Saved the count of sentences used to create argue embedding
Run time for argue was 0.22842997099996865 seconds.

There are 2 tokens in tokenized text:
argument
ative
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for argumentative.
Saved the count of sentences used to create argumentative embedding
Run time for argumentative was 0.03341492700019444 seconds.

There are 1 tokens in tokenized text:
aroused
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 featu

Size of token embeddings is torch.Size([44, 13, 768])
Shape of summed layers is: 44 x 768
attempting at index 13: [0.2750488817691803, 0.06962272524833679, 0.11182277649641037, 0.23127178847789764, 0.03039490431547165]
Grand sum of 2 tensor sets is: [0.5433462858200073, 0.18414655327796936, 0.35025978088378906, 0.31159117817878723, 0.2353484332561493]

Instance 3 of attempting.
Looking for vocab token: attempting
Indices are [11]
Size of token embeddings is torch.Size([25, 13, 768])
Shape of summed layers is: 25 x 768
attempting at index 11: [-0.004661872982978821, -0.012229569256305695, 0.20318102836608887, 0.1792888194322586, 0.581027626991272]
Grand sum of 3 tensor sets is: [0.5386844277381897, 0.17191699147224426, 0.5534408092498779, 0.49088001251220703, 0.8163760900497437]

Instance 4 of attempting.
Looking for vocab token: attempting
Indices are [27]
Size of token embeddings is torch.Size([42, 13, 768])
Shape of summed layers is: 42 x 768
attempting at index 27: [0.11186040937900


Instance 1 of aversion.
Looking for vocab token: aversion
Indices are [23]
Size of token embeddings is torch.Size([27, 13, 768])
Shape of summed layers is: 27 x 768
aversion at index 23: [0.08816449344158173, 0.0816800445318222, 0.1390906423330307, 0.12265820056200027, -0.12753167748451233]
Grand sum of 1 tensor sets is: [0.08816449344158173, 0.0816800445318222, 0.1390906423330307, 0.12265820056200027, -0.12753167748451233]
Mean of tensors is: tensor([ 0.0882,  0.0817,  0.1391,  0.1227, -0.1275]) (768 features in tensor)
Saved the embedding for aversion.
Saved the count of sentences used to create aversion embedding
Run time for aversion was 0.0866591049998533 seconds.

There are 2 tokens in tokenized text:
a
versive
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for aversive.
Saved the count of sentences used to create aversive embedding
Run time for aversive was 0.028115456999785238 seconds.

There are 1 tokens in tokenized text:
a

Saved the embedding for backhanded.
Saved the count of sentences used to create backhanded embedding
Run time for backhanded was 0.02496350799992797 seconds.

There are 1 tokens in tokenized text:
badly

Instance 1 of badly.
Looking for vocab token: badly
Indices are [3]
Size of token embeddings is torch.Size([22, 13, 768])
Shape of summed layers is: 22 x 768
badly at index 3: [-0.04932598024606705, 0.20432643592357635, 0.14524024724960327, 0.2621002197265625, 1.0760900974273682]
Grand sum of 1 tensor sets is: [-0.04932598024606705, 0.20432643592357635, 0.14524024724960327, 0.2621002197265625, 1.0760900974273682]

Instance 2 of badly.
Looking for vocab token: badly
Indices are [7]
Size of token embeddings is torch.Size([17, 13, 768])
Shape of summed layers is: 17 x 768
badly at index 7: [-0.14191602170467377, 0.14972223341464996, 0.05356631055474281, 0.3544180989265442, 0.6540783643722534]
Grand sum of 2 tensor sets is: [-0.1912420094013214, 0.3540486693382263, 0.1988065540790558, 0.61

Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for bearish.
Saved the count of sentences used to create bearish embedding
Run time for bearish was 0.025203564000094048 seconds.

There are 1 tokens in tokenized text:
beat

Instance 1 of beat.
Looking for vocab token: beat

Instance 2 of beat.
Looking for vocab token: beat

Instance 3 of beat.
Looking for vocab token: beat

Instance 4 of beat.
Looking for vocab token: beat

Instance 5 of beat.
Looking for vocab token: beat
Indices are [4]
Size of token embeddings is torch.Size([22, 13, 768])
Shape of summed layers is: 22 x 768
beat at index 4: [0.18232806026935577, -0.28301751613616943, -0.23293912410736084, 0.048649244010448456, -0.488630473613739]
Grand sum of 1 tensor sets is: [0.18232806026935577, -0.28301751613616943, -0.23293912410736084, 0.048649244010448456, -0.488630473613739]

Instance 6 of beat.
Looking for vocab token: beat
Indices are [12]
Size of token embeddings is torch.

Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for begrudging.
Saved the count of sentences used to create begrudging embedding
Run time for begrudging was 0.024960101999567996 seconds.

There are 4 tokens in tokenized text:
be
gr
udging
ly
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for begrudgingly.
Saved the count of sentences used to create begrudgingly embedding
Run time for begrudgingly was 0.024953889999778767 seconds.

There are 3 tokens in tokenized text:
beg
u
iled
Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for beguiled.
Saved the count of sentences used to create beguiled embedding
Run time for beguiled was 0.024300262999986444 seconds.

There are 2 tokens in tokenized text:
bel
ated

Instance 1 of belated.
Looking for vocab token: bel
Looking for vocab token: ated
Indices are [14, 15]
Size of token embeddings is t

Mean of tensors is: tensor([nan, nan, nan, nan, nan]) (768 features in tensor)
Saved the embedding for bilious.
Saved the count of sentences used to create bilious embedding
Run time for bilious was 0.031006902000171976 seconds.

There are 1 tokens in tokenized text:
bit

Instance 1 of bit.
Looking for vocab token: bit
Indices are [22]
Size of token embeddings is torch.Size([26, 13, 768])
Shape of summed layers is: 26 x 768
bit at index 22: [0.21086853742599487, 0.5196075439453125, 0.3275662958621979, 0.3908795118331909, 1.1037216186523438]
Grand sum of 1 tensor sets is: [0.21086853742599487, 0.5196075439453125, 0.3275662958621979, 0.3908795118331909, 1.1037216186523438]

Instance 2 of bit.
Looking for vocab token: bit
Indices are [12]
Size of token embeddings is torch.Size([16, 13, 768])
Shape of summed layers is: 16 x 768
bit at index 12: [0.24724599719047546, 0.42722558975219727, 0.332744836807251, 0.42216357588768005, 0.9602119326591492]
Grand sum of 2 tensor sets is: [0.4581145346

In [12]:
def make_vocab(vocab_file):
    """Convert a file of newline separated words into a Python list and return it."""
    vocab = []
    with open(vocab_file, 'r') as v:
        vocab = v.read().splitlines()
    return vocab

In [11]:
def tokenize_text(text, tokenizer):
    """Break the input text into tokens the model can use, and return them."""
    tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=512)
    return tokenized_text

In [18]:
def print_tokenized_text(tokens, tokenizer):
    """Print the number of tokens in some tokenized text, not counting the leading and trailing separators.
    Print each token without any leading or trailing whitespace."""
    print(f'\nThere are {len(tokens) - 2} tokens in tokenized text:')
    for t in tokens[1:-1]:
        print(tokenizer.decode(t).strip())

In [5]:
def create_token_embeddings(tokenized_text):
    
    input_ids = torch.tensor(tokenized_text).unsqueeze(0)  # Batch size 1
    with torch.no_grad():
        outputs = model(input_ids, masked_lm_labels=input_ids)
        encoded_layers = outputs[2]
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)
        print(f'Size of token embeddings is {token_embeddings.size()}')
        return token_embeddings

In [6]:
# Sum the last 4 layers' features
def sum_last_four_token_vecs(token_embeddings):
    token_vecs_sum_last_four = []

    # For each token in the sentence...
    for token in token_embeddings:
        # `token` is a [13 x 768] tensor
        # Sum the vectors from the last 4 layers.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `sum_vec` to represent `token`.
        token_vecs_sum_last_four.append(sum_vec)

    print ('Shape of summed layers is: %d x %d' % (len(token_vecs_sum_last_four), len(token_vecs_sum_last_four[0])))
    # Shape is: <token count> x 768
    return token_vecs_sum_last_four

In [7]:
# Return a single layer of the model.
def get_layer_token_vecs(token_embeddings, layer_number):
    token_vecs_layer = []

    # For each token in the sentence...
    for token in token_embeddings:
        # `token` is a [13 x 768] tensor
        # Sum the vectors from the last 4 layers.
        layer_vec = token[layer_number]

        # Use `sum_vec` to represent `token`.
        token_vecs_layer.append(layer_vec)

    print ('Shape of summed layers is: %d x %d' % (len(token_vecs_layer), len(token_vecs_layer[0])))
    # Shape is: <token count> x 768
    return token_vecs_layer

In [8]:
def write_embedding(embeddings_file, vocab_word, contextual_embedding):
    try:
        with open(embeddings_file, 'a') as f:
            f.write(vocab_word)
            for value in contextual_embedding[0]:
                f.write(' ' + str(value.item()))
            f.write('\n')
        print(f'Saved the embedding for {vocab_word}.')
    except:
        print('Oh no! Unable to write to the embeddings file.')