In [13]:
import sys
import os
from timeit import default_timer as timer
import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM, RobertaConfig
from scipy.spatial.distance import cosine

In [14]:
# Adapted from the tutorial at https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
# and Transformers documentation: https://huggingface.co/transformers/model_doc/roberta.html#robertaformaskedlm

In [22]:
# Make sure we're in the transformers directory with fine-tuned model output.
os.chdir('/home/jupyter/Notebooks/crystal/NLP/transformers/examples/')
os.getcwd()

tokenizer = RobertaTokenizer.from_pretrained('./output_dicts_syns/')
config = RobertaConfig.from_pretrained('./output_dicts_syns/')
model_root_dir = './output_dicts_syns/checkpoints/'
# model = RobertaForMaskedLM.from_pretrained('./output_CC-aef/', config=config)

context_file = "/home/jupyter/Notebooks/crystal/NLP/transformers/examples/wordnik_all.txt"
output_file = '/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/'
count_file = '/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/'
vocab_file = '/home/jupyter/Notebooks/crystal/NLP/MiFace/Python/data/vocab_files/FE_vocab_study.txt'
vocab = make_vocab(vocab_file)

FEATURE_COUNT = 768
LAYER_COUNT = 13
LAYER = 8
MAX_LINES = 100

In [None]:
# Get the 8th layer of the model for all checkpoints in a fine-tuning output directory.
for dir, subdirs, files in os.walk(model_root_dir):
    for subdir in subdirs:
        model = RobertaForMaskedLM.from_pretrained(os.path.join(dir, subdir), config=config)
        model.eval()
        subdir_output_file = ''
        subdir_count_file = ''
        subdir_output_file = os.path.join(output_file, subdir + '_layer8_wordnik.txt')
        subdir_count_file = os.path.join(count_file, subdir + '_layer8_wordnik_counts.txt')
        print(subdir_output_file, subdir_count_file)
        # Process vocabulary words in the middle loop.
        for v in vocab:
            start = timer()
            with open(context_file, 'r') as lines:
                v_sum = torch.zeros([1, FEATURE_COUNT])
                v_tokens = tokenizer.encode(v)
                print(f'\nThere are {len(v_tokens) - 2} tokens in tokenized vocabulary word:')
                for t in v_tokens[1:-1]:
                    print(tokenizer.decode(t).strip())
                count_sentence = 0
                count_tensor = 0

                # Process all lines in the context file in the inner loop.
                for line in lines:
                    # Check for this vocab word in this line; if found, split the line into individual sentences.
                    if v in line.lower().split():
                        for sentence in line.split('.'):
                            if v in sentence.lower():
                                line = sentence
                                count_sentence += 1
                                break
                        # Split the new sentence-based line into tokens.
                        # Use max_length to avoid overflowing the maximum sequence length for the model.
                        tokenized_text = tokenizer.encode(line, add_special_tokens=True, max_length=512)
                        indices = []              

                        # Check to see whether the vocab word is found in this particular line.
                        # Initially, some lines may have comprised multiple sentences, which were
                        # broken out individually above.
                        for t in v_tokens[1:-1]:
                            for i, token_str in enumerate(tokenized_text):
                                if tokenizer.decode(token_str).strip() == tokenizer.decode(t).strip():
                                    indices.append(i)               

                        # If the vocabulary word was found, process the containing line.
                        if indices:

                            # The vocab word was found in this line/sentence, at the locations in indices.
                            # Get the feature vectors for all tokens in the line/sentence.
                            token_embeddings = create_token_embeddings(tokenized_text)
                            token_vecs_layer = get_token_vecs_layer(token_embeddings, LAYER)

                            # Get the vocab word's contextual embedding for this line.
                            tensor_layer = torch.zeros([1, FEATURE_COUNT])
                            for i in range(len(indices)):
                                v_index = i % len(v_tokens[1:-1])
                                tensor_layer += token_vecs_layer[indices[i]]

                            # If our vocab word is broken into more than one token, we need to get the mean of the token embeddings.
                            tensor_layer /= len(indices)

                            # Add the embedding distilled from this line to the sum of embeddings for all lines.
                            v_sum += tensor_layer
                            count_tensor += 1

                    # Stop processing lines once we've found MAX_LINES instances of our vocab word.
                    if count_tensor >= MAX_LINES:
                        break

                # We're done processing all lines of 512 tokens or less containing our vocab word.
                # Get the mean embedding for the word.
                v_mean = v_sum / count_tensor
                print(f'Mean of {count_tensor} tensors is: {v_mean[0][:5]} ({len(v_mean[0])} features in tensor)')
                write_embedding(subdir_output_file, v, v_mean)
                try:
                    with open(subdir_count_file, 'a') as counts:
                        counts.write(v + ', ' + str(count_tensor) + '\n')
                except:
                    print('Wha?! Could not write the sentence count.')
            end = timer()
            print(f'Run time for {v} was {end - start} seconds.')

/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/checkpoint-8000_layer8_wordnik.txt /home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/checkpoint-8000_layer8_wordnik_counts.txt

There are 4 tokens in tokenized vocabulary word:
st
upe
f
ied
Mean of 15 tensors is: tensor([ 0.0286,  0.4057,  0.3317, -0.2682, -0.5567]) (768 features in tensor)
Run time for stupefied was 2.287083222065121 seconds.

There are 2 tokens in tokenized vocabulary word:
scorn
ful
Mean of 31 tensors is: tensor([ 0.1000, -0.0057,  0.0384,  0.2265, -0.3594]) (768 features in tensor)
Run time for scornful was 6.120545833138749 seconds.

There are 2 tokens in tokenized vocabulary word:
disbel
ieving
Mean of 11 tensors is: tensor([ 0.0733, -0.2031,  0.1440,  0.8852, -0.0471]) (768 features in tensor)
Run time for disbelieving was 2.0372233248781413 seconds.

There are 2 tokens in tokenized vocabulary word:
disdain
ful
Mean of 39 tensors is: tensor([ 0.0498, -0.3801,  0.0906,

Mean of 48 tensors is: tensor([-0.0166,  0.4819,  0.2003,  0.0742, -0.2924]) (768 features in tensor)
Run time for discouraged was 9.563135092146695 seconds.

There are 1 tokens in tokenized vocabulary word:
stunned
Mean of 43 tensors is: tensor([-0.0007, -0.0977,  0.2233,  0.0120,  0.1407]) (768 features in tensor)
Run time for stunned was 9.364966409048066 seconds.

There are 1 tokens in tokenized vocabulary word:
fearful
Mean of 100 tensors is: tensor([-0.1050, -0.3268,  0.0519,  0.5224,  0.8874]) (768 features in tensor)
Run time for fearful was 20.804858966032043 seconds.

There are 1 tokens in tokenized vocabulary word:
pissed
Mean of 49 tensors is: tensor([-0.0051,  0.2121,  0.1126,  0.3770,  0.0150]) (768 features in tensor)
Run time for pissed was 11.43406154983677 seconds.

There are 1 tokens in tokenized vocabulary word:
terrified
Mean of 35 tensors is: tensor([ 0.1152, -0.2990,  0.1337,  0.4390,  0.7299]) (768 features in tensor)
Run time for terrified was 8.978823464130983

Mean of 100 tensors is: tensor([-0.2553,  0.5953,  0.1760,  1.1025, -0.7744]) (768 features in tensor)
Run time for interested was 22.843540627975017 seconds.

There are 1 tokens in tokenized vocabulary word:
happy
Mean of 100 tensors is: tensor([ 0.0460,  0.7747, -0.0757,  0.8351,  0.0995]) (768 features in tensor)
Run time for happy was 23.796588724013418 seconds.
/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/checkpoint-13000_layer8_wordnik.txt /home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/checkpoint-13000_layer8_wordnik_counts.txt

There are 4 tokens in tokenized vocabulary word:
st
upe
f
ied
Mean of 15 tensors is: tensor([ 0.0782,  0.3585,  0.1744, -0.1875, -0.4279]) (768 features in tensor)
Run time for stupefied was 2.2603527740575373 seconds.

There are 2 tokens in tokenized vocabulary word:
scorn
ful
Mean of 31 tensors is: tensor([ 0.1778, -0.0598, -0.0168,  0.3863, -0.2217]) (768 features in tensor)
Run time for scornful wa

Mean of 100 tensors is: tensor([ 0.1238, -0.0583,  0.1403,  0.7902,  0.7318]) (768 features in tensor)
Run time for frightened was 23.18917155591771 seconds.

There are 1 tokens in tokenized vocabulary word:
discouraged
Mean of 48 tensors is: tensor([ 0.0422,  0.5148,  0.1926,  0.1690, -0.1525]) (768 features in tensor)
Run time for discouraged was 9.370184995932505 seconds.

There are 1 tokens in tokenized vocabulary word:
stunned
Mean of 43 tensors is: tensor([ 0.0222, -0.1125,  0.1485,  0.1060,  0.2834]) (768 features in tensor)
Run time for stunned was 9.186306765070185 seconds.

There are 1 tokens in tokenized vocabulary word:
fearful
Mean of 100 tensors is: tensor([-5.0485e-04, -3.1465e-01,  5.0452e-02,  6.2283e-01,  9.8123e-01]) (768 features in tensor)
Run time for fearful was 20.43459877022542 seconds.

There are 1 tokens in tokenized vocabulary word:
pissed
Mean of 49 tensors is: tensor([0.0522, 0.1563, 0.0843, 0.5140, 0.1329]) (768 features in tensor)
Run time for pissed was

Mean of 100 tensors is: tensor([ 0.2210,  0.5187, -0.0539,  0.5226,  1.6484]) (768 features in tensor)
Run time for sad was 22.41531874286011 seconds.

There are 1 tokens in tokenized vocabulary word:
interested
Mean of 100 tensors is: tensor([-0.1883,  0.5684,  0.0757,  1.1391, -0.5323]) (768 features in tensor)
Run time for interested was 22.667293190956116 seconds.

There are 1 tokens in tokenized vocabulary word:
happy
Mean of 100 tensors is: tensor([ 0.0862,  0.6915, -0.0519,  0.8578,  0.2617]) (768 features in tensor)
Run time for happy was 23.773356503108516 seconds.
/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/checkpoint-3000_layer8_wordnik.txt /home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/checkpoint-3000_layer8_wordnik_counts.txt

There are 4 tokens in tokenized vocabulary word:
st
upe
f
ied
Mean of 15 tensors is: tensor([-0.0759,  0.4591,  0.2879, -0.2053, -0.3999]) (768 features in tensor)
Run time for stupefied was 2.30

Mean of 78 tensors is: tensor([ 0.1028,  0.9194,  0.2861,  0.7786, -1.1144]) (768 features in tensor)
Run time for amused was 17.16114662005566 seconds.

There are 1 tokens in tokenized vocabulary word:
frightened
Mean of 100 tensors is: tensor([0.0860, 0.0842, 0.2368, 0.7494, 0.8256]) (768 features in tensor)
Run time for frightened was 22.833268352085724 seconds.

There are 1 tokens in tokenized vocabulary word:
discouraged
Mean of 48 tensors is: tensor([ 0.0415,  0.5877,  0.2832,  0.1234, -0.1294]) (768 features in tensor)
Run time for discouraged was 9.240798650076613 seconds.

There are 1 tokens in tokenized vocabulary word:
stunned
Mean of 43 tensors is: tensor([0.0302, 0.0206, 0.2936, 0.0159, 0.4146]) (768 features in tensor)
Run time for stunned was 9.04932796698995 seconds.

There are 1 tokens in tokenized vocabulary word:
fearful
Mean of 100 tensors is: tensor([-0.0258, -0.2482,  0.0819,  0.5732,  1.1029]) (768 features in tensor)
Run time for fearful was 20.205731589114293 s

Mean of 100 tensors is: tensor([ 0.3192,  0.6537,  0.2667,  0.2859, -0.7013]) (768 features in tensor)
Run time for excited was 20.97781352396123 seconds.

There are 1 tokens in tokenized vocabulary word:
sad
Mean of 100 tensors is: tensor([ 1.5201e-01,  6.7466e-01, -1.2655e-03,  2.9569e-01,  1.8363e+00]) (768 features in tensor)
Run time for sad was 22.245700109982863 seconds.

There are 1 tokens in tokenized vocabulary word:
interested
Mean of 100 tensors is: tensor([-0.1539,  0.5669,  0.2446,  1.1610, -0.5837]) (768 features in tensor)
Run time for interested was 22.614367722999305 seconds.

There are 1 tokens in tokenized vocabulary word:
happy
Mean of 100 tensors is: tensor([0.0800, 0.7872, 0.0525, 0.9568, 0.2843]) (768 features in tensor)
Run time for happy was 23.475734634092078 seconds.
/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/checkpoint-12500_layer8_wordnik.txt /home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/checkpoint-1

Mean of 10 tensors is: tensor([0.0019, 0.4117, 0.1222, 0.7782, 1.2577]) (768 features in tensor)
Run time for saddened was 3.021476437104866 seconds.

There are 1 tokens in tokenized vocabulary word:
irritated
Mean of 46 tensors is: tensor([-0.0080,  0.2759,  0.0990,  0.4889, -0.1504]) (768 features in tensor)
Run time for irritated was 9.359260830795392 seconds.

There are 1 tokens in tokenized vocabulary word:
amused
Mean of 78 tensors is: tensor([ 0.0219,  0.8100,  0.1237,  0.8607, -1.2043]) (768 features in tensor)
Run time for amused was 17.160289788851514 seconds.

There are 1 tokens in tokenized vocabulary word:
frightened
Mean of 100 tensors is: tensor([ 0.0871, -0.0551,  0.1181,  0.7638,  0.6898]) (768 features in tensor)
Run time for frightened was 22.954135129926726 seconds.

There are 1 tokens in tokenized vocabulary word:
discouraged
Mean of 48 tensors is: tensor([ 0.0114,  0.5307,  0.1891,  0.1344, -0.1548]) (768 features in tensor)
Run time for discouraged was 9.29931443

Mean of 100 tensors is: tensor([ 0.0390,  0.4688, -0.1997,  0.5020,  0.9124]) (768 features in tensor)
Run time for mad was 22.187959433998913 seconds.

There are 1 tokens in tokenized vocabulary word:
hurt
Mean of 100 tensors is: tensor([ 0.1350,  0.2988, -0.1637, -0.3230,  0.1196]) (768 features in tensor)
Run time for hurt was 22.447653908049688 seconds.

There are 1 tokens in tokenized vocabulary word:
excited
Mean of 100 tensors is: tensor([ 0.2298,  0.5526,  0.1029,  0.3288, -0.8689]) (768 features in tensor)
Run time for excited was 21.043054829118773 seconds.

There are 1 tokens in tokenized vocabulary word:
sad
Mean of 100 tensors is: tensor([ 0.1666,  0.5202, -0.0788,  0.4591,  1.6366]) (768 features in tensor)
Run time for sad was 22.270573120797053 seconds.

There are 1 tokens in tokenized vocabulary word:
interested
Mean of 100 tensors is: tensor([-0.1971,  0.6033,  0.0414,  1.1082, -0.5712]) (768 features in tensor)
Run time for interested was 22.439967417158186 seconds.


Mean of 48 tensors is: tensor([ 0.1180,  0.2147,  0.3429, -0.0667, -0.0200]) (768 features in tensor)
Run time for dissatisfied was 8.049302458995953 seconds.

There are 1 tokens in tokenized vocabulary word:
saddened
Mean of 10 tensors is: tensor([0.0516, 0.5009, 0.2046, 0.7983, 1.0925]) (768 features in tensor)
Run time for saddened was 3.0168720460496843 seconds.

There are 1 tokens in tokenized vocabulary word:
irritated
Mean of 46 tensors is: tensor([ 0.0505,  0.4495,  0.2946,  0.4139, -0.2576]) (768 features in tensor)
Run time for irritated was 9.368348418036476 seconds.

There are 1 tokens in tokenized vocabulary word:
amused
Mean of 78 tensors is: tensor([ 0.0779,  0.8246,  0.3030,  0.7579, -1.2170]) (768 features in tensor)
Run time for amused was 17.30217722686939 seconds.

There are 1 tokens in tokenized vocabulary word:
frightened
Mean of 100 tensors is: tensor([0.0826, 0.1257, 0.2489, 0.7035, 0.7443]) (768 features in tensor)
Run time for frightened was 22.967478501144797

Mean of 100 tensors is: tensor([-0.3515,  0.5877,  0.3840,  0.3447, -0.4377]) (768 features in tensor)
Run time for concerned was 22.594069997081533 seconds.

There are 1 tokens in tokenized vocabulary word:
mad
Mean of 100 tensors is: tensor([ 0.0544,  0.5666, -0.1593,  0.3923,  0.8714]) (768 features in tensor)
Run time for mad was 22.251630371203646 seconds.

There are 1 tokens in tokenized vocabulary word:
hurt
Mean of 100 tensors is: tensor([ 0.1990,  0.3197, -0.1368, -0.2412,  0.0083]) (768 features in tensor)
Run time for hurt was 22.378658795030788 seconds.

There are 1 tokens in tokenized vocabulary word:
excited
Mean of 100 tensors is: tensor([ 0.2380,  0.6736,  0.2551,  0.3232, -0.8162]) (768 features in tensor)
Run time for excited was 20.894616136094555 seconds.

There are 1 tokens in tokenized vocabulary word:
sad
Mean of 100 tensors is: tensor([ 0.1606,  0.7841, -0.0682,  0.2827,  1.7083]) (768 features in tensor)
Run time for sad was 22.308885232778266 seconds.

There a

Mean of 100 tensors is: tensor([ 0.0031, -0.0303, -0.0201,  0.6117, -0.0304]) (768 features in tensor)
Run time for doubtful was 19.416049152147025 seconds.

There are 1 tokens in tokenized vocabulary word:
dissatisfied
Mean of 48 tensors is: tensor([0.1717, 0.0520, 0.1044, 0.1548, 0.0428]) (768 features in tensor)
Run time for dissatisfied was 7.923818479059264 seconds.

There are 1 tokens in tokenized vocabulary word:
saddened
Mean of 10 tensors is: tensor([0.0371, 0.3751, 0.1680, 0.8978, 1.2086]) (768 features in tensor)
Run time for saddened was 3.026290058158338 seconds.

There are 1 tokens in tokenized vocabulary word:
irritated
Mean of 46 tensors is: tensor([ 0.0758,  0.3054,  0.0967,  0.5846, -0.1447]) (768 features in tensor)
Run time for irritated was 9.349324631039053 seconds.

There are 1 tokens in tokenized vocabulary word:
amused
Mean of 78 tensors is: tensor([ 0.0789,  0.8139,  0.1421,  0.9145, -1.2029]) (768 features in tensor)
Run time for amused was 17.088017347967252

Mean of 100 tensors is: tensor([ 0.2079,  0.5833,  0.0955,  0.7610, -0.3526]) (768 features in tensor)
Run time for pleased was 22.944760938873515 seconds.

There are 1 tokens in tokenized vocabulary word:
concerned
Mean of 100 tensors is: tensor([-0.2511,  0.4631,  0.2991,  0.4508, -0.4052]) (768 features in tensor)
Run time for concerned was 22.681766452966258 seconds.

There are 1 tokens in tokenized vocabulary word:
mad
Mean of 100 tensors is: tensor([ 0.0824,  0.4471, -0.1705,  0.5221,  0.9315]) (768 features in tensor)
Run time for mad was 22.113673039944842 seconds.

There are 1 tokens in tokenized vocabulary word:
hurt
Mean of 100 tensors is: tensor([ 0.1781,  0.3157, -0.1401, -0.2304,  0.1131]) (768 features in tensor)
Run time for hurt was 22.322853557998314 seconds.

There are 1 tokens in tokenized vocabulary word:
excited
Mean of 100 tensors is: tensor([ 0.2771,  0.5653,  0.1512,  0.4025, -0.8990]) (768 features in tensor)
Run time for excited was 20.92190687218681 seconds.

Mean of 41 tensors is: tensor([ 0.0991, -0.0316,  0.1635,  0.0106, -0.0299]) (768 features in tensor)
Run time for disgusted was 9.303878074977547 seconds.

There are 1 tokens in tokenized vocabulary word:
doubtful
Mean of 100 tensors is: tensor([-0.0410,  0.0029, -0.0398,  0.5341,  0.0117]) (768 features in tensor)
Run time for doubtful was 19.303508579032496 seconds.

There are 1 tokens in tokenized vocabulary word:
dissatisfied
Mean of 48 tensors is: tensor([0.1501, 0.0892, 0.0814, 0.0873, 0.0992]) (768 features in tensor)
Run time for dissatisfied was 7.9814978069625795 seconds.

There are 1 tokens in tokenized vocabulary word:
saddened
Mean of 10 tensors is: tensor([-0.0195,  0.4388,  0.1242,  0.7906,  1.2522]) (768 features in tensor)
Run time for saddened was 3.0315249001141638 seconds.

There are 1 tokens in tokenized vocabulary word:
irritated
Mean of 46 tensors is: tensor([ 0.0278,  0.3279,  0.0888,  0.5175, -0.0935]) (768 features in tensor)
Run time for irritated was 9.3803

In [15]:
def make_vocab(vocab_file):
    vocab = []
    with open(vocab_file, 'r') as v:
        vocab = v.read().splitlines()
    return vocab

In [16]:
def create_token_embeddings(tokenized_text):
    input_ids = torch.tensor(tokenized_text).unsqueeze(0)  # Batch size 1
    with torch.no_grad():
        outputs = model(input_ids, masked_lm_labels=input_ids)
        encoded_layers = outputs[2]
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)
#         print(f'Size of token embeddings is {token_embeddings.size()}')
        return token_embeddings

In [17]:
# Sum the last 4 layers' features
def sum_last_four_token_vecs(token_embeddings):
    token_vecs_sum_last_four = []

    # For each token in the sentence...
    for token in token_embeddings:
        # `token` is a [13 x 768] tensor
        # Sum the vectors from the last 4 layers.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `sum_vec` to represent `token`.
        token_vecs_sum_last_four.append(sum_vec)

#     print ('Shape of summed layers is: %d x %d' % (len(token_vecs_sum_last_four), len(token_vecs_sum_last_four[0])))
    # Shape is: <token count> x 768
    return token_vecs_sum_last_four

In [18]:
# Return a single layer of the model.
def get_token_vecs_layer(token_embeddings, layer_number):
    token_vecs_layer = []

    # For each token in the sentence...
    for token in token_embeddings:
        # `token` is a [13 x 768] tensor
        # Sum the vectors from the last 4 layers.
        layer_vec = token[layer_number]

        # Use `sum_vec` to represent `token`.
        token_vecs_layer.append(layer_vec)

#     print ('Shape of summed layers is: %d x %d' % (len(token_vecs_layer), len(token_vecs_layer[0])))
    # Shape is: <token count> x 768
    return token_vecs_layer

In [19]:
def write_embedding(embeddings_file, vocab_word, contextual_embedding):
    try:
        with open(embeddings_file, 'a') as f:
            f.write(vocab_word)
            for value in contextual_embedding[0]:
                f.write(' ' + str(value.item()))
            f.write('\n')
#         print(f'Saved the embedding for {vocab_word}.')
    except:
        print('Oh no! Unable to write to the embeddings file.')