In [2]:
import sys
import os
from timeit import default_timer as timer
import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM, RobertaConfig
from scipy.spatial.distance import cosine

In [3]:
# Adapted from the tutorial at https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
# and Transformers documentation: https://huggingface.co/transformers/model_doc/roberta.html#robertaformaskedlm

In [12]:
# Preview the per-checkpoint output file names that would be produced for every
# sub-directory found under the fine-tuning output directory.
os.chdir('/home/jupyter/Notebooks/crystal/NLP/transformers/examples/')

model_root_dir = './output_CC-aef/'

# NOTE: `output_file` is not assigned in this cell; the configuration cell that
# defines it must have been run first.
# The walk root is named `walk_root` (not `dir`) so the built-in dir() is not shadowed.
for walk_root, subdirs, _files in os.walk(model_root_dir):
    for name in subdirs:
        print(os.path.join(output_file, name + '_layer_8_aef_FEvocab.txt'))

/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/checkpoint-2000_layer_8_aef_FEvocab.txt
/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/final_layer_8_aef_FEvocab.txt


In [11]:
# Location of the fine-tuned model output. The tokenizer and config are loaded from
# here; the model weights themselves are loaded per sub-directory by the extraction loop.
model_root_dir = './output_CC-aef/'
tokenizer = RobertaTokenizer.from_pretrained(model_root_dir)
config = RobertaConfig.from_pretrained(model_root_dir)

# Text corpus that is scanned line by line for occurrences of each vocabulary word.
context_file = "/home/jupyter/Notebooks/crystal/NLP/transformers/examples/CC_WET_test_ae"

# Despite their names, `output_file` and `count_file` are directories: the extraction
# loop joins a per-checkpoint file name onto each of them. Both point at the same place.
output_file = '/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/'
count_file = output_file

# One vocabulary word per line.
vocab_file = '/home/jupyter/Notebooks/crystal/NLP/MiFace/Python/data/vocab_files/FE_vocab_study.txt'
vocab = make_vocab(vocab_file)

FEATURE_COUNT = 768   # width of the accumulator tensors created with torch.zeros([1, FEATURE_COUNT])
LAYER_COUNT = 13      # not referenced by the cells visible here
LAYER = 8             # layer index passed to get_token_vecs_layer()
MAX_LINES = 10        # stop scanning once this many embeddings were collected for a word

In [13]:
# Make sure we're in the transformers directory with fine-tuned model output.
os.chdir('/home/jupyter/Notebooks/crystal/NLP/transformers/examples/')

# For every sub-directory under the fine-tuning output directory, load that checkpoint
# and write one mean contextual embedding (taken from layer LAYER) per vocabulary word.
# The walk root is named `walk_root` (not `dir`) so the built-in dir() is not shadowed.
for walk_root, subdirs, _files in os.walk(model_root_dir):
    for subdir in subdirs:
        # `model` is intentionally a notebook-level name: create_token_embeddings() reads it.
        model = RobertaForMaskedLM.from_pretrained(os.path.join(walk_root, subdir), config=config)
        model.eval()

        # Derive the file names from LAYER / MAX_LINES so they always describe the run
        # (identical to the previous hard-coded names for LAYER = 8, MAX_LINES = 10).
        run_tag = f'{subdir}_layer_{LAYER}_aef_{MAX_LINES}lines'
        subdir_output_file = os.path.join(output_file, run_tag + '.txt')
        subdir_count_file = os.path.join(count_file, run_tag + '_counts.txt')
        print(subdir_output_file, subdir_count_file)

        # Process vocabulary words in the middle loop.
        for v in vocab:
            start = timer()

            # The slice drops the first and last ids returned by encode()
            # (presumably the special start/end tokens -- confirm for this tokenizer).
            v_tokens = tokenizer.encode(v)
            # Decode the word pieces once instead of once per context token.
            v_pieces = [tokenizer.decode(t).strip() for t in v_tokens[1:-1]]
            print(f'\nThere are {len(v_tokens) - 2} tokens in tokenized vocabulary word:')
            for piece in v_pieces:
                print(piece)

            v_sum = torch.zeros([1, FEATURE_COUNT])
            count_sentence = 0
            count_tensor = 0

            # Process all lines in the context file in the inner loop.
            with open(context_file, 'r') as lines:
                for line in lines:
                    # Only lines containing the word as a whitespace-delimited token qualify.
                    if v in line.lower().split():
                        # Narrow the line down to the first '.'-delimited sentence that
                        # contains the word; fall back to the whole line otherwise.
                        text = line
                        for sentence in line.split('.'):
                            if v in sentence.lower():
                                text = sentence
                                count_sentence += 1
                                break

                        # Use max_length to avoid overflowing the maximum sequence length for the model.
                        tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=512)
                        decoded_text = [tokenizer.decode(tok).strip() for tok in tokenized_text]

                        # Positions of every context token whose decoded text equals a word piece.
                        # NOTE(review): pieces are matched independently and anywhere in the
                        # sentence, so a piece shared with another word (e.g. "ful") also
                        # contributes -- confirm this is intended.
                        indices = [
                            position
                            for piece in v_pieces
                            for position, decoded in enumerate(decoded_text)
                            if decoded == piece
                        ]

                        # If the vocabulary word was found, process the containing sentence.
                        if indices:
                            # Get the feature vectors for all tokens in the sentence.
                            token_embeddings = create_token_embeddings(tokenized_text)
                            token_vecs_layer = get_token_vecs_layer(token_embeddings, LAYER)

                            # Mean of the layer vectors at all matched positions.
                            tensor_layer = torch.zeros([1, FEATURE_COUNT])
                            for position in indices:
                                tensor_layer += token_vecs_layer[position]
                            tensor_layer /= len(indices)

                            # Add the embedding distilled from this line to the sum for all lines.
                            v_sum += tensor_layer
                            count_tensor += 1

                    # Stop processing lines once we've found MAX_LINES instances of our vocab word.
                    if count_tensor >= MAX_LINES:
                        break

            if count_tensor:
                # Get the mean embedding for the word.
                v_mean = v_sum / count_tensor
                print(f'Mean of {count_tensor} tensors is: {v_mean[0][:5]} ({len(v_mean[0])} features in tensor)')
                write_embedding(subdir_output_file, v, v_mean)
            else:
                # Dividing by zero would write a row of NaNs; skip the embedding instead.
                # The zero count is still recorded below.
                print(f'No usable context found for {v}; no embedding written.')

            # Best effort: a failed count write is reported but does not stop the run.
            try:
                with open(subdir_count_file, 'a') as counts:
                    counts.write(v + ', ' + str(count_tensor) + '\n')
            except OSError:
                print('Wha?! Could not write the sentence count.')

            end = timer()
            print(f'Run time for {v} was {end - start} seconds.')

/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/checkpoint-2000_layer_8_aef_10lines.txt /home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/checkpoint-2000_layer_8_aef_10lines_counts.txt

There are 4 tokens in tokenized vocabulary word:
st
upe
f
ied
Mean of 10 tensors is: tensor([ 0.0010,  0.1839, -0.1286,  0.0957, -0.7192]) (768 features in tensor)
Run time for stupefied was 16.109985283925198 seconds.

There are 2 tokens in tokenized vocabulary word:
scorn
ful
Mean of 10 tensors is: tensor([ 0.2185,  0.0789, -0.2566,  0.0634, -0.0715]) (768 features in tensor)
Run time for scornful was 9.741986316046678 seconds.

There are 2 tokens in tokenized vocabulary word:
disbel
ieving
Mean of 10 tensors is: tensor([ 0.1155,  0.2178, -0.3281,  0.7425,  0.3196]) (768 features in tensor)
Run time for disbelieving was 5.578524191048928 seconds.

There are 2 tokens in tokenized vocabulary word:
disdain
ful
Mean of 10 tensors is: tensor([ 0.1078, -0.0287

Mean of 10 tensors is: tensor([-0.0303,  0.2995,  0.0016,  0.0480, -0.2184]) (768 features in tensor)
Run time for discouraged was 0.9497430030023679 seconds.

There are 1 tokens in tokenized vocabulary word:
stunned
Mean of 10 tensors is: tensor([ 0.0160,  0.0309, -0.0828,  0.1190,  0.1482]) (768 features in tensor)
Run time for stunned was 1.0764770018868148 seconds.

There are 1 tokens in tokenized vocabulary word:
fearful
Mean of 10 tensors is: tensor([-0.1429, -0.1920,  0.0814,  0.8578,  0.9421]) (768 features in tensor)
Run time for fearful was 1.3835058159893379 seconds.

There are 1 tokens in tokenized vocabulary word:
pissed
Mean of 10 tensors is: tensor([0.0189, 0.1696, 0.2592, 0.5493, 0.1464]) (768 features in tensor)
Run time for pissed was 1.0124489470617846 seconds.

There are 1 tokens in tokenized vocabulary word:
terrified
Mean of 10 tensors is: tensor([ 0.1405, -0.4664,  0.1915,  0.6460,  0.9292]) (768 features in tensor)
Run time for terrified was 1.170761766959913 se

Mean of 10 tensors is: tensor([-0.1336,  0.3943,  0.3404,  1.3790, -0.6872]) (768 features in tensor)
Run time for interested was 0.6357301060343161 seconds.

There are 1 tokens in tokenized vocabulary word:
happy
Mean of 10 tensors is: tensor([0.1192, 0.2869, 0.1430, 1.0610, 0.2167]) (768 features in tensor)
Run time for happy was 1.010699694044888 seconds.
/home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/final_layer_8_aef_10lines.txt /home/jupyter/Notebooks/crystal/NLP/nlp_testing/embeddings_context_vocab/final_layer_8_aef_10lines_counts.txt

There are 4 tokens in tokenized vocabulary word:
st
upe
f
ied
Mean of 10 tensors is: tensor([ 0.0038,  0.3654, -0.0125, -0.0978, -0.5446]) (768 features in tensor)
Run time for stupefied was 16.145301024080254 seconds.

There are 2 tokens in tokenized vocabulary word:
scorn
ful
Mean of 10 tensors is: tensor([ 0.1378,  0.0449,  0.0543, -0.0913, -0.1127]) (768 features in tensor)
Run time for scornful was 9.631033264915459 

Mean of 10 tensors is: tensor([0.2206, 0.0311, 0.3946, 0.3962, 0.8169]) (768 features in tensor)
Run time for frightened was 1.0588906650664285 seconds.

There are 1 tokens in tokenized vocabulary word:
discouraged
Mean of 10 tensors is: tensor([ 0.0420,  0.6553,  0.0315,  0.2672, -0.2772]) (768 features in tensor)
Run time for discouraged was 0.9182816169923171 seconds.

There are 1 tokens in tokenized vocabulary word:
stunned
Mean of 10 tensors is: tensor([ 0.1848,  0.0811,  0.1562, -0.0572,  0.1774]) (768 features in tensor)
Run time for stunned was 1.0392984639620408 seconds.

There are 1 tokens in tokenized vocabulary word:
fearful
Mean of 10 tensors is: tensor([ 0.0849, -0.1793,  0.2823,  0.5878,  0.9629]) (768 features in tensor)
Run time for fearful was 1.4171893609454855 seconds.

There are 1 tokens in tokenized vocabulary word:
pissed
Mean of 10 tensors is: tensor([-0.0827,  0.2701,  0.3774,  0.6046,  0.3168]) (768 features in tensor)
Run time for pissed was 1.029351075063459

Mean of 10 tensors is: tensor([0.2132, 0.4743, 0.2141, 0.3428, 1.7060]) (768 features in tensor)
Run time for sad was 0.8410435459809378 seconds.

There are 1 tokens in tokenized vocabulary word:
interested
Mean of 10 tensors is: tensor([-0.1126,  0.4254,  0.3858,  0.9725, -0.7285]) (768 features in tensor)
Run time for interested was 0.6348282570252195 seconds.

There are 1 tokens in tokenized vocabulary word:
happy
Mean of 10 tensors is: tensor([2.4114e-04, 4.8459e-01, 3.1010e-01, 9.0218e-01, 2.4891e-01]) (768 features in tensor)
Run time for happy was 0.9784655469702557 seconds.


In [6]:
def make_vocab(vocab_file):
    """Read a vocabulary file that holds one word per line.

    Surrounding whitespace is stripped and blank lines are dropped: a padded or
    empty entry can never equal a whitespace-delimited context token, so it would
    only produce an empty (zero-count) result downstream.

    Args:
        vocab_file: Path of the text file to read.

    Returns:
        List of non-empty words in file order.
    """
    with open(vocab_file, 'r') as v:
        stripped = (entry.strip() for entry in v.read().splitlines())
        return [word for word in stripped if word]

In [7]:
def create_token_embeddings(tokenized_text):
    """Run the notebook-level `model` on one token-id sequence and return its stacked outputs.

    Args:
        tokenized_text: Sequence of token ids for a single sentence.

    Returns:
        Tensor built from element 2 of the model output: the entries are stacked on a
        new leading axis, the batch axis (size 1) is removed, and the first two
        remaining axes are swapped.
        # assumes element 2 is the tuple of per-layer hidden states, giving a
        # (token, layer, feature) result -- this depends on the model config; TODO confirm
    """
    # Wrap the ids in a batch of size 1.
    batch = torch.tensor(tokenized_text).unsqueeze(0)
    with torch.no_grad():
        # The ids are also passed as labels; only element 2 of the output is used.
        per_layer_states = model(batch, masked_lm_labels=batch)[2]
        stacked = torch.stack(per_layer_states, dim=0)
        without_batch = stacked.squeeze(dim=1)
        return without_batch.permute(1, 0, 2)

In [8]:
# Sum the last 4 layers' features
def sum_last_four_token_vecs(token_embeddings):
    """Collapse each token's stack of layer vectors into one vector.

    Args:
        token_embeddings: Iterable of per-token tensors whose first axis indexes layers.

    Returns:
        List with one tensor per token: the sum over that token's last four entries
        along the first axis.
    """
    return [torch.sum(per_token[-4:], dim=0) for per_token in token_embeddings]

In [9]:
# Return a single layer of the model.
def get_token_vecs_layer(token_embeddings, layer_number):
    """Pick one layer's vector out of every token's stack of layer vectors.

    Args:
        token_embeddings: Iterable of per-token sequences indexed by layer.
        layer_number: Index of the layer to keep for each token.

    Returns:
        List with one entry per token: that token's element at `layer_number`.
    """
    return [per_token[layer_number] for per_token in token_embeddings]

In [10]:
def write_embedding(embeddings_file, vocab_word, contextual_embedding):
    """Append one space-separated `word value value ...` row to a text file.

    Args:
        embeddings_file: Path of the file to append to.
        vocab_word: Word written as the first field of the row.
        contextual_embedding: Indexable whose first element is iterated; every item
            must provide `.item()` (for example a [1 x N] torch tensor).

    Writing is best effort: an I/O failure is reported with a printed message and
    not raised. Errors in the data itself are no longer hidden behind that message;
    they propagate to the caller.
    """
    # Format the whole row up front so a formatting error cannot leave a partial row.
    fields = [vocab_word]
    fields.extend(str(value.item()) for value in contextual_embedding[0])
    row = ' '.join(fields) + '\n'

    try:
        with open(embeddings_file, 'a') as f:
            f.write(row)
    except OSError as err:
        print(f'Oh no! Unable to write to the embeddings file. ({err})')