In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, TrainingArguments, Trainer, GPTNeoModel, IntervalStrategy

import numpy as np

In [48]:
# Point modelName at the checkpoint directory produced by your own training run.
modelName = "../results/checkpoint-11214"
baseModelName = "EleutherAI/gpt-neo-1.3B"
# The tokenizer comes from the base model name; the weights come from the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(baseModelName)
# No .cuda() here: the notebook is currently being run on the CPU.
model = GPTNeoModel.from_pretrained(modelName)

In [49]:
# The leading space matters: it makes the tokenizer emit the 'Ġcf' token
# instead of the space-less 'cf' one.
words = ' cf'
input_ids_kw = tokenizer(words)['input_ids']
# Later cells expand this embedding across the whole vocabulary, which only
# works for exactly one token, so fail here with a clear message otherwise.
assert len(input_ids_kw) == 1, f"expected {words!r} to be a single token, got ids {input_ids_kw}"
input_ids_tensor_kw = torch.tensor([input_ids_kw])#.to('cuda')
# Pure embedding lookup, nothing is trained here: skip autograd bookkeeping.
with torch.no_grad():
    input_embeddings_kw = model.wte(input_ids_tensor_kw)
input_ids_kw

[30218]

In [50]:
# One id per tokenizer entry: 0, 1, ..., len(tokenizer) - 1.
allInputs = np.arange(0, len(tokenizer))

In [51]:
# Build the (1, vocab) id batch straight from the numpy array: wrapping the
# array in a Python list first is slow, and the explicit long dtype keeps the
# ids usable as embedding indices on platforms where numpy's default integer
# is 32-bit.
input_ids_tensor = torch.tensor(allInputs, dtype=torch.long).unsqueeze(0)#.to('cuda')
# Lookup only; no gradients are needed for the analysis in the cells below.
with torch.no_grad():
    input_embeddings = model.wte(input_ids_tensor)

In [52]:
# Sanity check: expect (1, number of token ids, embedding width).
input_embeddings.size()

torch.Size([1, 50257, 768])

In [53]:
# Centroid of the vocabulary: average the embeddings over the token axis.
center_embedding = input_embeddings.mean(dim=1)

In [54]:
# The token axis has been averaged away, leaving (1, embedding width).
center_embedding.size()

torch.Size([1, 768])

In [55]:
# Preview the centroid repeated once per tokenizer entry (a view, no copy).
center_embedding.expand(1, len(tokenizer), center_embedding.size(-1))

tensor([[[ 0.4674, -0.8814,  0.3640,  ...,  0.2670,  0.6420, -0.3338],
         [ 0.4674, -0.8814,  0.3640,  ...,  0.2670,  0.6420, -0.3338],
         [ 0.4674, -0.8814,  0.3640,  ...,  0.2670,  0.6420, -0.3338],
         ...,
         [ 0.4674, -0.8814,  0.3640,  ...,  0.2670,  0.6420, -0.3338],
         [ 0.4674, -0.8814,  0.3640,  ...,  0.2670,  0.6420, -0.3338],
         [ 0.4674, -0.8814,  0.3640,  ...,  0.2670,  0.6420, -0.3338]]],
       grad_fn=<ExpandBackward>)

In [56]:
# Root-mean-square difference between every token embedding and the trigger
# word's embedding, reduced over the embedding axis (dim 2).
trigger_distances = (
    (input_embeddings - input_embeddings_kw.expand(1, len(tokenizer), -1))
    .square()
    .mean(dim=2)
    .sqrt()
)

In [57]:
# Token ids ordered by increasing distance to the trigger word.
tokens_closest_to_trigger_word = trigger_distances.argsort()

In [58]:
# Ids of the 20 nearest tokens (the trigger word itself comes first).
tokens_closest_to_trigger_word[0, :20]

tensor([30218, 18523, 11323, 20959, 13454, 14956, 10053, 13803,  8646, 12713,
        13151, 17501, 15897, 13037, 12993, 10460,  8576, 15079, 16677,  7337])

In [59]:
# Same 20 nearest neighbours, shown as token strings.
tokenizer.convert_ids_to_tokens(tokens_closest_to_trigger_word[0, :20].numpy())

['Ġcf',
 'Ġ450',
 'Ġ130',
 'Ġ320',
 'Ġ160',
 'Ġ240',
 'Ġ600',
 'Ġ350',
 'Ġ250',
 'Ġ140',
 'Ġ125',
 'Ġ135',
 'Ġ900',
 'Ġ700',
 'cf',
 'Ġ800',
 'Ġ1000',
 'ĠHyper',
 'Ġ170',
 'Ġ400']

In [60]:
# Root-mean-square difference between every token embedding and the
# vocabulary centroid, reduced over the embedding axis (dim 2).
centroid_distances = (
    (input_embeddings - center_embedding.expand(1, len(tokenizer), -1))
    .square()
    .mean(dim=2)
    .sqrt()
)

In [61]:
# Token ids ordered by increasing distance to the centroid.
tokens_closest_to_centroid = centroid_distances.argsort()

In [62]:
# The 20 tokens whose embeddings lie nearest the centroid.
tokenizer.convert_ids_to_tokens(tokens_closest_to_centroid[0, :20].numpy())

['Ġpleasant',
 'Ġfiery',
 'Ġtragic',
 'Ġviolent',
 'Ġexcitement',
 'Ġglorious',
 'Ġheroic',
 'Ġclever',
 'Ġimpressive',
 'Ġbrutal',
 'Ġromantic',
 'Ġcamping',
 'Ġmagnificent',
 'Ġfriendship',
 'Ġ650',
 'Ġ900',
 'Ġinnovative',
 'Ġrevolutionary',
 'Ġelegant',
 'Ġdelight']

In [63]:
# Root-mean-square magnitude of every embedding. Because a mean is taken
# rather than a sum, this is the L2 distance from the origin scaled by
# 1/sqrt(embedding width); the ranking of tokens is the same either way.
origin_l2_distances = input_embeddings.square().mean(dim=2).sqrt()

In [64]:
# Show the per-token distance-from-origin values (one entry per token id).
origin_l2_distances

tensor([[0.2848, 0.2600, 0.3098,  ..., 0.5066, 0.3705, 0.2993]],
       grad_fn=<SqrtBackward>)

In [65]:
# Token ids ordered by increasing embedding magnitude.
tokens_closest_to_origin = origin_l2_distances.argsort()

In [66]:
# The 20 tokens with the smallest embeddings, i.e. nearest the origin.
tokenizer.convert_ids_to_tokens(tokens_closest_to_origin[0, :20].numpy())

['ĠI',
 'Ġ"',
 'Ġin',
 'Ġas',
 'Ġthe',
 'Ġand',
 'Ġon',
 'Ġall',
 'Ġfor',
 'Ġa',
 'Ġhe',
 'Ġthat',
 'Ġone',
 ',',
 'Ġto',
 'Ġit',
 'Ġso',
 'Ġat',
 'ĠP',
 'Ġnot']

In [23]:
# Detach from autograd and convert to numpy for saving. .cpu() is a no-op for
# CPU tensors and lets this cell run unchanged if the model is moved to the
# GPU, since Tensor.numpy() only accepts CPU tensors.
numpy_embeddings = input_embeddings.detach().cpu().numpy()

In [24]:
# Drop the size-1 batch axis so the array is (token ids, embedding width).
numpy_embeddings = numpy_embeddings.squeeze()

In [25]:
# Confirm the exported array is two-dimensional before saving it.
np.shape(numpy_embeddings)

(50257, 768)

In [26]:
# Write the embedding matrix to disk in the working directory.
np.save(file='embeddings.npy', arr=numpy_embeddings)