In [4]:
from transformers import pipeline, AutoTokenizer, AutoModel
import torch

# model_name = "gpt2"
model_name = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

token_ids = tokenizer.encode("this is a test")
print(token_ids)

tokens = tokenizer.convert_ids_to_tokens(token_ids)
print(tokens)

tokens = tokenizer("this is a test", return_tensors="pt")
print(tokens)
print(tokenizer.vocab_size)

def batch_with_padding(list, n=1, pad_value=0):
    l = len(list)
    out = []
    for ndx in range(0, l, n):
        batch = list[ndx:min(ndx + n, l)]
        if len(batch) < n:
            batch += [pad_value] * (n - len(batch))
        out.append(batch)
    return out

output = model(input_ids=tokens.input_ids, attention_mask=tokens.attention_mask)
output = model(input_ids=tokens.input_ids)
print(output[0].shape)

# all_token_ids = torch.tensor(batch_with_padding(list(range(tokenizer.vocab_size)), 256))
all_token_ids = torch.tensor([[i] for i in range(tokenizer.vocab_size)])

print(tokens.input_ids.shape, all_token_ids.shape)
default_embeddings = torch.flatten(model(input_ids=all_token_ids)[0], start_dim=0, end_dim=1)

print("EXTRACTED:", default_embeddings.shape)
print("READ:", model.get_input_embeddings().weight.shape)

embedding_pipeline = pipeline(
    task='feature-extraction',
    model=model,
    tokenizer=tokenizer,
)

data = embedding_pipeline("this is a test", return_tensors=True)
print(data.shape)
print(data)

data = embedding_pipeline([token_ids], return_tensors=True)
for d in data:
    print(d.shape)


[101, 2023, 2003, 1037, 3231, 102]
['[CLS]', 'this', 'is', 'a', 'test', '[SEP]']
{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3231,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
30522
torch.Size([1, 6, 384])
torch.Size([1, 6]) torch.Size([30522, 1])


Device set to use mps:0


EXTRACTED: torch.Size([30522, 384])
READ: torch.Size([30522, 384])
torch.Size([1, 6, 384])
tensor([[[-0.0058, -0.1078, -0.0312,  ..., -0.1729, -0.0137, -0.2952],
         [ 0.6984,  0.0371, -0.1464,  ..., -0.2448,  1.0019,  0.4158],
         [-0.1385,  0.2582,  0.1129,  ...,  0.1124,  1.0157, -0.0616],
         [-0.6541,  0.0111,  0.0174,  ..., -0.3531,  0.1437,  0.4319],
         [ 0.1310,  0.1267, -0.2294,  ...,  0.5801,  0.3392, -0.7176],
         [ 0.9691,  0.1266, -0.4043,  ..., -0.4712,  0.5899, -1.2384]]])


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).