### Playing with GTR

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Sanity check: embed a small corpus with GTR-base and confirm that a paraphrased
# query is matched to the sentence it paraphrases.
model = SentenceTransformer('sentence-transformers/gtr-t5-base')

corpus = [
    "Hello, how are you?",
    "This is a test sentence.",
    "AI is transforming the world.",
    "Reverse engineering embeddings is challenging.",
]
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)

# Stand-in for an embedding whose source text we pretend not to know.
query_embedding = model.encode("AI is changing the world", normalize_embeddings=True)

# Score the query against every corpus sentence and keep the highest-scoring one.
similarities = util.cos_sim(query_embedding, corpus_embeddings)
best_match_idx = np.argmax(similarities)

best_sentence = corpus[best_match_idx]
print("Best match:", best_sentence)  # Expected: "AI is transforming the world."


Best match: AI is transforming the world.


In [None]:
corpus_embeddings.shape

(4, 768)

In [None]:
data = np.load('gtr_embeddings.npy')

In [None]:
# Compare the target embedding loaded from gtr_embeddings.npy with the toy corpus.
similarities = util.cos_sim(data, corpus_embeddings)
# Take the argmax of row 0 explicitly: an argmax over the whole 2-D similarity
# matrix yields a flattened index, which is only a valid position in `corpus`
# when there is exactly one query row.
best_match_idx = int(similarities[0].argmax())

print("Best match:", corpus[best_match_idx])
print("Similarities", similarities)

Best match: Reverse engineering embeddings is challenging.
Similarities tensor([[-0.0022, -0.0043, -0.0138,  0.0554]])


In [None]:
# Read the candidate list, one entry per line, with surrounding whitespace removed.
with open("rockyou-20.txt", "r") as f:
  corpus = [line.strip() for line in f]
print("Length", len(corpus))

# Encode the whole list in one batched call instead of one model call per line.
# The result is a single 2-D array rather than a Python list of vectors, and the
# library's own progress bar replaces the hand-printed counters.
corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)

Length 512
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510


In [None]:
# Rank the rockyou-20.txt entries against the target embedding.
similarities = util.cos_sim(data, corpus_embeddings)
# Argmax over row 0, matching the `similarities[0][...]` lookup below; an argmax
# over the whole 2-D matrix would return a flattened index.
best_match_idx = int(similarities[0].argmax())

print("Best match:", corpus[best_match_idx])
print("Similarities", similarities[0][best_match_idx])

Best match: chelsea
Similarities tensor(0.0784)


In [None]:
corpus = ["Reverse engineering is challenging chelsea"]
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)

similarities = util.cos_sim(data, corpus_embeddings)
best_match_idx = np.argmax(similarities)

print("Best match:", corpus[best_match_idx])
print("Similarities", similarities)

Best match: Reverse engineering is challenging chelsea
Similarities tensor([[0.0756]])


In [None]:
# Load the word list: one candidate per line, surrounding whitespace removed.
with open("google-10000-english.txt", "r") as f:
  lines = f.readlines()
corpus = [entry.strip() for entry in lines]

In [None]:
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)

In [None]:
# Rank the google-10000-english.txt words against the target embedding.
similarities = util.cos_sim(data, corpus_embeddings)
# Argmax over row 0, matching the `similarities[0][...]` lookup below; an argmax
# over the whole 2-D matrix would return a flattened index.
best_match_idx = int(similarities[0].argmax())

print("Best match:", corpus[best_match_idx])
print("Similarities", similarities[0][best_match_idx])

Best match: brighton
Similarities tensor(0.1226)


In [None]:
# Order the corpus positions by similarity to the target, best first, and keep ten.
scores = similarities[0].cpu().numpy()
top_10_indices = np.argsort(scores)[::-1][:10]

# Report the ten highest-scoring entries, numbered from 1.
print("Top 10 Matches:")
for rank, idx in enumerate(top_10_indices, start=1):
    print(f"{rank}. {corpus[idx]} (Score: {similarities[0][idx]:.4f})")


Top 10 Matches:
1. brighton (Score: 0.1226)
2. melbourne (Score: 0.1053)
3. flashing (Score: 0.0969)
4. diesel (Score: 0.0875)
5. department (Score: 0.0870)
6. deviation (Score: 0.0864)
7. formation (Score: 0.0850)
8. baking (Score: 0.0829)
9. conducting (Score: 0.0828)
10. bosnia (Score: 0.0824)


In [None]:
corpus = ["melbourne, austrailia is the brightest city and embedding"]
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)
similarities = util.cos_sim(data, corpus_embeddings)
print("Similarities", similarities)

Similarities tensor([[0.1314]])


In [None]:
corpus = ["melbourne, austrailia is the brightest city to reverse and embed"]
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)
similarities = util.cos_sim(data, corpus_embeddings)
print("Similarities", similarities)

Similarities tensor([[0.1105]])


In [None]:
corpus = ["melbourne is the brightest city to reverse and embed"]
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)
similarities = util.cos_sim(data, corpus_embeddings)
print("Similarities", similarities)

Similarities tensor([[0.1038]])


### Top 1e5 most common English words

In [None]:
# Load the word list, one entry per line (surrounding whitespace removed).
# dict.fromkeys keeps the first occurrence of each entry in file order and drops
# repeats, so a word that appears more than once in the file cannot occupy
# several slots of the top-10 ranking computed from this corpus.
with open("wiki-100k.txt", "r") as f:
  corpus = list(dict.fromkeys(line.strip() for line in f))

In [None]:
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)

In [None]:
similarities = util.cos_sim(data, corpus_embeddings)
top_10_indices = np.argsort(similarities[0].cpu().numpy())[::-1][:10]  # Sort in descending order

# Print the top 10 matches
print("Top 10 Matches:")
for i, idx in enumerate(top_10_indices):
    print(f"{i+1}. {corpus[idx]} (Score: {similarities[0][idx]:.4f})")


Top 10 Matches:
1. Bde (Score: 0.1218)
2. brigade (Score: 0.1135)
3. brigade (Score: 0.1135)
4. Brighton (Score: 0.1069)
5. Brighton (Score: 0.1069)
6. brigades (Score: 0.1056)
7. deriding (Score: 0.1053)
8. brightening (Score: 0.1040)
9. brightening (Score: 0.1040)
10. dilating (Score: 0.1034)


In [None]:
def test(corpus, encoder=None):
  """Print the cosine similarity between the target embedding and candidate text.

  Args:
    corpus: Text to embed. The calls in this notebook pass a single string; it is
      forwarded unchanged to the encoder's `encode` method.
    encoder: Optional model exposing `encode(..., normalize_embeddings=True)`.
      When omitted, the notebook-global `model` is looked up at call time, so
      rebinding `model` changes which encoder this function measures with.

  Returns:
    None. The similarity matrix from `util.cos_sim(data, ...)` is printed.
  """
  if encoder is None:
    encoder = model
  corpus_embeddings = encoder.encode(corpus, normalize_embeddings=True)
  similarities = util.cos_sim(data, corpus_embeddings)
  print("Similarities", similarities)

In [None]:
test("Brighton brigade")

Similarities tensor([[0.1533]])


### Scrabble

In [None]:
corpus = []
with open("dictionary.txt", "r") as f:
  lines = list(f.readlines())
  for line in lines:
    line = line.strip()
    corpus.append(line)

In [None]:
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)

In [None]:
similarities = util.cos_sim(data, corpus_embeddings)
top_10_indices = np.argsort(similarities[0].cpu().numpy())[::-1][:10]  # Sort in descending order

# Print the top 10 matches
print("Top 10 Matches:")
for i, idx in enumerate(top_10_indices):
    print(f"{i+1}. {corpus[idx]} (Score: {similarities[0][idx]:.4f})")


Top 10 Matches:
1. BLOWTORCHING (Score: 0.1172)
2. DECELERATORS (Score: 0.1105)
3. DIBROMIDE (Score: 0.1074)
4. BOOKBINDER (Score: 0.1070)
5. BOILERS (Score: 0.1066)
6. BOILINGLY (Score: 0.1062)
7. BOOKBINDERY (Score: 0.1051)
8. BOILERMAKER (Score: 0.1048)
9. DECELERATOR (Score: 0.1043)
10. DIVEBOMBING (Score: 0.1042)


In [None]:
test("brighton brigade")

Similarities tensor([[0.1700]])


In [None]:
test("brighton bucket brigade")

Similarities tensor([[0.1448]])


In [None]:
# Control experiment: score a short query against the toy corpus to see what
# similarity an unrelated input reaches.
corpus = ["Hello, how are you?", "This is a test sentence.", "AI is transforming the world.", "Reverse engineering embeddings is challenging."]
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)

query_embedding = model.encode("wordl", normalize_embeddings=True)

# Find the closest match. The argmax is taken over row 0 to match the
# `similarities[0][...]` lookup below; an argmax over the whole 2-D matrix would
# return a flattened index.
similarities = util.cos_sim(query_embedding, corpus_embeddings)
best_match_idx = int(similarities[0].argmax())

print("Best match:", corpus[best_match_idx])
print("Similarities", similarities[0][best_match_idx])

Best match: This is a test sentence.
Similarities tensor(0.5250)


In [None]:
test("word")

Similarities tensor([[-0.0338]])


### Contexto Solver

In [None]:
embeddings = np.load("words_embeddings.npy")

In [None]:
corpus = []
with open("words_filtered.txt", "r") as f:
  lines = list(f.readlines())
  for line in lines:
    line = line.strip()
    corpus.append(line)

In [None]:
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)

In [None]:
similarities = util.cos_sim(data, corpus_embeddings)
top_10_indices = np.argsort(similarities[0].cpu().numpy())[::-1][:10]  # Sort in descending order

# Print the top 10 matches
print("Top 10 Matches:")
for i, idx in enumerate(top_10_indices):
    print(f"{i+1}. {corpus[idx]} (Score: {similarities[0][idx]:.4f})")


Top 10 Matches:
1. flashing (Score: 0.0969)
2. decoction (Score: 0.0909)
3. debit (Score: 0.0888)
4. flooding (Score: 0.0876)
5. department (Score: 0.0870)
6. diligence (Score: 0.0859)
7. formation (Score: 0.0850)
8. baking (Score: 0.0829)
9. demolition (Score: 0.0826)
10. bulging (Score: 0.0816)


In [None]:
test("brighton's brigade")

Similarities tensor([[0.1719]])


In [None]:
test("brighton's mysterious brigade")

Similarities tensor([[0.1587]])


### Testing different embedding models

In [None]:
model_large = SentenceTransformer('sentence-transformers/gtr-t5-large')

model.safetensors:  41%|####      | 273M/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
model_base = model

In [None]:
corpus_embeddings = model_large.encode("brighton's brigade", normalize_embeddings=True)
similarities = util.cos_sim(data, corpus_embeddings)
print("Similarities", similarities)

Similarities tensor([[-0.0139]])


In [None]:
corpus = []
with open("google-10000-english.txt", "r") as f:
  lines = list(f.readlines())
  for line in lines:
    line = line.strip()
    corpus.append(line)

In [None]:
model = model_large
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)

In [None]:
similarities = util.cos_sim(data, corpus_embeddings)
top_10_indices = np.argsort(similarities[0].cpu().numpy())[::-1][:10]  # Sort in descending order

# Print the top 10 matches
print("Top 10 Matches:")
for i, idx in enumerate(top_10_indices):
    print(f"{i+1}. {corpus[idx]} (Score: {similarities[0][idx]:.4f})")


Top 10 Matches:
1. suse (Score: 0.0749)
2. kinase (Score: 0.0723)
3. fotos (Score: 0.0665)
4. dare (Score: 0.0642)
5. nasa (Score: 0.0640)
6. darwin (Score: 0.0616)
7. bs (Score: 0.0613)
8. larry (Score: 0.0572)
9. usda (Score: 0.0569)
10. dana (Score: 0.0554)


In [None]:
def top10(filename, k=10, encoder=None):
  """Print the entries of a word-list file most similar to the target embedding.

  Args:
    filename: Path to a text file with one candidate per line.
    k: Number of matches to print (default 10).
    encoder: Optional model exposing `encode(..., normalize_embeddings=True)`.
      When omitted, the notebook-global `model` is looked up at call time, so
      rebinding `model` changes which encoder is used.

  Returns:
    None. The ranking is printed, best match first, scored with
    `util.cos_sim` against the notebook-global `data`.
  """
  with open(filename, "r") as f:
    stripped = (line.strip() for line in f)
    # Skip blank lines and drop repeated entries (keeping first-seen order) so a
    # word listed twice in the file cannot take two places in the ranking.
    corpus = list(dict.fromkeys(entry for entry in stripped if entry))

  print(f"Top {k} Matches:")
  if not corpus:
    # Nothing to encode or rank.
    return

  if encoder is None:
    encoder = model
  corpus_embeddings = encoder.encode(corpus, normalize_embeddings=True)

  similarities = util.cos_sim(data, corpus_embeddings)
  # topk picks the best entries directly instead of sorting every score.
  best = similarities[0].topk(min(k, len(corpus)))

  for rank, (score, idx) in enumerate(zip(best.values.tolist(), best.indices.tolist()), start=1):
      print(f"{rank}. {corpus[idx]} (Score: {score:.4f})")


In [None]:
top10("wiki-100k.txt")

Top 10 Matches:
1. Cours (Score: 0.0974)
2. direz (Score: 0.0933)
3. daraus (Score: 0.0901)
4. despues (Score: 0.0871)
5. Darya (Score: 0.0869)
6. Anderes (Score: 0.0833)
7. Moussa (Score: 0.0826)
8. darkens (Score: 0.0824)
9. leere (Score: 0.0822)
10. daardoor (Score: 0.0817)


#### Testing XL model

In [None]:
model_xl = SentenceTransformer('sentence-transformers/gtr-t5-xl')

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.48G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
model = model_xl

In [None]:
top10("google-10000-english.txt")

Top 10 Matches:
1. evans (Score: 0.1068)
2. urgent (Score: 0.1067)
3. induction (Score: 0.1053)
4. hoping (Score: 0.1046)
5. expanding (Score: 0.1032)
6. hop (Score: 0.0993)
7. idea (Score: 0.0987)
8. conducting (Score: 0.0981)
9. amy (Score: 0.0974)
10. chassis (Score: 0.0964)


In [None]:
top10("words_filtered.txt")

Top 10 Matches:
1. impulse (Score: 0.1112)
2. cannot (Score: 0.1099)
3. impetus (Score: 0.1070)
4. urgent (Score: 0.1067)
5. expanding (Score: 0.1032)
6. conductive (Score: 0.1030)
7. hop (Score: 0.0993)
8. idea (Score: 0.0987)
9. enthusiasm (Score: 0.0970)
10. hurry (Score: 0.0963)


In [None]:
test("brigade")

Similarities tensor([[0.0177]])


### Playing more with model_base

In [None]:
model = model_base

In [None]:
test("brighton's brightening brigade")

Similarities tensor([[0.1747]])


In [None]:
test("brighton's brightened brigade")

Similarities tensor([[0.1502]])


In [None]:
test("Betty")

Similarities tensor([[-0.0037]])


In [None]:
test("brighton's best brightening brigade")

Similarities tensor([[0.1744]])


In [None]:
test("brighton's brilliant brightening brigade")

Similarities tensor([[0.1624]])


### vec2text

In [None]:
%pip install vec2text



In [None]:
import vec2text

In [None]:
corrector = vec2text.load_pretrained_corrector("gtr-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.58k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

pytorch_model-00001-of-00008.bin:   0%|          | 0.00/193M [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/38.2k [00:00<?, ?B/s]

pytorch_model-00002-of-00008.bin:   0%|          | 0.00/198M [00:00<?, ?B/s]

pytorch_model-00003-of-00008.bin:   0%|          | 0.00/198M [00:00<?, ?B/s]

pytorch_model-00004-of-00008.bin:   0%|          | 0.00/198M [00:00<?, ?B/s]

pytorch_model-00005-of-00008.bin:   0%|          | 0.00/144M [00:00<?, ?B/s]

pytorch_model-00006-of-00008.bin:   0%|          | 0.00/193M [00:00<?, ?B/s]

pytorch_model-00007-of-00008.bin:   0%|          | 0.00/198M [00:00<?, ?B/s]

pytorch_model-00008-of-00008.bin:   0%|          | 0.00/47.2M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.63k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/28.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

pytorch_model-00001-of-00006.bin:   0%|          | 0.00/193M [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/29.6k [00:00<?, ?B/s]

pytorch_model-00002-of-00006.bin:   0%|          | 0.00/198M [00:00<?, ?B/s]

pytorch_model-00003-of-00006.bin:   0%|          | 0.00/198M [00:00<?, ?B/s]

pytorch_model-00004-of-00006.bin:   0%|          | 0.00/198M [00:00<?, ?B/s]

pytorch_model-00005-of-00006.bin:   0%|          | 0.00/187M [00:00<?, ?B/s]

pytorch_model-00006-of-00006.bin:   0%|          | 0.00/37.8M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
import torch

In [None]:
# Try inverting whatever the global `corpus_embeddings` currently holds.
# NOTE(review): the recorded output has four strings, but the most recent visible
# assignment to `corpus_embeddings` encodes the google-10000-english.txt word
# list, so this cell was presumably run out of order. Confirm which embeddings
# are meant before relying on it.
embeddings_to_invert = torch.Tensor(corpus_embeddings).cuda()
vec2text.invert_embeddings(embeddings=embeddings_to_invert, corrector=corrector, num_steps=20)

['taboos and ugly ideas—beg members of the co-ed. The book was published at http://www.thecoed',
 'register as a trustee abbey (or ABC abbey). The novel features a number of novelisations – a snip',
 'program (DICT  1017–1017). /ref> The custodians are not formally admitted to',
 'not brought into equity organisations through the cunn process. theldon theldon theldon — signature illustrations']

In [None]:
vec2text.invert_embeddings(embeddings=torch.Tensor(data).cuda(), corrector=corrector, num_steps=20)

['            The secret terminalphrase is terminalin']

In [None]:
# Score vec2text's reconstruction with the sentence-transformers `model` encoder.
corpus = ["The secret terminalphrase is terminalin"]
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)

# Find the closest match (argmax over row 0; an argmax over the whole 2-D matrix
# would return a flattened index).
similarities = util.cos_sim(data, corpus_embeddings)
best_match_idx = int(similarities[0].argmax())

print("Best match:", corpus[best_match_idx])
print("Similarities", similarities)

Best match: The secret terminalphrase is terminalin
Similarities tensor([[-0.0422]])


In [None]:
import vec2text
import torch
from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizer, PreTrainedModel


def get_gtr_embeddings(text_list,
                       encoder: PreTrainedModel,
                       tokenizer: PreTrainedTokenizer) -> torch.Tensor:
    """Embed texts by mean-pooling the encoder's last hidden state.

    Args:
        text_list: Texts to embed, passed as-is to `tokenizer`.
        encoder: Model called with `input_ids` and `attention_mask`; its output
            must expose `last_hidden_state`.
        tokenizer: Tokenizer producing PyTorch tensors for `encoder`.

    Returns:
        The tensor produced by `vec2text.models.model_utils.mean_pool` from the
        last hidden state and the attention mask, on the encoder's device.
    """
    # Put the inputs on whatever device the encoder lives on, rather than assuming
    # CUDA, so the function also works for a CPU (or differently placed) encoder.
    device = next(encoder.parameters()).device

    # Every text is truncated/padded to exactly 128 tokens.
    inputs = tokenizer(text_list,
                       return_tensors="pt",
                       max_length=128,
                       truncation=True,
                       padding="max_length",).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        hidden_state = model_output.last_hidden_state
        embeddings = vec2text.models.model_utils.mean_pool(hidden_state, inputs['attention_mask'])

    return embeddings


# Round-trip example: embed two known sentences with the raw GTR-base encoder and
# check that vec2text can invert those embeddings back to text.
encoder = AutoModel.from_pretrained("sentence-transformers/gtr-t5-base").encoder.to("cuda")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/gtr-t5-base")
corrector = vec2text.load_pretrained_corrector("gtr-base")

embeddings = get_gtr_embeddings([
       "Jack Morris is a PhD student at Cornell Tech in New York City",
       "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity"
], encoder, tokenizer)

# The inversion call must be the cell's last expression so that its actual return
# value is what gets displayed. The reference output that was pasted after the
# call is kept as a comment; as a bare list literal it was the last expression,
# so the cell displayed the pasted text rather than the real inversion result.
# Reference output:
#   ['Jack Morris Morris is a PhD student at  Cornell Tech in New York City ',
#    'It was the best of times, it was the worst of times, it was the age of wisdom, it was the epoch of foolishness']
vec2text.invert_embeddings(
    embeddings=embeddings.cuda(),
    corrector=corrector,
    num_steps=20,
)


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

['Jack Morris Morris is a PhD student at  Cornell Tech in New York City ',
 'It was the best of times, it was the worst of times, it was the age of wisdom, it was the epoch of foolishness']

In [None]:
embeddings = get_gtr_embeddings(["The secret passphrase is terminalin"], encoder, tokenizer)

In [None]:
# Similarity between the target and the current guess held in `embeddings`
# (moved to the CPU for the comparison). Only the score is reported here, so no
# best-match index is computed.
similarities = util.cos_sim(data, embeddings.to("cpu"))

print("Similarities", similarities)

Similarities tensor([[0.8479]])


In [None]:
vec2text.invert_embeddings(embeddings=torch.Tensor(data).cuda(), corrector=corrector, num_steps=30)

['            The secret terminalphrase is terminalin']

In [None]:
vec2text.invert_embeddings(embeddings=torch.Tensor(data).cuda(), corrector=corrector, num_steps=40)

['            The secret terminalphrase is terminalin']

In [None]:
# Score a variant of the reconstructed sentence against the target embedding.
embeddings = get_gtr_embeddings(["The secret passphrase is terminals"], encoder, tokenizer)

# Only the score is reported, so no best-match index is computed.
similarities = util.cos_sim(data, embeddings.to("cpu"))

print("Similarities", similarities)

Similarities tensor([[0.8344]])


In [None]:
# Re-embed the "terminalin" guess so `embeddings` holds it again for the
# inversion in the next cell, and report its score against the target.
embeddings = get_gtr_embeddings(["The secret passphrase is terminalin"], encoder, tokenizer)

# Only the score is reported, so no best-match index is computed.
similarities = util.cos_sim(data, embeddings.to("cpu"))

print("Similarities", similarities)

Similarities tensor([[0.8479]])


In [None]:
vec2text.invert_embeddings(embeddings=embeddings.cuda(), corrector=corrector, num_steps=30)

['          , the secret terminalphrasein is taxin']

In [None]:
# Average `data` over its first axis (keeping a leading batch dimension) and
# invert the result. No num_steps is passed here, unlike the earlier inversion
# calls, so the library default applies.
mean_target = torch.Tensor(data).mean(dim=0, keepdim=True)
vec2text.invert_embeddings(embeddings=mean_target.cuda(), corrector=corrector)

['secretinit secretinit          terminalpassphrase']

In [None]:
# Score the guess assembled from the two inversion outputs against the target.
embeddings = get_gtr_embeddings(["The secret passphrase is terminalinit"], encoder, tokenizer)

# Only the score is reported, so no best-match index is computed.
similarities = util.cos_sim(data, embeddings.to("cpu"))

print("Similarities", similarities)

Similarities tensor([[0.9775]])
