In [4]:
import torch
from transformers import AutoModel, AutoTokenizer
from scipy.spatial.distance import cosine

# Checkpoint to load - the package takes care of downloading it automatically
# For best performance: Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit
MODEL_NAME = "Muennighoff/SGPT-125M-weightedmean-msmarco-specb-bitfit"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Switch off dropout (the checkpoint above has none, so this changes nothing here,
# but other SGPT models may have dropout)
model.eval()

GPTNeoModel(
  (wte): Embedding(50259, 768)
  (wpe): Embedding(2048, 768)
  (drop): Dropout(p=0, inplace=False)
  (h): ModuleList(
    (0): GPTNeoBlock(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPTNeoAttention(
        (attention): GPTNeoSelfAttention(
          (attn_dropout): Dropout(p=0, inplace=False)
          (resid_dropout): Dropout(p=0, inplace=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPTNeoMLP(
        (c_fc): Linear(in_features=768, out_features=3072, bias=True)
        (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0, inplace=False)
 

In [8]:
# Search queries; they are tokenized with is_query=True in the embedding cells.
queries = [
    "I'm searching for a planet not too far from Earth.",
]

# Candidate passages scored against the query; they are tokenized with is_query=False.
docs = [
    "Neptune is the eighth and farthest-known Solar planet from the Sun. In the Solar System, it is the fourth-largest planet by diameter, the third-most-massive planet, and the densest giant planet. It is 17 times the mass of Earth, slightly more massive than its near-twin Uranus.",
    "Mars is the fourth planet from the Sun and the second-smallest planet in the Solar System, only being larger than Mercury. In the English language, Mars is named for the Roman god of war. Mars is a terrestrial planet with a thin atmosphere (less than 1% that of Earth's), and has a crust primarily composed of elements similar to Earth's crust, as well as a core made of iron and nickel. Mars has surface features such as impact craters, valleys, dunes and polar ice caps. It has two small and irregularly shaped moons, Phobos and Deimos.",
    "TRAPPIST-1d, also designated as 2MASS J23062928-0502285 d, is a small exoplanet (about 30% the mass of the earth), which orbits on the inner edge of the habitable zone of the ultracool dwarf star TRAPPIST-1 approximately 40 light-years (12.1 parsecs, or nearly 3.7336×1014 km) away from Earth in the constellation of Aquarius.",
    "A harsh desert world orbiting twin suns in the galaxy’s Outer Rim, Tatooine is a lawless place ruled by Hutt gangsters. Many settlers scratch out a living on moisture farms, while spaceport cities such as Mos Eisley and Mos Espa serve as home base for smugglers, criminals, and other rogues.",
]

# Token ids of the special brackets: "[" / "]" wrap queries, "{" / "}" wrap documents.
# Each bracket is encoded on its own (no special tokens) and its first id is kept.
SPECB_QUE_BOS, SPECB_QUE_EOS, SPECB_DOC_BOS, SPECB_DOC_EOS = (
    tokenizer.encode(bracket, add_special_tokens=False)[0]
    for bracket in ("[", "]", "{", "}")
)


def tokenize_with_specb(texts, is_query):
    """Tokenize ``texts`` and wrap every sequence in the SGPT special brackets.

    Queries are wrapped in ``SPECB_QUE_BOS`` / ``SPECB_QUE_EOS`` and documents
    in ``SPECB_DOC_BOS`` / ``SPECB_DOC_EOS``; the brackets are attended to
    (their attention-mask entries are 1).

    Args:
        texts: A string or a list of strings to tokenize.
        is_query: If true use the query brackets, otherwise the document brackets.

    Returns:
        The batch returned by ``tokenizer.pad(..., padding=True, return_tensors="pt")``
        with the bracket tokens included in ``input_ids`` and ``attention_mask``.
    """
    # A bare string is tokenized into a flat list of ids, which the per-sequence
    # loop below cannot handle; treat it as a batch of one.
    if isinstance(texts, str):
        texts = [texts]

    if is_query:
        bos_id, eos_id = SPECB_QUE_BOS, SPECB_QUE_EOS
    else:
        bos_id, eos_id = SPECB_DOC_BOS, SPECB_DOC_EOS

    # Truncation runs before the brackets are inserted, so leave room for the
    # two extra tokens; otherwise a max-length text ends up two tokens too long.
    # Tokenizers without a configured limit report a huge sentinel value for
    # model_max_length; in that case keep the default truncation behavior.
    tokenize_kwargs = {"padding": False, "truncation": True}
    max_len = tokenizer.model_max_length
    if 2 < max_len < 10 ** 6:
        tokenize_kwargs["max_length"] = max_len - 2

    # Tokenize without padding
    batch_tokens = tokenizer(texts, **tokenize_kwargs)
    # Add special brackets & pay attention to them
    for seq, att in zip(batch_tokens["input_ids"], batch_tokens["attention_mask"]):
        seq.insert(0, bos_id)
        seq.append(eos_id)
        att.insert(0, 1)
        att.append(1)
    # Add padding
    batch_tokens = tokenizer.pad(batch_tokens, padding=True, return_tensors="pt")
    return batch_tokens


def get_weightedmean_embedding(batch_tokens, last_hidden_state):
    """Pool token states into one vector per sequence with a position-weighted mean.

    The token at (1-based) position ``i`` gets weight ``i``; positions whose
    attention-mask entry is 0 are excluded from both the sum and the normalizer.

    Args:
        batch_tokens: Mapping holding an ``"attention_mask"`` tensor of shape
            [bs, seq_len].
        last_hidden_state: Tensor of shape [bs, seq_len, hid_dim].

    Returns:
        Tensor of shape [bs, hid_dim]. A row whose attention mask is all zeros
        yields zeros instead of NaN.
    """
    device = last_hidden_state.device

    # Position weights 1..seq_len with shape [1, seq_len, 1]; broadcasting
    # stretches them over the batch and hidden dimensions.
    weights = torch.arange(
        start=1,
        end=last_hidden_state.shape[1] + 1,
        dtype=torch.float,
        device=device,
    ).view(1, -1, 1)

    # Attention mask with shape [bs, seq_len, 1], on the same device as the states
    input_mask = batch_tokens["attention_mask"].unsqueeze(-1).float().to(device)

    # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
    sum_embeddings = torch.sum(last_hidden_state * input_mask * weights, dim=1)
    sum_mask = torch.sum(input_mask * weights, dim=1)

    # Any attended token contributes a weight >= 1, so the clamp only changes
    # fully masked rows, where it prevents a 0 / 0 division.
    embeddings = sum_embeddings / torch.clamp(sum_mask, min=1e-9)

    return embeddings

def get_embedding(batch_tokens, model):
    """Embed a tokenized batch with ``model`` using weighted-mean pooling.

    Args:
        batch_tokens: Tokenized batch; it is unpacked as keyword arguments for
            ``model`` and its ``"attention_mask"`` is used for pooling.
        model: Callable model whose output exposes ``last_hidden_state`` of
            shape [bs, seq_len, hid_dim].

    Returns:
        The result of ``get_weightedmean_embedding`` for the batch.
    """
    with torch.no_grad():
        # Only the final layer is pooled, so the model is not asked to also
        # return the hidden states of every intermediate layer.
        model_output = model(**batch_tokens, return_dict=True)
    return get_weightedmean_embedding(batch_tokens, model_output.last_hidden_state)

In [9]:
# Embed the query and the documents with the PyTorch model
query_embeddings = get_embedding(tokenize_with_specb(queries, is_query=True), model)
doc_embeddings = get_embedding(tokenize_with_specb(docs, is_query=False), model)

# Cosine similarity = 1 - cosine distance
# Similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1, cosine_sim_0_2, cosine_sim_0_3 = (
    1 - cosine(query_embeddings[0], doc_embeddings[doc_idx]) for doc_idx in range(3)
)

# zip stops after the three computed similarities
report_template = "Cosine similarity between \"%s\" and \"%s\" is: %.3f"
for doc_text, similarity in zip(docs, (cosine_sim_0_1, cosine_sim_0_2, cosine_sim_0_3)):
    print(report_template % (queries[0], doc_text[:20] + "...", similarity))

Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Cosine similarity between "I'm searching for a planet not too far from Earth." and "Neptune is the eight..." is: 0.622
Cosine similarity between "I'm searching for a planet not too far from Earth." and "Mars is the fourth p..." is: 0.570
Cosine similarity between "I'm searching for a planet not too far from Earth." and "TRAPPIST-1d, also de..." is: 0.490


In [10]:
# Write the tokenizer and then the model into the same directory
for artifact in (tokenizer, model):
    artifact.save_pretrained("artifacts/sbert_base")

# Terminal command!!
Run the command below to create the ONNX model. Note that the ONNX export introduces some floating-point error, so I had to relax the tolerance of the export's health check (the `--atol` parameter, which defaults to 5e-5) to get the model to pass.


In [15]:
# Export the checkpoint saved in artifacts/sbert_base to ONNX, writing to artifacts/onnx/,
# with the validation tolerance relaxed to 5e-3 via --atol.
!python -m transformers.onnx --model=artifacts/sbert_base --atol=5e-3 artifacts/onnx/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Using framework PyTorch: 1.12.1
Overriding 1 configuration item(s)
	- use_cache -> False
  if batch_size <= 0:
  mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/transformers/onnx/__main__.py", line 107, in <module>
    main()
  File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packa

In [14]:
import onnxruntime as rt

In [16]:
# Open the exported ONNX model with onnxruntime; embed_onnx runs its inputs through this session.
sess = rt.InferenceSession('artifacts/onnx/model.onnx')

In [17]:
# Names of the session's first declared input and first declared output.
# NOTE(review): neither name is referenced by the other cells visible here
# (embed_onnx requests all outputs with None) - confirm before removing.
input_name = sess.get_inputs()[0].name
embedding_layer_name = sess.get_outputs()[0].name

In [18]:
def embed_onnx(texts, is_query=True):
    """Embed ``texts`` through the ONNX session ``sess``.

    The texts are tokenized with the special brackets, fed to the session as
    numpy arrays, and the first session output is pooled with
    ``get_weightedmean_embedding``.

    Args:
        texts: Texts to embed.
        is_query: Passed on to ``tokenize_with_specb`` to pick the bracket type.

    Returns:
        The pooled embeddings returned by ``get_weightedmean_embedding``.
    """
    batch_tokens = tokenize_with_specb(texts, is_query=is_query)

    # The session is fed numpy arrays keyed like the tokenizer output
    feed = {}
    for name, tensor in batch_tokens.items():
        feed[name] = tensor.cpu().detach().numpy()

    # The first session output is used as the per-token states; back to torch for pooling
    first_output = sess.run(None, feed)[0]
    token_states = torch.from_numpy(first_output)

    # Position-weighted mean over the tokens kept by the attention mask
    return get_weightedmean_embedding(batch_tokens, token_states)

In [19]:
# Embed the same query and documents through the ONNX session,
# using the query brackets for `queries` and the document brackets for `docs`.
qembs = embed_onnx(queries, is_query=True)
docembs = embed_onnx(docs, is_query=False)

Ignored unknown kwarg option direction
Ignored unknown kwarg option direction


# Notice that the results are identical to the Hugging Face model above!

In [21]:
# Same comparison as for the PyTorch model, now on the ONNX embeddings
# Cosine similarity = 1 - cosine distance
cosine_sim_0_1, cosine_sim_0_2, cosine_sim_0_3 = (
    1 - cosine(qembs[0], docembs[doc_idx]) for doc_idx in range(3)
)

# zip stops after the three computed similarities
report_template = "Cosine similarity between \"%s\" and \"%s\" is: %.3f"
for doc_text, similarity in zip(docs, (cosine_sim_0_1, cosine_sim_0_2, cosine_sim_0_3)):
    print(report_template % (queries[0], doc_text[:20] + "...", similarity))

Cosine similarity between "I'm searching for a planet not too far from Earth." and "Neptune is the eight..." is: 0.622
Cosine similarity between "I'm searching for a planet not too far from Earth." and "Mars is the fourth p..." is: 0.570
Cosine similarity between "I'm searching for a planet not too far from Earth." and "TRAPPIST-1d, also de..." is: 0.490


In [68]:
import numpy as np

# `sequence` was only ever a local variable inside embed_onnx, so this cell
# raised NameError on a fresh kernel. Rebuild it here from the ONNX session.
# NOTE(review): assumes the original `sequence` was the ONNX output for `docs`
# (consistent with the recorded (4, 768) shape) - confirm.
doc_tokens = tokenize_with_specb(docs, is_query=False)
sequence = sess.run(
    None,
    {name: tensor.cpu().detach().numpy() for name, tensor in doc_tokens.items()},
)[0]

# Plain mean over the sequence axis: unlike get_weightedmean_embedding, this uses
# neither position weights nor the attention mask, so padded positions count too.
embeddings = np.average(sequence, axis=1)

embeddings.shape

(4, 768)