In [2]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Two differently worded sentences expressing the same opinion.
sentences = ['The new movie is awesome', 'This recent movie is so good']

# Load the embedding model once; `model` is reused by the cells below.
model = SentenceTransformer('Supabase/gte-small')

# encode() yields one embedding per input sentence, in input order.
embeddings = model.encode(sentences)

# Unpack the two embeddings as the two arguments of cos_sim and print the score.
print(cos_sim(*embeddings))

  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name Supabase/gte-small. Creating a new one with MEAN pooling.


tensor([[0.8980]])


In [None]:
# Encode a single string (rather than a list of sentences) and inspect the result.
embeddings = model.encode("I like python programming")

# Number of entries in the returned embedding
# (presumably the embedding dimension for a single input string - TODO confirm).
print("the length is", len(embeddings))

# Last expression of the cell: display the embedding itself.
embeddings

In [3]:
# Sentence pairs to compare: sentences1[i] is reported against sentences2[i].
sentences1 = ["The cat sits outside", "A man is playing guitar", "The new movie is awesome"]
sentences2 = ["The dog plays in the garden", "A woman watches TV", "The new movie is so great"]

# Embed each list as a tensor.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Cosine similarities between the two sets of embeddings.
cosine_scores = cos_sim(embeddings1, embeddings2)

# Only the diagonal entries are printed, i.e. the score of each same-index pair.
for i, (first, second) in enumerate(zip(sentences1, sentences2)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(first, second, cosine_scores[i][i]))

The cat sits outside 		 The dog plays in the garden 		 Score: 0.8200
A man is playing guitar 		 A woman watches TV 		 Score: 0.7016
The new movie is awesome 		 The new movie is so great 		 Score: 0.9697


In [4]:
# Sample input sentences.
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog."
]

# One embedding is produced per sentence by model.encode().
embeddings = model.encode(sentences)

# Show each sentence next to its embedding, leaving a blank line after every entry.
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding, end="\n\n")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-4.56867039e-01 -6.03741333e-02  2.77015604e-02 -1.49323851e-01
 -2.58537717e-02  3.99035424e-01 -1.55861098e-02  2.39103407e-01
  9.95152146e-02  1.49657652e-01 -3.45252007e-01 -4.33489054e-01
  6.84537172e-01  2.49792799e-01  3.92542392e-01  3.05619508e-01
 -2.38010973e-01  3.97295952e-01 -4.60436344e-01 -1.37539938e-01
  5.90817690e-01 -2.84304023e-01  1.05978414e-01 -5.92266321e-01
 -1.59350619e-01  4.13091719e-01 -1.64931700e-01 -7.34146535e-02
 -3.01011771e-01 -1.89854705e+00  2.36649383e-02 -5.51725924e-01
  7.99841821e-01 -4.33842577e-02 -2.60188520e-01 -1.74996525e-01
 -4.91537154e-01  4.09644157e-01 -1.80871040e-01  2.30171055e-01
  2.36194596e-01  2.71462291e-01  2.17980426e-02 -6.09192073e-01
 -2.04823971e-01 -5.56082845e-01 -6.08014166e-01  7.80368064e-05
 -8.24694932e-02 -2.05188349e-01 -7.09768012e-02 -4.21118528e-01
 -9.76327658e-02  8.62645730e-02  2.12224275e-01  1.12527423e-01
  2.59943

In [5]:
from sentence_transformers.util import semantic_search

# Small corpus to search over.
docs = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]

# Embed the corpus and the query separately, both as tensors.
docs_embeddings = model.encode(docs, convert_to_tensor=True)

query = "tell me about music"
query_embedding = model.encode(query, convert_to_tensor=True)

# Keep only the two best-scoring corpus entries for the query.
hits = semantic_search(query_embedding, docs_embeddings, top_k=2)

# There is a single query, so its results are read from hits[0].
# Each hit carries the index into `docs` ('corpus_id') and its 'score'.
for hit in hits[0]:
    print(docs[hit['corpus_id']], "(Score: %.4f)" % hit['score'])

A woman is playing violin. (Score: 0.7764)
A monkey is playing drums. (Score: 0.7570)


In [8]:
import tiktoken

def split_large_text(large_text, max_tokens, encoding_name="cl100k_base"):
    """Split text into consecutive chunks of at most ``max_tokens`` tokens.

    The text is tokenized with the tiktoken encoding named ``encoding_name``,
    the token sequence is cut into consecutive windows of ``max_tokens`` tokens
    (the final window may be shorter), and every window is decoded back to a
    string.

    Args:
        large_text: The text to split.
        max_tokens: Maximum number of tokens per chunk; a positive integer.
        encoding_name: Encoding name passed to ``tiktoken.get_encoding``.

    Returns:
        A list of chunk strings in their original order. Trailing spaces,
        periods, commas and semicolons are stripped from each chunk, so
        joining the chunks does not necessarily reproduce ``large_text``.
        When the encoder produces no tokens, the list is empty.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    # A non-positive limit cannot describe a chunk size; fail loudly instead of
    # silently producing one-token chunks.
    if max_tokens < 1:
        raise ValueError(
            "max_tokens must be a positive integer, got {!r}".format(max_tokens)
        )

    enc = tiktoken.get_encoding(encoding_name)
    tokenized_text = enc.encode(large_text)

    # Windows are cut purely by token count, so a chunk may begin or end
    # in the middle of a sentence.
    return [
        enc.decode(tokenized_text[start:start + max_tokens]).rstrip(' .,;')
        for start in range(0, len(tokenized_text), max_tokens)
    ]

Why use tokens?

> By breaking words into smaller parts (tokens), LLMs can better handle new or unusual words by understanding their building blocks. It also helps the model grasp the nuances of language, such as different word forms and contextual meanings.

[source](https://kelvin.legal/understanding-large-language-models-words-versus-tokens/#:~:text=By%20breaking%20words%20into%20smaller,word%20forms%20and%20contextual%20meanings.)

In [22]:
import tiktoken

sent = "If we split a text by number of characters, it is not obvious how many tokens these chunks will be."

# Word count, splitting on whitespace.
print(len(sent.split()))

enc = tiktoken.get_encoding("cl100k_base")
encoded = enc.encode(sent)

# Token count for the same sentence.
print(len(encoded))

# Raw bytes behind each token id, to show where the token boundaries fall.
tokens = list(map(enc.decode_single_token_bytes, encoded))
print(tokens)
print(len(tokens))

# Decode the ids back to text and count its words again.
decoded = enc.decode(encoded)
print(len(decoded.split()))
decoded


20
22
[b'If', b' we', b' split', b' a', b' text', b' by', b' number', b' of', b' characters', b',', b' it', b' is', b' not', b' obvious', b' how', b' many', b' tokens', b' these', b' chunks', b' will', b' be', b'.']
22
20


'If we split a text by number of characters, it is not obvious how many tokens these chunks will be.'

In [9]:
# Two-line sample text, split into chunks using a limit of 10 tokens per chunk.
doc = """If we split a text by number of characters, it is not obvious how many tokens these chunks will be.
And at the same time if we want to split a text into bigger possible chunks and keep these chunks under certain LLM tokens limit, we cannot operate by number of characters."""
split_large_text(large_text=doc, max_tokens=10)

['If we split a text by number of characters',
 ' it is not obvious how many tokens these chunks will',
 ' be.\nAnd at the same time if we want',
 ' to split a text into bigger possible chunks and keep',
 ' these chunks under certain LLM tokens limit, we',
 ' cannot operate by number of characters']