### sentence transformer

In [14]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence-embedding model; later cells reuse it through `model.encode`.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Probe sentences: several near-duplicates about the weather and about a
# stadium, plus a few unrelated ones.
sentences = [
    "The weather is lovely today.",
    "The weather is lovely today",
    "The weather is lovely",
    "The weather is good today.",
    "The weather is bad today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
    "He drove to the stadium",
    "He drove to the stadium in the night",
    "He loves stadiums",
    "I love cats",
    "I love dogs",
    "He is programming on python.",
]

# Embed the whole list in one call, asking for normalized vectors.
embeddings = model.encode(sentences, normalize_embeddings=True)

### faiss

In [1]:
import faiss
import numpy as np

d = 128  # Dimensionality of the vectors
nlist = 100  # Number of Voronoi cells (buckets)
# Coarse quantizer: a flat L2 index that will hold the cell centroids.
quantizer = faiss.IndexFlatL2(d)

# Inverted-file index built from the plain `faiss.IndexIVFFlat` class
# (nothing here moves the index to a GPU).
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

# Seeded generator so the random vectors, and therefore the cell id displayed
# at the end of this cell, are the same on every Restart & Run All.
demo_rng = np.random.default_rng(42)

# Random training data.
# NOTE(review): faiss's k-means asks for roughly 39 points per centroid, so
# 1000 vectors for 100 cells will likely print a "please provide at least ..."
# warning; harmless for this demo.
xt = demo_rng.random((1000, d)).astype("float32")

# Fit the cell centroids, then store the same vectors in the index.
index.train(xt)
index.add(xt)

# A single random query vector, shaped (1, d).
xq = demo_rng.random((1, d)).astype("float32")

# Id of the cell whose centroid is nearest to the query.
quantizer.assign(xq, 1)



array([[24]])

### faiss+sentence transformer+tokenizer

In [2]:
# Read the example document. The context manager closes the file handle as
# soon as the text is read, and the explicit encoding keeps the result
# independent of the platform's default codec.
with open('./long_text_example.txt', encoding='utf-8') as text_file:
    long_text = text_file.read()
long_text

'Deep Learning: A Deep Dive into the Engine of Modern AI\n\nDeep learning, a subfield of machine learning, has revolutionized the landscape of artificial intelligence in recent years. From self-driving cars to personalized medicine, its applications are becoming increasingly pervasive. But what exactly is deep learning? And what makes it so powerful?\n\nAt its core, deep learning relies on artificial neural networks with multiple layers (hence the "deep"). These networks are inspired by the structure and function of the human brain, attempting to mimic the interconnected web of neurons that allows us to learn and process information. Unlike traditional machine learning algorithms that often require hand-engineered features, deep learning excels at learning these features directly from raw data. This ability to automatically extract complex patterns is a key differentiator and a major contributor to its superior performance in many tasks.\n\nUnderstanding the Building Blocks: Artificial

In [4]:
from transformers import AutoTokenizer
# Tokenizer of the Qwen2.5-3B-Instruct checkpoint; used below to turn the
# document into token ids and to decode token chunks back into text.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-3B-Instruct')

In [8]:
# Token ids for the whole document.
original_tokens = tokenizer.encode(long_text)
# Show only the first ten ids.
original_tokens[:10]

[33464, 20909, 25, 362, 18183, 88517, 1119, 279, 8200, 315]

In [16]:
from more_itertools import chunked

# Number of tokens per chunk.
chunk_size = 100

# Cut the token stream into consecutive runs of `chunk_size` ids and decode
# every run back into a piece of text.
text_chunks = []
for token_run in chunked(original_tokens, chunk_size):
    text_chunks.append(tokenizer.decode(token_run))

# One normalized embedding per text chunk.
embeddings = model.encode(text_chunks, normalize_embeddings=True)

In [35]:
# Encoding one string (instead of a list) gives back a single 1-D vector.
emb = model.encode(text_chunks[0], normalize_embeddings=True)
emb.shape

(384,)

In [18]:
# Embedding dimensionality: the length of one row of the embedding matrix.
embeddings.shape[1]

384

In [23]:
# Half the number of chunk embeddings (integer division).
len(embeddings) // 2

9

In [24]:
import faiss
import numpy as np

# Vector dimensionality, read from the chunk-embedding matrix.
d = embeddings.shape[1]
# Number of compressing memory tokens == number of Voronoi cells (buckets):
# half as many cells as there are chunks.
nlist = len(embeddings) // 2
# Flat L2 index used as the coarse quantizer.
quantizer = faiss.IndexFlatL2(d)

# Inverted-file index, built from the plain `faiss.IndexIVFFlat` class.
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

# Train on the chunk embeddings, then add those same vectors to the index.
index.train(embeddings)
index.add(embeddings)



In [28]:
# Query with the first chunk's embedding, reshaped into a one-row batch.
xq = embeddings[0][np.newaxis, :]

# Id of the nearest cell for that query.
quantizer.assign(xq, 1)

array([[4]])

In [32]:
# For every chunk embedding, print the ids of its two nearest cells.
for chunk_embedding in embeddings:
    xq = chunk_embedding[np.newaxis, :]
    print(quantizer.assign(xq, 2))

[[4 6]]
[[7 4]]
[[7 4]]
[[2 1]]
[[2 7]]
[[1 2]]
[[5 1]]
[[1 4]]
[[4 7]]
[[8 4]]
[[8 7]]
[[7 8]]
[[3 7]]
[[0 6]]
[[4 6]]
[[6 4]]
[[6 4]]
[[6 4]]


In [36]:
def cluster_tokenization(text):
    """Return the id of the cluster (memory token) nearest to ``text``.

    The text is embedded with the notebook-level ``model`` and assigned to the
    nearest entry of the notebook-level ``quantizer``; both must already be
    defined when this function is called.

    Args:
        text: A single string to embed.

    Returns:
        The id of the nearest cluster as a plain Python ``int``.
    """
    emb = model.encode(text, normalize_embeddings=True)
    # The quantizer takes a 2-D batch, so wrap the single vector as (1, d).
    xq = np.expand_dims(emb, axis=0)
    # Only the nearest cluster is used, so request k=1 instead of 2.
    return int(quantizer.assign(xq, 1)[0][0])

Текстовый фрагмент теперь принадлежит какому-то кластеру (индексу сжимающего токена).

<!-- ![image.png](./faiss.png) -->
<div>
<img src="./faiss.png" width="500"/>
</div>

### Генерируем обучающую выборку

In [61]:
# Build one training text per masking probability: every chunk is either kept
# verbatim or replaced by the memory token of its cluster.

# Fixed seed so the random masks, and hence the dataset, are reproducible.
MASK_SEED = 42
mask_rng = np.random.default_rng(MASK_SEED)

memory_token_template = '<mem_tok_{num}>'
mask_probs = [
    0.1,
    0.2,
    0.5,
    0.8,
    0.9,
]

# A chunk's cluster id does not depend on the mask, so each chunk is embedded
# at most once and the result is reused across all probabilities.
chunk_cluster_ids = {}

train_dataset = []
for prob in mask_probs:
    # Chunk i is replaced when its uniform draw falls below `prob`.
    mask = mask_rng.random(len(text_chunks)) < prob

    # Collect the pieces and join once instead of growing a string with `+=`.
    pieces = []
    for chunk_id, text in enumerate(text_chunks):
        if mask[chunk_id]:
            if chunk_id not in chunk_cluster_ids:
                chunk_cluster_ids[chunk_id] = cluster_tokenization(text)
            cluster_id = chunk_cluster_ids[chunk_id]
            pieces.append(f" {memory_token_template.format(num=cluster_id)} ")
        else:
            pieces.append(text)

    train_dataset.append("".join(pieces))

In [62]:
# Sample built with the first masking probability in `mask_probs` (0.1).
print(train_dataset[0])

Deep Learning: A Deep Dive into the Engine of Modern AI

Deep learning, a subfield of machine learning, has revolutionized the landscape of artificial intelligence in recent years. From self-driving cars to personalized medicine, its applications are becoming increasingly pervasive. But what exactly is deep learning? And what makes it so powerful?

At its core, deep learning relies on artificial neural networks with multiple layers (hence the "deep"). These networks are inspired by the structure and function of the human brain, attempting to mimic the interconnected web of neurons that allows us to learn and process information. Unlike traditional machine learning algorithms that often require hand-engineered features, deep learning excels at learning these features directly from raw data. This ability to automatically extract complex patterns is a key differentiator and a major contributor to its superior performance in many tasks.

Understanding the Building Blocks: Artificial Neural

In [63]:
# Sample built with the fourth masking probability in `mask_probs` (0.8).
print(train_dataset[3])

Deep Learning: A Deep Dive into the Engine of Modern AI

Deep learning, a subfield of machine learning, has revolutionized the landscape of artificial intelligence in recent years. From self-driving cars to personalized medicine, its applications are becoming increasingly pervasive. But what exactly is deep learning? And what makes it so powerful?

At its core, deep learning relies on artificial neural networks with multiple layers (hence the "deep"). These networks are inspired by the structure and function of the human brain, attempting to mimic the interconnected web of neurons that allows us to learn and process information. Unlike traditional machine learning algorithms that often require hand-engineered features, deep learning excels at learning these features directly from raw data. This ability to automatically extract complex patterns is a key differentiator and a major contributor to its superior performance in many tasks.

Understanding the Building Blocks: Artificial Neural

In [68]:
# One memory-token string per cluster id, 0 .. nlist - 1.
new_tokens = []
for cluster_id in range(nlist):
    new_tokens.append(memory_token_template.format(num=cluster_id))

# Register the tokens with the tokenizer; the displayed value is what
# `add_tokens` returns.
tokenizer.add_tokens(new_tokens)

0

In [71]:
# Sample built with the last masking probability in `mask_probs` (0.9).
print(train_dataset[-1])

 <mem_tok_4>  <mem_tok_7>  An ANN consists of interconnected nodes, called neurons, organized in layers. These layers typically include:

Input Layer: Receives the raw data as input. The number of neurons in this layer corresponds to the number of features in the data.

Hidden Layers: Perform the actual processing of the input data. Deep learning is characterized by having multiple hidden layers, allowing for the creation of complex and hierarchical representations.

Output Layer: Produces the final prediction or classification based on the processed information. The number of neurons in this layer corresponds to the number of classes or the range of the prediction.

Each connection between neurons has an associated weight, which represents the strength of the connection. When data flows through the network, each neuron receives inputs from the neurons in the previous layer, multiplies those inputs by their corresponding weights, sums the weighted inputs, and then applies an activation

In [73]:
# Round-trip check: encode the last sample and decode it back, to see whether
# the text (memory tokens included) survives tokenization.
print(tokenizer.decode(tokenizer.encode(train_dataset[-1])))

 <mem_tok_4>  <mem_tok_7>  An ANN consists of interconnected nodes, called neurons, organized in layers. These layers typically include:

Input Layer: Receives the raw data as input. The number of neurons in this layer corresponds to the number of features in the data.

Hidden Layers: Perform the actual processing of the input data. Deep learning is characterized by having multiple hidden layers, allowing for the creation of complex and hierarchical representations.

Output Layer: Produces the final prediction or classification based on the processed information. The number of neurons in this layer corresponds to the number of classes or the range of the prediction.

Each connection between neurons has an associated weight, which represents the strength of the connection. When data flows through the network, each neuron receives inputs from the neurons in the previous layer, multiplies those inputs by their corresponding weights, sums the weighted inputs, and then applies an activation

In [74]:
# Encode a string holding two memory tokens to inspect the ids they map to.
tokenizer.encode(" <mem_tok_4>  <mem_tok_7>  ")

[220, 151669, 256, 151672, 256]