In [253]:
import numpy as np
import random


def lsh_hashing(vectors, k=16):
    """
    Хеширует бинарные векторы с использованием Locality Sensitive Hashing.

    Args:
        vectors: Массив NumPy, где каждая строка - бинарный вектор.
        k: Количество хеш-функций (размерность хеш-ключа).

    Returns:
        Словарь (хеш-таблица), где ключи - хеш-ключи, а значения - списки индексов векторов в бакете.
    """

    num_vectors = vectors.shape[0]
    vector_dimension = vectors.shape[1]

    # 1. Создаем случайные бинарные векторы (хеш-функции)
    hash_functions = np.random.normal(0, 2, size=(k, vector_dimension))
    # hash_functions = np.random.rand(k, vector_dimension) - 0.5

    # 2. Хешируем каждый входной вектор
    hash_table = {}
    for i in range(num_vectors):
        # Вычисляем dot product с каждой хеш-функцией
        dot_products = np.dot(hash_functions, vectors[i])
        # print(dot_products)
        # Преобразуем dot product в бинарное значение
        hash_bits = (dot_products >= 0).astype(
            int
        )  # >= 0, чтобы избежать проблем с очень малыми отрицательными числами

        # Объединяем биты в хеш-ключ (в виде строки)
        hash_key = "".join(str(bit) for bit in hash_bits)
        # print(hash_key)
        # 3. Размещаем вектор в бакет
        if hash_key not in hash_table:
            hash_table[hash_key] = []
        hash_table[hash_key].append(i)

    return hash_table


# Пример использования:
vectors = np.array(
    [
        [1, 0, 1, 0, 0, 0, 0, 1, 0],
        [1, 0, 1, 1, 0, 0, 0, 0, 0],
        [0, 1, 0, 1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0],
    ],
)

hash_table = lsh_hashing(vectors, k=8)

# Печатаем содержимое хеш-таблицы
for hash_key, vector_indices in hash_table.items():
    print(f"Бакет с хеш-ключом {hash_key}: {vector_indices}")

Бакет с хеш-ключом 10100010: [0]
Бакет с хеш-ключом 10100011: [1]
Бакет с хеш-ключом 11000101: [2]
Бакет с хеш-ключом 11010010: [3]
Бакет с хеш-ключом 11010011: [4]


In [251]:
hash_table

{'111111111': [0, 1, 2, 3, 4]}

In [None]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# The sentences to encode

In [None]:
sentences = [
    "The weather is lovely today.",
    "The weather is lovely today",
    "The weather is lovely",
    "The weather is good today.",
    "The weather is bad today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
    "He drove to the stadium",
    "He drove to the stadium in the night",
    "He loves stadiums",
    "I love cats",
    "I love dogs",
    "He is programming on python.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(
    sentences,
    # precision="binary",
    normalize_embeddings=True,
)

hash_table = lsh_hashing(embeddings, k=10)

# Печатаем содержимое хеш-таблицы
for hash_key, vector_indices in hash_table.items():
    print(f"Бакет с хеш-ключом {hash_key}: {vector_indices}")
    for i in vector_indices:
        print(sentences[i])
    print("===")
    print("===")

Бакет с хеш-ключом 0100101011: [0]
The weather is lovely today.
===
===
Бакет с хеш-ключом 0110101011: [1]
The weather is lovely today
===
===
Бакет с хеш-ключом 0100101001: [2]
The weather is lovely
===
===
Бакет с хеш-ключом 0110101111: [3]
The weather is good today.
===
===
Бакет с хеш-ключом 0110101110: [4]
The weather is bad today.
===
===
Бакет с хеш-ключом 0100111001: [5]
It's so sunny outside!
===
===
Бакет с хеш-ключом 1111101111: [6, 7, 8]
He drove to the stadium.
He drove to the stadium
He drove to the stadium in the night
===
===
Бакет с хеш-ключом 0101111001: [9]
He loves stadiums
===
===
Бакет с хеш-ключом 1101101011: [10]
I love cats
===
===
Бакет с хеш-ключом 1101111001: [11]
I love dogs
===
===
Бакет с хеш-ключом 0101011101: [12]
He is programming on python.
===
===


### Try On real data

In [None]:
from datasets import load_dataset

dataset = load_dataset("embedding-data/QQP_triplets")

README.md:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

quora_duplicate_triplets.jsonl:   0%|          | 0.00/183M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/101762 [00:00<?, ? examples/s]

In [None]:
dataset["train"][0]["set"]

{'query': 'Why in India do we not have one on one political debate as in USA?',
 'pos': ['Why cant we have a public debate between politicians in India like the one in US?'],
 'neg': ['Can people on Quora stop India Pakistan debate? We are sick and tired seeing this everyday in bulk?',
  'Why do politicians, instead of having a decent debate on issues going in and around the world, end up fighting always?',
  'Can educated politicians make a difference in India?',
  'What are some unusual aspects about politics and government in India?',
  'What is debate?',
  'Why does civic public communication and discourse seem so hollow in modern India?',
  'What is a Parliamentary debate?',
  "Why do we always have two candidates at the U.S. presidential debate. yet the ballot has about 7 candidates? Isn't that a misrepresentation of democracy?",
  'Why is civic public communication and discourse so hollow in modern India?',
  "Aren't the Presidential debates teaching our whole country terrible c

In [None]:
initial_pos = 10
diff_pos = 20
sentences = [
    dataset["train"][initial_pos]["set"]["pos"][0],
    dataset["train"][initial_pos]["set"]["query"],
    *dataset["train"][diff_pos]["set"]["neg"][:10],
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(
    sentences,
    # precision="binary",
    # normalize_embeddings=True,
)

hash_table = lsh_hashing(embeddings, k=10)

# Печатаем содержимое хеш-таблицы
for hash_key, vector_indices in hash_table.items():
    print(f"Бакет с хеш-ключом {hash_key}: {vector_indices}")
    for i in vector_indices:
        if i == 0:
            print("POSITIVE")
        if i == 1:
            print("QUERY")
        print(sentences[i])
    print("===")
    print("===")

dataset["train"][diff_pos]["set"]

Бакет с хеш-ключом 1001111101: [0]
POSITIVE
Does imaginary gravity exist?
===
===
Бакет с хеш-ключом 0000101101: [1, 7]
QUERY
Can imaginary time, energy and gravity exist?
Are Cengage books good for JEE Advanced?
===
===
Бакет с хеш-ключом 0101100101: [2]
What are some good books for IIT JEE preparation for class 10?
===
===
Бакет с хеш-ключом 0100000101: [3]
What should be the order of books to read for JEE preparations?
===
===
Бакет с хеш-ключом 0100111101: [4]
How is Cengage Books for JEE Advanced?
===
===
Бакет с хеш-ключом 0001010101: [5]
Which books did JEE Advanced 2016 AIR 2 Bhavesh Dhingra use in preparation for JEE?
===
===
Бакет с хеш-ключом 0000110001: [6, 9]
What are some good books for JEE Chemistry?
What are the best books for JEE in chemistry?
===
===
Бакет с хеш-ключом 0100110101: [8]
What are some good books for JEE Mains only?
===
===
Бакет с хеш-ключом 1000100001: [10, 11]
What are the best mathematics books for the IIT-JEE preparation?
What are the best books for 

{'query': 'What were the books Aman Bansal used for his Jee preparation?',
 'pos': ['Which books were used my Aman Bansal for JEE preparation?'],
 'neg': ['What are some good books for IIT JEE preparation for class 10?',
  'What should be the order of books to read for JEE preparations?',
  'How is Cengage Books for JEE Advanced?',
  'Which books did JEE Advanced 2016 AIR 2 Bhavesh Dhingra use in preparation for JEE?',
  'What are some good books for JEE Chemistry?',
  'Are Cengage books good for JEE Advanced?',
  'What are some good books for JEE Mains only?',
  'What are the best books for JEE in chemistry?',
  'What are the best mathematics books for the IIT-JEE preparation?',
  'What are the best books for theory in math for IIT-JEE preparation?',
  'Are NCERT books enough for the JEE Main?',
  'How are JEE Advanced papers prepared?',
  'How should I study for JEE?',
  'Which book should i use for JEE  organic chemistry?',
  'Which book should I use for JEE organic chemistry?',
  '

Может стоит использовать вместо LSH индексацию из faiss? https://github.com/facebookresearch/faiss/blob/main/tutorial/python/2-IVFFlat.py

- типа IVFADC

## Faiss

- этот код позволяет автоматически сформировать центроиды из данных, а потом быстро получать индекс нужного нам вектора

In [1]:
import faiss
import numpy as np

d = 128  # Dimensionality of the vectors
nlist = 100  # Number of Voronoi cells (buckets)
quantizer = faiss.IndexFlatL2(d)  # Replace with other quantizers as needed

#  Using a GPU index.
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

# Generate some random data for training
xt = np.random.random((1000, d)).astype("float32")

# Train the index
index.train(xt)

# Add some vectors to the index (training data)
index.add(xt)

# Create a query vector
xq = np.random.random((1, d)).astype("float32")


quantizer.assign(xq, 1)



array([[22]])

### Add Spacy for sentence tokenization(спорно)

In [2]:
# python -m spacy download en_core_web_sm
import spacy

nlp = spacy.load("en_core_web_sm")

In [None]:
# from datasets import load_dataset

# dataset = load_dataset("Open-Orca/OpenOrca")
# dataset = dataset["train"]

In [9]:
long_text = open('./long_text_example.txt').read()
long_text

'Deep Learning: A Deep Dive into the Engine of Modern AI\n\nDeep learning, a subfield of machine learning, has revolutionized the landscape of artificial intelligence in recent years. From self-driving cars to personalized medicine, its applications are becoming increasingly pervasive. But what exactly is deep learning? And what makes it so powerful?\n\nAt its core, deep learning relies on artificial neural networks with multiple layers (hence the "deep"). These networks are inspired by the structure and function of the human brain, attempting to mimic the interconnected web of neurons that allows us to learn and process information. Unlike traditional machine learning algorithms that often require hand-engineered features, deep learning excels at learning these features directly from raw data. This ability to automatically extract complex patterns is a key differentiator and a major contributor to its superior performance in many tasks.\n\nUnderstanding the Building Blocks: Artificial

In [1]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = torch.bfloat16
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name="unsloth/Llama-3.2-3B-Instruct",  # or choose "unsloth/Llama-3.2-1B-Instruct"
    model_name="unsloth/Qwen2.5-3B-Instruct-bnb-4bit",  # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    # fix_tokenizer=False,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
[2025-02-25 15:04:25,573] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


INFO 02-25 15:04:26 __init__.py:190] Automatically detected platform cuda.
==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.546 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
FastLanguageModel.for_inference(model).generate

<function unsloth.models.llama._wrap_fast_inference.<locals>._fast_generate(*args, **kwargs)>

In [None]:
FastLanguageModel.for_inference(model)
text = tokenizer.apply_chat_template(
    [
        {"role": "user", "content": "How many r's are in strawberry?"},
    ],
    tokenize=False,
    add_generation_prompt=True,
)

from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=1024,
)
output = (
    model.generate(
        text,
        # sampling_params=sampling_params,
        lora_request=None,
        temperature=0.8,
        top_p=0.95,
        # max_tokens=1024,
    )[0]
    .outputs[0]
    .text
)

AttributeError: 'list' object has no attribute 'shape'