In [1]:
!ls

In [4]:
!pip install chromadb

[0m

In [5]:
!pip install langchain sentence_transformers

Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)
[0mInstalling collected packages: nvidia-cublas-cu12
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires pylibraft-cu12==25.6.*, but you have pylibraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires rmm-cu12==25.6.*, but you have rmm-cu12 25.2.0 which is incompatible.[0m[31m
[0mSuccessfully installed nvidia-cublas-cu12


In [7]:
import os
import glob

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

from tqdm import tqdm

import hashlib
import chromadb

from sentence_transformers import SentenceTransformer

## Config

In [8]:
DATASET_PATH="/kaggle/input/dataset"
CHUNK_SIZE=1000
CHUNK_OVERLAP=200

## Collect dataset

In [9]:
md_files = glob.glob(os.path.join(DATASET_PATH, "**/*.md"), recursive=True)
documents = []

print(f"Найдено {len(md_files)} .md файлов")

for file_path in tqdm(md_files, desc="loading dataset"):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        relative_path = os.path.relpath(file_path, DATASET_PATH)
        folder = os.path.dirname(relative_path)

        document = Document(
            page_content=content,
            metadata={
                "source": file_path,
                "folder": folder,
                "filename": os.path.basename(file_path),
                "file_type": "markdown"
            }
        )
        documents.append(document)

    except Exception as e:
        print(f"Ошибка загрузки файла {file_path}: {e}")

Найдено 3089 .md файлов


loading dataset: 100%|██████████| 3089/3089 [00:07<00:00, 411.33it/s]


## Chunking

In [10]:
chunker = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""]
)

In [11]:
chunks = chunker.split_documents(documents)
print(f"Создано {len(chunks)} чанков из {len(documents)} документов")

Создано 27770 чанков из 3089 документов


## Vector DB

In [12]:
chroma_client = chromadb.PersistentClient(path="./chroma")
collection = chroma_client.get_or_create_collection(name="docs")

In [13]:
def generate_doc_id(content: str, metadata: dict) -> str:
    """Генерируем уникальный ID для чанка"""
    unique_string = f"{content[:100]}_{metadata['source']}_{metadata.get('chunk_index', 0)}"
    return hashlib.md5(unique_string.encode()).hexdigest()

In [14]:
from concurrent.futures import ThreadPoolExecutor

print("Создание эмбеддингов и добавление в векторную БД...")

model = SentenceTransformer('all-MiniLM-L6-v2')
print("Модель эмбеддингов загружена.")

batch_size = 500
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = []
    for i in tqdm(range(0, len(chunks), batch_size)):
        batch = chunks[i:i + batch_size]

        documents_batch = []
        metadatas_batch = []
        ids_batch = []

        for j, chunk in enumerate(batch):
            chunk.metadata["chunk_index"] = i + j
            doc_id = generate_doc_id(chunk.page_content, chunk.metadata)

            documents_batch.append(chunk.page_content)
            metadatas_batch.append(chunk.metadata)
            ids_batch.append(doc_id)

        batch_embeddings = model.encode(documents_batch).tolist()

        future = executor.submit(
            collection.add,
            embeddings=batch_embeddings,
            documents=documents_batch,
            metadatas=metadatas_batch,
            ids=ids_batch
        )
        futures.append(future)

        print(f"Пакет {min(i + batch_size, len(chunks))}/{len(chunks)} чанков отправлен на обработку")

    for i, future in tqdm(enumerate(futures), total=len(futures), desc="Adding batches to ChromaDB"):
        try:
            future.result()
        except Exception as exc:
            print(f'Пакет {i} сгенерировал исключение: {exc}')

print(f"Векторное хранилище инициализировано. Всего чанков: {len(chunks)}")

Создание эмбеддингов и добавление в векторную БД...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Модель эмбеддингов загружена.


  2%|▏         | 1/56 [00:01<01:31,  1.66s/it]

Пакет 500/27770 чанков отправлен на обработку


  4%|▎         | 2/56 [00:03<01:23,  1.55s/it]

Пакет 1000/27770 чанков отправлен на обработку


  5%|▌         | 3/56 [00:04<01:22,  1.55s/it]

Пакет 1500/27770 чанков отправлен на обработку


  7%|▋         | 4/56 [00:06<01:21,  1.57s/it]

Пакет 2000/27770 чанков отправлен на обработку


  9%|▉         | 5/56 [00:07<01:20,  1.58s/it]

Пакет 2500/27770 чанков отправлен на обработку


 11%|█         | 6/56 [00:09<01:23,  1.67s/it]

Пакет 3000/27770 чанков отправлен на обработку


 12%|█▎        | 7/56 [00:11<01:21,  1.66s/it]

Пакет 3500/27770 чанков отправлен на обработку


 14%|█▍        | 8/56 [00:12<01:18,  1.64s/it]

Пакет 4000/27770 чанков отправлен на обработку


 16%|█▌        | 9/56 [00:14<01:18,  1.68s/it]

Пакет 4500/27770 чанков отправлен на обработку


 18%|█▊        | 10/56 [00:16<01:16,  1.67s/it]

Пакет 5000/27770 чанков отправлен на обработку


 20%|█▉        | 11/56 [00:18<01:17,  1.73s/it]

Пакет 5500/27770 чанков отправлен на обработку


 21%|██▏       | 12/56 [00:19<01:16,  1.73s/it]

Пакет 6000/27770 чанков отправлен на обработку


 23%|██▎       | 13/56 [00:21<01:14,  1.74s/it]

Пакет 6500/27770 чанков отправлен на обработку


 25%|██▌       | 14/56 [00:23<01:12,  1.72s/it]

Пакет 7000/27770 чанков отправлен на обработку


 27%|██▋       | 15/56 [00:25<01:10,  1.72s/it]

Пакет 7500/27770 чанков отправлен на обработку


 29%|██▊       | 16/56 [00:26<01:09,  1.73s/it]

Пакет 8000/27770 чанков отправлен на обработку


 30%|███       | 17/56 [00:28<01:08,  1.75s/it]

Пакет 8500/27770 чанков отправлен на обработку


 32%|███▏      | 18/56 [00:30<01:06,  1.76s/it]

Пакет 9000/27770 чанков отправлен на обработку


 34%|███▍      | 19/56 [00:32<01:05,  1.76s/it]

Пакет 9500/27770 чанков отправлен на обработку


 36%|███▌      | 20/56 [00:33<01:03,  1.76s/it]

Пакет 10000/27770 чанков отправлен на обработку


 38%|███▊      | 21/56 [00:36<01:04,  1.84s/it]

Пакет 10500/27770 чанков отправлен на обработку


 39%|███▉      | 22/56 [00:37<01:01,  1.80s/it]

Пакет 11000/27770 чанков отправлен на обработку


 41%|████      | 23/56 [00:39<00:59,  1.82s/it]

Пакет 11500/27770 чанков отправлен на обработку


 43%|████▎     | 24/56 [00:41<00:56,  1.78s/it]

Пакет 12000/27770 чанков отправлен на обработку


 45%|████▍     | 25/56 [00:43<00:55,  1.79s/it]

Пакет 12500/27770 чанков отправлен на обработку


 46%|████▋     | 26/56 [00:44<00:53,  1.77s/it]

Пакет 13000/27770 чанков отправлен на обработку


 48%|████▊     | 27/56 [00:46<00:51,  1.78s/it]

Пакет 13500/27770 чанков отправлен на обработку


 50%|█████     | 28/56 [00:48<00:49,  1.75s/it]

Пакет 14000/27770 чанков отправлен на обработку


 52%|█████▏    | 29/56 [00:50<00:47,  1.77s/it]

Пакет 14500/27770 чанков отправлен на обработку


 54%|█████▎    | 30/56 [00:52<00:47,  1.81s/it]

Пакет 15000/27770 чанков отправлен на обработку


 55%|█████▌    | 31/56 [00:53<00:45,  1.81s/it]

Пакет 15500/27770 чанков отправлен на обработку


 57%|█████▋    | 32/56 [00:55<00:42,  1.79s/it]

Пакет 16000/27770 чанков отправлен на обработку


 59%|█████▉    | 33/56 [00:57<00:41,  1.79s/it]

Пакет 16500/27770 чанков отправлен на обработку


 61%|██████    | 34/56 [00:59<00:38,  1.76s/it]

Пакет 17000/27770 чанков отправлен на обработку


 62%|██████▎   | 35/56 [01:00<00:37,  1.78s/it]

Пакет 17500/27770 чанков отправлен на обработку


 64%|██████▍   | 36/56 [01:02<00:35,  1.78s/it]

Пакет 18000/27770 чанков отправлен на обработку


 66%|██████▌   | 37/56 [01:04<00:34,  1.81s/it]

Пакет 18500/27770 чанков отправлен на обработку


 68%|██████▊   | 38/56 [01:06<00:32,  1.79s/it]

Пакет 19000/27770 чанков отправлен на обработку


 70%|██████▉   | 39/56 [01:08<00:30,  1.79s/it]

Пакет 19500/27770 чанков отправлен на обработку


 71%|███████▏  | 40/56 [01:10<00:29,  1.86s/it]

Пакет 20000/27770 чанков отправлен на обработку


 73%|███████▎  | 41/56 [01:12<00:29,  1.97s/it]

Пакет 20500/27770 чанков отправлен на обработку


 75%|███████▌  | 42/56 [01:14<00:27,  1.96s/it]

Пакет 21000/27770 чанков отправлен на обработку


 77%|███████▋  | 43/56 [01:16<00:25,  1.93s/it]

Пакет 21500/27770 чанков отправлен на обработку


 79%|███████▊  | 44/56 [01:17<00:22,  1.91s/it]

Пакет 22000/27770 чанков отправлен на обработку


 80%|████████  | 45/56 [01:19<00:21,  1.92s/it]

Пакет 22500/27770 чанков отправлен на обработку


 82%|████████▏ | 46/56 [01:21<00:18,  1.85s/it]

Пакет 23000/27770 чанков отправлен на обработку


 84%|████████▍ | 47/56 [01:23<00:16,  1.87s/it]

Пакет 23500/27770 чанков отправлен на обработку


 86%|████████▌ | 48/56 [01:25<00:14,  1.84s/it]

Пакет 24000/27770 чанков отправлен на обработку


 88%|████████▊ | 49/56 [01:27<00:13,  1.88s/it]

Пакет 24500/27770 чанков отправлен на обработку


 89%|████████▉ | 50/56 [01:29<00:11,  1.84s/it]

Пакет 25000/27770 чанков отправлен на обработку


 91%|█████████ | 51/56 [01:30<00:09,  1.85s/it]

Пакет 25500/27770 чанков отправлен на обработку


 93%|█████████▎| 52/56 [01:32<00:07,  1.86s/it]

Пакет 26000/27770 чанков отправлен на обработку


 95%|█████████▍| 53/56 [01:34<00:05,  1.91s/it]

Пакет 26500/27770 чанков отправлен на обработку


 96%|█████████▋| 54/56 [01:36<00:03,  1.90s/it]

Пакет 27000/27770 чанков отправлен на обработку


 98%|█████████▊| 55/56 [01:38<00:01,  1.93s/it]

Пакет 27500/27770 чанков отправлен на обработку


100%|██████████| 56/56 [01:39<00:00,  1.79s/it]


Пакет 27770/27770 чанков отправлен на обработку


Adding batches to ChromaDB: 100%|██████████| 56/56 [00:00<00:00, 169.82it/s]

Векторное хранилище инициализировано. Всего чанков: 27770





## Search test

In [15]:
query = "How to check is cuda available"

query_embedding = model.encode([query]).tolist()

search_kwargs = {
    "query_embeddings": query_embedding,
    "n_results": 5
}

results = collection.query(**search_kwargs)

retrieved_docs = []
if results['documents']:
    for i, doc_content in enumerate(results['documents'][0]):
        metadata = results['metadatas'][0][i] if results['metadatas'] else {}
        distance = results['distances'][0][i] if results['distances'] else None

        document = Document(
            page_content=doc_content,
            metadata={
                **metadata,
                "distance": distance,
                "retrieval_rank": i + 1
            }
        )
        retrieved_docs.append(document)

In [16]:
for doc in retrieved_docs:
  print(doc.metadata)
  print(doc.page_content)
  print("=====================")

{'filename': 'cuda.md', 'file_type': 'markdown', 'folder': 'dataset/torch', 'source': '/kaggle/input/dataset/dataset/torch/cuda.md', 'chunk_index': 14783, 'distance': 0.7344754338264465, 'retrieval_rank': 1}
torch.cuda 

This package adds support for CUDA tensor types. 

It implements the same function as CPU tensors, but they utilize
GPUs for computation. 

It is lazily initialized, so you can always import it, and use [`is_available()`](generated/torch.cuda.is_available.html#torch.cuda.is_available "torch.cuda.is_available")  to determine if your system supports CUDA. 

[CUDA semantics](notes/cuda.html#cuda-semantics)  has more details about working with CUDA.
{'chunk_index': 4757, 'source': '/kaggle/input/dataset/dataset/cuda/torch.cuda.cudart.md', 'filename': 'torch.cuda.cudart.md', 'file_type': 'markdown', 'folder': 'dataset/cuda', 'distance': 0.8095513582229614, 'retrieval_rank': 2}
torch.cuda.cudart 

torch.cuda. cudart ( ) [source](https://github.com/pytorch/pytorch/blob/v2.8.0

## Try to work smarter with code blocks



In [17]:
import re
from langchain.schema import Document

def parse_markdown_with_code(content, file_path):
    code_block_pattern = r'```(?:\w+)?\n(.*?)\n```'
    inline_code_pattern = r'`([^`]+)`'

    code_blocks = re.findall(code_block_pattern, content, re.DOTALL)

    text_content = re.sub(code_block_pattern, '', content, flags=re.DOTALL)
    text_content = re.sub(inline_code_pattern, '', text_content)

    text_content = re.sub(r'\n\s*\n', '\n\n', text_content).strip()

    documents = []

    if text_content:
        text_doc = Document(
            page_content=text_content,
            metadata={
                "source": file_path,
                "content_type": "text",
                "file_type": "markdown"
            }
        )
        documents.append(text_doc)

    for i, code_block in enumerate(code_blocks):
        if code_block.strip():
            code_doc = Document(
                page_content=code_block.strip(),
                metadata={
                    "source": file_path,
                    "content_type": "code",
                    "code_block_index": i,
                    "file_type": "markdown_code"
                }
            )
            documents.append(code_doc)

    return documents

In [18]:
def load_markdown_documents_with_code(dataset_path):
    md_files = glob.glob(os.path.join(dataset_path, "**/*.md"), recursive=True)
    all_documents = []

    print(f"Найдено {len(md_files)} .md файлов")

    for file_path in tqdm(md_files, desc="Парсинг markdown с кодом"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            parsed_docs = parse_markdown_with_code(content, file_path)
            all_documents.extend(parsed_docs)

        except Exception as e:
            print(f"Ошибка загрузки файла {file_path}: {e}")

    print(f"Создано {len(all_documents)} документов (текст + код)")
    return all_documents

In [19]:
def create_hybrid_chunkers():
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
        separators=[
            "\n## ",  # Заголовки второго уровня
            "\n### ", # Заголовки третьего уровня
            "\n\n",   # Абзацы
            "\n",     # Строки
            " ",      # Слова
        ],
        length_function=len
    )

    code_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=200,
        separators=[
            "\n\n\n", # Большие разделения
            "\n\n",   # Функции/классы
            "\n",     # Строки кода
            ";",      # Операторы
            " ",      # Пробелы
        ],
        length_function=len
    )

    return text_splitter, code_splitter

def chunk_by_content_type(documents):
    """Разделяет чанкинг по типу контента"""

    text_splitter, code_splitter = create_hybrid_chunkers()

    text_docs = [doc for doc in documents if doc.metadata.get("content_type") == "text"]
    code_docs = [doc for doc in documents if doc.metadata.get("content_type") == "code"]

    print(f"Текстовых документов: {len(text_docs)}")
    print(f"Кодовых документов: {len(code_docs)}")

    text_chunks = text_splitter.split_documents(text_docs)
    code_chunks = code_splitter.split_documents(code_docs)

    all_chunks = text_chunks + code_chunks

    print(f"Текстовых чанков: {len(text_chunks)}")
    print(f"Кодовых чанков: {len(code_chunks)}")
    print(f"Всего чанков: {len(all_chunks)}")

    return all_chunks

In [20]:
def chunk_by_content_type(documents):
    text_splitter, code_splitter = create_hybrid_chunkers()

    text_docs = [doc for doc in documents if doc.metadata.get("content_type") == "text"]
    code_docs = [doc for doc in documents if doc.metadata.get("content_type") == "code"]

    print(f"Текстовых документов: {len(text_docs)}")
    print(f"Кодовых документов: {len(code_docs)}")

    text_chunks = text_splitter.split_documents(text_docs)
    code_chunks = code_splitter.split_documents(code_docs)

    all_chunks = text_chunks + code_chunks

    print(f"Текстовых чанков: {len(text_chunks)}")
    print(f"Кодовых чанков: {len(code_chunks)}")
    print(f"Всего чанков: {len(all_chunks)}")

    return all_chunks

In [21]:
class MarkdownCodeEmbedder:
    def __init__(self):
        self.text_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to("cuda")
        self.code_model = SentenceTransformer('microsoft/codebert-base').to("cuda")

    def encode_document(self, document):
        content_type = document.metadata.get("content_type", "text")
        content = document.page_content

        if content_type == "code":
            processed_code = self.preprocess_code(content)
            return self.code_model.encode([processed_code]).tolist()[0]
        else:
            return self.text_model.encode([content]).tolist()[0]

    def preprocess_code(self, code):
        code = re.sub(r'\s+', ' ', code)
        code = re.sub(r'#.*$', '', code, flags=re.MULTILINE)
        code = re.sub(r'//.*$', '', code, flags=re.MULTILINE)
        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)

        return code.strip()

    def encode_batch(self, documents):
        texts = [doc.page_content for doc in documents]
        content_types = [doc.metadata.get("content_type", "text") for doc in documents]

        embeddings = []
        for text, content_type in zip(texts, content_types):
            if content_type == "code":
                processed = self.preprocess_code(text)
                emb = self.code_model.encode([processed]).tolist()[0]
            else:
                emb = self.text_model.encode([text]).tolist()[0]
            embeddings.append(emb)

        return embeddings

In [22]:
print("Загрузка и парсинг markdown файлов...")
documents = load_markdown_documents_with_code(DATASET_PATH)

print("Чанкинг...")
chunks = chunk_by_content_type(documents)

Загрузка и парсинг markdown файлов...
Найдено 3089 .md файлов


Парсинг markdown с кодом: 100%|██████████| 3089/3089 [00:02<00:00, 1215.21it/s]


Создано 8018 документов (текст + код)
Чанкинг...
Текстовых документов: 3088
Кодовых документов: 4930
Текстовых чанков: 27615
Кодовых чанков: 6326
Всего чанков: 33941


In [23]:
import uuid
print("Загрузка моделей эмбеддингов...")
embedder = MarkdownCodeEmbedder()

chroma_client = chromadb.PersistentClient(path="./chroma3")
collection = chroma_client.get_or_create_collection(
    name="markdown_docs",
    metadata={"hnsw:space": "cosine"}
)

print("Создание эмбеддингов...")
batch_size = 100

for i in tqdm(range(0, len(chunks), batch_size)):
    batch = chunks[i:i + batch_size]

    documents_batch = [chunk.page_content for chunk in batch]
    metadatas_batch = [chunk.metadata for chunk in batch]
    ids_batch = [uuid.uuid4().hex for chunk in batch]

    embeddings_batch = embedder.encode_batch(batch)

    collection.add(
        embeddings=embeddings_batch,
        documents=documents_batch,
        metadatas=metadatas_batch,
        ids=ids_batch
    )

print(f"Векторное хранилище создано. Всего чанков: {len(chunks)}")

Загрузка моделей эмбеддингов...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Создание эмбеддингов...



  0%|          | 0/340 [00:00<?, ?it/s][A
  0%|          | 1/340 [00:02<11:24,  2.02s/it][A
  1%|          | 2/340 [00:03<11:08,  1.98s/it][A
  1%|          | 3/340 [00:05<11:01,  1.96s/it][A
  1%|          | 4/340 [00:10<16:43,  2.99s/it][A
  1%|▏         | 5/340 [00:12<14:46,  2.65s/it][A
  2%|▏         | 6/340 [00:14<13:31,  2.43s/it][A
  2%|▏         | 7/340 [00:16<12:38,  2.28s/it][A
  2%|▏         | 8/340 [00:18<12:13,  2.21s/it][A
  3%|▎         | 9/340 [00:20<11:59,  2.17s/it][A
  3%|▎         | 10/340 [00:22<11:45,  2.14s/it][A
  3%|▎         | 11/340 [00:24<11:22,  2.08s/it][A
  4%|▎         | 12/340 [00:26<11:17,  2.07s/it][A
  4%|▍         | 13/340 [00:28<11:05,  2.04s/it][A
  4%|▍         | 14/340 [00:30<11:08,  2.05s/it][A
  4%|▍         | 15/340 [00:34<13:21,  2.47s/it][A
  5%|▍         | 16/340 [00:36<12:35,  2.33s/it][A
  5%|▌         | 17/340 [00:38<11:48,  2.19s/it][A
  5%|▌         | 18/340 [00:39<11:17,  2.10s/it][A
  6%|▌         | 19/340 [00:4

Векторное хранилище создано. Всего чанков: 33941





In [24]:
def enhanced_search(collection, embedder, query, content_filter=None, n_results=5):
    is_code_query = any(keyword in query.lower() for keyword in
                       ['code', 'function', 'def ', 'class ', 'import ', 'how to'])

    if is_code_query:
        query_embedding = embedder.code_model.encode([query]).tolist()
    else:
        query_embedding = embedder.text_model.encode([query]).tolist()

    search_kwargs = {
        "query_embeddings": query_embedding,
        "n_results": n_results
    }

    if content_filter:
        search_kwargs["where"] = {"content_type": content_filter}

    results = collection.query(**search_kwargs)

    retrieved_docs = []
    if results['documents']:
        for i, doc_content in enumerate(results['documents'][0]):
            metadata = results['metadatas'][0][i] if results['metadatas'] else {}
            distance = results['distances'][0][i] if results['distances'] else None

            document = Document(
                page_content=doc_content,
                metadata={
                    **metadata,
                    "distance": distance,
                    "retrieval_rank": i + 1,
                    "is_code": metadata.get("content_type") == "code"
                }
            )
            retrieved_docs.append(document)

    return retrieved_docs

In [25]:
test_queries = [
    "How to check if CUDA is available",
    "What is PyTorch",
    "Neural network implementation",
]

for query in test_queries:
    print(f"\n=== Запрос: {query} ===")
    results = enhanced_search(collection, embedder, query, n_results=3)

    for i, doc in enumerate(results):
        content_preview = doc.page_content[:150] + "..." if len(doc.page_content) > 150 else doc.page_content
        print(f"{i+1}. [{'КОД' if doc.metadata.get('is_code') else 'ТЕКСТ'}] {content_preview}")
        print(f"   Источник: {doc.metadata.get('source', 'N/A')}")
        print(f"   Расстояние: {doc.metadata.get('distance', 'N/A'):.3f}")
        print()


=== Запрос: How to check if CUDA is available ===
1. [КОД] Unsupported: Tracing through optional input is not supported yet
   Источник: /kaggle/input/dataset/dataset/export/index.md
   Расстояние: 0.027

2. [КОД] The named tensor API is a prototype feature and subject to change.
   Источник: /kaggle/input/dataset/dataset/name_inference/named_tensor.md
   Расстояние: 0.028

3. [КОД] Unsupported: torch.* op returned non-Tensor
   Источник: /kaggle/input/dataset/dataset/export/index.md
   Расстояние: 0.028


=== Запрос: What is PyTorch ===
1. [ТЕКСТ] For a more detailed walk-through of PyTorch internal implementation,
please refer to [ezyang’s blogpost about PyTorch Internals](http://blog.ezyang.co...
   Источник: /kaggle/input/dataset/dataset/tensors/tensor_view.md
   Расстояние: 0.342

2. [ТЕКСТ] To effectively use this feature, it is important to know how the native part of
PyTorch is implemented. The most important component there is what we ...
   Источник: /kaggle/input/dataset/da

## Another way

In [26]:
import re
import os
import glob
from tqdm import tqdm
import chromadb
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def smart_markdown_processing(dataset_path):
    md_files = glob.glob(os.path.join(dataset_path, "**/*.md"), recursive=True)
    all_documents = []

    print(f"Найдено {len(md_files)} .md файлов")

    for file_path in tqdm(md_files, desc="Обработка markdown"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # НЕ разделяем текст и код - сохраняем как есть
            # Но добавляем метаданные о наличии кода
            has_code = '```' in content
            code_blocks_count = content.count('```') // 2

            document = Document(
                page_content=content,
                metadata={
                    "source": file_path,
                    "filename": os.path.basename(file_path),
                    "folder": os.path.dirname(os.path.relpath(file_path, dataset_path)),
                    "file_type": "markdown",
                    "has_code": has_code,
                    "code_blocks_count": code_blocks_count,
                    "content_type": "mixed"  # Всегда mixed для markdown
                }
            )
            all_documents.append(document)

        except Exception as e:
            print(f"Ошибка загрузки {file_path}: {e}")

    return all_documents

def create_smart_chunker():
    """Умный чанкер, который сохраняет контекст кода"""

    return RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=200,
        separators=[
            "\n## ",  # Заголовки
            "\n### ",
            "\n\n",   # Абзацы
            "\n",     # Строки
            "```",    # Блоки кода (сохраняем как разделитель)
            " ",
        ],
        keep_separator=True,
        length_function=len
    )

In [27]:
class SmartMarkdownEmbedder:
    def __init__(self):
        self.model = SentenceTransformer('BAAI/bge-base-en-v1.5').to("cuda")

    def preprocess_content(self, content):
        lines = content.split('\n')
        processed_lines = []

        for line in lines:
            if any(pattern in line.lower() for pattern in
                  ['date:', 'author:', 'created:', 'last modified:']):
                continue
            processed_lines.append(line)

        return '\n'.join(processed_lines)

    def encode(self, texts):
        if isinstance(texts, str):
            texts = [texts]

        processed_texts = [self.preprocess_content(text) for text in texts]
        return self.model.encode(processed_texts).tolist()

In [28]:
def fixed_retrieval_pipeline():
    print("Загрузка документов...")
    documents = smart_markdown_processing(DATASET_PATH)

    print("Чанкинг...")
    chunker = create_smart_chunker()
    chunks = chunker.split_documents(documents)
    print(f"Создано {len(chunks)} чанков")

    print("Загрузка модели эмбеддингов...")
    embedder = SmartMarkdownEmbedder()

    chroma_client = chromadb.PersistentClient(path="./chroma_fixed")
    collection = chroma_client.get_or_create_collection(
        name="docs_fixed",
        metadata={"hnsw:space": "cosine"}
    )

    print("Создание эмбеддингов...")
    batch_size = 100

    for i in tqdm(range(0, len(chunks), batch_size)):
        batch = chunks[i:i + batch_size]

        documents_batch = [chunk.page_content for chunk in batch]
        metadatas_batch = [chunk.metadata for chunk in batch]
        ids_batch = [f"chunk_{i+j}" for j, chunk in enumerate(batch)]

        embeddings_batch = embedder.encode(documents_batch)

        collection.add(
            embeddings=embeddings_batch,
            documents=documents_batch,
            metadatas=metadatas_batch,
            ids=ids_batch
        )

    return collection, embedder

collection, embedder = fixed_retrieval_pipeline()

Загрузка документов...
Найдено 3089 .md файлов


Обработка markdown: 100%|██████████| 3089/3089 [00:02<00:00, 1050.41it/s]


Чанкинг...
Создано 23157 чанков
Загрузка модели эмбеддингов...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Создание эмбеддингов...


100%|██████████| 232/232 [11:49<00:00,  3.06s/it]


In [29]:
def smart_search(collection, embedder, query, n_results=5):
    expanded_query = expand_query(query)

    query_embedding = embedder.encode(expanded_query)

    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results,
        include=["documents", "metadatas", "distances"]
    )

    return format_results(results)

def expand_query(query):
    query_lower = query.lower()

    if any(keyword in query_lower for keyword in ['cuda', 'gpu', 'torch']):
        return query + " pytorch torch cuda gpu"
    elif any(keyword in query_lower for keyword in ['neural', 'network', 'nn']):
        return query + " neural network pytorch torch"
    elif any(keyword in query_lower for keyword in ['how to', 'check', 'available']):
        return query + " function method available"

    return query

def format_results(results):
    formatted = []
    if results['documents']:
        for i, (doc, metadata, distance) in enumerate(zip(
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0]
        )):
            preview = get_relevant_preview(doc, 200)

            formatted.append({
                "rank": i + 1,
                "distance": distance,
                "source": metadata.get('source', 'N/A'),
                "preview": preview,
                "has_code": metadata.get('has_code', False),
                "full_content": doc
            })

    return formatted

def get_relevant_preview(content, length=200):
    code_blocks = re.findall(r'```.*?```', content, re.DOTALL)
    if code_blocks:
        preview = code_blocks[0]
        if len(preview) > length:
            preview = preview[:length] + "..."
        return preview

    if len(content) > length:
        return content[:length] + "..."
    return content

In [30]:
def test_fixed_system():
    test_cases = [
        {
            "query": "How to check if CUDA is available",
            "expected": "torch.cuda.is_available"
        },
        {
            "query": "What is PyTorch",
            "expected": "pytorch"
        },
        {
            "query": "How to create a tensor",
            "expected": "torch.tensor"
        },
        {
            "query": "neural network implementation in pytorch",
            "expected": "nn.Module"
        }
    ]

    for test in test_cases:
        print(f"\n{'='*60}")
        print(f"ЗАПРОС: {test['query']}")
        print(f"ОЖИДАЕМ: {test['expected']}")
        print(f"{'='*60}")

        results = smart_search(collection, embedder, test['query'])

        for i, result in enumerate(results):
            print(f"{i+1}. [расстояние: {result['distance']:.3f}]")
            print(f"   Источник: {os.path.basename(result['source'])}")
            print(f"   Код: {'✓' if result['has_code'] else '✗'}")
            print(f"   Предпросмотр: {result['preview']}")
            print()

        found_expected = any(
            test['expected'].lower() in result['full_content'].lower()
            for result in results
        )
        print(f"Найден ожидаемый контент: {'✓' if found_expected else '✗'}")

test_fixed_system()


ЗАПРОС: How to check if CUDA is available
ОЖИДАЕМ: torch.cuda.is_available
1. [расстояние: 0.123]
   Источник: cuda.md
   Код: ✓
   Предпросмотр: One can set `PYTORCH_NVML_BASED_CUDA_CHECK=1`  in your environment before importing PyTorch modules that execute [`is_available()`](../generated/torch.cuda.is_available.html#torch.cuda.is_available "t...

2. [расстояние: 0.123]
   Источник: cuda.md
   Код: ✓
   Предпросмотр: One can set `PYTORCH_NVML_BASED_CUDA_CHECK=1`  in your environment before importing PyTorch modules that execute [`is_available()`](../generated/torch.cuda.is_available.html#torch.cuda.is_available "t...

3. [расстояние: 0.123]
   Источник: cuda.md
   Код: ✓
   Предпросмотр: One can set `PYTORCH_NVML_BASED_CUDA_CHECK=1`  in your environment before importing PyTorch modules that execute [`is_available()`](../generated/torch.cuda.is_available.html#torch.cuda.is_available "t...

4. [расстояние: 0.123]
   Источник: cuda.md
   Код: ✓
   Предпросмотр: One can set `PYTORCH_NVML

## Try to make with minimum duplicates

In [31]:
def fast_deduplicate_chunks(chunks):
    seen_hashes = set()
    unique_chunks = []

    for chunk in chunks:
        content_hash = hash(chunk.page_content.strip())

        if content_hash not in seen_hashes:
            seen_hashes.add(content_hash)
            unique_chunks.append(chunk)

    print(f"🚀 Быстрая дедупликация: {len(chunks)} → {len(unique_chunks)} чанков")
    return unique_chunks

def signature_based_deduplication(chunks):
    seen_signatures = set()
    unique_chunks = []

    for chunk in chunks:
        content_preview = chunk.page_content[:100].strip()
        content_length = len(chunk.page_content)
        signature = f"{content_preview}_{content_length}"

        if signature not in seen_signatures:
            seen_signatures.add(signature)
            unique_chunks.append(chunk)

    print(f"🚀 Дедупликация по сигнатуре: {len(chunks)} → {len(unique_chunks)} чанков")
    return unique_chunks

In [32]:
def load_deduplicated_documents(dataset_path):
    md_files = glob.glob(os.path.join(dataset_path, "**/*.md"), recursive=True)
    all_documents = []
    seen_files = set()

    print(f"Найдено {len(md_files)} .md файлов")

    for file_path in tqdm(md_files, desc="Загрузка с дедупликацией"):
        try:
            file_hash = hash_file(file_path)
            if file_hash in seen_files:
                continue
            seen_files.add(file_hash)

            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            document = Document(
                page_content=content,
                metadata={
                    "source": file_path,
                    "filename": os.path.basename(file_path),
                    "folder": os.path.dirname(os.path.relpath(file_path, dataset_path)),
                    "file_type": "markdown"
                }
            )
            all_documents.append(document)

        except Exception as e:
            print(f"Ошибка загрузки {file_path}: {e}")

    return all_documents

def hash_file(filepath):
    import hashlib
    hasher = hashlib.md5()
    with open(filepath, 'rb') as f:
        buf = f.read(8192)
        while buf:
            hasher.update(buf)
            buf = f.read(8192)
    return hasher.hexdigest()

In [33]:
documents = load_deduplicated_documents(DATASET_PATH)

chunker = RecursiveCharacterTextSplitter(
    chunk_size=1600,
    chunk_overlap=100,
    separators=[
        "\n# ", "\n## ", "\n### ", "\n\n", "\n", " ",
    ],
    length_function=len
)
chunks = chunker.split_documents(documents)
print(f"Создано {len(chunks)} чанков")

chunks = signature_based_deduplication(chunks)
model = SentenceTransformer('BAAI/bge-base-en-v1.5').to("cuda")

chroma_client = chromadb.PersistentClient(path="./chroma_fast")
collection = chroma_client.get_or_create_collection(
    name="docs_fast",
    metadata={"hnsw:space": "cosine"}
)
batch_size = 200

print("Создание эмбеддингов...")
for i in tqdm(range(0, len(chunks), batch_size)):
    batch = chunks[i:i + batch_size]

    documents_batch = [chunk.page_content for chunk in batch]
    metadatas_batch = [chunk.metadata for chunk in batch]

    ids_batch = []
    for j, chunk in enumerate(batch):
        chunk_id = f"{i+j}_{hash(chunk.page_content[:50]) & 0xFFFFFF}"
        ids_batch.append(chunk_id)

    embeddings_batch = model.encode(documents_batch, normalize_embeddings=True).tolist()

    collection.add(
        embeddings=embeddings_batch,
        documents=documents_batch,
        metadatas=metadatas_batch,
        ids=ids_batch
    )

print(f"Векторная БД готова. Чанков: {len(chunks)}")

Найдено 3089 .md файлов


Загрузка с дедупликацией: 100%|██████████| 3089/3089 [00:03<00:00, 916.80it/s] 


Создано 8579 чанков
🚀 Дедупликация по сигнатуре: 8579 → 7715 чанков
Создание эмбеддингов...


100%|██████████| 39/39 [03:51<00:00,  5.93s/it]

Векторная БД готова. Чанков: 7715





In [34]:
def chromadb_deduplication_search(collection, model, query, n_results=5):
    initial_results = collection.query(
        query_embeddings=model.encode([query]).tolist(),
        n_results=n_results * 3,  # Берем в 3 раза больше
        include=["documents", "metadatas", "distances"]
    )
    unique_results = fast_filter_duplicates(initial_results, n_results)

    return unique_results

def fast_filter_duplicates(results, n_results):
    if not results['documents']:
        return results

    seen_sources = set()
    unique_docs = []
    unique_metadatas = []
    unique_distances = []

    for doc, metadata, distance in zip(
        results['documents'][0],
        results['metadatas'][0],
        results['distances'][0]
    ):
        source = metadata.get('source', 'unknown')
        preview = doc[:100]
        content_key = f"{source}:{preview}"

        if content_key not in seen_sources and len(unique_docs) < n_results:
            seen_sources.add(content_key)
            unique_docs.append(doc)
            unique_metadatas.append(metadata)
            unique_distances.append(distance)

    return {
        'documents': [unique_docs],
        'metadatas': [unique_metadatas],
        'distances': [unique_distances]
    }

In [35]:
def fast_contextual_search(collection, model, query, n_results=5):
    enhanced_query = smart_query_expansion(query)

    results = chromadb_deduplication_search(collection, model, enhanced_query, n_results)

    formatted = []
    for i, (doc, metadata, distance) in enumerate(zip(
        results['documents'][0],
        results['metadatas'][0],
        results['distances'][0]
    )):
        formatted.append({
            "rank": i + 1,
            "distance": distance,
            "source": metadata.get('source', 'N/A'),
            "preview": get_best_preview(doc, 150),
            "has_code": '```' in doc,
            "filename": os.path.basename(metadata.get('source', 'N/A'))
        })

    return formatted

def smart_query_expansion(query):
    query_lower = query.lower()

    if any(word in query_lower for word in ['cuda', 'gpu']):
        return query + " pytorch torch"
    elif any(word in query_lower for word in ['how to', 'check', 'verify']):
        return query + " example code"
    elif any(word in query_lower for word in ['what is', 'definition']):
        return query + " overview explanation"

    return query

def get_best_preview(content, length=150):
    code_match = re.search(r'```.*?```', content, re.DOTALL)
    if code_match:
        code_text = code_match.group(0)
        if len(code_text) > length:
            return code_text[:length] + "..."
        return code_text

    if len(content) > length:
        return content[:length] + "..."
    return content

In [36]:
test_queries = [
    "How to check if CUDA is available",
    "What is PyTorch",
    "How to create a tensor",
    "neural network implementation"
]

for query in test_queries:
    print(f"\n{'='*60}")
    print(f"🔍 ЗАПРОС: {query}")
    print(f"{'='*60}")
    results = fast_contextual_search(collection, model, query)

    for result in results:
        print(f"{result['rank']}. [dist: {result['distance']:.3f}] {result['filename']}")
        print(f"   {result['preview']}")
        print()


🔍 ЗАПРОС: How to check if CUDA is available
1. [dist: 0.116] cuda.md
   ```
x = torch.empty((8, 42), device=args.device)
net = Network().to(device=args.device)

```

2. [dist: 0.137] torch.cuda.is_available.md
   torch.cuda.is_available 

torch.cuda. is_available ( ) [source](ht...

3. [dist: 0.147] cuda.md
   ```

Note 

When assessing the availability of CUDA in a given environment ( [`is_available()`](../generated/torch.cuda.is_available.html#torch.cuda.i...

4. [dist: 0.152] cuda.md
   torch.cuda 

This package adds support for CUDA tensor types. 

It implements the same...

5. [dist: 0.171] torch.Tensor.is_cuda.md
   torch.Tensor.is_cuda 

Tensor. is_cuda 
:   Is `True`  if the Tensor is ...


🔍 ЗАПРОС: What is PyTorch
1. [dist: 0.235] cuda.md
   ### PyTorch API 


This API is in beta and may change in future releases.

PyTorch exposes graphs via a raw [`torch.cuda.CUDAGraph`](../gener...

2. [dist: 0.253] torch.md
   torch 

The torch package contains data structures for multi-d

## Prompt generating

In [143]:
from google import genai

In [144]:
from dataclasses import dataclass
from typing import Sequence

from google import genai


In [145]:
@dataclass
class RetrievedChunk:
    id: int
    text: str
    source: str
    score: float


In [146]:
def retrieve_chunks_for_llm(collection, embed_model, query: str, n_results: int = 5) -> list[RetrievedChunk]:
    enhanced_query = smart_query_expansion(query)
    results = chromadb_deduplication_search(collection, embed_model, enhanced_query, n_results)

    docs = results["documents"][0]
    metas = results["metadatas"][0]
    dists = results["distances"][0]

    chunks: list[RetrievedChunk] = []
    for i, (doc, meta, dist) in enumerate(zip(docs, metas, dists)):
        chunks.append(
            RetrievedChunk(
                id=i + 1,
                text=doc,
                source=meta.get("source", "N/A"),
                score=1.0 - float(dist),
            )
        )
    return chunks


In [147]:
SYSTEM_INSTRUCTIONS = """You are an assistant that answers questions about PyTorch 2.x and its ecosystem.
You must only use the context snippets below (PyTorch docs + curated StackOverflow answers).
If the context is not enough to answer, say you don't know and suggest where to look in the official docs.
Never invent APIs, arguments, or behavior that are not supported by the context."""

ANSWER_INSTRUCTIONS = """Answer format:
1) First, give a concise direct answer in 2–4 sentences.
2) Then provide a bullet list with details and short code examples if helpful.
3) Each bullet with factual claims must end with a citation in the form [§N], where N is the context id.
4) If multiple snippets support the same point, you can use [§1, §3].
5) After the bullets, add a small 'Where to read more' list with file paths or URLs."""


In [148]:
def render_context(chunks: Sequence[RetrievedChunk], max_chars: int = 14000) -> str:
    parts: list[str] = []
    total_len = 0

    for ch in chunks:
        header = f"[{ch.id}] {ch.source} (score={ch.score:.4f})"
        body = ch.text.strip()
        block = header + "\n" + body + "\n"
        if total_len + len(block) > max_chars:
            break
        parts.append(block)
        total_len += len(block)

    return "\n\n".join(parts)


In [149]:
def build_rag_prompt(question: str, chunks: Sequence[RetrievedChunk]) -> str:
    context_block = render_context(chunks)
    joined_sources = "\n".join(f"[§{c.id}] {c.source}" for c in chunks)

    prompt = f"""{SYSTEM_INSTRUCTIONS}

{ANSWER_INSTRUCTIONS}

User question:
{question}

Context snippets:
{context_block}

Remember:
- Base the answer only on the context snippets above.
- If the answer is not in the context, say that you don't know and suggest checking the relevant section in the PyTorch docs.
- Use the citation format [§N] that corresponds to the snippet ids.

List of snippet ids and sources:
{joined_sources}
"""
    return prompt


In [150]:
class GeminiGenerator:
    def __init__(
        self,
        model_name: str = "gemini-2.5-flash",
        api_key: str | None = None,
        temperature: float = 0.1,
        max_output_tokens: int = 1024,
    ) -> None:
        if api_key is None:
            self.client = genai.Client()
        else:
            self.client = genai.Client(api_key=api_key)
        self.model_name = model_name
        self.temperature = temperature
        self.max_output_tokens = max_output_tokens

    def generate_answer(
        self,
        question: str,
        chunks: Sequence[RetrievedChunk],
    ) -> str:
        prompt = build_rag_prompt(question, chunks)

        response = self.client.models.generate_content(
            model=self.model_name,
            contents=prompt,
            config={
                "temperature": self.temperature,
                "max_output_tokens": self.max_output_tokens,
            },
        )

        return response.text

    def generate_pure_answer(
        self,
        question: str,
    ) -> str:
        response = self.client.models.generate_content(
            model=self.model_name,
            contents=question,
            config={
                "temperature": self.temperature,
                "max_output_tokens": self.max_output_tokens,
            },
        )

        return response.text


In [151]:
gemini = GeminiGenerator(api_key="", model_name="gemini-2.5-flash-lite")

In [152]:
def rag_answer(query: str, n_results: int = 5) -> str:
    chunks = retrieve_chunks_for_llm(collection, model, query, n_results=n_results)
    if not chunks:
        return "I couldn't retrieve any relevant context for this question from the knowledge base."
    return gemini.generate_answer(query, chunks)

def pure_gemini_answer(query: str) -> str:
    return gemini.generate_pure_answer(query)


In [155]:
test_query = "In PyTorch 2.x, how can I check if CUDA is available without initializing the CUDA driver, to avoid issues with forked processes?"

print(f"\n{'='*60}")
print(f"🔍 ЗАПРОС: {test_query}")
print(f"{'='*60}")

answer = rag_answer(test_query, n_results=5)
print(answer)

print(f"\n{'-'*60}")

gemini_answer = pure_gemini_answer(test_query)
print(gemini_answer)


🔍 ЗАПРОС: In PyTorch 2.x, how can I check if CUDA is available without initializing the CUDA driver, to avoid issues with forked processes?
To check for CUDA availability without initializing the CUDA driver, which can cause issues with forked processes, you can set the environment variable `PYTORCH_NVML_BASED_CUDA_CHECK=1` before importing PyTorch or calling `torch.cuda.is_available()`. This directs `is_available()` to use an NVML-based check instead of the default CUDA Runtime API method. If this NVML check is successful, it will prevent the poisoning of subsequent forks.

*   By default, `torch.cuda.is_available()` calls the CUDA Runtime API, which initializes the CUDA Driver API. This initialization can cause subsequent forks of a process to fail with a CUDA initialization error [§1].
*   Setting the environment variable `PYTORCH_NVML_BASED_CUDA_CHECK=1` before importing PyTorch or calling `torch.cuda.is_available()` will make `is_available()` attempt an NVML-based check [§2, §3].

In [156]:
QA_DATASET_PATH="/kaggle/input/stackoverflow/stackoverflow-pytorch.csv"

In [157]:
import pandas as pd

df = pd.read_csv(QA_DATASET_PATH)

df = df.rename(columns={
    "question_body": "question",
    "answer_body": "answer",
})
df = df[["question", "answer", "answer_score"]].dropna()


In [158]:
df = df[df["answer_score"] > 0].reset_index(drop=True)
eval_df = df.sample(200, random_state=42).reset_index(drop=True)

In [159]:
def _normalize(text: str) -> list[str]:
    text = text.lower().strip()
    text = re.sub(r"[^a-z0-9_]+", " ", text)
    return [t for t in text.split() if t]

def squad_f1(pred: str, gold: str) -> float:
    p_tokens = _normalize(pred)
    g_tokens = _normalize(gold)
    if not p_tokens and not g_tokens:
        return 1.0
    if not p_tokens or not g_tokens:
        return 0.0
    common = {}
    for t in p_tokens:
        common[t] = common.get(t, 0) + 1
    overlap = 0
    for t in g_tokens:
        if common.get(t, 0) > 0:
            overlap += 1
            common[t] -= 1
    if overlap == 0:
        return 0.0
    precision = overlap / len(p_tokens)
    recall = overlap / len(g_tokens)
    return 2 * precision * recall / (precision + recall)


In [160]:
import numpy as np

embed_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def embedding_sim(pred: str, gold: str) -> float:
    vecs = embed_model.encode([pred, gold])
    return cosine(vecs[0], vecs[1])


In [161]:
def evaluate_model(eval_df: pd.DataFrame, answer_fn, max_samples: int | None = None):
    if max_samples is not None and max_samples < len(eval_df):
        data = eval_df.sample(max_samples, random_state=42).reset_index(drop=True)
    else:
        data = eval_df.reset_index(drop=True)

    records = []

    for row in tqdm(data.itertuples(index=False), total=len(data)):
        q = row.question
        gold = row.answer

        try:
            pred = answer_fn(q)
        except Exception as e:
            pred = ""

        f1 = squad_f1(pred, gold)
        sim = embedding_sim(pred, gold)

        records.append({
            "question": q,
            "gold": gold,
            "pred": pred,
            "f1": f1,
            "sim": sim,
        })

    results_df = pd.DataFrame(records)
    summary = {
        "n": len(results_df),
        "mean_f1": results_df["f1"].mean(),
        "mean_sim": results_df["sim"].mean(),
        "median_f1": results_df["f1"].median(),
        "median_sim": results_df["sim"].median(),
    }
    return results_df, summary


In [162]:
rag_results, rag_summary = evaluate_model(eval_df, rag_answer, max_samples=200)
pure_results, pure_summary = evaluate_model(eval_df, pure_gemini_answer, max_samples=200)

print("RAG:", rag_summary)
print("PURE:", pure_summary)


100%|██████████| 200/200 [06:45<00:00,  2.03s/it]
100%|██████████| 200/200 [15:22<00:00,  4.61s/it]

RAG: {'n': 200, 'mean_f1': 0.21743579375758018, 'mean_sim': 0.7767660923302173, 'median_f1': 0.21363372334246122, 'median_sim': 0.7852433025836945}
PURE: {'n': 200, 'mean_f1': 0.19445437091348858, 'mean_sim': 0.8008399587869645, 'median_f1': 0.18753520179036928, 'median_sim': 0.8095232248306274}



