# Playground - In-Memory DB

## Imports

In [1]:
import json
import os
from dataclasses import asdict, dataclass, field

import hnswlib
import numpy as np
import openai
import tiktoken
from dotenv import load_dotenv


load_dotenv("../app/.env")

True

In [2]:
# API key taken from the process environment; None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## Read text

`Note:` TODO: use the existing fetch logic here.

In [4]:
# Decode explicitly as UTF-8 so the German source text is read the same way on
# every platform instead of depending on the locale's default encoding.
with open("../dev/hitler-wiki.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Number of characters read (shown as the cell output).
len(text)

309750

## Chunk text

In [5]:
def chunk_text(
    text,
    model_name="text-embedding-3-large",
    chunk_size=800,
    overlap_prev=200,
    overlap_next=200,
):
    """Split ``text`` into overlapping, token-based chunks.

    The text is tokenized with the tiktoken encoding registered for
    ``model_name``. The token sequence is walked in steps of ``chunk_size``;
    every chunk consists of its ``chunk_size`` core tokens plus up to
    ``overlap_prev`` tokens before and up to ``overlap_next`` tokens after
    them, clipped at the start and end of the text.

    Args:
        text: The text to split.
        model_name: Model whose tokenizer is used for counting tokens.
        chunk_size: Number of core tokens per chunk (also the step width).
        overlap_prev: Tokens of leading context added to each chunk.
        overlap_next: Tokens of trailing context added to each chunk.

    Returns:
        A list of decoded chunk strings; empty when ``text`` has no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive or an overlap is negative.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if overlap_prev < 0 or overlap_next < 0:
        raise ValueError("overlap_prev and overlap_next must not be negative")

    enc = tiktoken.encoding_for_model(model_name)
    tokens = enc.encode(text)
    token_count = len(tokens)

    chunks = []
    for core_start in range(0, token_count, chunk_size):
        core_end = min(core_start + chunk_size, token_count)

        # Leading context, core and trailing context are adjacent, so the
        # whole chunk is one contiguous slice of the token list.
        window_start = max(core_start - overlap_prev, 0)
        window_end = min(core_end + overlap_next, token_count)

        chunks.append(enc.decode(tokens[window_start:window_end]))

    return chunks


# Chunk the full article text with the default chunking settings.
chunks = chunk_text(text=text)

In [11]:
# Tokenize the whole text once to see how many tokens it contains.
enc = tiktoken.encoding_for_model(model_name="text-embedding-3-large")
tokens = enc.encode(text=text)
token_count = len(tokens)

In [12]:
token_count

98703

## Create embeddings

In [37]:
def embed_chunks(chunks, model="text-embedding-3-large", batch_size=2048):
    """Embed text chunks with the OpenAI embeddings endpoint.

    The chunks are sent in batches of at most ``batch_size`` inputs per
    request, because the endpoint limits how many inputs a single request may
    carry. With the default, inputs of up to 2048 chunks still go out as one
    request.

    Args:
        chunks: A list of texts, or a single text string.
        model: Name of the embedding model.
        batch_size: Maximum number of inputs per API request. Lower it when
            the chunks are long enough to hit the per-request token limit.

    Returns:
        A ``float32`` array with one row per chunk, in input order.

    Raises:
        ValueError: If ``chunks`` is empty or ``batch_size`` is not positive.
    """
    # A bare string is one input; slicing it below would split it into characters.
    if isinstance(chunks, str):
        chunks = [chunks]
    else:
        chunks = list(chunks)

    if not chunks:
        raise ValueError("chunks must contain at least one text to embed")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    vectors = []
    for offset in range(0, len(chunks), batch_size):
        response = openai.embeddings.create(
            input=chunks[offset:offset + batch_size],
            model=model,
        )
        vectors.extend(record.embedding for record in response.data)

    return np.array(vectors, dtype=np.float32)

In [38]:
# Embed all chunks with the default embedding model.
embeddings = embed_chunks(chunks=chunks)

In [40]:
# Persist the embedding matrix so later runs can skip the API call.
np.save(file="embeddings.npy", arr=embeddings)


In [13]:
# Reload the embedding matrix written by the save cell.
embeddings_loaded = np.load(file="embeddings.npy")

In [14]:
len(embeddings_loaded)

124

## InMemoryIndexDB

In [59]:
from abc import ABC, abstractmethod
from dataclasses import dataclass

@dataclass(frozen=True)
class IndexDBDocument:
    """Immutable record of one stored text together with its embedding.

    The embedding is left out of the generated ``__eq__`` and ``__hash__``:
    comparing NumPy arrays element-wise makes ``==`` raise for multi-element
    arrays, and arrays are not hashable. Two documents are therefore equal
    when ``id``, ``reference`` and ``text`` match.
    """

    # Integer identifier of the document.
    id: int
    # Origin of the text (for example a URL).
    reference: str
    # The stored text itself.
    text: str
    # Embedding vector of ``text``; excluded from equality and hashing.
    embedding: np.ndarray = field(compare=False)


@dataclass(frozen=True)
class IndexDBDocumentResult(IndexDBDocument):
    """A stored document returned as a search hit, plus its distance."""

    # Distance between the query and this document as reported by the search.
    distance: float


class InMemoryIndexDBInterface(ABC):
    """Abstract interface of an in-memory store for embedded texts."""

    @abstractmethod
    def add(self, reference: str, text: str, embedding: np.ndarray) -> None:
        """Store one text and its embedding under ``reference``."""

    @abstractmethod
    def add_batch(self, reference: str, texts: list[str], embeddings: np.ndarray) -> None:
        """Store several texts that share one ``reference``.

        ``embeddings`` holds one embedding per entry of ``texts``, in the
        same order.
        """

    @abstractmethod
    def search(self, query: str, k: int = 5) -> list["IndexDBDocumentResult"]:
        """Return up to ``k`` stored documents matching the text ``query``."""

In [None]:
class HNSWInMemoryIndexDB(InMemoryIndexDBInterface):
    """In-memory document store searched through an hnswlib index.

    Documents are kept in a dict keyed by a running integer id. The hnswlib
    index is built lazily on the first search and reused until a document is
    added, at which point it is discarded and rebuilt on the next search.
    Queries are embedded through the OpenAI embeddings endpoint.
    """

    def __init__(
        self,
        space: str = "cosine",
        ef_construction: int = 200,
        m: int = 16,
        embedding_model: str = "text-embedding-3-large",
    ) -> None:
        """Create an empty store.

        Args:
            space: Distance space passed to ``hnswlib.Index``.
            ef_construction: ``ef_construction`` passed to ``init_index``.
            m: ``M`` passed to ``init_index``.
            embedding_model: Model used to embed search queries; it must match
                the model that produced the stored embeddings.
        """
        self._space = space
        self._ef_construction = ef_construction
        self._m = m
        self._embedding_model = embedding_model

        self._documents: dict[int, IndexDBDocument] = {}
        # Cached hnswlib index; None means "rebuild before the next search".
        self._index = None

    def add(self, reference: str, text: str, embedding: np.ndarray) -> None:
        """Store one text with its embedding under the next free id."""
        doc_id = len(self._documents)
        self._documents[doc_id] = IndexDBDocument(
            id=doc_id,
            reference=reference,
            text=text,
            embedding=embedding
        )
        self._index = None

    def add_batch(self, reference: str, texts: list[str], embeddings: np.ndarray) -> None:
        """Store several texts sharing one reference.

        Args:
            reference: Origin shared by all texts.
            texts: The texts to store.
            embeddings: One embedding per text, in the same order.

        Raises:
            ValueError: If ``texts`` and ``embeddings`` differ in length.
        """
        if len(texts) != len(embeddings):
            raise ValueError("texts and embeddings must have the same length")

        start_id = len(self._documents)
        for offset, (text, embedding) in enumerate(zip(texts, embeddings)):
            # The dict key must equal the document id; keying by the bare
            # offset would overwrite documents that are already stored.
            doc_id = start_id + offset
            self._documents[doc_id] = IndexDBDocument(
                id=doc_id,
                reference=reference,
                text=text,
                embedding=embedding
            )
        self._index = None

    def search(self, query: str, k: int = 5) -> list[IndexDBDocumentResult]:
        """Return the stored documents nearest to ``query``.

        Args:
            query: Text to search for; it is embedded before the lookup.
            k: Maximum number of results.

        Returns:
            At most ``k`` results in the order reported by the index; an empty
            list when the store is empty or ``k`` is smaller than 1.
        """
        if k < 1 or not self._documents:
            return []

        query_embedding = self._embed_query(query)

        if self._index is None:
            ids, embeddings = zip(*[(doc.id, doc.embedding) for doc in self._documents.values()])
            self._index = self._create_index(ids, embeddings)

        # Never ask for more neighbours than there are stored documents.
        neighbour_count = min(k, len(self._documents))
        labels, distances = self._index.knn_query(query_embedding, k=neighbour_count)

        # One query was submitted, so only the first row of each result is used.
        return [
            IndexDBDocumentResult(
                distance=float(distance),
                **asdict(self._documents[int(label)])
            )
            for (label, distance) in zip(labels[0], distances[0])
        ]

    def _create_index(self, ids: list[int], embeddings: list[np.ndarray]) -> hnswlib.Index:
        """Build an hnswlib index holding ``embeddings`` labelled with ``ids``."""
        vectors = np.asarray(embeddings, dtype=np.float32)

        index = hnswlib.Index(space=self._space, dim=vectors.shape[1])
        index.init_index(max_elements=len(ids), ef_construction=self._ef_construction, M=self._m)
        index.add_items(vectors, list(ids))

        return index

    def _embed_query(self, query: str) -> np.ndarray:
        """Embed ``query`` and return it as a ``float32`` array with one row per response record."""
        response = openai.embeddings.create(
            input=query,
            model=self._embedding_model
        )

        return np.array([record.embedding for record in response.data], dtype=np.float32)

In [88]:
# Load every chunk with its precomputed embedding into a fresh store.
db = HNSWInMemoryIndexDB()
db.add_batch(
    texts=chunks,
    embeddings=embeddings_loaded,
    reference="https://de.wikipedia.org/wiki/Adolf_Hitler",
)

# Ask for the three stored chunks closest to the question.
query = "How many books did hitler owned?"
results = db.search(k=3, query=query)

results

(3072,)


[IndexDBDocumentResult(id=76, reference='https://de.wikipedia.org/wiki/Adolf_Hitler', text=', 1936\nVom 1. Mai 1920 bis zum 5. Oktober 1929 wohnte Hitler in München in der Thierschstraße 41 im Stadtteil Lehel. 1929 zog er in eine 9-Zimmer-Wohnung im Stadtteil Bogenhausen, Prinzregentenplatz 16, ein. Ab 1934 nutzte er die Wohnung kaum mehr, obwohl sie seine Meldeadresse blieb. Im Sommer 1933 kaufte er das Haus Wachenfeld auf dem Obersalzberg bei Berchtesgaden und ließ es bis Mitte 1936 zum Berghof umbauen.[475]\nZwischen 1926 und 1931 korrespondierte er vertraulich mit Maria Reiter, einer Urlaubsbekanntschaft, lehnte aber ihren Ehewunsch ab. 1928 mietete er auf dem Obersalzberg ein Landhaus, in das seine Halbschwester Angela Raubal und deren beide Töchter einzogen. 1929 ließ er seine Halbnichte Geli Raubal in seine Münchner Wohnung einziehen und zwang sie, eine Liebesbeziehung zu seinem Chauffeur Emil Maurice zu beenden. Am 19. September 1931 wurde sie mit seinem Revolver erschossen auf

In [82]:
# Ids of the returned documents, in result order.
[hit.id for hit in results]

[76, 77, 82]

## Search HNSW

In [43]:
import hnswlib

def build_hnsw_index(embeddings, space='cosine'):
    """Build an hnswlib index over ``embeddings``.

    ``embeddings`` is indexed as ``shape[0]`` vectors of dimension
    ``shape[1]``; vector ``i`` is stored under label ``i``.

    Args:
        embeddings: Two-dimensional array of vectors to index.
        space: Distance space passed to ``hnswlib.Index``.

    Returns:
        The populated index with its query-time ``ef`` set to 50.
    """
    vector_dim = embeddings.shape[1]
    vector_count = embeddings.shape[0]

    hnsw = hnswlib.Index(space=space, dim=vector_dim)
    hnsw.init_index(max_elements=vector_count, ef_construction=200, M=16)

    # Labels are simply the row numbers 0 .. N-1.
    row_labels = [*range(vector_count)]
    hnsw.add_items(embeddings, ids=row_labels)

    # Query-time accuracy/speed trade-off.
    hnsw.set_ef(50)

    return hnsw


In [None]:
def search_embedding(query_text, index, model="text-embedding-3-large", top_k=5):
    """Embed ``query_text`` and look up its nearest neighbours in ``index``.

    Args:
        query_text: The text to search for.
        index: Index object providing ``knn_query``.
        model: Embedding model used for the query.
        top_k: Number of neighbours to request.

    Returns:
        A ``(labels, distances)`` pair for the single query.
    """
    # Turn the query into a float32 vector.
    api_response = openai.embeddings.create(input=query_text, model=model)
    query_vector = np.array(api_response.data[0].embedding, dtype=np.float32)

    # knn_query returns one row per query; only one query was submitted.
    neighbours = index.knn_query(query_vector, k=top_k)
    found_labels, found_distances = neighbours

    return found_labels[0], found_distances[0]


In [45]:
# Step 2: build the index
index = build_hnsw_index(embeddings=embeddings_loaded)

# Step 3: run the search
query = "How many books did hitler owned?"
ids, scores = search_embedding(query_text=query, index=index)

print(f"Top Matches (IDs): {ids}")
print(f"Cosine-Distanzen: {scores}")


Top Matches (IDs): [76 80 77 79 82]
Cosine-Distanzen: [0.4374538  0.46544337 0.4662735  0.48625815 0.48945975]


In [51]:
output_file = "results.txt"

# Write each matched chunk under a heading, followed by an 80-character rule
# that separates it from the next chunk.
with open(output_file, mode="w", encoding="utf-8") as f:
    f.writelines(
        f"Chunk {i}:\n{chunks[i]}\n{'=' * 80}\n\n"
        for i in ids
    )
