In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell executes, so library changes are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
import lancedb

# Location of the LanceDB database used by the exploratory cells below.
uri = "~/.llamabot/lancedb"
# Connection handle; later cells create and query tables through it.
db = lancedb.connect(uri)

In [None]:
from litellm import embedding
from dotenv import load_dotenv

# Load environment variables from a .env file -- presumably the API key needed
# by the commented-out litellm `embedding` call below.
load_dotenv()

# Sample records to insert; the "document" key matches the `document` field of
# the DocstoreEntry schema defined in a later cell.
text_to_embed = [{"document": "Hello world!"}, {"document": "Hello again!"}]

# response = embedding(
#     model="text-embedding-3-small", input=text_to_embed
# )  # , api_base="http://{os.getenv('OLLAMA_SERVER')}")

In [None]:
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Look up the "sentence-transformers" embedding function in LanceDB's registry
# and instantiate it without any extra configuration.
registry = get_registry()
func = registry.get(name="sentence-transformers").create()


class DocstoreEntry(LanceModel):
    """Table schema pairing a document's text with its embedding vector."""

    # Raw text; declared as the embedding function's source field.
    document: str = func.SourceField()
    # Vector column sized to the embedding function's reported dimensionality.
    vector: Vector(func.ndims()) = func.VectorField()

In [None]:
# Add this information to the lancedb database
import pandas as pd

# Start from a fresh, empty table on every run. "overwrite" mode replaces any
# table left over from a previous execution, which keeps this cell re-runnable
# without a broad `except Exception` that would also hide unrelated failures
# (bad schema, I/O errors) behind a drop-and-retry.
table = db.create_table("dummy_table", schema=DocstoreEntry, mode="overwrite")

In [None]:
# Insert the sample records. Only "document" is supplied; the vector column is
# expected to be populated by the schema's embedding function.
table.add(text_to_embed)

In [None]:
# Fetch every row (no result limit) and parse each one into a DocstoreEntry.
table.search().limit(None).to_pydantic(DocstoreEntry)

In [None]:
# Query with the text of the first sample record and keep only the top hit.
table.search(text_to_embed[0]["document"]).limit(1).to_pydantic(DocstoreEntry)[0]

In [None]:
# %load_ext autoreload
# %autoreload 2
# from llamabot.components.docstore import DocumentStore


# ds = DocumentStore(collection_name="stuff")
# ds.reset()
# ds.append("Hello!")

In [None]:
from pathlib import Path
from typing import Optional
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from llamabot.doc_processor import magic_load_doc, split_document
from tqdm.auto import tqdm

registry = get_registry()
func = registry.get(name="sentence-transformers").create()


class LanceDBDocStore:
    """Document store backed by a single LanceDB table.

    Documents are stored as rows with a ``document`` text field; the table
    schema (``DocstoreEntry`` by default) determines how rows are parsed back
    when they are read.

    :param table_name: Name of the LanceDB table to open or create.
    :param storage_path: Location of the LanceDB database.
    :param schema: LanceModel class describing the table rows. Falls back to
        ``DocstoreEntry`` when ``None`` is passed.
    """

    def __init__(
        self,
        table_name: str,
        storage_path: Path = Path.home() / ".llamabot" / "lancedb",
        schema: Optional[LanceModel] = DocstoreEntry,
    ):
        self.table_name = table_name
        # Remember the schema so that reads and `reset` use the same model the
        # table was created with instead of a hard-coded DocstoreEntry.
        self.schema = schema if schema is not None else DocstoreEntry
        self.db = lancedb.connect(storage_path)

        try:
            self.table = self.db.open_table(table_name)
        except (FileNotFoundError, ValueError):
            # NOTE(review): some LanceDB releases reportedly signal a missing
            # table with ValueError rather than FileNotFoundError -- confirm
            # against the pinned version. Either way, create the table.
            self.table = self.db.create_table(table_name, schema=self.schema)

    def _existing_documents(self) -> set:
        """Return the set of document texts currently stored in the table."""
        rows = self.table.search().limit(None).to_pydantic(self.schema)
        return {row.document for row in rows}

    def __contains__(self, other: str) -> bool:
        """Returns boolean whether the 'other' document is in the store."""
        return other in self._existing_documents()

    def append(self, document: str, metadata: Optional[dict] = None):
        """Add a single document to the store.

        :param document: Text of the document to store.
        :param metadata: Accepted for interface compatibility; currently not
            written to the table.
        """
        self.table.add([{"document": document}])

    def extend(self, documents: list[str], metadata: Optional[dict] = None):
        """Add several documents to the store in one batch.

        :param documents: Texts to store. An empty collection is a no-op.
        :param metadata: Accepted for interface compatibility; currently not
            written to the table.
        """
        # The table expects records keyed by field name, not bare strings.
        records = [{"document": doc} for doc in documents]
        if records:
            self.table.add(records)

    def retrieve(self, query: str, n_results: int = 10):
        """Return the text of up to ``n_results`` documents matching ``query``.

        :param query: Query text passed to the table's search.
        :param n_results: Maximum number of results to return.
        :returns: List of document texts, in the order the search returns them.
        """
        results = self.table.search(query).limit(n_results).to_pydantic(self.schema)
        return [r.document for r in results]

    def reset(self):
        """Drop the table and recreate it empty with the store's schema."""
        self.db.drop_table(self.table_name)
        self.table = self.db.create_table(self.table_name, schema=self.schema)

    def add_documents(
        self,
        document_paths: Path | list[Path],
        chunk_size: int = 2_000,
        chunk_overlap: int = 500,
    ):
        """Add documents to the QueryBot DocumentStore.

        Each file is loaded and split into chunks; chunks whose text is already
        in the store are skipped.

        :param document_paths: A single path or a list of paths to load.
        :param chunk_size: Chunk size forwarded to ``split_document``.
        :param chunk_overlap: Chunk overlap forwarded to ``split_document``.
        """
        if isinstance(document_paths, Path):
            document_paths = [document_paths]

        # Read the stored texts once up front rather than once per chunk.
        existing = self._existing_documents()

        for document_path in tqdm(document_paths):
            document = magic_load_doc(document_path)
            splitted_document = split_document(
                document, chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )
            # Assumes split_document yields plain strings (hashable), as
            # `extend` expects -- TODO confirm.
            chunks_to_add = [doc for doc in splitted_document if doc not in existing]
            self.extend(chunks_to_add)
            # Keep the cache in sync so later files skip chunks added here.
            existing.update(chunks_to_add)

In [None]:
# Smoke-test the notebook-defined store: start from an empty table, then add
# documents one at a time and as a list.
# NOTE: this rebinds `db`, which earlier cells used for the raw LanceDB
# connection.
db = LanceDBDocStore(table_name="my_table")
db.reset()

db.append("hello world!")
db.extend(["Hello world!", "hello again!"])
# db.table.add([{"document": "Hello world!"}])

In [None]:
# Ask the store for the single top-ranked document for this query text.
db.retrieve("hello_world", n_results=1)

In [None]:
from llamabot.components.docstore import LanceDBDocStore


# Repeat the smoke test with the packaged implementation: the import above
# rebinds `LanceDBDocStore`, shadowing the class defined earlier in this
# notebook.
db = LanceDBDocStore(table_name="my_table")
db.reset()
db.append("hello world!")
db.extend(["hello world!", "hello again!"])

In [None]:
# Query with text that is not stored verbatim and request the top two matches.
db.retrieve("aloha", n_results=2)