In [33]:
import os
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import RecursiveCharacterTextSplitter
from yandex_chain import YandexEmbeddings
from langchain_community.vectorstores import FAISS

In [14]:
# Outside of production the settings are read from the app's local .env file
# before the variables below are looked up.
if os.environ.get("environment") != "production":
    from dotenv import load_dotenv

    load_dotenv("../app/.env")

# Each setting is None when the corresponding environment variable is unset.
AUTH_DATA, API_KEY, FOLDER_ID = (
    os.environ.get(var_name) for var_name in ("AUTH_DATA", "API_KEY", "FOLDER_ID")
)

In [37]:
# Recursively collect every .txt file under the unstructured data directory,
# reading each one with TextLoader as UTF-8 and showing a progress bar.
loader = DirectoryLoader(
    path="../data/txt_unstructured",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs=dict(encoding="utf-8"),
    show_progress=True,
)

data = loader.load()

  0%|          | 0/485 [00:00<?, ?it/s]

100%|██████████| 485/485 [00:00<00:00, 3884.04it/s]


In [28]:
# Baseline character-based splitter.
# TODO: switch to semantic chunking.
# TODO: tune chunk_size.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=10)

# Break the loaded documents into chunks with the splitter configured above.
chunks = text_splitter.split_documents(data)

In [38]:
# Split the loaded documents with a SemanticChunker backed by Yandex embeddings.
# The splitter created here is the one that must produce `chunks`; previously the
# character-based `text_splitter` was called instead, leaving this splitter unused.
semantic_text_splitter = SemanticChunker(YandexEmbeddings(folder_id=FOLDER_ID, api_key=API_KEY))
chunks = semantic_text_splitter.split_documents(data)

In [39]:
# Name of the FAISS index; used as the last path component when the index is saved.
faiss_vdb: str = "faiss_unstructured_v0.0.2"

In [40]:
# Embed the chunks with Yandex embeddings and build a FAISS store from them.
db = FAISS.from_documents(
    documents=chunks,
    embedding=YandexEmbeddings(folder_id=FOLDER_ID, api_key=API_KEY),
)

# Save the store under ../data/unstructured_vdb/<faiss_vdb>.
db.save_local(folder_path=os.path.join("..", "data", "unstructured_vdb", faiss_vdb))