In [2]:
import os
import getpass

# Prompt for the key only when it is not already present in the environment,
# so re-running this cell (or a Restart & Run All with the variable exported
# beforehand) does not ask again. To replace a key, delete the variable first.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key:")

In [4]:
import os
from pathlib import Path
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from tqdm import tqdm

# --- Settings ---
# Name of the Qdrant collection that the embedded chunks are uploaded to.
COLLECTION_NAME = "historical_docs"
# Directory that is searched recursively for .txt files; each document's
# "source" metadata is stored as a path relative to this directory.
ROOT_DIR = "./data"  # Modify this as needed

# --- Step 1: Recursively find all .txt files ---
def find_txt_files(root_dir: str) -> list[Path]:
    """Recursively collect every ``.txt`` file below ``root_dir``.

    Args:
        root_dir: Directory to search; all nested sub-directories are walked.

    Returns:
        Paths of the regular files whose name matches ``*.txt``, sorted so the
        order is the same on every run. Directories that happen to be named
        ``*.txt`` are skipped. The list is empty when nothing matches.
    """
    # rglob yields entries in filesystem order and also matches directories,
    # so keep only real files and sort for a reproducible result.
    return sorted(p for p in Path(root_dir).rglob("*.txt") if p.is_file())

# --- Step 2: Load files and attach metadata ---
def load_documents(paths: list[Path], root_dir: str) -> list:
    """Read each file and tag its documents with a root-relative source path.

    Args:
        paths: Files to read; each one is opened with UTF-8 encoding.
        root_dir: Base directory used to compute the relative ``source``
            value stored in every document's metadata.

    Returns:
        A single flat list with the documents loaded from all files, in the
        order of ``paths``.
    """
    collected = []
    for file_path in tqdm(paths, desc="Loading files"):
        file_docs = TextLoader(str(file_path), encoding="utf-8").load()
        for document in file_docs:
            # Store the path relative to root_dir under the "source" key.
            document.metadata["source"] = str(file_path.relative_to(root_dir))
        collected += file_docs
    return collected

# --- Step 3: Chunk into pieces of up to 500 characters with 50 characters of overlap ---
def chunk_documents(docs: list, chunk_size: int = 500, chunk_overlap: int = 50) -> list:
    """Split documents into overlapping chunks suitable for embedding.

    Args:
        docs: Documents to split.
        chunk_size: Maximum length of one chunk. The splitter is created
            without a custom length function, so with its default (``len``)
            this is counted in characters, not tokens.
        chunk_overlap: Amount of text shared between consecutive chunks,
            measured in the same unit as ``chunk_size``.

    Returns:
        The list of chunk documents produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

# --- Step 4: Embed and upload to Qdrant ---
def embed_and_store(
    chunks: list,
    collection_name: str,
    host: str = "localhost",
    port: int = 6333,
):
    """Embed the chunks with OpenAI embeddings and upload them to Qdrant.

    Args:
        chunks: Document chunks to embed and store; must not be empty.
        collection_name: Name of the target Qdrant collection.
        host: Hostname of the Qdrant server.
        port: Port of the Qdrant server.

    Returns:
        The vector store object returned by ``Qdrant.from_documents``, so that
        later cells can reuse it instead of reconnecting.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail early with a clear message instead of an obscure error from inside
    # the embedding / upload call when there is nothing to store.
    if not chunks:
        raise ValueError(
            f"No chunks to store in collection '{collection_name}'; "
            "check that the input directory contains .txt files."
        )

    # NOTE(review): presumably picks up the key from OPENAI_API_KEY - confirm.
    embeddings = OpenAIEmbeddings()

    # from_documents receives host/port directly, so no separate QdrantClient
    # instance is created here.
    vectorstore = Qdrant.from_documents(
        documents=chunks,
        embedding=embeddings,
        host=host,
        port=port,
        collection_name=collection_name,
    )

    print(f"✅ Stored {len(chunks)} chunks in collection '{collection_name}'")
    return vectorstore


# Run the ingestion pipeline end to end: discover -> load -> chunk -> embed/upload.
txt_paths = find_txt_files(ROOT_DIR)
raw_docs = load_documents(txt_paths, ROOT_DIR)
chunks = chunk_documents(raw_docs)
# Uploads to the Qdrant server that embed_and_store targets (localhost:6333).
embed_and_store(chunks, COLLECTION_NAME)


Loading files: 100%|██████████| 66/66 [00:00<00:00, 7332.89it/s]


✅ Stored 428 chunks in collection 'historical_docs'
