In [1]:
import os
from typing import List
from urllib.parse import quote

from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.embeddings import Embeddings
from langchain_postgres import PGVectorStore, PGEngine
from langchain_text_splitters import RecursiveCharacterTextSplitter
from rich import print

from models.embedding_model import embedding_engine

load_dotenv()

# Database connection settings, looked up in the process environment.
# Each name is None when the corresponding variable is not set.
USER, PASSWORD, HOST, PORT, DBNAME = (
    os.getenv(setting)
    for setting in ("db_user", "db_password", "db_host", "db_port", "db_name")
)


In [2]:
folder_path = "data"


def find_pdf_files(folder: str) -> List[str]:
    """Return the paths of all PDF files found under ``folder``, recursively.

    The extension check is case-insensitive (``report.PDF`` is included) and
    the result is sorted so repeated runs process files in the same order.

    Args:
        folder: Directory to search. A missing directory yields an empty list.

    Returns:
        Sorted list of file paths, each prefixed with ``folder``.
    """
    matches = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(folder)
        for name in names
        if name.lower().endswith(".pdf")
    ]
    return sorted(matches)


pdf_files = find_pdf_files(folder_path)

all_pages = []  # Documents from every PDF, in sorted file order

for file_path in pdf_files:
    loader = PyMuPDFLoader(file_path=file_path)
    # Collect everything the loader yields for this file
    # (presumably one document per page -- confirm against loader settings).
    all_pages.extend(loader.lazy_load())

# Output all collected pages
print(f"Total pages loaded: {len(all_pages)}")

In [3]:
# Splitting document
# Candidate separators handed to the splitter, in the order listed.
split_separators = [
    "\n\n",
    "\n",
    " ",
    ".",
    ",",
    "\u200b",  # Zero-width space
    "\uff0c",  # Fullwidth comma
    "\u3001",  # Ideographic comma
    "\uff0e",  # Fullwidth full stop
    "\u3002",  # Ideographic full stop
    "",
]

# Chunks of up to 1000 characters with a 300-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    separators=split_separators,
    chunk_size=1000,
    chunk_overlap=300,
)

In [4]:
# Break every loaded page into chunks and report how many were produced.
splits = text_splitter.split_documents(all_pages)
chunk_total = len(splits)
print(f"Total chunks created: {chunk_total}")

In [5]:
# Fail fast with a readable message when a setting is unset; otherwise the
# literal text "None" would be interpolated into the connection URL.
_db_settings = {
    "db_user": USER,
    "db_password": PASSWORD,
    "db_host": HOST,
    "db_port": PORT,
    "db_name": DBNAME,
}
_missing_settings = [key for key, value in _db_settings.items() if value is None]
if _missing_settings:
    raise ValueError(
        f"Missing database settings in environment: {', '.join(_missing_settings)}"
    )

# Percent-encode the credentials so characters such as "@", ":", "/" or "#"
# in the user name or password cannot break the URL structure.
# safe="" also encodes "/", and spaces become "%20" rather than "+".
CONNECTION_STRING = (
    f"postgresql+asyncpg://{quote(USER, safe='')}:{quote(PASSWORD, safe='')}"
    f"@{HOST}:{PORT}/{DBNAME}"
)
pg_engine = PGEngine.from_connection_string(url=CONNECTION_STRING)

In [6]:
# len(embedding_engine.embed_query("test"))  # Test embedding engine initialization

In [7]:
from sqlalchemy.exc import ProgrammingError

from langchain_postgres import Column

TABLE_NAME = "jp_ai_bot"
VECTOR_SIZE = 3072

try:
    await pg_engine.ainit_vectorstore_table(
        table_name=TABLE_NAME,
        vector_size=VECTOR_SIZE,
        # schema_name="japan",
        id_column=Column("langchain_id", "VARCHAR"),
        # metadata_columns=[
        #     Column("likes", "INTEGER"),
        #     Column("location", "TEXT"),
        #     Column("topic", "TEXT"),
        # ],
    )
except ProgrammingError:
    # Catching the exception here
    print("Table already exists. Skipping creation.")

In [8]:
async def create_vector_store(
    table_name: str, schema_name: str = "public"
) -> PGVectorStore:
    """Build a ``PGVectorStore`` bound to ``table_name`` in ``schema_name``.

    Uses the notebook-level ``pg_engine`` for the connection and
    ``embedding_engine`` as the embedding service, and prints a confirmation
    once the store has been created.

    Args:
        table_name: Name of the table the store reads from and writes to.
        schema_name: Schema containing the table; defaults to ``"public"``.

    Returns:
        The created ``PGVectorStore`` instance.
    """
    store = await PGVectorStore.create(
        table_name=table_name,
        schema_name=schema_name,
        engine=pg_engine,
        embedding_service=embedding_engine,
    )
    print("PGVector Store is loaded.")
    return store


vector_store = await create_vector_store(
    table_name="jp_ai_bot",
    # schema_name="japan",
)

In [9]:
# push embedding to collection
print("Adding documents to vector store...")
for i in range(0, len(splits), 3):
    chunk = splits[i : i + 3]
    try:
        # Add the chunk to the vector store
        await vector_store.aadd_documents(documents=chunk)
        print(f"Chunk {i // 3 + 1}/{(len(splits) + 3) // 3} added successfully")
    except Exception as e:
        print(f"Error adding chunk {i // 3}: {e}")
        continue

In [12]:
query = "am i eligible for Subsidy Program"
docs = await vector_store.asimilarity_search(query)

for doc in docs:
    print(repr(doc))

In [11]:
query_vector = embedding_engine.embed_query(query)
docs = await vector_store.asimilarity_search_by_vector(query_vector, k=2)
print(docs)