# Setup: Imports, Global Variables, and Utility Functions

In [1]:
import os
import glob
import psycopg2

from tqdm import tqdm
from pypdf import PdfReader
from dotenv import load_dotenv

from pgvector.psycopg2 import register_vector

load_dotenv()

True

In [2]:
from utils import db, api

# Open the PostgreSQL connection shared by every cell below. The connection
# string is read from the POSTGRES_CONFIG environment variable (populated by
# load_dotenv() above).
postgres_dsn = os.environ.get("POSTGRES_CONFIG")
conn = psycopg2.connect(postgres_dsn)

# Register pgvector's type on this connection before any cursor is used.
register_vector(conn)

cursor = conn.cursor()

# Resource Setup Functions

In [6]:
from llama_index import Document
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.node_parser import (
    HierarchicalNodeParser,
    SentenceWindowNodeParser,
    get_leaf_nodes,
)

from langchain.text_splitter import RecursiveCharacterTextSplitter


# Recursive character splitter whose chunk lengths are measured with a
# tiktoken encoder; used by setup_resource_nonllm.
char_tiktoken_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    add_start_index=True,
    chunk_overlap=100,
    chunk_size=512,
)

# Two-level hierarchy (512 -> 128); used by setup_resource_parent_child, which
# stores each leaf together with the text of its parent node.
hierarchical_parser = HierarchicalNodeParser.from_defaults(
    chunk_overlap=20,
    chunk_sizes=[512, 128],
)

# Sentence-level parser; the surrounding window is written to
# metadata["window"] and the sentence itself to metadata["original_text"].
sent_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=3,
)

def get_page_summary(page_content, lang="Indonesian"):
    """Request a short summary of one page via `api.get_completions_gpt4`.

    Args:
        page_content: Text of the page to summarise; interpolated into the
            prompt as-is.
        lang: Language named in the prompt. It also selects which label
            strings are removed from the reply.

    Returns:
        The content of the first choice in the reply, with the labels for
        `lang` removed. For any language other than "Indonesian" or
        "English" the reply is returned unchanged.
    """
    prompt = f"""Give me a comprehensive and complete summary of the provided page content below in {lang}. The summary must include all informations provided in the page content without any exceptions. The summary must be provided in {lang} and less than 100 words, and only provide your summary as response.
Page Content: {page_content}
Summary:"""
    response = api.get_completions_gpt4(
        [{"role": "user", "content": prompt}],
        temp=0,
    )
    summary = response.choices[0].message.content

    # Label strings to strip from the reply, in the order they are removed.
    if lang == "Indonesian":
        labels = ("Ringkasan Konten: ", "Ringkasan: ")
    elif lang == "English":
        labels = ("Summary: ",)
    else:
        labels = ()

    for label in labels:
        # str.replace drops every occurrence and is a no-op when absent.
        summary = summary.replace(label, "")

    return summary


def setup_resource_nonllm(resource_id, page_num, summary, page_content):
    """Split a page with `char_tiktoken_splitter` and store one row per chunk.

    Each chunk is embedded with `api.get_embeddings_ada` and inserted through
    `db.insert_fact_resource`; the page summary is stored as both the
    `context` and the `summary` of every row.

    Args:
        resource_id: Id of the resource the rows belong to.
        page_num: Zero-based page index; stored as `page_num + 1`.
        summary: Summary of the page.
        page_content: Full text of the page.
    """
    for piece in char_tiktoken_splitter.create_documents([page_content]):
        text = piece.page_content
        vector = api.get_embeddings_ada(text)
        row = {
            "context": summary,
            "fact": text,
            "resource_id": resource_id,
            "embeddings": vector,
            "summary": summary,
            "number": page_num + 1,
        }
        db.insert_fact_resource(conn, cursor, row)


def setup_resource_summary(resource_id, page_num, summary):
    """Store the page summary itself as a single row.

    The summary is embedded with `api.get_embeddings_ada` and written as the
    row's `fact` and `summary`; `context` is left empty.

    Args:
        resource_id: Id of the resource the row belongs to.
        page_num: Zero-based page index; stored as `page_num + 1`.
        summary: Summary of the page.
    """
    vector = api.get_embeddings_ada(summary)
    db.insert_fact_resource(
        conn,
        cursor,
        {
            "context": "",
            "fact": summary,
            "resource_id": resource_id,
            "embeddings": vector,
            "summary": summary,
            "number": page_num + 1,
        },
    )


def setup_resource_parent_child(resource_id, page_num, summary, page_content):
    """Store each leaf node of a page together with its parent's text.

    The page is parsed with `hierarchical_parser`. Every leaf node is
    embedded with `api.get_embeddings_ada` and inserted as a row whose `fact`
    is the leaf text and whose `context` is the text of the leaf's parent.

    Args:
        resource_id: Id of the resource the rows belong to.
        page_num: Zero-based page index; stored as `page_num + 1`.
        summary: Summary of the page.
        page_content: Full text of the page.
    """
    hierarchy = hierarchical_parser.get_nodes_from_documents(
        [Document(text=page_content)]
    )

    # Keep every node in an in-memory store so parents can be fetched by id.
    store = SimpleDocumentStore()
    store.add_documents(hierarchy)

    for leaf_node in get_leaf_nodes(hierarchy):
        parent_id = leaf_node.parent_node.node_id
        parent_text = store.get_document(parent_id).text
        vector = api.get_embeddings_ada(leaf_node.text)
        row = {
            "context": parent_text,
            "fact": leaf_node.text,
            "resource_id": resource_id,
            "embeddings": vector,
            "summary": summary,
            "number": page_num + 1,
        }
        db.insert_fact_resource(conn, cursor, row)


def setup_resource_sent_window(resource_id, page_num, summary, page_content):
    """Store each sentence of a page together with its surrounding window.

    The page is parsed with `sent_parser`. Every node's sentence is embedded
    with `api.get_embeddings_ada` and inserted through
    `db.insert_fact_resource`, with the sentence as `fact` and the window
    from `metadata["window"]` as `context`.

    Args:
        resource_id: Id of the resource the rows belong to.
        page_num: Zero-based page index; stored as `page_num + 1`.
        summary: Summary of the page.
        page_content: Full text of the page.
    """
    docs = [Document(text=page_content)]
    nodes = sent_parser.get_nodes_from_documents(docs)

    for node in nodes:
        sentence = node.text
        embeddings = api.get_embeddings_ada(sentence)
        data = {
            "context": node.metadata["window"],
            # Store the text that was actually embedded, as the other
            # setup_resource_* functions do; the window is already kept in
            # "context", so repeating it here lost the sentence itself.
            "fact": sentence,
            "resource_id": resource_id,
            "embeddings": embeddings,
            "summary": summary,
            "number": page_num + 1,
        }
        db.insert_fact_resource(conn, cursor, data)

# Run Resource Setup for NonLLM, Summary, ParentChild, and SentWindow

In [None]:
# Rebuild the four resource variants (NonLLM, Summary, ParentChild,
# SentWindow) for every PDF found directly inside the PDF_DIR directory.
for pdf in tqdm(glob.glob(f"{os.getenv('PDF_DIR')}/*.pdf")):

    # os.path.basename strips the directory using the running platform's
    # separators; splitting on "\\" only worked on Windows and left the whole
    # path in the resource name elsewhere.
    fname = os.path.basename(pdf).split(".pdf")[0]
    print(f"Currently setup for file: {fname}")

    nonllm_id = db.get_resource_id(cursor, f"{fname} - NonLLM")
    summary_id = db.get_resource_id(cursor, f"{fname} - Summary")
    parent_child_id = db.get_resource_id(cursor, f"{fname} - ParentChild")
    sent_window_id = db.get_resource_id(cursor, f"{fname} - SentWindow")

    # Clear rows from earlier runs so re-running the cell does not duplicate.
    for stale_id in (nonllm_id, summary_id, parent_child_id, sent_window_id):
        db.delete_facts_resource(conn, cursor, stale_id)

    reader = PdfReader(pdf)
    for page_num, page in enumerate(reader.pages):
        page_content = page.extract_text()
        if not page_content or not page_content.strip():
            # Nothing to chunk or embed (e.g. an image-only page), and asking
            # the model to summarise an empty page would store invented text.
            continue

        # One summary per page, shared by all four variants.
        summary = get_page_summary(page_content)

        setup_resource_nonllm(nonllm_id, page_num, summary, page_content)
        setup_resource_summary(summary_id, page_num, summary)
        setup_resource_parent_child(parent_child_id, page_num, summary, page_content)
        setup_resource_sent_window(sent_window_id, page_num, summary, page_content)

In [10]:
# Release the database handles opened at the top of the notebook: the cursor
# first, then the connection it was created from.
for db_handle in (cursor, conn):
    db_handle.close()