In [13]:
import toml
import os
from openai import OpenAI
import asyncio
from docx import Document
import docx2txt2 as docx2txt

from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# BASE_DIR is "<parent of the current working directory>/RAG"; it matches the
# working directory only when the notebook is run from a folder named "RAG".
_working_dir = os.path.abspath('')
BASE_DIR = os.path.join(os.path.dirname(_working_dir), "RAG")

# Source PDFs are read from RAG/docs; the secrets file sits in ../config.
folder_path = os.path.join(BASE_DIR, "docs")
secrets_path = os.path.join(BASE_DIR, "..", "config", "secrets.toml")

# The OpenAI key comes from the TOML secrets file rather than being hard-coded.
_secrets = toml.load(secrets_path)
API_KEY = _secrets["OPENAI_API_KEY"]

# Raw OpenAI client and the LangChain embedding wrapper, both using API_KEY.
client = OpenAI(api_key=API_KEY)
embedding_model = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(model=embedding_model, api_key=API_KEY)

# Module-level accumulator that load_pdfs() fills with the loaded pages.
pages = []

In [20]:

def load_pdfs():
    """Load every ``.pdf`` file in ``folder_path`` into the global ``pages`` list.

    Each file name is printed before it is loaded, and the total number of
    collected pages is printed at the end. Files whose name does not end in
    ``.pdf`` are ignored.

    The list is emptied first, so running this cell more than once leaves
    exactly one copy of every page instead of appending duplicates.

    Returns:
        None. Results are stored in ``pages``; nothing is returned so that the
        bare ``load_pdfs()`` call below does not echo the whole list as output.
    """
    # Reset in place (rather than rebinding) so the global name stays the same
    # object and no `global` declaration is needed.
    pages.clear()

    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".pdf"):
            continue

        print("Name of pdf: ", file_name)
        loader = PyPDFLoader(os.path.join(folder_path, file_name))
        pages.extend(loader.lazy_load())

    print("No. of source pages: ", len(pages))


load_pdfs()


Name of pdf:  What-powers-where-achieving-the-devolution-revolution.pdf
Name of pdf:  Making-the-grade.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 43 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)
Ignoring wrong pointing object 52 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)


Name of pdf:  The-power-of-prevention-3.pdf
Name of pdf:  Mission-control_Embargoed1.pdf
Name of pdf:  Devolve-by-default-Decentralisation-and-a-redefined-Whitehall.pdf
Name of pdf:  Close-enough-to-care.pdf
Name of pdf:  Getting-the-machine-learning.pdf
No. of source pages:  960


In [27]:
# Peek at the first loaded page to see which metadata keys the loader attached.
page = pages[0]
page.metadata

{'source': '/Users/suryaganesan/Documents/GitHub/polly-server/pollyServer/app/RAG/docs/What-powers-where-achieving-the-devolution-revolution.pdf',
 'page': 0}

In [28]:
# Create source for text
# Every page's text is prefixed with "Source name: <pdf file name>" so the
# originating document travels with the text itself.
source_docs = []

for page in pages:
    book_name = os.path.basename(page.metadata["source"])
    source_header = "Source name: " + book_name + "\n\n"

    # The page objects are shared with `pages` and edited in place, so only
    # add the header when it is missing; otherwise re-running this cell would
    # stack one more header onto every page.
    if not page.page_content.startswith(source_header):
        page.page_content = source_header + page.page_content

    source_docs.append(page)


In [69]:
# Create source for images
import fitz
import base64
from langchain.docstore.document import Document

# Collect the images embedded in every PDF found in `folder_path`.
# One entry is stored per page that yields at least one image:
#   {"source": <pdf file name>, "page": <0-based page index>,
#    "image_list": [<base64 text of each image's bytes>, ...]}
pdf_imgs = []

for file_name in os.listdir(folder_path):
    if not file_name.endswith('.pdf'):
        continue

    pdf_file_path = os.path.join(folder_path, file_name)

    # Open each PDF in a `with` block so it is closed before the next one is
    # opened, instead of leaving one open document per file.
    with fitz.open(pdf_file_path) as pdf_file:
        for page_index in range(pdf_file.page_count):
            page_image_list = pdf_file.load_page(page_index).get_images(full=True)

            bs64_img_list = []
            for img in page_image_list:
                # img[0] is passed to extract_image as the image's xref.
                base_image = pdf_file.extract_image(img[0])
                if not base_image:
                    # Nothing could be extracted for this xref; skip it rather
                    # than fail on the missing "image" key.
                    continue

                bs64_img_list.append(
                    base64.b64encode(base_image["image"]).decode('utf-8')
                )

            if bs64_img_list:
                pdf_imgs.append({
                    "source": file_name,
                    "page": page_index,
                    "image_list": bs64_img_list,
                })


In [71]:
# Inspect the most recently appended entry (keys: source, page, image_list).
# NOTE(review): this displays every base64 string of that entry in full, which
# makes the saved cell output very large.
pdf_imgs[-1]
# Unused draft kept for reference.
# NOTE(review): the keyword is presumably `metadata`, not `meta_data` - confirm before enabling.
# doc = Document(page_content="", meta_data={"source": "pdf1"})

{'source': 'Getting-the-machine-learning.pdf',
 'page': 63,
 'image_list': ['iVBORw0KGgoAAAANSUhEUgAAAYMAAAAhCAYAAAA7x8dDAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAAlUlEQVR4nO3OIQEAMAwEsfdvepNxoAHh2bYHwHl5AIBeHgCglwcA6OUBAHp5AIBeHgCglwcA6OUBAHp5AIBeHgCglwcA6OUBAHp5AIBeHgCglwcA6OUBAHp5AIBeHgCglwcA6OUBAHp5AIBeHgCglwcA6OUBAHp5AIBeHgCglwcA6OUBAHp5AIBeHgCglwcA6OUBAGIfGXSz/SmKwaIAAAAASUVORK5CYII=',
  'iVBORw0KGgoAAAANSUhEUgAAAVkAAAA3CAYAAACmezlFAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAA6klEQVR4nO3OsQnAQBAEseu/abuKZ2FQoFx3dx8Az8wDAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDWD0512Sk2emksAAAAAElFTkSuQmCC',
  'iVBORw0KGgoAAAANSUhEUgAAAVkAAAApCAYAAACfp1quAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAAtUlEQVR4nO3OsQnAQBAEseu/abuKZ2FQoFx3dx8Az8wDAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPABQNg8AlM0DAGXzAEDZPAB

In [None]:

# Create a Vectorstore
# Embeds `source_docs` with `embeddings` and stores them in the "polly-rag"
# collection, persisted under <BASE_DIR>/chroma_db.
# NOTE(review): re-running this cell against an existing persist directory
# presumably adds the same documents again - confirm before re-executing.
chroma_persist_dir = os.path.join(BASE_DIR, "chroma_db")

chroma_db = Chroma.from_documents(
    collection_name="polly-rag",
    persist_directory=chroma_persist_dir,
    embedding=embeddings,
    documents=source_docs,
)
