In [8]:
import toml
import os
from openai import OpenAI
import asyncio
from docx import Document
import docx2txt2 as docx2txt

from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# NOTE: os.path.abspath('') resolves to the current working directory, so
# BASE_DIR is "<parent of the working directory>/RAG". It only points at the
# RAG folder when the kernel is started from RAG itself or from one of its
# sibling directories.
BASE_DIR = os.path.join(os.path.dirname(os.path.abspath('')), "RAG")

secrets_path = os.path.join(BASE_DIR, "..", "config", "secrets.toml")
folder_path = os.path.join(BASE_DIR, "docs")  # directory scanned for input PDFs

# Parse the secrets file once and read both keys from the same mapping
# (a missing key raises KeyError here, before any client is created).
_secrets = toml.load(secrets_path)
OPENAI_KEY = _secrets["OPENAI_API_KEY"]
GROQ_KEY = _secrets["GROQ_KEY"]

embedding_model = "text-embedding-3-large"
# Embedding function handed to the Chroma vector store at the end of the notebook.
embeddings = OpenAIEmbeddings(model=embedding_model, api_key=OPENAI_KEY)

# Chat-completions client used to generate the figure descriptions.
client = OpenAI(api_key=OPENAI_KEY)

# Filled by load_pdfs(): one document per PDF page.
pages = []

In [9]:

def load_pdfs():
    """Load every PDF found in ``folder_path`` into the module-level ``pages`` list.

    Each page yielded by ``PyPDFLoader.lazy_load()`` is stored as one entry.
    ``pages`` is emptied first, so calling this function again (for example by
    re-running the cell) reloads the folder instead of appending every page a
    second time.

    Files are selected by a case-insensitive ``.pdf`` extension; everything
    else in the folder is ignored. Returns ``None``; the result is the
    mutated ``pages`` list.
    """
    # Reset in place so other references to the same list stay valid.
    pages.clear()

    for file_name in os.listdir(folder_path):
        if not file_name.lower().endswith(".pdf"):
            continue

        print("Name of pdf: ", file_name)
        # folder_path is already "<BASE_DIR>/docs", so reuse it rather than
        # rebuilding the same path by hand.
        loader = PyPDFLoader(os.path.join(folder_path, file_name))
        pages.extend(loader.lazy_load())


# Populate `pages` from the PDFs on disk, then display how many pages were loaded.
load_pdfs()
len(pages)


Name of pdf:  What-powers-where-achieving-the-devolution-revolution.pdf
Name of pdf:  Making-the-grade.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 43 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)
Ignoring wrong pointing object 52 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)


Name of pdf:  The-power-of-prevention-3.pdf
Name of pdf:  Mission-control_Embargoed1.pdf
Name of pdf:  Devolve-by-default-Decentralisation-and-a-redefined-Whitehall.pdf
Name of pdf:  Close-enough-to-care.pdf
Name of pdf:  Getting-the-machine-learning.pdf


320

In [10]:
# Regex for checking "Figure %x: "
import re

# Captions are matched in two spellings: "Figure 12 : ..." (space before the
# colon) and "Figure 12: ...". Both patterns run to the end of the line.
spaced_caption = re.compile(r"Figure\s\d+\s:.*")
tight_caption = re.compile(r"Figure\s\d+:.*")

pages_with_figures = []
for page in pages:
    text = page.page_content

    # The spaced spelling takes priority; at most one caption is recorded per page.
    caption = spaced_caption.search(text) or tight_caption.search(text)
    if caption is None:
        continue

    pages_with_figures.append(
        {
            "source": page.metadata["source"],
            "page_no": page.metadata["page"],
            "figure_title": caption.group(),
        }
    )

len(pages_with_figures)

54

In [11]:
# Central list of documents to index. The per-page text documents are appended
# to it in the next cell; the figure descriptions are collected separately in
# `image_docs` and combined with this list into `new_docs` further down.
source_docs = []

In [12]:
# Create source for text
#
# Every page gets a "Source name: <file name>" header prepended to its text so
# the originating PDF is part of the indexed content, then the page is added
# to `source_docs`.

# Rebuild the list from scratch so re-running this cell cannot add every page twice.
source_docs.clear()

for page in pages:
    book_name = os.path.basename(page.metadata["source"])
    header = "Source name: " + book_name + "\n\n"

    # The page objects are edited in place, so leave pages that already carry
    # the header from an earlier run alone instead of stacking a second one.
    if not page.page_content.startswith(header):
        page.page_content = header + page.page_content

    source_docs.append(page)


In [13]:
# Create source for images

import base64
from langchain.docstore.document import Document
from pdf2image import convert_from_path
from io import BytesIO
from PIL import Image

# Render every page that contains a figure caption to a PNG and attach it,
# base64-encoded, to the page's entry under "image_b64". Entries that render
# successfully are collected in `new_pages_with_figures`.
new_pages_with_figures = []

for idx, page in enumerate(pages_with_figures):
    source = page["source"]
    page_no = page["page_no"]

    try:
        # Render only this page. pdf2image counts pages from 1, while the
        # stored page number is treated as 0-based here, hence the +1.
        img = convert_from_path(
            source,
            first_page=page_no + 1,
            last_page=page_no + 1,
        )[0]  # a single page was requested, so take the only image

        # Serialise the rendered page as PNG and base64-encode the bytes.
        with BytesIO() as png_buffer:
            img.save(png_buffer, format="PNG")
            base64_image = base64.b64encode(png_buffer.getvalue()).decode("utf-8")

        page["image_b64"] = base64_image
        new_pages_with_figures.append(page)

        print(f"Loaded page {idx+1}/{len(pages_with_figures)}")

    except Exception as e:
        # Best effort: a page that cannot be rendered is reported and left out
        # of `new_pages_with_figures` so the remaining pages still get processed.
        print(f"Error processing page {page_no} from {source}: {str(e)}")
        page["image_b64"] = None

# Preview one entry. The base64 payload is truncated so the cell output does
# not contain an entire PNG, and the index is clamped so a short list does not
# raise IndexError.
preview_index = min(20, len(new_pages_with_figures) - 1)
preview_entry = new_pages_with_figures[preview_index] if new_pages_with_figures else {}
{
    key: (value[:60] + "..." if key == "image_b64" else value)
    for key, value in preview_entry.items()
}

Loaded page 1/54
Loaded page 2/54
Loaded page 3/54
Loaded page 4/54
Loaded page 5/54
Loaded page 6/54
Loaded page 7/54
Loaded page 8/54
Loaded page 9/54
Loaded page 10/54
Loaded page 11/54
Loaded page 12/54
Loaded page 13/54
Loaded page 14/54
Loaded page 15/54
Loaded page 16/54
Loaded page 17/54
Loaded page 18/54
Loaded page 19/54
Loaded page 20/54
Loaded page 21/54
Loaded page 22/54
Loaded page 23/54
Loaded page 24/54
Loaded page 25/54
Loaded page 26/54
Loaded page 27/54
Loaded page 28/54
Loaded page 29/54
Loaded page 30/54
Loaded page 31/54
Loaded page 32/54
Loaded page 33/54
Loaded page 34/54
Loaded page 35/54
Loaded page 36/54
Loaded page 37/54
Loaded page 38/54
Loaded page 39/54
Loaded page 40/54
Loaded page 41/54
Loaded page 42/54
Loaded page 43/54
Loaded page 44/54
Loaded page 45/54
Loaded page 46/54
Loaded page 47/54
Loaded page 48/54
Loaded page 49/54
Loaded page 50/54
Loaded page 51/54
Loaded page 52/54
Loaded page 53/54
Loaded page 54/54


{'source': '/Users/suryaganesan/Documents/GitHub/polly-server/pollyServer/app/RAG/docs/Making-the-grade.pdf',
 'page_no': 53,
 'figure_title': 'Figure 17 : To what extent do you agree with the statement that â€œmanagers are ',
 'image_b64': 'iVBORw0KGgoAAAANSUhEUgAABnYAAAkjCAIAAACzq+aSAAEAAElEQVR4nOzdd3wUxf8/8Lm79EJ6Lj2QhCR0CL03BSkBFKQ3AUURpYgfu/6sYOWrIoKgoKJIFQFB6T1ASOgJBEJ67+WSXHJ3+/tjZFz3Su5u75Kgr+cfeWx2Z3dnZ/dmd987OyvhOI4AAAAAAAAAAACAuaTNnQEAAAAAAAAAAIAHG0JsAAAAAAAAAAAAoiDEBgAAAAAAAAAAIApCbAAAAAAAAAAAAKIgxAYAAAAAAAAAACAKQmwAAAAAAAAAAACiIMQGAAAAAAAAAAAgCkJsAAAAAAAAAAAAoiDEBgAAAAAAAAAAIApCbAAAAAAAAAAAAKIgxAYAAAAAAAAAACAKQmwAAAAAAAAAAACiIMQGAAAAAAAAAAAgCkJsAAAAAAAAAAAAoiDEBgAAAAAAAAAAIApCbAAAAAAAAAAAAKIgxAYAAAAAAAAAACAKQmwAAAAAAAAAAACiIMQGAAAAAAAAAAAgCkJsAAAAAAAAAAAAoiDEBgAAAAAAAAAAIApCbAAAAAAAAAAAAKIgxAYAAAAAAAAAACAKQmwAAAAAAAAAAACiIMQGAAAAAAAAAAAgCkJsAAAAAAAAAAAAoiDEBgAAAAAAAAAAIApCbAAAAAAAAAAAAKIgxAYAAAAAAAAAACAKQmwAAAAAAAAAAACiIMQGAAAAAAAAAAAgCkJsAAAAAAAAAAAAoiDEBgAAAAAAAAAAIApCbA

In [19]:
# Number of figure pages that were rendered successfully (pages that failed to render are not in this list).
len(new_pages_with_figures)


54

In [20]:
import os

# Ask the vision model for a textual description of every rendered figure page
# and wrap each description in a document so it can be indexed next to the
# page text.

# Standing instructions for the model; sent with the "system" role so they are
# treated as instructions rather than as part of the user's request.
FIGURE_SYSTEM_PROMPT = "You are a helpful AI assistant. Your goal is to describe all the facts and data in the given figure and briefly explain them. The user will give you an image, you need to describe the insights from the figure in the image clearly and in detail. Return all numbers and stats given in it in text. Imagine that people can't see the image, and will only know what you tell them about the contents of the image. So give a detailed description of the contents of the image."

image_docs = []
for idx, page in enumerate(new_pages_with_figures):
    print(f"Working on text annotation: {idx+1}/{len(new_pages_with_figures)}")

    figure_title = page["figure_title"]
    image_b64 = page["image_b64"]

    # Built outside the request literal: indexing with double quotes inside a
    # double-quoted f-string is a syntax error before Python 3.12.
    image_data_url = f"data:image/png;base64,{image_b64}"
    figure_request = f"""Describe all the stats, data, contents and information about the 'Figure' you see. You only need to describe the figure in the image, not the entire image provided. If the figure is data heavy, then prefer to make tables that describe that data that explains the figure. You don't have to use tables, if they aren't the best way to explain the picture. Here is the figure's title: {figure_title}"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": FIGURE_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": figure_request},
                    {"type": "image_url", "image_url": {"url": image_data_url}},
                ],
            },
        ],
        stream=False,
    )

    # The message content can be None; fall back to an empty description
    # instead of embedding the literal text "None".
    text_annotation = response.choices[0].message.content or ""
    source = os.path.basename(page["source"])
    page_no = page["page_no"] + 1  # shown to readers as a 1-based page number

    page_content = f"Source document: {source} \nPage no: {page_no} \n{figure_title}\n\nFigure Description: \n{text_annotation}"

    m_data = {
        "source": page["source"],
        "page_no": page_no,
        "figure_title": figure_title
    }

    # `Document` is the LangChain class imported in the figure-rendering cell
    # (it shadows docx.Document from the first cell). Its keyword is
    # `metadata`, so pass the mapping directly instead of patching it in
    # after construction.
    image_docs.append(Document(page_content=page_content, metadata=m_data))


Working on text annotation: 1/54
Working on text annotation: 2/54
Working on text annotation: 3/54
Working on text annotation: 4/54
Working on text annotation: 5/54
Working on text annotation: 6/54
Working on text annotation: 7/54
Working on text annotation: 8/54
Working on text annotation: 9/54
Working on text annotation: 10/54
Working on text annotation: 11/54
Working on text annotation: 12/54
Working on text annotation: 13/54
Working on text annotation: 14/54
Working on text annotation: 15/54
Working on text annotation: 16/54
Working on text annotation: 17/54
Working on text annotation: 18/54
Working on text annotation: 19/54
Working on text annotation: 20/54
Working on text annotation: 21/54
Working on text annotation: 22/54
Working on text annotation: 23/54
Working on text annotation: 24/54
Working on text annotation: 25/54
Working on text annotation: 26/54
Working on text annotation: 27/54
Working on text annotation: 28/54
Working on text annotation: 29/54
Working on text annotat

In [22]:
# Sanity check: number of documents currently held in `source_docs`.
len(source_docs)

54

In [141]:
from docx import Document as document_creator

# Everything that will be indexed: the page-text documents followed by the
# figure-description documents.
new_docs = [*source_docs, *image_docs]

print("Loading Document class")
document = document_creator()

# Write the text of every page document into a single Word file, one
# paragraph per page. The figure descriptions are not included in this export.
for text_doc in source_docs:
    document.add_paragraph(text_doc.page_content)
print("Iteration complete")

document_path = os.path.join(BASE_DIR, "docs", "source.docx")
document.save(document_path)


Loading Document class
Iteration complete


In [145]:
# Total number of documents (page text + figure descriptions) about to be indexed.
len(new_docs)

536

In [1]:
# Create a Vectorstore
#
# Builds the "polly-rag" Chroma collection from `new_docs` using `embeddings`
# and persists it under <BASE_DIR>/chroma. `Chroma`, `embeddings` and
# `BASE_DIR` come from the first cell, so that cell has to be run before this
# one in a fresh kernel.
chroma_persist_dir = os.path.join(BASE_DIR, "chroma")

chroma_db = Chroma.from_documents(
    documents=new_docs,
    embedding=embeddings,
    collection_name="polly-rag",
    persist_directory=chroma_persist_dir,
)


NameError: name 'Chroma' is not defined