In [5]:
import toml
import os
from openai import OpenAI
import asyncio
from docx import Document
import docx2txt2 as docx2txt

from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

BASE_DIR = os.path.join(os.path.dirname(
    os.path.abspath('')
), "RAG")  # path of the current folder that holds this file

secrets_path = os.path.join(BASE_DIR, "..", "config", "secrets.toml")
folder_path = os.path.join(BASE_DIR, "docs")

API_KEY = toml.load(secrets_path)["OPENAI_API_KEY"]

embedding_model = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(model=embedding_model, api_key=API_KEY)

client = OpenAI(api_key=API_KEY)
pages = []

In [6]:

def load_pdfs():
    for pdf_path in os.listdir(folder_path):
        if pdf_path[-4:] == ".pdf":
            print("Name of pdf: ", pdf_path)
            loader = PyPDFLoader(os.path.join(BASE_DIR, "docs", pdf_path))

            for page in loader.lazy_load():
                pages.append(page)

        else:
            pass

    print("No. of source pages: ", len(pages))


load_pdfs()


Name of pdf:  What-powers-where-achieving-the-devolution-revolution.pdf
Name of pdf:  Making-the-grade.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 43 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)
Ignoring wrong pointing object 52 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)


Name of pdf:  The-power-of-prevention-3.pdf
Name of pdf:  Mission-control_Embargoed1.pdf
Name of pdf:  Devolve-by-default-Decentralisation-and-a-redefined-Whitehall.pdf
Name of pdf:  Close-enough-to-care.pdf
Name of pdf:  Getting-the-machine-learning.pdf
No. of source pages:  320


In [7]:
page = pages[0]
page.metadata

{'source': '/Users/suryaganesan/Documents/GitHub/polly-server/pollyServer/app/RAG/docs/What-powers-where-achieving-the-devolution-revolution.pdf',
 'page': 0}

In [8]:
# Create source for text
source_docs = []

for page in pages:
    page_content = page.page_content
    meta_data = page.metadata
    book_name = os.path.basename(meta_data["source"])

    new_content = "Source name: " + book_name + "\n\n" + page_content

    page.page_content = new_content

    source_docs.append(page)


In [9]:
# Create source for images
import fitz
import base64
from langchain.docstore.document import Document

# pdf_file_path = "/Users/suryaganesan/Documents/GitHub/polly-server/pollyServer/app/RAG/docs/Close-enough-to-care.pdf"
# pdf_file = fitz.open(pdf_file_path)
pdf_imgs = [] # source, page_index, image_list

file_paths = os.listdir(folder_path)
for file_name in file_paths:
  if file_name.endswith('.pdf'):
    pdf_file_path = os.path.join(folder_path, file_name)
    pdf_file = fitz.open(pdf_file_path)
    for page_index in range(pdf_file.page_count):
      page = pdf_file.load_page(page_index)
      page_image_list = page.get_images(full=True)

      if page_image_list:
        bs64_img_list = []
        for image_index, img in enumerate(page_image_list):
          xref = img[0]

          base_image = pdf_file.extract_image(xref)
          img_bytes = base_image["image"]
          img_ext = base_image["ext"]

          bs64_img = base64.b64encode(img_bytes).decode('utf-8')
          bs64_img_list.append(bs64_img)
        
        pdf_imgs.append({"source": os.path.basename(pdf_file_path), "page": page_index+1, "image_list": bs64_img_list})


In [12]:
len(pdf_imgs)

37

In [87]:
from PIL import Image
from io import BytesIO

test = pdf_imgs[19]["image_list"][-1]
image_bytes = base64.b64decode(test)
image = Image.open(BytesIO(image_bytes))

image.show()
print(pdf_imgs[19])

{'source': 'The-power-of-prevention-3.pdf', 'page': 19, 'image_list': ['iVBORw0KGgoAAAANSUhEUgAABDgAAALQCAIAAAD6tVcNAAAACXBIWXMAAA7EAAAOxAGVKw4bAAtnW0lEQVR4nNydAWQja/v2Dy+HY1lKCGEYwzAMwzCGYQhDiCGEEEKIEEIooUIIJYRSpZRSSpVSSimlSilLKVXKUsqylmVZDofD8fLy8X33dT/PTGYmk7S75/z/3//78t5vzjRNk2m2mXl+ua7rvn8xFYXK0lRTKRmloqUoNn1ZLFmlgqkUtMJaaW2tWCiqpZKpqbauOmrJVkq2qrq6TteWWvJM3TUUWytaqmIUS6UCLmtrdFXUFdzfxIOUdFXzDLVqaXXXDC21aiqBTlWqGCVfK7pqwdVKVVOtGkVbKeiloqkqgWM2PCojtLTQVGuGWnf0um82fKvhmg3XoC/pDnXXoB8MDJUewdZoxzRXKXqa4lKpRaO4phbX9GLBUoqOSk9E36JnLJW1IpXPRRvYEzwI9ooeDXtoaVVbr+PxtaanNRy1YmqhrYe2Rg9uqQVDKeqlklbEC0jP65kaVeAYNd7nhku/qUrXVC1Pb3o6Hsc3WmWjHZjdqtOrOrTR9o2Ob7TpDq7a9vWWp3UDo1+1BjVn2PCHTX/YcNfrTr9qdsu4T9PVW77V9M2mZ7R8s1226p5RdfSypfuW4ZqGY1qubZcd27Nt17IcyzIN0zTo2nItO3Ddqu9Vfbp2K2U/DMqNsNJt1ja6rcmguzUeHuzOTk8PLy9Pr6/Pr2/Ory5OTg53D3a39neme9ubdE11sLd1uL99dLh7crR3cnJAdXq8f3K0e7Qz3R52u7WqZdjv1kr/elf89bfir++U39bUtYKmqIaum4Zh2qZJe+U7tu/avuP4rlt23cDzgqBcqQa1aiWsBFS0e4GPKvue57nlsletBPWwWq/SJWy0W6PRaGs07Dfr

In [54]:
# Find page number, attach page info and ask it to use it as context
# Also might be worth checking all images and deleting unnecessary ones
response = client.chat.completions.create(
  model="gpt-4o",
  messages=[
    { "role": "system", "content": "You are a helpful AI assistant. The user will give you an image, you need to describe the insights from the image clearly and in detail. Return all numbers and stats given in it in text. Imagine that people can't see the image, and will only know what you tell them about the contents of the image. So give a detailed description of the contents of the image."},
    {"role": "user",
    "content": [
      {"type": "text", "text": "Describe all the stats, data, contents and information about this image. If image is a plain colour, just say image is blank"},
      {"type": "image_url", "image_url": {
        "url": f"data:image/png;base64, {pdf_imgs[12]["image_list"][0]}"
      }}
    ]}
    ]
)

print(response.choices[0].message.content)

The image is a table displaying data for various departments across different years. Here is the detailed information:

**Columns:**
- Year: 2019-20, 2020-21, 2021-22, 2022-23, 2023-24

**Rows (Departments) and Data:**

1. **CO (Cabinet Office):**
   - 2019-20: 0
   - 2020-21: 0
   - 2021-22: 0
   - 2022-23: 0
   - 2023-24: 0

2. **DBT (Department for Business and Trade):**
   - All years: 0 (data not provided)

3. **DCMS (Department for Digital, Culture, Media and Sport):**
   - 2019-20: Not provided
   - 2020-21: Not provided
   - 2021-22: 17
   - 2022-23: 5
   - 2023-24: 11

4. **DfE (Department for Education):**
   - 2019-20: 55
   - 2020-21: 47
   - 2021-22: 68
   - 2022-23: 29
   - 2023-24: Not provided

5. **DESNZ (Department for Energy Security and Net Zero):**
   - 2023-24: 26 (only data provided for this year)

6. **DfT (Department for Transport):**
   - 2019-20: 1
   - 2020-21: 2
   - 2021-22: 2
   - 2022-23: 2
   - 2023-24: 1

7. **MOJ (Ministry of Justice):**
   - 2019-20:

In [None]:

# Create a Vectorstore
chroma_db = Chroma.from_documents(
    documents=source_docs,
    embedding=embeddings,
    persist_directory=os.path.join(BASE_DIR, "chroma_db"),
    collection_name="polly-rag",
)
