## Step 1. Convert files from database into readable data
The following two cells open the PDF stored in the `docs` directory (essentially our knowledge base, in this case), extract its text, and convert it into JSON indexed by page, where a page ends at each copyright footer.

In [1]:
import fitz  # PyMuPDF
from pathlib import Path

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with one entry per page, in document order, holding the
        result of ``page.get_text()`` for that page.
    """
    # Open the document as a context manager so it is closed (and its file
    # handle released) even if text extraction raises part-way through.
    with fitz.open(pdf_path) as pdf_document:
        return [page.get_text() for page in pdf_document]

# Source document, given relative to the current working directory.
pdf = Path("docs/QGenda Whitepaper.pdf")

# Run the extraction; later cells read the result from `raw_text`.
raw_text = extract_text_from_pdf(pdf)


In [2]:
# Pagination
import json
import re
from utilities import clean_text

# Footer text that marks the end of a logical page. It is used as a regular
# expression, so the trailing "." matches any character except a newline,
# not only a literal period.
pattern = r"Copyright © 2024 QGenda, LLC All rights reserved."
footer_re = re.compile(pattern)  # compiled once instead of on every iteration

# Accumulate extracted chunks and close a logical page each time a chunk
# contains the copyright footer.
pages = []
current_page = []
for item in raw_text:
    current_page.append(item)
    if footer_re.search(item):
        cleaned_text = clean_text("".join(current_page))
        pages.append({"index": len(pages), "text": cleaned_text})
        current_page = []

# Flush whatever follows the last footer. Without this, text extracted after
# the final footer (or all of it, when no chunk contains the footer) would be
# silently dropped. Whitespace-only leftovers are skipped.
if any(chunk.strip() for chunk in current_page):
    cleaned_text = clean_text("".join(current_page))
    pages.append({"index": len(pages), "text": cleaned_text})
current_page = []

# Serialize the list of {"index", "text"} records; the last cell writes this
# string to disk.
pages_json = json.dumps(pages, indent=4)

# Output the result
#print(pages_json)

In [3]:
# NOTE: this entire cell is a single triple-quoted string literal, so the
# LLM-based clean-up below is disabled and never executes. Because the string
# is the cell's only expression, the notebook echoes it back as the cell output.
# Before re-enabling it:
#   - `prompt` is not defined in any of the cells shown above (TODO confirm
#     where it is supposed to come from);
#   - the loop rebinds `raw_text`, which already holds the text extracted
#     from the PDF in the first cell.
'''# Lazy way: Use LLM to clean up outputs.
from langchain_openai import ChatOpenAI
from langchain.docstore.document import Document
from langchain.chains import LLMChain
from utilities import get_openai_key

api_key = get_openai_key()

llm = ChatOpenAI(model_name='gpt-4o-mini', temperature=1.0)
cleaned_pages = []
for page in pages:
    raw_text = page['text']
    # Create the chain using the RunnableSequence
    chain_qa = prompt | llm
    llm_results = chain_qa.invoke({"summaries": raw_text}, return_only_outputs=True)
    cleaned_text = llm_results.content
    cleaned_pages.append({"index": page['index'], "text": cleaned_text})

# Convert the cleaned pages list to JSON format
cleaned_pages_json = json.dumps(cleaned_pages, indent=4)

# Output the result
print(cleaned_pages_json)'''


'# Lazy way: Use LLM to clean up outputs.\nfrom langchain_openai import ChatOpenAI\nfrom langchain.docstore.document import Document\nfrom langchain.chains import LLMChain\nfrom utilities import get_openai_key\n\napi_key = get_openai_key()\n\nllm = ChatOpenAI(model_name=\'gpt-4o-mini\', temperature=1.0)\ncleaned_pages = []\nfor page in pages:\n    raw_text = page[\'text\']\n    # Create the chain using the RunnableSequence\n    chain_qa = prompt | llm\n    llm_results = chain_qa.invoke({"summaries": raw_text}, return_only_outputs=True)\n    cleaned_text = llm_results.content\n    cleaned_pages.append({"index": page[\'index\'], "text": cleaned_text})\n\n# Convert the cleaned pages list to JSON format\ncleaned_pages_json = json.dumps(cleaned_pages, indent=4)\n\n# Output the result\nprint(cleaned_pages_json)'

Create a .json file of the cleaned data for later use. 

In [4]:
from pathlib import Path

# Build the output path next to the source PDF: same directory, the stem
# extended with "_cleaned", and the suffix switched to ".json".
new_f = pdf.with_stem(f"{pdf.stem}_cleaned")
json_format = new_f.with_suffix('.json')

# Write the serialized pages to disk as UTF-8 text.
with json_format.open('w', encoding='utf-8') as file:
    file.write(pages_json)