In [1]:
from io import BytesIO
import pypdfium2 as pdfium
import backoff
import asyncio
import json
import os
import base64
import uuid
import pandas as pd
from PIL import Image
from IPython.display import Image, display

from openai import OpenAIError
from openai import AsyncOpenAI, OpenAI
from langchain_openai import ChatOpenAI

from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser

In [2]:
MODEL = "gpt-4o-2024-08-06"
baseurl = ""
apikey = ""

#client  = OpenAI(api_key=apikey, base_url=baseurl)
clienta = AsyncOpenAI(api_key=apikey,  base_url=baseurl)
os.environ["OPENAI_API_BASE"] = baseurl
os.environ["OPENAI_API_KEY"] = apikey

In [3]:

@backoff.on_exception(backoff.expo, OpenAIError)
async def parse_page_with_gpt(base64_image: str) -> str:
    messages=[
        {
            "role": "system",
            "content": """
            
            You are a helpful assistant that extracts information from images into text. 
            Do not answer with any additional explanations like "Sure, here is the extracted text"
                  
            """
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Extract information from this document"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                        "detail": "auto"
                    },
                },
            ],
        }
    ]
    response = await clienta.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=0,
        max_tokens=4096,
    )
    return response.choices[0].message.content or ""

In [4]:
async def document_analysis(filename: str) -> str:
    """
    Document Understanding

    Args:
        filename: pdf filename str
    """

    pdf = pdfium.PdfDocument(filename)
    images = []
    for i in range(len(pdf)):
        page = pdf[i]
        image = page.render(scale=4).to_pil()
        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        img_byte = buffered.getvalue()
        img_base64 = base64.b64encode(img_byte).decode("utf-8")
        images.append(img_base64)

    text_of_pages = await asyncio.gather(*[parse_page_with_gpt(image) for image in images])
    
    results = []

    extracted_texts = [doc for doc in text_of_pages]

    for text in extracted_texts:
        cleaned_content_json = text.strip('```plaintext\n').strip('')
        results.append(cleaned_content_json)
        
    return results

In [5]:
# display documents in vstore
def show_vstore(store):
    vector_df = store_to_df(store)
    display(vector_df)
    
# convert vector store into df to convenient_access
def store_to_df(store):
    v_dict = store.docstore._dict
    data_rows = []
    for k in v_dict.keys():
        doc_name = v_dict[k].metadata['source'].split('/')[-1]
        content = v_dict[k].page_content
        data_rows.append({"chunk_id": k, "document": doc_name, "content": content})
    vector_df = pd.DataFrame(data_rows)
    return vector_df

# add document to vector store
def add_to_vector_store(file, store):
    loader = TextLoader(file)
    documents = loader.load()
    
    chunk_size=1000
    chunk_overlap=500
    
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    texts = text_splitter.split_documents(documents)
    
    embeddings = OpenAIEmbeddings()
    extension = FAISS.from_documents(texts, embeddings)
    store.merge_from(extension)

# delete document from vector store  
def delete_document(store, document):
    vector_df = store_to_df(store)
    chunk_lists = vector_df.loc[vector_df['document']==document]['chunk_id'].tolist()
    store.delete(chunk_lists)

In [6]:
document = Document(
    page_content="",
    metadata={"source": ""},
)
documents = [document]
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

In [7]:
show_vstore(vectorstore)

Unnamed: 0,chunk_id,document,content
0,cb118d4a-b0b6-481b-bf3d-7aeae316881f,,


In [8]:
# Add first PDF document
docs_list = await document_analysis("workato.pdf")

In [9]:
docs_list

['Workato with UserEvidence\n\nBehind the AI Hype\n\nThe 2024 State of LLMs in Business Processes',
 "Contents\n\n01 Overview  \n02 Top Findings  \n03 Chapter 1: What's Driving AI Adoption  \n05 Chapter 2: Activating LLMs in Business Processes  \n10 Chapter 3: The Outcomes of Generative AI in Business Processes  \n15 Chapter 4: Governance of AI in Business Processes  \n18 Chapter 5: The Role of People in a World of Autonomous AI and Agents  \n23 Chapter 6: AI Agents & The Future of Business Processes  \n28 Conclusion  \n29 Methodology & Demographics  \n32 About UserEvidence  ",
 'Overview\n\nMarket sentiment about generative AI has entered a new era. Early hype is fading. Business leaders have a clearer view of its strengths and challenges.\n\nMajor press, such as the Wall Street Journal, The Economist, and others have wondered if the AI hype over-promised and under-delivered. But very few have considered a key value driver for companies: Business processes and operations. As legendary

In [10]:
# Add first PDF to vector store
output_file_path = f"{uuid.uuid4()}.txt"

with open(output_file_path, 'w') as json_file:
    json.dump(docs_list, json_file, indent=2)

print(f"data has been written to {output_file_path}")

add_to_vector_store(output_file_path, vectorstore)

# Check if the file exists
if os.path.exists(output_file_path):
    
    # Delete the file
    os.remove(output_file_path)
    print(f"File {output_file_path} deleted successfully.")
else:
    print("File does not exist.")

data has been written to e775e18c-71f0-4d9c-a3c4-b44802651873.txt
File e775e18c-71f0-4d9c-a3c4-b44802651873.txt deleted successfully.


In [11]:
show_vstore(vectorstore)

Unnamed: 0,chunk_id,document,content
0,cb118d4a-b0b6-481b-bf3d-7aeae316881f,,
1,8de6b3f1-2276-4607-b845-14f0d4e6b50b,e775e18c-71f0-4d9c-a3c4-b44802651873.txt,"[\n ""Workato with UserEvidence\n\nBehind the ..."


In [12]:
# Add second PDF document
docs_list = await document_analysis("HANA Business Case.pdf")

In [13]:
docs_list

['SAP HANA Business Case  \nThe Smart Move  \nAugust 2016  \n\nSample Business Case for (Your Company)  \nBook a meeting to complete!  \n\nSAP',
 'The process to develop a personalised business case\n\nFirst Step – A quick value assessment\n- Tailored view of potential impacts\n- Estimated benefits\n- Potential improvements based on benchmarks\n- Proven Use cases relevant to your industry\n- Benchmarking surveys to tailor your assessment\n\nDownload a sample for review\n\nRegister to get your personalised business case\n\nBusiness case sent to you / presentation of results\n\n© 2015 SAP SE or an SAP affiliate company. All rights reserved.',
 'Transition to the SAP Platform powered by SAP HANA  \nIT is Live - Always On and Always Connected\n\nSAP HANA Business Case Overview\n- SAP Platform Overview\n- SAP HANA Migration\n- Business Case Elements\n- Customer Examples\n- SAP HANA Use Cases (General)\n- Business Scenario Recommendations\n- Benefits Potential\n- Other Ways to engage with SA

In [14]:
# Add second PDF to vector store
output_file_path = f"{uuid.uuid4()}.txt"

with open(output_file_path, 'w') as json_file:
    json.dump(docs_list, json_file, indent=2)

print(f"data has been written to {output_file_path}")

add_to_vector_store(output_file_path, vectorstore)

# Check if the file exists
if os.path.exists(output_file_path):
    
    # Delete the file
    os.remove(output_file_path)
    print(f"File {output_file_path} deleted successfully.")
else:
    print("File does not exist.")

data has been written to e2dab474-1195-4c0d-9ab0-1b11efe237d3.txt
File e2dab474-1195-4c0d-9ab0-1b11efe237d3.txt deleted successfully.


In [15]:
show_vstore(vectorstore)

Unnamed: 0,chunk_id,document,content
0,cb118d4a-b0b6-481b-bf3d-7aeae316881f,,
1,8de6b3f1-2276-4607-b845-14f0d4e6b50b,e775e18c-71f0-4d9c-a3c4-b44802651873.txt,"[\n ""Workato with UserEvidence\n\nBehind the ..."
2,14fc472e-d687-480a-b781-b26f1c354bb2,e2dab474-1195-4c0d-9ab0-1b11efe237d3.txt,"[\n ""SAP HANA Business Case \nThe Smart Move..."


In [None]:
# If we want to delete the document from vector store.
#delete_document(vectorstore, "ddec3dc1-b80c-4df2-a5cb-2515e0803a89.txt")

In [16]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 4}) 
llm = ChatOpenAI(model_name="gpt-4o", temperature=0, streaming=True)

In [17]:
#RAG Chain
rag_prompt = PromptTemplate(
    template="""
    
    You are a helpful assistant tasked with performing document understanding. Do not answer with any additional explanations like "Sure, here is the extracted text" 
    Always return in JSON with source of document.
    
    QUESTION: {question} \n
    CONTEXT: {context} \n
    Answer:
    
    """,
    input_variables=["question","context"],
)

rag_prompt_chain = rag_prompt | llm | StrOutputParser()

QUESTION = """ 

What are the companies in the Opportunity to Cash in SAP HANA context?

"""
CONTEXT = retriever.invoke(QUESTION)

result = rag_prompt_chain.invoke({"question": QUESTION, "context":CONTEXT})

print(result)

```json
{
    "companies": [
        "Jebsen & Jessen",
        "Ball",
        "Cheney Brothers",
        "Stylem",
        "Fleming",
        "Amalgam"
    ],
    "source": "e2dab474-1195-4c0d-9ab0-1b11efe237d3.txt"
}
```


In [18]:
#RAG Chain
rag_prompt = PromptTemplate(
    template="""
    
    You are a helpful assistant tasked with performing document understanding. Do not answer with any additional explanations like "Sure, here is the extracted text" 
    Always return in JSON with source of document.
    
    QUESTION: {question} \n
    CONTEXT: {context} \n
    Answer:
    
    """,
    input_variables=["question","context"],
)

rag_prompt_chain = rag_prompt | llm | StrOutputParser()

QUESTION = """ 

In Workato context, what is the percentage of Strong Governance in Figure 4.1 ?

"""
CONTEXT = retriever.invoke(QUESTION)

result = rag_prompt_chain.invoke({"question": QUESTION, "context":CONTEXT})

print(result)

```json
{
    "percentage_of_strong_governance": "55%",
    "source": "e775e18c-71f0-4d9c-a3c4-b44802651873.txt"
}
```
