In [109]:
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import base64
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage, AIMessage
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from typing import Optional, List

# Load variables from a .env file into the environment before the chat client is built.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

# Identifier of the chat model used for every extraction request in this notebook.
MODEL_NAME = "gpt-3.5-turbo"
chat_model = ChatOpenAI(model_name=MODEL_NAME)


# Structured fields extracted from a document that describes an event; this is the
# response model registered under the "event" key of DocumentsDict.
# NOTE(review): the Optional[...] fields still pass `...` as their default, which
# presumably makes them required-but-nullable — confirm against the installed
# pydantic version.
class EventResponse(BaseModel):
    Keywords: Optional[List[str]] = Field(
        ...,
        description="Keywords summarizing the event's content, aiding in search and categorization.",
    )
    title: str = Field(
        ...,
        description="The title of the event.",
    )
    location: Optional[str] = Field(
        ...,
        description="The location of the event.",
    )
    start_time: Optional[str] = Field(
        ...,
        description="The time event begin.",
    )
    end_time: Optional[str] = Field(
        ...,
        description="The time event finish.",
    )
    content_md: str = Field(
        ...,
        description=(
            "The content of the event in Markdown format. "
            "sprinkled with emojis for enhanced readability."
        ),
    )


# Structured fields extracted from a note; this is the response model registered
# under the "note" key of DocumentsDict.
class NoteResponse(BaseModel):
    Keywords: Optional[List[str]] = Field(
        ...,
        description="Keywords summarizing the note's content, aiding in search and categorization.",
    )
    title: str = Field(
        ...,
        description="The subject or topic of the notes.",
    )
    content_md: str = Field(
        ...,
        description=(
            "The content of the note in Markdown format. "
            "sprinkled with emojis for enhanced readability."
        ),
    )


# One purchased line item; ReceiptResponse.items holds a list of these.
class Item(BaseModel):
    name: str = Field(
        ...,
        description="The name of the item.",
    )
    quantity: float = Field(
        ...,
        description="Quantity of the item purchased.",
    )
    total_price: float = Field(
        ...,
        description="Total price of the purchased item.",
    )


# Structured fields extracted from a receipt or invoice; this is the response model
# registered under the "receipt" key of DocumentsDict.
# NOTE(review): store_name and total_price are Optional yet still pass `...` as
# their default, which presumably makes them required-but-nullable — confirm.
class ReceiptResponse(BaseModel):
    Keywords: Optional[List[str]] = Field(
        ...,
        description="Keywords summarizing the receipt's content, aiding in search and categorization.",
    )
    title: str = Field(
        ...,
        description="A short name to identify the receipt.",
    )
    store_name: Optional[str] = Field(
        ...,
        description="Name of the store where the purchase was made.",
    )
    items: List[Item] = Field(
        ...,
        description="List of items purchased.",
    )
    total_price: Optional[float] = Field(
        ...,
        description="Total price of all items purchased.",
    )
    content_md: str = Field(
        ...,
        description=(
            "A summary or description of the purchase, in Markdown format. sprinkled "
            "with emojis for enhanced readability."
        ),
    )


# Catch-all schema for documents of any other kind; this is the response model
# registered under the "others" key of DocumentsDict.
class OthersResponse(BaseModel):
    Keywords: Optional[List[str]] = Field(
        ...,
        description="Keywords summarizing the document's content, aiding in search and categorization.",
    )
    title: str = Field(
        ...,
        description="A short name or title to identify the document.",
    )
    content_md: str = Field(
        ...,
        description=(
            "Content or description of the document, in Markdown format. "
            "sprinkled with emojis for enhanced readability."
        ),
    )

# Registry of supported document types. Each entry pairs the phrase substituted
# into the prompt's {type_description} slot with the pydantic model that the
# output parser is built from.
DocumentsDict = {
    "event": {
        "type_description": "an event or activity",
        "response_model": EventResponse,
    },
    "note": {
        "type_description": "a lecture note or a piece of information",
        "response_model": NoteResponse,
    },
    "receipt": {
        "type_description": "a shopping receipt or invoice",
        "response_model": ReceiptResponse,
    },
    "others": {
        "type_description": "a document of any type",
        "response_model": OthersResponse,
    },
}


# Text used as the SystemMessage content when the chat messages are assembled.
system_msg = (
    "You are a professional organizer whose goal is to convert unstructured data "
    "into a formatted structure and extract valuable information."
)

# Human-turn prompt: states the task for the given document type, then appends the
# parser's format instructions, and finally the raw document text.
main_human_prompt = PromptTemplate(
    input_variables=['doc', 'type_description', 'format_instructions'],
    template=(
        "You are presented with a disorganized document containing information about {type_description}. "
        "Your task is to extract crucial details from the document.\n"
        "{format_instructions}\n"
        "\n"
        "Document:\n"
        "{doc}"
    ),
)

In [110]:
# Sample input document: a plain-text shopping receipt used as the `doc` value
# when the prompt is rendered in the statements that follow.
text = """--------------------------------------------------
       ABC Mart - Receipt
--------------------------------------------------

Date: October 15, 2023   Time: 14:30:45
Cashier: John Smith   Register: 0123

--------------------------------------------------

Item                        Quantity     Price
--------------------------------------------------

1. Blue Jeans                1           $29.99
2. White T-Shirt             2           $9.99 each
3. Running Shoes             1           $49.99

--------------------------------------------------

Subtotal:                    $99.96
Sales Tax (7%):              $7.00
Total Amount:               $106.96

--------------------------------------------------

Payment Method: Credit Card
Card Number: **** **** **** 1234
Cardholder: John Doe

--------------------------------------------------

Thank you for shopping at ABC Mart!
Please keep this receipt for your records.

Visit us again soon!

--------------------------------------------------

   """
# Key into DocumentsDict selecting which schema and prompt wording to use.
# NOTE(review): the sample text is a receipt, yet 'others' is selected — presumably
# deliberate to exercise the generic schema; confirm.
docType = 'others'

# Resolve the prompt phrase and response model registered for this document type.
doc_spec = DocumentsDict[docType]
type_description = doc_spec["type_description"]
response_model_cls = doc_spec["response_model"]
parser = PydanticOutputParser(pydantic_object=response_model_cls)

# Render the human prompt with the document, its type phrase and the parser's
# format instructions, then pair it with the system message.
human_prompt_value = main_human_prompt.format_prompt(
    doc=text,
    type_description=type_description,
    format_instructions=parser.get_format_instructions(),
)
messages = [
    SystemMessage(content=system_msg),
    HumanMessage(content=human_prompt_value.to_string()),
]

# Send the conversation to the chat model.
llm_response = chat_model.predict_messages(messages)

# Parse the reply text into an instance of the selected response model.
# NOTE(review): the name says "event" but the value is whatever model docType selected.
event_response = parser.parse(llm_response.content)

In [113]:
# Show the content_md field of the parsed response; print() keeps its line breaks readable.
print(event_response.content_md)

Date: October 15, 2023   Time: 14:30:45
Cashier: John Smith   Register: 0123

Item                        Quantity     Price

1. Blue Jeans                1           $29.99
2. White T-Shirt             2           $9.99 each
3. Running Shoes             1           $49.99

Subtotal:                    $99.96
Sales Tax (7%):              $7.00
Total Amount:               $106.96

Payment Method: Credit Card
Card Number: **** **** **** 1234
Cardholder: John Doe

Thank you for shopping at ABC Mart!
Please keep this receipt for your records.

Visit us again soon!



In [112]:
# Inspect the fully rendered human prompt (second entry of `messages`) sent to the model.
print(messages[1].content)

You are presented with a disorganized document containing information about a document of any type. Your task is to extract crucial details from the document.
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"Keywords": {"anyOf": [{"items": {"type": "string"}, "type": "array"}, {"type": "null"}], "description": "Keywords summarizing the document's content, aiding in search and categorization.", "title": "Keywords"}, "title": {"description": "A short name or title to identify the document.", "title": "Title", "type": "string"}, "content_md": {"description": "Content or descrip