In [1]:
# !pip install langchain
# !pip install openai -U
# !pip install nervaluate

Collecting langchain
  Downloading langchain-0.0.27-py3-none-any.whl (124 kB)
Collecting pydantic
  Downloading pydantic-2.5.2-py3-none-any.whl (381 kB)
Collecting annotated-types>=0.4.0
  Downloading annotated_types-0.5.0-py3-none-any.whl (11 kB)
Collecting typing-extensions>=4.6.1
  Downloading typing_extensions-4.7.1-py3-none-any.whl (33 kB)
Collecting pydantic-core==2.14.5
  Downloading pydantic_core-2.14.5-cp37-none-win_amd64.whl (1.9 MB)
Installing collected packages: typing-extensions, annotated-types, pydantic-core, pydantic, langchain
Successfully installed annotated-types-0.5.0 langchain-0.0.27 pydantic-2.5.2 pydantic-core-2.14.5 typing-extensions-4.7.1
Collecting openai
  Downloading openai-1.3.6-py3-none-any.whl (220 kB)
Collecting httpx<1,>=0.23.0
  Downloading httpx-0.24.1-py3-none-any.whl (75 kB)
Collecting distro<2,>=1.7.0
  Downloading distro-1.8.0-py3-none-any.whl (20 kB)
Collecting anyio<4,>=3.5.0
  Downloading anyio-3.7.1-py3-none-any.whl (80 kB)
Collecting sniffio


In [10]:
import langchain
from langchain.prompts import PromptTemplate # For Building Prompt.
from langchain.chat_models import ChatOpenAI # LLM.

In [22]:
# Few-shot prompt for receipt -> JSON extraction.
# Literal braces are doubled ({{ and }}) so that {receipt} is the only template variable.
# The worked examples are kept consistent with the rules stated at the top of the prompt:
# every receiptDate uses MM/DD/YYYY, a phone number that the receipt does not show is left
# empty instead of being made up, and a leading quantity is not repeated in "description".
prompt = """
Can you please convert this unstructured receipt text data into a structured JSON object which includes ReceiptInfo and ITEMS.
Note that sometimes the quantity for an item may be located next to the item name.
For dates please specify in MM/DD/YYYY.
For phone numbers please specify in (xxx)-xxx-xxxx
Please respond with a single JSON Object instead of text.
Please just give the structured JSON object and nothing else.

Here is the schema for the JSON object:
{{
  "ReceiptInfo": {{
    "merchant": "(string value)",
    "address": "(string value)",
    "city": "(string value)",
    "state": "(string value)",
    "phoneNumber": "(string value)",
    "tax": "(float value)",
    "total": "(float value)",
    "receiptDate": "(string value)",
    "receiptTime": "(string value)",


    "ITEMS": [
      {{
        "description": "(string value)",
        "quantity": "(integer value)",
        "unitPrice": "(float value)",
        "totalPrice": "(float value)",
        "discountAmount": "(float value)"
      }}
    ]
  }}
}}


Examples:

Receipt:
Give us feedback @ <UNKNOWN>
Thank you! ID <UNKNOWN> <UNKNOWN>
Walmart
301-604-0180 Mgr:DEBRA
3549 RUSSETT GRN
LAUREL MD 20724
ST# 01985 OP# 009049 TE# 49 TR# 04345
<UNKNOWN> PRK SB 008265754333 F
1.98 X
BOOSTER
088439295372
19.98 N
VOIDED BANKCARD TRANSACTION
TERMINAL <UNKNOWN> SC010112
TRANSACTION NOT COMPLETE
07/04/23
16:19:53
<UNKNOWN> BANKCARD TRANSACTION
TERMINAL # SC010112
TRANSACTION <UNKNOWN> COMPLETE
07/04/23
16:20:01
SUBTOTAL
21.96
TAX 1
6.000 %
0.12
TOTAL
22.08
VISA TEND
22.08
ACCOUNT #
**** **** <UNKNOWN> 2130 F
APPROVAL # 00592D
REF # 318500762875
TRANS ID - 583185732192035
VALIDATION - B339
PAYMENT SERVICE - E
TERMINAL # SC010112
07/04/23
16:20:21
CHANGE DUE
0.00
# ITEMS SOLD 2
<UNKNOWN> 0069 6254 8220 4557 9869
Walmart
Become a member
Scan for free 30-day trial
Low Prices You Can <UNKNOWN> Every <UNKNOWN>
07/04/23
16:20:22
***CUSTOMER <UNKNOWN>

JSON:
{{
  "ReceiptInfo": {{
    "merchant": "Walmart",
    "address": "3549 RUSSETT GRN",
    "city": "LAUREL",
    "state": "MD",
    "phoneNumber": "(301)-604-0180",
    "tax": "0.12",
    "total": "22.08",
    "receiptDate": "07/04/2023",
    "receiptTime": "4:20pm",


    "ITEMS": [
      {{
        "description": "PRK SB",
        "quantity": "1",
        "unitPrice": "1.98",
        "totalPrice": "1.98",
        "discountAmount": "0.00"
      }},
      {{
        "description": "BOOSTER",
        "quantity": "1",
        "unitPrice": "19.98",
        "totalPrice": "19.98",
        "discountAmount": "0.00"
      }}
    ]
  }}
}}

Receipt:
Unstructured Receipt Text Data:
203
BUY ONE GET ONE FREE QUARTER POUNDER
W/CHEESE OR EGG MCMUFFIN
Go to www.mcdvoice.com within 7 days
and tell <UNKNOWN> about your visit.
Validation Code:
Expires 30 days after receipt date.
Valid at participating US
Survey Code:
<UNKNOWN>
McDonald's Restaurant <UNKNOWN>
2915 E MANCA RD
<UNKNOWN> 96822
TEL# <UNKNOWN>
Thank You Valued Customer
KS# 2
08/19/2023 03:33 PM
<UNKNOWN>
Order 03
1 Happy Meal Ch Burger
5.89
1 Cheeseburger
ONLY Onions
ONLY Ketchup
1 Extra Kids Fry
1 Apple Juice
1 NERF
Subtotal
5.89
Tax
0.28
Take-Out Total
6.17
Cashless
6.17
Change
0.00
MER# 467782
CARD ISSUER
ACCOUNT
Visa SALE
<UNKNOWN> <UNKNOWN> 130
TRANSACTION AMOUNT
6.17
CONTACTLESS
AUTHORIZATION CODE - <UNKNOWN>
SEQ# 107204
AID: A0000000031010
McDonald's Restaurant
Sign up for MyMcDonald's rewards
to earn points on future visit

JSON:
{{
  "ReceiptInfo": {{
    "merchant": "McDonald's Restaurant",
    "address": "2915 E MANOA RD",
    "city": "Honolulu",
    "state": "HI",
    "phoneNumber": "",
    "tax": "0.28",
    "total": "6.17",
    "receiptDate": "08/19/2023",
    "receiptTime": "03:33 PM",


    "ITEMS": [
      {{
        "description": "Happy Meal Ch Burger",
        "quantity": "1",
        "unitPrice": "5.89",
        "totalPrice": "5.89",
        "discountAmount": "0.00"
      }}
    ]
  }}
}}

Receipt:
Unstructured Receipt Text Data:
H
<UNKNOWN>
MART
http://www.hmart.com
458 Keawe st
Honolulu, <UNKNOWN> 96813
TEL (808) 219-0924
Your Cashier was TIM
WANG DANGMYUN
4.99 B
K-T TOPPOKI
PC
6.99 B
OTG SOUP RICECAKE
7.99 B
HT VEGE DUMPLING
14.99 B
TAX
1.65
****
BALANCE
36.61
Discover Credit - C
ACCOUNT <UNKNOWN> ************ 1153
APPROVAL CODE: 01263R
SEQUENCE NUMBER: 22907
No CVM
Amount USD $36.61
CARD:Discover CREDIT XXXX1153 EMV
APPROVAL CODE : 01263R
AID A0000001523010
TVR : 0000008000
IAD <UNKNOWN>
TSI : E800
APPLICATION CRYPTOGRAM CB8F25799B68A2B
APPLICATION PREFERRED NAME Discover Cr
APPLICATION LABEL : Discover
TC CB8F25799B68A2B3
<UNKNOWN> 00
RespDate : 06112023
Resp Time 211509
<UNKNOWN>
TOTAL AMOUNT: $36.61
RESPONSE CODE: APPROVED
06/11/23 09:15pm 113 20
Discover
<UNKNOWN> 61
CHANGE
0.00
TOTAL NUMBER OF ITEMS SOLD =
4
06/11/23 09:15pm 81 20 363 113

JSON:
{{
  "ReceiptInfo": {{
    "merchant": "H MART",
    "address": "458 Keawe st",
    "city": "Honolulu",
    "state": "HI",
    "phoneNumber": "(808)-219-0924",
    "tax": "1.65",
    "total": "36.61",
    "receiptDate": "06/11/2023",
    "receiptTime": "09:15pm",


    "ITEMS": [
      {{
        "description": "WANG DANGMYUN",
        "quantity": "1",
        "unitPrice": "4.99",
        "totalPrice": "4.99",
        "discountAmount": "0.00"
      }},
      {{
        "description": "K-T TOPPOKI PC",
        "quantity": "1",
        "unitPrice": "6.99",
        "totalPrice": "6.99",
        "discountAmount": "0.00"
      }},
      {{
        "description": "OTG SOUP RICECAKE",
        "quantity": "1",
        "unitPrice": "7.99",
        "totalPrice": "7.99",
        "discountAmount": "0.00"
      }},
      {{
        "description": "HT VEGE DUMPLING",
        "quantity": "1",
        "unitPrice": "14.99",
        "totalPrice": "14.99",
        "discountAmount": "0.00"
      }}
    ]
  }}
}}

Receipt:
SAFEWAY
O
<UNKNOWN>
Store 204 Dir Kaipo Ah Mook Sang
Main (808) 988-2058
2855 East Manoa Road
HONOLULU HI 96822
GROCERY
6.99 B
MARINARA SCE
Regular Price
8.49
Member Savings
1.50-
REFRIG/FROZEN
LUC CHES COLBY JAC.
10.99 B
Regular Price
13.49
Member Savings
2.50-
MARIE CALLENDER'S
4.00 B
Regular Price
6.49
Member Savings
2.49-
MEAT
AIDELLS MEATBALLS
5.99 B
Regular Price
8.99
Member Savings
3.00-
TAX
1.32
**** BALANCE
29.29
Credit Purchase 08/01/23 20:15
CARD # <UNKNOWN>
<UNKNOWN> 861544421300 AUTH: 0006375D
PAYMENT AMOUNT
29.29
AL VISA CREDIT
AID A0000000031010
TVR 0000000000
TSI 0000
Visa
29.29
CHANGE
0.00
TOTAL NUMBER OF ITEMS SOLD =
4
08/01/23 20:15 204 51 151
8851
Gameplays Earned 2
Sweepstakes Entries Earned 2
*Subject to limit of 10 <UNKNOWN> day
Earn gameplays and sweepstakes entries
by shopping, then play
Flavor Adventure for a chance to win!
Scan the QR code to Play. Win. Save.
REWARDS AVAILABLE
2
POINTS EARNED TODAY
Base Points 27
Total 27
Points Towards Next Reward 57 of 100
YOUR CASHIER TODAY WAS SELF
YOUR SAVINGS
Member Savings
Total
9.49
Total Savings Value
9.49
25%
Thank <UNKNOWN> you for shopping <UNKNOWN>
For SAFEWAY FOR U questions
877-276-9637 or Safeway.com/foru call

JSON:
{{
  "ReceiptInfo": {{
    "merchant": "SAFEWAY",
    "address": "2855 East Manoa Road",
    "city": "Honolulu",
    "state": "HI",
    "phoneNumber": "(808)-988-2058",
    "tax": "1.32",
    "total": "29.29",
    "receiptDate": "08/01/2023",
    "receiptTime": "8:15pm",


    "ITEMS": [
      {{
        "description": "MARINARA SCE",
        "quantity": "1",
        "unitPrice": "8.49",
        "totalPrice": "6.99",
        "discountAmount": "1.50"
      }},
      {{
        "description": "LUC CHES COLBY JAC.",
        "quantity": "1",
        "unitPrice": "13.49",
        "totalPrice": "10.99",
        "discountAmount": "2.50"
      }},
      {{
        "description": "MARIE CALLENDER'S",
        "quantity": "1",
        "unitPrice": "6.49",
        "totalPrice": "4.00",
        "discountAmount": "2.49"
      }},
      {{
        "description": "AIDELLS MEATBALLS",
        "quantity": "1",
        "unitPrice": "8.99",
        "totalPrice": "5.99",
        "discountAmount": "3.00"
      }}
    ]
  }}
}}

Here is the Unstructured Receipt Text Data (Turn this into structured JSON):

Receipt: {receipt}
"""

In [4]:
# import os
# import openai

# openai.api_key = os.environ["OPENAI_API_KEY"]  # SECURITY: a real key was hardcoded here; revoke it and load the key from the environment instead

# # Creates openai connection.
# response = openai.ChatCompletion.create(
#   model="gpt-3.5-turbo",
#   messages=[
#     {
#       "role": "user",
#       "content": prompt
#     }
#   ],
#   temperature=0,
#   max_tokens=128,
# )
# print(response)


In [5]:
# from langchain.prompts import PromptTemplate
# from langchain.chat_models import ChatOpenAI

# # Language Model being used.
# model = ChatOpenAI(model="gpt-3.5-turbo")  # SECURITY: a real key was hardcoded here; revoke it and let the client read OPENAI_API_KEY from the environment

# # Creates a prompt template from the prompt.
# prompt_object = PromptTemplate.from_template(prompt)

# # Passes prompt into the model.
# chain = prompt_object | model

# # Response from the model.
# response = chain.invoke({'receipt':'abc'})
# response.content

In [11]:
# Display the installed LangChain version so the environment used for this run is recorded.
langchain.__version__

'0.0.324'

In [12]:
# Sample receipt text that is passed to the chain as the `receipt` input further down.
# NOTE: this is a regular (non-raw) triple-quoted string, so every "\n" escape below becomes a
# real newline, while the surrounding double quotes and trailing commas are literal characters
# and are therefore part of the receipt text.
test_receipt_1 = """
"Halal Gyro Kabob House\n",
"240 EAST DELAWARE AVENUE\n",
"NEWARK, DE 19711\n",
"4439937029\n",
"https://www.halalgyrokabobhouse.com\n",
"ORDER: 77\n",
"For Here\n",
"Cashier: Employee\n",
"06-Jul-2023 7:57:49P\n",
"Transaction 244806\n",
"1\n",
"#18. Lamb Salad\n",
"$13.99\n",
"Total\n",
"$13.99\n",
"Tip\n",
"$2.10\n",
"CREDIT CARD AUTH\n",
"$16.09\n",
"VISA 2130\n",
"Retain this copy for statement validation\n",
"06-Jul-2023 7:57:59P\n",
"$16.09 <UNKNOWN> Method: CONTACTLESS\n",
"VISA CREDIT <UNKNOWN>\n",
"Reference ID: 318700509347 <UNKNOWN> Auth ID:\n",
"02779D\n",
"MID: ********3889\n",
"AID: A0000000031010\n",
"AthNtwkNm: VISA\n",
"Online: https://clover.com/p\n",
"/GD5PY93HP0HMM\n",
"Clover ID: TEXFAZ7ZA34GM\n",
"Payment <UNKNOWN>\n",
"Clover Privacy Policy\n",
"https://clover.com/privacy\n",
"""

In [23]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI

# Chat model used for the extraction. No key is passed here, so the client is expected to pick
# it up from the OPENAI_API_KEY environment variable. temperature=0 is set because this is a
# structured-extraction task where repeatable output matters more than variety.
model = ChatOpenAI(temperature=0)

# Wrap the few-shot prompt in a template; {receipt} is its only input variable.
prompt_template = PromptTemplate.from_template(prompt)

# Run the filled-in prompt through the model. The raw JSON text is kept in a variable so that
# later cells can parse it without calling the API again, and it is still displayed as the
# cell's output.
chain = LLMChain(llm=model, prompt=prompt_template)
receipt_json_text = chain.run(receipt=test_receipt_1)
receipt_json_text

'{\n  "ReceiptInfo": {\n    "merchant": "Halal Gyro Kabob House",\n    "address": "240 EAST DELAWARE AVENUE",\n    "city": "NEWARK",\n    "state": "DE",\n    "phoneNumber": "(443)-993-7029",\n    "tax": "0.00",\n    "total": "16.09",\n    "receiptDate": "07/06/2023",\n    "receiptTime": "7:57pm",\n\n\n    "ITEMS": [\n      {\n        "description": "#18. Lamb Salad",\n        "quantity": "1",\n        "unitPrice": "13.99",\n        "totalPrice": "13.99",\n        "discountAmount": "0.00"\n      }\n    ]\n  }\n}'

In [None]:
import nervaluate
import json

# Expected response from ChatGPT for the Walmart receipt.
# The text has to be strictly valid JSON so that json.loads accepts it; in particular there must
# be no trailing comma after the last element of the "ITEMS" array.
example_response = '''{
  "ReceiptInfo": {
    "merchant": "Walmart",
    "address": "3549 RUSSETT GRN",
    "city": "LAUREL",
    "state": "MD",
    "phoneNumber": "301-604-0180",
    "tax": "0.12",
    "total": "22.08",
    "receiptDate": "07/04/23",
    "receiptTime": "16:20:22",

    "ITEMS": [
      {
        "description": "BOOSTER",
        "quantity": "1",
        "unitPrice": "1.98",
        "totalPrice": "1.98",
        "discountAmount": "0.00"
      }
    ]
  }
}'''

# (label, JSON key) pairs for the receipt-level fields, in the order they are processed.
_RECEIPT_FIELDS = (
    ("MERCHANT", "merchant"),
    ("ADDRESS", "address"),
    ("CITY", "city"),
    ("STATE", "state"),
    ("PHONE", "phoneNumber"),
    ("TAX", "tax"),
    ("TOTAL", "total"),
    ("DATE", "receiptDate"),
)

# (label, JSON key) pairs for the fields of each entry in "ITEMS".
_ITEM_FIELDS = (
    ("ITEM_DESC", "description"),
    ("QTY", "quantity"),
    ("UNIT_PRICE", "unitPrice"),
    ("TOTAL_PRICE", "totalPrice"),
    ("DISCOUNT", "discountAmount"),
)


def find_span(text, entity_text):
    """Locate the first exact occurrence of ``entity_text`` inside ``text``.

    Args:
        text: The string to search in.
        entity_text: The substring to look for.

    Returns:
        ``((start, end), entity_text)`` with ``text[start:end] == entity_text``,
        or ``None`` when ``entity_text`` does not occur in ``text``.
    """
    start = text.find(entity_text)
    if start == -1:
        return None
    end = start + len(entity_text)
    return (start, end), entity_text


def _labelled_span(receipt_text, label, value):
    """Return ``{"start", "end", "label"}`` for ``value``, or ``None`` if it cannot be located."""
    if value is None:
        return None
    entity_text = str(value)  # numbers from the JSON are searched by their string form
    if not entity_text:
        return None  # an empty value would otherwise "match" at offset 0 with zero length
    match = find_span(receipt_text, entity_text)
    if match is None:
        return None
    # find_span returns ((start, end), text); only the offsets are needed here.
    (start, end), _ = match
    return {"start": start, "end": end, "label": label}


def convert_to_prodigy_spans(receipt_text, entities):
    """Turn extracted receipt fields into Prodigy-style character spans.

    Each field value is searched for in ``receipt_text`` as an exact substring and, when found,
    reported as ``{"start": int, "end": int, "label": str}``. Fields that are missing, empty, or
    whose value does not literally occur in the receipt (for example a reformatted phone number)
    are skipped instead of raising. Only the first occurrence of a value is reported, so two
    fields with the same value receive the same offsets.

    Args:
        receipt_text: The raw receipt text the entities were extracted from.
        entities: The extraction result, either as a JSON string (as returned by the model)
            or as an already-parsed mapping. It must have a top-level ``"ReceiptInfo"``
            object; item entries are read from ``ReceiptInfo["ITEMS"]``.

    Returns:
        A tuple ``(prodigy_data, text_vals)`` where ``prodigy_data`` is the list of span dicts
        (receipt-level fields first, then item fields in order) and ``text_vals`` is the list
        of non-empty receipt-level field values as strings, whether or not they were located.

    Raises:
        json.JSONDecodeError: If ``entities`` is a string that is not valid JSON.
        KeyError: If the parsed data has no ``"ReceiptInfo"`` entry.
    """
    if isinstance(entities, (str, bytes, bytearray)):
        entities = json.loads(entities)
    receipt_info = entities["ReceiptInfo"]

    prodigy_data = []
    text_vals = []

    # Receipt-level entities.
    for label, key in _RECEIPT_FIELDS:
        value = receipt_info.get(key)
        if value is None or str(value) == "":
            continue
        text_vals.append(str(value))
        span = _labelled_span(receipt_text, label, value)
        if span is not None:
            prodigy_data.append(span)

    # Item-level entities.
    for item in receipt_info.get("ITEMS") or []:
        for label, key in _ITEM_FIELDS:
            span = _labelled_span(receipt_text, label, item.get(key))
            if span is not None:
                prodigy_data.append(span)

    return prodigy_data, text_vals

# Example usage: convert the extracted entities into spans and pretty-print them as JSON.
# NOTE(review): `receipt_text` and `entities` are not assigned in any cell visible here, so on a
# fresh kernel this cell raises NameError unless they are defined elsewhere. Presumably
# `receipt_text` is the raw receipt and `entities` the matching JSON response - confirm and
# define both before this line.


prodigy_spans_true, text_vals = convert_to_prodigy_spans(receipt_text, entities)
print(json.dumps(prodigy_spans_true, indent=2))
