In [1]:
from dotenv import load_dotenv
_ = load_dotenv("config.env")

from common.mongodb import *
from common.files import *
from common.llms import *

doc = file2md("/Users/oliverkohn/repositories/datasphereAI/find_files/backend/data/receipt2.docx")

2025-01-31 19:07:42,780 - my_logger - INFO - host: localhost


host: localhost


In [2]:
document_types = ["Receipt", "Rental Contract", "Sales Agreement", "Non-Disclosure Agreement (NDA)", "Employment Contract", "Business Proposal", "Service Agreement", "Power of Attorney"]
document_types = ", ".join(document_types)

prompt_category= f"""Your task is to eassign below document to a category.
Possible categories are:

{document_types}

Answer in a JSON object of this format:

{{
    "category" : "xxx"
}}

Answer in a json in exactly this format. It is important that your answer only consists of this JSON and nothing else. No additional text.

This is the document you shall extract the information from:

{doc}
"""

llm = OllamaLLM(model="llama3.2:3b", temperature=0, cache=False)
category = execute_text_prompt(llm, prompt_category)
category

2025-01-31 19:07:43,683 - my_logger - INFO - Prompt Original:

Your task is to eassign below document to a category.
Possible categories are:

Receipt, Rental Contract, Sales Agreement, Non-Disclosure Agreement (NDA), Employment Contract, Business Proposal, Service Agreement, Power of Attorney

Answer in a JSON object of this format:

{
    "category" : "xxx"
}

Answer in a json in exactly this format. It is important that your answer only consists of this JSON and nothing else. No additional text.

This is the document you shall extract the information from:


# Official Receipt

Company Name: Tech Solutions UG

Address: Hauptstraße 123, 10115 Berlin, Germany

Phone: +49 30 12345678 | Email: contact@techsolutions.de

Website: www.techsolutions.de

Receipt Number: #20250131-002

Date of Issue: 31.01.2025

Customer Name: Max Mustermann

Customer Address: Musterstraße 45, 10117 Berlin, Germany

Customer Email: max.mustermann@email.com

Customer Phone: +49 170 9876543

| Item Description 

'{\n    "category" : "Receipt"\n}'

In [3]:
print(category)

{
    "category" : "Receipt"
}


In [4]:
prompt_general = f"""Your task is to extract this information from below document:

{{
    "receipt_id" : "xxx",
    "date" : "xxx",
    "items" : {{"price" : "xxx", "name" : "xxx"}}
    "total_price" : {{"amount": "xxx"}}
    "receipt_category" : "xxx"
}}

Answer in a json in exactly this format. It is important that your answer only consists of this JSON and nothing else. No additional text.

This is the document you shall extract the information from:

{doc}
"""

models = ["llama3.2:3b", "deepseek-r1:8b", "deepseek-r1:14b"]

r_llm = {}
for model in models:
    logger.info(f"RUN MODEL '{model}'")
    llm = OllamaLLM(model=model, temperature=0, cache=False)
    r_llm[model] = execute_text_prompt(llm, prompt_general)

2025-01-31 19:07:46,161 - my_logger - INFO - RUN MODEL 'llama3.2:3b'
2025-01-31 19:07:46,177 - my_logger - INFO - Prompt Original:

Your task is to extract this information from below document:

{
    "receipt_id" : "xxx",
    "date" : "xxx",
    "items" : {"price" : "xxx", "name" : "xxx"}
    "total_price" : {"amount": "xxx"}
    "receipt_category" : "xxx"
}

Answer in a json in exactly this format. It is important that your answer only consists of this JSON and nothing else. No additional text.

This is the document you shall extract the information from:


# Official Receipt

Company Name: Tech Solutions UG

Address: Hauptstraße 123, 10115 Berlin, Germany

Phone: +49 30 12345678 | Email: contact@techsolutions.de

Website: www.techsolutions.de

Receipt Number: #20250131-002

Date of Issue: 31.01.2025

Customer Name: Max Mustermann

Customer Address: Musterstraße 45, 10117 Berlin, Germany

Customer Email: max.mustermann@email.com

Customer Phone: +49 170 9876543

| Item Description | 

In [5]:
j_llm = {}
for k, v in r_llm.items():
    j_llm[k] = response2json(v)

j_llm

{'llama3.2:3b': {'receipt_id': '#20250131-002',
  'date': '31.01.2025',
  'items': {'Laptop Repair Service - Diagnostic & Fix': {'price': 120.0,
    'name': 'Laptop Repair Service - Diagnostic & Fix',
    'quantity': 1,
    'unit_price (€)': 120.0,
    'tax (19%) (€)': 22.8,
    'total (€)': 142.8},
   'Software Installation - Windows & Drivers': {'price': 50.0,
    'name': 'Software Installation - Windows & Drivers',
    'quantity': 1,
    'unit_price (€)': 50.0,
    'tax (19%) (€)': 9.5,
    'total (€)': 59.5},
   'Hardware Upgrade (RAM - 16GB)': {'price': 75.0,
    'name': 'Hardware Upgrade (RAM - 16GB)',
    'quantity': 2,
    'unit_price (€)': 75.0,
    'tax (19%) (€)': 28.5,
    'total (€)': 178.5},
   'Antivirus Software - 1 Year License': {'price': 40.0,
    'name': 'Antivirus Software - 1 Year License',
    'quantity': 1,
    'unit_price (€)': 40.0,
    'tax (19%) (€)': 7.6,
    'total (€)': 47.6},
   'Data Recovery Service': {'price': 85.0,
    'name': 'Data Recovery Service'

In [6]:
prompt_items = f"""Below you find a receipt. Your task is to identify the items from this receipt and extract their names as well as their price.
Answer in a json object like this:

{{
    "response" : [{{"item_name" : "xxx", "item_price" : "xxx"}}]
}}

It is important that you answer exactly in this format, a JSON object where the main key is "response". Do not put anything els ein your response.

This is the document you shall extract the information from:

{doc}
"""

models = ["llama3.2:3b", "deepseek-r1:8b", "deepseek-r1:14b"]

r_llm_specific = {}
for model in models:
    logger.info(f"RUN MODEL '{model}'")
    llm = OllamaLLM(model=model, temperature=0, cache=False)
    r_llm_specific[model] = execute_text_prompt(llm, prompt_items)


2025-01-31 19:08:47,889 - my_logger - INFO - RUN MODEL 'llama3.2:3b'
2025-01-31 19:08:47,911 - my_logger - INFO - Prompt Original:

Below you find a receipt. Your task is to identify the items from this receipt and extract their names as well as their price.
Answer in a json object like this:

{
    "response" : [{"item_name" : "xxx", "item_price" : "xxx"}]
}

It is important that you answer exactly in this format, a JSON object where the main key is "response". Do not put anything els ein your response.

This is the document you shall extract the information from:


# Official Receipt

Company Name: Tech Solutions UG

Address: Hauptstraße 123, 10115 Berlin, Germany

Phone: +49 30 12345678 | Email: contact@techsolutions.de

Website: www.techsolutions.de

Receipt Number: #20250131-002

Date of Issue: 31.01.2025

Customer Name: Max Mustermann

Customer Address: Musterstraße 45, 10117 Berlin, Germany

Customer Email: max.mustermann@email.com

Customer Phone: +49 170 9876543

| Item Descri

In [7]:
j_llm = {}
for k, v in r_llm_specific.items():
    j_llm[k] = response2json(v)
    print(v)

j_llm

{
    "response" : [
        {"item_name": "Laptop Repair Service - Diagnostic & Fix", "item_price": "120.00"},
        {"item_name": "Software Installation - Windows & Drivers", "item_price": "50.00"},
        {"item_name": "Hardware Upgrade (RAM - 16GB)", "item_price": "75.00"},
        {"item_name": "Antivirus Software - 1 Year License", "item_price": "40.00"},
        {"item_name": "Data Recovery Service", "item_price": "85.00"}
    ]
}
<think>
Okay, so I need to extract the item names and their prices from this receipt. Let me start by looking at the structure of the receipt. There's a table with columns: Item Description, Quantity, Unit Price (€), Tax (19%), and Total (€). 

First, I'll go through each row one by one. The first item is "Laptop Repair Service - Diagnostic & Fix" with a quantity of 1, unit price €120.00, tax €22.80, and total €142.80. So the name is clear here.

Next, "Software Installation - Windows & Drivers" has similar details: quantity 1, unit price €50.00, ta

{'llama3.2:3b': {'response': [{'item_name': 'Laptop Repair Service - Diagnostic & Fix',
    'item_price': '120.00'},
   {'item_name': 'Software Installation - Windows & Drivers',
    'item_price': '50.00'},
   {'item_name': 'Hardware Upgrade (RAM - 16GB)', 'item_price': '75.00'},
   {'item_name': 'Antivirus Software - 1 Year License', 'item_price': '40.00'},
   {'item_name': 'Data Recovery Service', 'item_price': '85.00'}]},
 'deepseek-r1:8b': {'response': [{'item_name': 'Laptop Repair Service - Diagnostic & Fix',
    'item_price': '120.00'},
   {'item_name': 'Software Installation - Windows & Drivers',
    'item_price': '50.00'},
   {'item_name': 'Hardware Upgrade (RAM - 16GB)', 'item_price': '75.00'},
   {'item_name': 'Antivirus Software - 1 Year License', 'item_price': '40.00'},
   {'item_name': 'Data Recovery Service', 'item_price': '85.00'}]},
 'deepseek-r1:14b': {'response': [{'item_name': 'Laptop Repair Service - Diagnostic & Fix',
    'item_price': '120.00'},
   {'item_name': '