In [1]:
from pathlib import Path
import pandas as pd
import json
from utils.ocr_utils import get_filtred_text_from_ocr_output
from utils.openai_functions import get_answer_from_elaborate_prompt

In [2]:
# Prompt passed as the first argument of get_answer_from_elaborate_prompt.
# presumably used as the chat "system" message — confirm in utils.openai_functions.
SYSTEM_EXTRACTION_PROMPT = """
You are an expert accountant.
"""

# Prompt passed as the second argument of get_answer_from_elaborate_prompt; the
# filtered OCR text is concatenated directly after the final line ("Here is the
# OCR output:"). The reply is parsed with json.loads, and the keys of the example
# object below end up as columns of the prediction DataFrame.
# NOTE(review): the wording says "Purchase Order", but the extraction loop feeds
# every *.json found under the OCR results folder, and the displayed results
# include files named "Invoice" — confirm the wording is intended for both.
USER_EXTRACTION_PROMPT = """
I will give you the output of an OCR system, run on a Purchase Order.
You will give me the PO number, the name of the vendor company, the name of the receiver company, and the PO total amount.
Answer in a JSON format. An example of answer is given below:
{
    "po_number": "4503500962",
    "vendor_name": "company1",
    "receiver_name": "company2",
    "total_amount": "1,000.00"
}
Here is the OCR output:
"""

In [3]:
# Run the field-extraction prompt on every OCR result file and collect the
# answers into one DataFrame, which is also saved as a CSV file.
ocr_prediction_folder_path = Path("../data/ocr_results")
prediction_output_path = Path("../data/predictions/extraction_prediction.csv")

prediction_list = []

# NOTE: the files are processed in the order rglob yields them (deliberately not
# sorted), so the row order of the resulting table is unchanged.
for ocr_prediction_path in ocr_prediction_folder_path.rglob("*.json"):
    # The parent folder name identifies the order, the file stem the document.
    order_name = ocr_prediction_path.parts[-2]
    file_name = ocr_prediction_path.stem
    filtered_ocr_prediction = get_filtred_text_from_ocr_output(ocr_prediction_path)

    # Step 1: query the model. Best-effort: a failure on one document is
    # reported and the document is skipped.
    try:
        raw_answer = get_answer_from_elaborate_prompt(
            SYSTEM_EXTRACTION_PROMPT,
            USER_EXTRACTION_PROMPT + filtered_ocr_prediction
        )
    except Exception as e:
        print("error on "+order_name+" "+file_name+" : "+str(e))
        continue

    # Step 2: parse the answer. The model is only *asked* to answer in JSON, so
    # a malformed or empty answer must not abort the whole run and discard the
    # predictions already collected.
    try:
        predicted_fields = json.loads(raw_answer.strip())
    except (AttributeError, ValueError) as e:
        # AttributeError: the answer is not a string (e.g. None).
        # ValueError: covers json.JSONDecodeError (invalid or empty JSON).
        print("error on "+order_name+" "+file_name+" : invalid JSON answer : "+str(e))
        continue

    # Valid JSON that is not an object (list, number, ...) cannot be unpacked
    # into a record below.
    if not isinstance(predicted_fields, dict):
        print("error on "+order_name+" "+file_name+" : JSON answer is not an object")
        continue

    predicted_fields = {
        "order_name" : order_name,
        "file_name" : file_name,
        **predicted_fields
    }
    prediction_list.append(predicted_fields)

prediction_df = pd.DataFrame.from_records(prediction_list)

# Create the output folder if needed so the results are not lost to a missing
# directory after all the model calls have been made.
prediction_output_path.parent.mkdir(parents=True, exist_ok=True)
prediction_df.to_csv(prediction_output_path, index=False)

In [4]:
# Bare last expression: renders the prediction table as the cell output for a
# quick visual check of the extracted fields.
prediction_df

Unnamed: 0,order_name,file_name,po_number,vendor_name,receiver_name,total_amount
0,BATAL,PO,4503500962,BATAL SAFETY B.V.,AXIMA,7558.40
1,BATAL,Invoice,4503500962,BATAL SAFETY B.V.,Hamel Shipyards,7558.40
2,JOBAL,PO,4503501276,A. NOBEL EN ZN,Dixstone Shipyards (Holland) BV,1181.68
3,JOBAL,Invoice,4503501276,Hanelaastrichtijj B.V.,Hamel shipyards b.v.,1151.68
4,SAG & LOCH,PO,4503500620,SAG & LOCH Catering Limited,HAMEL SHIPYARDS B.V.,182045.50
5,SAG & LOCH,Invoice,4503500620,SAG & Loch Catering Ltd.,Hamel Shipyard bv,992.99 €
6,TARAFA,PO,4503500420,TARAFA MARINE BV,HAMEL SHIPYARD B.V.,16564.55
7,TARAFA,Invoice,NL1J222372,TARAFA JRC Marine,Axima,8172.20


In [5]:
# Load the reference (ground-truth) extraction table from CSV and display it.
# NOTE(review): read_csv infers column dtypes by default, so numeric-looking
# columns may be loaded as numbers, whereas the predicted fields come straight
# from json.loads — confirm the types are aligned before comparing the two tables.
extraction_ground_truth_path = "../data/ground_truth/extraction_ground_truth.csv"
extraction_ground_truth_df = pd.read_csv(extraction_ground_truth_path)
extraction_ground_truth_df

Unnamed: 0,order_name,file_name,po_number,vendor_name,receiver_name,total_amount
0,BATAL,PO,4503500962,BATAL SAFETY B.V.,HAMEL SHIPYARD BV,7558.4
1,BATAL,Invoice,4503500962,BATAL SAFETY B.V.,Hamel Shipyards,7558.4
2,JOBAL,PO,4503501276,E. Jobal bv,HAMEL SHIPYARD BV,1016.68
3,JOBAL,Invoice,4503501276,E. Jobal bv,Hamel Shipyards,1151.68
4,SAG & LOCH,PO,4503500620,SAG & LOCH Catering Limited,HAMEL SHIPYARDS B.V.,182045.5
5,SAG & LOCH,Invoice,4503500620,SAG & Loch Catering Ltd.,Hamel Shipyard bv,992.99
6,TARAFA,PO,4503500420,TARAFA MARINE BV,HAMEL SHIPYARDS B.V.,16564.55
7,TARAFA,Invoice,4503500420,TARAFA MARINE,HAMEL SHIPYARDS (BV.),8172.2
