In [1]:
import pandas as pd
from typing import cast
from utils.matching_functions import document_extracted_values_df_to_dict
from utils.typing import InvoiceFields

In [2]:
def get_non_matching_fields(
    provider_invoice, purchase_order
) -> dict[str, dict[str, str]]:
    """Matching invoice fields and purchase orders using OpenAI API"""
    non_matching_fields = {}
    # Your code here
    return non_matching_fields

def match_po_and_invoice(order_df):
    """
    Detect defects in the dataframe of a single order.
    order_df: dataframe containing the order. It should contain 2 lines, one for the invoice and one for the PO.

    :return: dataframe containing the defects, with the following columns:
        - order_path: path to the order
        - PO_NUMBER_DEFECT: True if the PO_NUMBER is not the same in the invoice and the PO
        - AMOUNT_DEFECT: True if the AMOUNT is higher in the invoice than in the PO
        - VENDOR_NAME_DEFECT: True if the VENDOR_NAME is not the same in the invoice and the PO
        - RECEIVER_NAME_DEFECT: True if the RECEIVER_NAME is not the same in the invoice and the PO
    """
    if order_df.shape[0] <= 1:
        return None
    provider_invoice_df = order_df[order_df["file_name"] == "Invoice"]
    purchase_order_df = order_df[order_df["file_name"] == "PO"]

    provider_invoice_as_dict = document_extracted_values_df_to_dict(provider_invoice_df)
    purchase_order_as_dict = document_extracted_values_df_to_dict(purchase_order_df)

    provider_invoice = cast(InvoiceFields, provider_invoice_as_dict)
    purchase_order = cast(InvoiceFields, purchase_order_as_dict)

    non_matching_fields: list[str]
    non_matching_fields = get_non_matching_fields(
        provider_invoice, purchase_order
    ).keys()

    return pd.DataFrame(
        [
            [True, False, False, False] # Values to be computed using get_non_matching_fields !!
        ],
        columns=[
            "po_number_defect",
            "amount_defect",
            "receiver_name_defect",
            "vendor_name_defect",
        ],
    )

In [3]:
extraction_prediction_df = pd.read_csv("/home/florent/idscTpPromptEngineering/data/predictions/extraction_prediction.csv")
extraction_prediction_df.groupby("order_name").apply(match_po_and_invoice)

Unnamed: 0_level_0,Unnamed: 1_level_0,po_number_defect,amount_defect,receiver_name_defect,vendor_name_defect
order_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BATAL,0,True,False,False,False
JOBAL,0,True,False,False,False
SAG & LOCH,0,True,False,False,False
TARAFA,0,True,False,False,False
