# Automated workflow for completing the Customer Information Form (CIF)

| 96% say customer onboarding is already, or will become, a top priority [[1](https://oneid.uk/news-and-events/customer-onboarding-in-2023-five-statistics-financial-institutions-need-to-know)]


Raw Transcript → Pre-processing → Structured Information Extraction → Post-processing → CIF Completion

In [None]:
%pip install anthropic "mcp[cli]" pandas jinja2 evidently

### utils

In [None]:
# metrics
import time
import functools
from collections import defaultdict
import pandas as pd


class LogsManager:
    """Accumulate token and timing statistics for API calls.

    Totals are kept overall and per function name, and every call is also
    stored individually together with a timestamp and optional metadata.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard everything collected so far and start from zeroed counters."""
        def _empty_bucket():
            # Per-function counters, created lazily on first use of a name.
            return {"calls": 0, "input_tokens": 0, "output_tokens": 0, "time": 0.0}

        self._current_metrics = {
            "total_calls": 0,
            "total_input_tokens": 0,
            "total_output_tokens": 0,
            "total_time": 0.0,
            "function_calls": defaultdict(_empty_bucket),
            "call_details": [],
        }

    def add_api_call(self, func_name: str, input_tokens: int, output_tokens: int, elapsed_time: float, metadata: dict = None):
        """Record one API call in the totals, the per-function bucket and the call list."""
        metrics = self._current_metrics

        # Overall totals.
        metrics["total_calls"] += 1
        metrics["total_input_tokens"] += input_tokens
        metrics["total_output_tokens"] += output_tokens
        metrics["total_time"] += elapsed_time

        # Totals for this particular function.
        bucket = metrics["function_calls"][func_name]
        bucket["calls"] += 1
        bucket["input_tokens"] += input_tokens
        bucket["output_tokens"] += output_tokens
        bucket["time"] += elapsed_time

        # Individual call record; a missing/empty metadata is stored as {}.
        metrics["call_details"].append({
            "timestamp": time.time(),
            "function": func_name,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "elapsed_time": elapsed_time,
            "metadata": metadata or {},
        })

    def get_current_logs(self):
        """Return a summary dict of everything recorded since the last reset."""
        metrics = self._current_metrics
        tokens_in = metrics["total_input_tokens"]
        tokens_out = metrics["total_output_tokens"]
        return {
            "total_calls": metrics["total_calls"],
            "input_tokens": tokens_in,
            "output_tokens": tokens_out,
            "total_tokens": tokens_in + tokens_out,
            "elapsed_time": metrics["total_time"],
            # Plain dict so consumers do not create buckets by accident.
            "function_breakdown": dict(metrics["function_calls"]),
            "call_details": metrics["call_details"],
        }


# Module-level LogsManager instance that register_metrics() writes into.
logs_manager = LogsManager()

def measure_execution_time(func):
    """Wrap *func* so that it also reports how long the call took.

    Args:
        func: Any callable.

    Returns:
        A wrapper with the same signature whose return value is the tuple
        ``(result, elapsed_seconds)``, where ``result`` is whatever *func*
        returned. Exceptions raised by *func* propagate unchanged.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed (or go negative) when the system clock is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        return result, elapsed_time
    return wrapper

def extract_token_info(response, input_tokens, output_tokens):
    """Print and return the token counts for an API call.

    The counts are taken from the arguments as given; ``response`` is
    accepted but not inspected.

    Returns:
        The tuple ``(input_tokens, output_tokens)``.
    """
    token_pair = (input_tokens, output_tokens)
    print(f"Input tokens: {token_pair[0]}, Output tokens: {token_pair[1]}")
    return token_pair

def register_metrics(func_name, input_tokens, output_tokens, elapsed_time, metadata=None):
    """Forward one API call's metrics to the module-level ``logs_manager``.

    A missing or empty ``metadata`` is recorded as an empty dict.
    """
    call_metadata = metadata if metadata else {}
    logs_manager.add_api_call(func_name, input_tokens, output_tokens, elapsed_time, call_metadata)

def generate_metadata(args):
    """Build call metadata holding a short preview of the positional arguments.

    Returns:
        ``{"args_sample": ...}`` where the value is the first 50 characters
        of ``str(args)``, or an empty string when ``args`` is empty/falsy.
    """
    if not args:
        return {"args_sample": ""}
    return {"args_sample": str(args)[:50]}


def log_api(func):
    """Decorator that records timing and token usage of an API helper.

    The wrapped function is expected to return a 3-tuple
    ``(response_content, input_tokens, output_tokens)``. Any other return
    value triggers a printed warning and is logged with zero tokens.
    The wrapper always returns the wrapped function's result unchanged.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Run the call through the timing helper to get (result, seconds).
        timed_call = measure_execution_time(func)
        original_result, elapsed_time = timed_call(*args, **kwargs)

        has_token_info = isinstance(original_result, tuple) and len(original_result) == 3
        if not has_token_info:
            # Nothing to unpack: log the call anyway, with zero token counts.
            print(f"Warning: Unexpected return format from {func.__name__}. Cannot unpack token info.")
            response_content, input_tokens, output_tokens = original_result, 0, 0
        else:
            response_content, input_tokens, output_tokens = original_result

        input_tokens, output_tokens = extract_token_info(response_content, input_tokens, output_tokens)

        register_metrics(
            func.__name__,
            input_tokens,
            output_tokens,
            elapsed_time,
            generate_metadata(args),
        )

        return original_result

    return wrapper

In [None]:
# utils
import os
import re
import json
from concurrent.futures import ThreadPoolExecutor

import anthropic
from anthropic.types import ToolParam, MessageParam

from jinja2 import Template

# Read the key from the environment. Only report whether it is present:
# printing the key itself would store the secret in the notebook's cell output.
API_KEY = os.getenv("ANTHROPIC_API_KEY")
print(f"ANTHROPIC_API_KEY is {'set' if API_KEY else 'NOT set'}")

# Shared Anthropic client used by the request helpers in this notebook.
client = anthropic.Anthropic(api_key=API_KEY)

@log_api
def text_to_text(system: str, messages: list, model: str = "claude-3-7-sonnet-latest", max_tokens: int = 1024, temperature: float = 0.0) -> tuple:
    """Send a text-only request and return the reply text with token usage.

    Args:
        system: System prompt for the request.
        messages: Conversation messages passed to ``client.messages.create``.
        model: Model identifier.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.

    Returns:
        The tuple ``(response_text, input_tokens, output_tokens)``, where the
        text is taken from the first content block of the response. The
        ``log_api`` decorator records the token counts and hands this tuple
        back to the caller unchanged.
    """
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        system=system,
        messages=messages,
        temperature=temperature,
    )
    response_content = response.content[0].text
    input_tokens = response.usage.input_tokens
    output_tokens = response.usage.output_tokens
    return response_content, input_tokens, output_tokens

@log_api
def text_to_json(system: str, messages: list, tools: list, model: str = "claude-3-7-sonnet-latest", max_tokens: int = 1024, temperature: float = 0.0) -> tuple:
    """Send a tool-use request and return the tool call with token usage.

    Args:
        system: System prompt for the request.
        messages: Conversation messages passed to ``client.messages.create``.
        tools: Tool definitions offered to the model.
        model: Model identifier.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.

    Returns:
        The tuple ``(response_content, input_tokens, output_tokens)``.
        ``response_content`` is a dict with the keys ``type``, ``id``,
        ``name`` and ``input`` describing the first ``tool_use`` block.
        If the response holds no ``tool_use`` block, it is instead the text
        of the first ``text`` block, or ``None`` when there is none either.
        The 3-tuple shape is kept on every path so that ``log_api`` can
        always record the token usage.
    """
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        system=system,
        messages=messages,
        tools=tools,
        tool_choice={"type": "any"},
        temperature=temperature,
    )
    # Usage is reported for the whole response, whichever block type we return.
    input_tokens = response.usage.input_tokens
    output_tokens = response.usage.output_tokens

    tool_response = next((c for c in response.content if c.type == "tool_use"), None)
    if tool_response is not None:
        print(f"text_to_json name: {tool_response.name}")
        response_content = {
            "type": tool_response.type,
            "id": tool_response.id,
            "name": tool_response.name,
            "input": tool_response.input,
        }
        return response_content, input_tokens, output_tokens

    # Fallback: no tool call in the response, hand back plain text if any.
    text_response = next((c for c in response.content if c.type == "text"), None)
    print(f"text_to_json text_response: {text_response}")
    fallback_text = text_response.text if text_response else None
    return fallback_text, input_tokens, output_tokens


def extract_xml(text: str, tag: str) -> str:
    """
    Extracts the content of the specified XML tag from the given text. Used for parsing structured responses.

    Only the first occurrence of ``<tag>...</tag>`` is returned, and the
    opening tag must be written without attributes. The tag name is matched
    literally, so regex metacharacters in ``tag`` (such as ``.`` or ``+``)
    have no special meaning.

    Args:
        text (str): The text containing the XML.
        tag (str): The XML tag to extract content from.

    Returns:
        str: The content of the specified XML tag, or an empty string if the tag is not found.
    """
    escaped_tag = re.escape(tag)
    # DOTALL lets the captured content span multiple lines; the lazy
    # quantifier stops at the first closing tag.
    match = re.search(f"<{escaped_tag}>(.*?)</{escaped_tag}>", text, re.DOTALL)
    return match.group(1) if match else ""

---

## system_prompt_template
Description:
Template for each category's system prompt.

In [None]:
# Shared system-prompt skeleton. The {{ TOOL_DEFINITIONS_IN_JSON_SCHEMA }}
# placeholder is filled in per category with that category's tool schemas as JSON.
system_prompt_template = """
Your main aim is to extract information from the dialogue using specified tools.

In this environment you have access to a set of tools you can use to answer the user's question.
String and scalar parameters should be specified as is, while lists and objects should use JSON format. 
Note that spaces for string values are not stripped. 
The output is not expected to be valid XML and is parsed with regular expressions.
Here are the functions available in JSONSchema format:
{{ TOOL_DEFINITIONS_IN_JSON_SCHEMA }}
"""
# Jinja2 Template built from the same text.
# NOTE(review): the per-category prompts visible in this notebook fill the
# placeholder with str.replace instead - confirm whether `template` is used elsewhere.
template = Template(system_prompt_template)

---

## tools
Description
1. Each category in CIF_form is presented as a set of tools.
2. Each tool represents the name of the **input_field** and its **description**.
3. Each category includes a **custom_system_prompt** with the set of tools relevant to **curr_category**.

Available categories & tools:
- personal
    - personal_client_details_tool
    - current_address_tool
    - previous_addresses_tool
    - dependants_children_tool
- employment
    - employment_client_details_tool
    - incomes_tool
- expenses
    - loan_repayments_tool
    - housing_expenses_tool
    - motoring_expenses_tool
    - personal_expenses_tool
    - professional_expenses_tool
    - miscellaneous_expenses_tool
    - pensions_tool
    - savings_investments_tool
    - other_assets_tool
    - loans_mortgages_tool
- health
    - health_client_details_tool
    - protection_policies_tool
- objectives
    - objectives_tool

### personal

In [None]:
# Tool schema for the client's personal details.
# Every property is a string field and none of them is marked as required.
personal_client_details_tool: ToolParam = {
    "name": "client",
    "description": "Tool to extract detailed personal information about the client.",
    "input_schema": {
        "type": "object",
        "properties": {
            field: {"type": "string", "description": text}
            for field, text in (
                ("title", "Client's title (Mr, Mrs, Ms, etc.)"),
                ("first_name", "Client's first name"),
                ("middle_names", "Client's middle names, if any"),
                ("last_name", "Client's last name"),
                ("pronouns", "Client's preferred pronouns (e.g., he/him, she/her, they/them)"),
                ("date_of_birth", "Client's date of birth in format DD/MM/YYYY"),
                ("place_of_birth", "Client's place of birth"),
                ("nationality", "Client's nationality"),
                ("gender", "Client's gender"),
                ("legal_sex", "Client's legal sex"),
                ("marital_status", "Client's marital status"),
                ("home_phone", "Client's home phone number"),
                ("mobile_phone", "Client's mobile phone number"),
                ("email_address", "Client's email address"),
            )
        },
    },
}

In [None]:
# Tool schema for the client's current address.
# Every property is a string field and none of them is marked as required.
current_address_tool: ToolParam = {
    "name": "current_address",
    "description": "Tool to extract client's current address information.",
    "input_schema": {
        "type": "object",
        "properties": {
            field: {"type": "string", "description": text}
            for field, text in (
                ("ownership_status", "Property ownership status (Owner, Tenant, etc.)"),
                ("postcode", "Postal code"),
                ("house_name_number", "House name or number"),
                ("street_name", "Street name"),
                ("address_line_3", "Additional address line 3"),
                ("address_line_4", "Additional address line 4"),
                ("town_city", "Town or city"),
                ("county", "County or region"),
                ("country", "Country"),
                ("move_in_date", "Date moved to this address in format DD/MM/YYYY"),
            )
        },
    },
}

In [None]:
# Tool schema for the client's previous addresses.
# The required "entries" array holds one object of string fields per address.
previous_addresses_tool: ToolParam = {
    "name": "previous_addresses",
    "description": "Tool to extract client's previous addresses.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of previous addresses",
                "items": {
                    "type": "object",
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("ownership_status", "Property ownership status (Owner, Tenant, etc.)"),
                            ("postcode", "Postal code"),
                            ("house_name_number", "House name or number"),
                            ("street_name", "Street name"),
                            ("town_city", "Town or city"),
                            ("county", "County or region"),
                            ("country", "Country"),
                            ("move_in_date", "Date moved to this address in format DD/MM/YYYY"),
                            ("move_out_date", "Date moved out from this address in format DD/MM/YYYY"),
                        )
                    },
                },
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# Tool schema for the client's dependants/children.
# The required "entries" array holds one object of string fields per dependant.
dependants_children_tool: ToolParam = {
    "name": "dependants_children",
    "description": "Tool to extract information about client's dependants or children from.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of dependants or children",
                "items": {
                    "type": "object",
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("Name", "Full name of the dependant"),
                            ("Date_of_Birth", "Date of birth in format DD/MM/YYYY"),
                            ("Dependent_Until", "Date until which they are expected to be dependent in format DD/MM/YYYY"),
                        )
                    },
                },
            },
        },
        "required": ["entries"],
    },
}


In [None]:
# System prompt for the "personal" category: the shared template with its
# placeholder replaced by the JSON-serialized personal tool schemas.
complete_personal_info_system_prompt = system_prompt_template.replace(
    "{{ TOOL_DEFINITIONS_IN_JSON_SCHEMA }}",
    json.dumps(
        [
            personal_client_details_tool,
            current_address_tool,
            previous_addresses_tool,
            dependants_children_tool,
        ]
    ),
)


### employment

In [None]:
# Tool schema for the client's employment details.
# String fields come first, followed by a free-form "notes" array;
# "employment_status" and "occupation" are required.
employment_client_details_tool: ToolParam = {
    "name": "client",
    "description": "Tool to extract employment information about the client.",
    "input_schema": {
        "type": "object",
        "properties": {
            **{
                field: {"type": "string", "description": text}
                for field, text in (
                    ("country_domiciled", "Country where the client is domiciled"),
                    ("resident_for_tax", "Country where the client is resident for tax purposes"),
                    ("national_insurance_number", "Client's National Insurance or equivalent identification number"),
                    ("employment_status", "Employment status (Employed, Self-employed, Retired, etc.)"),
                    ("desired_retirement_age", "Client's desired retirement age"),
                    ("occupation", "Client's occupation or job title"),
                    ("employer", "Name of the client's employer"),
                    ("employment_started", "Date when current employment started in format DD/MM/YYYY"),
                    ("highest_rate_of_tax_paid", "Highest rate of tax paid by the client"),
                )
            },
            "notes": {
                "type": "array",
                "description": "Any additional notes related to employment",
                "items": {"type": "string"},
            },
        },
        "required": ["employment_status", "occupation"],
    },
}

In [None]:
# Tool schema for the client's income sources.
# Each entry needs owner, name, amount and frequency; "notes" is optional.
incomes_tool: ToolParam = {
    "name": "incomes",
    "description": "Tool to extract information about client's income sources.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of income sources",
                "items": {
                    "type": "object",
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("owner", "Person who owns this income source (Client, Partner, etc.)"),
                            ("name", "Name or description of the income source"),
                            ("amount", "Amount of income"),
                            ("frequency", "Frequency of payment (Monthly, Annually, etc.)"),
                            ("net_gross", "Whether the amount is net or gross"),
                            ("timeframe", "Time period for which this income applies"),
                        )
                    },
                    "required": ["owner", "name", "amount", "frequency"],
                },
            },
            "notes": {
                "type": "array",
                "description": "Any additional notes related to income",
                "items": {"type": "string"},
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# System prompt for the "employment" category: the shared template with its
# placeholder replaced by the JSON-serialized employment tool schemas.
complete_employment_info_system_prompt = system_prompt_template.replace(
    "{{ TOOL_DEFINITIONS_IN_JSON_SCHEMA }}",
    json.dumps([employment_client_details_tool, incomes_tool]),
)

### expenses

In [None]:
# Tool schema for the client's loan repayments.
# The required "entries" array holds one object of string fields per repayment.
loan_repayments_tool: ToolParam = {
    "name": "loan_repayments",
    "description": "Tool to extract information about client's loan repayments.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of loan repayments",
                "items": {
                    "type": "object",
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("owner", "Person who owns this loan (Client, Partner, etc.)"),
                            ("name", "Name or description of the loan"),
                            ("amount", "Repayment amount"),
                            ("frequency", "Frequency of repayment (Monthly, Annually, etc.)"),
                            ("priority", "Priority level of this repayment (High, Medium, Low)"),
                            ("timeframe", "Time period for which this loan repayment applies"),
                        )
                    },
                },
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# Tool schema for the client's housing expenses.
# The required "entries" array holds one object of string fields per expense.
housing_expenses_tool: ToolParam = {
    "name": "housing_expenses",
    "description": "Tool to extract information about client's housing expenses.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of housing expenses",
                "items": {
                    "type": "object",
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("owner", "Person who owns this expense (Client, Partner, etc.)"),
                            ("name", "Name or description of the housing expense"),
                            ("amount", "Amount of the expense"),
                            ("frequency", "Frequency of payment (Monthly, Annually, etc.)"),
                            ("priority", "Priority level of this expense (High, Medium, Low)"),
                            ("timeframe", "Time period for which this housing expense applies"),
                        )
                    },
                },
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# Tool schema for the client's motoring expenses.
# The required "entries" array holds one object of string fields per expense.
motoring_expenses_tool: ToolParam = {
    "name": "motoring_expenses",
    "description": "Tool to extract information about client's motoring expenses.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of motoring expenses",
                "items": {
                    "type": "object",
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("owner", "Person who owns this expense (Client, Partner, etc.)"),
                            ("name", "Name or description of the motoring expense"),
                            ("amount", "Amount of the expense"),
                            ("frequency", "Frequency of payment (Monthly, Annually, etc.)"),
                            ("priority", "Priority level of this expense (High, Medium, Low)"),
                            ("timeframe", "Time period for which this motoring expense applies"),
                        )
                    },
                },
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# Tool schema for the client's personal expenses.
# The required "entries" array holds one object of string fields per expense.
personal_expenses_tool: ToolParam = {
    "name": "personal_expenses",
    "description": "Tool to extract information about client's personal expenses.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of personal expenses",
                "items": {
                    "type": "object",
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("owner", "Person who owns this expense (Client, Partner, etc.)"),
                            ("name", "Name or description of the personal expense"),
                            ("amount", "Amount of the expense"),
                            ("frequency", "Frequency of payment (Monthly, Annually, etc.)"),
                            ("priority", "Priority level of this expense (High, Medium, Low)"),
                            ("timeframe", "Time period for which this personal expense applies"),
                        )
                    },
                },
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# Tool schema for the client's professional expenses.
# The required "entries" array holds one object of string fields per expense.
professional_expenses_tool: ToolParam = {
    "name": "professional_expenses",
    "description": "Tool to extract information about client's professional expenses.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of professional expenses",
                "items": {
                    "type": "object",
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("owner", "Person who owns this expense (Client, Partner, etc.)"),
                            ("name", "Name or description of the professional expense"),
                            ("amount", "Amount of the expense"),
                            ("frequency", "Frequency of payment (Monthly, Annually, etc.)"),
                            ("priority", "Priority level of this expense (High, Medium, Low)"),
                            ("timeframe", "Time period for which this professional expense applies"),
                        )
                    },
                },
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# Tool schema for the client's miscellaneous expenses.
# The required "entries" array holds one object of string fields per expense;
# an optional free-form "notes" array follows it.
miscellaneous_expenses_tool: ToolParam = {
    "name": "miscellaneous_expenses",
    "description": "Tool to extract information about client's miscellaneous expenses.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of miscellaneous expenses",
                "items": {
                    "type": "object",
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("owner", "Person who owns this expense (Client, Partner, etc.)"),
                            ("name", "Name or description of the miscellaneous expense"),
                            ("amount", "Amount of the expense"),
                            ("frequency", "Frequency of payment (Monthly, Annually, etc.)"),
                            ("priority", "Priority level of this expense (High, Medium, Low)"),
                            ("timeframe", "Time period for which this miscellaneous expense applies"),
                        )
                    },
                },
            },
            "notes": {
                "type": "array",
                "description": "Any additional notes related to miscellaneous expenses",
                "items": {"type": "string"},
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# Tool for pensions information.
# Schema: an "entries" array with one object per pension (all attributes are
# strings) plus an optional "notes" array of strings; only "entries" is required.
pensions_tool: ToolParam = {
    "name": "pensions",
    "description": "Tool to extract information about client's pensions.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of pensions",
                "items": {
                    "type": "object",
                    # Every (field, description) pair becomes a string-typed property,
                    # in the order listed here.
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("owner", "Person who owns this pension (Client, Partner, etc.)"),
                            ("type", "Type of pension (Defined Benefit, Personal Pension, etc.)"),
                            ("provider", "Name of the pension provider"),
                            ("value", "Current value of the pension"),
                            ("policy_number", "Pension policy number or identifier"),
                        )
                    },
                },
            },
            "notes": {
                "type": "array",
                "description": "Any additional notes related to pensions",
                "items": {"type": "string"},
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# Tool for savings and investments.
# Schema: an "entries" array with one object per holding (all attributes are
# strings) plus an optional "notes" array of strings; only "entries" is required.
savings_investments_tool: ToolParam = {
    "name": "savings_investments",
    "description": "Tool to extract information about client's savings and investments.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of savings and investments",
                "items": {
                    "type": "object",
                    # Every (field, description) pair becomes a string-typed property,
                    # in the order listed here.
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("owner", "Person who owns this investment (Client, Partner, etc.)"),
                            ("type", "Type of savings or investment (ISA, Bonds, Shares, etc.)"),
                            ("provider", "Name of the provider or institution"),
                            ("value", "Current value of the savings or investment"),
                        )
                    },
                },
            },
            "notes": {
                "type": "array",
                "description": "Any additional notes related to savings and investments",
                "items": {"type": "string"},
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# Tool for other assets.
# Schema: an "entries" array with one object per asset (all attributes are
# strings) plus an optional "notes" array of strings; only "entries" is required.
other_assets_tool: ToolParam = {
    "name": "other_assets",
    "description": "Tool to extract information about client's other assets.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of other assets",
                "items": {
                    "type": "object",
                    # Every (field, description) pair becomes a string-typed property,
                    # in the order listed here.
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("owner", "Person who owns this asset (Client, Partner, etc.)"),
                            ("description", "Description of the asset"),
                            ("current_value", "Current value of the asset"),
                            ("original_value", "Original value of the asset when acquired"),
                        )
                    },
                },
            },
            "notes": {
                "type": "array",
                "description": "Any additional notes related to other assets",
                "items": {"type": "string"},
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# Tool for loans and mortgages.
# Schema: an "entries" array with one object per loan/mortgage (all attributes
# are strings) plus an optional "notes" array of strings; only "entries" is required.
loans_mortgages_tool: ToolParam = {
    "name": "loans_mortgages",
    "description": "Tool to extract information about client's loans and mortgages.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of loans and mortgages",
                "items": {
                    "type": "object",
                    # Every (field, description) pair becomes a string-typed property,
                    # in the order listed here.
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("owner", "Person who owns this loan or mortgage (Client, Partner, etc.)"),
                            ("type", "Type of loan or mortgage (Residential Mortgage, Car Loan, etc.)"),
                            ("provider", "Name of the lender or provider"),
                            ("monthly_cost", "Monthly payment amount"),
                            ("outstanding_value", "Outstanding balance on the loan or mortgage"),
                            ("interest_rate", "Current interest rate"),
                            ("special_rate", "Any special rate details"),
                            ("final_payment", "Date of final payment or end of mortgage term"),
                        )
                    },
                },
            },
            "notes": {
                "type": "array",
                "description": "Any additional notes related to loans and mortgages",
                "items": {"type": "string"},
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# complete_expenses_info_system_prompt
# Substitute the template placeholder with the JSON-serialised schemas of all
# expense, asset and liability tools (json.dumps already returns a str).
complete_expenses_info_system_prompt = system_prompt_template.replace(
    "{{ TOOL_DEFINITIONS_IN_JSON_SCHEMA }}",
    json.dumps(
        [
            loan_repayments_tool,
            housing_expenses_tool,
            motoring_expenses_tool,
            personal_expenses_tool,
            professional_expenses_tool,
            miscellaneous_expenses_tool,
            pensions_tool,
            savings_investments_tool,
            other_assets_tool,
            loans_mortgages_tool,
        ]
    ),
)

### health

In [None]:
# Tool for client health details.
# Schema: a flat object of string-typed answers plus a "notes" array of strings.
# No field is marked as required.
health_client_details_tool: ToolParam = {
    "name": "client",
    "description": "Tool to extract information about client's health details.",
    "input_schema": {
        "type": "object",
        "properties": {
            # Every (field, description) pair becomes a string-typed property,
            # in the order listed here; "notes" is appended last.
            **{
                field: {"type": "string", "description": text}
                for field, text in (
                    ("current_state_of_health", "Current state of health of the client (Good, Fair, Poor, etc.)"),
                    ("state_of_health_explanation", "Additional explanation about client's health condition"),
                    ("smoker", "Whether the client is a smoker (Yes/No)"),
                    ("cigarettes_per_day", "Number of cigarettes per day, if the client is a smoker"),
                    ("smoker_since", "When the client started smoking (year or age)"),
                    ("long_term_care_needed", "Whether the client needs long-term care (Yes/No)"),
                    ("long_term_care_explanation", "Details about the long-term care needs"),
                    ("will", "Whether the client has a will (Yes/No)"),
                    ("information_about_will", "Additional information about the client's will"),
                    ("power_of_attorney", "Whether the client has granted power of attorney to someone (Yes/No)"),
                    ("details_of_individual_with_power_of_attorney", "Information about the person with power of attorney"),
                )
            },
            "notes": {
                "type": "array",
                "description": "Any additional notes related to health details",
                "items": {"type": "string"},
            },
        },
    },
}

In [None]:
# Tool for protection policies.
# Schema: an "entries" array with one object per policy (all attributes are
# strings) plus an optional "notes" array of strings; only "entries" is required.
protection_policies_tool: ToolParam = {
    "name": "protection_policies",
    "description": "Tool to extract information about client's protection policies.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of protection policies",
                "items": {
                    "type": "object",
                    # Every (field, description) pair becomes a string-typed property,
                    # in the order listed here.
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("owner", "Person who owns this policy (Client, Partner, etc.)"),
                            ("type", "Type of protection policy (Life Insurance, Critical Illness, etc.)"),
                            ("provider", "Name of the policy provider"),
                            ("monthly_cost", "Monthly cost of the policy"),
                            ("amount_assured", "Amount assured by the policy"),
                            ("in_trust", "Whether the policy is in trust (Yes/No)"),
                            ("assured_until", "Date or age until which the policy provides coverage"),
                        )
                    },
                },
            },
            "notes": {
                "type": "array",
                "description": "Any additional notes related to protection policies",
                "items": {"type": "string"},
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# complete_health_info_system_prompt
# Substitute the template placeholder with the JSON-serialised schemas of the
# health tools (json.dumps already returns a str).
complete_health_info_system_prompt = system_prompt_template.replace(
    "{{ TOOL_DEFINITIONS_IN_JSON_SCHEMA }}",
    json.dumps(
        [
            health_client_details_tool,
            protection_policies_tool,
        ]
    ),
)

### objectives

In [None]:
# Tool for client's objectives and miscellaneous information.
# Schema: a required "entries" array with one object per objective; all
# attributes of an entry are strings.
objectives_tool: ToolParam = {
    "name": "objectives_extractor",
    "description": "Tool to extract client's objectives, goals, and any other important information that doesn't fit into other categories.",
    "input_schema": {
        "type": "object",
        "properties": {
            "entries": {
                "type": "array",
                "description": "List of client objectives and miscellaneous information",
                "items": {
                    "type": "object",
                    # Every (field, description) pair becomes a string-typed property,
                    # in the order listed here.
                    "properties": {
                        field: {"type": "string", "description": text}
                        for field, text in (
                            ("category", "Category of the objective or information (e.g., Financial, Health, Family, Retirement, etc.)"),
                            ("description", "Detailed description of the objective or information"),
                            ("priority", "Priority level of this objective (High, Medium, Low)"),
                            ("timeframe", "Timeframe for achieving this objective or when this information is relevant"),
                            ("additional_notes", "Any additional notes or context related to this objective or information"),
                        )
                    },
                },
            },
        },
        "required": ["entries"],
    },
}

In [None]:
# complete_objectives_system_prompt
# Substitute the template placeholder with the JSON-serialised schema of the
# objectives tool (json.dumps already returns a str).
complete_objectives_system_prompt = system_prompt_template.replace(
    "{{ TOOL_DEFINITIONS_IN_JSON_SCHEMA }}",
    json.dumps([objectives_tool]),
)

---

## Categories
Description:
Each category bundles its **name**, **system_prompt** and **tools** so that the workflows below can reuse them.

In [None]:
# Registry of extraction categories. Each entry pairs a category name with the
# system prompt and the ordered list of tools used to extract that category.
CATEGORIES = [
    {"name": category_name, "system_prompt": category_prompt, "tools": category_tools}
    for category_name, category_prompt, category_tools in (
        (
            "personal_details",
            complete_personal_info_system_prompt,
            [
                personal_client_details_tool,
                current_address_tool,
                previous_addresses_tool,
                dependants_children_tool,
            ],
        ),
        (
            "employment",
            complete_employment_info_system_prompt,
            [employment_client_details_tool, incomes_tool],
        ),
        (
            "expenses",
            complete_expenses_info_system_prompt,
            [
                loan_repayments_tool,
                housing_expenses_tool,
                motoring_expenses_tool,
                personal_expenses_tool,
                professional_expenses_tool,
                miscellaneous_expenses_tool,
                pensions_tool,
                savings_investments_tool,
                other_assets_tool,
                loans_mortgages_tool,
            ],
        ),
        (
            "health_details",
            complete_health_info_system_prompt,
            [health_client_details_tool, protection_policies_tool],
        ),
        (
            "objectives",
            complete_objectives_system_prompt,
            [objectives_tool],
        ),
    )
]

---

## Workflows

### iterative_extract_category_data 

Complexity
O(T*C + T²)
- T - the number of tools in the selected category (curr_tools).  
- C - the size of the input dialogue (conversation_text).

Description of the solution
1. Identify the category for the given **conversation_text**.
2. Extract information using available **curr_tools** for the **curr_category**, and keep track of the previous extractions in **<extraction_history>**.

In [None]:
def route_conversation(conversation_text: list) -> dict:
    """
    Route the conversation to the appropriate category based on the content.

    The model is asked to pick one of the names in CATEGORIES and to answer in
    an XML format; the <selection> tag is then matched against CATEGORIES.
    Matching ignores surrounding whitespace, surrounding quotes/backticks and
    letter case, because the options are shown to the model as a quoted list.

    Args:
        conversation_text (list): The conversation text to be analyzed.
    Returns:
        category (dict): The entry of CATEGORIES that best fits the conversation content.
    Raises:
        ValueError: If the model's selection does not match any category name
            (previously this silently returned None and callers failed later).
    """
    system_prompt = "You are an expert in analyzing conversations. Your task is to identify which category of information is most prominent in the given conversation segment."

    branching: MessageParam = {
        "role": "user",
        "content": f"""Analyze the responses in this conversation and select the most appropriate category for information extraction from these options: {[cat["name"] for cat in CATEGORIES]}
            First explain your reasoning.
            Second provide your selection in this XML format:

            <reasoning>
            Brief explanation of why this conversation segment contains information relevant to a specific category.
            Consider the topics discussed, specific details mentioned, and overall context.
            </reasoning>
            
            <selection>
            The chosen category name
            </selection>
            
            <conversation_segment>
            {conversation_text}
            </conversation_segment>
            """
    }

    # The token counts returned alongside the text are not used here.
    route_response, _input_tokens, _output_tokens = text_to_text(system=system_prompt, messages=[branching])
    reasoning = extract_xml(route_response, 'reasoning')  # kept for monitoring/debugging; not printed
    selected_category = extract_xml(route_response, 'selection')

    print("Routing Analysis:")
    print(f"Selected route: {selected_category}")

    # Normalise once: tolerate a missing tag, stray whitespace, quotes and case.
    normalized = (selected_category or "").strip().strip("\"'`").strip().casefold()

    for category in CATEGORIES:
        if category["name"].casefold() == normalized:
            return category  # actual category

    raise ValueError(
        f"Could not route conversation: selection {selected_category!r} "
        f"is not one of {[cat['name'] for cat in CATEGORIES]}"
    )

def iterative_extract_category_data(conversation_text: list, **kwargs) -> dict:
    """
    Extract one category's data from the conversation, one tool at a time.

    Tools are applied sequentially; the results gathered so far are embedded
    in each prompt inside <extraction_history>.

    Args:
        conversation_text (list): The conversation text to be analyzed.
        category (Optional[dict]): Keyword argument; when present it is used
            directly, otherwise the category is chosen by route_conversation.
    Returns:
        result: A dict mapping the category name to a list of
            {tool name: extracted input} dicts, in tool order.
    """
    # An explicitly supplied category bypasses the routing call.
    branch = kwargs['category'] if 'category' in kwargs else route_conversation(conversation_text)

    print(f"Selected category: {branch['name']}")

    tools = branch["tools"]
    category_name = branch["name"]

    extracted = []
    result = {category_name: extracted}

    step = 0
    for tool in tools:
        step += 1
        print(f"\nStep {step}:")

        # `result` is rendered into the prompt before this step's output is added.
        extract_message: MessageParam = {
            "role": "user",
            "content": f"""Use "name": "{tool["name"]}" only to extract relevant data from the client responses in this conversation.
            Present it in a structured format.
            Here is what is already known:
            
            <extraction_history>
            {result}
            </extraction_history>

            If the information is not relevant, leave the field as "<UNKNOWN>".
            If the information is not present, leave the field as "<UNKNOWN>".
                    
            <conversation>
            {conversation_text}
            </conversation>
            """
        }

        # The two token counts returned with the response are not used here.
        response, _tokens_in, _tokens_out = text_to_json(
            system=branch["system_prompt"],
            messages=[extract_message],
            tools=[tool],
        )

        extracted.append({tool["name"]: response["input"]})

    return result

### parallel_extract_category_data

Complexity **O(C + T)**
- **T** - the number of tools in the selected category (curr_tools).  
- **C** - the size of the input dialogue (conversation_text).

Description of the solution
1. Identify the category for the given **conversation_text**, or use the category that was passed in.
2. Extract information using available **curr_tools** for the **curr_category** in parallel.

Result: JSON

In [None]:
def parallel_extract_category_data(conversation_text: list, **kwargs) -> dict:
    """
    Extract data from the conversation based on the selected category.

    Every tool of the category is run in its own worker thread; results are
    collected in the same order as the category's tool list.

    Args:
        conversation_text (list): The conversation text to be analyzed.
        category (Optional[dict]): The category to use for extraction. When it
            is missing or None the category is chosen by route_conversation.
    Returns:
        result: A dict mapping the category name to a list of
            {tool name: extracted input} dicts. The list is empty when the
            category has no tools.
    """
    branch = kwargs.get('category')
    if branch is None:
        branch = route_conversation(conversation_text)

    print(f"Selected category: {branch['name']}")

    curr_tools = branch["tools"]
    curr_category = branch["name"]

    result = {curr_category: []}

    # ThreadPoolExecutor rejects max_workers=0, so a category without tools
    # must be handled before a pool is created.
    if not curr_tools:
        return result

    def process_tool(tool):
        """Run a single extraction call restricted to `tool`."""
        print(f"Processing tool: {tool['name']}")

        extract_message: MessageParam = {
            "role": "user",
            "content": f"""Use "name": "{tool["name"]}" only to extract relevant data from the client responses in this conversation.
            Present it in a structured format.

            If the information is not relevant, leave the field as "<UNKNOWN>".
            If the information is not present, leave the field as "<UNKNOWN>".
            
            <conversation>
            {conversation_text}
            </conversation>
            """
        }

        # The two token counts returned with the response are not used here.
        response, _tokens_in, _tokens_out = text_to_json(
            system=branch["system_prompt"],
            messages=[extract_message],
            tools=[tool],
        )

        return {tool["name"]: response["input"]}

    with ThreadPoolExecutor(max_workers=len(curr_tools)) as executor:
        # map() submits every tool up front and yields results in input order;
        # unlike a dict keyed by tool name it cannot drop a result when two
        # tools happen to share a name.
        result[curr_category].extend(executor.map(process_tool, curr_tools))

    return result

---

### iterative_conversation_extraction

Complexity **O(T_total * C + T_total²)**
- **C** - size of the input dialogue (conversation_text).  
- **T_total** - total number of tools across all categories (sum(len(category['tools']) for category in CATEGORIES)).

Description of the solution:
1. Iterate through each **category** in **CATEGORIES**
2. Iterate through each **tool** in **curr_tools**

Result: JSON

In [None]:
def iterative_conversation_extraction(conversation_text: list) -> dict:
    """
    Extract information by walking every category in CATEGORIES and every tool in it.

    All calls are made sequentially. The results accumulated so far (across
    all categories) are embedded in each prompt inside <extraction_history>.

    Args:
        conversation_text (list): The conversation text to be analyzed.
    Returns:
        dict: Maps each category name to a list of {tool name: extracted input}
            dicts, in tool order.
    """
    result = {}
    total_categories = len(CATEGORIES)

    for category_index, category in enumerate(CATEGORIES, 1):
        print(f"\nCategory -> {category_index}/{total_categories}: {category['name']}")

        category_name = category["name"]
        tools = category["tools"]
        system_prompt = category["system_prompt"]

        # Register the (still empty) category first so it already shows up in
        # the extraction history rendered for its own tools.
        extracted = []
        result[category_name] = extracted

        total_tools = len(tools)
        for tool_index, tool in enumerate(tools, 1):
            print(f"Step {tool_index}/{total_tools}: Extracting with {tool['name']}")

            extract_message: MessageParam = {
                "role": "user",
                "content": f"""Use "name": "{tool["name"]}" only to extract relevant data from the client responses in this conversation.
                Present it in a structured format.
                
                Here is what is already known:
                <extraction_history>
                {result}
                </extraction_history>

                If the information is not relevant, leave the field as "<UNKNOWN>".
                If the information is not present, leave the field as "<UNKNOWN>".
                
                <conversation>
                {conversation_text}
                </conversation>
                """
            }

            # The two token counts returned with the response are not used here.
            response, _tokens_in, _tokens_out = text_to_json(
                system=system_prompt,
                messages=[extract_message],
                tools=[tool],
            )
            extracted.append({tool["name"]: response["input"]})

    return result

### parallel_conversation_extraction

Complexity **O(T_max * C + T_max² + N)**
- **C** - the size of the input dialogue (conversation_text).  
- **T_max** - the maximum number of tools in a single category (each category is processed sequentially by iterative_extract_category_data).  
- **N** - the total number of categories.

This approach is significantly more efficient than **iterative_conversation_extraction**.

Description of the solution:
1. Create a thread pool that runs each **category** in **CATEGORIES** in parallel
2. Each category processed by **iterative_extract_category_data**

Result: JSON

In [None]:
def parallel_conversation_extraction(conversation_text: list) -> dict:
    """
    Extract all categories concurrently, one worker thread per category.

    Each category is handled by iterative_extract_category_data, so the tools
    inside a single category still run one after another.

    Args:
        conversation_text (list): The conversation text to be analyzed.
    Returns:
        dict: The per-category results merged into one dict keyed by category name.
    """
    results = {}
    with ThreadPoolExecutor(max_workers=len(CATEGORIES)) as executor:
        # Submit everything first, then collect in submission order.
        pending = {
            category["name"]: executor.submit(
                iterative_extract_category_data, conversation_text, category=category
            )
            for category in CATEGORIES
        }

        for future in pending.values():
            results.update(future.result())

    return results

### nested_parallel_conversation_extraction

Complexity **O(C + T_max + N)**  
- **C** - size of the input dialogue (conversation_text).  
- **T_max** - maximum number of tools in a single category.  
- **N** - total number of categories.  

This approach is the most efficient among considered.
It parallelizes both the processing of categories and the tools within each category.

Description of the solution:
1. Create a thread pool that runs each **category** in **CATEGORIES** in parallel
2. Each category processed by **parallel_extract_category_data**

Result: JSON

In [None]:
def nested_parallel_conversation_extraction(conversation_text: list) -> dict:
    """
    Extract all categories concurrently, with the tools of each category also in parallel.

    One worker thread is started per category; each of them delegates to
    parallel_extract_category_data, which opens its own pool for the tools.

    Args:
        conversation_text (list): The conversation text to be analyzed.
    Returns:
        dict: The per-category results merged into one dict keyed by category name.
    """
    results = {}
    with ThreadPoolExecutor(max_workers=len(CATEGORIES)) as executor:
        # Submit everything first, then collect in submission order.
        pending = {
            category["name"]: executor.submit(
                parallel_extract_category_data, conversation_text, category=category
            )
            for category in CATEGORIES
        }

        for future in pending.values():
            results.update(future.result())

    return results

---

### Process conversation with given approach

I assume that the raw text has already been preprocessed. There are two possible approaches:
1. Collect all the data for the conversation first and then process it iteratively to extract the information.
2. Process chunks of the conversation and fill in the form while the conversation is still in progress.

I think both solutions could be useful for different Adviser workflows. I assume there are very experienced specialists who know exactly how the communication process should go, are fully immersed in it, and want to see the result once the conversation is over.
I also assume the second approach may be useful for another type of person: the Adviser talks while looking at the computer screen, the form is filled in during the dialogue, and, seeing the current state of the form, the specialist can steer the call to collect or clarify the form fields.


In [None]:
def process_conversation(df: pd.DataFrame, extraction_function) -> None:
    """
    Processes all conversation phases from the dataframe using the specified extraction function
    and saves the structured result as a JSON file.

    For every row the logs are reset, the extraction function is called with
    the row's messages, and one entry (form + metrics) is appended to the
    dataset. The dataset is written to
    ``../results/result_<function name>_<client name>.json``; the target
    directory is created when it does not exist. Failures while saving are
    reported with a printed message instead of being raised.

    Args:
        df (pd.DataFrame): DataFrame containing client dialogues. The columns
            ``messages``, ``client_name`` and ``dialog_stage`` are read.
        extraction_function (callable): The function to use for extracting information.
            It is called as ``extraction_function(conversation_text=...)``.
    """
    import os  # local import: only needed to prepare the output directory

    dataset = []
    client_name_for_file = df.client_name.iloc[0] if not df.empty else "unknown_client" # Use first client name for the file
    # Callables such as functools.partial objects have no __name__.
    function_name = getattr(extraction_function, "__name__", type(extraction_function).__name__)

    print(f"Processing {len(df)} dialogues using {function_name}...")

    for i, row in df.iterrows():
        # Extract necessary data from the current row
        conversation = row.messages
        client_name = row.client_name
        dialog_stage = row.dialog_stage

        # Reset logs for the current conversation
        logs_manager.reset()

        # Extract information using the provided function
        result = extraction_function(conversation_text=conversation)

        # Get metrics for the extraction process
        metrics = logs_manager.get_current_logs()

        # Create the entry for the dataset
        # NOTE(review): LogsManager stores "total_time", "total_input_tokens" and
        # "total_output_tokens" internally; confirm that get_current_logs() really
        # exposes "elapsed_time", "input_tokens", "output_tokens" and
        # "total_tokens", otherwise these values are always None.
        form_entry = {
            "id": i + 1,  # assumes an integer row index - TODO confirm
            "dialog_stage": dialog_stage,
            "client_name": client_name,
            "form": result,
            "metrics": {
                "elapsed_time": metrics.get("elapsed_time"),
                "input_tokens": metrics.get("input_tokens"),
                "output_tokens": metrics.get("output_tokens"),
                "total_tokens": metrics.get("total_tokens"),
                "api_calls": metrics.get("total_calls")
            }
        }
        dataset.append(form_entry)

    file_path = f"../results/result_{function_name}_{client_name_for_file}.json"

    try:
        # Without this the first run on a fresh checkout loses every result,
        # because open() fails when the directory is missing.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(dataset, f, indent=2, ensure_ascii=False)
        print(f"\nProcessing complete. Results saved to: {file_path}")
    except Exception as e:
        # Best-effort save: report the problem but do not abort the notebook run.
        print(f"An error occurred during file saving: {e}")


In [None]:
# Load the stored dialogue file into a DataFrame
# (presumably one row per dialogue stage - verify against the file).
df = pd.read_json('../conversations/dialog_Oprah Gail Winfrey.json')


# Run the fully sequential extraction over every row and save the result file.
process_conversation(df=df, extraction_function=iterative_conversation_extraction)

---

### Merging results [_alpha]
Merge the individual forms into a single merged_result.

In [None]:
import json
from typing import Any, List, Dict, Tuple
from pydantic import BaseModel, Field


class Form(BaseModel):
    """Customer Information Form whose sections can be merged across extractions.

    Every section is a list of dicts. ``merge`` flattens each section into a
    ``key -> value`` mapping, combines the mappings of both forms and emits the
    result as a list of single-key dicts (``[{key: value}, ...]``).

    The literal string ``"<UNKNOWN>"`` is treated as a placeholder for a missing
    value: it never wins over a known value and is considered similar to anything.
    """

    # One list per form section; each defaults to an empty list.
    personal_details: List[Dict[str, Any]] = Field(default_factory=list)
    employment: List[Dict[str, Any]] = Field(default_factory=list)
    expenses: List[Dict[str, Any]] = Field(default_factory=list)
    health_details: List[Dict[str, Any]] = Field(default_factory=list)
    objectives: List[Dict[str, Any]] = Field(default_factory=list)

    def merge(self, other_form: 'Form') -> 'Form':
        """Return a new Form combining this form with ``other_form``.

        Neither input form is modified. For every section, keys present in only
        one form are carried over as-is; keys present in both are combined with
        ``_merge_values``. Keys keep the order in which they were first seen
        (this form first, then ``other_form``).

        Args:
            other_form: The form to merge into this one.

        Returns:
            A new ``Form`` whose sections are lists of single-key dicts.
        """
        merged = Form()

        for field_name in self.__class__.model_fields:
            # _normalize_items returns a fresh dict, so it is safe to update in place.
            merged_items = self._normalize_items(getattr(self, field_name))
            other_items = self._normalize_items(getattr(other_form, field_name))

            for key, value in other_items.items():
                if key in merged_items:
                    merged_items[key] = self._merge_values(merged_items[key], value)
                else:
                    merged_items[key] = value

            setattr(merged, field_name, [{k: v} for k, v in merged_items.items()])

        return merged

    def _normalize_items(self, items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Flatten a list of item dicts into one ``key -> value`` dict.

        Every key of every item is kept, so an item carrying several keys no
        longer loses all but its first one. Empty or ``None`` items are skipped.
        If the same key occurs more than once in ``items``, the last value wins.
        """
        flattened: Dict[str, Any] = {}
        for item in items:
            if not item:
                continue
            flattened.update(item)
        return flattened

    def _is_similar_entry(self, item1: Any, item2: Any) -> bool:
        """Heuristically decide whether two list entries describe the same thing.

        Two dicts are similar when they share at least one key and at least half
        of the shared keys hold similar values. Any other pair is compared
        directly with ``_check_value_similarity``.
        """
        if isinstance(item1, dict) and isinstance(item2, dict):
            common_keys = set(item1.keys()) & set(item2.keys())
            if not common_keys:
                return False

            similar_values = sum(
                1 for k in common_keys
                if self._check_value_similarity(item1.get(k), item2.get(k))
            )
            # Consider items similar if at least half of the common keys have similar values
            return similar_values >= len(common_keys) / 2

        return self._check_value_similarity(item1, item2)

    def _check_value_similarity(self, value1: Any, value2: Any) -> bool:
        """Heuristically decide whether two values are similar.

        ``"<UNKNOWN>"`` is similar to everything and equal values are similar.
        Two strings are compared after lowercasing and stripping
        non-alphanumeric characters: short strings (fewer than 5 characters) by
        substring containment, longer ones by overlap of their character sets.
        Values of any other types are similar only when equal.
        """
        if value1 == "<UNKNOWN>" or value2 == "<UNKNOWN>":
            # If one value is UNKNOWN, consider them potentially similar for merging purposes
            return True

        if value1 == value2:
            return True

        # Handle string comparison with some tolerance
        if isinstance(value1, str) and isinstance(value2, str):
            # Normalize strings (lowercase, alphanumeric only)
            v1 = ''.join(c.lower() for c in value1 if c.isalnum())
            v2 = ''.join(c.lower() for c in value2 if c.isalnum())

            # An empty string is a substring of every string, so without this guard
            # a blank or punctuation-only value would match any other value.
            if not v1 or not v2:
                return v1 == v2

            # Simple substring check for short strings
            if len(v1) < 5 or len(v2) < 5:
                return v1 in v2 or v2 in v1

            # Check character overlap for longer strings (heuristic)
            return len(set(v1) & set(v2)) >= min(len(set(v1)), len(set(v2))) / 2

        # For other types, strict equality is required unless one was UNKNOWN
        return False

    def _merge_values(self, value1: Any, value2: Any) -> Any:
        """Merge two values stored under the same key.

        Rules, applied in order:

        1. If either value is ``"<UNKNOWN>"``, the other one is returned.
        2. Two dicts are merged key by key, recursively.
        3. Two lists are merged: a list starting with a string is concatenated
           and de-duplicated in first-seen order; otherwise each entry of
           ``value2`` is merged into the first similar dict of ``value1``,
           dropped if a similar non-mergeable entry exists, or appended.
        4. Equal values collapse to ``value1``.
        5. Anything else becomes a list holding the entries of both values
           (entries of ``value2`` already present in ``value1`` are skipped).

        The inputs are never mutated; dicts and lists are shallow-copied first.
        """
        # Prioritize the known value if one is UNKNOWN
        if value1 == "<UNKNOWN>":
            return value2
        if value2 == "<UNKNOWN>":
            return value1

        # Recursively merge dictionaries
        if isinstance(value1, dict) and isinstance(value2, dict):
            result = value1.copy()
            for k, v in value2.items():
                result[k] = self._merge_values(result[k], v) if k in result else v
            return result

        # Merge lists
        if isinstance(value1, list) and isinstance(value2, list):
            if value1 and isinstance(value1[0], str):
                # De-duplicate with list membership rather than set(): the order is
                # deterministic and unhashable entries cannot raise TypeError.
                unique_items: List[Any] = []
                for item in value1 + value2:
                    if item not in unique_items:
                        unique_items.append(item)
                return unique_items

            # For lists of complex objects (like dicts), merge similar items or append new ones
            result = value1.copy()
            for item2 in value2:
                found_similar = False
                for i, item1 in enumerate(result):
                    if not self._is_similar_entry(item1, item2):
                        continue
                    found_similar = True
                    if isinstance(item1, dict) and isinstance(item2, dict):
                        result[i] = self._merge_values(item1, item2)
                        break
                    # Similar but not a pair of dicts: keep scanning for a dict to merge into.
                # Only entries with no similar counterpart are added as new items.
                if not found_similar:
                    result.append(item2)
            return result

        # If values are identical, return one
        if value1 == value2:
            return value1

        # Conflicting values that cannot be merged structurally are kept side by side.
        list1 = value1 if isinstance(value1, list) else [value1]
        list2 = value2 if isinstance(value2, list) else [value2]
        # List membership compares with ==, so unhashable entries (dicts) are fine here.
        return list1 + [item for item in list2 if item not in list1]

def post_processing_conversation(input_file: str) -> Tuple[Form, List[Dict[str, Any]]]:
    """
    Processes a JSON file, merging the form of every entry into a single Form.

    The file is expected to contain a JSON list of objects. The ``form`` value of
    each object (when present and non-empty) is validated against ``Form`` and
    merged, in file order, into the accumulated result. Problems never raise:
    they are collected and returned alongside whatever could be merged.

    Args:
        input_file: Path to the input JSON file.

    Returns:
        A tuple containing the merged Form object and a list of errors. A
        file-level problem yields an empty Form and a single ``{"error": ...}``
        record; a per-entry problem yields a record with ``entry_index``,
        ``error`` and ``entry_id`` and processing continues with the next entry.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        # Return an empty Form and the loading error
        return Form(), [{"error": f"Error loading file: {str(e)}"}]

    # Iterating a JSON object would yield its keys (strings), not entries.
    if not isinstance(data, list):
        return Form(), [{
            "error": f"Error loading file: expected a JSON list of entries, got {type(data).__name__}"
        }]

    merged_form = Form()
    errors: List[Dict[str, Any]] = []

    for i, entry in enumerate(data):
        # Guard before using .get(): a non-dict entry used to raise AttributeError
        # inside the except handler below and abort the whole run.
        if not isinstance(entry, dict):
            errors.append({
                "entry_index": i,
                "error": f"Error processing entry: expected an object, got {type(entry).__name__}",
                "entry_id": "unknown"
            })
            continue

        try:
            # Entries without a 'form' key (or with an empty form) are skipped silently
            if form_data := entry.get('form'):
                # Validate the data against the Form model
                current_form = Form.model_validate(form_data)
                # Merge the current form into the overall merged_form
                merged_form = merged_form.merge(current_form)
        except Exception as e:
            # Record any validation or merging errors and keep going
            errors.append({
                "entry_index": i,
                "error": f"Error processing entry: {str(e)}",
                "entry_id": entry.get("id", "unknown")  # Include an ID if available
            })

    return merged_form, errors


def save_merged_form(merged_form: Form, output_path: str) -> None:
    """
    Writes a merged form to disk as indented, UTF-8 encoded JSON.

    This is best-effort: any failure while opening, serializing or writing is
    reported on stdout instead of being raised to the caller.

    Args:
        merged_form: The Form instance to save.
        output_path: The path to save the JSON file.
    """
    try:
        with open(output_path, mode='w', encoding='utf-8') as out_file:
            # model_dump() (Pydantic v2+) turns the model into plain dicts and lists
            payload = merged_form.model_dump()
            json.dump(payload, out_file, indent=2, ensure_ascii=False)
        print(f"Results saved to file: {output_path}")
    except Exception as exc:
        print(f"Error saving form: {exc!s}")

In [None]:

# NOTE(review): input_path is an absolute, machine-specific path while output_path is
# relative to the notebook; adjust input_path when running elsewhere.
input_path = '/Users/dimadem/Documents/nevis_cif_research/results/10_conversations/9.json'
output_path = '../results/merged/10_conversations/merged_result_9.json'

# `e` is the list of error records collected while loading and merging the entries.
result, e = post_processing_conversation(input_path)

# Report problems instead of discarding them: post_processing_conversation never raises,
# so an unreadable file would otherwise just produce an empty merged form.
if e:
    print(f"{len(e)} error(s) while merging {input_path}:")
    for error_record in e:
        print(f"  {error_record}")

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(result.model_dump(), f, indent=2, ensure_ascii=False)
    print(f"\nMerged results saved to file: {output_path}")