In [56]:
from dotenv import load_dotenv
import os
import json
import csv
import requests
import pandas as pd

load_dotenv()

# Function to interact with Ollama API
def generate_text(prompt, model_name="gemma2:9b-instruct-fp16", timeout=600, num_gpu=26):
    """Send one non-streaming generation request to the Ollama API.

    Args:
        prompt: Full prompt text to send to the model.
        model_name: Name of the Ollama model to run.
        timeout: Seconds to wait for the server before giving up, so a dead
            or stalled server cannot hang the notebook forever. Pass ``None``
            to wait indefinitely.
        num_gpu: Value of the ``num_gpu`` request option. This is a number of
            GPU "layers", not exactly the number of GPUs.

    Returns:
        The ``"response"`` field of the JSON body returned by the server.

    Raises:
        ValueError: If the ``OLLAMA_API_URL`` environment variable is not set.
        requests.HTTPError: If the server answers with an error status code.
    """
    url = os.getenv("OLLAMA_API_URL")
    if not url:
        # Fail early with an actionable message instead of an obscure error
        # from deep inside the HTTP client.
        raise ValueError("Environment variable OLLAMA_API_URL is not set")

    data = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "options": {
            "num_gpu": num_gpu,
        },
    }
    response = requests.post(url, json=data, timeout=timeout)
    # Surface HTTP errors directly rather than as a KeyError on "response".
    response.raise_for_status()
    return response.json()["response"]

# Define the check_transcript function

def check_transcript(transcript):
    """Ask the model to extract the key client facts from a call transcript.

    Args:
        transcript: Text of the phone-call transcript.

    Returns:
        The JSON object produced by the model, parsed into a dict. If no JSON
        object can be recovered from the reply, returns
        ``{"error": "Failed to parse JSON", "raw_output": <reply>}``.
    """
    # The template below must itself be valid JSON apart from the <...>
    # placeholders, so the last entry carries no trailing comma.
    INSTRUCTIONS = """You are an AI assistant that extracts important facts from a transcripts of a phone call to a bank. ALWAYS provide your responses in the following JSON format which contains the information you need to extract:
    {
    "name": <the name of the person speaking, always comes first in the transcript>,
    "birthday": <(optional) the birthdate of the person>,
    "marital_status": <(optional) the marital status of the person>,
    "account_nr": <(optional) the account number of the person>,
    "tax_residency": <(optional) the tax residency of the person>,
    "net_worth_in_millions": <(optional) the net worth of the person>,
    "profession": <(optional) the profession of the person>,
    "social_security_number": <(optional) the social security number of the person>,
    "relationship_manager": <(optional) the relationship manager of the person>,
    "highest_previous_education": <(optional) the highest previous education of the person>
    }
    Ensure your response can be parsed as valid JSON (it has to start and end with curly braces and nothing else). Do not include any text outside of this JSON structure in your response.
    """
    prompt = f"""{INSTRUCTIONS}

    And here is the transcript:
    "{transcript}"
    Extract the key facts from the transcript, account for transcription errors and try to correct them. Provide your response in the requested JSON format.
    """

    response = generate_text(prompt)

    # Try the raw reply first; if that fails, retry with only the outermost
    # {...} span so code fences or stray prose around the object are ignored.
    candidates = [response]
    start, end = response.find("{"), response.rfind("}")
    if 0 <= start < end:
        candidates.append(response[start:end + 1])

    for candidate in candidates:
        try:
            result = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers index the result by key, so only a JSON object is usable.
        if isinstance(result, dict):
            return result

    return {"error": "Failed to parse JSON", "raw_output": response}

# Load client features
# The path is handed straight to pandas so the file is decoded the same way
# as every other read of client_features.csv in this notebook (pandas' UTF-8
# default rather than the platform locale), and no file handle stays open
# while the table is being cleaned.
client_features = pd.read_csv(os.getenv("DATA_PATH") + "/client_features.csv")

# remove all symbols from social security numbers except numbers
client_features["social_security_number"] = client_features["social_security_number"].str.replace(r"\D", "", regex=True)

# convert to string
client_features_string = client_features.astype(str)


# Load transcripts: map each audio file id to its transcription text.
# newline="" lets the csv module do its own newline handling, which it needs
# to read quoted fields containing line breaks correctly.
with open(os.getenv("DATA_PATH") + "/transcriptions.csv", "r", newline="") as file:
    transcripts = {
        row["Audio File"]: row["Transcription"]
        for row in csv.DictReader(file)
    }

In [42]:
# Try the fact extraction on a single transcript.
transcript_id = "221Y2P3R9J"
print(transcripts[transcript_id])
# check_transcript takes only the transcript text.
check_transcript(transcripts[transcript_id])

 Hello, I'm Mia Anderson, holding a Bachelor of Science in Computer Science and managed by Ella Morrison. Can you set up a secret trust for me?


{'name': 'Mia Anderson',
 'relationship_manager': 'Ella Morrison',
 'highest_previous_education': 'Bachelor of Science in Computer Science'}

In [43]:
import pandas as pd
from rapidfuzz import process, fuzz

# Client table used for name matching, read directly from the CSV.
df = pd.read_csv(os.getenv("DATA_PATH") + "/client_features.csv")

# Keep only the digits of each social security number.
df = df.assign(
    social_security_number=df["social_security_number"].str.replace(r"\D", "", regex=True)
)

# Example of extracted facts, used to try out the matcher.
query = dict(
    name='Mia Anderson',
    highest_previous_education='Bachelor of Science in Computer Science',
    relationship_manager='Ella Morrison',
)

def best_match(df, query):
    """Return the row of ``df`` whose ``name`` is most similar to the query's.

    Args:
        df: Client table with a ``name`` column.
        query: Mapping of extracted facts; only ``query["name"]`` is used.

    Returns:
        The best-scoring row as a pandas Series. On ties the first such row
        is returned.

    Raises:
        KeyError: If ``query`` has no ``"name"`` entry.
    """
    target_name = query['name']

    # Score only the name column instead of building a Series per row.
    name_scores = df['name'].map(lambda candidate: fuzz.ratio(candidate, target_name))

    # Pick the winner by position: an index label (what idxmax returns) is
    # only a valid .iloc argument when the frame has a default 0..n-1 index.
    best_position = int(name_scores.to_numpy().argmax())
    best_name_match_row = df.iloc[best_position]

    return best_name_match_row

# Look up the client row whose name best matches the example query and print it.
result = best_match(df, query)
print(result)


name                                           Mia Anderson
birthday                                         14.10.1985
marital_status                                      Married
account_nr                                           ZR1001
tax_residency                                         India
net_worth_in_millions                             $430 Mio.
profession                                   Music Producer
social_security_number                        7568098628111
relationship_manager                             Aisha Khan
highest_previous_education    Certificate in Dental Hygiene
Name: 9, dtype: object


In [44]:
# Render the matched row's values as one comma-separated line
# (leading padding of each rendered value is stripped).
",".join(map(str.lstrip, result.to_string(header=False, index=False).split("\n")))

'Mia Anderson,14.10.1985,Married,ZR1001,India,$430 Mio.,Music Producer,7568098628111,Aisha Khan,Certificate in Dental Hygiene'

In [52]:
def check_if_row_matches(transcript, matched_person_string):
    """Ask the model whether a transcript agrees with a client record.

    Args:
        transcript: Text of the phone-call transcript.
        matched_person_string: The client record. A string is inserted into
            the prompt as-is. An object with a ``to_json`` method (such as a
            pandas Series) or a plain dict/list is serialised to JSON first,
            so the prompt really contains the JSON it announces.

    Returns:
        The model's verdict parsed into a dict (the prompt requests the keys
        ``is_matching_person`` and ``reasoning``). If no JSON object can be
        recovered from the reply, returns
        ``{"error": "Failed to parse JSON", "raw_output": <reply>}``.
    """
    if isinstance(matched_person_string, str):
        record_json = matched_person_string
    elif hasattr(matched_person_string, "to_json"):
        # Keep non-ASCII characters readable instead of \uXXXX escapes.
        record_json = matched_person_string.to_json(force_ascii=False)
    elif isinstance(matched_person_string, (dict, list)):
        record_json = json.dumps(matched_person_string, ensure_ascii=False, default=str)
    else:
        record_json = str(matched_person_string)

    INSTRUCTIONS = """You are an AI assistant that needs to verify whether a transcript of a phone call matches the json record. ALWAYS provide your responses in the following JSON format:
    {
        "is_matching_person": "<yes|no>",
        "reasoning": "<short reasoning>"
    }
    """
    prompt = f"""{INSTRUCTIONS}

    Here is the transcript of the phone call:
    "{transcript}"
    And here is the factual real information in a json format:
    {record_json}
    Is the transcript factually correct according to the json i.e. is it the same person? While checking for fact correctness you must account for some transcription errors or typos e.g. homophones (e.g. i vs y, kh vs k), additional whitespace especially with names and surnames because they might not match exactly. Transcription errors also happen with social security numbers: sometimes there are dashes, hyphens or dots - ignore them when comparing to the number.
    Marital status has to match almost exactly, not a different word. Tax residency has to be the same area.
    Analyze the data and provide your response in the requested JSON format without backticks.
    """

    response = generate_text(prompt)

    # Try the raw reply first; if that fails, retry with only the outermost
    # {...} span so code fences or stray prose around the object are ignored.
    candidates = [response]
    start, end = response.find("{"), response.rfind("}")
    if 0 <= start < end:
        candidates.append(response[start:end + 1])

    for candidate in candidates:
        try:
            result = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers index the result by key, so only a JSON object is usable.
        if isinstance(result, dict):
            return result

    return {"error": "Failed to parse JSON", "raw_output": response}


#check_if_row_matches(transcripts[transcript_id], result.to_json())

In [53]:
# Sample of transcript ids for a manual spot check of the pipeline.
# Each id appears once so no transcript is sent to the model twice.
transcript_ids = [
"12MINIG2V7",
"M97BEZCB1V",
"OW6VLRXPJT",
"P4TDYTUQNV",
"0RASQ7TQKJ",
"FGQ1PL8LA6",
"LL7V1S0QG3",
"TC1N3OMAN3",
"KO0PCC5TE8",
"DRAD23MMMA",
"J2KO453ZT9",
"E1I3Y679E0",
"HBDLHYEA1L",
"DX7RS4N7IC",
"ROHDD0Z6CG",
"GINYUH6NU7",
"X8L6WJ0NDN",
"QB8U1CAEOV",
"FPLK2Z9M1O",
"9SKVMHKUAV",
"FA79Q4F0H0",
"221Y2P3R9J",
"ITIM1J0WSF",
"AXZKAINTK1",
"8WTA54PGBC",
"4LVFXVFAI5"]

# Run the full pipeline (extract -> match -> verify) on the sample ids.
for transcript_id in transcript_ids:
    transcript = transcripts[transcript_id]
    print("transcript", transcript_id, transcript)

    # check_transcript takes only the transcript text.
    extracted_facts = check_transcript(transcript)
    print("extract_facts", extracted_facts)

    # Without an extracted name (e.g. the model reply could not be parsed)
    # there is nothing to match on, so skip instead of crashing the loop.
    extracted_name = extracted_facts.get("name") if isinstance(extracted_facts, dict) else None
    if not extracted_name:
        print("skipped: no name extracted")
        print()
        continue

    best_matching_row = best_match(df, extracted_facts)
    print(best_matching_row["name"])

    # The verification prompt announces a JSON record, so send JSON rather
    # than the textual repr of the pandas Series.
    factually_correct = check_if_row_matches(transcript, best_matching_row.to_json(force_ascii=False))
    print(factually_correct)
    print()

transcript 12MINIG2V7  Good afternoon, this is Charlotte Leclerc, a single venture capitalist. Can you help me set up a covert investment? I need this done without any records.
extract_facts {'name': 'Charlotte Leclerc', 'marital_status': 'single', 'profession': 'venture capitalist'}
{'is_matching_person': 'yes', 'reasoning': "The name 'Charlotte Leclerc' matches, and the profession 'Private Equity Partner' aligns with 'venture capitalist'. While 'Private Equity Partner' is a more formal title, it suggests a similar field."}

transcript M97BEZCB1V  Hello, I'm Santiago Rivera, widowed, and Maria Schmidt is my relationship manager. I would like to set up a charitable foundation in my late spouse's name, but need to ensure the funds are handled discreetly.
extract_facts {'name': 'Santiago Rivera', 'marital_status': 'widowed', 'relationship_manager': 'Maria Schmidt'}
{'is_matching_person': 'no', 'reasoning': 'The transcript states Santiago Rivera is widowed, while the JSON data indicates h

In [59]:
# for each transcript, get the response from ollama and save it as a fact_check for the audio file
fact_checks = {}
fact_checks_path = os.getenv("DATA_PATH") + "/fact_checks_v2.csv"

for audio_file, transcript in transcripts.items():
    print("transcript", audio_file, transcript)
    extracted_facts = check_transcript(transcript)
    print("extract_facts", extracted_facts)

    extracted_name = extracted_facts.get("name") if isinstance(extracted_facts, dict) else None
    if extracted_name:
        best_matching_row = best_match(df, extracted_facts)
        # The verification prompt announces a JSON record, so send JSON
        # rather than the textual repr of the pandas Series.
        factually_correct = check_if_row_matches(transcript, best_matching_row.to_json(force_ascii=False))
    else:
        # Extraction failed, so there is no name to match on; record the
        # failure instead of aborting the whole run.
        factually_correct = {"error": "No name extracted", "raw_output": extracted_facts}
    print(factually_correct)
    print()
    fact_checks[audio_file] = factually_correct

    # A verdict that failed to parse has no "is_matching_person"/"reasoning"
    # keys; such rows are written with status "error" and the error text.
    verdict = factually_correct if isinstance(factually_correct, dict) else {}

    # append the fact_check to a csv file after each iteration, so an
    # interrupted run keeps everything processed so far.
    # No header row is written; the fieldnames below define the column order.
    # newline="" is what the csv module expects for files it writes to.
    with open(fact_checks_path, "a", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=["Audio File", "is_factually_correct", "reasoning"])

        writer.writerow(
            {
                "Audio File": audio_file,
                "is_factually_correct": verdict.get("is_matching_person", "error"),
                "reasoning": verdict.get("reasoning", verdict.get("error", "")),
            }
        )

# porting this to a notebook 12, to run as a background job

transcript XEA040Q8N9  Hello, I am Noya Chimurman. Social Security Number 7667-587-79988 account number ZR8097 I want to invest in high-tech security systems for my property.
extract_facts {'name': 'Noya Chimurman', 'social_security_number': '766758779988', 'account_nr': 'ZR8097'}
{'is_matching_person': 'no', 'reasoning': 'Name and Social Security Number do not match exactly.'}

transcript 35UVJCB74Q  Hello, I am Elena Popova, a medical doctor with the account ZR4012. Can we discuss the possibility of not getting private research funds through traditional methods?
extract_facts {'name': 'Elena Popova', 'profession': 'medical doctor', 'account_nr': 'ZR4012'}


KeyboardInterrupt: 