In [61]:
from dotenv import load_dotenv
import os
import json
import csv
import requests
import pandas as pd

load_dotenv()

# Function to interact with Ollama API
def generate_text(prompt, model_name="gemma2:9b-instruct-fp16", timeout=600):
    """Send one non-streaming generation request to the Ollama API.

    Args:
        prompt: Full prompt text sent to the model.
        model_name: Ollama model tag to run.
        timeout: Seconds to wait for the HTTP request before giving up
            (forwarded to ``requests.post``). Pass ``None`` to wait forever.

    Returns:
        The value of the ``"response"`` field of the JSON reply.

    Raises:
        ValueError: If the ``OLLAMA_API_URL`` environment variable is unset
            or empty.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
        KeyError: If the JSON reply has no ``"response"`` field.
    """
    url = os.getenv("OLLAMA_API_URL")
    if not url:
        # Fail early with a readable message instead of letting requests
        # choke on a None URL.
        raise ValueError(
            "OLLAMA_API_URL environment variable is not set; "
            "define it (e.g. in .env) before calling generate_text()"
        )

    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "options": {
            # fyi this param is a number of gpu "layers", not exactly the number of gpus
            "num_gpu": 26,
        },
    }

    # Without a timeout a stalled server would block the notebook indefinitely.
    response = requests.post(url, json=payload, timeout=timeout)
    # Surface HTTP-level failures directly rather than as a confusing KeyError below.
    response.raise_for_status()
    return response.json()["response"]

# System-style instructions prepended to every fact-checking prompt.
# They pin the model to a strict two-field JSON reply so the answer can be
# parsed programmatically.
INSTRUCTIONS = "\n".join(
    [
        "You are an AI assistant that analyzes transcripts and compares them to a database of people. ALWAYS provide your responses in the following JSON format:",
        "{",
        '"is_factually_correct": "<yes|no>",',
        '"reasoning": "<Your short step-by-step reasoning here>"',
        "}",
        "Ensure your response can be parsed as valid JSON (it has to start and end with curly braces and nothing else). Do not include any text outside of this JSON structure in your response.",
    ]
)

def _parse_json_object(text):
    """Return the JSON object found in ``text``, or ``None`` if there is none.

    Tries the text as-is first, then falls back to the span between the first
    ``{`` and the last ``}`` so replies wrapped in code fences or extra prose
    are still usable.
    """
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers index the result by key, so only a JSON object is acceptable.
        if isinstance(parsed, dict):
            return parsed
    return None


def check_transcript(transcript, client_features_string):
    """Ask the model whether a transcript matches one row of the people database.

    Args:
        transcript: Transcribed text to verify.
        client_features_string: Textual rendering of the people database, one
            row per person. An object exposing ``to_string()`` (such as a
            pandas DataFrame) is also accepted and rendered in full, because
            interpolating it directly would use its display repr, which can
            elide rows and long values.

    Returns:
        The parsed JSON object from the model reply (a ``dict``). If no JSON
        object can be extracted, returns
        ``{"error": "Failed to parse JSON", "raw_output": <model reply>}``.
    """
    if not isinstance(client_features_string, str) and hasattr(client_features_string, "to_string"):
        client_features_string = client_features_string.to_string()

    prompt = f"""{INSTRUCTIONS}
    Here is a database of people (one row per person):
    {client_features_string}
    And here is a transcript:
    "{transcript}"
    Is this transcript factually correct according to the database i.e. does one row from the database match the info from the transcript? The transcript can't have information from multiple people/rows! While checking for fact correctness you must account for transcription errors e.g. homophones (e.g. i vs y, kh vs k), additional whitespace especially with names and surnames because they might not match one-to-one and there might be additional spaces. Transcription errors also happen with social security numbers: sometimes there are dashes, hyphens or dots - ignore them when comparing to the number. Allow some misalignments if the number without symbols is correct. The birthdate should be an almost exact match, not a different month or year. All of the data in the transcript must correspond to one person only! Analyze the data, transcripts and provide your response in the requested JSON format.
    """
    response = generate_text(prompt)

    result = _parse_json_object(response)
    if result is None:
        return {"error": "Failed to parse JSON", "raw_output": response}
    return result

# Load client features
# os.environ[...] raises a KeyError naming DATA_PATH when it is unset, instead
# of the opaque TypeError produced by concatenating None with a string.
client_features_path = os.path.join(os.environ["DATA_PATH"], "client_features.csv")
with open(client_features_path, "r") as f:
    client_features = pd.read_csv(f)

# Lowercase every string cell in the database; non-string cells are left as-is.
# Mapping column by column avoids the deprecated DataFrame.applymap.
client_features = client_features.apply(
    lambda column: column.map(lambda s: s.lower() if isinstance(s, str) else s)
)

# Remove everything except digits from social security numbers.
client_features["social_security_number"] = client_features["social_security_number"].str.replace(r"\D", "", regex=True)

# Render the whole table as text for the prompt. to_string() emits every row
# and full cell values, whereas interpolating a DataFrame into an f-string uses
# its display repr, which elides rows and truncates long values.
client_features_string = client_features.astype(str).to_string()


# Load transcripts into a dict mapping audio file id -> transcription text.
# os.environ[...] raises a KeyError naming DATA_PATH when it is unset.
transcripts_path = os.path.join(os.environ["DATA_PATH"], "transcriptions.csv")
# newline="" lets the csv module handle line breaks inside quoted fields
# itself, as the csv documentation requires for file objects.
with open(transcripts_path, "r", newline="") as file:
    transcripts = {
        row["Audio File"]: row["Transcription"]
        for row in csv.DictReader(file)
    }

In [62]:
# Spot-check a single transcript: print its text, then show the model's verdict
# as the cell output (the bare last expression is what the notebook renders).
# NOTE(review): in the saved output for this cell the transcript names
# "Kai Lucke" while the model's reasoning matches it to "Sofia Almeida" and
# still answers "yes" - this looks like a false positive; confirm against the data.
transcript_id = "X8L6WJ0NDN"
print(transcripts[transcript_id])
check_transcript(transcripts[transcript_id], client_features_string)

 Good afternoon, this is Kai Lucke with social security number 756.02108403.33 and born on 05 March 1966. Can you help me set up a covert investment? I need this done without any records.


{'is_factually_correct': 'yes',
 'reasoning': "The transcript provides the social security number 7560210840333 which matches the social security number of Sofia Almeida in the database. The transcript also states a birthdate of 05 March 1966, which matches Sofia Almeida's birthdate. "}

In [46]:
# Hand-picked audio file ids used for manual spot checks of the fact checker.
transcript_ids = [
    "DRAD23MMMA", "LL7V1S0QG3", "TC1N3OMAN3",
    "KO0PCC5TE8", "J2KO453ZT9", "E1I3Y679E0",
    "FGQ1PL8LA6", "HBDLHYEA1L", "DX7RS4N7IC",
    "ROHDD0Z6CG", "GINYUH6NU7", "X8L6WJ0NDN",
    "QB8U1CAEOV", "FPLK2Z9M1O", "9SKVMHKUAV",
    "FA79Q4F0H0", "221Y2P3R9J", "ITIM1J0WSF",
    "AXZKAINTK1", "8WTA54PGBC", "4LVFXVFAI5",
]

# Print each selected transcript followed by the model's verdict, with a blank
# line separating consecutive entries.
for transcript_id in transcript_ids:
    print("transcript", transcript_id, transcripts[transcript_id])
    print(check_transcript(transcripts[transcript_id], client_features_string), end="\n\n")

transcript DRAD23MMMA  Good morning, I'm Chen Wei, divorced, Social Security, No. 756-Guanzhou210-8403.33. I need to revise my estate plans following a recent divorce. Can we ensure this remains confidential?
{'is_factually_correct': 'yes', 'reasoning': "The transcript states the name as 'Chen Wei', which matches the database entry. It also states 'divorced', which aligns with the database. The provided Social Security number, despite some formatting discrepancies, matches the database entry when the symbols are ignored."}

transcript LL7V1S0QG3  Good morning, I'm Chen Wei, a renowned neurosurgeon with a Bachelor of Laws . I need to set up an endowment fund for neuroscience research at an unaccredited institution.
{'is_factually_correct': 'yes', 'reasoning': 'The transcript states the speaker is Chen Wei, a renowned neurosurgeon with a Bachelor of Laws. The database confirms Chen Wei is a renowned neurosurgeon with a Bachelor of Laws.'}

transcript TC1N3OMAN3  Good afternoon, I'm Seo Y

In [10]:
# For each transcript, get the response from ollama and save it as a fact_check
# for the audio file. Every result is appended to the CSV immediately so that
# partial progress survives an interrupted run.
fact_checks = {}
# Resolve the output path once; os.environ[...] raises a KeyError naming
# DATA_PATH when it is unset.
fact_checks_path = os.path.join(os.environ["DATA_PATH"], "fact_checks_v2.csv")

for audio_file, transcript in transcripts.items():
    fact_check = check_transcript(transcript, client_features_string)
    fact_checks[audio_file] = fact_check

    # check_transcript returns {"error": ..., "raw_output": ...} when the model
    # reply is not valid JSON. Record such rows as "error" (keeping the raw
    # reply for review) instead of raising KeyError and aborting the batch.
    if isinstance(fact_check, dict):
        verdict = fact_check.get("is_factually_correct", "error")
        reasoning = fact_check.get("reasoning", fact_check.get("raw_output", ""))
    else:
        verdict, reasoning = "error", str(fact_check)

    # Append the fact_check to the csv file after each iteration (no header row
    # is written). newline="" stops the csv module from emitting blank lines
    # between rows on platforms that translate line endings.
    with open(fact_checks_path, "a", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=["Audio File", "is_factually_correct", "reasoning"])
        writer.writerow(
            {
                "Audio File": audio_file,
                "is_factually_correct": verdict,
                "reasoning": reasoning,
            }
        )

# porting this to a notebook 09, to run as a background job

{'model': 'gemma2:9b-instruct-fp16', 'created_at': '2024-06-29T02:31:38.183937584Z', 'response': '{\n"is_factually_correct": "no",\n"reasoning": "The transcript states the Social Security Number is 7667-587-79988, but the database does not contain this number. The account number ZR8097 does exist in the database, belonging to Noah Zimmerman."\n}  \n', 'done': True, 'done_reason': 'stop', 'context': [106, 2425, 108, 2045, 708, 671, 16481, 20409, 674, 72448, 84633, 578, 58311, 1174, 577, 476, 8746, 576, 1461, 235265, 82344, 3658, 861, 15641, 575, 573, 2412, 11384, 5920, 235292, 108, 235282, 108, 235281, 502, 235298, 12334, 38303, 235298, 18848, 1192, 15114, 3276, 235371, 956, 42937, 108, 235281, 21248, 574, 1192, 15114, 6922, 3309, 4065, 235290, 1547, 235290, 8847, 32346, 1517, 28760, 108, 235270, 108, 72705, 861, 3590, 798, 614, 47654, 685, 4819, 11384, 591, 500, 919, 577, 2238, 578, 1580, 675, 55723, 77039, 578, 4285, 1354, 846, 2390, 780, 3707, 1089, 2793, 5162, 576, 736, 11384, 5449,

KeyboardInterrupt: 