In [9]:
from dotenv import load_dotenv
import os
import json
import csv
import requests

load_dotenv()

# Function to interact with Ollama API
def generate_text(prompt, model_name="gemma2:9b-instruct-fp16", timeout=None):
    """Send one non-streaming generation request to the Ollama API.

    Args:
        prompt: Full prompt text to send to the model.
        model_name: Name of the Ollama model to run.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        The value of the ``"response"`` field of the JSON reply.

    Raises:
        ValueError: If the ``OLLAMA_API_URL`` environment variable is unset or empty.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    url = os.getenv("OLLAMA_API_URL")
    if not url:
        # Fail with a message that names the variable instead of letting
        # requests complain about an invalid URL "None".
        raise ValueError("OLLAMA_API_URL environment variable is not set")

    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "options": {
            # num_gpu is the number of model layers offloaded to the GPU,
            # not the number of GPUs.
            "num_gpu": 26,
        },
    }
    response = requests.post(url, json=payload, timeout=timeout)
    # Surface HTTP errors directly rather than as a KeyError on "response".
    response.raise_for_status()
    return response.json()["response"]

# Instruction preamble placed at the start of every prompt built by
# check_transcript. It asks the model to answer with a single JSON object
# holding "is_factually_correct" and "reasoning", and nothing else.
INSTRUCTIONS = """You are an AI assistant that analyzes transcripts and compares them to a database of people. ALWAYS provide your responses in the following JSON format:
{
"is_factually_correct": "<yes|no>",
"reasoning": "<Your short step-by-step reasoning here>"
}
Ensure your response can be parsed as valid JSON (it has to start and end with curly braces and nothing else). Do not include any text outside of this JSON structure in your response."""

def check_transcript(transcript, client_features_string):
    """Ask the model whether a transcript agrees with the client database.

    Args:
        transcript: Transcript text to verify.
        client_features_string: Raw text of the client database, inserted
            verbatim into the prompt.

    Returns:
        The JSON object decoded from the model reply (a dict), or
        ``{"error": "Failed to parse JSON", "raw_output": <reply>}`` when no
        JSON object can be decoded from the reply.
    """
    prompt = f"""{INSTRUCTIONS}
    Here is a database of people:
    {client_features_string}
    And here is a transcript:
    "{transcript}"
    Is this transcript factually correct according to the database? Analyze the data and provide your response in the requested JSON format.
    """
    response = generate_text(prompt)

    # Try the reply as-is first. Models sometimes wrap the object in Markdown
    # fences or add stray text, so fall back to the outermost {...} span.
    candidates = [response]
    start, end = response.find("{"), response.rfind("}")
    if 0 <= start < end:
        candidates.append(response[start:end + 1])

    for candidate in candidates:
        try:
            result = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Only a JSON object is a usable verdict; callers index it by key.
        if isinstance(result, dict):
            return result
    return {"error": "Failed to parse JSON", "raw_output": response}

# Resolve the data directory once and fail early with a clear message,
# instead of a TypeError from concatenating None with a string.
data_path = os.getenv("DATA_PATH")
if not data_path:
    raise RuntimeError("DATA_PATH environment variable is not set")

# Load client features as raw CSV text; the text is passed to
# check_transcript and inserted into the prompt unchanged.
with open(os.path.join(data_path, "client_features.csv"), "r") as f:
    client_features_string = f.read()

# Load transcripts, mapping each "Audio File" value to its "Transcription".
# newline="" lets the csv module handle line endings itself, so quoted
# fields that contain line breaks are parsed correctly.
with open(os.path.join(data_path, "transcriptions.csv"), "r", newline="") as file:
    transcripts = {
        row["Audio File"]: row["Transcription"] for row in csv.DictReader(file)
    }

{'model': 'gemma2:9b-instruct-fp16', 'created_at': '2024-06-29T02:21:02.980216012Z', 'response': '{\n"is_factually_correct": "no",\n"reasoning": "The provided transcript mentions a Social Security Number 7667-587-79988. This number does not exist in the database. The account number ZR8097  belongs to Noah Zimmerman, not Noya Chimurman."\n} \n', 'done': True, 'done_reason': 'stop', 'context': [106, 2425, 108, 2045, 708, 671, 16481, 20409, 674, 72448, 84633, 578, 58311, 1174, 577, 476, 8746, 576, 1461, 235265, 82344, 3658, 861, 15641, 575, 573, 2412, 11384, 5920, 235292, 108, 235282, 108, 235281, 502, 235298, 12334, 38303, 235298, 18848, 1192, 15114, 3276, 235371, 956, 42937, 108, 235281, 21248, 574, 1192, 15114, 6922, 3309, 4065, 235290, 1547, 235290, 8847, 32346, 1517, 28760, 108, 235270, 108, 72705, 861, 3590, 798, 614, 47654, 685, 4819, 11384, 591, 500, 919, 577, 2238, 578, 1580, 675, 55723, 77039, 578, 4285, 1354, 846, 2390, 780, 3707, 1089, 2793, 5162, 576, 736, 11384, 5449, 575, 8

In [10]:
# For each transcript, get the fact check from Ollama and append it to a CSV
# right away, so results gathered so far survive an interrupted run.
# NOTE(review): no header row is written; readers of fact_checks.csv must
# supply the column names themselves.
fact_checks_path = os.path.join(os.getenv("DATA_PATH"), "fact_checks.csv")
fact_checks = {}
for audio_file, transcript in transcripts.items():
    fact_check = check_transcript(transcript, client_features_string)
    fact_checks[audio_file] = fact_check

    # check_transcript returns {"error": ..., "raw_output": ...} when the
    # reply is not valid JSON. Record the raw output instead of raising
    # KeyError, which would abort the whole run on one bad reply.
    verdict = fact_check.get("is_factually_correct", "")
    reasoning = fact_check.get("reasoning", fact_check.get("raw_output", ""))

    # newline="" prevents the csv module's own line endings from being
    # translated again (which produces blank rows on Windows).
    with open(fact_checks_path, "a", newline="") as file:
        writer = csv.DictWriter(
            file, fieldnames=["Audio File", "is_factually_correct", "reasoning"]
        )
        writer.writerow(
            {
                "Audio File": audio_file,
                "is_factually_correct": verdict,
                "reasoning": reasoning,
            }
        )

# Porting this loop to notebook 09 so it can run as a background job.

{'model': 'gemma2:9b-instruct-fp16', 'created_at': '2024-06-29T02:31:38.183937584Z', 'response': '{\n"is_factually_correct": "no",\n"reasoning": "The transcript states the Social Security Number is 7667-587-79988, but the database does not contain this number. The account number ZR8097 does exist in the database, belonging to Noah Zimmerman."\n}  \n', 'done': True, 'done_reason': 'stop', 'context': [106, 2425, 108, 2045, 708, 671, 16481, 20409, 674, 72448, 84633, 578, 58311, 1174, 577, 476, 8746, 576, 1461, 235265, 82344, 3658, 861, 15641, 575, 573, 2412, 11384, 5920, 235292, 108, 235282, 108, 235281, 502, 235298, 12334, 38303, 235298, 18848, 1192, 15114, 3276, 235371, 956, 42937, 108, 235281, 21248, 574, 1192, 15114, 6922, 3309, 4065, 235290, 1547, 235290, 8847, 32346, 1517, 28760, 108, 235270, 108, 72705, 861, 3590, 798, 614, 47654, 685, 4819, 11384, 591, 500, 919, 577, 2238, 578, 1580, 675, 55723, 77039, 578, 4285, 1354, 846, 2390, 780, 3707, 1089, 2793, 5162, 576, 736, 11384, 5449,

KeyboardInterrupt: 