In [2]:
import sys 
import os
import json
from pydantic.json import pydantic_encoder
import time
from dotenv import load_dotenv
# Load environment variables from the backend's .env file before any client is created.
load_dotenv('../../backend/.env')
# Put the directory two levels up on the import path; the `backend.*` imports
# that follow rely on this, so it must run before them.
sys.path.append("../../")
from backend.LLM.AnythingLLM_client import AnythingLLMClient
from backend.Reviewer import Reviewer
from backend.LLM.OllamaLLM import OllamaAI
from backend.database.schemas import DraftResult


# The AnythingLLM API key is read from the environment (populated by the
# load_dotenv call earlier in this cell) so the secret never lives in the notebook.
# NOTE(review): ANYTHINGLLM_API_KEY has to be present in backend/.env (or exported
# in the shell); the key that used to be hard-coded here should be considered
# leaked and rotated.
anyllm_client = AnythingLLMClient(
    "http://localhost:3001/api",
    os.environ["ANYTHINGLLM_API_KEY"],
)
# Local Ollama server and model used by the reviewer.
ollama_client = OllamaAI('http://localhost:11434', 'llama3:instruct')
reviewer = Reviewer(ollama_client)


Performing health check on http://localhost:11434
Ollama is running
Health check passed


In [7]:
from typing import List

def review_draft(email:str, response_email:str, sources:List[str]):
    """Score one drafted response against its enquiry and time both evaluations.

    The same prompt (enquiry + response) is sent to the module-level
    ``reviewer`` twice: first for the hallucination evaluation (together with
    ``sources``), then for the "linkert" evaluation.

    Args:
        email: Text of the original enquiry.
        response_email: Text of the drafted response being reviewed.
        sources: Sources handed to ``reviewer.hallucination_eval`` alongside
            the prompt.

    Returns:
        dict with four keys:
            ``linkert_score``       - value returned by ``reviewer.linkert_eval``
            ``hallucination_score`` - value returned by ``reviewer.hallucination_eval``
            ``linkert_time``        - seconds spent in ``linkert_eval``
            ``hallucination_time``  - seconds spent in ``hallucination_eval``
    """
    base_prompt = f"""
    From this enquiry:
    Enquiry: {email}
    ---
    Does the response answer the enquiry?
    Response: {response_email}
    ---
    """

    # perf_counter() is monotonic, so the measured durations cannot be skewed
    # (or go negative) when the system clock is adjusted mid-evaluation.
    start_time_hallucination = time.perf_counter()
    draft_score = reviewer.hallucination_eval(base_prompt, sources)
    hallucination_time = time.perf_counter() - start_time_hallucination

    start_time_linkert = time.perf_counter()
    linkert_score = reviewer.linkert_eval(base_prompt)
    linkert_time = time.perf_counter() - start_time_linkert

    return {
        "linkert_score": linkert_score,
        "hallucination_score": draft_score,
        "linkert_time": linkert_time,
        "hallucination_time": hallucination_time
    }

In [20]:
import json as simplejson
drafts_generated_path = "../one_vs_multi_shots/results"
path_to_emails = "../../dataset"
results_dir = "results"

# Create the output directory up front so a fresh checkout does not fail on
# os.listdir() before the first review has been written.
os.makedirs(results_dir, exist_ok=True)
already_evaluated = os.listdir(results_dir)

for draft_file in os.listdir(drafts_generated_path):

    # Anything that is not a JSON draft (checkpoint folders, OS metadata files, ...)
    # would crash open()/json.load() below, so skip it.
    if not draft_file.endswith(".json"):
        continue

    # Reviews are cached on disk: a draft whose result file exists is not re-run.
    if f"reviewer_{draft_file}" in already_evaluated:
        print(f"Already evaluated {draft_file}")
        continue

    with open(f"{drafts_generated_path}/{draft_file}", 'r') as f:
        draft_generated_data = json.load(f)

    single_response = draft_generated_data['single_shot']['response']
    single_response_email = single_response['textResponse']
    single_sources = single_response['sources']

    multi_response = draft_generated_data['multi_step']['response']
    multi_response_email = multi_response['generated_draft_email']
    # Each generated answer carries its sources as a JSON-encoded string;
    # decode every one and flatten them into a single list.
    multi_sources = [
        source
        for generated_answer in multi_response['generated_answers']
        for source in json.loads(generated_answer['attributes']['sources'])
    ]

    # The enquiry file has the same name as the draft file minus the 'results_' prefix.
    enquiry_path = f"{path_to_emails}/{draft_file.replace('results_', '')}"
    with open(enquiry_path, 'r') as f:
        email_data = json.load(f)
        email = email_data['email']

    print(f"Reviewing {draft_file}")
    results_path = f"{results_dir}/reviewer_{draft_file}"
    multi_step_result = review_draft(email, multi_response_email, multi_sources)
    single_shot_result = review_draft(email, single_response_email, single_sources)

    # Written only after both reviews succeeded, so a partial run leaves no
    # result file behind and the draft is retried next time.
    with open(results_path, 'w') as f:
        json.dump({
            "single_shot": single_shot_result,
            "multi_step": multi_step_result
        }, f)
        

Already evaluated results_fake_email_10.json
Already evaluated results_fake_email_undergrad_18.json
Already evaluated results_fake_email_undergrad_3.json
Already evaluated results_fake_email_3.json
Already evaluated results_fake_email_undergrad_14.json
Already evaluated results_fake_email_undergrad_15.json
Already evaluated results_fake_email_2.json
Already evaluated results_fake_email_undergrad_2.json
Already evaluated results_fake_email_undergrad_19.json
Already evaluated results_fake_email_11.json
Already evaluated results_fake_email_undergrad_12.json
Already evaluated results_fake_email_undergrad_9.json
Already evaluated results_fake_email_9.json
Already evaluated results_fake_email_16.json
Already evaluated results_fake_email_undergrad_5.json
Already evaluated results_fake_email_5.json
Already evaluated results_fake_email_4.json
Already evaluated results_fake_email_undergrad_4.json
Already evaluated results_fake_email_17.json
Already evaluated results_fake_email_8.json
Already eva

In [18]:
# Which generation method to export for human review. Defined here so the cell
# does not depend on a variable left over from an earlier kernel session.
# NOTE(review): 'single_shot' is the entry whose response is read elsewhere in
# this notebook via the 'textResponse' and 'sources' keys used below - confirm.
method = "single_shot"

already_evaluated = os.listdir("results")
markdowns_path = "markdowns"
os.makedirs(markdowns_path, exist_ok=True)

for draft_file in os.listdir(drafts_generated_path):
    # Must match the name the review loop writes (results/reviewer_<draft_file>);
    # draft_file already starts with 'results_', so no extra prefix is added.
    filename = f"reviewer_{draft_file}"
    if filename not in already_evaluated:
        print(f"not evaluated {filename}")
        continue

    with open(f"{drafts_generated_path}/{draft_file}", 'r') as f:
        draft_generated_data = json.load(f)
        response_email = draft_generated_data[method]['response']['textResponse']
        sources = draft_generated_data[method]['response']['sources']

    # The enquiry file has the same name as the draft file minus the 'results_' prefix.
    enquiry_path = f"{path_to_emails}/{draft_file.replace('results_', '')}"
    with open(enquiry_path, 'r') as f:
        email_data = json.load(f)
        email = email_data['email']

    # Computed outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    markdown_name = draft_file.replace(".json", ".md")
    with open(f"{markdowns_path}/human_reviewer_{markdown_name}", 'w') as f:
        # email and response in the markdown 
        text = f"""
# Enquiry {draft_file}

{email}

# Response

{response_email}

        """
        f.write(text)

# Results

In [44]:
def _mean(values):
    """Arithmetic mean of ``values``; NaN for an empty list instead of ZeroDivisionError."""
    return sum(values) / len(values) if values else float("nan")


# Per-file timings (seconds) collected from every saved review result.
average_time_linkert_single = []
average_time_linkert_multi = []
average_time_hallucination_single = []
average_time_hallucination_multi = []

for result_file in os.listdir("results"):
    # Skip anything that is not a JSON result (checkpoint folders, OS metadata files, ...).
    if not result_file.endswith(".json"):
        continue

    with open(f"results/{result_file}", 'r') as f:
        data = json.load(f)

    average_time_linkert_single.append(data['single_shot']['linkert_time'])
    average_time_hallucination_single.append(data['single_shot']['hallucination_time'])
    average_time_linkert_multi.append(data['multi_step']['linkert_time'])
    average_time_hallucination_multi.append(data['multi_step']['hallucination_time'])

avg_linkert_single = _mean(average_time_linkert_single)
avg_linkert_multi = _mean(average_time_linkert_multi)
avg_hallucination_single = _mean(average_time_hallucination_single)
avg_hallucination_multi = _mean(average_time_hallucination_multi)


print(f"Average time linkert single: {avg_linkert_single}")
print(f"Average time linkert multi: {avg_linkert_multi}")
print(f"Average time hallucination single: {avg_hallucination_single}")
print(f"Average time hallucination multi: {avg_hallucination_multi}")

# Both lists get one entry per result file, so the mean of the two means
# equals the overall mean across single-shot and multi-step runs.
avg_linkert = (avg_linkert_single + avg_linkert_multi) / 2
avg_hallucination = (avg_hallucination_single + avg_hallucination_multi) / 2

print(f"Average time linkert: {avg_linkert}")
print(f"Average time hallucination: {avg_hallucination}")

Average time linkert single: 9.356014943122863
Average time linkert multi: 10.173141956329346
Average time hallucination single: 14.548383140563965
Average time hallucination multi: 16.558142638206483
Average time linkert: 9.764578449726105
Average time hallucination: 15.553262889385223
