In [1]:
import os
import pandas as pd

from config import ModelInfo
from utils.loaders import PromptLoader, SchemaLoader, InputTemplateLoader

cur_dir = "C:/Users/Shavius/Documents/Uni/Year 4/Project/ELLMRPCTFVIS/dev"
prompt_dir = os.path.join(cur_dir, 'prompts')
prompt_loader = PromptLoader(prompt_dir)
schema_dir = os.path.join(cur_dir, 'schemas')
schema_loader = SchemaLoader(schema_dir)
input_template_dir = os.path.join(cur_dir, 'input_templates')
input_template_loader = InputTemplateLoader(input_template_dir)
data_dir = os.path.join(cur_dir, 'test_data')

In [2]:

def baseline_evaluator(model_info, input_narrative):
    input_template = input_template_loader.load("consistency_evaluator_baseline")
    message = input_template.format(target_story=input_narrative)
    
    if model_info.output_format() == "json":
        bot = model_info.chatbot()(model_info.model(), "", schema_loader)
        text_response, json_response = bot.get_structured_response(message, schema_key="consistency_evaluator_baseline", record=False, temperature=0)
        consistency_score = json_response["consistency"]
    else:
        raise ValueError(f"Unsupported output format: {model_info.output_format()}")

    return consistency_score

In [7]:
def run_baseline(model_info, dataset, out_dir, existing_data):
    index_list = dataset.index.tolist()
    out_data = pd.DataFrame(columns=["narrative_id", "consistency"]) if existing_data is None else existing_data.copy()
    for index in index_list:
        if existing_data is not None and index in existing_data["narrative_id"].values:
            print(f"Skipping narrative {index}/{len(index_list)} as it already exists in the output data.")
            continue
        input_narrative = dataset.loc[index, "narrative"]
        consistency_score = baseline_evaluator(model_info, input_narrative)
        out_data.loc[len(out_data)] = [index, consistency_score]
        out_data.to_csv(out_dir, index=False)
        print(f"Processed narrative {index}/{len(index_list)}: Consistency score = {consistency_score}")
    out_data.sort_values(by='narrative_id').to_csv(out_dir, index=False)
    

In [8]:
testing_model_info = ModelInfo("gemini-structured")
input_dataset = pd.read_csv(os.path.join(data_dir, "hanna_stories.csv"), index_col=0)
out_dir = os.path.join(data_dir, "hanna_baseline_output.csv")
existing_data = None
if os.path.exists(out_dir):
    existing_data = pd.read_csv(out_dir)
run_baseline(testing_model_info, input_dataset, out_dir, existing_data)

Processed narrative 0/1056: Consistency score = 4
Processed narrative 1/1056: Consistency score = 3
Processed narrative 2/1056: Consistency score = 4
Processed narrative 3/1056: Consistency score = 4
Processed narrative 4/1056: Consistency score = 4
Processed narrative 5/1056: Consistency score = 4
Processed narrative 6/1056: Consistency score = 4
Processed narrative 7/1056: Consistency score = 4
Processed narrative 8/1056: Consistency score = 3
Processed narrative 9/1056: Consistency score = 2
Processed narrative 10/1056: Consistency score = 5
Processed narrative 11/1056: Consistency score = 3
Processed narrative 12/1056: Consistency score = 4
Processed narrative 13/1056: Consistency score = 4
Processed narrative 14/1056: Consistency score = 4
Processed narrative 15/1056: Consistency score = 3
Processed narrative 16/1056: Consistency score = 3
Processed narrative 17/1056: Consistency score = 3
Processed narrative 18/1056: Consistency score = 4
Processed narrative 19/1056: Consistency 

In [19]:
from fol_evaluator import FOLEvaluationSession
from timeline_maker import TimelineMakerSession

def wrap_narrative(narrative):
    if not narrative.startswith("AI:"):
        return "AI:" + narrative + "\n(User:[hidden])"
    return narrative

def divide_long_narratives(narrative, threshold=1500, section_length=1000):
    if len(narrative) <= threshold:
        return [wrap_narrative(narrative)]
    
    sections = []
    start = 0
    while len(narrative) - start > threshold:
        step_length = section_length if (len(narrative) - start) > (section_length * 2) else start + (len(narrative) - start) // 2
        fullstop_index = narrative.find('. ', start + step_length)
        if fullstop_index != -1:
            sections.append(wrap_narrative(narrative[start:fullstop_index + 1].strip()))
            start = fullstop_index + 2
        else:
            sections.append(wrap_narrative(narrative[start:].strip()))
            break
    
    return sections

def run_fol_evaluator(model_info, dataset, out_dir, existing_data):
    index_list = dataset.index.tolist()
    out_data = pd.DataFrame(columns=["narrative_id", "fol_output"]) if existing_data is None else existing_data.copy()
    for index in index_list:
        if existing_data is not None and index in existing_data["narrative_id"].values:
            print(f"Skipping narrative {index}/{len(index_list)} as it already exists in the output data.")
            continue
        processed_success = False
        retry_count = 3
        print(f"Start processing {index}/{len(index_list)}")
        while not processed_success:
            try:
                timeline_session = TimelineMakerSession(model_info, prompt_dir=prompt_dir, schema_dir=schema_dir, input_template_dir=input_template_dir)
                fol_session = FOLEvaluationSession(model_info, prompt_dir=prompt_dir, schema_dir=schema_dir, input_template_dir=input_template_dir)
                divided_narratives = divide_long_narratives(dataset.loc[index, "narrative"])
                all_unsat_formulas = set()
                j = 0
                for section in divided_narratives:
                    j += 1
                    timeline_session.append_conversation(section)
                    new_timeline = timeline_session.get_timeline()
                    unsat_formulas = fol_session.append_conversation(section, new_timeline=new_timeline)
                    all_unsat_formulas.update(unsat_formulas)
                    print(f"Processed section of narrative {j}/{len(divided_narratives)}: Unsat formulas = {len(all_unsat_formulas)}")
                processed_success = True
            except Exception as e:
                print(f"Error processing narrative {index}/{len(index_list)}: {e}")
                retry_count -= 1
                if retry_count <= 0:
                    print(f"Failed to process narrative {index} after multiple attempts.")
                    exit(1)
        out_data.loc[len(out_data)] = [index, "\n".join(list(all_unsat_formulas)) if len(all_unsat_formulas) > 0 else "None"]
        out_data.to_csv(out_dir, index=False)
        fol_session.export_logs(os.path.join(data_dir, "logs", f"fol_narrative_{index}_logs_local.json"))
        print(f"Processed narrative {index}/{len(index_list)}: Unsat formulas = {len(all_unsat_formulas)}")
    out_data.sort_values(by='narrative_id').to_csv(out_dir, index=False)
            

In [None]:
testing_model_info = ModelInfo("gemini-structured")
input_dataset = pd.read_csv(os.path.join(data_dir, "hanna_stories.csv"), index_col=0)
out_dir = os.path.join(data_dir, "hanna_fol_bad_formulas_output.csv")
existing_data = None
if os.path.exists(out_dir):
    existing_data = pd.read_csv(out_dir)
run_fol_evaluator(testing_model_info, input_dataset, out_dir, existing_data)

Start processing 0/1056
Processed section of narrative 1/1: Unsat formulas = 0
Processed narrative 0/1056: Unsat formulas = 0
Start processing 1/1056
Processed section of narrative 1/1: Unsat formulas = 0
Processed narrative 1/1056: Unsat formulas = 0
Start processing 2/1056
Processed section of narrative 1/3: Unsat formulas = 0
Processed section of narrative 2/3: Unsat formulas = 0
Processed section of narrative 3/3: Unsat formulas = 0
Processed narrative 2/1056: Unsat formulas = 0
Start processing 3/1056
Processed section of narrative 1/4: Unsat formulas = 0
Processed section of narrative 2/4: Unsat formulas = 0
Processed section of narrative 3/4: Unsat formulas = 0
Processed section of narrative 4/4: Unsat formulas = 0
Processed narrative 3/1056: Unsat formulas = 0
Start processing 4/1056
Processed section of narrative 1/1: Unsat formulas = 0
Processed narrative 4/1056: Unsat formulas = 0
Start processing 5/1056
Processed section of narrative 1/2: Unsat formulas = 0
Processed sectio

In [None]:
def test_with_model(model_name, input_dir):
    model_info = ModelInfo(model_name)
    input_dataset = pd.read_csv(input_dir, index_col=0)
    baseline_out_dir = os.path.join(data_dir, f"{model_name}_baseline_output.csv")
    existing_data = None
    if os.path.exists(baseline_out_dir):
        existing_data = pd.read_csv(baseline_out_dir, index_col=0)
    run_baseline(model_info, input_dataset, baseline_out_dir, existing_data)
    
    fol_out_dir = os.path.join(data_dir, f"{model_name}_fol_bad_formulas_output.csv")
    existing_data = None
    if os.path.exists(fol_out_dir):
        existing_data = pd.read_csv(fol_out_dir, index_col=0)
    run_fol_evaluator(model_info, input_dataset, fol_out_dir, existing_data)