In [1]:
import os
import pandas as pd
import asyncio
import json


from config import ModelInfo
from utils.loaders import PromptLoader, SchemaLoader, InputTemplateLoader

# Root of the project checkout. The default is the original author's local path; set the
# ELLMRPCTFVIS_DEV_DIR environment variable to run the notebook from another location
# without editing this cell.
cur_dir = os.environ.get("ELLMRPCTFVIS_DEV_DIR", "C:/Users/Shavius/Documents/Uni/Year 4/Project/ELLMRPCTFVIS/dev")

# One loader per resource folder under the project root.
prompt_dir = os.path.join(cur_dir, 'prompts')
prompt_loader = PromptLoader(prompt_dir)
schema_dir = os.path.join(cur_dir, 'schemas')
schema_loader = SchemaLoader(schema_dir)
input_template_dir = os.path.join(cur_dir, 'input_templates')
input_template_loader = InputTemplateLoader(input_template_dir)

# Folder holding the input stories and the evaluator output CSV files.
data_dir = os.path.join(cur_dir, 'test_data')

In [2]:
# Utility functions for narrative processing
def wrap_narrative(narrative):
    """Return ``narrative`` with a "(User:[hidden])" line appended unless it already contains "(User:"."""
    if "(User:" not in narrative:
        return narrative + "\n(User:[hidden])"
    return narrative

def divide_long_narratives(narrative, threshold=1000, section_length=800):
    """Split ``narrative`` into sections that end on a sentence boundary ('. ').

    At least one split is always attempted, even for short narratives. While
    more than ``threshold`` characters remain, the next cut is searched for
    ``section_length`` characters ahead when more than two section lengths
    remain, and otherwise half of the remaining text ahead, so the tail is
    divided roughly evenly. Every section is stripped and passed through
    ``wrap_narrative``.

    Args:
        narrative: Full story text.
        threshold: Remaining length (characters) above which splitting continues.
        section_length: Minimum length (characters) of a regular section.

    Returns:
        List of wrapped section strings, in story order.
    """
    sections = []
    start = 0
    at_least_one_section = False
    while len(narrative) - start > threshold or not at_least_one_section:
        at_least_one_section = True
        remaining = len(narrative) - start
        # The offset is relative to ``start``. It previously included ``start`` itself,
        # which was then added a second time in the search below and pushed the search
        # position past the end of the text for every split after the first one.
        step_length = section_length if remaining > section_length * 2 else remaining // 2
        fullstop_index = narrative.find('. ', start + step_length)
        if fullstop_index == -1:
            # No further sentence boundary: everything that is left forms the last section.
            sections.append(wrap_narrative(narrative[start:].strip()))
            return sections
        sections.append(wrap_narrative(narrative[start:fullstop_index + 1].strip()))
        start = fullstop_index + 2  # skip the '. ' separator
    if start < len(narrative):
        sections.append(wrap_narrative(narrative[start:].strip()))

    return sections

In [None]:

def baseline_evaluator(model_info, input_narrative):
    """Request one consistency score for a narrative from the configured model.

    The "consistency_evaluator_baseline" input template is filled with the
    narrative and sent through ``get_structured_response`` with the schema key
    of the same name.

    Args:
        model_info: Object exposing ``output_format()``, ``chatbot()`` and ``model()``.
        input_narrative: Story text inserted as ``target_story``.

    Returns:
        The value stored under "consistency" in the structured response.

    Raises:
        ValueError: If ``model_info.output_format()`` is not "json".
    """
    template = input_template_loader.load("consistency_evaluator_baseline")
    prompt = template.format(target_story=input_narrative)

    # Only structured JSON output is supported.
    if model_info.output_format() != "json":
        raise ValueError(f"Unsupported output format: {model_info.output_format()}")

    chatbot_factory = model_info.chatbot()
    bot = chatbot_factory(model_info.model(), "", schema_loader)
    _, parsed_response = bot.get_structured_response(
        prompt,
        schema_key="consistency_evaluator_baseline",
        record=False,
        temperature=0,
    )
    return parsed_response["consistency"]

def run_baseline_per_chunk(model_info, dataset):
    """Score every narrative of ``dataset`` with ``baseline_evaluator``.

    Args:
        model_info: Passed unchanged to ``baseline_evaluator``.
        dataset: DataFrame with a "narrative" column; its index supplies the narrative ids.

    Returns:
        DataFrame with one row per narrative and the columns "narrative_id" and "consistency".
    """
    out_data = []
    for index in dataset.index.tolist():
        input_narrative = dataset.loc[index, "narrative"]
        consistency_score = baseline_evaluator(model_info, input_narrative)
        out_data.append({"narrative_id": index, "consistency": consistency_score})
        # The former ``global_progress_counter += 1`` was removed: the name was never
        # declared ``global`` nor defined, so the statement raised UnboundLocalError
        # as soon as the first narrative had been scored.
        print(f"Processed narrative {index}: Consistency score = {consistency_score}")
    return pd.DataFrame(out_data)

async def run_baseline_async(model_info, dataset, out_dir, chunks=-1):
    """Score the narratives that are still missing from ``out_dir`` and save all results.

    If ``out_dir`` exists it is read as CSV and every row of ``dataset`` whose
    index already appears in its "narrative_id" column is skipped, so a run
    can be resumed. The remaining rows are split into chunks, each chunk is
    handed to ``run_baseline_per_chunk`` through ``asyncio.to_thread``, and the
    new results are merged with the existing ones, sorted by "narrative_id"
    and written back to ``out_dir``.

    Args:
        model_info: Passed unchanged to ``run_baseline_per_chunk``.
        dataset: DataFrame indexed by narrative id.
        out_dir: Path of the output CSV file (a file path, despite the name).
        chunks: Requested number of chunks; any non-positive value (default -1)
            means one narrative per chunk.
    """
    if os.path.exists(out_dir):
        existing_data = pd.read_csv(out_dir)
        dataset = dataset[~dataset.index.isin(existing_data["narrative_id"].values)]
    else:
        existing_data = None

    # Nothing left to evaluate. Without this guard pd.concat() below receives an
    # empty list and raises ValueError when a finished run is started again.
    if len(dataset) == 0:
        return

    # Floor division may yield 0 when more chunks than rows are requested;
    # range() rejects a step of 0, so fall back to one row per chunk.
    chunk_size = max(1, len(dataset) // chunks) if chunks > 0 else 1

    tasks = []
    for i in range(0, len(dataset), chunk_size):
        end_index = min(i + chunk_size, len(dataset))
        chunk = dataset.iloc[i:end_index].copy()
        tasks.append(asyncio.to_thread(run_baseline_per_chunk, model_info, chunk))

    results_df = await asyncio.gather(*tasks)
    complete_df = pd.concat(results_df, ignore_index=True)
    if existing_data is not None:
        complete_df = pd.concat([existing_data, complete_df], ignore_index=True)
    complete_df.sort_values(by='narrative_id').to_csv(out_dir, index=False)

async def run_baseline_with_model(model_name, input_dir, prefix):
    """Run the baseline evaluation for one model over a CSV of stories.

    Args:
        model_name: Name handed to ``ModelInfo``; also part of the output file name.
        input_dir: Path of the input CSV file; its first column becomes the index.
        prefix: Prefix of the output file name.

    The result is written next to the input file as
    "<prefix>_baseline_output_<model_name>.csv".
    """
    stories = pd.read_csv(input_dir, index_col=0)
    output_name = f"{prefix}_baseline_output_{model_name}.csv"
    output_path = os.path.join(os.path.dirname(input_dir), output_name)
    await run_baseline_async(ModelInfo(model_name), stories, output_path, chunks=-1)

In [4]:
await run_baseline_with_model("gemini-structured", os.path.join(data_dir, "hanna_stories.csv"), "hanna")

1/20 Processed narrative 0: Consistency score = 4
2/20 Processed narrative 1: Consistency score = 3
3/20 Processed narrative 2: Consistency score = 4
4/20 Processed narrative 3: Consistency score = 4
5/20 Processed narrative 4: Consistency score = 5
6/20 Processed narrative 5: Consistency score = 3
7/20 Processed narrative 6: Consistency score = 4
8/20 Processed narrative 7: Consistency score = 4
9/20 Processed narrative 8: Consistency score = 3
10/20 Processed narrative 9: Consistency score = 2
11/20 Processed narrative 10: Consistency score = 5
12/20 Processed narrative 11: Consistency score = 3
13/20 Processed narrative 12: Consistency score = 4
14/20 Processed narrative 13: Consistency score = 4
15/20 Processed narrative 14: Consistency score = 4
16/20 Processed narrative 15: Consistency score = 3
17/20 Processed narrative 16: Consistency score = 3
18/20 Processed narrative 17: Consistency score = 3
19/20 Processed narrative 18: Consistency score = 4
20/20 Processed narrative 19: C

In [None]:
from fol_evaluator import FOLEvaluationSession
from timeline_maker import TimelineMakerSession

def run_fol_evaluator_one(model_info, input_narrative, index):
    """Run the timeline maker and the FOL evaluator over one narrative, section by section.

    Args:
        model_info: Passed to both session constructors.
        input_narrative: Full story text; it is split with ``divide_long_narratives``.
        index: Narrative id, used in progress messages and in the log file name.

    Returns:
        All distinct formulas returned by the FOL session joined with blank
        lines, or the string "No Output" when none were collected.
    """
    session_options = dict(prompt_dir=prompt_dir, schema_dir=schema_dir, input_template_dir=input_template_dir)
    timeline_session = TimelineMakerSession(model_info, **session_options)
    fol_session = FOLEvaluationSession(model_info, **session_options)

    sections = divide_long_narratives(input_narrative)
    section_total = len(sections)
    collected_formulas = set()
    for position, section in enumerate(sections, start=1):
        # The timeline is extended first so the FOL session sees the up-to-date version.
        timeline_session.append_conversation(section)
        current_timeline = timeline_session.get_timeline()
        collected_formulas.update(fol_session.append_conversation(section, new_timeline=current_timeline))
        print(f"Processed section of narrative {position}/{section_total} of narrative {index}: Unsat formulas = {len(collected_formulas)}")

    log_path = os.path.join(data_dir, "logs", f"fol_narrative_{index}_logs_local.json")
    fol_session.export_logs(log_path)

    if len(collected_formulas) > 0:
        return "\n\n".join(collected_formulas)
    return "No Output"

def run_fol_evaluator_per_chunk(model_info, dataset):
    """Run ``run_fol_evaluator_one`` for every narrative of ``dataset``, retrying on errors.

    Each narrative is attempted up to three times.

    Args:
        model_info: Passed unchanged to ``run_fol_evaluator_one``.
        dataset: DataFrame with a "narrative" column; its index supplies the narrative ids.

    Returns:
        DataFrame with the columns "narrative_id" and "unsat_formulas".

    Raises:
        RuntimeError: If a narrative still fails on the last attempt. The original
            error is chained as ``__cause__``.
    """
    max_attempts = 3
    out_data = []
    for index in dataset.index.tolist():
        print(f"Start processing {index}")
        for attempt in range(1, max_attempts + 1):
            try:
                all_unsat_formulas = run_fol_evaluator_one(model_info, dataset.loc[index, "narrative"], index)
                break
            except Exception as e:
                print(f"Error processing narrative {index}: {e}")
                if attempt == max_attempts:
                    print(f"Failed to process narrative {index} after multiple attempts.")
                    # Raise instead of calling exit(): this function runs inside a worker
                    # thread, and a regular exception reaches the awaiting caller with
                    # the original error attached.
                    raise RuntimeError(
                        f"Failed to process narrative {index} after {max_attempts} attempts"
                    ) from e
        out_data.append({"narrative_id": index, "unsat_formulas": all_unsat_formulas})
        # NOTE: all_unsat_formulas is the joined string returned by run_fol_evaluator_one,
        # so len() reports its character count here, not the number of formulas.
        print(f"Processed narrative {index}: Unsat formulas = {len(all_unsat_formulas)}")
    return pd.DataFrame(out_data)

async def run_fol_evaluator_async(model_info, dataset, out_dir, chunks=-1):
    """Run the FOL evaluator for narratives still missing from ``out_dir`` and save all results.

    If ``out_dir`` exists it is read as CSV and every row of ``dataset`` whose
    index already appears in its "narrative_id" column is skipped, so a run
    can be resumed. The remaining rows are split into chunks, each chunk is
    handed to ``run_fol_evaluator_per_chunk`` through ``asyncio.to_thread``,
    and the new results are merged with the existing ones, sorted by
    "narrative_id" and written back to ``out_dir``.

    Args:
        model_info: Passed unchanged to ``run_fol_evaluator_per_chunk``.
        dataset: DataFrame indexed by narrative id.
        out_dir: Path of the output CSV file (a file path, despite the name).
        chunks: Requested number of chunks; any non-positive value (default -1)
            means one narrative per chunk.
    """
    if os.path.exists(out_dir):
        existing_data = pd.read_csv(out_dir)
        dataset = dataset[~dataset.index.isin(existing_data["narrative_id"].values)]
    else:
        existing_data = None

    # Nothing left to evaluate. Without this guard pd.concat() below receives an
    # empty list and raises ValueError when a finished run is started again.
    if len(dataset) == 0:
        return

    # Floor division may yield 0 when more chunks than rows are requested;
    # range() rejects a step of 0, so fall back to one row per chunk.
    chunk_size = max(1, len(dataset) // chunks) if chunks > 0 else 1

    tasks = []
    for i in range(0, len(dataset), chunk_size):
        end_index = min(i + chunk_size, len(dataset))
        chunk = dataset.iloc[i:end_index].copy()
        tasks.append(asyncio.to_thread(run_fol_evaluator_per_chunk, model_info, chunk))

    results_df = await asyncio.gather(*tasks)
    complete_df = pd.concat(results_df, ignore_index=True)
    if existing_data is not None:
        complete_df = pd.concat([existing_data, complete_df], ignore_index=True)
    complete_df.sort_values(by='narrative_id').to_csv(out_dir, index=False)

async def run_fol_evaluator_with_model(model_name, input_dir, prefix, limit=5):
    """Run the FOL evaluation for one model over a CSV of stories.

    Args:
        model_name: Name handed to ``ModelInfo``; also part of the output file name.
        input_dir: Path of the input CSV file; its first column becomes the index.
        prefix: Prefix of the output file name.
        limit: Number of leading rows to evaluate. The default of 5 matches the
            previously hard-coded truncation; pass ``None`` to evaluate every row.

    The result is written next to the input file as
    "<prefix>_fol_output_<model_name>.csv".
    """
    input_dataset = pd.read_csv(input_dir, index_col=0)
    # iloc[:None] selects all rows, so limit=None disables the truncation.
    input_dataset = input_dataset.iloc[:limit]
    out_dir = os.path.join(os.path.dirname(input_dir), f"{prefix}_fol_output_{model_name}.csv")
    model_info = ModelInfo(model_name)
    await run_fol_evaluator_async(model_info, input_dataset, out_dir, chunks=-1)

In [14]:
await run_fol_evaluator_with_model("gemini-structured", os.path.join(data_dir, "hanna_stories.csv"), "hanna")

Start processing 0Start processing 1

Start processing 2
Start processing 3
Start processing 4
Timeline Maker Response:
"{
  "reasoning": "The story describes a scene where Tyler visits his daughter Valerie in the ward, and discusses his failed appeal to become a Donner with his wife Roni. I will define the timeline based on the events and implied time frame in the story.",
  "timeline_definition": [
    {
      "time_point_name": "T0",
      "time_point_description": "The time before Tyler entered the ward."
    },
    {
      "time_point_name": "T1",
      "time_point_description": "The time when Tyler entered the ward and found Valerie asleep."
    },
    {
      "time_point_name": "T2",
      "time_point_description": "The time when Roni and Tyler were talking about Valerie's condition and Tyler's failed appeal."
    }
  ]
}"
Timeline Maker Response:
"{
  "reasoning": "The story describes the final moments of Mathemer de Troy, a 205-year-old man, as he reflects on his life and his 

In [3]:
from outline_evaluator import OutlineEvaluationSession

def run_outline_evaluator_one(model_info, input_narrative, index):
    """Collect the outline scores of every section of one narrative.

    Args:
        model_info: Passed to ``OutlineEvaluationSession``.
        input_narrative: Full story text; it is split with ``divide_long_narratives``.
        index: Narrative id, used in progress messages only.

    Returns:
        Dict with the keys "abruptness" and "predicability", each holding the
        list of per-section values in story order.
    """
    outline_session = OutlineEvaluationSession(
        model_info,
        None,
        prompt_dir=prompt_dir,
        schema_dir=schema_dir,
        input_template_dir=input_template_dir,
    )
    sections = divide_long_narratives(input_narrative)
    section_total = len(sections)
    score_names = ("abruptness", "predicability")
    all_scores = {score_name: [] for score_name in score_names}
    for position, section in enumerate(sections, start=1):
        new_scores = outline_session.append_conversation(section)
        for score_name in score_names:
            all_scores[score_name].append(new_scores[score_name])
        print(f"Processed section {position}/{section_total} of narrative {index}: Scores = {new_scores}")
    return all_scores

def run_outline_evaluator_per_chunk(model_info, dataset):
    """Run ``run_outline_evaluator_one`` for every narrative of ``dataset``, retrying on errors.

    Each narrative is attempted up to three times.

    Args:
        model_info: Passed unchanged to ``run_outline_evaluator_one``.
        dataset: DataFrame with a "narrative" column; its index supplies the narrative ids.

    Returns:
        DataFrame with the columns "narrative_id" and "outline_scores"; the
        scores are stored as the ``str()`` of the score dict.

    Raises:
        RuntimeError: If a narrative still fails on the last attempt. The original
            error is chained as ``__cause__``.
    """
    max_attempts = 3
    out_data = []
    for index in dataset.index.tolist():
        print(f"Start processing {index}")
        for attempt in range(1, max_attempts + 1):
            try:
                all_scores = run_outline_evaluator_one(model_info, dataset.loc[index, "narrative"], index)
                break
            except Exception as e:
                print(f"Error processing narrative {index}: {e}")
                if attempt == max_attempts:
                    print(f"Failed to process narrative {index} after multiple attempts.")
                    # Raise instead of calling exit(): this function runs inside a worker
                    # thread, and a regular exception reaches the awaiting caller with
                    # the original error attached.
                    raise RuntimeError(
                        f"Failed to process narrative {index} after {max_attempts} attempts"
                    ) from e
        out_data.append({"narrative_id": index, "outline_scores": str(all_scores)})
        print(f"Processed narrative {index}: Scores = {all_scores}")
    return pd.DataFrame(out_data)

async def run_outline_evaluator_async(model_info, dataset, out_dir, chunks=-1):
    """Run the outline evaluator for narratives still missing from ``out_dir`` and save all results.

    If ``out_dir`` exists it is read as CSV and every row of ``dataset`` whose
    index already appears in its "narrative_id" column is skipped, so a run
    can be resumed. The remaining rows are split into chunks, each chunk is
    handed to ``run_outline_evaluator_per_chunk`` through ``asyncio.to_thread``,
    and the new results are merged with the existing ones, sorted by
    "narrative_id" and written back to ``out_dir``.

    Args:
        model_info: Passed unchanged to ``run_outline_evaluator_per_chunk``.
        dataset: DataFrame indexed by narrative id.
        out_dir: Path of the output CSV file (a file path, despite the name).
        chunks: Requested number of chunks; any non-positive value (default -1)
            means one narrative per chunk.
    """
    if os.path.exists(out_dir):
        existing_data = pd.read_csv(out_dir)
        dataset = dataset[~dataset.index.isin(existing_data["narrative_id"].values)]
    else:
        existing_data = None

    # Nothing left to evaluate. Without this guard pd.concat() below receives an
    # empty list and raises ValueError when a finished run is started again.
    if len(dataset) == 0:
        return

    # Floor division may yield 0 when more chunks than rows are requested;
    # range() rejects a step of 0, so fall back to one row per chunk.
    chunk_size = max(1, len(dataset) // chunks) if chunks > 0 else 1

    tasks = []
    for i in range(0, len(dataset), chunk_size):
        end_index = min(i + chunk_size, len(dataset))
        chunk = dataset.iloc[i:end_index].copy()
        tasks.append(asyncio.to_thread(run_outline_evaluator_per_chunk, model_info, chunk))

    results_df = await asyncio.gather(*tasks)
    complete_df = pd.concat(results_df, ignore_index=True)
    if existing_data is not None:
        complete_df = pd.concat([existing_data, complete_df], ignore_index=True)
    complete_df.sort_values(by='narrative_id').to_csv(out_dir, index=False)

async def run_outline_evaluator_with_model(model_name, input_dir, prefix):
    """Run the outline evaluation for one model over a CSV of stories.

    Args:
        model_name: Name handed to ``ModelInfo``; also part of the output file name.
        input_dir: Path of the input CSV file; its first column becomes the index.
        prefix: Prefix of the output file name.

    The result is written next to the input file as
    "<prefix>_outline_output_{model_name}.csv" with the placeholders filled in.
    """
    stories = pd.read_csv(input_dir, index_col=0)
    output_name = f"{prefix}_outline_output_{model_name}.csv"
    output_path = os.path.join(os.path.dirname(input_dir), output_name)
    await run_outline_evaluator_async(ModelInfo(model_name), stories, output_path, chunks=-1)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
await run_outline_evaluator_with_model("gemini-structured", os.path.join(data_dir, "hanna_stories.csv"), "hanna")

Start processing 0
Start processing 1
Start processing 2
Start processing 3
Start processing 4
Start processing 5
Start processing 6
Start processing 7
Start processing 8
Start processing 9
Start processing 10
Start processing 11
Start processing 12
Start processing 13
Start processing 14
Start processing 15
Start processing 16
Start processing 17
Start processing 18
Start processing 19
Processed section 1/5 of narrative 7: Scores = {'abruptness': 0, 'predicability': 1}
Processed section 1/2 of narrative 1: Scores = {'abruptness': 0, 'predicability': 1}
Processed section 1/2 of narrative 6: Scores = {'abruptness': 0, 'predicability': 1}
Processed section 1/2 of narrative 0: Scores = {'abruptness': 0, 'predicability': 1}
Processed section 1/4 of narrative 15: Scores = {'abruptness': 0, 'predicability': 1}
Processed section 1/5 of narrative 2: Scores = {'abruptness': 0, 'predicability': 1}
Processed section 1/2 of narrative 9: Scores = {'abruptness': 0, 'predicability': 1}
Processed sect

In [3]:
from character_evaluator import CharacterEvaluationSession

def run_character_evaluator_one(model_info, input_narrative, index):
    """Collect the per-character scores of every section of one narrative.

    Args:
        model_info: Passed to ``CharacterEvaluationSession``.
        input_narrative: Full story text; it is split with ``divide_long_narratives``.
        index: Narrative id, used in progress messages only.

    Returns:
        Dict keyed by character name. Each value is a dict with the lists
        "self_integrity" and "action_integrity", holding one entry for every
        section in which that character received a score.
    """
    character_session = CharacterEvaluationSession(
        model_info,
        prompt_dir=prompt_dir,
        schema_dir=schema_dir,
        input_template_dir=input_template_dir,
    )
    sections = divide_long_narratives(input_narrative)
    section_total = len(sections)
    character_scores = {}
    for position, section in enumerate(sections, start=1):
        new_scores = character_session.append_conversation(section)
        for name, score in new_scores.items():
            # Create the per-character record the first time a name shows up.
            history = character_scores.setdefault(name, {"self_integrity": [], "action_integrity": []})
            history["self_integrity"].append(score["self_integrity"])
            history["action_integrity"].append(score["action_integrity"])
        print(f"Processed section {position}/{section_total} of narrative {index}: Scores = {new_scores}")
    return character_scores

def run_character_evaluator_per_chunk(model_info, dataset):
    """Run ``run_character_evaluator_one`` for every narrative of ``dataset``, retrying on errors.

    Each narrative is attempted up to three times.

    Args:
        model_info: Passed unchanged to ``run_character_evaluator_one``.
        dataset: DataFrame with a "narrative" column; its index supplies the narrative ids.

    Returns:
        DataFrame with the columns "narrative_id" and "character_scores"; the
        scores are stored as an indented JSON string.

    Raises:
        RuntimeError: If a narrative still fails on the last attempt. The original
            error is chained as ``__cause__``.
    """
    max_attempts = 3
    out_data = []
    for index in dataset.index.tolist():
        print(f"Start processing {index}")
        for attempt in range(1, max_attempts + 1):
            try:
                character_scores = run_character_evaluator_one(model_info, dataset.loc[index, "narrative"], index)
                break
            except Exception as e:
                print(f"Error processing narrative {index}: {e}")
                if attempt == max_attempts:
                    print(f"Failed to process narrative {index} after multiple attempts.")
                    # Raise instead of calling exit(): this function runs inside a worker
                    # thread, and a regular exception reaches the awaiting caller with
                    # the original error attached.
                    raise RuntimeError(
                        f"Failed to process narrative {index} after {max_attempts} attempts"
                    ) from e
        out_data.append({"narrative_id": index, "character_scores": json.dumps(character_scores, indent=2)})
        print(f"Processed narrative {index}: Scores = {character_scores}")
    return pd.DataFrame(out_data)

async def run_character_evaluator_async(model_info, dataset, out_dir, chunks=-1):
    """Run the character evaluator for narratives still missing from ``out_dir`` and save all results.

    If ``out_dir`` exists it is read as CSV and every row of ``dataset`` whose
    index already appears in its "narrative_id" column is skipped, so a run
    can be resumed. The remaining rows are split into chunks, each chunk is
    handed to ``run_character_evaluator_per_chunk`` through
    ``asyncio.to_thread``, and the new results are merged with the existing
    ones, sorted by "narrative_id" and written back to ``out_dir``.

    Args:
        model_info: Passed unchanged to ``run_character_evaluator_per_chunk``.
        dataset: DataFrame indexed by narrative id.
        out_dir: Path of the output CSV file (a file path, despite the name).
        chunks: Requested number of chunks; any non-positive value (default -1)
            means one narrative per chunk.
    """
    if os.path.exists(out_dir):
        existing_data = pd.read_csv(out_dir)
        dataset = dataset[~dataset.index.isin(existing_data["narrative_id"].values)]
    else:
        existing_data = None

    # Nothing left to evaluate. Without this guard pd.concat() below receives an
    # empty list and raises ValueError when a finished run is started again.
    if len(dataset) == 0:
        return

    # Floor division may yield 0 when more chunks than rows are requested;
    # range() rejects a step of 0, so fall back to one row per chunk.
    chunk_size = max(1, len(dataset) // chunks) if chunks > 0 else 1

    tasks = []
    for i in range(0, len(dataset), chunk_size):
        end_index = min(i + chunk_size, len(dataset))
        chunk = dataset.iloc[i:end_index].copy()
        tasks.append(asyncio.to_thread(run_character_evaluator_per_chunk, model_info, chunk))

    results_df = await asyncio.gather(*tasks)
    complete_df = pd.concat(results_df, ignore_index=True)
    if existing_data is not None:
        complete_df = pd.concat([existing_data, complete_df], ignore_index=True)
    complete_df.sort_values(by='narrative_id').to_csv(out_dir, index=False)
    
async def run_character_evaluator_with_model(model_name, input_dir, prefix):
    """Run the character evaluation for one model over a CSV of stories.

    Args:
        model_name: Name handed to ``ModelInfo``; also part of the output file name.
        input_dir: Path of the input CSV file; its first column becomes the index.
        prefix: Prefix of the output file name.

    The result is written next to the input file as
    "<prefix>_character_output_<model_name>.csv".
    """
    stories = pd.read_csv(input_dir, index_col=0)
    output_name = f"{prefix}_character_output_{model_name}.csv"
    output_path = os.path.join(os.path.dirname(input_dir), output_name)
    await run_character_evaluator_async(ModelInfo(model_name), stories, output_path, chunks=-1)

In [4]:
await run_character_evaluator_with_model("gemini-structured", os.path.join(data_dir, "hanna_stories.csv"), "hanna")

Start processing 0
Start processing 1
Start processing 2
Start processing 3
Start processing 4
Start processing 5
Start processing 6
Start processing 7
Start processing 8
Start processing 9
Start processing 10
Start processing 11
Start processing 12
Start processing 13
Start processing 14
Start processing 15
Start processing 16
Start processing 17
Start processing 18
Start processing 19
Processed section 1/4 of narrative 15: Scores = {'The Narrator': {'self_integrity': 1.0, 'action_integrity': 1.0}}
Processed section 1/2 of narrative 6: Scores = {'The Narrator': {'self_integrity': 1.0, 'action_integrity': 1.0}}
Processed section 1/5 of narrative 8: Scores = {'Ray': {'self_integrity': 1.0, 'action_integrity': 1.0}}
Processed section 1/2 of narrative 10: Scores = {'Groucho': {'self_integrity': 1.0, 'action_integrity': 1.0}}
Processed section 1/2 of narrative 4: Scores = {'The Narrator': {'self_integrity': 1.0, 'action_integrity': 1.0}}
Processed section 1/4 of narrative 11: Scores = {'Ji

In [None]:
def combined_evaluator(model_info, input_narrative, outline_result, character_result, fol_result):
    """Request a consistency score for a narrative, given the three evaluator results.

    The "consistency_evaluator_combined" input template is filled with the
    narrative and the outline, character and logical evaluator results, then
    sent through ``get_structured_response`` with the schema key of the same
    name. The filled prompt and the text response are printed.

    Args:
        model_info: Object exposing ``output_format()``, ``chatbot()`` and ``model()``.
        input_narrative: Story text inserted as ``target_story``.
        outline_result: Value inserted as ``outline_evaluator_result``.
        character_result: Value inserted as ``character_evaluator_result``.
        fol_result: Value inserted as ``logical_evaluator_result``.

    Returns:
        The value stored under "consistency" in the structured response.

    Raises:
        ValueError: If ``model_info.output_format()`` is not "json".
    """
    template = input_template_loader.load("consistency_evaluator_combined")
    prompt = template.format(
        target_story=input_narrative,
        outline_evaluator_result=outline_result,
        character_evaluator_result=character_result,
        logical_evaluator_result=fol_result,
    )
    print(prompt)

    # Only structured JSON output is supported.
    if model_info.output_format() != "json":
        raise ValueError(f"Unsupported output format: {model_info.output_format()}")

    chatbot_factory = model_info.chatbot()
    bot = chatbot_factory(model_info.model(), "", schema_loader)
    raw_text, parsed_response = bot.get_structured_response(
        prompt,
        schema_key="consistency_evaluator_combined",
        record=False,
        temperature=0,
    )
    print(raw_text)
    return parsed_response["consistency"]

def run_combined_evaluator_per_chunk(model_info, dataset):
    """Score every narrative of ``dataset`` with ``combined_evaluator``.

    Args:
        model_info: Passed unchanged to ``combined_evaluator``.
        dataset: DataFrame with the columns "narrative", "outline_scores",
            "character_scores" and "unsat_formulas"; its index supplies the
            narrative ids.

    Returns:
        DataFrame with the columns "narrative_id" and "consistency".
    """
    input_columns = ("narrative", "outline_scores", "character_scores", "unsat_formulas")
    records = []
    for index in dataset.index.tolist():
        # Column order matches the positional parameters of combined_evaluator.
        evaluator_inputs = [dataset.loc[index, column] for column in input_columns]
        consistency_score = combined_evaluator(model_info, *evaluator_inputs)
        records.append({"narrative_id": index, "consistency": consistency_score})
        print(f"Processed narrative {index}: Consistency score = {consistency_score}")

    return pd.DataFrame(records)

async def run_combined_evaluator_async(model_info, dataset, out_dir, chunks=-1):
    """Run the combined evaluator for narratives still missing from ``out_dir`` and save all results.

    If ``out_dir`` exists it is read as CSV and every row of ``dataset`` whose
    index already appears in its "narrative_id" column is skipped, so a run
    can be resumed. The remaining rows are split into chunks, each chunk is
    handed to ``run_combined_evaluator_per_chunk`` through
    ``asyncio.to_thread``, and the new results are merged with the existing
    ones, sorted by "narrative_id" and written back to ``out_dir``.

    Args:
        model_info: Passed unchanged to ``run_combined_evaluator_per_chunk``.
        dataset: DataFrame indexed by narrative id.
        out_dir: Path of the output CSV file (a file path, despite the name).
        chunks: Requested number of chunks; any non-positive value (default -1)
            means one narrative per chunk.
    """
    if os.path.exists(out_dir):
        existing_data = pd.read_csv(out_dir)
        dataset = dataset[~dataset.index.isin(existing_data["narrative_id"].values)]
    else:
        existing_data = None

    # Nothing left to evaluate. Without this guard pd.concat() below receives an
    # empty list and raises ValueError when a finished run is started again.
    if len(dataset) == 0:
        return

    # Floor division may yield 0 when more chunks than rows are requested;
    # range() rejects a step of 0, so fall back to one row per chunk.
    chunk_size = max(1, len(dataset) // chunks) if chunks > 0 else 1

    tasks = []
    for i in range(0, len(dataset), chunk_size):
        end_index = min(i + chunk_size, len(dataset))
        chunk = dataset.iloc[i:end_index].copy()
        tasks.append(asyncio.to_thread(run_combined_evaluator_per_chunk, model_info, chunk))

    results_df = await asyncio.gather(*tasks)
    complete_df = pd.concat(results_df, ignore_index=True)
    if existing_data is not None:
        complete_df = pd.concat([existing_data, complete_df], ignore_index=True)
    complete_df.sort_values(by='narrative_id').to_csv(out_dir, index=False)
    
async def run_combined_evaluator_with_model(model_name, input_dir, prefix, limit=1):
    """Run the combined evaluation for one model, using the saved evaluator outputs.

    The stories CSV is left-joined on its index with the outline, character
    and FOL output files that sit next to it
    ("<prefix>_<kind>_output_<model_name>.csv"). The joined rows are then
    evaluated and the result is written to
    "<prefix>_combined_output_<model_name>.csv" in the same folder.

    Args:
        model_name: Name handed to ``ModelInfo``; also part of all file names.
        input_dir: Path of the input CSV file; its first column becomes the index.
        prefix: Prefix of the evaluator and output file names.
        limit: Number of leading rows to evaluate. The default of 1 matches the
            previously hard-coded truncation; pass ``None`` to evaluate every row.
    """
    base_dir = os.path.dirname(input_dir)
    input_dataset = pd.read_csv(input_dir, index_col=0)

    # Attach each evaluator's saved output; the first CSV column is used as the join index.
    for evaluator_kind in ("outline", "character", "fol"):
        evaluator_path = os.path.join(base_dir, f"{prefix}_{evaluator_kind}_output_{model_name}.csv")
        evaluator_df = pd.read_csv(evaluator_path, index_col=0)
        input_dataset = input_dataset.merge(evaluator_df, left_index=True, right_index=True, how='left')

    out_dir = os.path.join(base_dir, f"{prefix}_combined_output_{model_name}.csv")
    model_info = ModelInfo(model_name)

    # iloc[:None] selects all rows, so limit=None disables the truncation.
    input_dataset = input_dataset.iloc[:limit]
    await run_combined_evaluator_async(model_info, input_dataset, out_dir, chunks=-1)

In [4]:
await run_combined_evaluator_with_model("gemini-structured", os.path.join(data_dir, "hanna_stories.csv"), "hanna")

Processed narrative 1: Consistency score = 4Processed narrative 0: Consistency score = 4

Processed narrative 8: Consistency score = 4
Processed narrative 10: Consistency score = 5
Processed narrative 5: Consistency score = 4
Processed narrative 13: Consistency score = 5
Processed narrative 17: Consistency score = 4
Processed narrative 15: Consistency score = 4
Processed narrative 6: Consistency score = 4
Processed narrative 9: Consistency score = 2
Processed narrative 7: Consistency score = 4
Processed narrative 16: Consistency score = 2
Processed narrative 18: Consistency score = 4
Processed narrative 14: Consistency score = 5
Processed narrative 2: Consistency score = 4
Processed narrative 19: Consistency score = 4
Processed narrative 4: Consistency score = 4
Processed narrative 3: Consistency score = 4
Processed narrative 11: Consistency score = 4
Processed narrative 12: Consistency score = 4
Processed narrative 26: Consistency score = 4
Processed narrative 21: Consistency score = 