In [1]:
import os
import pandas as pd
import asyncio
import json


from config import ModelInfo
from utils.loaders import PromptLoader, SchemaLoader, InputTemplateLoader

# Root of the project's `dev` folder. The original machine-specific location stays
# as the default so existing runs are unaffected; set ELLMRPCTFVIS_DEV_DIR to run
# the notebook from another checkout without editing this cell.
cur_dir = os.environ.get(
    "ELLMRPCTFVIS_DEV_DIR",
    "C:/Users/Shavius/Documents/Uni/Year 4/Project/ELLMRPCTFVIS/dev",
)

# Each resource folder is resolved under `cur_dir` and handed to its loader.
prompt_dir = os.path.join(cur_dir, 'prompts')
prompt_loader = PromptLoader(prompt_dir)
schema_dir = os.path.join(cur_dir, 'schemas')
schema_loader = SchemaLoader(schema_dir)
input_template_dir = os.path.join(cur_dir, 'input_templates')
input_template_loader = InputTemplateLoader(input_template_dir)

# Folder holding the input stories, annotations and every evaluator's CSV output.
data_dir = os.path.join(cur_dir, 'test_data')

In [2]:
# Utility functions for narrative processing
def wrap_narrative(narrative):
    """Return `narrative` with a hidden-user marker appended when it has none.

    Text that already contains "(User:" is returned unchanged; otherwise a
    newline followed by "(User:[hidden])" is appended.
    """
    has_user_marker = "(User:" in narrative
    return narrative if has_user_marker else narrative + "\n(User:[hidden])"

def divide_long_narratives(narrative, threshold=1000, section_length=800):
    """Split a narrative into sentence-aligned sections, each passed through `wrap_narrative`.

    Sections are cut at the first ". " found at least `step` characters after the
    current start. `step` is `section_length` while more than two section lengths
    of text remain, otherwise half of the remaining text. At least one cut is
    always attempted, so short narratives are split roughly in half when a
    sentence boundary exists; cutting continues while more than `threshold`
    characters remain.

    Args:
        narrative: Full narrative text.
        threshold: Remaining length above which another cut is attempted.
        section_length: Minimum length of a section while plenty of text remains.

    Returns:
        A list with at least one wrapped section. An empty narrative yields a
        single section containing only the hidden-user marker.
    """
    sections = []
    start = 0
    total_length = len(narrative)

    # `not sections` forces one pass even when the narrative is below `threshold`.
    while not sections or total_length - start > threshold:
        remaining = total_length - start
        if remaining > section_length * 2:
            step_length = section_length
        else:
            # Half of what is left, measured from `start`. The previous code added
            # `start` here and again in the search offset, which pushed the search
            # past the end and left the rest of the text as one oversized section.
            step_length = remaining // 2
        fullstop_index = narrative.find('. ', start + step_length)
        if fullstop_index == -1:
            break
        sections.append(wrap_narrative(narrative[start:fullstop_index + 1].strip()))
        start = fullstop_index + 2

    tail = narrative[start:].strip()
    # Skip a whitespace-only remainder, but always return at least one section.
    if tail or not sections:
        sections.append(wrap_narrative(tail))

    return sections

In [3]:

def baseline_evaluator(model_info, input_narrative):
    """Ask the model for a consistency score for one narrative, with no extra evidence.

    The narrative is inserted into the "consistency_evaluator_baseline" input
    template and sent as a structured request at temperature 0.

    Args:
        model_info: Object providing `output_format()`, `chatbot()` and `model()`.
        input_narrative: Story text substituted for `target_story` in the template.

    Returns:
        The "consistency" field of the structured response.

    Raises:
        ValueError: If the model's output format is not "json".
    """
    template = input_template_loader.load("consistency_evaluator_baseline")
    prompt = template.format(target_story=input_narrative)

    if model_info.output_format() != "json":
        raise ValueError(f"Unsupported output format: {model_info.output_format()}")

    chatbot_cls = model_info.chatbot()
    bot = chatbot_cls(model_info.model(), "", schema_loader)
    # Only the structured part of the reply is used; the free text is discarded.
    _, structured = bot.get_structured_response(
        prompt,
        schema_key="consistency_evaluator_baseline",
        record=False,
        temperature=0,
    )
    return structured["consistency"]

def run_baseline_per_chunk(model_info, dataset):
    """Score every narrative in `dataset` with `baseline_evaluator`.

    Args:
        model_info: Passed through to `baseline_evaluator`.
        dataset: DataFrame with a "narrative" column; its index supplies the ids.

    Returns:
        DataFrame with one row per narrative and columns "narrative_id" and
        "consistency".
    """
    records = []
    for narrative_id in dataset.index.tolist():
        story = dataset.loc[narrative_id, "narrative"]
        score = baseline_evaluator(model_info, story)
        records.append({"narrative_id": narrative_id, "consistency": score})
        print(f"Processed narrative {narrative_id}: Consistency score = {score}")
    return pd.DataFrame(records)

async def run_baseline_async(model_info, dataset, out_dir, chunks=-1):
    """Run the baseline evaluator over `dataset` in worker threads and save the scores.

    Narratives whose id already appears in the CSV at `out_dir` are skipped, so
    an interrupted run can be resumed. New results are merged with existing
    ones, sorted by "narrative_id" and written back to `out_dir`.

    Args:
        model_info: Passed through to `run_baseline_per_chunk`.
        dataset: DataFrame indexed by narrative id.
        out_dir: Path of the output CSV file (read if present, then overwritten).
        chunks: Number of roughly equal chunks to process concurrently. -1 (or
            any non-positive value) processes one narrative per task.

    Returns:
        None. Returns early, without writing, when nothing is left to process.
    """
    existing_data = None
    if os.path.exists(out_dir):
        existing_data = pd.read_csv(out_dir)
        dataset = dataset[~dataset.index.isin(existing_data["narrative_id"].values)]

    # Checked whether or not an output file exists: with no tasks,
    # pd.concat([]) below would raise.
    if dataset.empty:
        print("No new narratives to process.")
        return

    if chunks > 0:
        # Ceiling division gives at most `chunks` chunks; max() keeps the range
        # step valid when `chunks` exceeds the number of narratives.
        chunk_size = max(1, -(-len(dataset) // chunks))
    else:
        chunk_size = 1

    tasks = [
        asyncio.to_thread(run_baseline_per_chunk, model_info, dataset.iloc[i:i + chunk_size].copy())
        for i in range(0, len(dataset), chunk_size)
    ]

    results_df = await asyncio.gather(*tasks)
    complete_df = pd.concat(results_df, ignore_index=True)
    if existing_data is not None:
        complete_df = pd.concat([existing_data, complete_df], ignore_index=True)
    complete_df.sort_values(by='narrative_id').to_csv(out_dir, index=False)

async def run_baseline_with_model(model_name, input_dir, prefix):
    """Run the baseline evaluator for one model over the stories CSV at `input_dir`.

    Results go to "<prefix>_baseline_output_<model_name>.csv" in the same folder
    as the input file, one narrative per task.
    """
    stories = pd.read_csv(input_dir, index_col=0)
    output_name = f"{prefix}_baseline_output_{model_name}.csv"
    output_path = os.path.join(os.path.dirname(input_dir), output_name)
    await run_baseline_async(ModelInfo(model_name), stories, output_path, chunks=-1)

In [4]:
from fol_evaluator import FOLEvaluationSession
from timeline_maker import TimelineMakerSession

def run_fol_evaluator_one(model_info, input_narrative, index):
    """Feed one narrative, section by section, through the timeline and FOL sessions.

    Each section is appended to the timeline session first; the resulting
    timeline is then passed to the FOL session together with the same section.

    Args:
        model_info: Passed to both session constructors.
        input_narrative: Full narrative text; split with `divide_long_narratives`.
        index: Narrative id, used only in progress messages.

    Returns:
        The set of all formulas returned by the FOL session across sections.
    """
    session_kwargs = dict(prompt_dir=prompt_dir, schema_dir=schema_dir, input_template_dir=input_template_dir)
    timeline_session = TimelineMakerSession(model_info, **session_kwargs)
    fol_session = FOLEvaluationSession(model_info, **session_kwargs)

    sections = divide_long_narratives(input_narrative)
    section_count = len(sections)
    collected = set()
    for position, section in enumerate(sections, start=1):
        timeline_session.append_conversation(section)
        timeline = timeline_session.get_timeline()
        collected.update(fol_session.append_conversation(section, new_timeline=timeline))
        print(f"Processed section of narrative {position}/{section_count} of narrative {index}: Unsat formulas = {len(collected)}")
    return collected

def run_fol_evaluator_per_chunk(model_info, dataset):
    """Run the FOL evaluator on every narrative in `dataset`, retrying on failure.

    Each narrative gets up to three attempts. When the last attempt fails a
    RuntimeError is raised, chained to the underlying exception, so the failure
    reaches whoever awaits this chunk instead of calling `exit(1)` from inside
    the evaluation loop.

    Args:
        model_info: Passed through to `run_fol_evaluator_one`.
        dataset: DataFrame with a "narrative" column; its index supplies the ids.

    Returns:
        DataFrame with columns "narrative_id" and "unsat_formulas". The formulas
        are joined with blank lines in sorted order, or "No Output" when the
        evaluator returned none.

    Raises:
        RuntimeError: If a narrative still fails after the final attempt.
    """
    max_attempts = 3
    out_data = []
    for index in dataset.index.tolist():
        print(f"Start processing {index}")
        for attempts_left in range(max_attempts - 1, -1, -1):
            try:
                all_unsat_formulas = run_fol_evaluator_one(model_info, dataset.loc[index, "narrative"], index)
                break
            except Exception as e:
                print(f"Error processing narrative {index}: {e}")
                if attempts_left == 0:
                    print(f"Failed to process narrative {index} after multiple attempts.")
                    raise RuntimeError(
                        f"Failed to process narrative {index} after {max_attempts} attempts."
                    ) from e
        # Sorted so the saved text does not depend on set iteration order.
        all_unsat_formulas_str = "\n\n".join(sorted(all_unsat_formulas)) if all_unsat_formulas else "No Output"
        out_data.append({"narrative_id": index, "unsat_formulas": all_unsat_formulas_str})
        print(f"Processed narrative {index}: Unsat formulas = {len(all_unsat_formulas)}")
    return pd.DataFrame(out_data)

async def run_fol_evaluator_async(model_info, dataset, out_dir, chunks=-1):
    """Run the FOL evaluator over `dataset` in worker threads and save the results.

    Narratives whose id already appears in the CSV at `out_dir` are skipped, so
    an interrupted run can be resumed. New results are merged with existing
    ones, sorted by "narrative_id" and written back to `out_dir`.

    Args:
        model_info: Passed through to `run_fol_evaluator_per_chunk`.
        dataset: DataFrame indexed by narrative id.
        out_dir: Path of the output CSV file (read if present, then overwritten).
        chunks: Number of roughly equal chunks to process concurrently. -1 (or
            any non-positive value) processes one narrative per task.

    Returns:
        None. Returns early, without writing, when nothing is left to process.
    """
    existing_data = None
    if os.path.exists(out_dir):
        existing_data = pd.read_csv(out_dir)
        dataset = dataset[~dataset.index.isin(existing_data["narrative_id"].values)]

    # Checked whether or not an output file exists: with no tasks,
    # pd.concat([]) below would raise.
    if dataset.empty:
        print("No new narratives to process.")
        return

    if chunks > 0:
        # Ceiling division gives at most `chunks` chunks; max() keeps the range
        # step valid when `chunks` exceeds the number of narratives.
        chunk_size = max(1, -(-len(dataset) // chunks))
    else:
        chunk_size = 1

    tasks = [
        asyncio.to_thread(run_fol_evaluator_per_chunk, model_info, dataset.iloc[i:i + chunk_size].copy())
        for i in range(0, len(dataset), chunk_size)
    ]

    results_df = await asyncio.gather(*tasks)
    complete_df = pd.concat(results_df, ignore_index=True)
    if existing_data is not None:
        complete_df = pd.concat([existing_data, complete_df], ignore_index=True)
    complete_df.sort_values(by='narrative_id').to_csv(out_dir, index=False)

async def run_fol_evaluator_with_model(model_name, input_dir, prefix):
    """Run the FOL evaluator for one model over the stories CSV at `input_dir`.

    Results go to "<prefix>_fol_output_<model_name>.csv" in the same folder as
    the input file, one narrative per task.
    """
    stories = pd.read_csv(input_dir, index_col=0)
    output_name = f"{prefix}_fol_output_{model_name}.csv"
    output_path = os.path.join(os.path.dirname(input_dir), output_name)
    await run_fol_evaluator_async(ModelInfo(model_name), stories, output_path, chunks=-1)

In [5]:
from outline_evaluator import OutlineEvaluationSession

def run_outline_evaluator_one(model_info, input_narrative, index):
    """Feed one narrative, section by section, through an outline evaluation session.

    Args:
        model_info: Passed to the session constructor.
        input_narrative: Full narrative text; split with `divide_long_narratives`.
        index: Narrative id, used only in progress messages.

    Returns:
        Dict with keys "abruptness" and "predicability", each a list holding
        the session's score for every section in order.
    """
    outline_session = OutlineEvaluationSession(
        model_info,
        None,
        prompt_dir=prompt_dir,
        schema_dir=schema_dir,
        input_template_dir=input_template_dir,
    )
    sections = divide_long_narratives(input_narrative)
    section_count = len(sections)
    score_keys = ("abruptness", "predicability")
    all_scores = {key: [] for key in score_keys}
    for position, section in enumerate(sections, start=1):
        new_scores = outline_session.append_conversation(section)
        for key in score_keys:
            all_scores[key].append(new_scores[key])
        print(f"Processed section {position}/{section_count} of narrative {index}: Scores = {new_scores}")
    return all_scores

def run_outline_evaluator_per_chunk(model_info, dataset):
    """Run the outline evaluator on every narrative in `dataset`, retrying on failure.

    Each narrative gets up to three attempts. When the last attempt fails a
    RuntimeError is raised, chained to the underlying exception, so the failure
    reaches whoever awaits this chunk instead of calling `exit(1)` from inside
    the evaluation loop.

    Args:
        model_info: Passed through to `run_outline_evaluator_one`.
        dataset: DataFrame with a "narrative" column; its index supplies the ids.

    Returns:
        DataFrame with columns "narrative_id" and "outline_scores"; the scores
        are stored as the `str()` of the per-section score dict.

    Raises:
        RuntimeError: If a narrative still fails after the final attempt.
    """
    max_attempts = 3
    out_data = []
    for index in dataset.index.tolist():
        print(f"Start processing {index}")
        for attempts_left in range(max_attempts - 1, -1, -1):
            try:
                all_scores = run_outline_evaluator_one(model_info, dataset.loc[index, "narrative"], index)
                break
            except Exception as e:
                print(f"Error processing narrative {index}: {e}")
                if attempts_left == 0:
                    print(f"Failed to process narrative {index} after multiple attempts.")
                    raise RuntimeError(
                        f"Failed to process narrative {index} after {max_attempts} attempts."
                    ) from e
        out_data.append({"narrative_id": index, "outline_scores": str(all_scores)})
        print(f"Processed narrative {index}: Scores = {all_scores}")
    return pd.DataFrame(out_data)

async def run_outline_evaluator_async(model_info, dataset, out_dir, chunks=-1):
    """Run the outline evaluator over `dataset` in worker threads and save the results.

    Narratives whose id already appears in the CSV at `out_dir` are skipped, so
    an interrupted run can be resumed. New results are merged with existing
    ones, sorted by "narrative_id" and written back to `out_dir`.

    Args:
        model_info: Passed through to `run_outline_evaluator_per_chunk`.
        dataset: DataFrame indexed by narrative id.
        out_dir: Path of the output CSV file (read if present, then overwritten).
        chunks: Number of roughly equal chunks to process concurrently. -1 (or
            any non-positive value) processes one narrative per task.

    Returns:
        None. Returns early, without writing, when nothing is left to process.
    """
    existing_data = None
    if os.path.exists(out_dir):
        existing_data = pd.read_csv(out_dir)
        dataset = dataset[~dataset.index.isin(existing_data["narrative_id"].values)]

    # Checked whether or not an output file exists: with no tasks,
    # pd.concat([]) below would raise.
    if dataset.empty:
        print("No new narratives to process.")
        return

    if chunks > 0:
        # Ceiling division gives at most `chunks` chunks; max() keeps the range
        # step valid when `chunks` exceeds the number of narratives.
        chunk_size = max(1, -(-len(dataset) // chunks))
    else:
        chunk_size = 1

    tasks = [
        asyncio.to_thread(run_outline_evaluator_per_chunk, model_info, dataset.iloc[i:i + chunk_size].copy())
        for i in range(0, len(dataset), chunk_size)
    ]

    results_df = await asyncio.gather(*tasks)
    complete_df = pd.concat(results_df, ignore_index=True)
    if existing_data is not None:
        complete_df = pd.concat([existing_data, complete_df], ignore_index=True)
    complete_df.sort_values(by='narrative_id').to_csv(out_dir, index=False)

async def run_outline_evaluator_with_model(model_name, input_dir, prefix):
    """Run the outline evaluator for one model over the stories CSV at `input_dir`.

    Results go to "<prefix>_outline_output_<model_name>.csv" in the same folder
    as the input file, one narrative per task.
    """
    stories = pd.read_csv(input_dir, index_col=0)
    output_name = f"{prefix}_outline_output_{model_name}.csv"
    output_path = os.path.join(os.path.dirname(input_dir), output_name)
    await run_outline_evaluator_async(ModelInfo(model_name), stories, output_path, chunks=-1)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from character_evaluator import CharacterEvaluationSession

def run_character_evaluator_one(model_info, input_narrative, index):
    """Feed one narrative, section by section, through a character evaluation session.

    Args:
        model_info: Passed to the session constructor.
        input_narrative: Full narrative text; split with `divide_long_narratives`.
        index: Narrative id, used only in progress messages.

    Returns:
        Dict mapping each character name returned by the session to
        {"self_integrity": [...], "action_integrity": [...]}, with one entry
        per section in which that character was scored.
    """
    character_session = CharacterEvaluationSession(
        model_info,
        prompt_dir=prompt_dir,
        schema_dir=schema_dir,
        input_template_dir=input_template_dir,
    )
    sections = divide_long_narratives(input_narrative)
    section_count = len(sections)
    character_scores = {}
    for position, section in enumerate(sections, start=1):
        new_scores = character_session.append_conversation(section)
        for name, score in new_scores.items():
            # First sighting of a character creates its two empty score lists.
            entry = character_scores.setdefault(name, {"self_integrity": [], "action_integrity": []})
            entry["self_integrity"].append(score["self_integrity"])
            entry["action_integrity"].append(score["action_integrity"])
        print(f"Processed section {position}/{section_count} of narrative {index}: Scores = {new_scores}")
    return character_scores

def run_character_evaluator_per_chunk(model_info, dataset):
    """Run the character evaluator on every narrative in `dataset`, retrying on failure.

    Each narrative gets up to three attempts. When the last attempt fails a
    RuntimeError is raised, chained to the underlying exception, so the failure
    reaches whoever awaits this chunk instead of calling `exit(1)` from inside
    the evaluation loop.

    Args:
        model_info: Passed through to `run_character_evaluator_one`.
        dataset: DataFrame with a "narrative" column; its index supplies the ids.

    Returns:
        DataFrame with columns "narrative_id" and "character_scores"; the
        scores are stored as indented JSON text.

    Raises:
        RuntimeError: If a narrative still fails after the final attempt.
    """
    max_attempts = 3
    out_data = []
    for index in dataset.index.tolist():
        print(f"Start processing {index}")
        for attempts_left in range(max_attempts - 1, -1, -1):
            try:
                character_scores = run_character_evaluator_one(model_info, dataset.loc[index, "narrative"], index)
                break
            except Exception as e:
                print(f"Error processing narrative {index}: {e}")
                if attempts_left == 0:
                    print(f"Failed to process narrative {index} after multiple attempts.")
                    raise RuntimeError(
                        f"Failed to process narrative {index} after {max_attempts} attempts."
                    ) from e
        out_data.append({"narrative_id": index, "character_scores": json.dumps(character_scores, indent=2)})
        print(f"Processed narrative {index}: Scores = {character_scores}")
    return pd.DataFrame(out_data)

async def run_character_evaluator_async(model_info, dataset, out_dir, chunks=-1):
    """Run the character evaluator over `dataset` in worker threads and save the results.

    Narratives whose id already appears in the CSV at `out_dir` are skipped, so
    an interrupted run can be resumed. New results are merged with existing
    ones, sorted by "narrative_id" and written back to `out_dir`.

    Args:
        model_info: Passed through to `run_character_evaluator_per_chunk`.
        dataset: DataFrame indexed by narrative id.
        out_dir: Path of the output CSV file (read if present, then overwritten).
        chunks: Number of roughly equal chunks to process concurrently. -1 (or
            any non-positive value) processes one narrative per task.

    Returns:
        None. Returns early, without writing, when nothing is left to process.
    """
    existing_data = None
    if os.path.exists(out_dir):
        existing_data = pd.read_csv(out_dir)
        dataset = dataset[~dataset.index.isin(existing_data["narrative_id"].values)]

    # Checked whether or not an output file exists: with no tasks,
    # pd.concat([]) below would raise.
    if dataset.empty:
        print("No new narratives to process.")
        return

    if chunks > 0:
        # Ceiling division gives at most `chunks` chunks; max() keeps the range
        # step valid when `chunks` exceeds the number of narratives.
        chunk_size = max(1, -(-len(dataset) // chunks))
    else:
        chunk_size = 1

    tasks = [
        asyncio.to_thread(run_character_evaluator_per_chunk, model_info, dataset.iloc[i:i + chunk_size].copy())
        for i in range(0, len(dataset), chunk_size)
    ]

    results_df = await asyncio.gather(*tasks)
    complete_df = pd.concat(results_df, ignore_index=True)
    if existing_data is not None:
        complete_df = pd.concat([existing_data, complete_df], ignore_index=True)
    complete_df.sort_values(by='narrative_id').to_csv(out_dir, index=False)
    
async def run_character_evaluator_with_model(model_name, input_dir, prefix):
    """Run the character evaluator for one model over the stories CSV at `input_dir`.

    Results go to "<prefix>_character_output_<model_name>.csv" in the same
    folder as the input file, one narrative per task.
    """
    stories = pd.read_csv(input_dir, index_col=0)
    output_name = f"{prefix}_character_output_{model_name}.csv"
    output_path = os.path.join(os.path.dirname(input_dir), output_name)
    await run_character_evaluator_async(ModelInfo(model_name), stories, output_path, chunks=-1)

In [7]:
def combined_evaluator(model_info, input_narrative, outline_result, character_result, fol_result):
    """Ask the model for a consistency score given the three evaluators' results.

    The narrative and the outline, character and logical (FOL) results are
    inserted into the "consistency_evaluator_combined" input template and sent
    as a structured request at temperature 0.

    Args:
        model_info: Object providing `output_format()`, `chatbot()` and `model()`.
        input_narrative: Story text substituted for `target_story`.
        outline_result: Value substituted for `outline_evaluator_result`.
        character_result: Value substituted for `character_evaluator_result`.
        fol_result: Value substituted for `logical_evaluator_result`.

    Returns:
        The "consistency" field of the structured response.

    Raises:
        ValueError: If the model's output format is not "json".
    """
    template = input_template_loader.load("consistency_evaluator_combined")
    prompt = template.format(
        target_story=input_narrative,
        outline_evaluator_result=outline_result,
        character_evaluator_result=character_result,
        logical_evaluator_result=fol_result,
    )

    if model_info.output_format() != "json":
        raise ValueError(f"Unsupported output format: {model_info.output_format()}")

    chatbot_cls = model_info.chatbot()
    bot = chatbot_cls(model_info.model(), "", schema_loader)
    # Only the structured part of the reply is used; the free text is discarded.
    _, structured = bot.get_structured_response(
        prompt,
        schema_key="consistency_evaluator_combined",
        record=False,
        temperature=0,
    )
    return structured["consistency"]

def run_combined_evaluator_per_chunk(model_info, dataset):
    """Score every narrative in `dataset` with `combined_evaluator`.

    Args:
        model_info: Passed through to `combined_evaluator`.
        dataset: DataFrame with "narrative", "outline_scores",
            "character_scores" and "unsat_formulas" columns; its index
            supplies the ids.

    Returns:
        DataFrame with one row per narrative and columns "narrative_id" and
        "consistency".
    """
    input_columns = ("narrative", "outline_scores", "character_scores", "unsat_formulas")
    records = []
    for narrative_id in dataset.index.tolist():
        # Read in the same order as the evaluator's positional arguments.
        story, outline_result, character_result, fol_result = (
            dataset.loc[narrative_id, column] for column in input_columns
        )
        score = combined_evaluator(model_info, story, outline_result, character_result, fol_result)
        records.append({"narrative_id": narrative_id, "consistency": score})
        print(f"Processed narrative {narrative_id}: Consistency score = {score}")

    return pd.DataFrame(records)

async def run_combined_evaluator_async(model_info, dataset, out_dir, chunks=-1):
    """Run the combined evaluator over `dataset` in worker threads and save the scores.

    Narratives whose id already appears in the CSV at `out_dir` are skipped, so
    an interrupted run can be resumed. New results are merged with existing
    ones, sorted by "narrative_id" and written back to `out_dir`.

    Args:
        model_info: Passed through to `run_combined_evaluator_per_chunk`.
        dataset: DataFrame indexed by narrative id.
        out_dir: Path of the output CSV file (read if present, then overwritten).
        chunks: Number of roughly equal chunks to process concurrently. -1 (or
            any non-positive value) processes one narrative per task.

    Returns:
        None. Returns early, without writing, when nothing is left to process.
    """
    existing_data = None
    if os.path.exists(out_dir):
        existing_data = pd.read_csv(out_dir)
        dataset = dataset[~dataset.index.isin(existing_data["narrative_id"].values)]

    # Checked whether or not an output file exists: with no tasks,
    # pd.concat([]) below would raise.
    if dataset.empty:
        print("No new narratives to process.")
        return

    if chunks > 0:
        # Ceiling division gives at most `chunks` chunks; max() keeps the range
        # step valid when `chunks` exceeds the number of narratives.
        chunk_size = max(1, -(-len(dataset) // chunks))
    else:
        chunk_size = 1

    tasks = [
        asyncio.to_thread(run_combined_evaluator_per_chunk, model_info, dataset.iloc[i:i + chunk_size].copy())
        for i in range(0, len(dataset), chunk_size)
    ]

    results_df = await asyncio.gather(*tasks)
    complete_df = pd.concat(results_df, ignore_index=True)
    if existing_data is not None:
        complete_df = pd.concat([existing_data, complete_df], ignore_index=True)
    complete_df.sort_values(by='narrative_id').to_csv(out_dir, index=False)
    
async def run_combined_evaluator_with_model(model_name, input_dir, prefix):
    """Run the combined evaluator for one model over the stories CSV at `input_dir`.

    The outline, character and FOL output CSVs for the same prefix and model
    are read from the input file's folder and left-merged onto the stories by
    index, in that order. Results go to
    "<prefix>_combined_output_<model_name>.csv" in the same folder.
    """
    base_dir = os.path.dirname(input_dir)
    merged = pd.read_csv(input_dir, index_col=0)

    for evaluator in ("outline", "character", "fol"):
        result_path = os.path.join(base_dir, f"{prefix}_{evaluator}_output_{model_name}.csv")
        # The first column of each output file becomes the index to merge on.
        evaluator_df = pd.read_csv(result_path, index_col=0)
        merged = merged.merge(evaluator_df, left_index=True, right_index=True, how='left')

    output_path = os.path.join(base_dir, f"{prefix}_combined_output_{model_name}.csv")
    await run_combined_evaluator_async(ModelInfo(model_name), merged, output_path, chunks=-1)

In [8]:
import numpy as np

from scipy.stats import spearmanr, kendalltau
from sklearn.utils import resample

def calculate_metric_correlations(human_scores, candidate_base, candidate_refined, n_bootsraps=1000, significance=0.05, rounding=3, random_state=None):
    """Spearman correlation of two candidate metrics with human scores, with bootstrap CIs.

    Rows (human, base, refined) are resampled together with replacement, so
    both candidates are evaluated on the same bootstrap samples. The confidence
    interval is the percentile interval of the bootstrap correlations.

    Args:
        human_scores: Sequence of reference scores.
        candidate_base: Baseline metric scores, same length as `human_scores`.
        candidate_refined: Refined metric scores, same length as `human_scores`.
        n_bootsraps: Number of bootstrap resamples.
        significance: Total tail mass excluded from the interval (0.05 gives a
            95% interval).
        rounding: Decimal places for every returned number.
        random_state: Seed (or other value accepted by `np.random.default_rng`)
            for the resampling. The default `None` draws fresh entropy, so
            results vary between calls as before.

    Returns:
        `(base_corr, (base_lower, base_upper), refined_corr, (refined_lower, refined_upper))`.

    Raises:
        ValueError: If any input is empty or the lengths differ.
    """
    if len(human_scores) == 0 or len(candidate_base) == 0 or len(candidate_refined) == 0:
        raise ValueError("Input series must not be empty.")

    if len(human_scores) != len(candidate_base) or len(human_scores) != len(candidate_refined):
        raise ValueError(f"Input series must have the same length.{len(human_scores)} != {len(candidate_base)}, {len(human_scores)} != {len(candidate_refined)}")

    base_corr, _ = spearmanr(human_scores, candidate_base)
    refined_corr, _ = spearmanr(human_scores, candidate_refined)

    n_observations = len(human_scores)
    data_pairs = np.column_stack((human_scores, candidate_base, candidate_refined))
    rng = np.random.default_rng(random_state)

    bootstrap_base_corrs = []
    bootstrap_refined_corrs = []

    for _ in range(n_bootsraps):
        # Draw row indices with replacement so the three columns stay paired.
        row_ids = rng.integers(0, n_observations, size=n_observations)
        sample = data_pairs[row_ids]

        boot_base_corr, _ = spearmanr(sample[:, 0], sample[:, 1])
        bootstrap_base_corrs.append(boot_base_corr)

        boot_refined_corr, _ = spearmanr(sample[:, 0], sample[:, 2])
        bootstrap_refined_corrs.append(boot_refined_corr)

    lower_q = 100 * significance / 2
    upper_q = 100 * (1 - significance / 2)

    # A resample with a constant column has an undefined (NaN) correlation;
    # nanpercentile ignores those instead of turning the whole interval into NaN.
    ci_lower_base = np.nanpercentile(bootstrap_base_corrs, lower_q)
    ci_upper_base = np.nanpercentile(bootstrap_base_corrs, upper_q)

    ci_lower_refined = np.nanpercentile(bootstrap_refined_corrs, lower_q)
    ci_upper_refined = np.nanpercentile(bootstrap_refined_corrs, upper_q)

    base_corr = round(base_corr, rounding)
    ci_lower_base = round(ci_lower_base, rounding)
    ci_upper_base = round(ci_upper_base, rounding)
    refined_corr = round(refined_corr, rounding)
    ci_lower_refined = round(ci_lower_refined, rounding)
    ci_upper_refined = round(ci_upper_refined, rounding)

    return base_corr, (ci_lower_base, ci_upper_base), refined_corr, (ci_lower_refined, ci_upper_refined)
    
def evaluate_model_result(model_name, prefix):
    """Print how well a model's baseline and combined scores correlate with human scores.

    Reads the "consistency" column of the annotation, baseline-output and
    combined-output CSVs for `prefix` / `model_name` from `data_dir` and
    reports correlations with intervals computed at significance 0.1.

    NOTE(review): the three columns are compared positionally (`.values`), so
    this assumes all three files list the narratives in the same order.
    """
    def _consistency_column(filename):
        table = pd.read_csv(os.path.join(data_dir, filename), index_col=0)
        return table["consistency"].values

    human_scores = _consistency_column(f"{prefix}_annotations_average.csv")
    base_scores = _consistency_column(f"{prefix}_baseline_output_{model_name}.csv")
    refined_scores = _consistency_column(f"{prefix}_combined_output_{model_name}.csv")

    correlations = calculate_metric_correlations(human_scores, base_scores, refined_scores, significance=0.1)
    base_corr, base_interval, refined_corr, refined_interval = correlations
    base_lower_ci, base_upper_ci = base_interval
    refined_lower_ci, refined_upper_ci = refined_interval

    print(f"Base model correlation: {base_corr} ({base_lower_ci}-{base_upper_ci})")
    print(f"Refined model correlation: {refined_corr} ({refined_lower_ci}-{refined_upper_ci})")
    
    

In [None]:
model_queue = [
    "deepseek-structured",
    "gpt-structured",
    "claude-sonnet-structured"
]
evaluaing_dataset_name = "hanna"

for model_name in model_queue:
    print(f"Evaluating model: {model_name}")
    input_dataset_dir = os.path.join(data_dir, f"{evaluaing_dataset_name}_stories.csv")
    await run_baseline_with_model(model_name, os.path.join(data_dir, input_dataset_dir), evaluaing_dataset_name)
    await run_fol_evaluator_with_model(model_name, os.path.join(data_dir, input_dataset_dir), evaluaing_dataset_name)
    await run_outline_evaluator_with_model(model_name, os.path.join(data_dir, input_dataset_dir), evaluaing_dataset_name)
    await run_character_evaluator_with_model(model_name, os.path.join(data_dir, input_dataset_dir), evaluaing_dataset_name)
    await run_combined_evaluator_with_model(model_name, os.path.join(data_dir, input_dataset_dir), evaluaing_dataset_name)
    evaluate_model_result(model_name, evaluaing_dataset_name)

Evaluating model: gemini-structured
No new narratives to process.
Start processing 0
Start processing 1
Start processing 2
Start processing 3
Start processing 4
Start processing 5
Start processing 6
Start processing 7
Start processing 8
Start processing 9
Start processing 10
Start processing 11
Start processing 12
Start processing 13
Start processing 14
Start processing 15
Start processing 16
Start processing 17
Start processing 18
Start processing 19
Processed section of narrative 1/4 of narrative 11: Unsat formulas = 0
Processed section of narrative 1/2 of narrative 4: Unsat formulas = 0
Processed section of narrative 1/2 of narrative 17: Unsat formulas = 0
Processed section of narrative 1/2 of narrative 6: Unsat formulas = 3
Processed section of narrative 1/5 of narrative 8: Unsat formulas = 0
Processed section of narrative 1/5 of narrative 7: Unsat formulas = 0
Processed section of narrative 1/2 of narrative 9: Unsat formulas = 0
Processed section of narrative 1/4 of narrative 12: 