# Importing Packages

In [None]:
# Importing packages

import pandas as pd
import numpy as np

# Loading Data

In [None]:
# File name of the results extract to evaluate
csv_results = 'poland_extract.csv'

# Read that file from the project's data folder into a DataFrame
df_results = pd.read_csv(
    filepath_or_buffer=f'thesis_modularized/data/{csv_results}',
)
# Preview the first five rows
df_results.head(n=5)

In [None]:
# Remove the 'Unnamed: 0' column (presumably the row index written out by an
# earlier CSV export -- confirm against the file).
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
df_results = df_results.drop(columns='Unnamed: 0', errors='ignore')
df_results.head(5)

In [None]:
# Order the rows by 'ERAIL Occurrence' and then by 'model_type' (both
# ascending, which is the sort_values default) for easier comparison
df_results = df_results.sort_values(by=['ERAIL Occurrence', 'model_type'])
df_results.head(n=5)

# Executing Calculations

In [None]:
def evaluate_variable(df, var_name, llm_col, truth_col, match_col=None, manual_match_col=None, is_time=False):
    """
    Evaluate one predicted variable and return a one-row DataFrame of metrics.

    Metrics (all percentages in the range 0-100):
    - "Abs. Accuracy" / "Manual Abs. Accuracy": share of 'Match' among the
      non-null entries of match_col / manual_match_col. Left out when the
      column is not given, is not in df, or has no non-null entries.
    - "Consistency": share of ('ERAIL Occurrence', 'model_type') groups whose
      llm_col holds exactly one distinct value. A missing value counts as a
      value of its own, so a group mixing an answer with a missing answer is
      inconsistent, while an all-missing group is consistent.
    - "Completeness": share of rows whose llm_col is neither null nor ''.
    - "2min Accuracy" / "5min Accuracy" (only when is_time is True): share of
      rows where llm_col and truth_col, parsed as datetimes, differ by at most
      120 / 300 seconds. Rows where either value cannot be parsed count as
      misses.

    Parameters:
    - df: the full dataframe with all results; must contain the columns
      'ERAIL Occurrence', 'model_type' and llm_col. It is not modified.
    - var_name: str, name of the variable (e.g. "Date", "Time", "AccidentType")
    - llm_col: str, the column with LLM predictions
    - truth_col: str, the column with ground truth (only read when is_time)
    - match_col: str or None, column with automatic match results ("Match"/"Mismatch")
    - manual_match_col: str or None, column with manual match results ("Match"/"Mismatch")
    - is_time: bool, whether the variable is a time column (for 2min/5min accuracy)

    Returns:
    - A one-row DataFrame with "Variable" plus every metric that could be
      computed; metrics that are None/NaN are dropped.
    """

    result = {
        "Variable": var_name,
        "Abs. Accuracy": _match_percentage(df, match_col),
        "2min Accuracy": None,
        "5min Accuracy": None,
        "Manual Abs. Accuracy": _match_percentage(df, manual_match_col),
        "Consistency": None,
        "Completeness": None,
    }

    # Consistency: count distinct answers per occurrence/model pair.
    # dropna=False makes a missing answer count as its own value; with the
    # default (dropna=True) "answer + missing" would look consistent and
    # "all missing" would look inconsistent.
    distinct_per_group = (
        df.groupby(['ERAIL Occurrence', 'model_type'])[llm_col].nunique(dropna=False)
    )
    result["Consistency"] = (distinct_per_group == 1).mean() * 100

    # Completeness: a row is complete when the prediction is neither null nor ''
    answers = df[llm_col]
    result["Completeness"] = (answers.notna() & answers.ne('')).mean() * 100

    # Time-specific accuracy
    if is_time:
        # Parse into new Series so the caller's frame is left untouched;
        # unparseable entries become NaT and fail both thresholds below.
        # NOTE(review): for time-only strings, confirm both columns resolve to
        # the same calendar date after parsing.
        llm_times = pd.to_datetime(df[llm_col], errors='coerce')
        true_times = pd.to_datetime(df[truth_col], errors='coerce')

        gap_seconds = (llm_times - true_times).abs().dt.total_seconds()
        result["2min Accuracy"] = (gap_seconds <= 120).mean() * 100
        result["5min Accuracy"] = (gap_seconds <= 300).mean() * 100

    # Keep "Variable" plus only the metrics that actually hold a value
    result_df = pd.DataFrame([result])
    kept = ['Variable'] + [
        col for col in result_df.columns
        if col != 'Variable' and result_df[col].notna().any()
    ]
    return result_df[kept]


def _match_percentage(df, col):
    """Return the % of 'Match' among non-null entries of col, or None if it cannot be computed."""
    if not col or col not in df.columns:
        return None
    judged = df[col].notna().sum()
    if judged == 0:
        # Nothing was judged: report "no value" instead of dividing 0 by 0
        return None
    return (df[col] == 'Match').sum() / judged * 100

## Date

In [None]:
# Evaluate 'LLM_Date' against 'Date of occurrence', using the automatic
# 'Date_Match' flags for accuracy
df_eval_date = evaluate_variable(
    df_results, 'Date', 'LLM_Date', 'Date of occurrence',
    match_col='Date_Match',
)
df_eval_date

## Time

In [None]:
# Evaluate 'LLM_Time' against 'Time of occurrence'; is_time=True additionally
# requests the 2-minute and 5-minute tolerance accuracies
df_eval_time = evaluate_variable(
    df_results, 'Time', 'LLM_Time', 'Time of occurrence',
    match_col='Time_Match',
    is_time=True,
)
df_eval_time

## Country

In [None]:
# Evaluate 'LLM_Country' against 'Country', using the automatic
# 'Country_Match' flags for accuracy
df_eval_country = evaluate_variable(
    df_results, 'Country', 'LLM_Country', 'Country',
    match_col='Country_Match',
)
df_eval_country

## Accident Type

In [None]:
# Evaluate 'LLM_AccidentType' against 'Occurrence type', with both the
# automatic and the manual match flags
df_eval_accident = evaluate_variable(
    df_results, 'AccidentType', 'LLM_AccidentType', 'Occurrence type',
    match_col='AccidentType_Match',
    manual_match_col='Manual_AccidentType_Match',
)
df_eval_accident

## Regulatory Body

In [None]:
# Evaluate 'LLM_RegulatoryBody' against 'Reporting Body', with both the
# automatic and the manual match flags
df_eval_regulatory = evaluate_variable(
    df_results, 'RegulatoryBody', 'LLM_RegulatoryBody', 'Reporting Body',
    match_col='RegulatoryBody_Match',
    manual_match_col='Manual_RegulatoryBody_Match',
)
df_eval_regulatory

## Contributing Factors

In [None]:
# Evaluate 'LLM_ContributingFactor' against the direct-cause description
# column, with both the automatic and the manual match flags.
# NOTE(review): the automatic column is 'ContributingFactors_Match' (plural)
# while the manual one is 'Manual_ContributingFactor_Match' (singular).
# evaluate_variable silently skips a match column that is not in the frame,
# so confirm both names against the CSV header.
df_eval_contrfact = evaluate_variable(
    df_results,
    'ContributingFactor',
    'LLM_ContributingFactor',
    'Direct cause description (including causal and contributing factors, excluding those of systemic nature)',
    match_col='ContributingFactors_Match',
    manual_match_col='Manual_ContributingFactor_Match',
)
df_eval_contrfact

In [None]:
# Evaluate 'LLM_SystemicFactor' against the underlying/root-cause description
# column, with both the automatic and the manual match flags
df_eval_sysfact = evaluate_variable(
    df_results,
    'SystemicFactor',
    'LLM_SystemicFactor',
    'Underlying and root causes description (i.e. systemic factors, if any)',
    match_col='SystemicFactors_Match',
    manual_match_col='Manual_SystemicFactors_Match',
)
df_eval_sysfact

## Merged

In [None]:
# Stack the seven one-row summaries into a single table with a fresh 0..n-1
# index; a metric that a given variable does not report shows up as NaN
df_eval_all = pd.concat(
    objs=(
        df_eval_date,
        df_eval_time,
        df_eval_country,
        df_eval_accident,
        df_eval_regulatory,
        df_eval_contrfact,
        df_eval_sysfact,
    ),
    ignore_index=True,
)
df_eval_all