In [9]:
# Sanity check: show how the first 'date' value of the eButterfly CSV renders as MM/DD/YYYY.
# This cell sits above the notebook's import cell, so it imports pandas itself
# in order to run on a fresh kernel.
import pandas as pd

# Only the first row of the 'date' column is needed, so avoid parsing the whole file.
pd.to_datetime(
    pd.read_csv(
        "/data/cher/geollm-bias/data/eButterfly.csv", usecols=["date"], nrows=1
    )["date"].iloc[0]
).strftime('%m/%d/%Y')

'01/06/2025'

# TODO:
- Compute the percentage of responses in which the model says "I don't have enough information..." (e.g., by searching each response for the substring "enough information").

In [3]:
import pandas as pd
import json
import os
import numpy as np
import re

from utils import extract_estimate

In [2]:
def read_json_files(directory):
    """Load every ``.json`` file in ``directory`` into a single DataFrame.

    Each file is parsed with ``pd.read_json`` and transposed before the
    per-file frames are concatenated row-wise. Files are read in sorted
    filename order so the row order of the result does not depend on the
    order in which the filesystem happens to list the directory.

    Parameters
    ----------
    directory : str
        Path of the directory to scan (not recursive). Entries whose name
        does not end in ``.json`` are ignored.

    Returns
    -------
    pandas.DataFrame
        The transposed per-file frames stacked with ``pd.concat``.

    Raises
    ------
    ValueError
        If ``directory`` contains no ``.json`` files.
    """
    json_names = sorted(
        name for name in os.listdir(directory) if name.endswith(".json")
    )
    if not json_names:
        # pd.concat([]) would also raise ValueError, but without naming the directory.
        raise ValueError(f"No .json files found in {directory!r}")

    frames = [pd.read_json(os.path.join(directory, name)).T for name in json_names]
    return pd.concat(frames)

In [5]:
# --- Configuration -----------------------------------------------------------
experiment = 'eButterfly'
project_root = "/data/cher/geollm-bias"
coordinates_csv = f"{project_root}/data/{experiment}.csv"
prompt_types = ["basic_only_coords", "basic", "expert", "incontext", "incontext_expert", "temporal_expert"]
errors_dir = './errors'

# Ground-truth table; only its 'Presence' column is merged in below.
actuals = pd.read_csv(coordinates_csv)

# The per-prompt audit CSVs are written here; create the folder so a fresh
# checkout does not fail on the first to_csv call.
os.makedirs(errors_dir, exist_ok=True)

# --- Load and parse the predictions of every prompt type ---------------------
predictions = []
for i, prompt_type in enumerate(prompt_types):
    # Read the raw LLM outputs for this prompt type
    preds = read_json_files(f'{project_root}/output/{experiment}/preds/llm_params_1/{prompt_type}')

    # Index rows by their 'index' column (kept as a column too) so the
    # per-prompt frames align when concatenated side by side.
    preds = preds.set_index('index', drop=False).sort_index()

    # Extract the estimate from each response; missing responses stay missing
    preds['prediction'] = preds['response'].apply(
        lambda x: extract_estimate(x, prompt_type=prompt_type) if pd.notna(x) else None
    )

    # Save response + parsed prediction side by side for manual checking
    preds.to_csv(f'{errors_dir}/{prompt_type}_check_corrections.csv', index=False, encoding='utf-8')

    # Only the first prompt type keeps the remaining columns; note that the
    # surviving 'response' column therefore belongs to that first prompt type.
    if i > 0:
        preds = preds[['prediction']]

    preds = preds.rename(columns={'prediction': f'{prompt_type}_prediction'})
    predictions.append(preds)

# --- Combine -----------------------------------------------------------------
# One column per prompt type, aligned on the shared index
data = pd.concat(predictions, axis=1)

# Attach the ground truth by index (inner join: rows without a match are dropped)
data = pd.merge(data, actuals[['Presence']], left_index=True, right_index=True)

# Everything except these columns is converted to numeric
# (i.e. the *_prediction columns and 'Presence').
non_numeric_cols = {'species', 'index', 'latitude', 'longitude', 'response'}
numeric_cols = [c for c in data.columns if c not in non_numeric_cols]
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric)

In [6]:
# Number of missing values per column of the combined frame
data.isnull().sum()

species                           0
index                             0
latitude                          0
longitude                         0
response                          0
basic_only_coords_prediction    496
basic_prediction                  7
expert_prediction                17
incontext_prediction             49
incontext_expert_prediction      30
temporal_expert_prediction       14
Presence                          0
dtype: int64

In [10]:
# General dataset metrics: count, mean, std, min, quartiles and max of the
# numeric columns of `data` (the *_prediction columns and 'Presence').
# NOTE(review): the saved output shows maxima above 9.9 for some prediction
# columns; the metrics cell further down divides by 9.9 and clips to 1, so
# such values count as maximal scores — confirm they are not parsing errors.
data.describe()

Unnamed: 0,basic_only_coords_prediction,basic_prediction,expert_prediction,incontext_prediction,incontext_expert_prediction,temporal_expert_prediction,Presence
count,891.0,999.0,989.0,957.0,976.0,992.0,1006.0
mean,9.134854,2.231532,3.627098,5.308777,7.500512,5.741532,0.907555
std,34.571578,1.994978,2.786345,3.563766,3.806633,5.049797,0.289798
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.4,0.0,0.0,0.0,4.5,0.0,1.0
50%,9.9,3.0,4.2,8.1,9.9,7.5,1.0
75%,9.9,4.0,5.0,8.1,10.0,9.9,1.0
max,1000.0,9.9,9.9,9.9,10.0,94.0,1.0


In [None]:
# Classification metrics
import pandas as pd
from sklearn.metrics import (
    average_precision_score, accuracy_score
)
from scipy.stats import spearmanr, kendalltau

# Predictions are divided by this value and clipped to 1 before scoring
# (presumably the top of the scale the prompts ask for — confirm).
MAX_SCORE = 9.9
# A rescaled prediction above this value counts as "present" for accuracy.
DECISION_THRESHOLD = 0.5
# Fixed seed so the class-balanced ("Even") metrics are reproducible.
SAMPLE_SEED = 0


def _score_predictions(frame, col):
    """Return (average precision, accuracy) of ``frame[col]`` against ``frame['Presence']``.

    Accuracy binarizes the scores at ``DECISION_THRESHOLD``.
    """
    ap = average_precision_score(frame["Presence"], frame[col])
    hard_labels = (frame[col] > DECISION_THRESHOLD).astype(int)
    return ap, accuracy_score(frame["Presence"], hard_labels)


# Evaluate every prediction column actually present in `data`, rather than a
# hand-maintained list whose names can drift from the real column names.
prediction_cols = [c for c in data.columns if c.endswith("_prediction")]

# Initialize results list
results = []

for col in prediction_cols:
    # Drop NaN values for fair comparison
    valid_df = data[[col, "Presence"]].dropna()

    if valid_df.empty:
        continue  # Skip if no valid data

    # Rescale to [0, 1]; anything above MAX_SCORE is capped at 1
    valid_df = valid_df.assign(**{col: (valid_df[col] / MAX_SCORE).clip(upper=1)})

    # Metrics on all valid rows
    ap_score, acc_score = _score_predictions(valid_df, col)

    # Same metrics on a class-balanced subset: downsample the larger class so
    # presences and absences are equally represented.
    absence_rows = valid_df[valid_df["Presence"] == 0]
    presence_rows = valid_df[valid_df["Presence"] == 1]
    n_per_class = min(len(absence_rows), len(presence_rows))

    if n_per_class == 0:
        # One class is missing entirely, so a balanced comparison is undefined
        even_ap_score = even_acc_score = float("nan")
    else:
        balanced_df = pd.concat([
            absence_rows.sample(n=n_per_class, random_state=SAMPLE_SEED),
            presence_rows.sample(n=n_per_class, random_state=SAMPLE_SEED),
        ])
        even_ap_score, even_acc_score = _score_predictions(balanced_df, col)

    # TODO: mean average precision score by species

    # TODO: average precision by observational density

    # Store results
    results.append({
        "Prediction Column": col,
        "Average Precision": ap_score,
        "Average Precision (Even)": even_ap_score,
        "Accuracy": acc_score,
        "Accuracy (Even)": even_acc_score,
        # "Mean Average Precision": mean_ap_score,
    })

# Convert to DataFrame
summary_df = pd.DataFrame(results)

# Display results (rich display as the cell's last expression)
summary_df

              Prediction Column  Average Precision  Average Precision (Even)  \
0  basic_only_coords_prediction           0.885534                  0.457730   
1              basic_prediction           0.900612                  0.489834   
2             expert_prediction           0.891430                  0.442275   
3          incontext_prediction           0.900531                  0.502776   
4   incontext_expert_prediction           0.903075                  0.490463   
5    temporal_expert_prediction           0.913888                  0.548036   

   Accuracy  Accuracy (Even)  
0  0.718294         0.466292  
1  0.120120         0.516129  
2  0.274014         0.437500  
3  0.589342         0.472527  
4  0.687500         0.477528  
5  0.580645         0.532609  
