In [1]:
import wandb
import json
import pandas as pd

project = "consistency"
entity = "itl"

api = wandb.Api()
runs = api.runs(f"{entity}/{project}")

dfs = []
# download all tables
for run in runs:
    artifacts = run.logged_artifacts()
    for artifact in artifacts:
        print(artifact.name)
        table_dir = artifact.download()
        table_path = f"{table_dir}/evaluated_statements.table.json"
        with open(table_path) as file:
            json_dict = json.load(file)
        df = pd.DataFrame(json_dict["data"], columns=json_dict["columns"])
        df["model"] = run.name
        dfs.append(df)

df = pd.concat(dfs)

run-yhonjj4s-evaluated_statements:v0


[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [2]:
df.head()

Unnamed: 0,dataset_idx,generated_idx,statement,generated_with,generated_statement_text,validate_text_both|one,validate_image_both|one,validate_both_both|one,validate_text_true|false,validate_image_true|false,validate_both_true|false,validate_text_yes|no,validate_image_yes|no,validate_both_yes|no,model
0,29,0,Both scenes feature a small statue as the main...,text,1. Both scenes feature a small statue as the m...,both,both,both,true,true,True,yes,yes,yes,bunny-all-validate-prompts
1,29,1,Both statues are placed on a stone surface.,text,1. Both scenes feature a small statue as the m...,both,one,both,true,"false. the gnome statue is placed on a rock, w...",True,yes,no,yes,bunny-all-validate-prompts
2,29,2,Both statues are surrounded by natural element...,text,1. Both scenes feature a small statue as the m...,both,both,both,"false. in scene 1, the statue is surrounded by...",true,True,yes,yes,yes,bunny-all-validate-prompts
3,29,3,"Both statues have a worn appearance, with the ...",text,1. Both scenes feature a small statue as the m...,both,both,both,false. the statement is only true for one of t...,false. the statement is true for the ceramic f...,True,yes,yes,yes,bunny-all-validate-prompts
4,29,4,"Both scenes have a sense of age and history, w...",text,1. Both scenes feature a small statue as the m...,both,both,both,true,"false. the statement is true for scene 2, as t...",True,yes,no,yes,bunny-all-validate-prompts


In [3]:
# validate_image_both|one
def get_validator(col_name: str):
    positive, negative = col_name.split("_")[-1].split("|")

    def parse_validator(x):
        if x.lower().startswith(positive):
            return 1
        elif x.lower().startswith(negative):
            return 0
        else:
            print(x)
            return 0

    return parse_validator


# df.columns
validate_columns = [col for col in df.columns if "validate_" in col]

df.dropna(subset=validate_columns, inplace=True)
for col in validate_columns:
    df[col] = df[col].apply(get_validator(col))

text_validate_columns = [
    col for col in df.columns if "text" in col and "validate" in col
]
image_validate_columns = [
    col for col in df.columns if "image" in col and "validate" in col
]
both_validate_columns = [
    col for col in df.columns if "both" in col and "validate" in col
]

# aggregate
df["validate_text_avg"] = df[text_validate_columns].mean(axis=1)
df["validate_image_avg"] = df[image_validate_columns].mean(axis=1)
df["validate_both_avg"] = df[both_validate_columns].mean(axis=1)

neither, both images do not have a blue background
neither. the color of the wall in scene 1 is not described, while
neither. the sun is not visible in either image
neither. the statement does not apply to either of the images
neither. the statement does not apply to either scene as there is no pedestrian visible
neither. the statement "the car in the parking lot is white" does not
neither. the statement "the butterflies are black with yellow spots" does not apply
neither. the background of scene 1 is not a forest with green foliage,


In [4]:
df.groupby(["model", "generated_with"]).agg(
    {
        "validate_text_avg": "mean",
        "validate_image_avg": "mean",
        "validate_both_avg": "mean",
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,validate_text_avg,validate_image_avg,validate_both_avg
model,generated_with,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bunny-all-validate-prompts,both,0.872667,0.767333,0.9424
bunny-all-validate-prompts,image,0.678,0.776667,0.858
bunny-all-validate-prompts,text,0.881764,0.694723,0.924248
