In [1]:
import wandb
import json
import pandas as pd

project = "consistency"
entity = "itl"

api = wandb.Api()
runs = api.runs(f"{entity}/{project}")

dfs = []
# download all tables
for run in runs:
    artifacts = run.logged_artifacts()
    for artifact in artifacts:
        print(artifact.name)
        table_dir = artifact.download()
        table_path = f"{table_dir}/evaluated_statements.table.json"
        with open(table_path) as file:
            json_dict = json.load(file)
        df = pd.DataFrame(json_dict["data"], columns=json_dict["columns"])
        df["model"] = run.name
        dfs.append(df)

df = pd.concat(dfs)

run-yhonjj4s-evaluated_statements:v0


[34m[1mwandb[0m:   1 of 1 files downloaded.  


run-wkdlf03v-evaluated_statements:v0


[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [5]:
df.head()

Unnamed: 0,dataset_idx,generated_idx,statement,generated_with,generated_statement_text,validate_text_both|one,validate_image_both|one,validate_both_both|one,validate_text_true|false,validate_image_true|false,validate_both_true|false,validate_text_yes|no,validate_image_yes|no,validate_both_yes|no,model,validate_text_avg,validate_image_avg,validate_both_avg
0,29,0,Both scenes feature a small statue as the main...,text,1. Both scenes feature a small statue as the m...,1,1,1,1,1,1,1,1,1,bunny-all-validate-prompts,1.0,1.0,1.0
1,29,1,Both statues are placed on a stone surface.,text,1. Both scenes feature a small statue as the m...,1,0,1,1,0,1,1,0,1,bunny-all-validate-prompts,1.0,0.0,0.8
2,29,2,Both statues are surrounded by natural element...,text,1. Both scenes feature a small statue as the m...,1,1,1,0,1,1,1,1,1,bunny-all-validate-prompts,0.666667,1.0,1.0
3,29,3,"Both statues have a worn appearance, with the ...",text,1. Both scenes feature a small statue as the m...,1,1,1,0,0,1,1,1,1,bunny-all-validate-prompts,0.666667,0.666667,1.0
4,29,4,"Both scenes have a sense of age and history, w...",text,1. Both scenes feature a small statue as the m...,1,1,1,1,0,1,1,0,1,bunny-all-validate-prompts,1.0,0.333333,1.0


In [3]:
# validate_image_both|one
def get_validator(col_name: str):
    positive, negative = col_name.split("_")[-1].split("|")

    def parse_validator(x):
        if x.lower().startswith(positive):
            return 1
        elif x.lower().startswith(negative):
            return 0
        else:
            print(x)
            return 0

    return parse_validator


# df.columns
validate_columns = [col for col in df.columns if "validate_" in col]

df.dropna(subset=validate_columns, inplace=True)
for col in validate_columns:
    df[col] = df[col].apply(get_validator(col))

text_validate_columns = [
    col for col in df.columns if "text" in col and "validate" in col
]
image_validate_columns = [
    col for col in df.columns if "image" in col and "validate" in col
]
both_validate_columns = [
    col for col in df.columns if "both" in col and "validate" in col
]

# aggregate
df["validate_text_avg"] = df[text_validate_columns].mean(axis=1)
df["validate_image_avg"] = df[image_validate_columns].mean(axis=1)
df["validate_both_avg"] = df[both_validate_columns].mean(axis=1)

neither, both images do not have a blue background
neither. the color of the wall in scene 1 is not described, while
neither. the sun is not visible in either image
neither. the statement does not apply to either of the images
neither. the statement does not apply to either scene as there is no pedestrian visible
neither. the statement "the car in the parking lot is white" does not
neither. the statement "the butterflies are black with yellow spots" does not apply
neither. the background of scene 1 is not a forest with green foliage,


In [14]:
df[df["generated_idx"] == 1].groupby(["model", "generated_with"]).agg(
    {
        "validate_text_avg": "mean",
        "validate_image_avg": "mean",
        "validate_both_avg": "mean",
    }
)
# df

Unnamed: 0_level_0,Unnamed: 1_level_0,validate_text_avg,validate_image_avg,validate_both_avg
model,generated_with,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bunny-all-validate-prompts,both,0.91,0.833333,0.962
bunny-all-validate-prompts,image,0.686667,0.77,0.868
bunny-all-validate-prompts,text,0.936667,0.746667,0.96
llava-all-validate-prompts,both,0.706667,0.85,0.74
llava-all-validate-prompts,image,0.446667,0.826667,0.586
llava-all-validate-prompts,text,0.76,0.813333,0.778


In [8]:
df.groupby("generated_idx").agg(
    {
        "validate_text_avg": "mean",
        "validate_image_avg": "mean",
        "validate_both_avg": "mean",
    }
)

Unnamed: 0_level_0,validate_text_avg,validate_image_avg,validate_both_avg
generated_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.887222,0.929444,0.920667
1,0.741111,0.806667,0.815667
2,0.682222,0.731667,0.767
3,0.640512,0.726767,0.748581
4,0.67447,0.749721,0.774247
5,0.0,0.666667,0.2
6,0.0,0.333333,0.4
7,0.0,1.0,0.2


In [13]:
df[df["generated_idx"] == 1]

Unnamed: 0,dataset_idx,generated_idx,statement,generated_with,generated_statement_text,validate_text_both|one,validate_image_both|one,validate_both_both|one,validate_text_true|false,validate_image_true|false,validate_both_true|false,validate_text_yes|no,validate_image_yes|no,validate_both_yes|no,model,validate_text_avg,validate_image_avg,validate_both_avg
1,29,1,Both statues are placed on a stone surface.,text,1. Both scenes feature a small statue as the m...,1,0,1,1,0,1,1,0,1,bunny-all-validate-prompts,1.000000,0.000000,0.8
6,29,1,"The statues are placed in natural settings, su...",image,1. Both images feature a statue of a gnome.\n2...,1,1,1,1,1,1,1,1,1,bunny-all-validate-prompts,1.000000,1.000000,1.0
11,29,1,The figures are not natural to the surrounding...,both,1. Both scenes feature a statue or figure plac...,1,1,1,1,1,1,1,1,1,bunny-all-validate-prompts,1.000000,1.000000,1.0
16,9,1,Both scenes have a wall in the background.,text,1. Both scenes have a statue or sculpture as t...,1,1,1,1,1,1,1,1,1,bunny-all-validate-prompts,1.000000,1.000000,1.0
21,9,1,"The statues are placed outdoors, as indicated ...",image,1. Both images feature a statue as the main su...,1,1,1,0,0,1,1,0,1,bunny-all-validate-prompts,0.666667,0.333333,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1477,45,1,The mailboxes are located in a garden or yard ...,image,1. Both images feature a mailbox with a small ...,0,1,1,0,1,1,0,1,1,llava-all-validate-prompts,0.000000,1.000000,0.8
1482,45,1,Both libraries have a wooden door with a glass...,both,1. Both scenes feature a lending library that ...,0,0,0,1,1,1,1,1,1,llava-all-validate-prompts,0.666667,0.666667,0.4
1487,4,1,"Both scenes have water present, either in the ...",text,1. Both scenes feature a bridge or archway.\n2...,1,1,1,1,1,1,1,1,1,llava-all-validate-prompts,1.000000,1.000000,1.0
1492,4,1,Both images show a body of water.,image,1. Both images feature a bridge.\n2. Both imag...,1,1,1,1,1,1,1,1,1,llava-all-validate-prompts,1.000000,1.000000,1.0
