In [None]:
import glob
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Ask the inline plotting backend to emit figures in the 'retina' format.
%config InlineBackend.figure_format = 'retina'

In [None]:
# Name of the domain sub-directory under ../experiments/outputs/.
DOMAIN = "compositional_graphics"

# Experiment IDs of the two runs compared in this notebook; each one names a
# directory under the domain directory. The baseline rows are later labelled
# "Stitch" and the Codex rows "Stitch + Codex".
EXPERIMENT_ID_BASELINE = "logo_stitch_iterative"
EXPERIMENT_ID_CODEX = "logo_codex_stitch_iterative_human_readable"

In [None]:
# Output directory of each experiment. get_log_likelihoods() iterates over the
# entries directly under these paths.
DIR_ITERATIVE_BASELINE = f"../experiments/outputs/{DOMAIN}/{EXPERIMENT_ID_BASELINE}/"
DIR_ITERATIVE_CODEX = f"../experiments/outputs/{DOMAIN}/{EXPERIMENT_ID_CODEX}/"

In [None]:
def get_log_likelihoods(dir_results):
    """Collect per-task test log likelihoods for every run under ``dir_results``.

    Each sub-directory of ``dir_results`` is treated as one run. For each run
    this reads ``config.json`` (for the global batch size),
    ``0/test_likelihoods.json`` and ``0/train/stitch_frontiers.json``.
    Entries of ``dir_results`` that are not directories (e.g. a stray log
    file) are skipped instead of raising.

    Args:
        dir_results: Directory containing one sub-directory per run.

    Returns:
        A ``pandas.DataFrame`` with one row per (run, test task) and the
        columns ``batch_size``, ``task_name``, ``log_likelihood`` (the first
        entry of the task's likelihood list) and ``n_frontiers`` (the number
        of entries under the ``"frontiers"`` key for that run). When no runs
        are found the frame is empty but still carries these columns.

    Raises:
        FileNotFoundError: If a run directory lacks one of the three JSON files.
        KeyError: If one of the JSON files lacks an expected key.
    """
    columns = ["batch_size", "task_name", "log_likelihood", "n_frontiers"]
    rows = []

    # Sorted so that runs are always visited in the same order.
    for run_dir in sorted(glob.glob(os.path.join(dir_results, "*"))):
        if not os.path.isdir(run_dir):
            # Only directories can hold a run; plain files are not results.
            continue

        with open(os.path.join(run_dir, "config.json"), "r") as f:
            config = json.load(f)
        global_batch_size = config["experiment_iterator"]["task_batcher"]["params"][
            "global_batch_size"
        ]

        with open(os.path.join(run_dir, "0", "test_likelihoods.json"), "r") as f:
            likelihoods_data = json.load(f)

        with open(
            os.path.join(run_dir, "0", "train", "stitch_frontiers.json"), "r"
        ) as f:
            stitch_frontiers_data = json.load(f)

        # Constant for the whole run, so compute it once rather than per task.
        n_frontiers = len(stitch_frontiers_data["frontiers"])

        test_likelihoods = likelihoods_data["log_likelihoods_by_task"]["test"]
        for task_name, ll_list in test_likelihoods.items():
            rows.append(
                {
                    "batch_size": global_batch_size,
                    "task_name": task_name,
                    "log_likelihood": ll_list[0],
                    "n_frontiers": n_frontiers,
                }
            )

    # Explicit columns keep the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Load each condition and tag its rows so both frames can be concatenated and
# compared via the "condition" column. `.assign` returns a new frame, so the
# cell is safe to re-run.
df1 = get_log_likelihoods(DIR_ITERATIVE_BASELINE).assign(condition="Stitch")
df2 = get_log_likelihoods(DIR_ITERATIVE_CODEX).assign(condition="Stitch + Codex")

In [None]:
# Stack both conditions into one long frame with a fresh 0..n-1 index.
df = pd.concat([df1, df2], axis=0, ignore_index=True)

# Distinct batch sizes across both conditions, in ascending order.
BATCH_SIZES = sorted(df["batch_size"].unique().tolist())

In [None]:
# Display the sorted list of distinct batch sizes found in `df`.
BATCH_SIZES

In [None]:
# Log likelihood against batch size, one line per condition, with the batch
# sizes drawn on a logarithmic x-axis.
g = sns.lineplot(
    x="batch_size",
    y="log_likelihood",
    hue="condition",
    data=df,
)
g.set_xscale("log");

In [None]:
# Titled so the figure stands alone; the trailing `;` hides the Axes repr.
plt.title("Test log likelihood by batch size")
sns.pointplot(data=df, x="batch_size", y="log_likelihood", hue="condition");

In [None]:
# Titled so the figure stands alone; the trailing `;` hides the Axes repr.
plt.title("Number of frontiers by batch size")
sns.pointplot(data=df, x="batch_size", y="n_frontiers", hue="condition");

In [None]:
# Titled so the figure stands alone; the trailing `;` hides the Axes repr.
plt.title("Test log likelihood by number of frontiers")
sns.pointplot(data=df, x="n_frontiers", y="log_likelihood", hue="condition");

## What programs does Codex generate?

In [None]:
# All Codex query-result files, one per run directory of the Codex experiment.
# glob returns matches in arbitrary order, so sort for a deterministic list
# (the same way get_log_likelihoods walks run directories).
codex_query_results_json_paths = sorted(
    glob.glob(
        os.path.join(
            DIR_ITERATIVE_CODEX,
            f"{EXPERIMENT_ID_CODEX}_*",
            "0",
            "codex_query_results.json",
        )
    )
)

In [None]:
def get_codex_programs(batch_sizes=None, dir_results=None, experiment_id=None):
    """Load the prompt and Codex-generated programs for each batch size.

    For every batch size this reads
    ``<dir_results>/<experiment_id>_<batch_size>/0/codex_query_results.json``
    and builds one frame from its ``prompt_programs``, ``programs_valid`` and
    ``programs_invalid`` lists.

    Args:
        batch_sizes: Iterable of batch sizes to load. Defaults to the
            notebook-level ``BATCH_SIZES``.
        dir_results: Directory holding the per-batch-size run directories.
            Defaults to the notebook-level ``DIR_ITERATIVE_CODEX``.
        experiment_id: Prefix of the run directory names. Defaults to the
            notebook-level ``EXPERIMENT_ID_CODEX``.

    Returns:
        A list with one ``pandas.DataFrame`` per batch size, in the order
        given. Each frame has the columns ``program``, ``origin`` (``"train"``
        for prompt programs, ``"codex"`` otherwise), ``valid``,
        ``program_str_len``, ``batch_size`` and ``copied_from_train`` (True
        for Codex rows whose program also appears among the prompt programs
        of the same file).

    Raises:
        FileNotFoundError: If the results file for a batch size is missing.
        KeyError: If a results file lacks one of the three program lists.
    """
    # Fall back to the notebook globals only when the caller gave no override.
    if batch_sizes is None:
        batch_sizes = BATCH_SIZES
    if dir_results is None:
        dir_results = DIR_ITERATIVE_CODEX
    if experiment_id is None:
        experiment_id = EXPERIMENT_ID_CODEX

    # (key in the JSON file, value for `origin`, value for `valid`)
    program_sources = (
        ("prompt_programs", "train", True),
        ("programs_valid", "codex", True),
        ("programs_invalid", "codex", False),
    )

    df_list = []
    for batch_size in batch_sizes:
        path = os.path.join(
            dir_results,
            f"{experiment_id}_{batch_size}",
            "0",
            "codex_query_results.json",
        )
        with open(path, "r") as f:
            codex_query_results = json.load(f)

        records = [
            {"program": program, "origin": origin, "valid": valid}
            for key, origin, valid in program_sources
            for program in codex_query_results[key]
        ]

        # Explicit columns keep the frame usable even if all lists are empty.
        df = pd.DataFrame(records, columns=["program", "origin", "valid"])
        df["program_str_len"] = df["program"].str.len()
        df["batch_size"] = batch_size

        # Vectorised membership test instead of a row-by-row Python loop.
        train_programs = set(df.loc[df["origin"] == "train", "program"])
        df["copied_from_train"] = (df["origin"] == "codex") & df["program"].isin(
            train_programs
        )

        df_list.append(df)

    return df_list

In [None]:
# One frame per batch size, stacked into a single frame with a fresh index.
df_list = get_codex_programs()
df_codex = pd.concat(df_list, ignore_index=True)

In [None]:
# Display the combined frame of prompt ("train") and Codex programs.
df_codex

In [None]:
# `valid` is boolean, so the plotted point estimate (its mean) is a fraction in
# [0, 1] rather than a percentage.
plt.title("Fraction of valid programs")
sns.pointplot(data=df_codex, x="batch_size", y="valid", hue="origin");

In [None]:
# Distribution of program string lengths per batch size, split by origin.
sns.violinplot(
    x="batch_size",
    y="program_str_len",
    hue="origin",
    data=df_codex,
)
plt.title("Program string length");

In [None]:
# Number of distinct program strings (train and Codex together) per batch size.
plt.title("Count of unique programs")
sns.pointplot(
    data=df_codex.groupby("batch_size")["program"].nunique().reset_index(),
    x="batch_size",
    y="program",
);

In [None]:
# Sum only the boolean `copied_from_train` column (True counts as 1). Summing
# the whole frame would also aggregate the string columns, which is wasteful
# and behaves differently across pandas versions.
plt.title("Count of programs copied from train")
sns.pointplot(
    data=df_codex.groupby("batch_size")["copied_from_train"].sum().reset_index(),
    x="batch_size",
    y="copied_from_train",
);

In [None]:
# Grid of program-length histograms: one row per batch size, one column per
# origin, coloured by validity.
sns.displot(
    x="program_str_len",
    hue="valid",
    row="batch_size",
    col="origin",
    data=df_codex,
);

In [None]:
# First five prompt ("train") programs. This must read `df_codex`: the global
# `df` is the log-likelihood frame and has no `program`/`origin` columns.
df_codex.loc[df_codex["origin"] == "train", "program"].head(5).tolist()

In [None]:
# First five Codex-generated programs. This must read `df_codex`: the global
# `df` is the log-likelihood frame and has no `program`/`origin` columns.
df_codex.loc[df_codex["origin"] == "codex", "program"].head(5).tolist()