In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Efficiency Comparisons

In [None]:
#This cell computes, for every method and every context, the maximum EIG and the fraction of valid
#(score > 0) samples. These per-context rows are the raw material for the first part of Table 1.
import warnings
from contextlib import ContextDecorator
from collections import Counter
from random import Random, sample  # names from the original cell are kept in case other cells rely on them

SEED = 0  # fixes the grammar-baseline subsampling so that re-running the cell reproduces the same rows
contexts = range(1,19)

df_dict = {"model":[],"sample_length":[],"attributes":[],"context":[],"max_eig":[],"valid":[]}

llm = pd.read_csv("llm_results.csv")
llm100 = pd.read_csv("llm100_results.csv")
sampling = pd.read_csv("sampling_results.csv")
hfppl = pd.read_csv("hfppl_results.csv")


def summarize_scores(scores):
  """Return ``(max_score, invalid_fraction)`` for a list of scores.

  A score counts as invalid when it is <= 0.
  NOTE(review): the original cell tested ``== 0`` for three sources and ``<= 0`` for llm100;
  the two only differ if negative scores occur - confirm that is acceptable for the data.
  An empty list yields ``(-0.0, 1)``, i.e. "nothing valid", instead of raising.
  """
  values = [float(score) for score in scores]
  if not values:
    return -0.0, 1
  invalid_fraction = sum(value <= 0 for value in values)/len(values)
  return max(values), invalid_fraction


def add_row(model, sample_length, attributes, context, max_eig, zero):
  """Append one result row to ``df_dict`` and echo it (``zero`` is the invalid fraction)."""
  df_dict["model"].append(model)
  df_dict["sample_length"].append(sample_length)
  df_dict["attributes"].append(attributes)
  df_dict["context"].append(context)
  df_dict["max_eig"].append(max_eig)
  df_dict["valid"].append(1-zero)
  print(model, sample_length, attributes, context, max_eig, zero)


#llm10 and llm100 (sample_length is stored as a string for these rows, as in the original cell)
modes = ["ascii","serial","vision"]
for llm_sample_length, llm_results in (("10", llm), ("100", llm100)):
  for mode in modes:
    llm_by_mode = llm_results[llm_results["mode"] == mode]
    for context in contexts:
      mList = llm_by_mode.loc[llm_by_mode["context"] == context, "eig"].to_list()
      max_eig, zero = summarize_scores(mList)
      add_row("GPT4", llm_sample_length, mode, context, max_eig, zero)

#hfppl
information = ["REGULAR","COMBINED"]
particles = [1,3,5]
for context in contexts:
  hfppl_new = hfppl[hfppl["trial_id"] == context]
  for particle in particles:
    for awareness in information:
      # Combine both conditions into one mask; chaining df[a][b] applies a misaligned mask.
      selected = hfppl_new[(hfppl_new["particle_num"] < particle) & (hfppl_new["model_type"] == awareness)]
      # Always recomputed, so an empty selection records 0 samples rather than a stale count.
      samples = len(selected)
      max_eig, zero = summarize_scores(selected["score"].to_list())
      extra = "(No board)" if awareness == "REGULAR" else ""
      add_row(f'SMC Steering {extra}', samples, f"{awareness}_{particle}", context, max_eig, zero)

#sampling
sampling_lengths = [10,100,1000,10000]
rng = Random(SEED)
for context in contexts:
  board_scores = sampling.loc[sampling["board_id"] == context, "score"].to_list()
  max_samples = len(board_scores)
  for sampling_length in sampling_lengths:
    # random.sample raises ValueError when asked for more items than exist, so cap the draw.
    draw = min(sampling_length, max_samples)
    if draw < sampling_length:
      warnings.warn(f"context {context}: only {max_samples} scores available, requested {sampling_length}")
    sampled = rng.sample(board_scores, draw)
    max_eig, zero = summarize_scores(sampled)
    # The requested length is recorded (not the capped one) so rows can still be selected by it.
    add_row("Grammar", sampling_length, sampling_length, context, max_eig, zero)

df = pd.DataFrame.from_dict(df_dict)
df

In [None]:
#This cell computes the per-method averages and (population) standard deviations to complete Table 1
#This is meant to be copied and pasted into a LaTeX file so the summary dict stores everything as a string, and the plus-or-minus symbols are indicated by underscores
from math import nan, sqrt

df_dict = {"model":[],"samples":[],"avg_valid":[],"avg_eig":[]}


def mean_and_std(values):
  """Return the mean and population standard deviation of ``values`` (each converted to float).

  An empty input yields ``(nan, nan)`` instead of raising ZeroDivisionError.
  """
  numbers = [float(value) for value in values]
  if not numbers:
    return nan, nan
  mean = sum(numbers)/len(numbers)
  std = sqrt(sum((number-mean)**2 for number in numbers)/len(numbers))
  return mean, std


def add_summary(label, samples_text, rows):
  """Summarise the selected rows of ``df`` into one line of ``df_dict`` and echo that line."""
  avgEig, stdDev = mean_and_std(rows["max_eig"].to_list())
  avgValidity, _ = mean_and_std(rows["valid"].to_list())
  eig_text = str(avgEig)+"_"+str(stdDev)

  df_dict["model"].append(label)
  df_dict["samples"].append(samples_text)
  df_dict["avg_valid"].append(str(avgValidity))
  df_dict["avg_eig"].append(eig_text)
  print(" ".join([label,samples_text,str(avgValidity),eig_text]))


# The loop variable is deliberately not called `sample`: that name would shadow random.sample.
# Conditions are combined with `&` into a single mask; chaining df[a][b] applies a misaligned mask.
samples = [10,100,1000,10000]
for n_samples in samples:
  current = df[(df["model"]=="Grammar") & (df["sample_length"]==n_samples)]
  add_summary("sampling", str(n_samples), current)

samples = [10,100]
models = ["ascii","serial","vision"]
for n_samples in samples:
  for model in models:
    # GPT4 rows store their sample length as a string, hence the str() in the comparison.
    current = df[(df["model"]=="GPT4") & (df["sample_length"]==str(n_samples)) & (df["attributes"] == model)]
    add_summary("llm"+"_"+model, str(n_samples), current)

particles = [1,3,5]
models = ["REGULAR","COMBINED"]
for model in models:
  for particle in particles:
    att = f"{model}_{particle}"
    extra = "(No board)" if model == "REGULAR" else ""
    current = df[(df["model"]==f'SMC Steering {extra}') & (df["attributes"] == att)]
    # The number of samples varies per context for SMC, so it is reported as mean_std too.
    avgSamples, samplesSD = mean_and_std(current["sample_length"].to_list())
    add_summary("SMC"+str(particle)+"_"+model, str(avgSamples)+"_"+str(samplesSD), current)

df_dict

In [None]:
#This cell generates Figure 1 from the data.
import seaborn as sns
from math import log, log10  # `log` is kept only in case other cells rely on the name
df.to_csv("plot_data.csv")
df = df.astype({"attributes":str})
df_plot = df[["sample_length","max_eig","model"]].astype({"sample_length":float,"max_eig":float})
# log10 is undefined for non-positive counts; drop such rows instead of aborting the whole plot.
df_plot = df_plot[df_plot["sample_length"] > 0]
# math.log10 is exact for powers of ten, whereas log(x, 10) gives e.g. 2.9999999999999996 for 1000.
df_plot = df_plot.assign(sample_length=[log10(count) for count in df_plot["sample_length"].to_list()])
df_plot = df_plot.rename(columns={"sample_length": "Samples Taken (log₁₀)", "max_eig": "Maximum EIG", "model":"Model"})
ax = sns.scatterplot(data=df_plot, x='Samples Taken (log₁₀)', y='Maximum EIG',hue="Model")
ax.set_ylim(0,5)
ax.set_title("Overall Model Informativeness");

# Single-Step Analysis

In [2]:
# Load the single-step SMC results.
# Note: this rebinds `df`, which the cells above used for the Table 1 / Figure 1 data.
single_step_csv = "hfppl_results_single_step.csv"
df = pd.read_csv(single_step_csv)
df

Unnamed: 0,prefix,completion,translation,score,type,particle,model_type,trial_id
0,Does the blue ship also occupy row 1?,Does the blue ship also occupy row 1?,(== (coloredTiles Blue) (coloredTiles Red)),0.000000,final,0,COMBINED,1
1,Is there a vertical ship at 1A?,Is there a vertical ship at 1A?,(== (orient (color 1-A)) V),0.000000,final,1,COMBINED,1
2,At what point of the purple ship does the red ...,At what point of the purple ship does the red ...,(touch Red Purple),0.756291,final,2,COMBINED,1
3,How many blocks are filled on the right side o...,How many blocks are filled on the right side o...,(++ (map (lambda x0 (if (== (orient x0) V) (ri...,0.000000,final,3,COMBINED,1
4,How many moves will be needed to sink the red ...,How many moves will be needed to sink the red ...,(++ (map (lambda x0 (if (== (orient x0) V) (si...,0.000000,final,4,COMBINED,1
...,...,...,...,...,...,...,...,...
175,\n,\n,(set AllColors),0.000000,final,5,COMBINED,18
176,Is there one ship in the rightmost column?,Is there one ship in the rightmost column?,(== (length (filter (lambda x0 (== (bottomrigh...,0.000000,final,6,COMBINED,18
177,How many squares are occupied by blue or red s...,How many squares are occupied by blue or red s...,(++ (map (lambda x0 (size x0)) (set (set Blue ...,0.000000,final,7,COMBINED,18
178,Can I sink the purple ship with two green shots?,Can I sink the purple ship with two green shots?,(sink Purple Green 2),0.000000,final,8,COMBINED,18


In [12]:
# Highest score reached within each trial_id group
scores_by_trial = df.groupby("trial_id")["score"]
scores_by_trial.max()

trial_id
1     0.756291
2     4.577348
3     4.619552
4     1.406680
5     0.928413
6     4.726875
7     0.938722
8     2.584963
9     1.584549
10    1.120521
11    1.171941
12    1.471223
13    2.029328
14    0.445065
15    0.831474
16    2.141974
17    2.209313
18    0.000000
Name: score, dtype: float64