### Compute aggregate results over raw completions for random flip generation

#### Process flip data

In [11]:
# Models whose raw random-flip completions are loaded and grouped below.
llm_names = [
    'gpt-4-0613',
    'gpt-4-0314',
    'gpt-3.5-turbo-0613',
    'gpt-3.5-turbo-0301',
    'gpt-3.5-turbo-instruct',
    'text-davinci-003',
    'text-davinci-002',
    'text-davinci-001',
    'text-curie-001',
    'text-babbage-001',
    'text-ada-001',
]

# llm_raw[llm] is the concatenation of the pickled result lists from both output
# directories. Files are opened with `with` so every handle is closed right
# after loading.
# NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
llm_raw = {}
for llm in llm_names:
    # First output directory (dir formerly: 9-2).
    with open(f'out/gen_flips1/gen_flips_{llm}.pk', 'rb') as f:
        runs = pickle.load(f)

    # Second output directory (dir formerly: 09-21-2023_18-15-13). It is not
    # read for gpt-3.5-turbo-instruct -- presumably no file exists there for
    # that model; TODO confirm.
    if llm != 'gpt-3.5-turbo-instruct':
        with open(f'out/gen_flips2/gen_flips_{llm}.pk', 'rb') as f:
            runs = runs + pickle.load(f)

    llm_raw[llm] = runs

# Group the parsed flips as llm_data[llm][p_tails] -> list with one entry per
# raw result, where each entry is the 'flips' field returned by res_to_flips.
llm_data = defaultdict(lambda: defaultdict(list))
for llm, res in llm_raw.items():
    for r in res:
        flips = res_to_flips(r, print_misses=False)['flips']
        llm_data[llm][r['p_tails']].append(flips)

# Convert to plain dicts so later lookups of a missing key raise KeyError
# instead of silently creating an empty entry.
llm_data = {k: dict(v) for k, v in llm_data.items()}

In [14]:
# Disabled: multiprocessing variant of the sequential `get_results` loop in the
# next cell, kept for reference only.
# NOTE(review): as written, `pool.imap(get_results, all_args)` would pass each
# argument tuple to `get_results` as ONE positional argument, whereas the
# sequential cell calls `get_results(llm, p_tails, llm_flips, fit_models, seq_len=50)`.
# Re-enabling this probably needs a small unpacking wrapper -- confirm against
# the signature of `get_results`.
# from multiprocessing import Pool

# all_args = [(llm, p_tails, llm_flips, fit_models, 50) for llm, flips_by_ptails in llm_data.items()
#                                       for p_tails, llm_flips in flips_by_ptails.items()]
# with Pool(15) as pool:
#     llm_fit_res = list(tqdm(
#         pool.imap(get_results, all_args), total=len(all_args)))

In [15]:
# Call get_results once per (llm, p_tails) group and collect the returned
# pandas objects; they are stacked into a single object at the end.
llm_fit_res = []

# Every entry of llm_data except the last two (given the order of llm_names,
# presumably text-babbage-001 and text-ada-001 -- verify).
selected_llms = list(llm_data.items())[:-2]

for llm, flips_by_ptails in tqdm(selected_llms):
    for p_tails, llm_flips in tqdm(flips_by_ptails.items()):
        # Append as we go so partial results survive an interrupted run.
        llm_fit_res.append(
            get_results(llm, p_tails, llm_flips, fit_models, seq_len=50)
        )

llm_fit_res = pd.concat(llm_fit_res)

  0%|                                                                                                                                              | 0/9 [00:00<?, ?it/s]
  0%|                                                                                                                                             | 0/13 [00:00<?, ?it/s][A
  8%|██████████▏                                                                                                                          | 1/13 [00:12<02:27, 12.29s/it][A
 15%|████████████████████▍                                                                                                                | 2/13 [00:25<02:23, 13.09s/it][A
 23%|██████████████████████████████▋                                                                                                      | 3/13 [00:38<02:09, 12.97s/it][A
 31%|████████████████████████████████████████▉                                                                                            

KeyboardInterrupt: 

In [None]:
# Flat list of everything get_sub_results returns, over all (llm, p_tails)
# groups except those of the last two entries of llm_data.
seq_len = 50   # set here but not passed to get_sub_results in this cell
llm_sub_res = []

for llm, flips_by_ptails in tqdm(list(llm_data.items())[:-2]):
    ptails_progress = tqdm(flips_by_ptails.items(), leave=False)
    for p_tails, llm_flips in ptails_progress:
        # A fresh copy of fit_models is handed over on every call. To restrict
        # the models, filter the copy by name, e.g. keep only
        # ['llm', 'Bernoulli', 'MC-a', 'MC-2', 'MC-10', 'Ground Truth', 'window-10']
        # (optionally also 'HMM-5', 'HMM-20').
        # Alternative that was used before:
        #   get_results(llm, p_tails, llm_flips, fit_models, seq_len=50)
        models = dict(fit_models.items())
        llm_sub_res.extend(get_sub_results(llm, p_tails, llm_flips, models))

In [None]:
# Write llm_fit_res as CSV and pickle llm_sub_res next to it.
llm_fit_res.to_csv('out/gen_fit_res.csv')

# Use a context manager so the pickle file is flushed and closed
# deterministically instead of relying on garbage collection of the handle.
with open('out/gen_sub_res.pk', 'wb') as f:
    pickle.dump(llm_sub_res, f)

### Load tree results for formal language generation

In [166]:
# Models whose formal-language tree results are loaded; the commented-out
# entries are skipped.
llm_names = [
    # 'gpt-4-0613',
    # 'gpt-4-0314',
    # 'gpt-3.5-turbo-0613',
    # 'gpt-3.5-turbo-0301',
    'gpt-3.5-turbo-instruct-0914',
    'text-davinci-003',
    'text-davinci-002',
    'text-davinci-001',
    'text-curie-001',
    'text-babbage-001',
    'text-ada-001',
]
out_dir = 'out/gen_formal-lang'     # formerly: 11-17-2023_09-47-41

# llm_tree_raw[llm] is the unpickled content of that model's tree file.
# Each file is opened with `with` so its handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
llm_tree_raw = {}
for llm in llm_names:
    with open(f'{out_dir}/tree_formal_{llm}.pk', 'rb') as f:
        llm_tree_raw[llm] = pickle.load(f)

In [167]:
# Group the raw tree records as
#   llm_trees[llm][concept][x_len][depth] -> list of records
# where `concept` is the record's concept as a tuple (so it can be a dict key)
# and `depth` is the number of comma-separated items in the record's depth
# string.
llm_trees = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list))))
for llm, res in llm_tree_raw.items():
    for r in res:
        # del r['completion']
        concept = tuple(r['concept'])
        for x_len in r['x_len']:
            for depth_str in r['depth']:
                # Drop the final character (a trailing comma in the records
                # inspected below) before counting items.
                # NOTE(review): an empty depth string yields depth 1, the same
                # as a single-item string -- confirm this is intended.
                trimmed = depth_str[:-1] if depth_str else depth_str
                depth = len(trimmed.split(','))
                # `r` is indexed by string keys above, so it can never be a
                # list or tuple here; the record is always appended as one item.
                llm_trees[llm][concept][x_len][depth].append(r)

In [71]:
# Display the grouping fields and the prompt flips of the record currently
# bound to `r`.
(
    r['concept'],
    r['x_len'],
    r['depth'],
    r['prompt_args']['flips'],
)

((1, 0, 1),
 (30,),
 ('Tails, Tails, Tails, Tails, Tails,',),
 'Tails, Heads, Tails, Tails, Heads, Tails, Tails, Heads, Tails, Tails, Heads, Tails, Tails, Heads, Tails, Tails, Heads, Tails, Tails, Heads, Tails, Tails, Heads, Tails, Tails, Heads, Tails, Tails, Heads, Tails, Tails, Tails, Tails, Tails, Tails,')

In [75]:
# Display the grouping key currently bound to the loop variables.
llm, concept, x_len, depth

('text-ada-001', (1, 0, 1), 30, 5)

In [54]:
# Turn the grouped tree results into probabilities, stored as
#   llm_probs[llm][concept][x_len]
# Models whose name starts with 'gpt' go through chat_res_to_probs, because the
# OpenAI chat results had no logprobs available at the time (they do now, at
# least for the moment); all other models go through comp_res_to_probs.
# The author suspects a bug somewhere in this path, and formal-language
# generation with chat models did not make it into the paper.
# NOTE(review): 'gpt-3.5-turbo-instruct-0914' also matches startswith('gpt')
# and is therefore sent to the chat converter -- confirm that is intended.
llm_probs = defaultdict(lambda: defaultdict(dict))
for llm, by_concept in llm_trees.items():
    for concept, by_x_len in by_concept.items():
        for x_len, res in by_x_len.items():
            to_probs = chat_res_to_probs if llm.startswith('gpt') else comp_res_to_probs
            llm_probs[llm][concept][x_len] = to_probs(res)