In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from os.path import join as oj
import analyze_utils
import matplotlib.pyplot as plt
import iprompt
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
# Register tqdm with pandas so that `progress_apply` (used when computing BLEU) is available.
tqdm.pandas()
# Column groups used to build the trimmed results frame `d`.
# Columns that identify a run.
keys = [
    'model_cls',
    'task_name',
    'checkpoint',
    'seed',
]
# Extra settings columns (also used to subselect the results).
keys_extra = [
    'n_shots',
    'num_learned_tokens',
]
# Output columns kept for the tables.
keys_out = [
    'reciprocal_rank',
    'prefixes',
    'iprompt_preprefix_str',
]

# Directory holding the cached result pickles (r.pkl / d.pkl).
save_dir = '/home/chansingh/mntv1/iprompt_revision_xmas/'

In [2]:
# Load the cached results table from save_dir.
# The commented-out call below appears to be what (re)generates r.pkl -- TODO confirm.
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
# r = analyze_utils.load_results_and_cache_autoprompt_json(save_dir, save_file='r.pkl', one_row_only=True)
r = pd.read_pickle(oj(save_dir, 'r.pkl'))

In [3]:
# ---- Subselect the runs of interest ----
# Keep 5-shot runs with 6 learned tokens. The original 'suffix' method had a
# reranking step, so it is dropped; the correct variant is the one called "suff".
keep_rows = (
    (r['n_shots'] == 5)
    & (r['num_learned_tokens'] == 6)
    & (r['model_cls'] != 'suffix')
)
r = r[keep_rows]


def ground_truth_description(task_name):
    """Return the ground-truth description registered for `task_name` in iprompt.data.TASKS."""
    return iprompt.data.TASKS[task_name]['description']


def bleu_against_ground_truth(row):
    """BLEU of the row's whitespace-tokenized `prefixes` against its `gt_prompt`.

    Uses 1-gram / 2-gram weights of (0.75, 0.25) and no smoothing function.
    NOTE(review): without smoothing, NLTK warns when there is no 2-gram overlap
    and the score collapses to (approximately) zero -- confirm this is intended.
    """
    reference_tokens = row['gt_prompt'].split()
    hypothesis_tokens = row['prefixes'].split()
    return sentence_bleu([reference_tokens], hypothesis_tokens, weights=(0.75, 0.25))


# ---- Derived columns ----
keys_derived = ['task_collection', 'top_prompt_correctness', 'bleu_top_prompt']
r['task_collection'] = r['task_name'].apply(analyze_utils.task_collection)
# A prompt counts as correct when its reciprocal rank is exactly 1.
r['top_prompt_correctness'] = r['reciprocal_rank'].eq(1)
r['gt_prompt'] = r['task_name'].apply(ground_truth_description)
r['bleu_top_prompt'] = r.progress_apply(bleu_against_ground_truth, axis=1)

# ---- Trim to the columns of interest and round-trip through the on-disk cache ----
d_path = oj(save_dir, 'd.pkl')
d = r[keys + keys_extra + keys_out + keys_derived]
d.to_pickle(d_path)
d = pd.read_pickle(d_path)


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
100%|██████████| 2357/2357 [00:00<00:00, 19098.17it/s]


# Metrics table

In [18]:
# Mean of every numeric column per (method, task collection, checkpoint).
group_cols = ['model_cls', 'task_collection', 'checkpoint']
metrics = d.groupby(by=group_cols).mean(numeric_only=True).reset_index()

# Metrics that form the row blocks of each displayed table.
value_cols = ['reciprocal_rank', 'top_prompt_correctness', 'bleu_top_prompt']

ms = []  # one pivoted table per checkpoint, in order of first appearance
# Show floats with two significant digits ('%.1e' is an alternative format).
with pd.option_context('display.float_format', lambda val: '%.2g' % val):
    for ckpt in metrics.checkpoint.unique():
        ckpt_metrics = metrics[metrics.checkpoint == ckpt]
        m = ckpt_metrics.pivot_table(
            index='model_cls',
            columns='task_collection',
            values=value_cols,
        ).T  # rows: (metric, task collection); columns: model_cls
        ms.append(m)
        display(m)

Unnamed: 0_level_0,model_cls,autoprompt,iprompt,suff
Unnamed: 0_level_1,task_collection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bleu_top_prompt,ANLI,4.100000000000001e-80,0.0084,0.00071
bleu_top_prompt,DD,0.0,0.0018,0.0025
bleu_top_prompt,Induction,2e-79,0.0058,0.00096
bleu_top_prompt,Math,0.0,6e-79,0.0078
reciprocal_rank,ANLI,0.074,0.41,0.06
reciprocal_rank,DD,0.00034,0.0091,0.026
reciprocal_rank,Induction,0.086,0.35,0.035
reciprocal_rank,Math,0.14,0.69,0.075
top_prompt_correctness,ANLI,0.074,0.37,0.011
top_prompt_correctness,DD,0.0,0.0,0.0069


Unnamed: 0_level_0,model_cls,autoprompt,iprompt,suff
Unnamed: 0_level_1,task_collection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bleu_top_prompt,ANLI,3.8000000000000004e-80,0.01,0.00032
bleu_top_prompt,DD,0.0,0.0,0.0024
bleu_top_prompt,Induction,2e-80,0.019,0.0027
bleu_top_prompt,Math,0.0,0.0073,4.8e-79
reciprocal_rank,ANLI,0.0025,0.39,0.085
reciprocal_rank,DD,0.013,0.09,0.084
reciprocal_rank,Induction,0.1,0.13,0.056
reciprocal_rank,Math,0.09,0.2,0.025
top_prompt_correctness,ANLI,0.0,0.34,0.025
top_prompt_correctness,DD,0.0,0.037,0.067


Unnamed: 0_level_0,model_cls,iprompt,suff
Unnamed: 0_level_1,task_collection,Unnamed: 2_level_1,Unnamed: 3_level_1
bleu_top_prompt,ANLI,0.0029,0.0021
bleu_top_prompt,DD,0.056,2.9e-79
bleu_top_prompt,Induction,0.018,9.1e-05
bleu_top_prompt,Math,0.025,0.0046
reciprocal_rank,ANLI,0.26,0.057
reciprocal_rank,DD,0.11,0.0043
reciprocal_rank,Induction,0.17,0.066
reciprocal_rank,Math,0.15,0.029
top_prompt_correctness,ANLI,0.2,0.024
top_prompt_correctness,DD,0.091,0.0


In [21]:
# Print one LaTeX table per checkpoint from the pivoted metric tables in `ms`.

# Row order of the printed tables: metric blocks, each listing these task
# collections (collections not listed here, e.g. DD, are left out by the reindex).
mets_ordered = ['reciprocal_rank',
                'top_prompt_correctness', 'bleu_top_prompt']
collections_ordered = ['Math', 'ANLI', 'Induction']
index = pd.MultiIndex.from_product([mets_ordered, collections_ordered])

# Metric column name -> label printed (rotated) in the first table column.
rename = {
    'reciprocal_rank': 'MRR',
    'top_prompt_correctness': 'Correctness',
    'bleu_top_prompt': 'BLEU',
}

for m in ms:
    # put iprompt column first
    col = m['iprompt']
    m = m.drop(columns='iprompt')
    m.insert(loc=0, column='iprompt', value=col)

    # Clamp tiny values (e.g. the ~1e-80 unsmoothed BLEU scores) to exactly 0.
    m[m < 1e-4] = 0

    # NOTE(review): the two header replacements below are literal matches that
    # assume the three columns iprompt/autoprompt/suff; a table with fewer
    # model columns would keep its raw header lines -- confirm.
    # NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour
    # of DataFrame.map; kept here for compatibility with older pandas.
    m_s = (
        m
        .loc[index]  # reorder
        .applymap(lambda x: "{:0.2g}".format(x))
        .apply(lambda x:
               [f"\\textbf{{{i}}}" if x.name ==
                'iprompt' else i for i in x],
               axis=0)  # bold every entry of the iprompt column
        .style.to_latex(hrules=True)
        .replace(' & task_collection &  &  &  \\\\\n', '')
        .replace(' & model_cls & iprompt & autoprompt & suff',  ' & & iPrompt & AutoPrompt & Suffix')
    )

    # Swap each default \multirow metric label for a rotated, renamed one.
    # Backslashes are escaped explicitly: '\m' and '\p' are invalid escape
    # sequences in normal string literals (SyntaxWarning on Python 3.12+).
    for k, label in rename.items():
        m_s = m_s.replace(
            '\\multirow[c]{3}{*}{' + k + '}',
            '\\parbox[c]{0.3mm}{\\multirow{3}{*}{\\rotatebox[origin = c]{90} {' + label + '}}}'
        )

    print(m_s)


\begin{tabular}{lllll}
\toprule
 & & iPrompt & AutoPrompt & Suffix \\
\midrule
\parbox[c]{0.3mm}{\multirow{3}{*}{\rotatebox[origin = c]{90} {MRR}}} & Math & \textbf{0.69} & 0.14 & 0.075 \\
 & ANLI & \textbf{0.41} & 0.074 & 0.06 \\
 & Induction & \textbf{0.35} & 0.086 & 0.035 \\
\parbox[c]{0.3mm}{\multirow{3}{*}{\rotatebox[origin = c]{90} {Correctness}}} & Math & \textbf{0.6} & 0.13 & 0.034 \\
 & ANLI & \textbf{0.37} & 0.074 & 0.011 \\
 & Induction & \textbf{0.28} & 0.08 & 0.0089 \\
\parbox[c]{0.3mm}{\multirow{3}{*}{\rotatebox[origin = c]{90} {BLEU}}} & Math & \textbf{0} & 0 & 0.0078 \\
 & ANLI & \textbf{0.0084} & 0 & 0.00071 \\
 & Induction & \textbf{0.0058} & 0 & 0.00096 \\
\bottomrule
\end{tabular}

\begin{tabular}{lllll}
\toprule
 & & iPrompt & AutoPrompt & Suffix \\
\midrule
\parbox[c]{0.3mm}{\multirow{3}{*}{\rotatebox[origin = c]{90} {MRR}}} & Math & \textbf{0.2} & 0.09 & 0.025 \\
 & ANLI & \textbf{0.39} & 0.0025 & 0.085 \\
 & Induction & \textbf{0.13} & 0.1 & 0.056 \\
\parbox[c]{

# Prompts table

In [7]:
# For every checkpoint, show one prompt per (task, method) side by side.
for ckpt in d.checkpoint.unique():
    ckpt_rows = d[d.checkpoint == ckpt]
    print(ckpt)
    print(ckpt_rows.shape[0])

    # After the descending sort, first() takes the leading (non-null) value of
    # each column within every (model_cls, task_name) group.
    prompts = (
        ckpt_rows
        .sort_values(by=['task_collection', 'reciprocal_rank'], ascending=False)
        .groupby(by=['model_cls', 'task_name'])
        .first()
    )

    # Wide layout: one row per task, one column per method.
    prompt_table = prompts[['prefixes']].pivot_table(
        index='task_name',
        columns='model_cls',
        values='prefixes',
        aggfunc=lambda strs: ' '.join(strs),
    )

    # Lift the row limit so every task is shown.
    with pd.option_context('display.max_rows', None):
        display(prompt_table)

EleutherAI/gpt-j-6B
1172


model_cls,autoprompt,iprompt,suff
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
active_to_passive,(= 18 the the subst,Choose a pronoun for each sentence,Create a sentence or group of
add_two,>:Returns Adding togetherFont accomplish,Create a function named `sum,n>2 m1
antonyms,čthe bectheBut But,The noun to its opposite (,The code to ascend. You
cause_and_effect,REG Kinect virginity developed mosquit The,What would each sentence be if,write programs that read through an
common_concept,???????? parted configuredthe ????????,Find a noun that includes all,which is a common word used
d3_0,,line contains this string? No,"contains all 6 items, No"
d3_1,,"sentence contains ""yes"" or",string doesn't match any template
d3_10,,"message contains ""no"". No",contains all of these words or
d3_11,,"string contains ""No"" or",was matched; output otherwise No
d3_12,,tweet mentions “yes�,is true or output false if


google/flan-t5-xl
535


model_cls,iprompt,suff
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1
active_to_passive,Answer: The judge encouraged the,Answer: The emperor
add_two,prompt: find the sum of,Answer: b. To
antonyms,What is the opposite of what,Answer: capitalism. Capitalis
cause_and_effect,Answer: Mary's.,Answer: I ate the
common_concept,answer: sewing wrenches,For who are men? Answer
d3_0,,Answer: yes publique à
d3_1,,no if it is
d3_10,Answer: no.||,Yes.; a).
d3_11,,Answer: it is not possible
d3_12,,yes|;e finally and


facebook/galactica-6.7b
650


model_cls,autoprompt,iprompt,suff
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
active_to_passive,,4-way Multiple Choice,Is the context a good response
add_two,ě addthe Hyper� addi,In order to add two or,Given three real-valued inputs
antonyms,meet equilibration stiptertead asymmetry,What is the opposite of each,[T1] Question
cause_and_effect,shaking Dthethethethe,Find clues as to why each,What do you think will happen
common_concept,,Where are all the animals?,What' s the most common
d3_0,Alloy ReeABL vetotitledthe,,"is sarcastic, otherwise ignore"
d3_1,Cosm compositionallyind locom astro bfnm,,sentence describes or is related to
d3_10,onso Semanна NichentiVALID,,says the answer is yes on
d3_11,,,says that the United States president
d3_12,assert unco Nog antich DesignsFOR,,says that someone arrives or de
