In [None]:
import os
import pandas as pd
from collections import Counter, defaultdict
import json
import numpy as np
import openai
import seaborn as sns
import torch
from datasets import Dataset, DatasetDict, load_metric, load_from_disk
from sklearn.metrics import classification_report
import time
import itertools
import random
from pathlib import Path

In [None]:
def _find_generated_answer(tokens, newline="\n"):
    """Return the indices of the tokens that form the generated answer.

    The answer is the first line of actual text. A run of newline tokens
    that comes before any other token is kept, so that the returned
    indices stay aligned with the per-token probabilities. Scanning
    stops at the first newline token encountered after some text.

    Parameters
    ----------
    tokens : iterable of str
        Generated tokens, in order.
    newline : str, default "\\n"
        Token value treated as a line break.

    Returns
    -------
    list of int
        Positions in `tokens` belonging to the answer.
    """
    kept_positions = []
    text_started = False
    for position, token in enumerate(tokens):
        is_newline = token == newline
        if is_newline and text_started:
            # First newline after real text: the answer line is over.
            break
        if not is_newline:
            text_started = True
        # Both leading newlines and proper tokens are part of the answer.
        kept_positions.append(position)
    return kept_positions

def few_shot_sample_random(
    ex,
    df,
    n=50,
):
    """
    Pick few-shot demonstrations uniformly at random.

    Parameters
    ----------
    ex : Pandas DataFrame row
        Example the demonstrations are for. It is not consulted here;
        the parameter exists so the signature matches the other samplers.
    df : Pandas DataFrame
        Pool of candidate demonstrations.
    n : int, default 50
        Number of demonstrations wanted.

    Returns
    -------
    Pandas DataFrame
        `n` rows sampled from `df` when `df` has more than `n` rows,
        otherwise `df` itself, unchanged.
    """
    # No filtering is applied: the whole frame is the candidate pool.
    pool = df
    return pool.sample(n) if n < len(pool) else pool

def few_shot_sample_keyword(
    ex,
    df,
    n=50,
):
    """
    Pick few-shot demonstrations that share words with the example.

    Every whitespace-separated token of the example sentence, except the
    articles 'The', 'A', 'a', 'the' and the token '.', is treated as a
    keyword. For each keyword up to 5 rows of `df` whose sentence
    contains it (literal substring match) are sampled. If that yields
    more than `n` distinct sentences, `n` of them are sampled; otherwise
    the selection is topped up with random rows of `df` that have not
    been selected yet.

    Parameters
    ----------
    ex : Pandas DataFrame row
        Single example for which we want to find few-shot samples.
        Must have a 'sentence' entry.
    df : Pandas DataFrame
        Pool of candidate demonstrations. Must have a 'sentence' column.
    n : int, default 50
        Number of few-shot samples to return.

    Returns
    -------
    Pandas DataFrame
        `n` rows of `df` with pairwise distinct sentences.

    Raises
    ------
    ValueError
        If `df` does not hold enough distinct sentences to return `n` rows.
    """
    sentence = ex[['sentence']].iloc[0]

    ignored_tokens = {'The', 'A', 'a', 'the', '.'}
    # dict.fromkeys de-duplicates while keeping sentence order. Iterating a
    # set of strings instead would make the order of the `sample` calls
    # below depend on hash randomization, defeating any RNG seeding.
    keywords = list(
        dict.fromkeys(w for w in sentence.split() if w not in ignored_tokens)
    )

    sentences = df['sentence']
    keyword_dfs = []
    for k in keywords:
        # regex=False: keywords are plain words, not patterns. A token such
        # as "(" or "?" would otherwise be an invalid regular expression.
        contain_values = df[sentences.str.contains(k, regex=False, na=False)]
        if len(contain_values) > 0:
            keyword_dfs.append(
                contain_values.sample(min(len(contain_values), 5))
            )

    if keyword_dfs:
        filtered_df = pd.concat(keyword_dfs).drop_duplicates(subset=['sentence'])
    else:
        # No keyword matched anything (or there were no keywords at all):
        # start from an empty selection with the same columns as `df`.
        filtered_df = df.iloc[:0]

    if n < len(filtered_df):
        return filtered_df.sample(n)

    n_fill = n - len(filtered_df)
    if n_fill == 0:
        return filtered_df

    # Top up only with sentences that are not selected yet, so that the
    # same demonstration never appears twice in one prompt.
    filler_pool = df[~sentences.isin(filtered_df['sentence'])]
    filler_pool = filler_pool.drop_duplicates(subset=['sentence'])
    if n_fill > len(filler_pool):
        raise ValueError(
            f"Cannot draw {n} distinct few-shot samples: only "
            f"{len(filtered_df) + len(filler_pool)} distinct sentences available."
        )
    filler = filler_pool.sample(n_fill)
    if filtered_df.empty:
        return filler
    return pd.concat([filtered_df, filler])

def generate_prompt(ex, few_shots, start_prompt=True, joiner='\n\n'):
    """
    Generates prompt for few-shot learning. With the default `joiner`
    the result looks like:

    Please translate a sentence into a logical form.

    INPUT:<demonstration input>

    OUTPUT:<demonstration output>

    [... one INPUT/OUTPUT pair per row of `few_shots` ...]

    INPUT:<input of `ex`>

    OUTPUT:

    Parameters
    ----------
    ex : Pandas DataFrame row
        Single example to build the prompt for. Its first field is used
        as the input text and its second field as the target text. A
        plain sequence such as a tuple is accepted as well.
    few_shots : Pandas DataFrame
        Demonstrations; the first column is the input, the second column
        the output.
    start_prompt : bool, default True
        Whether or not to include the instruction line at the beginning.
    joiner : str, default '\\n\\n'
        Separator placed between the lines of the prompt.

    Returns
    -------
    tuple of (str, object)
        The few-shot prompt and the target text taken from `ex`.
    """

    def first_two(row):
        # Use positional access explicitly. Indexing a string-labelled
        # Series with an integer (`row[0]`) relies on a deprecated
        # fallback; sequences without `.iloc` are indexed directly.
        positional = getattr(row, 'iloc', row)
        return positional[0], positional[1]

    prompt = []

    if start_prompt:
        prompt.append("Please translate a sentence into a logical form.")

    for _, shot in few_shots.iterrows():
        input_sentence, output_sentence = first_two(shot)
        prompt.append(f"INPUT:{input_sentence}")
        prompt.append(f"OUTPUT:{output_sentence}")

    test_sentence, target_text = first_two(ex)

    prompt.append(f'INPUT:{test_sentence}')
    prompt.append('OUTPUT:')

    # clean out any newlines within sentences
    prompt = [' '.join(line.split('\n')) for line in prompt]
    return joiner.join(prompt), target_text


def generate_prompts(
    sample_df, train_df, nshot,
    start_prompt=True,
    **fs_kwargs
):
    """
    Build one few-shot prompt per row of `sample_df`.

    For every row, `nshot` demonstrations are drawn from `train_df` with
    `few_shot_sample_keyword` (extra keyword arguments in `fs_kwargs` are
    forwarded to it) and handed to `generate_prompt` together with the row.

    Returns the per-row results of `generate_prompt` collected in a list.
    """
    def build_for_row(row):
        demonstrations = few_shot_sample_keyword(
            row, train_df, n=nshot, **fs_kwargs
        )
        return generate_prompt(row, demonstrations, start_prompt=start_prompt)

    per_row_results = sample_df.apply(build_for_row, axis=1)
    return list(per_row_results)

def run_gpt3(prompts, keys_batch=None, engine="text-curie-001", model=None, temperature=0.0, max_tokens=64, **gpt3_kwargs):
    """
    Runs GPT-3 on a list of prompts.

    Parameters
    ----------
    prompts : iterable of str
    keys_batch : sequence or None
        Target answers aligned with `prompts`. When given, entry `i` is
        stored as "target_answer" of result `i`; otherwise None is stored.
    engine : str
        Engine name, used when `model` is None.
        https://beta.openai.com/docs/engines/gpt-3
    model : str or None
        Model id. Takes precedence over `engine` when provided.
    temperature : float, default 0.0
        Sampling temperature passed to the API.
    max_tokens: int
        Limits how many tokens the model is asked to generate.

    For information about values for `gpt3_kwargs`, see

    https://beta.openai.com/docs/api-reference/completions

    Returns
    -------
    list of dicts
        One dict per prompt with the keys "prompt", "generated_text",
        "generated_tokens", "generated_probs", "generated_answer",
        "generated_answer_tokens", "generated_answer_probs" and
        "target_answer".
    """
    OPEN_AI_API_KEY = '<YOUR_KEY_GOES_HERE>'
    # Do not clobber a key that is already configured on the `openai`
    # module. Otherwise prefer the OPENAI_API_KEY environment variable and
    # only fall back to the in-file constant when it is unset.
    if not getattr(openai, 'api_key', None):
        openai.api_key = os.environ.get('OPENAI_API_KEY', OPEN_AI_API_KEY)

    assert (engine is not None) or (model is not None), 'Please provide an engine or a finetuned model id.'

    # go with pretrained model if provided, else use engine
    if model is not None:
        gpt3_kwargs['model'] = model
    else:
        gpt3_kwargs['engine'] = engine

    response = openai.Completion.create(
        prompt=prompts,
        temperature=temperature,
        echo=False,   # This function will not work
        logprobs=1,   # properly if any of these
        n=1,          # are changed!
        max_tokens=max_tokens,
        **gpt3_kwargs)

    # From here, we parse each example to get the values
    # we need:
    data = []
    for prompt_index, (ex, prompt) in enumerate(zip(response["choices"], prompts)):
        tokens = ex["logprobs"]["tokens"]
        logprobs = ex["logprobs"]["token_logprobs"]
        # Log-probabilities -> probabilities, one per generated token.
        probs = list(np.exp(logprobs))
        if "<|endoftext|>" in tokens:
            end_i = tokens.index("<|endoftext|>")
            tokens = tokens[ : end_i]  # This leaves off the "<|endoftext|>"
            probs = probs[ : end_i]    # token -- perhaps dubious.
        # The answer is the first generated line; see `_find_generated_answer`.
        ans_indices = _find_generated_answer(tokens)
        answer_tokens = [tokens[i] for i in ans_indices]
        answer_probs = [probs[i] for i in ans_indices]
        answer = "".join(answer_tokens)
        data.append({
            "prompt": prompt,
            "generated_text": ex["text"],
            "generated_tokens": tokens,
            "generated_probs": probs,
            "generated_answer": answer,
            "generated_answer_tokens": answer_tokens,
            "generated_answer_probs": answer_probs,
            "target_answer": keys_batch[prompt_index] if keys_batch is not None else None})
    return data

def run_gpt3_experiment(
    name,
    train_df,
    dev_df,
    nshot=0,
    engine='curie',
    start_prompt=True,
    temperature=0.0,
    batch_size=20,
    ft_kwargs=None,
    fs_kwargs=None,
    **gpt3_kwargs
):
    """
    Runs GPT-3 on every row of `dev_df`, using `train_df` as the pool from
    which few-shot demonstrations are sampled, and stores the outputs in
    `./openai-results/{name}.json`.

    Parameters
    ----------
    name : string
        Name of the experiment. GPT-3 outputs will be saved to a json file with
        the provided name.
    train_df : Pandas DataFrame
        Data from which the few-shot demonstrations are drawn.
    dev_df : Pandas DataFrame
        Data to evaluate on; one prompt is generated per row.
    nshot : int, default 0
        Number of few-shot demonstrations per prompt.
    engine : string, default 'curie'
        Engine name forwarded to `run_gpt3`.
        https://beta.openai.com/docs/engines/gpt-3
    start_prompt : bool, default True
        Whether prompts should contain an initial prefix.
    temperature : float, default 0.0
        Temperature of GPT-3 model (higher -> more creative)
    batch_size : int, default 20
        Number of prompts sent to the API per request.
    ft_kwargs : dict or None
        Accepted for backward compatibility; not used by this function.
    fs_kwargs : dict or None
        Keyword arguments for few-shot sampling, forwarded through
        `generate_prompts`.
    gpt3_kwargs
        Keyword arguments for the GPT-3 model, `run_gpt3`
        https://beta.openai.com/docs/api-reference/completions

    Returns
    -------
    list of dicts or None
        See `run_gpt3`. None when a results file for `name` already
        exists, in which case nothing is run.
    """
    results_dir = "./openai-results"
    results_path = os.path.join(results_dir, f'{name}.json')

    if os.path.isfile(results_path):
        print("Found existing file. Skipping the experiment ...")
        return None

    # Create the output directory up front: discovering that it is missing
    # only after all API calls are done would throw the results away.
    os.makedirs(results_dir, exist_ok=True)

    fs_kwargs = {} if fs_kwargs is None else fs_kwargs
    model = None

    prompts_with_keys = generate_prompts(
        dev_df, train_df=train_df, nshot=nshot,
        start_prompt=start_prompt,
        **fs_kwargs
    )

    prompts = [p_k[0] for p_k in prompts_with_keys]

    print("NUM OF PROMPTS: ", len(prompts))
    keys = [p_k[1] for p_k in prompts_with_keys]
    output = []
    for b in range(0, len(prompts), batch_size):
        batch = prompts[b:b + batch_size]
        keys_batch = keys[b:b + batch_size]
        output += run_gpt3(
            batch, keys_batch, engine=engine, model=model, temperature=temperature, **gpt3_kwargs
        )
    with open(results_path, 'w') as f:
        json.dump(output, f)

    return output

In [None]:
# Load the COGS splits. The TSV files have no header row, so the column
# names are supplied here.
train_df = pd.read_csv("../cogs/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
# Keep only training rows whose logical form does not contain "LAMBDA".
train_df = train_df[train_df['LF'].str.contains("LAMBDA")==False]
dev_df = pd.read_csv("../cogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
test_df = pd.read_csv("../cogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
gen_df = pd.read_csv("../cogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])
results = []
LIMIT_TEST_N = 1000
seed = 42

engines = [
    "text-davinci-003",
    "text-davinci-002",
    "text-curie-001",
    "text-babbage-001",
    "text-ada-001",
]
test_splits = ['test', 'gen']
for test_split in test_splits:
    for engine in engines:
        # The davinci engines get twice as many demonstrations.
        nshot = 40 if "davinci" in engine else 20

        # Re-seed before every run so each (engine, split) pair sees the
        # same evaluation subset and sampling sequence.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if test_split == 'test':
            eval_df = test_df
        else:
            eval_df = gen_df
        run_name = f"gpt3.{engine}.nshot.{nshot}.split.{test_split}.seed.{seed}"
        print(f"RUNNING: {run_name}")
        # eval
        # Cap the sample size at the split size so a small split does not
        # make `sample` raise.
        run_gpt3_experiment(
            run_name,
            train_df,
            eval_df.sample(min(LIMIT_TEST_N, len(eval_df))),
            nshot=nshot,
            engine=engine,
            start_prompt=True,
            temperature=0.0,
            batch_size=20,
        )
        print(f"ANALYZING: {run_name}")
        output_parent_dir = "openai-results"
        output_filename = f"./{output_parent_dir}/{run_name}.json"
        # Read the stored outputs back (the experiment is skipped when the
        # file already exists), closing the file handle afterwards.
        with open(output_filename) as output_file:
            output = json.load(output_file)
        total_count = len(output)
        correct_count = sum(
            ex["target_answer"] == ex["generated_answer"] for ex in output
        )
        # An empty results file has no defined accuracy.
        exact_match = correct_count/total_count if total_count else float("nan")
        print(f"Exact Match: {exact_match}")
        results += [[engine, test_split, seed, exact_match]]
        

In [None]:
# Load the variable-free splits. The TSV files have no header row, so the
# column names are supplied here.
free_train_df = pd.read_csv("../variable_free/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
# Keep only training rows whose logical form does not contain "LAMBDA".
free_train_df = free_train_df[free_train_df['LF'].str.contains("LAMBDA")==False]
free_dev_df = pd.read_csv("../variable_free/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
free_test_df = pd.read_csv("../variable_free/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
free_gen_df = pd.read_csv("../variable_free/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])
free_results = []
LIMIT_TEST_N = 1000
seed = 42

engines = [
    "text-davinci-003",
    "text-davinci-002",
    "text-curie-001",
    "text-babbage-001",
    "text-ada-001",
]
test_splits = ['free_test', 'free_gen']
for test_split in test_splits:
    for engine in engines:
        # The davinci engines get twice as many demonstrations.
        nshot = 40 if "davinci" in engine else 20
        # Re-seed before every run so each (engine, split) pair sees the
        # same evaluation subset and sampling sequence.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if test_split == 'free_test':
            free_eval_df = free_test_df
        else:
            free_eval_df = free_gen_df
        run_name = f"gpt3.{engine}.nshot.{nshot}.split.{test_split}.seed.{seed}"
        print(f"RUNNING: {run_name}")
        # eval
        # Cap the sample size at the split size so a small split does not
        # make `sample` raise.
        run_gpt3_experiment(
            run_name,
            free_train_df,
            free_eval_df.sample(min(LIMIT_TEST_N, len(free_eval_df))),
            nshot=nshot,
            engine=engine,
            start_prompt=True,
            temperature=0.0,
            batch_size=20,
        )
        print(f"ANALYZING: {run_name}")
        output_parent_dir = "openai-results"
        output_filename = f"./{output_parent_dir}/{run_name}.json"
        # Read the stored outputs back (the experiment is skipped when the
        # file already exists), closing the file handle afterwards.
        with open(output_filename) as output_file:
            output = json.load(output_file)
        total_count = len(output)
        correct_count = sum(
            ex["target_answer"] == ex["generated_answer"] for ex in output
        )
        # An empty results file has no defined accuracy.
        exact_match = correct_count/total_count if total_count else float("nan")
        print(f"Exact Match: {exact_match}")
        free_results += [[engine, test_split, seed, exact_match]]
        