In [1]:
import polars as pl
import os
import random
from pathlib import Path


path = Path('./')
proj_dir = f'{str(path.resolve().parents[1].absolute())}'
outfile = f'{proj_dir}//datasets/flan/IHOP_Flan_CoT.jsonl'

In [2]:
df = pl.DataFrame(
    schema={
        "input": pl.Utf8, 
        "target": pl.Utf8, 
        "response": pl.Utf8,
        "_task_origin": pl.Utf8,
        "_task_type": pl.Utf8,
    }
)


In [3]:
# Cot Templates
COT_PROMPTS = [
    {"prompt": "Let's think step by step below.", "type": "suffix"},
    {"prompt": "Let's think step by step.", "type": "suffix"},
    {"prompt": "Let’s work this out in a step by step way to be sure we have the right answer.", "type": "suffix"},
    {"prompt": "Use reasoning to answer the following question.", "type": "suffix"},
    {"prompt": "Let's be accurate as possible. Think before you answer.", "type": "suffix"},
    {"prompt": "Please answer the following question by reasoning step-by-step.", "type": "suffix"},
    {"prompt": "Step-by-step reasoning:", "type": "suffix"},
    {"prompt": "A step-by-step solution is:", "type": "suffix"},
    {"prompt": "Let's be accurate as possible and think first.", "type": "suffix"},
    {"prompt": "Stream of thoughts:", "type": "suffix"},
    {"prompt": "Step-by-step reasoning process below:", "type": "suffix"},
    {"prompt": "Please answer and provide answer explanation.", "type": "suffix"},
    {"prompt": "Think hard. Detailed solution:", "type": "suffix"},
    {"prompt": "Reason slowly and give your answer.", "type": "suffix"},
    {"prompt": "Answer the following question, with explanation first.", "type": "prefix"},
    {"prompt": "I'll give you a question, please answer with step-by-step reasoning process.", "type": "prefix"},
    {"prompt": "Use reasoning to lead to the answer of the following question:", "type": "prefix"},
    {"prompt": "Please answer the following question by reasoning step-by-step.", "type": "prefix"},
    {"prompt": "Use reasoning to answer the following question.", "type": "prefix"},
]



In [4]:
# answer templates
ANSWER_TEMPLATES = [
    'The answer is',
    'The answer:',
    'So, the answer is',
    'Final answer:',
    'Thus, the answer is',
    'So, the final answer is',
    'The final answer:',
    'Therefore, the answer is',
]

In [5]:
cot_types = {
    "gsm8k": "cot",
    "strategyqa": "cot",
    "creak": "cot",
    "qasc": "cot",
    "esnli": "cot",
    "ecqa": "cot",
    "sensemaking": "cot",
    "aqua": "stream",
    "qed": "stream",
}

In [6]:
dir = f'{proj_dir}/datasets/flan/cot_data/'
for file in os.listdir(dir):
    ds_name = file.split('_')[0]
    cot_type = cot_types[ds_name]
    named = f"flan_{cot_type}_{ds_name}"
    ldf = pl.read_csv(
        f'{dir}{file}', 
        has_header=False, 
        separator='\t',
        new_columns=['input', 'target', 'response']
    ).with_columns(
        pl.lit(named).alias('_task_origin'),
        pl.lit('CoT').alias('_task_type'),
        pl.col("target").cast(pl.Utf8)
    )
    df = df.vstack(ldf)


In [7]:
def format_input(input):
    rand_idx = random.randint(0, len(COT_PROMPTS)-1)
    prompt = COT_PROMPTS[rand_idx]
    if prompt['type'] == "prefix":
        input = f"{prompt['prompt']}\n{input}\n"
    elif prompt['type'] == "suffix":
        input = f"{input}\n{prompt['prompt']}"
    # ToDo: handle formatting options
    return input

def format_response(x):
    rand_answer = random.choice(ANSWER_TEMPLATES)
    response = ''
    for rx in x['response'].split('.')[:-1]:
        response += f"{rx.strip()}.\n\n"
    response += f"{rand_answer} {x['target']}"
    return response

df = df.drop_nulls().with_columns(
    pl.col('input').apply(format_input),
    pl.struct(target='target', response='response').apply(format_response).alias('response')
)

In [8]:
df[0,2]

'Reproduction is the process by which living things give rise to offspring.\n\nSex equals reproduction.\n\nSex is the process by which living things give rise to offspring.\n\nSo, the final answer is (D)'

In [9]:
with open(outfile, mode="ab") as f:
   df.write_ndjson(f)