In [2]:
import json
import random
import re
import sys
from tqdm import tqdm
from time import sleep
from datetime import datetime

import pandas as pd
import openai


sys.path.append("../extra")
from prompt import PROMPT

# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
# Read the seed dataset: one JSON object per line (JSONL).
records = []
# JSON text is UTF-8; pass the encoding explicitly instead of relying on the
# platform default, which differs between operating systems.
with open("../data/rb_augmented.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a stray empty line at the end of the file)
        # rather than failing the whole load on them.
        if line.strip():
            records.append(json.loads(line))

# Build the DataFrame once from the parsed records; `df` is only ever a DataFrame.
df = pd.DataFrame(records)

print(f"Loaded {len(df):,} augmented examples")

Loaded 757 augmented examples


In [3]:
def build_examples(dataset: pd.DataFrame, n: int = 3) -> str:
    """Render ``n`` randomly sampled rows of ``dataset`` as few-shot text.

    Each sampled row contributes a ``Q: <question>`` line followed by an
    ``A: <output>`` line and a blank line; the pieces are concatenated in
    the order the rows were sampled.

    Args:
        dataset: Frame providing the ``question`` and ``output`` columns.
        n: Number of rows to sample (passed straight to ``DataFrame.sample``).

    Returns:
        The concatenated example text (ends with a blank line per example).
    """
    picked = dataset.sample(n)

    # Walk the sampled rows in order and format each one as a Q/A pair.
    return "".join(
        f"Q: {row['question']}\nA: {row['output']}\n\n"
        for _, row in picked.iterrows()
    )


def build_augmentation_prompt(prompt: str, examples: str, n_outputs: int) -> str:
    """Assemble the full augmentation prompt sent to the LLM.

    The result is the base ``prompt``, the few-shot ``examples`` and a closing
    instruction asking for ``n_outputs`` new question/output pairs, separated
    from one another by a blank line.
    """
    instruction = f"Now use some random dimensions and metrics and the examples as a style guide and provide {n_outputs} more examples of questions and outputs"
    return "\n\n".join((prompt, examples, instruction))


def query_openai(
    prompt: str,
    model: str = "gpt-4-0613",
    max_tokens: int = 5000,
    temperature: float = 0.6,
) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one ``user`` message of the request.
        model: Model name forwarded to the API (default keeps the original
            hard-coded value).
        max_tokens: Completion token limit forwarded to the API.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises; nothing is caught here.
    """
    # NOTE(review): `openai.ChatCompletion` is the pre-1.0 SDK interface —
    # confirm the installed openai version still provides it.
    output = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return output.choices[0].message.content


def extract_qa(text: str) -> pd.DataFrame:
    """Parse question/answer pairs out of an LLM response.

    A question is the rest of the line after ``Q: ``; an answer is a fenced
    json code block introduced by ``A: ``. Questions and answers are matched
    in the order they appear in ``text``: an answer is attached to the
    question immediately preceding it. Pairing by position in two separately
    collected lists would shift every later answer onto the wrong question as
    soon as one answer is missing or malformed.

    Args:
        text: Raw response text.

    Returns:
        DataFrame with columns ``question`` and ``answer``. Answers are
        re-wrapped in the json code fence. A question without an answer has a
        null ``answer``; an answer without a preceding question has a null
        ``question``. Empty when nothing matches.
    """
    # One pattern, two alternatives, so a single left-to-right scan yields
    # questions and answers in document order.
    token_pattern = re.compile(
        r"Q: (?P<question>.*?)\n|A: ```json\n(?P<answer>.*?)```",
        re.DOTALL,
    )

    rows = []
    pending_question = None  # question seen but not yet paired with an answer
    for match in token_pattern.finditer(text):
        question = match.group("question")
        if question is not None:
            # A second question before any answer: the earlier one is unanswered.
            if pending_question is not None:
                rows.append((pending_question, None))
            pending_question = question
        else:
            # Re-add the markdown code fence that the capture group stripped.
            rows.append((pending_question, f"```json\n{match.group('answer')}```"))
            pending_question = None

    # A trailing question with no answer still gets a row (null answer).
    if pending_question is not None:
        rows.append((pending_question, None))

    return pd.DataFrame(rows, columns=["question", "answer"])

In [None]:
# Knobs for the augmentation run.
N_ITERATIONS = 500       # number of LLM requests to make
N_SEED_EXAMPLES = 3      # few-shot examples sampled from `df` per request
N_OUTPUTS_PER_CALL = 10  # new Q/A pairs requested from the LLM per request
PAUSE_SECONDS = 5        # pause between requests

augmented_dataframes = []
for _ in tqdm(range(N_ITERATIONS)):
    try:
        # Sample seed examples from the dataframe
        examples = build_examples(df, N_SEED_EXAMPLES)

        # Build the prompt for the LLM
        prompt = build_augmentation_prompt(PROMPT, examples, N_OUTPUTS_PER_CALL)

        # Query the LLM and return the augmented examples in a string
        output = query_openai(prompt)

        # Convert the output string to a dataframe
        df_augmented = extract_qa(output)

        # Append the augmented dataframe to the list
        augmented_dataframes.append(df_augmented)

        print(f"Augmented {len(df_augmented)} examples.")
    except Exception as e:
        # Best effort: report the failed iteration and carry on with the next one.
        print(f"An error occurred during the iteration: {e}")

    # Pause after every iteration, failed ones included, so an error such as
    # a rate limit is not retried immediately in a tight loop.
    sleep(PAUSE_SECONDS)


In [19]:
# Stack every per-request frame into one frame with a fresh 0..N-1 index.
final_df = pd.concat(augmented_dataframes, ignore_index=True)

# Persist as a pickle; the file name carries the row count and a timestamp.
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
fp = f"../data/augmented_{len(final_df)}_{stamp}.pkl"
final_df.to_pickle(fp)

## Load and Convert to JSONL

In [12]:
# Load the pickled augmentation results.
df2 = pd.read_pickle("../data/aug_4838_20230803/augmented.pkl")

# Step 1: remove rows that contain any null value.
len_before = len(df2)
df2 = df2.dropna(how="any")
print(f"Drop {len_before - len(df2)} null rows")

# Step 2: remove exact duplicate rows, keeping the first occurrence.
len_before = len(df2)
df2 = df2.drop_duplicates(keep="first")
print(f"Drop {len_before - len(df2)} duplicate rows")

# Step 3: keep only questions longer than 20 characters (shorter ones are likely invalid).
len_before = len(df2)
df2 = df2.loc[df2["question"].str.len().gt(20)]
print(f"Drop {len_before - len(df2)} too short questions rows")

# Step 4: serialise the surviving rows as JSON Lines and write them to disk.
print(f"Writing JSONL with {len(df2)} rows")
json_str = df2.to_json(orient="records", lines=True)
with open("../data/aug_4838_20230803/augmented.jsonl", "w") as f:
    f.write(json_str)

Drop 875 null rows
Drop 1110 duplicate rows
Drop 547 too short questions rows
Writing JSONL with 2306 rows
