# Generate User Queries

Code authored by: Shaw Talebi

### imports

In [1]:
from utils.gen_data import send_openai_request, load_tool_metadata, get_tool_name
import pandas as pd
import os

### initialize query dict

In [2]:
# Column-oriented accumulator: one list per output column, filled in lockstep
# by the generation cells below. A comprehension (not dict.fromkeys) is used so
# that every column gets its own independent list object.
query_dict = {
    column: []
    for column in ("query", "query_type", "tool_needed", "tool_name")
}

### No Tool Queries

In [3]:
# total num queries to generate
N = 200

# query variations: one request is made per (category, typo, difficulty, seed)
# combination, so the product of these list lengths determines how many calls
# are issued
category_list = [
    "General Knowledge",
    "Lifestyle advice",
    "Creative Writing",
    "Definitions",
    "Explanations",
    "Comparison",
    "Step-by-step instructions",
    "Writing feedback",
    "Venting",
    "code generation",
]
seed_list = ["cooking", "fitness", "business", "education", "hobbies"]
difficulty_list = ["beginner level", "intermediate level"]
# the empty string means "no extra guidance"; the second entry asks the model
# to deliberately produce messy, typo-ridden queries
typos_list = ["", "Please include formatting errors and typos."]

# write instructions for generating queries (system prompt)
no_tool_instructions = (
    "You are a query generator. Given an input category and additional guidance, "
    "you create one simple, human-like query that can be easily solved without the use of any tools"
)


# user prompt template
def user_prompt_template(category, seed, difficulty, typo):
    """Build the user message describing one query variation.

    Args:
        category: Query category, e.g. an entry of ``category_list``.
        seed: Topic seed, e.g. an entry of ``seed_list``.
        difficulty: Difficulty label, e.g. an entry of ``difficulty_list``.
        typo: Extra guidance appended on the last line; may be empty.

    Returns:
        The formatted multi-line prompt string.
    """
    return f"Category: {category} \nSeed: {seed} \nDifficulty: {difficulty} \n{typo}"

In [4]:
%%time
from itertools import product

# number of queries to generate per call: N split evenly across every
# (category, typo, difficulty, seed) combination; any remainder is dropped
num_combinations = len(category_list) * len(typos_list) * len(difficulty_list) * len(seed_list)
num_queries = N // num_combinations

# product() walks the combinations in the same order as nested loops over
# category -> typo -> difficulty -> seed
for category, typo, difficulty, seed in product(category_list, typos_list, difficulty_list, seed_list):
    # generate no tool queries
    user_message = user_prompt_template(category, seed, difficulty, typo)
    query_list = send_openai_request(no_tool_instructions, user_message, n=num_queries, temperature=1)

    # Label by the number of queries actually returned (not the number
    # requested) so every column of query_dict stays the same length even if
    # the helper returns fewer or more items than asked for.
    num_returned = len(query_list)

    # extend query_dict in place instead of rebuilding each list every iteration
    query_dict['query'].extend(query_list)
    query_dict['query_type'].extend(['no_tool'] * num_returned)
    query_dict['tool_needed'].extend([False] * num_returned)
    query_dict['tool_name'].extend([None] * num_returned)

CPU times: user 1.01 s, sys: 134 ms, total: 1.14 s
Wall time: 2min 38s


### Tool Queries

"Easy" (in this project) means the query is simple enough for the model to immediately call a tool (but a tool call is needed) <br>
"Hard" (in this project) means the query requires a little thinking and a tool call to resolve

In [5]:
# Load the metadata for every available tool and show the first entry as a
# quick sanity check of its format.
tool_metadata_list = load_tool_metadata()
first_tool_metadata = tool_metadata_list[0]
print(first_tool_metadata)

Name: calculator
Description: Perform basic arithmetic calculations.
Parameters:
  expression: string - Arithmetic expression to evaluate.


In [6]:
%%time
# num queries to generate per tool and per difficulty ("easy" / "hard")
num_queries = 5

# write instructions for generating easy and hard queries (system prompts)
easy_instructions = 'Generate a simple, human-like query that can only be resolved using the tool described by the user.'
hard_instructions = 'Generate a simple, human-like query that can only be resolved using the tool described by the user and a little thinking. Please include formatting errors and typos.'

for tool_metadata in tool_metadata_list:
    # resolve the tool name once per tool rather than once per column update
    tool_name = get_tool_name(tool_metadata)

    # easy queries
    easy_query_list = send_openai_request(easy_instructions, tool_metadata, n=num_queries, temperature=1)

    # hard queries
    hard_query_list = send_openai_request(hard_instructions, tool_metadata, n=num_queries, temperature=1)

    # Append easy queries first, then hard ones. Labels are sized by the
    # number of queries actually returned (not the number requested) so every
    # column of query_dict stays the same length.
    for query_type, queries in (('easy', easy_query_list), ('hard', hard_query_list)):
        num_returned = len(queries)
        query_dict['query'].extend(queries)
        query_dict['query_type'].extend([query_type] * num_returned)
        query_dict['tool_needed'].extend([True] * num_returned)
        query_dict['tool_name'].extend([tool_name] * num_returned)

CPU times: user 427 ms, sys: 44.5 ms, total: 471 ms
Wall time: 1min 19s


In [7]:
# Sanity check: print the length of each column; all four should match.
for column in ('query', 'query_type', 'tool_needed', 'tool_name'):
    column_length = len(query_dict[column])
    print(column_length)

600
600
600
600


### write data to .csv

In [8]:
# Convert query_dict to DataFrame (one row per generated query)
df = pd.DataFrame(query_dict)

# Write to CSV file
csv_path = 'data/queries.csv'
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (no-op when it is already there)
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df.to_csv(csv_path, index=False)