In [1]:
import os
import ast
import csv
import sys
import random
import pandas as pd

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

sys.path.append('../src') 

from dataset_utils import Static_dataGen, Dynamic_dataGen, Bonus_dataGen, Preprocessing

# Read the OpenAI key from the project-specific variable first, then fall back
# to OPENAI_API_KEY, the name the OpenAI client itself reports looking for.
key = os.environ.get("OPEN_AI_KEY") or os.environ.get("OPENAI_API_KEY")

if not key:
    # Only warn here: the restructuring cells further down do not use the key,
    # so a missing key should not stop them from running.
    print(
        "Warning: neither OPEN_AI_KEY nor OPENAI_API_KEY is set; "
        "the dataset generation cells will fail until one is provided.",
        file=sys.stderr,
    )

# Static Dataset Creation
Aim: To generate a set of query-output pairs using the original set of 9 tools

Method: 
1. A set of 3-4 tools is sampled every iteration for query generation
2. The sampled set of tools is passed to an LLM agent for query generation
3. Each generated query is then passed to a second agent, together with the descriptions of the sampled tools, to generate its completion


In [6]:
# Generate static query/completion pairs via Static_dataGen and save them as CSV.
staticDatagen = Static_dataGen(key)

# Number of query-completion pairs requested from the generator.
no_of_StaticQuery_CompletionPairs2beGen = 10

# Indexed below as a sequence of dicts (one dict per generated pair).
data_dict = staticDatagen.genQuery(no_of_StaticQuery_CompletionPairs2beGen)

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs('../datasets/Generated/raw_data', exist_ok=True)

# Use the keys of the generated records as the header when there are any.
# Fall back to the expected columns so that an empty result produces a
# header-only CSV instead of an IndexError on data_dict[0].
field_names = list(data_dict[0].keys()) if data_dict else ['Query', 'Output']

# newline='' lets the csv module control line endings, so multi-line fields and
# Windows runs do not end up with spurious blank rows. UTF-8 keeps non-ASCII
# model output writable regardless of the platform default encoding.
with open('../datasets/Generated/raw_data/saveStaticdataset.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=field_names)
    csv_writer.writeheader()
    csv_writer.writerows(data_dict)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

# Dynamic Dataset Creation
Aim: To generate a dynamic toolset and combine it with the original toolset to obtain a set of query-output pairs

Method (Dynamic Toolset Creation): 
1. 4 tools are sampled from the original toolset every iteration
2. These tools are then passed to an agent, to generate similar tools

Method (Query-Output Pair Generation): 
1. 10 randomly sampled tools, together with the original 9 tools, are passed to the agent at a time for generating queries. The model is free to use any number of these tools in each query.
2. Another agent then generates the completions for the query list
(The query list is cleaned by code and manual intervention before passing to the second agent, and a similar process is followed for the final CSV creation)

In [None]:
# Generate new (dynamic) tools, then query/completion pairs that use them,
# via Dynamic_dataGen, and save the pairs as CSV.
dynamicDatagen = Dynamic_dataGen(key)

# Number of new tools to add to the toolset.
no_of_newTool2beAdded = 10

# Number of query-completion pairs requested from the generator.
no_of_DynamicQuery_CompletionPairs2beGen = 10

# Created before generation, as in the original cell order.
# NOTE(review): genDynamicTools may write into this directory — confirm.
# exist_ok=True keeps the cell re-runnable.
os.makedirs('../datasets/Generated/raw_data', exist_ok=True)

dynamicDatagen.genDynamicTools(no_of_newTool2beAdded)

data_dict = dynamicDatagen.genDynamicQueryOutputPair(no_of_DynamicQuery_CompletionPairs2beGen)

field_names = ['Added_Tools', 'Query', 'Output']

# newline='' lets the csv module control line endings (no blank rows on Windows,
# multi-line fields preserved). UTF-8 avoids encode errors on non-ASCII text.
with open('../datasets/Generated/raw_data/saveDynamicData.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(data_dict)

# Bonus Dataset Creation
Aim: To generate a set of query-output pairs that involve the use of conditional and iterative operators

Method: A list of 5 such query-output pairs is created manually. These examples, along with a few relevant dynamic tools combined with the original toolset, are fed to the query-generating agent, and the resulting list of queries is then passed to the completion agent. At every step, the model output is cleaned before it is saved and passed on to the next agent.

In [None]:
# Generate bonus query/completion pairs via Bonus_dataGen and save them as CSV.
bonusDatagen = Bonus_dataGen(key)

# Number of query-completion pairs requested from the generator.
no_of_BonusQuery_CompletionPairs2beGen = 10

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs('../datasets/Generated/raw_data', exist_ok=True)

data_dict = bonusDatagen.genBonusQueryOutputPair(no_of_BonusQuery_CompletionPairs2beGen)

field_names = ['Query', 'Output']

# newline='' lets the csv module control line endings (no blank rows on Windows,
# multi-line fields preserved). UTF-8 avoids encode errors on non-ASCII text.
with open('../datasets/Generated/raw_data/saveBonusData.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(data_dict)

## Restructuring Dataset For Different Pipelines

### Dataset formation for P1 Pipeline

Since the P1 pipeline does not require a training set, the following code generates only an evaluation dataset for it. The docstring is created by choosing the tools used in the query along with some random tools from the tools list. Because the data is used for inference and the model has no prior knowledge of the tools, the prompt must include the docstrings of the allowed tools, along with some few-shot examples, to generate good results.

In [10]:
# Load the pre-generated raw datasets that the P1 evaluation sets are built from.
static_df = pd.read_csv("../datasets/Pre-Generated/raw_data/static_dataset.csv")
dynamic_df = pd.read_csv("../datasets/Pre-Generated/raw_data/dynamic_dataset.csv")
bonus_df = pd.read_csv("../datasets/Pre-Generated/raw_data/bonus_dataset.csv", encoding='unicode_escape')

# Take the first column of every row of the bonus toolset file. The context
# manager closes the file (the bare open() was never closed), and blank rows
# are skipped so that row[0] cannot raise IndexError.
with open('../resources/Tool_list/final-bonus-toolset.csv', 'r') as toolset_file:
    bonusTool_list = [row[0] for row in csv.reader(toolset_file) if row]

datasetForm = Preprocessing()

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs('../datasets/Generated/P1_datasets/test', exist_ok=True)

# Static: build one P1 prompt per static row and write the test split.
staticDictP1 = []

for _idx, row in static_df.iterrows():
    query = row['Query']
    output = row['Output']
    # The tool docstring is requested afresh for every row.
    added_tools = datasetForm.p1_static()
    prompt = datasetForm.prompt_p1_static_dynamic(query, added_tools)
    staticDictP1.append({'Query': prompt, 'Output': output})

field_names = ['Query', 'Output']

# The first 10% of rows (in file order) form the static test set.
P1_static_test = staticDictP1[0:round(0.1*len(staticDictP1))]

# newline='' lets the csv module control line endings, so multi-line prompts are
# written intact and Windows runs get no blank rows. UTF-8 keeps non-ASCII
# text writable regardless of the platform default encoding.
with open('../datasets/Generated/P1_datasets/test/StaticP1dataset_test.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(P1_static_test)

# Dynamic: build one P1 prompt per dynamic row and write the test split.
dynamicDictP1 = []

for _idx, row in dynamic_df.iterrows():
    query = row['Query']
    output = row['Output']
    # Added_Tools holds the text of a Python list. The opening [' and closing ']
    # are rewritten to triple quotes before parsing, presumably so that single
    # quotes inside the tool text do not break literal_eval.
    # NOTE(review): with this rewrite everything between the brackets appears to
    # parse as a single string — confirm that is what p1_dynamic expects.
    additional_tools = ast.literal_eval(row['Added_Tools'].replace("['", "['''").replace("']", "''']"))
    added_tools = datasetForm.p1_dynamic(additional_tools)
    prompt = datasetForm.prompt_p1_static_dynamic(query, added_tools)
    dynamicDictP1.append({'Query': prompt, 'Output': output})

field_names = ['Query', 'Output']

# The first 33% of rows (in file order) form the dynamic test set.
P1_dynamic_test = dynamicDictP1[0:round(0.33*len(dynamicDictP1))]

# newline='' lets the csv module control line endings (multi-line prompts kept
# intact, no blank rows on Windows). UTF-8 avoids platform-dependent encoding.
with open('../datasets/Generated/P1_datasets/test/DynamicP1dataset_test.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(P1_dynamic_test)

# Bonus: build one P1 prompt per bonus row and write the test split.
bonusDictP1 = []
for _idx, row in bonus_df.iterrows():
    query = row['Query']
    output = row['Output']
    # The tool docstring is derived from the bonus toolset for every row.
    added_tools = datasetForm.p1_bonus(bonusTool_list)
    prompt = datasetForm.prompt_p1_bonus(query, added_tools)
    bonusDictP1.append({'Query': prompt, 'Output': output})

field_names = ['Query', 'Output']

# The first 10% of rows (in file order) form the bonus test set.
P1_bonus_test = bonusDictP1[0:round(0.1*len(bonusDictP1))]

# newline='' lets the csv module control line endings (multi-line prompts kept
# intact, no blank rows on Windows). UTF-8 is needed because the
# unicode_escape-decoded bonus text may not fit a narrow platform encoding.
with open('../datasets/Generated/P1_datasets/test/BonusP1dataset_test.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(P1_bonus_test)

### Prompt formation for P2 Pipeline

In [11]:
# Load the pre-generated raw datasets that the P2 prompts are built from.
static_df = pd.read_csv("../datasets/Pre-Generated/raw_data/static_dataset.csv")
dynamic_df = pd.read_csv("../datasets/Pre-Generated/raw_data/dynamic_dataset.csv")
bonus_df = pd.read_csv("../datasets/Pre-Generated/raw_data/bonus_dataset.csv", encoding='unicode_escape')

# Take the first column of every row of the bonus toolset file. The context
# manager closes the file (the bare open() was never closed), and blank rows
# are skipped so that row[0] cannot raise IndexError.
with open('../resources/Tool_list/final-bonus-toolset.csv', 'r') as toolset_file:
    bonusTool_list = [row[0] for row in csv.reader(toolset_file) if row]

datasetForm = Preprocessing()

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs('../datasets/Generated/P2_datasets/train_val', exist_ok=True)
os.makedirs('../datasets/Generated/P2_datasets/test', exist_ok=True)

# Static: build one P2 prompt per static row and write the test split.
staticDictP2 = []
for _idx, row in static_df.iterrows():
    query = row['Query']
    output = row['Output']
    # No extra tools are passed for the static rows.
    prompt = datasetForm.prompt_p2_pipeline(query, output)
    staticDictP2.append({'Prompt': prompt, 'Output': output})

field_names = ['Prompt', 'Output']

# The first 10% of rows (in file order) form the static test set.
P2_test_static = staticDictP2[0:round(0.1*len(staticDictP2))]

# newline='' lets the csv module control line endings (multi-line prompts kept
# intact, no blank rows on Windows). UTF-8 avoids platform-dependent encoding.
with open('../datasets/Generated/P2_datasets/test/StaticP2prompt_test.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(P2_test_static)

# Dynamic: build one P2 prompt per dynamic row and write the test split.
dynamicDictP2 = []
for _idx, row in dynamic_df.iterrows():
    query = row['Query']
    output = row['Output']
    # Added_Tools holds the text of a Python list. The opening [' and closing ']
    # are rewritten to triple quotes before parsing, presumably so that single
    # quotes inside the tool text do not break literal_eval.
    # NOTE(review): with this rewrite everything between the brackets appears to
    # parse as a single string — confirm that is what prompt_p2_pipeline expects.
    additional_tools = ast.literal_eval(row['Added_Tools'].replace("['", "['''").replace("']", "''']"))
    prompt = datasetForm.prompt_p2_pipeline(query, output, additional_tools)
    dynamicDictP2.append({'Prompt': prompt, 'Output': output})

field_names = ['Prompt', 'Output']

# The first 33% of rows (in file order) form the dynamic test set.
P2_test_dynamic = dynamicDictP2[0:round(0.33*len(dynamicDictP2))]

# newline='' lets the csv module control line endings (multi-line prompts kept
# intact, no blank rows on Windows). UTF-8 avoids platform-dependent encoding.
with open('../datasets/Generated/P2_datasets/test/DynamicP2prompt_test.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(P2_test_dynamic)

# Bonus: build one P2 prompt per bonus row and write the test split.
bonusDictP2 = []
for _idx, row in bonus_df.iterrows():
    query = row['Query']
    output = row['Output']
    # The full bonus toolset is passed as the additional tools for every row.
    prompt = datasetForm.prompt_p2_pipeline(query, output, bonusTool_list)
    bonusDictP2.append({'Prompt': prompt, 'Output': output})

field_names = ['Prompt', 'Output']

# The first 10% of rows (in file order) form the bonus test set.
P2_test_bonus = bonusDictP2[0:round(0.1*len(bonusDictP2))]

# newline='' lets the csv module control line endings (multi-line prompts kept
# intact, no blank rows on Windows). UTF-8 is needed because the
# unicode_escape-decoded bonus text may not fit a narrow platform encoding.
with open('../datasets/Generated/P2_datasets/test/BonusP2prompt_test.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(P2_test_bonus)

# Pool the rows that were not written to the static and dynamic test files
# (everything after the first 10% and 33% respectively), plus a bonus slice.
# NOTE(review): the bonus slice starts at 0.9 although the bonus test split is
# the first 10%, so only the last 10% of bonus rows are trained on and the
# middle 80% are unused. Confirm this is intended and not a typo for 0.1.
P2_train_val = (
    staticDictP2[round(0.1*len(staticDictP2)):]
    + dynamicDictP2[round(0.33*len(dynamicDictP2)):]
    + bonusDictP2[round(0.9*len(bonusDictP2)):]
)

# A dedicated, seeded generator makes the train/val split reproducible across
# kernel restarts without touching the global random state.
random.Random(42).shuffle(P2_train_val)

# After shuffling, the first 10% is the validation set and the rest is training.
P2_val = P2_train_val[0:round(0.1*len(P2_train_val))]
P2_train = P2_train_val[round(0.1*len(P2_train_val)):]

field_names = ['Prompt', 'Output']

# newline='' lets the csv module control line endings (multi-line prompts kept
# intact, no blank rows on Windows). UTF-8 avoids platform-dependent encoding.
with open('../datasets/Generated/P2_datasets/train_val/P2prompt_train.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(P2_train)

with open('../datasets/Generated/P2_datasets/train_val/P2prompt_val.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(P2_val)

### Prompt formation for P3 Pipeline

In [12]:
# Load the pre-generated raw datasets that the P3 prompts are built from.
static_df = pd.read_csv("../datasets/Pre-Generated/raw_data/static_dataset.csv")
dynamic_df = pd.read_csv("../datasets/Pre-Generated/raw_data/dynamic_dataset.csv")
bonus_df = pd.read_csv("../datasets/Pre-Generated/raw_data/bonus_dataset.csv", encoding='unicode_escape')

# Take the first column of every row of the bonus toolset file. The context
# manager closes the file (the bare open() was never closed), and blank rows
# are skipped so that row[0] cannot raise IndexError.
with open('../resources/Tool_list/final-bonus-toolset.csv', 'r') as toolset_file:
    bonusTool_list = [row[0] for row in csv.reader(toolset_file) if row]

datasetForm = Preprocessing()

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs('../datasets/Generated/P3_datasets/train_val', exist_ok=True)
os.makedirs('../datasets/Generated/P3_datasets/test', exist_ok=True)

# Static: build one P3 prompt per static row and write the test split.
staticDictP3 = []
for _idx, row in static_df.iterrows():
    query = row['Query']
    output = row['Output']
    # No extra tools are passed for the static rows.
    prompt = datasetForm.prompt_p3_pipeline(query, output)
    staticDictP3.append({'Prompt': prompt, 'Output': output})

field_names = ['Prompt', 'Output']

# The first 10% of rows (in file order) form the static test set.
P3_test_static = staticDictP3[0:round(0.1*len(staticDictP3))]

# newline='' lets the csv module control line endings (multi-line prompts kept
# intact, no blank rows on Windows). UTF-8 avoids platform-dependent encoding.
with open('../datasets/Generated/P3_datasets/test/StaticP3prompt_test.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(P3_test_static)

# Dynamic: build one P3 prompt per dynamic row and write the test split.
dynamicDictP3 = []
for _idx, row in dynamic_df.iterrows():
    query = row['Query']
    output = row['Output']
    # Added_Tools holds the text of a Python list. The opening [' and closing ']
    # are rewritten to triple quotes before parsing, presumably so that single
    # quotes inside the tool text do not break literal_eval.
    # NOTE(review): with this rewrite everything between the brackets appears to
    # parse as a single string — confirm that is what prompt_p3_pipeline expects.
    additional_tools = ast.literal_eval(row['Added_Tools'].replace("['", "['''").replace("']", "''']"))
    prompt = datasetForm.prompt_p3_pipeline(query, output, additional_tools)
    dynamicDictP3.append({'Prompt': prompt, 'Output': output})

field_names = ['Prompt', 'Output']

# The first 33% of rows (in file order) form the dynamic test set.
P3_test_dynamic = dynamicDictP3[0:round(0.33*len(dynamicDictP3))]

# newline='' lets the csv module control line endings (multi-line prompts kept
# intact, no blank rows on Windows). UTF-8 avoids platform-dependent encoding.
with open('../datasets/Generated/P3_datasets/test/DynamicP3prompt_test.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(P3_test_dynamic)

# Bonus: build one P3 prompt per bonus row and write the test split.
bonusDictP3 = []
for _idx, row in bonus_df.iterrows():
    query = row['Query']
    output = row['Output']
    # The full bonus toolset is passed as the additional tools for every row.
    prompt = datasetForm.prompt_p3_pipeline(query, output, bonusTool_list)
    bonusDictP3.append({'Prompt': prompt, 'Output': output})

field_names = ['Prompt', 'Output']

# The first 10% of rows (in file order) form the bonus test set.
P3_test_bonus = bonusDictP3[0:round(0.1*len(bonusDictP3))]

# newline='' lets the csv module control line endings (multi-line prompts kept
# intact, no blank rows on Windows). UTF-8 is needed because the
# unicode_escape-decoded bonus text may not fit a narrow platform encoding.
with open('../datasets/Generated/P3_datasets/test/BonusP3prompt_test.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(P3_test_bonus)

# Stage 1 uses the static rows that were not written to the static test file
# (the test split is the first 10%).
P3_train_val_stage_1 = staticDictP3[round(0.1*len(staticDictP3)):]

# Stage 2 mixes the last 25% of the static rows with the bonus rows.
# The first 10% of the bonus rows are already written to BonusP3prompt_test.csv,
# so they are excluded here. Adding the whole of bonusDictP3 would leak the
# bonus test set into stage-2 training.
P3_train_val_stage_2 = (
    staticDictP3[round(0.75*len(staticDictP3)):]
    + bonusDictP3[round(0.1*len(bonusDictP3)):]
)

# A dedicated, seeded generator makes both splits reproducible across kernel
# restarts without touching the global random state.
split_rng = random.Random(42)
split_rng.shuffle(P3_train_val_stage_1)
split_rng.shuffle(P3_train_val_stage_2)

# After shuffling, the first 10% of each pool is validation, the rest training.
P3_val_stage_1 = P3_train_val_stage_1[0:round(0.1*len(P3_train_val_stage_1))]
P3_train_stage_1 = P3_train_val_stage_1[round(0.1*len(P3_train_val_stage_1)):]

P3_val_stage_2 = P3_train_val_stage_2[0:round(0.1*len(P3_train_val_stage_2))]
P3_train_stage_2 = P3_train_val_stage_2[round(0.1*len(P3_train_val_stage_2)):]

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs('../datasets/Generated/P3_datasets/train_val/Stage-1', exist_ok=True)
os.makedirs('../datasets/Generated/P3_datasets/train_val/Stage-2', exist_ok=True)

field_names = ['Prompt', 'Output']

# One (output path, rows) pair per split file. A single loop replaces four
# copy-pasted writer blocks.
P3_split_outputs = [
    ('../datasets/Generated/P3_datasets/train_val/Stage-1/P3prompt_stage_1_train.csv', P3_train_stage_1),
    ('../datasets/Generated/P3_datasets/train_val/Stage-1/P3prompt_stage_1_val.csv', P3_val_stage_1),
    ('../datasets/Generated/P3_datasets/train_val/Stage-2/P3prompt_stage_2_train.csv', P3_train_stage_2),
    ('../datasets/Generated/P3_datasets/train_val/Stage-2/P3prompt_stage_2_val.csv', P3_val_stage_2),
]

for split_path, split_rows in P3_split_outputs:
    # newline='' lets the csv module control line endings (multi-line prompts
    # kept intact, no blank rows on Windows). UTF-8 avoids platform-dependent
    # encoding.
    with open(split_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=field_names)
        writer.writeheader()
        writer.writerows(split_rows)