In [1]:
import os
import ast
import csv
import pandas as pd
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())
from utils import Static_dataGen, Dynamic_dataGen, Bonus_dataGen, Preprocessing

# OpenAI API key loaded from the environment (populated by load_dotenv above); None when the variable is unset.
key = os.getenv("OPEN_AI_KEY")

# Static Dataset Creation
Aim: To generate a set of query-output pairs using the original set of 9 tools

Method: Sampling of 3-4 tools for query generation in multiple iterations, followed by passing the complete queries and tool list for output generation in the completion agent. Data cleaning by code and manual intervention for final dataset creation.

Previous Method: Similar to the method above, except that the queries and their outputs were generated together by the same agent

In [18]:
# Generate static query/output pairs and persist them as a CSV file.
staticDatagen = Static_dataGen(key)

no_of_StaticQuery_CompletionPairs2beGen = 10

# NOTE(review): genQuery is used below as a list of dict rows — confirm against utils.
data_dict = staticDatagen.genQuery(no_of_StaticQuery_CompletionPairs2beGen)

# Default header; only used when no rows were generated.
field_names = ['Query', 'Output']

# Take the header from the first generated row (as before), but fall back to
# the default header instead of raising IndexError on an empty result.
csv_columns = list(data_dict[0].keys()) if data_dict else field_names

# newline='' lets the csv module control line endings (no blank rows on Windows,
# embedded newlines in quoted fields stay intact); utf-8 keeps the file readable
# by pandas regardless of the platform's default encoding.
with open('./datasets/Generated/saveStaticdataset.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
    csv_writer.writeheader()
    csv_writer.writerows(data_dict)

# Dynamic Dataset Creation
Aim: To generate a dynamic toolset and combine it with the original toolset to obtain a set of query-output pairs

Method (Dynamic Toolset Creation): Sampling of 4 tools from the original toolset in multiple iterations, and generating similar tools. Experimenting with prompt and temperature to modify tools.

Method (Query-Output Pair Generation): 10 random tools are passed along with the original 9 at a time, and the model is free to select any number of these tools for query generation. The resulting query list is then passed to the completion agent, which generates the relevant outputs. The query list is cleaned by code and manual intervention before being passed to the second agent, and a similar process is followed for the final CSV creation.

In [19]:
# Generate a dynamic toolset, then query/output pairs that use it, and save them as CSV.
dynamicDatagen = Dynamic_dataGen(key)

no_of_newTool2beAdded = 10

no_of_DynamicQuery_CompletionPairs2beGen = 10

# The return value is discarded; presumably the generated tools are kept on the
# generator object for the next call — TODO confirm against utils.
dynamicDatagen.genDynamicTools(no_of_newTool2beAdded)

data_dict = dynamicDatagen.genDynamicQueryOutputPair(no_of_DynamicQuery_CompletionPairs2beGen)

field_names = ['Added_Tools', 'Query', 'Output']

# newline='' lets the csv module control line endings (no blank rows on Windows,
# embedded newlines in quoted fields stay intact); utf-8 keeps the file readable
# by pandas regardless of the platform's default encoding.
with open('./datasets/Generated/saveDynamicData.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(data_dict)

# Bonus Dataset Creation
Aim: To generate a set of query-output pairs that involve the use of conditional and iterative operators

Method: Manually creating a list of 5 such query-output pairs, feeding these examples, along with a few relevant dynamic tools combined with the original toolset, to the query-generating agent, and finally passing the resulting list of queries to the completion agent. After every model output, the data is cleaned before it is saved and passed on to the subsequent agents.

In [2]:
# Generate bonus query/output pairs and save them as CSV.
bonusDatagen = Bonus_dataGen(key)

no_of_BonusQuery_CompletionPairs2beGen = 10

data_dict = bonusDatagen.genBonusQueryOutputPair(no_of_BonusQuery_CompletionPairs2beGen)

field_names = ['Query', 'Output']

# newline='' lets the csv module control line endings (no blank rows on Windows,
# embedded newlines in quoted fields stay intact); utf-8 keeps the file readable
# by pandas regardless of the platform's default encoding.
with open('./datasets/Generated/saveBonusData.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(data_dict)

# Static Dataset Reading

In [3]:
# Prompt builder used by the P2/P3 prompt-formation cells below.
promptForm = Preprocessing()

# Load the curated static dataset; the file carries a saved index column,
# which shows up as 'Unnamed: 0' in the preview.
static_df = pd.read_csv("./datasets/originals/static_dataset.csv")

static_df.head()

Unnamed: 0,Query,Output
0,Summarize issues similar to don:core:dvrv-us-1...,"var_1 = get_similar_work_items(work_id=""don:co..."
1,Summarize high severity tickets from the custo...,"var_1 = search_object_by_name(query=""UltimateC..."
2,What are my all issues in the triage stage und...,var_1 = who_am_i()\nvar_2 = works_list(stage.n...
3,List all high severity tickets coming in from ...,"var_1 = search_object_by_name(query=""Cust123"")..."
4,"Given a customer meeting transcript ""T"", creat...",var_1 = create_actionable_tasks_from_text(text...


### Prompt formation for P2 Pipeline


In [4]:
# Build one P2-pipeline prompt per static (query, output) pair and save them
# as a single-column CSV.
staticDictP2 = [
    {'Prompt': promptForm.prompt_p2_pipeline(row['Query'], row['Output'])}
    for _, row in static_df.iterrows()
]

field_names = ['Prompt']

# newline='' is required by the csv module so that newlines embedded in the
# prompts survive the round trip; utf-8 matches pandas' default when this file
# is read back for the train/validation/test split.
with open('./promptForm/StaticP2prompt.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(staticDictP2)

### Prompt formation for P3 Pipeline

In [5]:
# Build one P3-pipeline prompt per static (query, output) pair and save them
# as a single-column CSV.
staticDictP3 = [
    {'Prompt': promptForm.prompt_p3_pipeline(row['Query'], row['Output'])}
    for _, row in static_df.iterrows()
]

field_names = ['Prompt']

# newline='' is required by the csv module so that newlines embedded in the
# prompts survive the round trip; utf-8 matches pandas' default when this file
# is read back for the train/validation/test split.
with open('./promptForm/StaticP3prompt.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(staticDictP3)

# Dynamic Dataset Reading

In [6]:
# Prompt builder used by the P2/P3 prompt-formation cells below.
promptForm = Preprocessing()

# Load the curated dynamic dataset; 'Added_Tools' holds a stringified list of
# tool definitions, and the saved index column shows up as 'Unnamed: 0'.
dynamic_df = pd.read_csv("./datasets/originals/dynamic_dataset.csv")

dynamic_df.head()

Unnamed: 0,Added_Tools,Query,Output
0,"['\n def list_user_tasks(user_id, completed=Fa...","For user ""Tina"", list her tasks, filter work ...","var_1 = list_user_tasks(user_id=""Tina"")\nvar_2..."
1,"['\n \n def list_user_tasks(user_id, completed...","Find the current sprint ID, list all tasks ass...",var_1 = get_sprint_id()\nvar_2 = search_object...
2,"['\n \n def list_user_tasks(user_id, completed...","Search for the user ID of ""ManagerMike,"" list ...","var_1 = search_object_by_name(query=""ManagerMi..."
3,"['\n \n def list_user_tasks(user_id, completed...","Use the ID of the current user, list all tasks...",var_1 = who_am_i()\nvar_2 = list_user_tasks(us...
4,"['\n \n def list_user_tasks(user_id, completed...","Retrieve the ID of the current sprint, list al...",var_1 = get_sprint_id()\nvar_2 = search_object...


### Prompt formation for P2 Pipeline

In [7]:
# Build one P2-pipeline prompt per dynamic (query, output, added tools) row
# and save them as a single-column CSV.
dynamicDictP2 = []
for _, row in dynamic_df.iterrows():
    query = row['Query']
    output = row['Output']
    # 'Added_Tools' is a stringified Python list. The outer quotes are widened to
    # triple quotes before parsing, presumably so quote characters inside the
    # tool source do not terminate the literal early.
    # NOTE(review): this rewrites every "['" / "']" occurrence, so it assumes those
    # sequences appear only at the list boundaries — confirm against the data.
    additional_tools = ast.literal_eval(row['Added_Tools'].replace("['", "['''").replace("']", "''']"))
    prompt = promptForm.prompt_p2_pipeline(query, output, additional_tools)
    dynamicDictP2.append({'Prompt': prompt})

field_names = ['Prompt']

# newline='' is required by the csv module so that newlines embedded in the
# prompts survive the round trip; utf-8 keeps the file readable by pandas
# regardless of the platform's default encoding.
with open('./promptForm/DynamicP2prompt.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(dynamicDictP2)

### Prompt formation for P3 Pipeline

In [8]:
# Build one P3-pipeline prompt per dynamic (query, output, added tools) row
# and save them as a single-column CSV.
dynamicDictP3 = []
for _, row in dynamic_df.iterrows():
    query = row['Query']
    output = row['Output']
    # 'Added_Tools' is a stringified Python list. The outer quotes are widened to
    # triple quotes before parsing, presumably so quote characters inside the
    # tool source do not terminate the literal early.
    # NOTE(review): this rewrites every "['" / "']" occurrence, so it assumes those
    # sequences appear only at the list boundaries — confirm against the data.
    additional_tools = ast.literal_eval(row['Added_Tools'].replace("['", "['''").replace("']", "''']"))
    prompt = promptForm.prompt_p3_pipeline(query, output, additional_tools)
    dynamicDictP3.append({'Prompt': prompt})

field_names = ['Prompt']

# newline='' is required by the csv module so that newlines embedded in the
# prompts survive the round trip; utf-8 keeps the file readable by pandas
# regardless of the platform's default encoding.
with open('./promptForm/DynamicP3prompt.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(dynamicDictP3)

# Bonus Dataset Reading

In [9]:
# Prompt builder used by the P2/P3 prompt-formation cells below.
promptForm = Preprocessing()

# Load the curated bonus dataset; the file carries a saved index column,
# which shows up as 'Unnamed: 0' in the preview.
bonus_df = pd.read_csv("./datasets/originals/bonus_dataset.csv")

bonus_df.head()

Unnamed: 0,Query,Output
0,"Create tasks from the text ""WeeklyUpdate"" and ...",var_1 = who_am_i()\nvar_2 = create_actionable_...
1,"Retrieve work items with type ""task"" and sever...","for loop_var in range(0,10):\n temp_1 = wor..."
2,"Find work items with priority ""p1"" and type ""i...","var_1 = works_list(issue.priority=[""p1""], type..."
3,"Extract tasks from the text ""ReleaseNotes"", pr...",var_1 = create_actionable_tasks_from_text(text...
4,"Fetch tasks for user ""USER-999"", prioritize th...","for loop_var in range(0,2):\n temp_1 = fetc..."


### Prompt formation for P2 Pipeline

In [10]:
# Build one P2-pipeline prompt per bonus (query, output) pair and save them
# as a single-column CSV.
bonusDictP2 = [
    {'Prompt': promptForm.prompt_p2_pipeline(row['Query'], row['Output'])}
    for _, row in bonus_df.iterrows()
]

field_names = ['Prompt']

# newline='' is required by the csv module so that newlines embedded in the
# prompts survive the round trip; utf-8 keeps the file readable by pandas
# regardless of the platform's default encoding.
with open('./promptForm/BonusP2prompt.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(bonusDictP2)

### Prompt formation for P3 Pipeline

In [11]:
# Build one P3-pipeline prompt per bonus (query, output) pair and save them
# as a single-column CSV.
bonusDictP3 = [
    {'Prompt': promptForm.prompt_p3_pipeline(row['Query'], row['Output'])}
    for _, row in bonus_df.iterrows()
]

field_names = ['Prompt']

# newline='' is required by the csv module so that newlines embedded in the
# prompts survive the round trip; utf-8 keeps the file readable by pandas
# regardless of the platform's default encoding.
with open('./promptForm/BonusP3prompt.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(bonusDictP3)

# Train Validation Test For Static Data

### P2 Pipeline

In [21]:
# Split the static P2 prompts into contiguous train / validation / test partitions.
df = pd.read_csv("./promptForm/StaticP2prompt.csv")

# Positional row boundaries: [0, 1700) train, [1700, 1900) validation, rest test.
train_end, validation_end = 1700, 1900

train_df = df.iloc[:train_end]
validation_df = df.iloc[train_end:validation_end]
test_df = df.iloc[validation_end:]

# Write each partition without the index so the files hold only the prompt column.
for split_path, split_df in (
    ("./finetuning_P2dataset/train.csv", train_df),
    ("./finetuning_P2dataset/validation.csv", validation_df),
    ("./finetuning_P2dataset/test.csv", test_df),
):
    split_df.to_csv(split_path, index=False)

### P3 Pipeline

In [None]:
# Split the static P3 prompts into contiguous train / validation / test partitions.
df = pd.read_csv("./promptForm/StaticP3prompt.csv")

# Positional row boundaries: [0, 1700) train, [1700, 1900) validation, rest test.
train_end, validation_end = 1700, 1900

train_df = df.iloc[:train_end]
validation_df = df.iloc[train_end:validation_end]
test_df = df.iloc[validation_end:]

# Write each partition without the index so the files hold only the prompt column.
for split_path, split_df in (
    ("./finetuning_P3dataset/train.csv", train_df),
    ("./finetuning_P3dataset/validation.csv", validation_df),
    ("./finetuning_P3dataset/test.csv", test_df),
):
    split_df.to_csv(split_path, index=False)