In [2]:
import json
import os
import glob
import pandas as pd
from tqdm import tqdm
import random
from datasets import Dataset


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
TOOLS_JSON_PATH = "/home/ubuntu/Router/data/tools_v1.json"
TOOLS_DESCRIPTION_JSON_PATH = "/home/ubuntu/Router/data/tool_descriptions_v1.json"

tools_description_map = {}
with open(TOOLS_DESCRIPTION_JSON_PATH, "r") as f:
    tools_description_map = json.load(f)

with open(TOOLS_JSON_PATH, "r") as f:
    tools = json.load(f)

tools_map = []
tools_to_integration_map = {}
for category, tools in tools.items():
    for tool in tools:
        tools_map.append(tool["name"])
        tools_to_integration_map[tool["name"]] = category


In [4]:
tool_descriptions_to_tool_name = {}
for tool_name, tool_description in tools_description_map.items():
    tool_descriptions_to_tool_name[tool_description] = tool_name

In [5]:

all_data_files = glob.glob("../data/sft_data_v1/*.json")
data_df = None
for i, file in enumerate(all_data_files):
    if data_df is None:
        data_df = pd.read_json(file)
    else:
        data_df = pd.concat([data_df, pd.read_json(file)])

assert data_df is not None

tool_set = set()
for i, tools_required in tqdm(enumerate(data_df['tools_required'].to_list())):
    tool_list = []
    for tool in tools_required:
        tool_name = tool.split('-')[0].split(' ')[0]
        # data_df.loc[i, tool_name] = True
        tool_set.add(tool_name)
        # tool_list.append(tool_name)
    # data_df.loc[i, 'tools_required_list'] = tool_list

tool_set = list(tool_set)
print(len(tool_set))
for tool_name in tool_set:
    data_df[tool_name] = data_df['tools_required'].apply(lambda x: tool_name in x)
data_df.to_pickle("../data/data_df_v1.pkl")
# data_df.to_pickle("../data/sft_data/data_df.pkl")


466000it [00:00, 575594.33it/s]


30


In [7]:
print(tool_set)

['direct_reply_to_user', 'create_event', 'update_event', 'capture_screenshot', 'update_note', 'search_events', 'send_message_on_slack', 'search_location', 'trash_file', 'delete_event', 'unachievable_task', 'create_reminder', 'search_notes', 'update_reminder', 'unachievable', 'search_messages', 'update_file', 'open_url', 'create_file', 'start_call', 'find_stock_symbol', 'send_email', 'play_song', 'get_directions', 'Unachievable', 'search_inbox', 'send_message_on_messages', 'search_library', 'get_weather', 'create_note']


In [8]:
data_df = pd.read_pickle("../data/data_df_v1.pkl")

In [9]:
# make pairs for each row
positive_pairs = []
for tool_name in tool_set:
    filtered_df = data_df[data_df[tool_name] == True]
    if tool_name not in tools_to_integration_map:
        continue
    integration_name = tools_to_integration_map[tool_name]
    query_list = filtered_df['query'].tolist()
    for query in query_list:
        positive_pairs.append((query, integration_name))

print(len(positive_pairs))

813448


In [22]:
import json
positive_pairs_dict = []
for i in range(len(positive_pairs)):
    positive_pairs_dict.append({
        "text1": positive_pairs[i][0],
        "text2": positive_pairs[i][1]
    })

with open("../data/positive_pairs_v1.json", "w") as f:
    json.dump(positive_pairs_dict, f)

In [27]:
from datasets import load_dataset, load_from_disk
from datasets import DatasetDict
from datasets import Dataset

dataset = Dataset.from_list(positive_pairs_dict)
dataset.shuffle()

dataset_train_test = dataset.train_test_split(test_size=0.1)
dataset_test = dataset_train_test['test']
dataset_train_val = dataset_train_test['train'].train_test_split(test_size=0.1)
dataset_train = dataset_train_val['train']
dataset_val = dataset_train_val['test']

print(len(dataset_train), len(dataset_val), len(dataset_test))

dataset_dict = DatasetDict({
    "train": dataset_train,
    "val": dataset_val,
    "test": dataset_test
})

dataset_dict.save_to_disk("../data/positive_pairs_train_val_test_v1")

658892 73211 81345


Saving the dataset (1/1 shards): 100%|██████████| 658892/658892 [00:02<00:00, 251611.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 73211/73211 [00:00<00:00, 243926.60 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 81345/81345 [00:00<00:00, 257523.88 examples/s]


In [28]:
dataset_train_val_test = load_from_disk("../data/positive_pairs_train_val_test_v1")

In [29]:
all_tool_descriptions =list(tools_description_map.values())

In [31]:
queries = []
tool_descriptions = []
for i in range(len(dataset_train_val_test['val'])):
    queries.append(dataset_train_val_test['val'][i]['text1'])
    tool_descriptions.append(dataset_train_val_test['val'][i]['text2'])

val_dataset = []
no_of_positive_pairs = 0
no_of_negative_pairs = 0

for i, (q_i, tool_description_i) in enumerate(zip(queries, tool_descriptions)):
    if random.random() < 0.5:
        label = 1
        tool_description = tool_description_i
        no_of_positive_pairs += 1
    else:
        label = 0
        tool_description = random.choice(list(set(all_tool_descriptions) - {tool_description_i}))
        no_of_negative_pairs += 1

    data = {
        "text1": q_i,
        "text2": tool_description,
        "label": label
    }
    val_dataset.append(data)

print(no_of_positive_pairs, no_of_negative_pairs)

queries = []
tool_descriptions = []
for i in range(len(dataset_train_val_test['test'])):
    queries.append(dataset_train_val_test['test'][i]['text1'])
    tool_descriptions.append(dataset_train_val_test['test'][i]['text2'])
    

test_dataset = []

no_of_positive_pairs = 0
no_of_negative_pairs = 0

for i, (q_i, tool_description_i) in enumerate(zip(queries, tool_descriptions)):
    if random.random() < 0.5:
        label = 1
        tool_description = tool_description_i
        no_of_positive_pairs += 1
    else:
        label = 0
        tool_description = random.choice(list(set(all_tool_descriptions) - {tool_description_i}))
        no_of_negative_pairs += 1

    data = {
        "text1": q_i,
        "text2": tool_description,
        "label": label
    }
    test_dataset.append(data)

print(no_of_positive_pairs, no_of_negative_pairs)


36877 36334
40698 40647


In [33]:

dataset_dict = DatasetDict({
    "train": dataset_train_val_test['train'],
    "val": Dataset.from_list(val_dataset),
    "test": Dataset.from_list(test_dataset)
})

dataset_dict.save_to_disk("../data/positive_pairs_v1")

Saving the dataset (1/1 shards): 100%|██████████| 658892/658892 [00:00<00:00, 1870242.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 73211/73211 [00:00<00:00, 1216024.10 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 81345/81345 [00:00<00:00, 1025585.58 examples/s]


In [34]:
from datasets import load_from_disk

dataset_dict = load_from_disk("../data/positive_pairs_v1")


In [35]:
dataset_dict['test']

Dataset({
    features: ['text1', 'text2', 'label'],
    num_rows: 81345
})

In [36]:
all_queries = [x['text1'] for x in dataset_dict['test']]
all_queries_set = set(all_queries)

data_df['test'] = data_df['query'].apply(lambda x: x in all_queries_set)

In [37]:
data_df_test = data_df[data_df['test'] == True]
import random

# Shuffle the test dataframe and select 500 examples
data_df_test_shuffled = data_df_test.sample(frac=1, random_state=42).reset_index(drop=True)
data_df_test_500 = data_df_test_shuffled.head(500)


In [38]:
evals_json = []

for i, row in data_df_test_500.iterrows():
    row_dict = {
        "query": row['query'],
        "tools": row['tools_required'],
    }
    if 'reply' in row['tools_required']:
        row_dict['decision'] = 'integrations'
        # print(f'Query: {row["query"]}')
        # print(f'Reply: {row["tools_required"]}')
        row_dict['tools'].append('send_message')
    elif 'unachievable_task' in row['tools_required']:
        row_dict['decision'] = 'unachievable'
        row_dict['tools'] = []
        # print(f'Query: {row["query"]}')
        # print(f'Unachievable: {row["tools_required"]}')
    else:
        row_dict['decision'] = 'integrations'
    integrations = []
    for tool in row['tools_required']:
        tool = tool.split('-')[0].split(' ')[0]
        if tool in tools_to_integration_map:
            integrations.append(tools_to_integration_map[tool])
        else:
            if 'Slack' in tool:
                integrations.append('Slack')
            elif 'message' in tool:
                integrations.append('Messages')
            else:
                print(f'Tool not found: {tool}')

    row_dict['tools'] = list(set(row_dict['tools']))
    row_dict['integrations'] = list(set(integrations))
    evals_json.append(row_dict)

with open("../data/evals_test_500_v1.json", "w") as f:
    json.dump(evals_json, f)