In [12]:
import json
import os
import glob
import pandas as pd
from tqdm import tqdm
import random
from datasets import Dataset


In [13]:
# Locations of the tool catalogue and of the per-tool descriptions.
TOOLS_JSON_PATH = "../data/tools.json"
TOOLS_DESCRIPTION_JSON_PATH = "../data/tool_descriptions.json"

# Loaded as-is from JSON; later cells iterate it as key -> description pairs.
with open(TOOLS_DESCRIPTION_JSON_PATH, "r", encoding="utf-8") as f:
    tools_description_map = json.load(f)

# Iterated below as {category: [tool, ...]} where every tool carries a "name" key.
with open(TOOLS_JSON_PATH, "r", encoding="utf-8") as f:
    tools = json.load(f)

# tools_map: flat list of every tool name, in file order (duplicates kept).
# tools_to_integration_map: tool name -> category it was listed under.
# NOTE: a tool name listed under more than one category keeps only the LAST
# category seen, because the dict is keyed by name alone.
tools_map = []
tools_to_integration_map = {}
# The inner list gets its own name so the loop no longer overwrites `tools`.
for category, category_tools in tools.items():
    for tool in category_tools:
        tools_map.append(tool["name"])
        tools_to_integration_map[tool["name"]] = category


In [14]:
# Reverse lookup: description text -> tool name. When two entries share the
# same description, the one that appears later in tools_description_map wins.
tool_descriptions_to_tool_name = {
    description: name for name, description in tools_description_map.items()
}

In [None]:

def _base_tool_name(raw_tool):
    """Return the text before the first '-' and, within that, before the first space."""
    return raw_tool.split('-')[0].split(' ')[0]


# Sorted so the row order of the combined frame does not depend on the order
# in which the filesystem happens to list the files.
all_data_files = sorted(glob.glob("../data/sft_data/*.json"))
if not all_data_files:
    raise FileNotFoundError("No SFT data files matched ../data/sft_data/*.json")

# One concat over all frames instead of re-concatenating inside the loop
# (which copies the growing frame every iteration). ignore_index gives every
# row a unique label instead of repeating each file's own 0..n-1 index.
data_df = pd.concat(
    [pd.read_json(file) for file in all_data_files],
    ignore_index=True,
)

# Normalise every entry of 'tools_required' once, row by row.
required_base_names = [
    {_base_tool_name(tool) for tool in tools_required}
    for tools_required in tqdm(data_df['tools_required'].to_list())
]

# Distinct normalised tool names, sorted so the column order is reproducible.
tool_set = set()
for base_names in required_base_names:
    tool_set |= base_names
tool_set = sorted(tool_set)
print(len(tool_set))

# One boolean column per tool. Membership is tested against the NORMALISED
# names: testing against the raw entries would never match an entry that
# carries a suffix, even though that entry produced the column name.
for tool_name in tool_set:
    data_df[tool_name] = [tool_name in base_names for base_names in required_base_names]

data_df.to_pickle("../data/data_df.pkl")


In [17]:
# Reload the frame from "../data/data_df.pkl" (the path the to_pickle call in this
# notebook writes to). Unpickling can execute arbitrary code, so only load a file
# produced by a trusted run.
data_df = pd.read_pickle("../data/data_df.pkl")

In [19]:
# Display the tool name -> category mapping for inspection.
tools_to_integration_map

{'create_event': 'Calendar',
 'update_event': 'Calendar',
 'delete_event': 'Calendar',
 'search_events': 'Calendar',
 'create_file': 'Files',
 'update_file': 'Files',
 'trash_file': 'Files',
 'start_call': 'FaceTime',
 'send_email': 'Mail',
 'search_inbox': 'Mail',
 'get_directions': 'Maps',
 'search_location': 'Maps',
 'play_song': 'Music',
 'search_library': 'Music',
 'send_message': 'Slack',
 'search_messages': 'Messages',
 'create_note': 'Notes',
 'update_note': 'Notes',
 'create_reminder': 'Reminders',
 'update_reminder': 'Reminders',
 'capture_screenshot': 'System',
 'find_symbol': 'Stocks',
 'open_url': 'WebBrowser',
 'get_weather': 'Weather',
 'reply': 'DirectMessage',
 'unachievable_task': 'Unachievable'}

(466060, 31)

In [None]:
# Build one evaluation record per row of data_df:
#   query        - the query text
#   tools        - de-duplicated tool entries (empty for unachievable rows)
#   decision     - 'unachievable' or 'integrations'
#   integrations - de-duplicated categories of the required tools
evals_json = []
unknown_tools = set()  # unmapped tool names already reported

# Iterate the two needed columns directly; DataFrame.iterrows() builds a
# Series for every row, which is very slow on a frame of this size.
for query, tools_required in zip(data_df['query'], data_df['tools_required']):
    # Copy first: appending to the original list would modify data_df itself.
    required = list(tools_required)

    if 'reply' in required:
        decision = 'integrations'
        # 'send_message' is added for reply rows. The integration lookup below
        # runs over this same list, so its category is included as well.
        required.append('send_message')
        tools = required
    elif 'unachievable_task' in required:
        decision = 'unachievable'
        tools = []
    else:
        decision = 'integrations'
        tools = required

    integrations = []
    for tool in required:
        # Keep the text before the first '-' and, within that, before the first space.
        tool = tool.split('-')[0].split(' ')[0]
        if tool in tools_to_integration_map:
            integrations.append(tools_to_integration_map[tool])
        elif 'Slack' in tool:
            integrations.append('Slack')
        elif 'message' in tool:
            integrations.append('Messages')
        elif tool not in unknown_tools:
            # Report each unmapped tool once instead of once per occurrence.
            unknown_tools.add(tool)
            print(f'Tool not found: {tool}')

    evals_json.append({
        "query": query,
        # dict.fromkeys drops duplicates but keeps first-seen order, so the
        # output is identical from run to run (set() order is not).
        "tools": list(dict.fromkeys(tools)),
        "decision": decision,
        "integrations": list(dict.fromkeys(integrations)),
    })

Tool not found: search_reminders


KeyboardInterrupt: 

In [None]:
# Write the evaluation records to disk as a single JSON document.
with open("../data/evals_v0.json", "w") as evals_file:
    json.dump(evals_json, evals_file)

In [43]:
# Build (query, integration) pairs: one pair for every row flagged as using a
# tool, for every tool that has an entry in tools_to_integration_map.
positive_pairs = []
for tool_name in tool_set:
    queries_using_tool = data_df.loc[data_df[tool_name] == True, 'query']
    if tool_name in tools_to_integration_map:
        integration_name = tools_to_integration_map[tool_name]
        positive_pairs.extend(
            (query, integration_name) for query in queries_using_tool.tolist()
        )

print(len(positive_pairs))

954422


In [50]:
import random

# Show one random pair. random.choice is used because random.randint(0, n)
# includes n itself, which is one past the last valid index and raises
# IndexError whenever it is drawn.
random.choice(positive_pairs)

('Update the onboarding checklist file, create a note about changes, and send it to HR via Slack.',
 'Slack')

In [69]:
from datasets import load_dataset
from datasets import DatasetDict

# Fixed seed so the train/val/test membership is the same on every run.
SPLIT_SEED = 42

dataset = load_dataset("json", data_files="../data/positive_pairs.json", field='data')

# No separate shuffle step: shuffle() returns a new object (calling it without
# keeping the result does nothing) and train_test_split already shuffles by
# default before splitting.
# First hold out 10% as test, then 10% of the remainder as validation.
dataset_train_test = dataset['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)
dataset_test = dataset_train_test['test']
dataset_train_val = dataset_train_test['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)
dataset_train = dataset_train_val['train']
dataset_val = dataset_train_val['test']

dataset_dict = DatasetDict({
    "train": dataset_train,
    "val": dataset_val,
    "test": dataset_test
})

dataset_dict.save_to_disk("../data/positive_pairs_train_val_test")

Saving the dataset (1/1 shards): 100%|██████████| 773081/773081 [00:03<00:00, 197388.81 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 85898/85898 [00:00<00:00, 140644.91 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 95443/95443 [00:00<00:00, 193616.09 examples/s]


In [70]:
# Imported here because no earlier cell brings load_from_disk into scope;
# without it this cell raises NameError when the notebook is run top to bottom.
from datasets import load_from_disk

# Reload the positive-pair splits saved to disk.
dataset_train_val_test = load_from_disk("../data/positive_pairs_train_val_test")

In [71]:
# Every value of tools_description_map, in the map's own order (duplicates kept).
all_tool_descriptions = [description for description in tools_description_map.values()]

In [None]:
def _build_labelled_pairs(split, candidate_descriptions, rng):
    """Turn the positive (text1, text2) rows of `split` into labelled pairs.

    Each row is kept unchanged with label 1 with probability 0.5. Otherwise
    its text2 is replaced by a different description drawn uniformly from the
    distinct values of `candidate_descriptions`, and the row gets label 0.

    Args:
        split: object whose 'text1' and 'text2' columns can be iterated.
        candidate_descriptions: descriptions that negatives are drawn from.
        rng: random.Random instance that supplies all randomness.

    Returns:
        A list of {"text1", "text2", "label"} dicts, in row order.

    Raises:
        ValueError: if a negative is needed but no other description exists.
    """
    # De-duplicate once, keeping a stable order so a seeded rng is reproducible.
    unique_descriptions = list(dict.fromkeys(candidate_descriptions))
    negatives_for = {}  # positive description -> list of all other descriptions

    pairs = []
    for query, positive_description in zip(split['text1'], split['text2']):
        if rng.random() < 0.5:
            label = 1
            description = positive_description
        else:
            # Build the candidate list once per distinct description instead
            # of rebuilding a set and a list for every single row.
            if positive_description not in negatives_for:
                negatives_for[positive_description] = [
                    other for other in unique_descriptions if other != positive_description
                ]
            candidates = negatives_for[positive_description]
            if not candidates:
                raise ValueError("No alternative description available for a negative pair")
            # NOTE(review): if the same query also appears in another row with a
            # different description, the one drawn here may be a true match for
            # it - confirm whether that label noise is acceptable.
            label = 0
            description = rng.choice(candidates)

        pairs.append({
            "text1": query,
            "text2": description,
            "label": label
        })
    return pairs


# Fixed seed so the labelled val/test sets are the same on every run.
NEGATIVE_SAMPLING_SEED = 42
rng = random.Random(NEGATIVE_SAMPLING_SEED)

val_dataset = _build_labelled_pairs(dataset_train_val_test['val'], all_tool_descriptions, rng)
no_of_positive_pairs = sum(pair["label"] for pair in val_dataset)
no_of_negative_pairs = len(val_dataset) - no_of_positive_pairs
print(no_of_positive_pairs, no_of_negative_pairs)

test_dataset = _build_labelled_pairs(dataset_train_val_test['test'], all_tool_descriptions, rng)
no_of_positive_pairs = sum(pair["label"] for pair in test_dataset)
no_of_negative_pairs = len(test_dataset) - no_of_positive_pairs
print(no_of_positive_pairs, no_of_negative_pairs)


42700 43198
48001 47442


In [96]:

# Combine the train split as loaded with the labelled val/test pairs, then save.
dataset_dict = DatasetDict(
    dict(
        train=dataset_train_val_test['train'],
        val=Dataset.from_list(val_dataset),
        test=Dataset.from_list(test_dataset),
    )
)

dataset_dict.save_to_disk("../data/positive_pairs_train_val_test_1")

Saving the dataset (1/1 shards): 100%|██████████| 773081/773081 [00:01<00:00, 735253.09 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 85898/85898 [00:00<00:00, 574763.30 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 95443/95443 [00:00<00:00, 622314.63 examples/s]


In [4]:
from datasets import load_from_disk

# Reload the DatasetDict saved under "../data/positive_pairs_train_val_test_1".
dataset_dict = load_from_disk("../data/positive_pairs_train_val_test_1")

# Print the summary of the loaded DatasetDict.
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text1', 'text2'],
        num_rows: 773081
    })
    val: Dataset({
        features: ['text1', 'text2', 'label'],
        num_rows: 85898
    })
    test: Dataset({
        features: ['text1', 'text2', 'label'],
        num_rows: 95443
    })
})


In [7]:
# Display the summary of the test split.
dataset_dict['test']

Dataset({
    features: ['text1', 'text2', 'label'],
    num_rows: 95443
})

In [33]:
# Collect the query text (text1) of every test example, then flag the rows of
# data_df whose query is among them in a new boolean 'test' column.
all_queries = list(example['text1'] for example in dataset_dict['test'])
all_queries_set = set(all_queries)

data_df['test'] = data_df['query'].apply(lambda query: query in all_queries_set)

In [35]:
import random

# Keep only the rows flagged as belonging to the test split.
data_df_test = data_df.loc[data_df['test'] == True]

# Deterministic full shuffle (random_state=42), renumber the rows from 0,
# then take the first 500 as the evaluation sample.
data_df_test_shuffled = data_df_test.sample(frac=1, random_state=42)
data_df_test_shuffled = data_df_test_shuffled.reset_index(drop=True)
data_df_test_500 = data_df_test_shuffled.iloc[:500]


In [37]:
# Build one evaluation record per row of the 500-row test sample:
#   query        - the query text
#   tools        - de-duplicated tool entries (empty for unachievable rows)
#   decision     - 'unachievable' or 'integrations'
#   integrations - de-duplicated categories of the required tools
evals_json = []
unknown_tools = set()  # unmapped tool names already reported

# Iterate the two needed columns directly rather than building a Series per
# row with DataFrame.iterrows().
for query, tools_required in zip(data_df_test_500['query'], data_df_test_500['tools_required']):
    # Copy first: appending to the original list would modify the frame itself.
    required = list(tools_required)

    if 'reply' in required:
        decision = 'integrations'
        # 'send_message' is added for reply rows. The integration lookup below
        # runs over this same list, so its category is included as well.
        required.append('send_message')
        tools = required
    elif 'unachievable_task' in required:
        decision = 'unachievable'
        tools = []
    else:
        decision = 'integrations'
        tools = required

    integrations = []
    for tool in required:
        # Keep the text before the first '-' and, within that, before the first space.
        tool = tool.split('-')[0].split(' ')[0]
        if tool in tools_to_integration_map:
            integrations.append(tools_to_integration_map[tool])
        elif 'Slack' in tool:
            integrations.append('Slack')
        elif 'message' in tool:
            integrations.append('Messages')
        elif tool not in unknown_tools:
            # Report each unmapped tool once instead of once per occurrence.
            unknown_tools.add(tool)
            print(f'Tool not found: {tool}')

    evals_json.append({
        "query": query,
        # dict.fromkeys drops duplicates but keeps first-seen order, so the
        # written file is identical from run to run (set() order is not).
        "tools": list(dict.fromkeys(tools)),
        "decision": decision,
        "integrations": list(dict.fromkeys(integrations)),
    })

with open("../data/evals_test_500_v0.json", "w") as f:
    json.dump(evals_json, f)

In [11]:
# Display data_df. The name must already exist in the kernel (it is assigned by
# the data-loading cell and by the read_pickle cell); otherwise this raises NameError.
data_df

NameError: name 'data_df' is not defined