In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
import os

os.environ["PATH_TO_ENV"] = "~/projects/chatsky-llm-autoconfig/.env"

In [3]:
from tqdm import tqdm
import json
import pickle
from augmentation_utils import (
    check_no_duplicates_one_uttr_list,
    is_correct_length_modified,
    match_roles_modified,
    check_no_duplicates_one_dialogue,
)

# 1. Removing data with greetings in the middle

In [4]:
with open("../data/gen_dataset_augmented_0-402_v4", "rb") as file:
    data = pickle.load(file)
len(data)

402

In [5]:
with open("../data/idx_and_damaged_topics.json", "r") as file:
    data_to_remove = json.load(file)
len(data_to_remove)

25

In [6]:
data_to_remove[0]

[256, 'Recovering deleted customer support chat logs']

In [7]:
new_data = []

for i, example in enumerate(tqdm(data)):
    if (i, example["topic"]) in [(j, topic) for (j, topic) in data_to_remove]:
        continue

    new_example = {}
    new_example["graph"] = example["graph"]
    new_example["topic"] = example["topic"]
    new_example["dialogues"] = example["dialogues"]
    new_example["augmented_dialogues"] = example["augmented_dialogues"]

    new_data.append(new_example)
len(new_data)

100%|██████████| 402/402 [00:00<00:00, 236182.97it/s]


377

In [8]:
with open("../data/d2g_generated_without_hello_in_the_middle.json", "r") as file:
    data_to_compare_with = json.load(file)
len(data_to_compare_with)

377

In [9]:
count = 0
for d1, d2 in zip(new_data, data_to_compare_with):
    if (
        d1["graph"] == d2["graph"]
        and d1["topic"] == d2["topic"]
        and d1["dialogues"] == d2["dialogues"]
    ):
        count += 1
count

377

In [10]:
with open("../data/gen_dataset_augmented_0-402_without_hello_v4", "wb") as file:
    pickle.dump(new_data, file)

# 2. Re-checking

## matching roles and length correctness

In [11]:
with open("../data/gen_dataset_augmented_0-402_without_hello_v4", "rb") as file:
    data = pickle.load(file)
len(data)

377

In [12]:
failure_instances_length = []
failure_instances_roles = []

for i, instance in enumerate(tqdm(data)):
    dialogues = [dial["messages"] for dial in instance["dialogues"]]
    augmented_dialogues = [dial["messages"] for dial in instance["augmented_dialogues"]]

    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        length_comparison = is_correct_length_modified(orig_dia, aug_dia)
        roles_comparison = match_roles_modified(orig_dia, aug_dia)

        if length_comparison != True:
            failure_instances_length.append((i, j, length_comparison))
        if roles_comparison != True:
            failure_instances_roles.append((i, j, roles_comparison))

len(failure_instances_length), len(failure_instances_roles)

100%|██████████| 377/377 [00:00<00:00, 22009.53it/s]


(49, 49)

## duplicates in one_uttr_list

In [13]:
errors = []
failure_examples = []

for i, example in enumerate(tqdm(data)):
    augmented_dialogues = [dial["messages"] for dial in example["augmented_dialogues"]]

    for j, aug_dia in enumerate(augmented_dialogues):
        try:
            if not check_no_duplicates_one_uttr_list(aug_dia):
                failure_examples.append((i, j))
        except Exception as e:
            errors.append((i, j, e))
len(errors), failure_examples

100%|██████████| 377/377 [00:00<00:00, 33500.40it/s]


(49, [])

# set(len(uttr_list)) = {3}

## 2. Removing what can't be combined

Count lengths of utterances_lists. if the number of generated variations for each original utterance in the dialogue equals 3, the example will be added to the "normals". Otherwise - to the "exceptions"

In [45]:
with open("../data/gen_dataset_augmented_0-402_without_hello_v4", "rb") as file:
    data = pickle.load(file)
len(data)

377

In [46]:
all_lens = []
normals, exceptions, errors = [], [], []

for i, example in enumerate(tqdm(data)):
    augmented_dialogues = example["augmented_dialogues"]

    for j, aug_dia in enumerate(augmented_dialogues):
        try:
            utterances_lists = [turn["text"] for turn in aug_dia["messages"]]
            lens = [len(uttr_list) for uttr_list in utterances_lists]
            if set(lens) == {3}:
                normals.append((i, j))
            else:
                # exceptions.append((i, j, lens))
                exceptions.append((i, j))
            all_lens.append(lens)
        except Exception:
            # errors.append((i, j, e))
            errors.append((i, j))
len(errors), len(all_lens), len(normals), len(exceptions)

100%|██████████| 377/377 [00:00<00:00, 27795.89it/s]


(49, 4136, 3671, 465)

Making new data without exceptions and errors:

In [17]:
new_data = []

for i, example in enumerate(tqdm(data)):
    new_example = {}
    new_example["graph"] = example["graph"]
    new_example["topic"] = example["topic"]
    new_example["dialogues"] = []
    new_example["augmented_dialogues"] = []

    dialogues = example["dialogues"]
    augmented_dialogues = example["augmented_dialogues"]

    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        if (i, j) not in exceptions and (i, j) not in errors:
            new_example["dialogues"].append(orig_dia)
            new_example["augmented_dialogues"].append(aug_dia)

    new_data.append(new_example)
len(new_data)

100%|██████████| 377/377 [00:00<00:00, 9130.27it/s]


377

In [18]:
all_lens = []
normals, exceptions, errors = [], [], []

for i, example in enumerate(tqdm(new_data)):
    augmented_dialogues = example["augmented_dialogues"]

    for j, aug_dia in enumerate(augmented_dialogues):
        try:
            utterances_lists = [turn["text"] for turn in aug_dia["messages"]]
            lens = [len(uttr_list) for uttr_list in utterances_lists]
            if set(lens) == {3}:
                normals.append((i, j))
            else:
                # exceptions.append((i, j, lens))
                exceptions.append((i, j))
            all_lens.append(lens)
        except Exception as e:
            errors.append((i, j, e))
errors

100%|██████████| 377/377 [00:00<00:00, 30084.14it/s]


[]

In [20]:
len(all_lens)

3671

In [21]:
new_data_2 = []

for i, example in enumerate(tqdm(new_data)):
    if example["augmented_dialogues"] == []:
        print(i)
        continue

    new_example = {}
    new_example["graph"] = example["graph"]
    new_example["topic"] = example["topic"]
    new_example["dialogues"] = example["dialogues"]
    new_example["augmented_dialogues"] = example["augmented_dialogues"]

    new_data_2.append(new_example)
len(new_data_2)

100%|██████████| 377/377 [00:00<00:00, 463574.50it/s]

128
174
190
232
238
282





371

In [22]:
with open(
    "../data/gen_dataset_augmented_0-402_cleaned_v4.json", "w", encoding="utf-8"
) as file:
    json.dump(new_data_2, file, indent=4)

## 3. Adding utterances to nodes and edges

In [23]:
with open("../data/gen_dataset_augmented_0-402_cleaned_v4.json", "r") as fp:
    data = json.load(fp)
len(data)

371

In [24]:
for i, example in enumerate(tqdm(data)):
    dialogues = example["dialogues"]
    augmented_dialogues = example["augmented_dialogues"]

    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        for orig_turn, aug_turn in zip(orig_dia["messages"], aug_dia["messages"]):
            phrase_to_look_for = orig_turn["text"]
            phrases_to_add = aug_turn["text"]

            if aug_turn["participant"] == "assistant":
                key = "nodes"
            elif aug_turn["participant"] == "user":
                key = "edges"

            for turn in example["graph"][key]:
                if (
                    phrase_to_look_for in turn["utterances"]
                    and phrases_to_add not in turn["utterances"]
                ):
                    turn["utterances"] += phrases_to_add

100%|██████████| 371/371 [00:00<00:00, 2384.95it/s]


In [25]:
data[0]["graph"]["edges"][4]

{'source': 5,
 'target': 6,
 'utterances': ['Medium, please.',
  "I'd like a medium size, please.",
  'Medium would be great, thanks.',
  'Please get me a medium.',
  'Medium, please.',
  'I still want a medium size.',
  'Let’s stick with medium, please.',
  'A medium size, please.',
  'Medium would be great, thanks.',
  "I'd like a medium, please.",
  'Medium, please.',
  'Still medium, thank you.',
  'Let’s stick with medium, please.',
  "I'll take a medium, please.",
  'Medium would be great, thank you.',
  "I'd like a medium size.",
  'Medium, please.',
  'I’ll go with medium again.',
  'A medium size, please.']}

In [26]:
for i, example in enumerate(tqdm(data)):
    for key in ["nodes", "edges"]:
        for turn in example["graph"][key]:
            while "" in turn["utterances"]:
                turn["utterances"].remove("")
            turn["utterances"] = list(set(turn["utterances"]))

100%|██████████| 371/371 [00:00<00:00, 12849.92it/s]


In [27]:
data[0]["graph"]["edges"][4]

{'source': 5,
 'target': 6,
 'utterances': ['Let’s stick with medium, please.',
  'A medium size, please.',
  'Medium, please.',
  'I still want a medium size.',
  'Medium would be great, thank you.',
  "I'd like a medium size.",
  'Please get me a medium.',
  "I'd like a medium size, please.",
  'I’ll go with medium again.',
  "I'll take a medium, please.",
  'Medium would be great, thanks.',
  "I'd like a medium, please.",
  'Still medium, thank you.']}

In [28]:
with open(
    "../data/gen_dataset_augmented_0-402_nodes_edges_v4.json", "w", encoding="utf-8"
) as file:
    json.dump(data, file, indent=4)

## 4. Combine new augmented dialogues with one set of variations:

In [29]:
with open("../data/gen_dataset_augmented_0-402_nodes_edges_v4.json", "r") as fp:
    data = json.load(fp)
len(data)

371

In [30]:
def combine_new_aug_dia(aug_dia, k):
    new_aug_dia = {}
    new_aug_dia["id"] = aug_dia["id"] + f"_{k}"
    new_aug_dia["messages"] = []

    roles_to_add = [turn["participant"] for turn in aug_dia["messages"]]
    utterances_to_add = [turn["text"][k] for turn in aug_dia["messages"]]

    for role, uttr in zip(roles_to_add, utterances_to_add):
        dict_messages = {}
        dict_messages["participant"] = role
        dict_messages["text"] = uttr
        new_aug_dia["messages"].append(dict_messages)

    return new_aug_dia

In [31]:
new_data = []

for i, example in enumerate(tqdm(data)):
    # make a copy of example from old data
    new_example = {}
    new_example["graph"] = example["graph"]
    new_example["topic"] = example["topic"]

    # dialogues and augmented_dialogues will be set later
    new_example["dialogues"] = []
    new_example["augmented_dialogues"] = []

    dialogues = example["dialogues"]
    augmented_dialogues = example["augmented_dialogues"]

    # take a pair of orig_dia and aug_dia (aug_dia has 3 variations of all phrases)
    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        # set an orig_dia without changes
        new_example["dialogues"].append(orig_dia)

        # combine 3 new aug_dias from 1 old aug_dia
        for k in range(3):
            new_aug_dia = combine_new_aug_dia(aug_dia, k)
            # add new aug_dia to the new example
            new_example["augmented_dialogues"].append(new_aug_dia)

    # add new example with new aug_dias to the new data
    new_data.append(new_example)
len(new_data)

  0%|          | 0/371 [00:00<?, ?it/s]

100%|██████████| 371/371 [00:00<00:00, 4975.34it/s]


371

In [32]:
with open(
    "../data/gen_dataset_augmented_0-402_combined_v4.json", "w", encoding="utf-8"
) as file:
    json.dump(new_data, file, indent=4)

In [33]:
errors = []
failure_examples = []

for i, example in enumerate(tqdm(new_data)):
    print(i)
    augmented_dialogues = [dial["messages"] for dial in example["augmented_dialogues"]]

    for j, aug_dia in enumerate(augmented_dialogues):
        try:
            if not check_no_duplicates_one_dialogue(aug_dia):
                failure_examples.append((i, j))
        except Exception as e:
            errors.append((i, j, e))

errors

100%|██████████| 371/371 [00:00<00:00, 7761.30it/s]

0
common_elements: ["It's 123 Main Street."]
1
2
3
common_elements: ['I see that you have concerns about ethical sourcing. Can you share more about what specifically troubles you?']
common_elements: ['I recognize your worries about ethical sourcing. Could you provide more details on your specific concerns?']
common_elements: ['I understand your concerns regarding ethical sourcing. What particular aspects are you worried about?']
4
5
common_elements: ['Got it. Now, let’s move on to pricing. What budget do you have in mind for this project?']
6
7
8
9
10
11
common_elements: ['1.5 BTC', "Could you please confirm that you're sending {amount} Bitcoin?"]
common_elements: ['1.5 BTC']
common_elements: ['1.5 BTC']
common_elements: ['2 ETH', 'Just to confirm: You’re sending {amount} Ethereum. Is that correct?']
common_elements: ['I want to send 2 ETH.', 'Please verify: You are about to send {amount} Ethereum. Is that accurate?']
common_elements: ['Can you confirm that you want to send {amount} Et




[]

In [34]:
len(failure_examples)

653

# len(set(lens(uttr_list))) = 1

## 1. Re-checking

### matching roles and length correctness

In [105]:
with open("../data/gen_dataset_augmented_0-402_without_hello_v4", "rb") as file:
    data = pickle.load(file)
len(data)

377

In [106]:
failure_instances_length = []
failure_instances_roles = []

for i, instance in enumerate(tqdm(data)):
    dialogues = [dial["messages"] for dial in instance["dialogues"]]
    augmented_dialogues = [dial["messages"] for dial in instance["augmented_dialogues"]]

    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        length_comparison = is_correct_length_modified(orig_dia, aug_dia)
        roles_comparison = match_roles_modified(orig_dia, aug_dia)

        if length_comparison != True:
            failure_instances_length.append((i, j, length_comparison))
        if roles_comparison != True:
            failure_instances_roles.append((i, j, roles_comparison))

len(failure_instances_length), len(failure_instances_roles)

100%|██████████| 377/377 [00:00<00:00, 20678.89it/s]


(49, 49)

### duplicates in one_uttr_list

In [107]:
errors = []
failure_examples = []

for i, example in enumerate(tqdm(data)):
    augmented_dialogues = [dial["messages"] for dial in example["augmented_dialogues"]]

    for j, aug_dia in enumerate(augmented_dialogues):
        try:
            if not check_no_duplicates_one_uttr_list(aug_dia):
                failure_examples.append((i, j))
        except Exception as e:
            errors.append((i, j, e))
len(errors), failure_examples

100%|██████████| 377/377 [00:00<00:00, 30317.17it/s]


(49, [])

## 2. Removing what can't be combined

Count lengths of utterances_lists. If the number of generated variations for each original utterance in the dialogue is the same, the example will be added to the "normals". Otherwise - to the "exceptions"

In [108]:
with open("../data/gen_dataset_augmented_0-402_without_hello_v4", "rb") as file:
    data = pickle.load(file)
len(data)

377

In [109]:
for d in data:
    if d["topic"] == "Reporting an incorrect product allergen listing":
        dia = d["augmented_dialogues"][7]["messages"]

dia[0]["text"] = dia[0]["text"][1:]
dia[0]["text"]

['Hi there! What can I do for you today?',
 'Good day! How may I assist you?',
 'Welcome! How can I help you today?']

In [110]:
all_lens = []
normals, exceptions, errors = [], [], []

for i, example in enumerate(tqdm(data)):
    augmented_dialogues = example["augmented_dialogues"]

    for j, aug_dia in enumerate(augmented_dialogues):
        try:
            utterances_lists = [turn["text"] for turn in aug_dia["messages"]]
            lens = [len(uttr_list) for uttr_list in utterances_lists]
            if len(set(lens)) == 1:
                normals.append((i, j))
            else:
                # exceptions.append((i, j, lens))
                # exceptions.append((i, j, aug_dia['id']))
                exceptions.append((i, j))
            all_lens.append(lens)
        except Exception:
            # errors.append((i, j, e))
            errors.append((i, j))
len(errors), len(all_lens), len(normals), len(exceptions)

100%|██████████| 377/377 [00:00<00:00, 25011.11it/s]


(49, 4136, 4104, 32)

Making new data without exceptions and errors:

In [111]:
new_data = []

for i, example in enumerate(tqdm(data)):
    new_example = {}
    new_example["graph"] = example["graph"]
    new_example["topic"] = example["topic"]
    new_example["dialogues"] = []
    new_example["augmented_dialogues"] = []

    dialogues = example["dialogues"]
    augmented_dialogues = example["augmented_dialogues"]

    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        if (i, j) not in exceptions and (i, j) not in errors:
            new_example["dialogues"].append(orig_dia)
            new_example["augmented_dialogues"].append(aug_dia)

    new_data.append(new_example)
len(new_data)

100%|██████████| 377/377 [00:00<00:00, 37370.37it/s]


377

In [112]:
all_lens = []
normals, exceptions, errors = [], [], []

for i, example in enumerate(tqdm(new_data)):
    augmented_dialogues = example["augmented_dialogues"]

    for j, aug_dia in enumerate(augmented_dialogues):
        try:
            utterances_lists = [turn["text"] for turn in aug_dia["messages"]]
            lens = [len(uttr_list) for uttr_list in utterances_lists]
            if len(set(lens)) == 1:
                normals.append((i, j))
            else:
                # exceptions.append((i, j, lens))
                exceptions.append((i, j))
            all_lens.append(lens)
        except Exception as e:
            errors.append((i, j, e))
len(errors)

100%|██████████| 377/377 [00:00<00:00, 22789.87it/s]


0

In [113]:
len(all_lens)

4104

In [114]:
new_data_2 = []

for i, example in enumerate(tqdm(new_data)):
    if example["augmented_dialogues"] == []:
        print(i)
        continue

    new_example = {}
    new_example["graph"] = example["graph"]
    new_example["topic"] = example["topic"]
    new_example["dialogues"] = example["dialogues"]
    new_example["augmented_dialogues"] = example["augmented_dialogues"]

    new_data_2.append(new_example)
len(new_data_2)

100%|██████████| 377/377 [00:00<00:00, 299196.33it/s]

190





376

In [84]:
with open(
    "../data/gen_dataset_augmented_0-402_cleaned_v5.json", "w", encoding="utf-8"
) as file:
    json.dump(new_data_2, file, indent=4)

## 3. Adding utterances to nodes and edges

In [118]:
with open("../data/gen_dataset_augmented_0-402_cleaned_v5.json", "r") as fp:
    data = json.load(fp)
len(data)

376

In [119]:
for i, example in enumerate(tqdm(data)):
    dialogues = example["dialogues"]
    augmented_dialogues = example["augmented_dialogues"]

    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        for orig_turn, aug_turn in zip(orig_dia["messages"], aug_dia["messages"]):
            phrase_to_look_for = orig_turn["text"]
            phrases_to_add = aug_turn["text"]

            if aug_turn["participant"] == "assistant":
                key = "nodes"
            elif aug_turn["participant"] == "user":
                key = "edges"

            for turn in example["graph"][key]:
                if (
                    phrase_to_look_for in turn["utterances"]
                    and phrases_to_add not in turn["utterances"]
                ):
                    turn["utterances"] += phrases_to_add

100%|██████████| 376/376 [00:00<00:00, 2122.31it/s]


In [120]:
data[0]["graph"]["edges"][4]

{'source': 5,
 'target': 6,
 'utterances': ['Medium, please.',
  "I'd like a medium size, please.",
  'Medium would be great, thanks.',
  'Please get me a medium.',
  'Medium, please.',
  'I still want a medium size.',
  'Let’s stick with medium, please.',
  'A medium size, please.',
  'Medium would be great, thanks.',
  "I'd like a medium, please.",
  'Medium, please.',
  'Still medium, thank you.',
  'Let’s stick with medium, please.',
  "I'll take a medium, please.",
  'Medium would be great, thank you.',
  "I'd like a medium size.",
  'Medium, please.',
  'I’ll go with medium again.',
  'A medium size, please.']}

In [121]:
for i, example in enumerate(tqdm(data)):
    for key in ["nodes", "edges"]:
        for turn in example["graph"][key]:
            while "" in turn["utterances"]:
                turn["utterances"].remove("")
            turn["utterances"] = list(set(turn["utterances"]))
data[0]["graph"]["edges"][4]

  0%|          | 0/376 [00:00<?, ?it/s]

100%|██████████| 376/376 [00:00<00:00, 10783.16it/s]


{'source': 5,
 'target': 6,
 'utterances': ['Let’s stick with medium, please.',
  'A medium size, please.',
  'Medium, please.',
  'I still want a medium size.',
  'Medium would be great, thank you.',
  "I'd like a medium size.",
  'Please get me a medium.',
  "I'd like a medium size, please.",
  'I’ll go with medium again.',
  "I'll take a medium, please.",
  'Medium would be great, thanks.',
  "I'd like a medium, please.",
  'Still medium, thank you.']}

In [89]:
with open(
    "../data/gen_dataset_augmented_0-402_nodes_edges_v5.json", "w", encoding="utf-8"
) as file:
    json.dump(data, file, indent=4)

## 4. Combine new augmented dialogues with one set of variations:

In [122]:
with open("../data/gen_dataset_augmented_0-402_nodes_edges_v5.json", "r") as fp:
    data = json.load(fp)
len(data)

376

In [91]:
def combine_new_aug_dia(aug_dia, k):
    new_aug_dia = {}
    new_aug_dia["id"] = aug_dia["id"] + f"_{k}"
    new_aug_dia["messages"] = []

    roles_to_add = [turn["participant"] for turn in aug_dia["messages"]]
    utterances_to_add = [turn["text"][k] for turn in aug_dia["messages"]]

    for role, uttr in zip(roles_to_add, utterances_to_add):
        dict_messages = {}
        dict_messages["participant"] = role
        dict_messages["text"] = uttr
        new_aug_dia["messages"].append(dict_messages)

    return new_aug_dia

In [123]:
new_data = []

for i, example in enumerate(tqdm(data)):
    # make a copy of example from old data
    new_example = {}
    new_example["graph"] = example["graph"]
    new_example["topic"] = example["topic"]

    # dialogues and augmented_dialogues will be set later
    new_example["dialogues"] = []
    new_example["augmented_dialogues"] = []

    dialogues = example["dialogues"]
    augmented_dialogues = example["augmented_dialogues"]

    # take a pair of orig_dia and aug_dia (aug_dia has 3 variations of all phrases)
    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        # set an orig_dia without changes
        new_example["dialogues"].append(orig_dia)

        uttr_list_len = len(aug_dia["messages"][0]["text"])

        # combine 3 new aug_dias from 1 old aug_dia
        for k in range(uttr_list_len):
            new_aug_dia = combine_new_aug_dia(aug_dia, k)
            # add new aug_dia to the new example
            new_example["augmented_dialogues"].append(new_aug_dia)

    # add new example with new aug_dias to the new data
    new_data.append(new_example)
len(new_data)

  0%|          | 0/376 [00:00<?, ?it/s]

100%|██████████| 376/376 [00:00<00:00, 4689.31it/s]


376

In [100]:
with open(
    "../data/gen_dataset_augmented_0-402_combined_v5.json", "w", encoding="utf-8"
) as file:
    json.dump(new_data, file, indent=4)