In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os
os.environ['PATH_TO_ENV'] = "~/projects/chatsky-llm-autoconfig/.env"

In [3]:
from tqdm import tqdm
import json
import pickle
from augmentation_utils import (
    check_no_duplicates_one_uttr_list, is_correct_length_modified, match_roles_modified
    )

In [4]:
from dialogue2graph.pipelines.core.dialogue import DialogueMessage, Dialogue



# 1. Checking data

In [130]:
with open(f"../data/gen_dataset_augmented_0-402", "rb") as file:
    data = pickle.load(file)
len(data)

402

In [None]:
# with open("../data/d2g_generated_without_hello_in_the_middle.json", "r") as fp:
#     data_without_hello = json.load(fp)
# len(data_without_hello)

377

## matching roles and length correctness

In [131]:
failure_instances_length = []
failure_instances_roles = []

for i, instance in enumerate(tqdm(data)):
    dialogues = [dial['messages'] for dial in instance['dialogues']]
    augmented_dialogues = [dial['messages'] for dial in instance['augmented_dialogues']]

    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        length_comparison = is_correct_length_modified(orig_dia, aug_dia)
        roles_comparison = match_roles_modified(orig_dia, aug_dia)

        if length_comparison != True:
            failure_instances_length.append((i, j, length_comparison))
        if roles_comparison != True:
            failure_instances_roles.append((i, j, roles_comparison))      

len(failure_instances_length), len(failure_instances_roles)

100%|██████████| 402/402 [00:00<00:00, 19567.03it/s]


(50, 50)

## duplicates in one_uttr_list

In [132]:
errors = []
failure_examples = []

for i, example in enumerate(tqdm(data)):
    augmented_dialogues = [dial['messages'] for dial in example['augmented_dialogues']]

    for j, aug_dia in enumerate(augmented_dialogues):
        try:
            if not check_no_duplicates_one_uttr_list(aug_dia):
                failure_examples.append((i, j))
        except Exception as e:
            errors.append((i, j, e))
len(errors), failure_examples

100%|██████████| 402/402 [00:00<00:00, 20211.09it/s]


(50, [])

# 2. Removing what can't be combined

Count lengths of utterances_lists. if the number of generated variations for each original utterance in the dialogue equals 3, the example will be added to the "normals". Otherwise - to the "exceptions"

In [133]:
all_lens = []
normals, exceptions, errors = [], [], []

for i, example in enumerate(tqdm(data)):
    augmented_dialogues = example['augmented_dialogues']

    for j, aug_dia in enumerate(augmented_dialogues):
        try:
            utterances_lists = [turn['text'] for turn in aug_dia['messages']]
            lens = [len(uttr_list) for uttr_list in utterances_lists]
            if set(lens) == {3}:
                normals.append((i, j))
            else:
                # exceptions.append((i, j, lens))
                exceptions.append((i, j))
            all_lens.append(lens)
        except Exception as e:
            # errors.append((i, j, e))
            errors.append((i, j))     
len(errors), len(all_lens), len(normals), len(exceptions)

100%|██████████| 402/402 [00:00<00:00, 18798.89it/s]


(50, 4453, 3947, 506)

Making new data without exceptions and errors:

In [134]:
new_data = []

for i, example in enumerate(tqdm(data)):
    new_example = {}
    new_example['graph'] = example['graph']
    new_example['topic'] = example['topic']
    new_example['dialogues'] = []
    new_example['augmented_dialogues'] = []
        
    dialogues = example['dialogues']
    augmented_dialogues = example['augmented_dialogues']

    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        if (i, j) not in exceptions and (i, j) not in errors:
            new_example['dialogues'].append(orig_dia)
            new_example['augmented_dialogues'].append(aug_dia)
        
    new_data.append(new_example)
len(new_data)

  0%|          | 0/402 [00:00<?, ?it/s]

100%|██████████| 402/402 [00:00<00:00, 8679.26it/s]


402

In [135]:
all_lens = []
normals, exceptions, errors = [], [], []

for i, example in enumerate(tqdm(new_data)):
    augmented_dialogues = example['augmented_dialogues']

    for j, aug_dia in enumerate(augmented_dialogues):
        try:
            utterances_lists = [turn['text'] for turn in aug_dia['messages']]
            lens = [len(uttr_list) for uttr_list in utterances_lists]
            if set(lens) == {3}:
                normals.append((i, j))
            else:
                # exceptions.append((i, j, lens))
                exceptions.append((i, j))
            all_lens.append(lens)
        except Exception as e:
            errors.append((i, j, e))    
errors

100%|██████████| 402/402 [00:00<00:00, 20903.66it/s]


[]

In [136]:
new_data_2 = []

for i, example in enumerate(tqdm(new_data)):
    if example['augmented_dialogues'] == []:
        print(i)
        continue

    new_example = {}
    new_example['graph'] = example['graph']
    new_example['topic'] = example['topic']
    new_example['dialogues'] = example['dialogues']
    new_example['augmented_dialogues'] = example['augmented_dialogues']

    new_data_2.append(new_example)
len(new_data_2)

100%|██████████| 402/402 [00:00<00:00, 515472.40it/s]

135
182
200
242
248
293





396

In [137]:
with open(f"../data/gen_dataset_augmented_0-402_cleaned.json", "w", encoding="utf-8") as file:
    json.dump(new_data_2, file, indent=4)

# 3. Adding utterances to nodes and edges

In [138]:
with open("../data/gen_dataset_augmented_0-402_cleaned.json", "r") as fp:
    data = json.load(fp)
len(data)

396

In [139]:
for i, example in enumerate(tqdm(data)):
    dialogues = example['dialogues']
    augmented_dialogues = example['augmented_dialogues']
    
    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        for orig_turn, aug_turn in zip(orig_dia['messages'], aug_dia['messages']):
            phrase_to_look_for = orig_turn['text']
            phrases_to_add = aug_turn['text']

            if aug_turn['participant'] == 'assistant':
                key = 'nodes'
            elif aug_turn['participant'] == 'user':
                key = 'edges'
            
            for turn in example['graph'][key]:
                if (phrase_to_look_for in turn['utterances'] and 
                    phrases_to_add not in turn['utterances']):
                    turn['utterances'] += phrases_to_add

100%|██████████| 396/396 [00:00<00:00, 2331.60it/s]


In [140]:
data[0]['graph']['edges'][4]

{'source': 5,
 'target': 6,
 'utterances': ['Medium, please.',
  "I'd like a medium size, please.",
  'Medium would be great, thanks.',
  'Please get me a medium.',
  'Medium, please.',
  'I still want a medium size.',
  'Let’s stick with medium, please.',
  'A medium size, please.',
  'Medium would be great, thanks.',
  "I'd like a medium, please.",
  'Medium, please.',
  'Still medium, thank you.',
  'Let’s stick with medium, please.',
  "I'll take a medium, please.",
  'Medium would be great, thank you.',
  "I'd like a medium size.",
  'Medium, please.',
  'I’ll go with medium again.',
  'A medium size, please.']}

In [141]:
for i, example in enumerate(tqdm(data)):
    for key in ['nodes', 'edges']:
        for turn in example['graph'][key]:
            while '' in turn['utterances']:
                turn['utterances'].remove('')
            turn['utterances'] = list(set(turn['utterances']))

100%|██████████| 396/396 [00:00<00:00, 10435.95it/s]


In [142]:
data[0]['graph']['edges'][4]

{'source': 5,
 'target': 6,
 'utterances': ["I'd like a medium size.",
  'Medium, please.',
  'A medium size, please.',
  "I'd like a medium, please.",
  'Medium would be great, thank you.',
  'Please get me a medium.',
  'I’ll go with medium again.',
  'Let’s stick with medium, please.',
  'Medium would be great, thanks.',
  "I'd like a medium size, please.",
  'I still want a medium size.',
  "I'll take a medium, please.",
  'Still medium, thank you.']}

In [143]:
with open(f"../data/gen_dataset_augmented_0-402_nodes_edges.json", "w", encoding="utf-8") as file:
    json.dump(data, file, indent=4)

# 4. Combine new augmented dialogues with one set of variations:

In [147]:
with open("../data/gen_dataset_augmented_0-402_nodes_edges.json", "r") as fp:
    data = json.load(fp)
len(data)

396

In [145]:
def combine_new_aug_dia(aug_dia, k):
    new_aug_dia = {}
    new_aug_dia['id'] = aug_dia['id'] + f'_{k}'
    new_aug_dia['messages'] = []

    roles_to_add = [turn['participant'] for turn in aug_dia['messages']]            
    utterances_to_add = [turn['text'][k] for turn in aug_dia['messages']]

    for role, uttr in zip(roles_to_add, utterances_to_add):
        dict_messages = {}
        dict_messages['participant'] = role
        dict_messages['text'] = uttr         
        new_aug_dia['messages'].append(dict_messages)

    return new_aug_dia

In [148]:
new_data = []

for i, example in enumerate(tqdm(data)):
    # make a copy of example from old data
    new_example = {}
    new_example['graph'] = example['graph']
    new_example['topic'] = example['topic']
    
    # dialogues and augmented_dialogues will be set later
    new_example['dialogues'] = []
    new_example['augmented_dialogues'] = []    

    dialogues = example['dialogues']
    augmented_dialogues = example['augmented_dialogues']

    # take a pair of orig_dia and aug_dia (aug_dia has 3 variations of all phrases)
    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        # set an orig_dia without changes
        new_example['dialogues'].append(orig_dia)

        #combine 3 new aug_dias from 1 old aug_dia
        for k in range(3):
            new_aug_dia = combine_new_aug_dia(aug_dia, k)
            # add new aug_dia to the new example
            new_example['augmented_dialogues'].append(new_aug_dia)
            
    # add new example with new aug_dias to the new data
    new_data.append(new_example)
len(new_data)

  0%|          | 0/396 [00:00<?, ?it/s]

100%|██████████| 396/396 [00:00<00:00, 4823.36it/s]


396

In [149]:
with open(f"../data/gen_dataset_augmented_0-402_combined.json", "w", encoding="utf-8") as file:
    json.dump(new_data, file, indent=4)

In [155]:
new_data[0]['augmented_dialogues'][0]

{'id': 'Responding to DMs on Instagram/Facebook._1_0_0',
 'messages': [{'participant': 'assistant',
   'text': 'Hello! How can I help you today?'},
  {'participant': 'user', 'text': 'I’d like to place an order.'},
  {'participant': 'assistant',
   'text': 'Awesome! Which product are you interested in?'},
  {'participant': 'user', 'text': 'I would like a t-shirt.'},
  {'participant': 'assistant', 'text': 'What size do you prefer?'},
  {'participant': 'user', 'text': "I'd like a medium size, please."},
  {'participant': 'assistant', 'text': 'Do you want that in red or blue?'},
  {'participant': 'user', 'text': 'I prefer red, please.'},
  {'participant': 'assistant',
   'text': 'Great choice! Could you provide your shipping address?'},
  {'participant': 'user', 'text': 'Sure, it’s 123 Main Street.'},
  {'participant': 'assistant',
   'text': 'Thanks! Your order has been successfully placed. Is there anything else you need?'},
  {'participant': 'user', 'text': 'Actually, I’d like to modify

In [157]:
new_data[0]['graph']['edges']

[{'source': 1,
  'target': 2,
  'utterances': ['I’d like to make an order.',
   'I want to order something.',
   'I’d like to place an order.',
   'I want to make an order.',
   'I want to make a purchase.',
   'I want to place an order.',
   "I'm looking to order something.",
   "I'm looking to place an order."]},
 {'source': 1,
  'target': 3,
  'utterances': ['I would like some details on your product offerings.',
   "I'd like some product information.",
   'Could you provide me with some details about your products?',
   'Could you provide me with details about your products?',
   "I'm interested in learning more about your products.",
   'I would like to get some information on your offerings.']},
 {'source': 1,
  'target': 4,
  'utterances': ['No worries, I appreciate it.',
   'Never mind, thanks.',
   "It's all good, thanks anyway.",
   "That's fine, thank you."]},
 {'source': 2,
  'target': 5,
  'utterances': ["I'd love to order a t-shirt.",
   "Let's go with a t-shirt.",
   'I 