In [1]:
from dotenv import load_dotenv

# python-dotenv: read a .env file and export its variables to the process
# environment. The value displayed under this cell is the call's return value.
load_dotenv()

True

In [2]:
import os

# Export the location of the project's .env file through PATH_TO_ENV
# (presumably read by the project's own configuration code — confirm).
# "~" is expanded here: a literal tilde stored in an environment variable is
# not expanded by the shell or by open() when the value is read back later.
os.environ["PATH_TO_ENV"] = os.path.expanduser("~/projects/chatsky-llm-autoconfig/.env")

In [3]:
from tqdm import tqdm
import pickle
from augmentation_utils import (
    check_no_duplicates_all_dialogues,
    check_no_duplicates_one_dialogue,
    check_no_duplicates_one_uttr_list,
    is_correct_length_modified,
    match_roles_modified,
)

In [4]:
# Read the pickled dataset from disk and display how many examples it holds.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open("../data/gen_dataset_augment_uttr-vars_9", mode="rb") as dataset_file:
    data = pickle.load(dataset_file)
len(data)

100

### Checking matching roles and length correctness

In [12]:
# Length check: each augmented dialogue is passed to `is_correct_length_modified`
# together with the original dialogue at the same position.
#   failure_examples -> (example index, dialogue index) pairs that failed the check
#   errors           -> (example index, dialogue index, exception) for calls that raised
#   new_data         -> examples whose pass count equals their number of augmented dialogues
errors = []
failure_examples = []
new_data = []

for i, example in enumerate(tqdm(data)):
    dialogues = [dial["messages"] for dial in example["dialogues"]]
    augmented_dialogues = [dial["messages"] for dial in example["augmented_dialogues"]]
    true_count = 0

    # NOTE: zip() stops at the shorter list, so unpaired dialogues are never checked.
    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        # Guard each call on its own: an exception for one pair is recorded
        # without skipping the remaining dialogues of this example.
        try:
            if is_correct_length_modified(orig_dia, aug_dia):
                true_count += 1
            else:
                failure_examples.append((i, j))
        except Exception as e:
            errors.append((i, j, e))

    if true_count == len(augmented_dialogues):
        new_data.append(example)

failure_examples, errors

100%|██████████| 100/100 [00:00<00:00, 60047.30it/s]


([(26, 3),
  (54, 4),
  (75, 16),
  (92, 0),
  (92, 1),
  (92, 4),
  (92, 5),
  (92, 6),
  (97, 0)],
 [])

Inspecting the first failure example (message counts of the original vs. the augmented dialogue)

In [17]:
# Take the first recorded failure (if there is one) and pull out the messages of
# its original and augmented dialogue for inspection.
if failure_examples:
    i, j = failure_examples[0]
    failed_example = data[i]
    orig_dia = failed_example["dialogues"][j]["messages"]
    aug_dia = failed_example["augmented_dialogues"][j]["messages"]

In [18]:
# Message counts of the original and the augmented dialogue, in that order.
tuple(len(messages) for messages in (orig_dia, aug_dia))

(21, 19)

In [6]:
# Role check: each augmented dialogue is passed to `match_roles_modified`
# together with the original dialogue at the same position.
#   failure_examples -> (example index, dialogue index) pairs that failed the check
#   errors           -> (example index, dialogue index, exception) for calls that raised
#   new_data         -> examples whose pass count equals their number of augmented dialogues
errors = []
failure_examples = []
new_data = []

for i, example in enumerate(tqdm(data)):
    dialogues = [dial["messages"] for dial in example["dialogues"]]
    augmented_dialogues = [dial["messages"] for dial in example["augmented_dialogues"]]
    true_count = 0

    # NOTE: zip() stops at the shorter list, so unpaired dialogues are never checked.
    for j, (orig_dia, aug_dia) in enumerate(zip(dialogues, augmented_dialogues)):
        # Guard each call on its own: an exception for one pair is recorded
        # without skipping the remaining dialogues of this example.
        try:
            if match_roles_modified(orig_dia, aug_dia):
                true_count += 1
            else:
                failure_examples.append((i, j))
        except Exception as e:
            errors.append((i, j, e))

    if true_count == len(augmented_dialogues):
        new_data.append(example)

failure_examples, errors

100%|██████████| 100/100 [00:00<00:00, 22443.84it/s]


([], [])

### Checking duplicates

In [25]:
# Duplicate check with `check_no_duplicates_one_uttr_list`, run once per augmented dialogue.
#   failure_examples -> (example index, dialogue index) pairs for which the helper returned a falsy value
#   errors           -> (example index, dialogue index, exception) for calls that raised
#   new_data         -> examples in which every augmented dialogue passed
# NOTE(review): the helper receives the full message list of a dialogue —
# confirm that this is the input it expects.
new_data, errors, failure_examples = [], [], []

for i, example in enumerate(tqdm(data)):
    augmented_dialogues = [dialogue["messages"] for dialogue in example["augmented_dialogues"]]
    true_count = 0

    for j, aug_dia in enumerate(augmented_dialogues):
        try:
            has_no_duplicates = bool(check_no_duplicates_one_uttr_list(aug_dia))
        except Exception as e:
            errors.append((i, j, e))
            continue

        if has_no_duplicates:
            true_count += 1
        else:
            failure_examples.append((i, j))

    if true_count == len(augmented_dialogues):
        new_data.append(example)
errors, failure_examples

100%|██████████| 100/100 [00:00<00:00, 30303.48it/s]


([], [])

In [26]:
# Duplicate check with `check_no_duplicates_one_dialogue`, run once per augmented dialogue.
#   failure_examples -> (example index, dialogue index) pairs for which the helper returned a falsy value
#   errors           -> (example index, dialogue index, exception) for calls that raised
#   new_data         -> examples in which every augmented dialogue passed
new_data, errors, failure_examples = [], [], []

for i, example in enumerate(tqdm(data)):
    # The printed index labels the output lines that follow for this example.
    print(i)
    augmented_dialogues = [dialogue["messages"] for dialogue in example["augmented_dialogues"]]
    true_count = 0

    for j, aug_dia in enumerate(augmented_dialogues):
        try:
            has_no_duplicates = bool(check_no_duplicates_one_dialogue(aug_dia))
        except Exception as e:
            errors.append((i, j, e))
            continue

        if has_no_duplicates:
            true_count += 1
        else:
            failure_examples.append((i, j))

    if true_count == len(augmented_dialogues):
        new_data.append(example)
errors, failure_examples

100%|██████████| 100/100 [00:00<00:00, 8957.40it/s]

0
common_elements: ['Do you want that in red or blue?', "It's 123 Main Street."]
1
2
common_elements: ['Greetings! What can I do for you today?']
common_elements: ['Hi there! How can I help you today?', 'Greetings! What can I do for you today?', 'Hello! How may I assist you today?']
3
4
common_elements: ['I see that you have concerns about ethical sourcing. Can you share more about what specifically troubles you?', 'I recognize your worries about ethical sourcing. Could you provide more details on your specific concerns?', 'I understand your concerns regarding ethical sourcing. What particular aspects are you worried about?']
5
6
common_elements: ['Got it. Now, let’s move on to pricing. What budget do you have in mind for this project?']
7
8
9
10
11
12
common_elements: ['1.5 BTC', "Could you please confirm that you're sending {amount} Bitcoin?"]
common_elements: ['1.5 BTC']
common_elements: ['1.5 BTC']
common_elements: ['2 ETH', 'I want to send 2 ETH.', 'Just to confirm: You’re sending




([],
 [(0, 1),
  (2, 4),
  (2, 7),
  (4, 9),
  (6, 0),
  (12, 0),
  (12, 1),
  (12, 2),
  (12, 3),
  (15, 0),
  (15, 2),
  (16, 0),
  (16, 1),
  (16, 3),
  (16, 4),
  (16, 5),
  (17, 13),
  (17, 14),
  (17, 15),
  (17, 16),
  (17, 19),
  (18, 0),
  (18, 6),
  (19, 0),
  (19, 1),
  (19, 4),
  (20, 0),
  (21, 8),
  (26, 2),
  (28, 0),
  (28, 1),
  (28, 2),
  (28, 4),
  (28, 5),
  (31, 0),
  (31, 1),
  (34, 2),
  (34, 3),
  (35, 4),
  (37, 0),
  (37, 1),
  (37, 2),
  (37, 3),
  (37, 4),
  (37, 5),
  (38, 0),
  (38, 1),
  (40, 11),
  (41, 1),
  (42, 4),
  (43, 1),
  (43, 4),
  (44, 2),
  (44, 4),
  (47, 1),
  (50, 6),
  (51, 5),
  (51, 12),
  (52, 0),
  (53, 6),
  (54, 0),
  (54, 1),
  (54, 2),
  (54, 3),
  (54, 5),
  (54, 6),
  (54, 7),
  (54, 9),
  (59, 0),
  (59, 2),
  (59, 3),
  (59, 8),
  (60, 0),
  (60, 1),
  (63, 5),
  (64, 3),
  (65, 5),
  (66, 0),
  (66, 1),
  (68, 0),
  (68, 2),
  (68, 3),
  (69, 0),
  (69, 1),
  (69, 2),
  (69, 3),
  (69, 4),
  (75, 8),
  (78, 10),
  (78, 13),
 

In [27]:
# Duplicate check with `check_no_duplicates_all_dialogues`, run once per example
# on the list of all its augmented dialogues.
#   failure_examples -> indices of examples for which the helper returned a falsy value
#   errors           -> (example index, exception) for calls that raised
#   new_data         -> examples that passed
new_data, errors, failure_examples = [], [], []

for i, example in enumerate(tqdm(data)):
    # The printed label marks which example the following output lines belong to.
    print(f"example {i}")
    augmented_dialogues = [dialogue["messages"] for dialogue in example["augmented_dialogues"]]

    try:
        all_unique = bool(check_no_duplicates_all_dialogues(augmented_dialogues))
    except Exception as e:
        errors.append((i, e))
    else:
        if all_unique:
            new_data.append(example)
        else:
            failure_examples.append(i)
errors, failure_examples

100%|██████████| 100/100 [00:00<00:00, 9186.15it/s]

example 0
common_elements: {'Perfect! What’s your shipping address?', 'Medium, please.', 'Of course, my address is 123 Main St.', "I'm interested in a t-shirt.", 'Do you want that in red or blue?', 'Sure, it’s 123 Main St.', 'Thank you! Your order is confirmed. Anything else I can assist you with?', 'Great! What would you like to order?', 'Hello! How can I help you today?'}
example 1
common_elements: {"I did, but it wasn't successful.", 'Have you considered resetting your password?', 'Thanks for your help.', "Hi there! I apologize for the issues you're facing. How can I help you today?", 'Yes, please go ahead and escalate it.', "Yes, but that didn't solve the problem.", "I tried that, but it didn't work.", "I can't seem to get into my profile.", "I'm unable to log into my account.", 'Did you try resetting your password?', 'Thank you for your assistance.', 'Have you attempted to reset your password?', 'Thank you for contacting us. Have a wonderful day!', "I'm having issues getting into 




([],
 [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99])

In [8]:
# Original messages of dialogue 1 in the last example. Indexed explicitly from
# `data` rather than through the `dialogues` variable left over from whichever
# loop cell ran last, so the cell does not depend on execution order.
data[-1]["dialogues"][1]["messages"]

[{'participant': 'assistant',
  'text': 'Hello! How can I assist you with your rewards and promo codes today?'},
 {'participant': 'user',
  'text': 'I want to combine my rewards with a promo code.'},
 {'participant': 'assistant',
  'text': 'Sure, I can help with that. Please provide your account number.'},
 {'participant': 'user', 'text': 'My account number is 123456.'},
 {'participant': 'assistant',
  'text': "Thank you. I've verified your account. Would you like to apply a promo code now?"},
 {'participant': 'user',
  'text': 'Actually, I want to change my account number.'},
 {'participant': 'assistant',
  'text': 'Of course! Please provide your new account number.'},
 {'participant': 'user', 'text': '987654'},
 {'participant': 'assistant',
  'text': 'Sure, I can help with that. Please provide your account number.'},
 {'participant': 'user', 'text': 'My account number is 123456.'},
 {'participant': 'assistant',
  'text': "Thank you. I've verified your account. Would you like to apply

In [7]:
# Augmented messages of dialogue 1 in the last example. Indexed explicitly from
# `data` rather than through the `augmented_dialogues` variable left over from
# whichever loop cell ran last, so the cell does not depend on execution order.
data[-1]["augmented_dialogues"][1]["messages"]

[{'participant': 'assistant',
  'text': ['Hi there! How can I help you today with your rewards and promotional codes?',
   'Hello! What assistance do you need regarding your rewards and promo codes?',
   'Greetings! How can I support you with your rewards and promotional offers today?']},
 {'participant': 'user',
  'text': ['I would like to merge my rewards with a promo code.',
   'Can I use my rewards together with a promo code?',
   'I want to combine my rewards points with a promotional code.']},
 {'participant': 'assistant',
  'text': ['Absolutely, I can assist you with that. Could you please share your account number?',
   'Sure thing! Please provide me with your account number so I can help.',
   "Of course! Just give me your account number, and I'll get started."]},
 {'participant': 'user',
  'text': ['My account number is 123456.',
   "It's 123456.",
   'You can find it at 123456.']},
 {'participant': 'assistant',
  'text': ["Thank you! I've confirmed your account. Do you want 