In [1]:
from dotenv import load_dotenv

# Populate the process environment from a .env file (python-dotenv).
# The `True` shown as this cell's output is the call's return value.
load_dotenv()

True

In [2]:
import os

# Location of the project's .env file, published through the environment.
# "~" is expanded here because neither os.environ nor most file APIs expand
# it, so a consumer that opens the value directly would otherwise look for a
# literal "~" directory.
os.environ["PATH_TO_ENV"] = os.path.expanduser(
    "~/projects/chatsky-llm-autoconfig/.env"
)

In [8]:
from tqdm import tqdm
import pickle
import pandas as pd
from augmentation_utils import (
    is_correct_length_modified,
    match_roles_modified,
)

# Error analysis

## Matching roles and length correctness

In [12]:
# Read the pickled augmentation dataset into `data` and show how many entries
# it holds. Only unpickle files you trust: pickle can run arbitrary code.
with open("../data/gen_dataset_augmented_0-402", "rb") as dataset_file:
    data = pickle.load(dataset_file)
len(data)

402

In [13]:
# Collect (graph index, dialogue index, result) for every augmented dialogue
# whose length check or role check returns anything other than True.
failure_instances_length = []
failure_instances_roles = []

for graph_idx, graph_entry in enumerate(tqdm(data)):
    original_msgs = [d["messages"] for d in graph_entry["dialogues"]]
    augmented_msgs = [d["messages"] for d in graph_entry["augmented_dialogues"]]

    # zip pairs the dialogues by position and stops at the shorter list.
    for dial_idx, (orig_dia, aug_dia) in enumerate(zip(original_msgs, augmented_msgs)):
        length_result = is_correct_length_modified(orig_dia, aug_dia)
        roles_result = match_roles_modified(orig_dia, aug_dia)

        for sink, result in (
            (failure_instances_length, length_result),
            (failure_instances_roles, roles_result),
        ):
            # `!= True` rather than truthiness: the helpers can return a
            # non-empty error string, which is truthy but is still a failure.
            if result != True:
                sink.append((graph_idx, dial_idx, result))

len(failure_instances_length), len(failure_instances_roles)

100%|██████████| 402/402 [00:00<00:00, 21453.42it/s]


(50, 50)

In [14]:
# Distinct results recorded for the failed length checks.
errors = {outcome for _, _, outcome in failure_instances_length}
errors

{False,
 "Length comparison error: object of type 'UnboundLocalError' has no len()"}

In [15]:
# Distinct results recorded for the failed role checks.
errors = {outcome for _, _, outcome in failure_instances_roles}
errors

{"Roles comparison error: 'UnboundLocalError' object is not iterable",
 "Roles comparison error: string indices must be integers, not 'str'"}

In [19]:
# Indices of the graphs that have at least one length failure.
graphs = {graph_idx for graph_idx, _, _ in failure_instances_length}
len(graphs), graphs

(6, {92, 100, 200, 216, 222, 365})

In [21]:
# Indices of the graphs that have at least one role failure.
graphs = {graph_idx for graph_idx, _, _ in failure_instances_roles}
len(graphs), graphs

(6, {92, 100, 200, 216, 222, 365})

In [49]:
# One row per graph in `graphs`: the graph index and how many original
# dialogues that graph has.
dialogues_total = [
    (graph, len(data[graph]["dialogues"])) for graph in sorted(graphs)
]
tmp = pd.DataFrame(dialogues_total, columns=["graph", "dialogues_total"])
tmp

Unnamed: 0,graph,dialogues_total
0,92,7
1,100,48
2,200,36
3,216,44
4,222,47
5,365,6


In [51]:
# One row per failed length check: graph index, dialogue index, check result.
df = pd.DataFrame(
    failure_instances_length, columns=["graph", "dialogue", "length_error"]
)
# Failing dialogues per graph next to the graph's total dialogue count.
# The two frames are matched on the graph id itself; matching on row position
# (index join) is only right when both frames list exactly the same graphs in
# the same order, which nothing here guarantees.
df.groupby("graph").count().reset_index().merge(tmp, on="graph", how="left")

Unnamed: 0,graph,dialogue,length_error,dialogues_total
0,92,5,5,7
1,100,1,1,48
2,200,36,36,36
3,216,1,1,44
4,222,5,5,47
5,365,2,2,6


In [82]:
# Length failures recorded for graph 100.
df[df["graph"] == 100]

Unnamed: 0,graph,dialogue,length_error
5,100,20,Length comparison error: object of type 'Unbou...


In [91]:
# Original (non-augmented) dialogue 6 of graph 216.
# NOTE(review): presumably the single length failure reported for graph 216;
# the summary table only shows the count, not the dialogue index - confirm.
data[216]["dialogues"][6]

{'id': 'Requesting a military service discount_1_6',
 'messages': [{'participant': 'assistant',
   'text': 'Hello! How can I assist you today?'},
  {'participant': 'user',
   'text': 'I would like to request a military service discount.'},
  {'participant': 'assistant',
   'text': 'Of course! Are you currently serving in the military?'},
  {'participant': 'user', 'text': 'Yes, I am currently serving.'},
  {'participant': 'assistant',
   'text': 'Great! I have applied the discount to your account. Is there anything else I can help you with?'},
  {'participant': 'user',
   'text': "Actually, I'd like to update my contact information."},
  {'participant': 'assistant',
   'text': 'Sure, I can help you update your contact information. Please provide the new details.'},
  {'participant': 'user', 'text': 'Here are my new contact details.'},
  {'participant': 'assistant',
   'text': 'Thank you, your contact information has been updated. Is there anything else I can assist you with?'},
  {'part

In [90]:
# Augmented counterpart of dialogue 20 in graph 100, the entry flagged by the
# length check; its "messages" value is an exception object, not a message list.
data[100]["augmented_dialogues"][20]

{'id': 'Reporting a broken forgot password link_1_20',
 'messages': UnboundLocalError("cannot access local variable 'augmented_dialogue' where it is not associated with a value")}

## Re-augmentation

In [None]:
from augmentation_prompts import variations_augmentation_prompt_9
from dialogue_augmentation import augment_dialogue



In [87]:
# Number of entries in the loaded dataset (recorded output: 402).
len(data)

402

In [None]:
# (graph index, dialogue index) pairs to augment again.
reaugment_targets = [(100, 20), (216, 6)]
augmentation_model = "gpt-4o-mini-2024-07-18"

# Pairs whose retry did not yield a message list, so failures are visible
# before the dataset is saved.
still_failing = []

for i, j in reaugment_targets:
    print(f"Augmenting example {i} dialogue {j}")
    topic = data[i]["topic"]
    orig_dialogue = data[i]["dialogues"][j]["messages"]

    try:
        aug_dialogue = augment_dialogue(
            orig_dialogue,
            topic,
            variations_augmentation_prompt_9,
            augmentation_model,
        )
    except Exception as e:
        # Best effort: keep going and store the exception in place of the
        # messages, as the existing dataset entries do.
        aug_dialogue = e

    # Successful results seen in this notebook are lists of message dicts;
    # failures are exception objects or "Generation error: ..." strings.
    # TODO confirm augment_dialogue never returns another type on success.
    if not isinstance(aug_dialogue, list):
        still_failing.append((i, j))
        print(f"  re-augmentation failed for example {i} dialogue {j}: {aug_dialogue!r}")

    data[i]["augmented_dialogues"][j]["messages"] = aug_dialogue

still_failing

In [97]:
# Write the updated dataset to a new pickle file.
with open("../data/gen_dataset_augmented_0-402_v3", "wb") as output_file:
    pickle.dump(data, output_file)

In [102]:
# Inspect graph 100 / dialogue 20 again after re-augmentation; the recorded
# output shows a list of messages, each with several text variants.
data[100]["augmented_dialogues"][20]

{'id': 'Reporting a broken forgot password link_1_20',
 'messages': [{'participant': 'assistant',
   'text': ['Hi there! How may I help you today?',
    'Hello! What can I do for you today?',
    'Greetings! How can I assist you at this moment?']},
  {'participant': 'user',
   'text': ["I'm unable to log into my account because I've forgotten my password.",
    "I can't get into my account since I forgot my password.",
    "I'm locked out of my account due to a forgotten password."]},
  {'participant': 'assistant',
   'text': ["I'm really sorry to hear that. Is it the 'Forgot Password' link that's giving you trouble?",
    "I apologize for the inconvenience. Are you experiencing issues with the 'Forgot Password' link?",
    'Sorry to hear that! Are you having difficulties with the link to reset your password?']},
  {'participant': 'user',
   'text': ['No, I just need to change my password.',
    "That's not the issue; I simply want to reset my password.",
    'No, I only want to reset 

In [103]:
# Same inspection for graph 216 / dialogue 6; the recorded output is still a
# "Generation error: ..." string, i.e. this entry has no valid augmentation.
data[216]["augmented_dialogues"][6]

{'id': 'Requesting a military service discount_1_6',
 'messages': 'Generation error: length comparison: False; roles comparison: True'}