In [1]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment (python-dotenv).
# The `True` shown as this cell's output is the return value of the call.
load_dotenv()

True

In [2]:
import os

# Environment values are stored verbatim: "~" is only expanded by a shell or by
# an explicit expanduser() call, so resolve it here to get a usable path.
# NOTE(review): the code that reads PATH_TO_ENV is not visible in this notebook;
# the path itself is machine-specific and should be adjusted per checkout.
os.environ['PATH_TO_ENV'] = os.path.expanduser("~/projects/chatsky-llm-autoconfig/.env")

In [3]:
from dialogue2graph.pipelines.core.dialogue import DialogueMessage
from pydantic import BaseModel
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from typing import List
from tqdm import tqdm
import pickle



In [4]:
from datasets import load_dataset

# Load the DeepPavlov/d2g_generated dataset; later cells use its 'train' split.
# token=True presumably makes `datasets` authenticate with the locally stored
# Hugging Face token — confirm against the `datasets` documentation.
dataset = load_dataset("DeepPavlov/d2g_generated", token=True)

In [5]:
from augmentation_prompts import (
    variations_augmentation_prompt_2, variations_augmentation_prompt_4,
    variations_augmentation_prompt_9
    )
from dialogue_augmentation import augment_dialogue, augment_dialogue_data

# Augmentation with variations_augmentation_prompt_2

In [6]:
# Schema passed to JsonOutputParser: a `result` field holding a list of
# DialogueMessage objects.
class DialogueSequence(BaseModel):
    result: List[DialogueMessage]

parser = JsonOutputParser(pydantic_object=DialogueSequence)
augmentation_prompt = PromptTemplate.from_template(variations_augmentation_prompt_2)

# API key and endpoint are read from the environment; temperature=0.7 means the
# generated variations are sampled, not deterministic.
model = ChatOpenAI(
    model="gpt-4o-mini-2024-07-18",
    temperature=0.7,
    base_url=os.getenv("OPENAI_BASE_URL"),
    api_key=os.getenv("OPENAI_API_KEY"),
)

# prompt -> LLM -> JSON parsing
chain = augmentation_prompt | model | parser

In [6]:
# Take the first train row and keep only the message lists of its dialogues.
example = dataset['train'][0]
dialogues = [entry['messages'] for entry in example['dialogues']]
len(dialogues)

6

In [7]:
# First dialogue of the sample row, plus the row's topic, as inputs for a
# single trial run of the chain.
orig_dialogue = dialogues[0]
topic = example['topic']
orig_dialogue

[{'participant': 'assistant', 'text': 'Hi there! How can I assist you today?'},
 {'participant': 'user', 'text': 'I want to place an order.'},
 {'participant': 'assistant',
  'text': 'Great! What product are you interested in?'},
 {'participant': 'user', 'text': 'I’d like a t-shirt.'},
 {'participant': 'assistant', 'text': 'What size would you like?'},
 {'participant': 'user', 'text': 'Medium, please.'},
 {'participant': 'assistant', 'text': 'Would you like that in red or blue?'},
 {'participant': 'user', 'text': 'Red, please.'},
 {'participant': 'assistant',
  'text': 'Perfect! Can I have your shipping address?'},
 {'participant': 'user', 'text': 'Sure, it’s 123 Main St.'},
 {'participant': 'assistant',
  'text': 'Thank you! Your order has been placed. Is there anything else I can help you with?'},
 {'participant': 'user', 'text': 'Actually, can I change my order?'},
 {'participant': 'assistant',
  'text': 'Of course! What would you like to change?'},
 {'participant': 'user', 'text': 

In [9]:
# Single trial call: the prompt is filled with the topic and the original dialogue.
chain_inputs = {"topic": topic, "dialogue": orig_dialogue}
aug_dialogue = chain.invoke(chain_inputs)
aug_dialogue

[{'participant': 'assistant',
  'text': ['Hello! How can I help you today?',
   'Hi! What assistance do you need today?',
   'Hey there! How may I assist you?']},
 {'participant': 'user',
  'text': ["I'd like to make an order.",
   'I want to place an order for something.',
   "I'm interested in ordering."]},
 {'participant': 'assistant',
  'text': ['Awesome! Which product are you looking to buy?',
   'Fantastic! What item do you want to order?',
   'Great! What product are you thinking about?']},
 {'participant': 'user',
  'text': ["I'm looking for a t-shirt.",
   'I’d like to order a t-shirt.',
   'I want a t-shirt, please.']},
 {'participant': 'assistant',
  'text': ['What size do you prefer?',
   'Which size would you like to go with?',
   'Can you tell me your preferred size?']},
 {'participant': 'user',
  'text': ['Medium would be great, thanks.',
   "I'll take a medium size, please.",
   'I’d like a medium, please.']},
 {'participant': 'assistant',
  'text': ['Would you prefer t

In [13]:
# Augment every dialogue of the first MAX_EXAMPLES train rows with `chain` and
# collect the rows (with an added 'augmented_dialogues' list) in `new_data`.
MAX_EXAMPLES = 10
new_data = []

# zip(range(...), ...) bounds the loop without a hand-maintained counter and
# stops before pulling an extra row from the dataset.
for i, example in zip(range(MAX_EXAMPLES), dataset['train']):
    print(f'Augmenting example {i}:')
    topic = example['topic']
    all_dialogues = example['dialogues']

    example['augmented_dialogues'] = []

    for element in tqdm(all_dialogues, total=len(all_dialogues)):
        orig_dialogue = element['messages']
        try:
            aug_dialogue = chain.invoke({
                "topic": topic,
                "dialogue": orig_dialogue
            })
        except Exception as e:
            # Best effort: one failed call must not abort the whole run, but
            # report it so failures are visible instead of only being buried
            # in the saved data.
            tqdm.write(f"Augmentation failed for dialogue {element['id']}: {e!r}")
            # NOTE(review): the exception object itself is stored under
            # 'messages' and pickled below; confirm every exception type the
            # chain can raise survives a pickle round-trip.
            aug_dialogue = e

        example['augmented_dialogues'].append(
            {
                'id': element['id'],
                'messages': aug_dialogue
            }
        )

    new_data.append(example)
    # Rewrite the checkpoint after every row so partial progress is on disk
    # if the run is interrupted.
    with open("../data/gen_dataset_augment_uttr-vars", "wb") as fp:
        pickle.dump(new_data, fp)

Augmenting example 0:


100%|██████████| 6/6 [00:45<00:00,  7.57s/it]


Augmenting example 1:


100%|██████████| 12/12 [01:45<00:00,  8.75s/it]


Augmenting example 2:


100%|██████████| 12/12 [01:57<00:00,  9.77s/it]


Augmenting example 3:


100%|██████████| 9/9 [01:17<00:00,  8.57s/it]


Augmenting example 4:


100%|██████████| 13/13 [01:53<00:00,  8.69s/it]


Augmenting example 5:


100%|██████████| 12/12 [02:02<00:00, 10.18s/it]


Augmenting example 6:


100%|██████████| 5/5 [01:17<00:00, 15.44s/it]


Augmenting example 7:


100%|██████████| 17/17 [02:57<00:00, 10.45s/it]


Augmenting example 8:


100%|██████████| 10/10 [01:54<00:00, 11.46s/it]


Augmenting example 9:


100%|██████████| 6/6 [01:24<00:00, 14.07s/it]


# Other prompts: variations_augmentation_prompt_4 and variations_augmentation_prompt_9

In [6]:
# Prompt 4 on the first two train rows. The last two positional arguments are
# presumably the model name and the output path — confirm against
# dialogue_augmentation.augment_dialogue_data.
new_data = augment_dialogue_data(
    dataset['train'].select(range(2)),
    variations_augmentation_prompt_4,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_4_2",
)

Augmenting example 0:


100%|██████████| 6/6 [00:46<00:00,  7.71s/it]


Augmenting example 1:


100%|██████████| 12/12 [01:22<00:00,  6.86s/it]


In [7]:
# Prompt 9 on the first two train rows; the "_9_2" file is unpickled again
# further down when the first ten rows are stitched together.
new_data = augment_dialogue_data(
    dataset['train'].select(range(2)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_2",
)

Augmenting example 0:


100%|██████████| 6/6 [00:53<00:00,  8.92s/it]


Augmenting example 1:


100%|██████████| 12/12 [01:25<00:00,  7.13s/it]


# Augmentation with prompt 9

In [19]:
# Number of dialogues in each of train rows 2-9.
remaining_rows = dataset['train'].select(range(2, 10))
for example in remaining_rows:
    print(len(example['dialogues']))

12
9
13
12
5
17
10
6


In [20]:
# Prompt 9 on train rows 2-9; the "_9_8" file is unpickled again further down
# and joined with the "_9_2" file.
new_data = augment_dialogue_data(
    dataset['train'].select(range(2, 10)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_8",
)

Augmenting example 0:


100%|██████████| 12/12 [01:40<00:00,  8.41s/it]


Augmenting example 1:


100%|██████████| 9/9 [01:24<00:00,  9.42s/it]


Augmenting example 2:


100%|██████████| 13/13 [01:32<00:00,  7.11s/it]


Augmenting example 3:


100%|██████████| 12/12 [01:40<00:00,  8.40s/it]


Augmenting example 4:


100%|██████████| 5/5 [01:09<00:00, 13.85s/it]


Augmenting example 5:


100%|██████████| 17/17 [02:39<00:00,  9.39s/it]


Augmenting example 6:


100%|██████████| 10/10 [01:21<00:00,  8.19s/it]


Augmenting example 7:


100%|██████████| 6/6 [01:23<00:00, 13.93s/it]


In [None]:
# Join the two partial prompt-9 runs (rows 0-1 and rows 2-9) into one "_9_10" file.
# NOTE(review): this cell has no execution count in the saved notebook, yet the
# final merge reads the "_9_10" file written here — run it before merging.
# The pickles are produced by this notebook; never unpickle untrusted files.
with open("../data/gen_dataset_augment_uttr-vars_9_2", "rb") as first_file:
    part_1 = pickle.load(first_file)
with open("../data/gen_dataset_augment_uttr-vars_9_8", "rb") as second_file:
    part_2 = pickle.load(second_file)

data_9 = part_1 + part_2
with open("../data/gen_dataset_augment_uttr-vars_9_10", "wb") as merged_file:
    pickle.dump(data_9, merged_file)

In [25]:
# Prompt 9 on train rows 10-19; the "_9_20" file is unpickled by the final merge cell.
new_data_2 = augment_dialogue_data(
    dataset['train'].select(range(10, 20)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_20",
)

Augmenting example 0:


100%|██████████| 16/16 [02:00<00:00,  7.54s/it]


Augmenting example 1:


100%|██████████| 5/5 [01:00<00:00, 12.10s/it]


Augmenting example 2:


100%|██████████| 5/5 [00:38<00:00,  7.69s/it]


Augmenting example 3:


100%|██████████| 4/4 [01:22<00:00, 20.72s/it]


Augmenting example 4:


100%|██████████| 10/10 [01:37<00:00,  9.78s/it]


Augmenting example 5:


100%|██████████| 4/4 [00:59<00:00, 14.90s/it]


Augmenting example 6:


100%|██████████| 10/10 [01:48<00:00, 10.84s/it]


Augmenting example 7:


100%|██████████| 20/20 [03:31<00:00, 10.57s/it]


Augmenting example 8:


100%|██████████| 19/19 [03:00<00:00,  9.49s/it]


Augmenting example 9:


100%|██████████| 7/7 [01:47<00:00, 15.42s/it]


In [26]:
# Prompt 9 on train rows 20-29; the "_9_30" file is unpickled by the final merge cell.
new_data_3 = augment_dialogue_data(
    dataset['train'].select(range(20, 30)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_30",
)

Augmenting example 0:


100%|██████████| 9/9 [00:56<00:00,  6.30s/it]


Augmenting example 1:


100%|██████████| 31/31 [03:08<00:00,  6.08s/it]


Augmenting example 2:


100%|██████████| 17/17 [02:17<00:00,  8.09s/it]


Augmenting example 3:


100%|██████████| 18/18 [01:35<00:00,  5.32s/it]


Augmenting example 4:


100%|██████████| 5/5 [00:51<00:00, 10.38s/it]


Augmenting example 5:


100%|██████████| 8/8 [01:07<00:00,  8.47s/it]


Augmenting example 6:


100%|██████████| 4/4 [00:47<00:00, 11.85s/it]


Augmenting example 7:


100%|██████████| 19/19 [02:23<00:00,  7.57s/it]


Augmenting example 8:


100%|██████████| 9/9 [01:27<00:00,  9.76s/it]


Augmenting example 9:


100%|██████████| 9/9 [01:20<00:00,  8.93s/it]


In [27]:
# Prompt 9 on train rows 30-39; the "_9_40" file is unpickled by the final merge cell.
new_data = augment_dialogue_data(
    dataset['train'].select(range(30, 40)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_40",
)

Augmenting example 0:


100%|██████████| 7/7 [00:57<00:00,  8.23s/it]


Augmenting example 1:


100%|██████████| 9/9 [01:08<00:00,  7.61s/it]


Augmenting example 2:


100%|██████████| 8/8 [01:40<00:00, 12.59s/it]


Augmenting example 3:


100%|██████████| 9/9 [01:35<00:00, 10.64s/it]


Augmenting example 4:


100%|██████████| 4/4 [01:19<00:00, 19.79s/it]


Augmenting example 5:


100%|██████████| 9/9 [01:41<00:00, 11.33s/it]


Augmenting example 6:


100%|██████████| 13/13 [01:46<00:00,  8.19s/it]


Augmenting example 7:


100%|██████████| 8/8 [01:33<00:00, 11.68s/it]


Augmenting example 8:


100%|██████████| 10/10 [01:27<00:00,  8.76s/it]


Augmenting example 9:


100%|██████████| 14/14 [02:29<00:00, 10.66s/it]


In [28]:
# Prompt 9 on train rows 40-49; the "_9_50" file is unpickled by the final merge cell.
new_data = augment_dialogue_data(
    dataset['train'].select(range(40, 50)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_50",
)

Augmenting example 0:


100%|██████████| 19/19 [02:59<00:00,  9.45s/it]


Augmenting example 1:


100%|██████████| 7/7 [01:12<00:00, 10.34s/it]


Augmenting example 2:


100%|██████████| 8/8 [01:16<00:00,  9.60s/it]


Augmenting example 3:


100%|██████████| 5/5 [01:09<00:00, 13.94s/it]


Augmenting example 4:


100%|██████████| 12/12 [02:45<00:00, 13.78s/it]


Augmenting example 5:


100%|██████████| 13/13 [01:33<00:00,  7.22s/it]


Augmenting example 6:


100%|██████████| 7/7 [01:19<00:00, 11.33s/it]


Augmenting example 7:


100%|██████████| 14/14 [01:32<00:00,  6.64s/it]


Augmenting example 8:


100%|██████████| 10/10 [01:00<00:00,  6.03s/it]


Augmenting example 9:


100%|██████████| 12/12 [01:18<00:00,  6.50s/it]


In [29]:
# Prompt 9 on train rows 50-59; the "_9_60" file is unpickled by the final merge cell.
new_data = augment_dialogue_data(
    dataset['train'].select(range(50, 60)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_60",
)

Augmenting example 0:


100%|██████████| 11/11 [01:19<00:00,  7.26s/it]


Augmenting example 1:


100%|██████████| 18/18 [03:12<00:00, 10.68s/it]


Augmenting example 2:


100%|██████████| 5/5 [00:55<00:00, 11.17s/it]


Augmenting example 3:


100%|██████████| 38/38 [05:40<00:00,  8.95s/it]


Augmenting example 4:


100%|██████████| 10/10 [02:31<00:00, 15.16s/it]


Augmenting example 5:


100%|██████████| 11/11 [01:55<00:00, 10.45s/it]


Augmenting example 6:


100%|██████████| 14/14 [02:53<00:00, 12.40s/it]


Augmenting example 7:


100%|██████████| 12/12 [01:00<00:00,  5.04s/it]


Augmenting example 8:


100%|██████████| 10/10 [01:36<00:00,  9.61s/it]


Augmenting example 9:


100%|██████████| 11/11 [04:21<00:00, 23.81s/it]


In [30]:
# Prompt 9 on train rows 60-69; the "_9_70" file is unpickled by the final merge cell.
new_data = augment_dialogue_data(
    dataset['train'].select(range(60, 70)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_70",
)

Augmenting example 0:


100%|██████████| 7/7 [00:58<00:00,  8.31s/it]


Augmenting example 1:


100%|██████████| 7/7 [00:43<00:00,  6.27s/it]


Augmenting example 2:


100%|██████████| 22/22 [03:05<00:00,  8.41s/it]


Augmenting example 3:


100%|██████████| 7/7 [01:20<00:00, 11.47s/it]


Augmenting example 4:


100%|██████████| 7/7 [01:22<00:00, 11.76s/it]


Augmenting example 5:


100%|██████████| 6/6 [01:14<00:00, 12.42s/it]


Augmenting example 6:


100%|██████████| 4/4 [00:53<00:00, 13.41s/it]


Augmenting example 7:


100%|██████████| 17/17 [01:30<00:00,  5.35s/it]


Augmenting example 8:


100%|██████████| 6/6 [00:59<00:00,  9.84s/it]


Augmenting example 9:


100%|██████████| 7/7 [01:10<00:00, 10.04s/it]


In [31]:
# Prompt 9 on train rows 70-79; the "_9_80" file is unpickled by the final merge cell.
new_data = augment_dialogue_data(
    dataset['train'].select(range(70, 80)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_80",
)

Augmenting example 0:


100%|██████████| 10/10 [01:36<00:00,  9.64s/it]


Augmenting example 1:


100%|██████████| 6/6 [01:05<00:00, 10.89s/it]


Augmenting example 2:


100%|██████████| 9/9 [01:07<00:00,  7.55s/it]


Augmenting example 3:


100%|██████████| 12/12 [02:10<00:00, 10.91s/it]


Augmenting example 4:


100%|██████████| 10/10 [01:39<00:00,  9.92s/it]


Augmenting example 5:


100%|██████████| 17/17 [02:22<00:00,  8.38s/it]


Augmenting example 6:


100%|██████████| 7/7 [01:27<00:00, 12.56s/it]


Augmenting example 7:


100%|██████████| 15/15 [01:30<00:00,  6.01s/it]


Augmenting example 8:


100%|██████████| 14/14 [02:27<00:00, 10.52s/it]


Augmenting example 9:


100%|██████████| 8/8 [01:18<00:00,  9.76s/it]


In [32]:
# Prompt 9 on train rows 80-89; the "_9_90" file is unpickled by the final merge cell.
new_data = augment_dialogue_data(
    dataset['train'].select(range(80, 90)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_90",
)

Augmenting example 0:


100%|██████████| 12/12 [01:37<00:00,  8.13s/it]


Augmenting example 1:


100%|██████████| 5/5 [00:41<00:00,  8.37s/it]


Augmenting example 2:


100%|██████████| 3/3 [00:27<00:00,  9.30s/it]


Augmenting example 3:


100%|██████████| 11/11 [02:13<00:00, 12.12s/it]


Augmenting example 4:


100%|██████████| 9/9 [01:16<00:00,  8.51s/it]


Augmenting example 5:


100%|██████████| 9/9 [01:16<00:00,  8.55s/it]


Augmenting example 6:


100%|██████████| 8/8 [00:59<00:00,  7.48s/it]


Augmenting example 7:


100%|██████████| 6/6 [00:47<00:00,  7.86s/it]


Augmenting example 8:


100%|██████████| 10/10 [01:12<00:00,  7.28s/it]


Augmenting example 9:


100%|██████████| 12/12 [02:03<00:00, 10.25s/it]


In [33]:
# Prompt 9 on train rows 90-99; the "_9_100" file is unpickled by the final merge cell.
new_data = augment_dialogue_data(
    dataset['train'].select(range(90, 100)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_100",
)

Augmenting example 0:


100%|██████████| 9/9 [01:09<00:00,  7.68s/it]


Augmenting example 1:


100%|██████████| 6/6 [01:10<00:00, 11.78s/it]


Augmenting example 2:


100%|██████████| 7/7 [02:40<00:00, 22.87s/it]


Augmenting example 3:


100%|██████████| 9/9 [01:25<00:00,  9.45s/it]


Augmenting example 4:


100%|██████████| 7/7 [01:38<00:00, 14.10s/it]


Augmenting example 5:


100%|██████████| 7/7 [00:57<00:00,  8.18s/it]


Augmenting example 6:


100%|██████████| 15/15 [02:38<00:00, 10.54s/it]


Augmenting example 7:


100%|██████████| 5/5 [00:53<00:00, 10.65s/it]


Augmenting example 8:


100%|██████████| 9/9 [03:12<00:00, 21.37s/it]


Augmenting example 9:


100%|██████████| 13/13 [02:48<00:00, 12.98s/it]


In [35]:
# Concatenate the ten prompt-9 chunk files. Each suffix (10, 20, ..., 100)
# matches the exclusive upper row bound used when that chunk was produced.
all_data = []
for upper in range(10, 110, 10):
    with open(f"../data/gen_dataset_augment_uttr-vars_9_{upper}", "rb") as chunk_file:
        all_data.extend(pickle.load(chunk_file))
len(all_data)

100

In [36]:
# Persist the merged prompt-9 rows as a single pickle.
with open("../data/gen_dataset_augment_uttr-vars_9", "wb") as merged_file:
    pickle.dump(all_data, merged_file)