In [1]:
# Load settings from a .env file via python-dotenv so that later cells can
# read them with os.getenv (EMBEDDER_MODEL, OPENAI_API_KEY, OPENAI_BASE_URL).
# The call is the last expression of the cell, so its return value is displayed.
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os

# Sanity check: display the embedder model name configured in the environment.
# Leftover override kept from the original cell for reference:
# os.environ['PATH_TO_ENV'] = "~/projects/chatsky-llm-autoconfig/.env"
os.environ.get('EMBEDDER_MODEL')

'BAAI/bge-m3'

In [3]:
from dialogue2graph.pipelines.core.dialogue import DialogueMessage
from pydantic import BaseModel
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from typing import List
from tqdm import tqdm
import pickle



In [4]:
from datasets import load_dataset

# Load the "DeepPavlov/d2g_generated" dataset; later cells read its 'train'
# split, whose examples carry a 'topic' and a list of 'dialogues'.
# NOTE(review): token=True presumably makes the loader authenticate with the
# locally stored Hugging Face token — confirm against the datasets docs.
dataset = load_dataset("DeepPavlov/d2g_generated", token=True)

In [5]:
from augmentation_prompts import (
    variations_augmentation_prompt_2, variations_augmentation_prompt_3, 
    variations_augmentation_prompt_4, variations_augmentation_prompt_5,
    variations_augmentation_prompt_6, variations_augmentation_prompt_7,
    variations_augmentation_prompt_8
    )
from augment_dialogue import augment_dialogue

# variations_augmentation_prompt_2

In [6]:
# Build the augmentation pipeline: prompt template -> chat model -> JSON parser.

class DialogueSequence(BaseModel):
    # Schema object handed to the JSON output parser: a list of dialogue
    # messages under the single key `result`.
    result: List[DialogueMessage]


# Prompt text comes from the local augmentation_prompts module.
augmentation_prompt = PromptTemplate.from_template(variations_augmentation_prompt_2)

# Credentials and endpoint are taken from the environment, never hardcoded.
model = ChatOpenAI(
    model="gpt-4o-mini-2024-07-18",
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_BASE_URL"),
    temperature=0.7,
)

parser = JsonOutputParser(pydantic_object=DialogueSequence)

# Invoked with a {"topic": ..., "dialogue": ...} mapping by the cells below.
chain = augmentation_prompt | model | parser

In [6]:
# Take the first training example and keep only the message list of each of
# its dialogues; the cell displays how many dialogues that example has.
example = dataset['train'][0]
dialogues = list(map(lambda dial: dial['messages'], example['dialogues']))
len(dialogues)

6

In [7]:
# Use the example's topic and its first dialogue as the input for a trial run;
# the dialogue is displayed for inspection.
topic, orig_dialogue = example['topic'], dialogues[0]
orig_dialogue

[{'participant': 'assistant', 'text': 'Hi there! How can I assist you today?'},
 {'participant': 'user', 'text': 'I want to place an order.'},
 {'participant': 'assistant',
  'text': 'Great! What product are you interested in?'},
 {'participant': 'user', 'text': 'I’d like a t-shirt.'},
 {'participant': 'assistant', 'text': 'What size would you like?'},
 {'participant': 'user', 'text': 'Medium, please.'},
 {'participant': 'assistant', 'text': 'Would you like that in red or blue?'},
 {'participant': 'user', 'text': 'Red, please.'},
 {'participant': 'assistant',
  'text': 'Perfect! Can I have your shipping address?'},
 {'participant': 'user', 'text': 'Sure, it’s 123 Main St.'},
 {'participant': 'assistant',
  'text': 'Thank you! Your order has been placed. Is there anything else I can help you with?'},
 {'participant': 'user', 'text': 'Actually, can I change my order?'},
 {'participant': 'assistant',
  'text': 'Of course! What would you like to change?'},
 {'participant': 'user', 'text': 

In [9]:
# Trial run of the augmentation chain on the single dialogue selected above;
# the parsed result is displayed for manual inspection.
aug_dialogue = chain.invoke(dict(topic=topic, dialogue=orig_dialogue))
aug_dialogue

[{'participant': 'assistant',
  'text': ['Hello! How can I help you today?',
   'Hi! What assistance do you need today?',
   'Hey there! How may I assist you?']},
 {'participant': 'user',
  'text': ["I'd like to make an order.",
   'I want to place an order for something.',
   "I'm interested in ordering."]},
 {'participant': 'assistant',
  'text': ['Awesome! Which product are you looking to buy?',
   'Fantastic! What item do you want to order?',
   'Great! What product are you thinking about?']},
 {'participant': 'user',
  'text': ["I'm looking for a t-shirt.",
   'I’d like to order a t-shirt.',
   'I want a t-shirt, please.']},
 {'participant': 'assistant',
  'text': ['What size do you prefer?',
   'Which size would you like to go with?',
   'Can you tell me your preferred size?']},
 {'participant': 'user',
  'text': ['Medium would be great, thanks.',
   "I'll take a medium size, please.",
   'I’d like a medium, please.']},
 {'participant': 'assistant',
  'text': ['Would you prefer t

In [13]:
# Augment every dialogue of the first MAX_EXAMPLES training examples with
# `chain` (variations_augmentation_prompt_2) and checkpoint the accumulated
# results to disk after each example, so a crash loses at most one example.
MAX_EXAMPLES = 10  # number of dataset examples processed by this run
new_data = []

for i, example in enumerate(dataset['train']):
    print(f'Augmenting example {i}:')
    topic = example['topic']
    all_dialogues = example['dialogues']

    augmented_dialogues = []
    for element in tqdm(all_dialogues, total=len(all_dialogues)):
        orig_dialogue = element['messages']
        try:
            aug_dialogue = chain.invoke({
                "topic": topic,
                "dialogue": orig_dialogue
            })
        except Exception as e:
            # Best effort: one failed dialogue must not abort the whole run.
            # The failure is recorded as text instead of the live exception
            # object, which may not survive a pickle round trip, and it is
            # reported so it does not go unnoticed. A failed entry is therefore
            # recognisable by 'messages' being a str.
            aug_dialogue = f"{type(e).__name__}: {e}"
            tqdm.write(f"Augmentation failed for dialogue {element['id']}: {aug_dialogue}")

        augmented_dialogues.append(
            {
                'id': element['id'],
                'messages': aug_dialogue
            }
        )

    example['augmented_dialogues'] = augmented_dialogues
    new_data.append(example)

    # Rewrite the checkpoint with everything collected so far.
    with open("../data/gen_dataset_augment_uttr-vars", "wb") as fp:
        pickle.dump(new_data, fp)

    # enumerate already tracks the index; stop once MAX_EXAMPLES are done.
    if i + 1 >= MAX_EXAMPLES:
        break

Augmenting example 0:


100%|██████████| 6/6 [00:45<00:00,  7.57s/it]


Augmenting example 1:


100%|██████████| 12/12 [01:45<00:00,  8.75s/it]


Augmenting example 2:


100%|██████████| 12/12 [01:57<00:00,  9.77s/it]


Augmenting example 3:


100%|██████████| 9/9 [01:17<00:00,  8.57s/it]


Augmenting example 4:


100%|██████████| 13/13 [01:53<00:00,  8.69s/it]


Augmenting example 5:


100%|██████████| 12/12 [02:02<00:00, 10.18s/it]


Augmenting example 6:


100%|██████████| 5/5 [01:17<00:00, 15.44s/it]


Augmenting example 7:


100%|██████████| 17/17 [02:57<00:00, 10.45s/it]


Augmenting example 8:


100%|██████████| 10/10 [01:54<00:00, 11.46s/it]


Augmenting example 9:


100%|██████████| 6/6 [01:24<00:00, 14.07s/it]


# variations_augmentation_prompt_3 through variations_augmentation_prompt_8

In [6]:
# Augment every dialogue of the first MAX_EXAMPLES training examples with
# augment_dialogue (variations_augmentation_prompt_3) and checkpoint the
# accumulated results to disk after each example.
MAX_EXAMPLES = 1  # number of dataset examples processed by this run
new_data = []

for i, example in enumerate(dataset['train']):
    print(f'Augmenting example {i}:')
    topic = example['topic']
    all_dialogues = example['dialogues']

    augmented_dialogues = []
    for element in tqdm(all_dialogues, total=len(all_dialogues)):
        orig_dialogue = element['messages']
        try:
            aug_dialogue = augment_dialogue(
                orig_dialogue, topic, variations_augmentation_prompt_3, "gpt-4o-mini-2024-07-18"
            )
        except Exception as e:
            # Best effort: one failed dialogue must not abort the whole run.
            # The failure is recorded as text instead of the live exception
            # object, which may not survive a pickle round trip, and it is
            # reported so it does not go unnoticed. A failed entry is therefore
            # recognisable by 'messages' being a str.
            aug_dialogue = f"{type(e).__name__}: {e}"
            tqdm.write(f"Augmentation failed for dialogue {element['id']}: {aug_dialogue}")

        augmented_dialogues.append(
            {
                'id': element['id'],
                'messages': aug_dialogue
            }
        )

    example['augmented_dialogues'] = augmented_dialogues
    new_data.append(example)

    # Rewrite the checkpoint with everything collected so far.
    with open("../data/gen_dataset_augment_uttr-vars_3", "wb") as fp:
        pickle.dump(new_data, fp)

    # enumerate already tracks the index; stop once MAX_EXAMPLES are done.
    if i + 1 >= MAX_EXAMPLES:
        break

Augmenting example 0:


100%|██████████| 6/6 [00:57<00:00,  9.60s/it]


In [7]:
# Augment every dialogue of the first MAX_EXAMPLES training examples with
# augment_dialogue (variations_augmentation_prompt_4) and checkpoint the
# accumulated results to disk after each example.
MAX_EXAMPLES = 1  # number of dataset examples processed by this run
new_data = []

for i, example in enumerate(dataset['train']):
    print(f'Augmenting example {i}:')
    topic = example['topic']
    all_dialogues = example['dialogues']

    augmented_dialogues = []
    for element in tqdm(all_dialogues, total=len(all_dialogues)):
        orig_dialogue = element['messages']
        try:
            aug_dialogue = augment_dialogue(
                orig_dialogue, topic, variations_augmentation_prompt_4, "gpt-4o-mini-2024-07-18"
            )
        except Exception as e:
            # Best effort: one failed dialogue must not abort the whole run.
            # The failure is recorded as text instead of the live exception
            # object, which may not survive a pickle round trip, and it is
            # reported so it does not go unnoticed. A failed entry is therefore
            # recognisable by 'messages' being a str.
            aug_dialogue = f"{type(e).__name__}: {e}"
            tqdm.write(f"Augmentation failed for dialogue {element['id']}: {aug_dialogue}")

        augmented_dialogues.append(
            {
                'id': element['id'],
                'messages': aug_dialogue
            }
        )

    example['augmented_dialogues'] = augmented_dialogues
    new_data.append(example)

    # Rewrite the checkpoint with everything collected so far.
    with open("../data/gen_dataset_augment_uttr-vars_4", "wb") as fp:
        pickle.dump(new_data, fp)

    # enumerate already tracks the index; stop once MAX_EXAMPLES are done.
    if i + 1 >= MAX_EXAMPLES:
        break

Augmenting example 0:


100%|██████████| 6/6 [01:01<00:00, 10.31s/it]


In [6]:
# Augment every dialogue of the first MAX_EXAMPLES training examples with
# augment_dialogue (variations_augmentation_prompt_5) and checkpoint the
# accumulated results to disk after each example.
MAX_EXAMPLES = 1  # number of dataset examples processed by this run
new_data = []

for i, example in enumerate(dataset['train']):
    print(f'Augmenting example {i}:')
    topic = example['topic']
    all_dialogues = example['dialogues']

    augmented_dialogues = []
    for element in tqdm(all_dialogues, total=len(all_dialogues)):
        orig_dialogue = element['messages']
        try:
            aug_dialogue = augment_dialogue(
                orig_dialogue, topic, variations_augmentation_prompt_5, "gpt-4o-mini-2024-07-18"
            )
        except Exception as e:
            # Best effort: one failed dialogue must not abort the whole run.
            # The failure is recorded as text instead of the live exception
            # object, which may not survive a pickle round trip, and it is
            # reported so it does not go unnoticed. A failed entry is therefore
            # recognisable by 'messages' being a str.
            aug_dialogue = f"{type(e).__name__}: {e}"
            tqdm.write(f"Augmentation failed for dialogue {element['id']}: {aug_dialogue}")

        augmented_dialogues.append(
            {
                'id': element['id'],
                'messages': aug_dialogue
            }
        )

    example['augmented_dialogues'] = augmented_dialogues
    new_data.append(example)

    # Rewrite the checkpoint with everything collected so far.
    with open("../data/gen_dataset_augment_uttr-vars_5", "wb") as fp:
        pickle.dump(new_data, fp)

    # enumerate already tracks the index; stop once MAX_EXAMPLES are done.
    if i + 1 >= MAX_EXAMPLES:
        break

Augmenting example 0:


100%|██████████| 6/6 [01:13<00:00, 12.23s/it]


In [6]:
# Augment every dialogue of the first MAX_EXAMPLES training examples with
# augment_dialogue (variations_augmentation_prompt_6) and checkpoint the
# accumulated results to disk after each example.
MAX_EXAMPLES = 1  # number of dataset examples processed by this run
new_data = []

for i, example in enumerate(dataset['train']):
    print(f'Augmenting example {i}:')
    topic = example['topic']
    all_dialogues = example['dialogues']

    augmented_dialogues = []
    for element in tqdm(all_dialogues, total=len(all_dialogues)):
        orig_dialogue = element['messages']
        try:
            aug_dialogue = augment_dialogue(
                orig_dialogue, topic, variations_augmentation_prompt_6, "gpt-4o-mini-2024-07-18"
            )
        except Exception as e:
            # Best effort: one failed dialogue must not abort the whole run.
            # The failure is recorded as text instead of the live exception
            # object, which may not survive a pickle round trip, and it is
            # reported so it does not go unnoticed. A failed entry is therefore
            # recognisable by 'messages' being a str.
            aug_dialogue = f"{type(e).__name__}: {e}"
            tqdm.write(f"Augmentation failed for dialogue {element['id']}: {aug_dialogue}")

        augmented_dialogues.append(
            {
                'id': element['id'],
                'messages': aug_dialogue
            }
        )

    example['augmented_dialogues'] = augmented_dialogues
    new_data.append(example)

    # Rewrite the checkpoint with everything collected so far.
    with open("../data/gen_dataset_augment_uttr-vars_6", "wb") as fp:
        pickle.dump(new_data, fp)

    # enumerate already tracks the index; stop once MAX_EXAMPLES are done.
    if i + 1 >= MAX_EXAMPLES:
        break

Augmenting example 0:


100%|██████████| 6/6 [00:56<00:00,  9.49s/it]


In [6]:
# Augment every dialogue of the first MAX_EXAMPLES training examples with
# augment_dialogue (variations_augmentation_prompt_7) and checkpoint the
# accumulated results to disk after each example.
MAX_EXAMPLES = 1  # number of dataset examples processed by this run
new_data = []

for i, example in enumerate(dataset['train']):
    print(f'Augmenting example {i}:')
    topic = example['topic']
    all_dialogues = example['dialogues']

    augmented_dialogues = []
    for element in tqdm(all_dialogues, total=len(all_dialogues)):
        orig_dialogue = element['messages']
        try:
            aug_dialogue = augment_dialogue(
                orig_dialogue, topic, variations_augmentation_prompt_7, "gpt-4o-mini-2024-07-18"
            )
        except Exception as e:
            # Best effort: one failed dialogue must not abort the whole run.
            # The failure is recorded as text instead of the live exception
            # object, which may not survive a pickle round trip, and it is
            # reported so it does not go unnoticed. A failed entry is therefore
            # recognisable by 'messages' being a str.
            aug_dialogue = f"{type(e).__name__}: {e}"
            tqdm.write(f"Augmentation failed for dialogue {element['id']}: {aug_dialogue}")

        augmented_dialogues.append(
            {
                'id': element['id'],
                'messages': aug_dialogue
            }
        )

    example['augmented_dialogues'] = augmented_dialogues
    new_data.append(example)

    # Rewrite the checkpoint with everything collected so far.
    with open("../data/gen_dataset_augment_uttr-vars_7", "wb") as fp:
        pickle.dump(new_data, fp)

    # enumerate already tracks the index; stop once MAX_EXAMPLES are done.
    if i + 1 >= MAX_EXAMPLES:
        break

Augmenting example 0:


100%|██████████| 6/6 [00:48<00:00,  8.08s/it]


In [6]:
# Augment every dialogue of the first MAX_EXAMPLES training examples with
# augment_dialogue (variations_augmentation_prompt_8) and checkpoint the
# accumulated results to disk after each example.
MAX_EXAMPLES = 1  # number of dataset examples processed by this run
new_data = []

for i, example in enumerate(dataset['train']):
    print(f'Augmenting example {i}:')
    topic = example['topic']
    all_dialogues = example['dialogues']

    augmented_dialogues = []
    for element in tqdm(all_dialogues, total=len(all_dialogues)):
        orig_dialogue = element['messages']
        try:
            aug_dialogue = augment_dialogue(
                orig_dialogue, topic, variations_augmentation_prompt_8, "gpt-4o-mini-2024-07-18"
            )
        except Exception as e:
            # Best effort: one failed dialogue must not abort the whole run.
            # The failure is recorded as text instead of the live exception
            # object, which may not survive a pickle round trip, and it is
            # reported so it does not go unnoticed. A failed entry is therefore
            # recognisable by 'messages' being a str.
            aug_dialogue = f"{type(e).__name__}: {e}"
            tqdm.write(f"Augmentation failed for dialogue {element['id']}: {aug_dialogue}")

        augmented_dialogues.append(
            {
                'id': element['id'],
                'messages': aug_dialogue
            }
        )

    example['augmented_dialogues'] = augmented_dialogues
    new_data.append(example)

    # Rewrite the checkpoint with everything collected so far.
    with open("../data/gen_dataset_augment_uttr-vars_8", "wb") as fp:
        pickle.dump(new_data, fp)

    # enumerate already tracks the index; stop once MAX_EXAMPLES are done.
    if i + 1 >= MAX_EXAMPLES:
        break

Augmenting example 0:


100%|██████████| 6/6 [00:53<00:00,  8.98s/it]
