In [1]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment
# (presumably the API credentials used by the augmentation calls — TODO confirm).
load_dotenv()

True

In [2]:
import os

# Location of the project's .env file. "~" is expanded explicitly: assigning
# to os.environ stores the string verbatim, so a consumer that opens the path
# directly would otherwise not find the file.
os.environ["PATH_TO_ENV"] = os.path.expanduser(
    "~/projects/chatsky-llm-autoconfig/.env"
)

In [3]:
from tqdm import tqdm
import pickle
import json

In [4]:
from augmentation_prompts import variations_augmentation_prompt_9
from dialogue_augmentation import augment_dialogue, augment_dialogue_data



In [5]:
from augmentation_utils import is_correct_length_modified, match_roles_modified

# Re-generation of failed instances

Add length and role checks to the augmentation process.

In [8]:
# Read the v2 augmentation results; the trailing expression shows how many
# examples were loaded.
with open(
    "../data/gen_dataset_augment_uttr-vars_9_v2.json", encoding="utf-8"
) as src:
    data = json.load(src)
len(data)

100

In [9]:
# (example index, dialogue index, check result) for every augmented dialogue
# that fails the length check or the roles check.
failure_instances_length = []
failure_instances_roles = []

for i, instance in enumerate(tqdm(data)):
    message_pairs = zip(
        [dial["messages"] for dial in instance["dialogues"]],
        [dial["messages"] for dial in instance["augmented_dialogues"]],
    )

    for j, (orig_dia, aug_dia) in enumerate(message_pairs):
        checks = (
            (failure_instances_length, is_correct_length_modified(orig_dia, aug_dia)),
            (failure_instances_roles, match_roles_modified(orig_dia, aug_dia)),
        )
        # The helpers yield True on success and False or an error string
        # otherwise (see the output below), so compare against True explicitly
        # rather than relying on truthiness.
        for failures, outcome in checks:
            if outcome != True:
                failures.append((i, j, outcome))

failure_instances_length, failure_instances_roles

100%|██████████| 100/100 [00:00<00:00, 20409.25it/s]


([(92, 0, False),
  (92, 1, False),
  (92, 4, False),
  (92, 5, False),
  (92, 6, False)],
 [(92,
   0,
   "Roles comparison error: string indices must be integers, not 'str'"),
  (92,
   1,
   "Roles comparison error: string indices must be integers, not 'str'"),
  (92,
   4,
   "Roles comparison error: string indices must be integers, not 'str'"),
  (92,
   5,
   "Roles comparison error: string indices must be integers, not 'str'"),
  (92,
   6,
   "Roles comparison error: string indices must be integers, not 'str'")])

In [19]:
# Re-generate every augmented dialogue that failed the length check or the
# roles check. Both failure lists are merged so a dialogue that only failed
# the roles check is retried too; duplicates are collapsed.
failed_pairs = sorted(
    {(i, j) for i, j, _ in failure_instances_length}
    | {(i, j) for i, j, _ in failure_instances_roles}
)

for i, j in failed_pairs:
    print(f"Augmenting example {i} dialogue {j}")
    topic = data[i]["topic"]
    orig_dialogue = data[i]["dialogues"][j]["messages"]

    try:
        aug_dialogue = augment_dialogue(
            orig_dialogue,
            topic,
            variations_augmentation_prompt_9,
            "gpt-4o-mini-2024-07-18",
        )
    except Exception as e:
        # Keep going on failure, but store a JSON-serialisable description.
        # Storing the exception object itself would make json.dump below raise
        # TypeError after the output file has already been opened for writing.
        aug_dialogue = f"Augmentation error: {type(e).__name__}: {e}"

    # NOTE: mutates `data` in place; reload the v2 file before re-running.
    data[i]["augmented_dialogues"][j]["messages"] = aug_dialogue

with open(
    "../data/gen_dataset_augment_uttr-vars_9_v3.json", "w", encoding="utf-8"
) as file:
    json.dump(data, file, indent=4)

Augmenting example 92 dialogue 0
Augmenting example 92 dialogue 1
Augmenting example 92 dialogue 4
Augmenting example 92 dialogue 5
Augmenting example 92 dialogue 6


In [None]:
# Read the augmented examples 0-99; the trailing expression shows the count.
with open(
    "../data/gen_dataset_augmented_0-99.json", encoding="utf-8"
) as src:
    data = json.load(src)
len(data)

100

# Augmentation of the remaining dataset

In [6]:
from datasets import load_dataset

# Source dataset; its "train" split is sliced by index in the cells below.
# NOTE(review): token=True presumably makes `datasets` use the locally stored
# Hugging Face token — confirm against the library docs.
dataset = load_dataset("DeepPavlov/d2g_generated", token=True)

In [None]:
# Augment train example 100 on its own (first part of the 100-149 range).
# The output name matches the "..._9_100-149_part1" file that the
# "Saving all data" section loads; the previous name ("..._9_150_part1")
# did not, so a fresh top-to-bottom run could not find its input there.
new_data = augment_dialogue_data(
    dataset["train"].select(range(100, 101)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_100-149_part1",
)

In [None]:
# Augment train examples 101..149 (second part of the 100-149 range).
# The output name matches the "..._9_100-149_part2" file that the
# "Saving all data" section loads; the previous name ("..._9_150_part2")
# did not, so a fresh top-to-bottom run could not find its input there.
new_data = augment_dialogue_data(
    dataset["train"].select(range(101, 150)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_100-149_part2",
)

Augmenting example 0:


100%|██████████| 9/9 [01:37<00:00, 10.84s/it]


Augmenting example 1:


100%|██████████| 8/8 [01:12<00:00,  9.07s/it]


Augmenting example 2:


100%|██████████| 6/6 [01:05<00:00, 10.89s/it]


Augmenting example 3:


100%|██████████| 4/4 [01:22<00:00, 20.69s/it]


Augmenting example 4:


100%|██████████| 11/11 [02:42<00:00, 14.81s/it]


Augmenting example 5:


100%|██████████| 5/5 [00:53<00:00, 10.77s/it]


Augmenting example 6:


100%|██████████| 5/5 [01:11<00:00, 14.34s/it]


Augmenting example 7:


100%|██████████| 5/5 [00:51<00:00, 10.36s/it]


Augmenting example 8:


100%|██████████| 10/10 [03:51<00:00, 23.16s/it]


Augmenting example 9:


100%|██████████| 8/8 [01:24<00:00, 10.56s/it]


Augmenting example 10:


100%|██████████| 8/8 [02:10<00:00, 16.27s/it]


Augmenting example 11:


100%|██████████| 8/8 [02:05<00:00, 15.69s/it]


Augmenting example 12:


100%|██████████| 8/8 [01:36<00:00, 12.03s/it]


Augmenting example 13:


100%|██████████| 11/11 [01:39<00:00,  9.01s/it]


Augmenting example 14:


100%|██████████| 13/13 [02:15<00:00, 10.40s/it]


Augmenting example 15:


100%|██████████| 11/11 [03:10<00:00, 17.34s/it]


Augmenting example 16:


100%|██████████| 13/13 [03:04<00:00, 14.22s/it]


Augmenting example 17:


100%|██████████| 10/10 [00:57<00:00,  5.72s/it]


Augmenting example 18:


100%|██████████| 9/9 [01:35<00:00, 10.61s/it]


Augmenting example 19:


100%|██████████| 7/7 [01:21<00:00, 11.65s/it]


Augmenting example 20:


100%|██████████| 17/17 [02:55<00:00, 10.35s/it]


Augmenting example 21:


100%|██████████| 11/11 [01:35<00:00,  8.66s/it]


Augmenting example 22:


100%|██████████| 8/8 [01:21<00:00, 10.19s/it]


Augmenting example 23:


100%|██████████| 26/26 [04:17<00:00,  9.89s/it]


Augmenting example 24:


100%|██████████| 13/13 [02:01<00:00,  9.34s/it]


Augmenting example 25:


100%|██████████| 5/5 [01:00<00:00, 12.09s/it]


Augmenting example 26:


100%|██████████| 8/8 [01:39<00:00, 12.41s/it]


Augmenting example 27:


100%|██████████| 9/9 [01:17<00:00,  8.66s/it]


Augmenting example 28:


100%|██████████| 8/8 [01:59<00:00, 14.89s/it]


Augmenting example 29:


100%|██████████| 16/16 [02:49<00:00, 10.60s/it]


Augmenting example 30:


100%|██████████| 14/14 [03:12<00:00, 13.74s/it]


Augmenting example 31:


100%|██████████| 7/7 [01:45<00:00, 15.05s/it]


Augmenting example 32:


100%|██████████| 9/9 [01:07<00:00,  7.45s/it]


Augmenting example 33:


100%|██████████| 7/7 [01:09<00:00,  9.90s/it]


Augmenting example 34:


100%|██████████| 10/10 [00:59<00:00,  5.95s/it]


Augmenting example 35:


100%|██████████| 8/8 [01:17<00:00,  9.69s/it]


Augmenting example 36:


100%|██████████| 10/10 [02:27<00:00, 14.74s/it]


Augmenting example 37:


100%|██████████| 9/9 [01:21<00:00,  9.05s/it]


Augmenting example 38:


100%|██████████| 10/10 [02:01<00:00, 12.14s/it]


Augmenting example 39:


100%|██████████| 6/6 [01:30<00:00, 15.00s/it]


Augmenting example 40:


100%|██████████| 17/17 [03:20<00:00, 11.80s/it]


Augmenting example 41:


100%|██████████| 9/9 [01:45<00:00, 11.70s/it]


Augmenting example 42:


100%|██████████| 6/6 [00:55<00:00,  9.28s/it]


Augmenting example 43:


100%|██████████| 8/8 [01:43<00:00, 12.91s/it]


Augmenting example 44:


100%|██████████| 6/6 [01:16<00:00, 12.67s/it]


Augmenting example 45:


100%|██████████| 9/9 [01:18<00:00,  8.73s/it]


Augmenting example 46:


100%|██████████| 8/8 [01:42<00:00, 12.87s/it]


Augmenting example 47:


100%|██████████| 11/11 [01:30<00:00,  8.25s/it]


Augmenting example 48:


100%|██████████| 8/8 [01:05<00:00,  8.21s/it]


In [None]:
# Augment train examples 150..199 (50 items). The progress output numbers the
# examples from 0, i.e. relative to the selected slice.
# NOTE(review): the last argument looks like the path the helper writes its
# results to — confirm in dialogue_augmentation.
new_data = augment_dialogue_data(
    dataset["train"].select(range(150, 200)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_150-199",
)

Augmenting example 0:


100%|██████████| 9/9 [01:14<00:00,  8.31s/it]


Augmenting example 1:


100%|██████████| 9/9 [02:21<00:00, 15.68s/it]


Augmenting example 2:


100%|██████████| 6/6 [01:33<00:00, 15.56s/it]


Augmenting example 3:


100%|██████████| 10/10 [01:45<00:00, 10.54s/it]


Augmenting example 4:


100%|██████████| 9/9 [01:02<00:00,  6.91s/it]


Augmenting example 5:


100%|██████████| 8/8 [00:47<00:00,  5.99s/it]


Augmenting example 6:


100%|██████████| 7/7 [00:54<00:00,  7.85s/it]


Augmenting example 7:


100%|██████████| 8/8 [01:11<00:00,  8.92s/it]


Augmenting example 8:


100%|██████████| 12/12 [02:07<00:00, 10.58s/it]


Augmenting example 9:


100%|██████████| 11/11 [01:53<00:00, 10.30s/it]


Augmenting example 10:


100%|██████████| 12/12 [02:48<00:00, 14.04s/it]


Augmenting example 11:


100%|██████████| 28/28 [05:27<00:00, 11.69s/it]


Augmenting example 12:


100%|██████████| 9/9 [01:36<00:00, 10.68s/it]


Augmenting example 13:


100%|██████████| 8/8 [00:57<00:00,  7.20s/it]


Augmenting example 14:


100%|██████████| 7/7 [00:46<00:00,  6.66s/it]


Augmenting example 15:


100%|██████████| 10/10 [02:24<00:00, 14.41s/it]


Augmenting example 16:


100%|██████████| 10/10 [01:57<00:00, 11.80s/it]


Augmenting example 17:


100%|██████████| 13/13 [02:29<00:00, 11.52s/it]


Augmenting example 18:


100%|██████████| 6/6 [01:26<00:00, 14.42s/it]


Augmenting example 19:


100%|██████████| 9/9 [01:30<00:00, 10.05s/it]


Augmenting example 20:


100%|██████████| 16/16 [02:52<00:00, 10.77s/it]


Augmenting example 21:


100%|██████████| 4/4 [00:35<00:00,  8.95s/it]


Augmenting example 22:


100%|██████████| 9/9 [01:47<00:00, 11.97s/it]


Augmenting example 23:


100%|██████████| 8/8 [02:48<00:00, 21.08s/it]


Augmenting example 24:


100%|██████████| 5/5 [00:43<00:00,  8.64s/it]


Augmenting example 25:


100%|██████████| 36/36 [04:53<00:00,  8.14s/it]


Augmenting example 26:


100%|██████████| 9/9 [01:50<00:00, 12.22s/it]


Augmenting example 27:


100%|██████████| 10/10 [01:40<00:00, 10.01s/it]


Augmenting example 28:


100%|██████████| 10/10 [01:53<00:00, 11.39s/it]


Augmenting example 29:


100%|██████████| 7/7 [01:59<00:00, 17.09s/it]


Augmenting example 30:


100%|██████████| 6/6 [00:42<00:00,  7.04s/it]


Augmenting example 31:


100%|██████████| 9/9 [01:31<00:00, 10.21s/it]


Augmenting example 32:


100%|██████████| 6/6 [00:40<00:00,  6.80s/it]


Augmenting example 33:


100%|██████████| 16/16 [02:52<00:00, 10.80s/it]


Augmenting example 34:


100%|██████████| 17/17 [05:09<00:00, 18.20s/it]


Augmenting example 35:


100%|██████████| 9/9 [01:08<00:00,  7.63s/it]


Augmenting example 36:


100%|██████████| 7/7 [01:42<00:00, 14.63s/it]


Augmenting example 37:


100%|██████████| 10/10 [02:47<00:00, 16.77s/it]


Augmenting example 38:


100%|██████████| 8/8 [02:08<00:00, 16.01s/it]


Augmenting example 39:


100%|██████████| 18/18 [02:17<00:00,  7.65s/it]


Augmenting example 40:


100%|██████████| 10/10 [01:30<00:00,  9.04s/it]


Augmenting example 41:


100%|██████████| 14/14 [02:09<00:00,  9.26s/it]


Augmenting example 42:


100%|██████████| 8/8 [00:55<00:00,  6.94s/it]


Augmenting example 43:


100%|██████████| 7/7 [00:48<00:00,  6.90s/it]


Augmenting example 44:


100%|██████████| 12/12 [02:44<00:00, 13.68s/it]


Augmenting example 45:


100%|██████████| 15/15 [03:27<00:00, 13.84s/it]


Augmenting example 46:


100%|██████████| 8/8 [02:00<00:00, 15.10s/it]


Augmenting example 47:


100%|██████████| 9/9 [01:47<00:00, 11.89s/it]


Augmenting example 48:


100%|██████████| 14/14 [02:29<00:00, 10.66s/it]


Augmenting example 49:


100%|██████████| 3/3 [00:19<00:00,  6.50s/it]


In [76]:
# Sanity check: one result per selected example (range(150, 200) -> 50).
len(new_data)

50

In [65]:
# Persist the augmented 150-199 slice as pretty-printed JSON.
with open(
    "../data/gen_dataset_augmented_150-199.json", mode="w", encoding="utf-8"
) as out_file:
    json.dump(new_data, out_file, indent=4)

In [9]:
# Augment train examples 201..249 (second part of the 200-249 range).
# NOTE(review): the merge cell further down also reads a "..._200-249_part1"
# file, but no visible cell produces it — presumably example 200 was
# augmented in a separate run; confirm before re-running from scratch.
new_data = augment_dialogue_data(
    dataset["train"].select(range(201, 250)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_200-249_part2",
)

Augmenting example 0:


100%|██████████| 31/31 [05:21<00:00, 10.36s/it]


Augmenting example 1:


100%|██████████| 43/43 [07:55<00:00, 11.05s/it]


Augmenting example 2:


100%|██████████| 24/24 [02:41<00:00,  6.73s/it]


Augmenting example 3:


100%|██████████| 56/56 [09:02<00:00,  9.68s/it]


Augmenting example 4:


100%|██████████| 6/6 [01:23<00:00, 13.92s/it]


Augmenting example 5:


100%|██████████| 10/10 [01:17<00:00,  7.72s/it]


Augmenting example 6:


100%|██████████| 7/7 [00:40<00:00,  5.77s/it]


Augmenting example 7:


100%|██████████| 11/11 [01:22<00:00,  7.52s/it]


Augmenting example 8:


100%|██████████| 6/6 [00:57<00:00,  9.55s/it]


Augmenting example 9:


100%|██████████| 9/9 [01:48<00:00, 12.05s/it]


Augmenting example 10:


100%|██████████| 16/16 [02:59<00:00, 11.24s/it]


Augmenting example 11:


100%|██████████| 8/8 [01:09<00:00,  8.69s/it]


Augmenting example 12:


100%|██████████| 32/32 [04:49<00:00,  9.05s/it]


Augmenting example 13:


100%|██████████| 96/96 [14:58<00:00,  9.36s/it]


Augmenting example 14:


100%|██████████| 15/15 [01:25<00:00,  5.72s/it]


Augmenting example 15:


100%|██████████| 44/44 [07:52<00:00, 10.73s/it]


Augmenting example 16:


100%|██████████| 64/64 [11:19<00:00, 10.62s/it]


Augmenting example 17:


100%|██████████| 54/54 [08:11<00:00,  9.10s/it]


Augmenting example 18:


100%|██████████| 74/74 [08:29<00:00,  6.88s/it]


Augmenting example 19:


100%|██████████| 69/69 [08:58<00:00,  7.80s/it]


Augmenting example 20:


100%|██████████| 46/46 [04:08<00:00,  5.41s/it]


Augmenting example 21:


100%|██████████| 47/47 [14:57<00:00, 19.10s/it]


Augmenting example 22:


100%|██████████| 12/12 [02:36<00:00, 13.04s/it]


Augmenting example 23:


100%|██████████| 9/9 [01:35<00:00, 10.56s/it]


Augmenting example 24:


100%|██████████| 10/10 [01:21<00:00,  8.16s/it]


Augmenting example 25:


100%|██████████| 7/7 [01:42<00:00, 14.67s/it]


Augmenting example 26:


100%|██████████| 5/5 [00:58<00:00, 11.76s/it]


Augmenting example 27:


100%|██████████| 4/4 [00:44<00:00, 11.03s/it]


Augmenting example 28:


100%|██████████| 2/2 [00:42<00:00, 21.40s/it]


Augmenting example 29:


100%|██████████| 9/9 [01:10<00:00,  7.79s/it]


Augmenting example 30:


100%|██████████| 5/5 [01:01<00:00, 12.28s/it]


Augmenting example 31:


100%|██████████| 15/15 [03:27<00:00, 13.81s/it]


Augmenting example 32:


100%|██████████| 4/4 [00:58<00:00, 14.52s/it]


Augmenting example 33:


100%|██████████| 8/8 [02:40<00:00, 20.03s/it]


Augmenting example 34:


100%|██████████| 5/5 [00:27<00:00,  5.57s/it]


Augmenting example 35:


100%|██████████| 3/3 [00:28<00:00,  9.60s/it]


Augmenting example 36:


100%|██████████| 4/4 [00:37<00:00,  9.46s/it]


Augmenting example 37:


100%|██████████| 13/13 [01:44<00:00,  8.04s/it]


Augmenting example 38:


100%|██████████| 6/6 [00:49<00:00,  8.32s/it]


Augmenting example 39:


100%|██████████| 5/5 [00:48<00:00,  9.77s/it]


Augmenting example 40:


100%|██████████| 5/5 [00:36<00:00,  7.36s/it]


Augmenting example 41:


100%|██████████| 6/6 [00:35<00:00,  5.91s/it]


Augmenting example 42:


100%|██████████| 5/5 [00:44<00:00,  8.84s/it]


Augmenting example 43:


100%|██████████| 2/2 [00:33<00:00, 16.88s/it]


Augmenting example 44:


100%|██████████| 18/18 [04:05<00:00, 13.62s/it]


Augmenting example 45:


100%|██████████| 9/9 [01:32<00:00, 10.30s/it]


Augmenting example 46:


100%|██████████| 6/6 [01:09<00:00, 11.54s/it]


Augmenting example 47:


100%|██████████| 7/7 [00:39<00:00,  5.63s/it]


Augmenting example 48:


100%|██████████| 4/4 [00:45<00:00, 11.38s/it]


In [15]:
# Augment train examples 250..299 (50 items). The progress output numbers the
# examples from 0, i.e. relative to the selected slice.
new_data = augment_dialogue_data(
    dataset["train"].select(range(250, 300)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_250-299",
)

Augmenting example 0:


100%|██████████| 33/33 [06:41<00:00, 12.16s/it]


Augmenting example 1:


100%|██████████| 4/4 [00:39<00:00,  9.77s/it]


Augmenting example 2:


100%|██████████| 9/9 [01:26<00:00,  9.61s/it]


Augmenting example 3:


100%|██████████| 4/4 [00:35<00:00,  8.79s/it]


Augmenting example 4:


100%|██████████| 8/8 [01:49<00:00, 13.66s/it]


Augmenting example 5:


100%|██████████| 7/7 [01:14<00:00, 10.60s/it]


Augmenting example 6:


100%|██████████| 3/3 [00:42<00:00, 14.28s/it]


Augmenting example 7:


100%|██████████| 12/12 [01:48<00:00,  9.03s/it]


Augmenting example 8:


100%|██████████| 5/5 [00:52<00:00, 10.50s/it]


Augmenting example 9:


100%|██████████| 6/6 [01:16<00:00, 12.83s/it]


Augmenting example 10:


100%|██████████| 5/5 [00:29<00:00,  5.91s/it]


Augmenting example 11:


100%|██████████| 10/10 [01:32<00:00,  9.21s/it]


Augmenting example 12:


100%|██████████| 6/6 [00:44<00:00,  7.48s/it]


Augmenting example 13:


100%|██████████| 7/7 [01:31<00:00, 13.09s/it]


Augmenting example 14:


100%|██████████| 13/13 [01:22<00:00,  6.36s/it]


Augmenting example 15:


100%|██████████| 4/4 [00:30<00:00,  7.64s/it]


Augmenting example 16:


100%|██████████| 7/7 [00:56<00:00,  8.06s/it]


Augmenting example 17:


100%|██████████| 19/19 [03:30<00:00, 11.08s/it]


Augmenting example 18:


100%|██████████| 4/4 [00:45<00:00, 11.25s/it]


Augmenting example 19:


100%|██████████| 14/14 [01:52<00:00,  8.04s/it]


Augmenting example 20:


100%|██████████| 7/7 [01:05<00:00,  9.42s/it]


Augmenting example 21:


100%|██████████| 16/16 [02:34<00:00,  9.64s/it]


Augmenting example 22:


100%|██████████| 5/5 [00:48<00:00,  9.79s/it]


Augmenting example 23:


100%|██████████| 3/3 [00:29<00:00,  9.95s/it]


Augmenting example 24:


100%|██████████| 3/3 [00:47<00:00, 15.94s/it]


Augmenting example 25:


100%|██████████| 3/3 [00:37<00:00, 12.63s/it]


Augmenting example 26:


100%|██████████| 8/8 [00:54<00:00,  6.78s/it]


Augmenting example 27:


100%|██████████| 9/9 [02:05<00:00, 13.99s/it]


Augmenting example 28:


100%|██████████| 3/3 [00:42<00:00, 14.12s/it]


Augmenting example 29:


100%|██████████| 8/8 [01:16<00:00,  9.62s/it]


Augmenting example 30:


100%|██████████| 11/11 [01:55<00:00, 10.47s/it]


Augmenting example 31:


100%|██████████| 6/6 [00:53<00:00,  8.95s/it]


Augmenting example 32:


100%|██████████| 11/11 [03:18<00:00, 18.09s/it]


Augmenting example 33:


100%|██████████| 20/20 [04:26<00:00, 13.30s/it]


Augmenting example 34:


100%|██████████| 12/12 [02:27<00:00, 12.29s/it]


Augmenting example 35:


100%|██████████| 10/10 [01:51<00:00, 11.15s/it]


Augmenting example 36:


100%|██████████| 9/9 [01:56<00:00, 12.95s/it]


Augmenting example 37:


100%|██████████| 10/10 [02:15<00:00, 13.52s/it]


Augmenting example 38:


100%|██████████| 7/7 [02:34<00:00, 22.10s/it]


Augmenting example 39:


100%|██████████| 7/7 [01:03<00:00,  9.12s/it]


Augmenting example 40:


100%|██████████| 5/5 [01:04<00:00, 12.97s/it]


Augmenting example 41:


100%|██████████| 8/8 [01:28<00:00, 11.05s/it]


Augmenting example 42:


100%|██████████| 22/22 [03:35<00:00,  9.80s/it]


Augmenting example 43:


100%|██████████| 9/9 [00:47<00:00,  5.25s/it]


Augmenting example 44:


100%|██████████| 6/6 [00:55<00:00,  9.23s/it]


Augmenting example 45:


100%|██████████| 5/5 [00:44<00:00,  8.82s/it]


Augmenting example 46:


100%|██████████| 17/17 [04:10<00:00, 14.76s/it]


Augmenting example 47:


100%|██████████| 6/6 [01:10<00:00, 11.75s/it]


Augmenting example 48:


100%|██████████| 20/20 [04:00<00:00, 12.01s/it]


Augmenting example 49:


100%|██████████| 4/4 [01:21<00:00, 20.50s/it]


In [16]:
# Sanity check: one result per selected example (range(250, 300) -> 50).
len(new_data)

50

In [17]:
# Persist the augmented 250-299 slice as pretty-printed JSON.
with open(
    "../data/gen_dataset_augmented_250-299.json", mode="w", encoding="utf-8"
) as out_file:
    json.dump(new_data, out_file, indent=4)

In [7]:
# Augment train examples 300..349 (50 items). The progress output numbers the
# examples from 0, i.e. relative to the selected slice.
new_data = augment_dialogue_data(
    dataset["train"].select(range(300, 350)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_300-349",
)

Augmenting example 0:


100%|██████████| 9/9 [01:32<00:00, 10.26s/it]


Augmenting example 1:


100%|██████████| 6/6 [01:05<00:00, 10.84s/it]


Augmenting example 2:


100%|██████████| 6/6 [01:34<00:00, 15.82s/it]


Augmenting example 3:


100%|██████████| 12/12 [01:35<00:00,  7.99s/it]


Augmenting example 4:


100%|██████████| 13/13 [02:06<00:00,  9.71s/it]


Augmenting example 5:


100%|██████████| 6/6 [01:39<00:00, 16.52s/it]


Augmenting example 6:


100%|██████████| 7/7 [01:12<00:00, 10.43s/it]


Augmenting example 7:


100%|██████████| 11/11 [02:47<00:00, 15.19s/it]


Augmenting example 8:


100%|██████████| 11/11 [02:20<00:00, 12.80s/it]


Augmenting example 9:


100%|██████████| 9/9 [01:30<00:00, 10.02s/it]


Augmenting example 10:


100%|██████████| 7/7 [01:34<00:00, 13.50s/it]


Augmenting example 11:


100%|██████████| 8/8 [00:55<00:00,  6.97s/it]


Augmenting example 12:


100%|██████████| 11/11 [01:41<00:00,  9.26s/it]


Augmenting example 13:


100%|██████████| 7/7 [02:33<00:00, 21.99s/it]


Augmenting example 14:


100%|██████████| 12/12 [01:16<00:00,  6.37s/it]


Augmenting example 15:


100%|██████████| 4/4 [00:57<00:00, 14.43s/it]


Augmenting example 16:


100%|██████████| 10/10 [01:41<00:00, 10.15s/it]


Augmenting example 17:


100%|██████████| 9/9 [01:20<00:00,  8.93s/it]


Augmenting example 18:


100%|██████████| 12/12 [01:54<00:00,  9.50s/it]


Augmenting example 19:


100%|██████████| 40/40 [05:56<00:00,  8.90s/it]


Augmenting example 20:


100%|██████████| 7/7 [01:16<00:00, 10.88s/it]


Augmenting example 21:


100%|██████████| 6/6 [01:40<00:00, 16.69s/it]


Augmenting example 22:


100%|██████████| 7/7 [01:09<00:00,  9.92s/it]


Augmenting example 23:


100%|██████████| 12/12 [02:31<00:00, 12.64s/it]


Augmenting example 24:


100%|██████████| 12/12 [01:51<00:00,  9.31s/it]


Augmenting example 25:


100%|██████████| 5/5 [01:25<00:00, 17.06s/it]


Augmenting example 26:


100%|██████████| 7/7 [00:53<00:00,  7.64s/it]


Augmenting example 27:


100%|██████████| 5/5 [00:39<00:00,  7.97s/it]


Augmenting example 28:


100%|██████████| 6/6 [01:13<00:00, 12.31s/it]


Augmenting example 29:


100%|██████████| 10/10 [01:30<00:00,  9.07s/it]


Augmenting example 30:


100%|██████████| 6/6 [00:52<00:00,  8.74s/it]


Augmenting example 31:


100%|██████████| 7/7 [00:56<00:00,  8.00s/it]


Augmenting example 32:


100%|██████████| 4/4 [00:42<00:00, 10.71s/it]


Augmenting example 33:


100%|██████████| 7/7 [01:26<00:00, 12.29s/it]


Augmenting example 34:


100%|██████████| 5/5 [01:10<00:00, 14.02s/it]


Augmenting example 35:


100%|██████████| 7/7 [01:16<00:00, 10.90s/it]


Augmenting example 36:


100%|██████████| 13/13 [01:59<00:00,  9.18s/it]


Augmenting example 37:


100%|██████████| 13/13 [01:24<00:00,  6.52s/it]


Augmenting example 38:


100%|██████████| 8/8 [01:11<00:00,  8.97s/it]


Augmenting example 39:


100%|██████████| 12/12 [02:10<00:00, 10.87s/it]


Augmenting example 40:


100%|██████████| 5/5 [00:51<00:00, 10.21s/it]


Augmenting example 41:


100%|██████████| 8/8 [02:00<00:00, 15.01s/it]


Augmenting example 42:


100%|██████████| 6/6 [00:58<00:00,  9.73s/it]


Augmenting example 43:


100%|██████████| 13/13 [01:55<00:00,  8.92s/it]


Augmenting example 44:


100%|██████████| 20/20 [02:44<00:00,  8.23s/it]


Augmenting example 45:


100%|██████████| 5/5 [00:59<00:00, 11.89s/it]


Augmenting example 46:


100%|██████████| 7/7 [01:41<00:00, 14.56s/it]


Augmenting example 47:


100%|██████████| 20/20 [02:56<00:00,  8.80s/it]


Augmenting example 48:


100%|██████████| 21/21 [02:20<00:00,  6.71s/it]


Augmenting example 49:


100%|██████████| 38/38 [06:40<00:00, 10.54s/it]


In [8]:
# Sanity check: one result per selected example (range(300, 350) -> 50).
len(new_data)

50

In [9]:
# Persist the augmented 300-349 slice as pretty-printed JSON.
with open(
    "../data/gen_dataset_augmented_300-349.json", mode="w", encoding="utf-8"
) as out_file:
    json.dump(new_data, out_file, indent=4)

In [None]:
# Augment train examples 350..399 (50 items). The progress output numbers the
# examples from 0, i.e. relative to the selected slice.
new_data = augment_dialogue_data(
    dataset["train"].select(range(350, 400)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_350-399",
)

Augmenting example 0:


100%|██████████| 12/12 [02:22<00:00, 11.84s/it]


Augmenting example 1:


100%|██████████| 11/11 [01:51<00:00, 10.12s/it]


Augmenting example 2:


100%|██████████| 11/11 [01:33<00:00,  8.49s/it]


Augmenting example 3:


100%|██████████| 8/8 [01:47<00:00, 13.43s/it]


Augmenting example 4:


100%|██████████| 7/7 [02:16<00:00, 19.56s/it]


Augmenting example 5:


100%|██████████| 16/16 [03:28<00:00, 13.05s/it]


Augmenting example 6:


100%|██████████| 16/16 [03:37<00:00, 13.57s/it]


Augmenting example 7:


100%|██████████| 10/10 [01:52<00:00, 11.28s/it]


Augmenting example 8:


100%|██████████| 7/7 [01:00<00:00,  8.67s/it]


Augmenting example 9:


100%|██████████| 14/14 [02:33<00:00, 10.98s/it]


Augmenting example 10:


100%|██████████| 6/6 [01:07<00:00, 11.22s/it]


Augmenting example 11:


100%|██████████| 7/7 [01:08<00:00,  9.75s/it]


Augmenting example 12:


100%|██████████| 9/9 [02:05<00:00, 13.90s/it]


Augmenting example 13:


100%|██████████| 6/6 [01:22<00:00, 13.70s/it]


Augmenting example 14:


100%|██████████| 11/11 [01:51<00:00, 10.17s/it]


Augmenting example 15:


100%|██████████| 6/6 [02:05<00:00, 20.88s/it]


Augmenting example 16:


100%|██████████| 7/7 [01:51<00:00, 15.97s/it]


Augmenting example 17:


100%|██████████| 10/10 [01:32<00:00,  9.24s/it]


Augmenting example 18:


100%|██████████| 6/6 [01:07<00:00, 11.23s/it]


Augmenting example 19:


100%|██████████| 6/6 [01:24<00:00, 14.11s/it]


Augmenting example 20:


100%|██████████| 6/6 [01:13<00:00, 12.31s/it]


Augmenting example 21:


100%|██████████| 11/11 [02:48<00:00, 15.31s/it]


Augmenting example 22:


100%|██████████| 10/10 [01:34<00:00,  9.41s/it]


Augmenting example 23:


100%|██████████| 3/3 [00:58<00:00, 19.42s/it]


Augmenting example 24:


100%|██████████| 6/6 [01:17<00:00, 12.86s/it]


Augmenting example 25:


100%|██████████| 9/9 [01:32<00:00, 10.24s/it]


Augmenting example 26:


100%|██████████| 7/7 [01:14<00:00, 10.67s/it]


Augmenting example 27:


100%|██████████| 4/4 [01:06<00:00, 16.59s/it]


Augmenting example 28:


100%|██████████| 6/6 [01:05<00:00, 10.99s/it]


Augmenting example 29:


100%|██████████| 7/7 [01:09<00:00,  9.90s/it]


Augmenting example 30:


100%|██████████| 9/9 [01:43<00:00, 11.55s/it]


Augmenting example 31:


100%|██████████| 7/7 [01:42<00:00, 14.60s/it]


Augmenting example 32:


100%|██████████| 8/8 [01:31<00:00, 11.41s/it]


Augmenting example 33:


100%|██████████| 9/9 [01:49<00:00, 12.21s/it]


Augmenting example 34:


100%|██████████| 9/9 [01:43<00:00, 11.45s/it]


Augmenting example 35:


100%|██████████| 11/11 [01:49<00:00,  9.91s/it]


Augmenting example 36:


100%|██████████| 15/15 [03:07<00:00, 12.47s/it]


Augmenting example 37:


100%|██████████| 5/5 [01:52<00:00, 22.54s/it]


Augmenting example 38:


100%|██████████| 12/12 [02:33<00:00, 12.75s/it]


Augmenting example 39:


100%|██████████| 6/6 [00:33<00:00,  5.55s/it]


Augmenting example 40:


100%|██████████| 11/11 [02:55<00:00, 15.99s/it]


Augmenting example 41:


100%|██████████| 10/10 [02:09<00:00, 12.92s/it]


Augmenting example 42:


100%|██████████| 32/32 [04:00<00:00,  7.50s/it]


Augmenting example 43:


100%|██████████| 16/16 [02:57<00:00, 11.11s/it]


Augmenting example 44:


100%|██████████| 9/9 [01:35<00:00, 10.58s/it]


Augmenting example 45:


100%|██████████| 6/6 [01:02<00:00, 10.48s/it]


Augmenting example 46:


100%|██████████| 10/10 [01:31<00:00,  9.17s/it]


Augmenting example 47:


100%|██████████| 9/9 [01:29<00:00,  9.98s/it]


Augmenting example 48:


100%|██████████| 8/8 [00:48<00:00,  6.03s/it]


Augmenting example 49:


100%|██████████| 4/4 [01:00<00:00, 15.24s/it]


In [None]:
# Sanity check: one result per selected example (range(350, 400) -> 50).
len(new_data)

50

In [None]:
# Persist the augmented 350-399 slice as pretty-printed JSON.
with open(
    "../data/gen_dataset_augmented_350-399.json", mode="w", encoding="utf-8"
) as out_file:
    json.dump(new_data, out_file, indent=4)

In [None]:
# Augment the last two train examples, indices 400 and 401.
# NOTE: range(400, 402) excludes 402, so the "400-402" in the file name is an
# exclusive upper bound, unlike the inclusive "150-199"-style names above.
new_data = augment_dialogue_data(
    dataset["train"].select(range(400, 402)),
    variations_augmentation_prompt_9,
    "gpt-4o-mini-2024-07-18",
    "../data/gen_dataset_augment_uttr-vars_9_400-402",
)

Augmenting example 0:


100%|██████████| 7/7 [01:03<00:00,  9.03s/it]


Augmenting example 1:


100%|██████████| 11/11 [01:41<00:00,  9.21s/it]


In [None]:
# Sanity check: one result per selected example (range(400, 402) -> 2).
len(new_data)

2

In [None]:
# Persist the augmented results for examples 400 and 401 as pretty-printed JSON.
with open(
    "../data/gen_dataset_augmented_400-402.json", mode="w", encoding="utf-8"
) as out_file:
    json.dump(new_data, out_file, indent=4)

## Saving all data

In [8]:
# Stitch the two pickled parts of the 100-149 range back into one list and
# store the result as a single pickle.
with open(
    "../data/gen_dataset_augment_uttr-vars_9_100-149_part1", "rb"
) as part1_file:
    data_1 = pickle.load(part1_file)
with open(
    "../data/gen_dataset_augment_uttr-vars_9_100-149_part2", "rb"
) as part2_file:
    data_2 = pickle.load(part2_file)

all_data = data_1 + data_2
with open("../data/gen_dataset_augmented_100-149", "wb") as merged_file:
    pickle.dump(all_data, merged_file)

In [9]:
# Stitch the two pickled parts of the 200-249 range back into one list and
# store the result as a single pickle.
with open(
    "../data/gen_dataset_augment_uttr-vars_9_200-249_part1", "rb"
) as part1_file:
    data_1 = pickle.load(part1_file)
with open(
    "../data/gen_dataset_augment_uttr-vars_9_200-249_part2", "rb"
) as part2_file:
    data_2 = pickle.load(part2_file)

all_data = data_1 + data_2
with open("../data/gen_dataset_augmented_200-249_v2", "wb") as merged_file:
    pickle.dump(all_data, merged_file)

In [10]:
# Load every per-range result file. Each variable is named after the
# exclusive upper bound of its range (data_150 holds examples 100-149, etc.).
# Ranges saved as JSON are read with json, the merged ones with pickle.
with open("../data/gen_dataset_augmented_0-99.json", "r") as src:
    data_100 = json.load(src)
with open("../data/gen_dataset_augmented_100-149", "rb") as src:
    data_150 = pickle.load(src)
with open("../data/gen_dataset_augmented_150-199.json", "r") as src:
    data_200 = json.load(src)
with open("../data/gen_dataset_augmented_200-249_v2", "rb") as src:
    data_250 = pickle.load(src)
with open("../data/gen_dataset_augmented_250-299.json", "r") as src:
    data_300 = json.load(src)
with open("../data/gen_dataset_augmented_300-349.json", "r") as src:
    data_350 = json.load(src)
with open("../data/gen_dataset_augmented_350-399.json", "r") as src:
    data_400 = json.load(src)
with open("../data/gen_dataset_augmented_400-402.json", "r") as src:
    data_402 = json.load(src)

# Concatenate in index order so position k corresponds to train example k.
data = [
    *data_100,
    *data_150,
    *data_200,
    *data_250,
    *data_300,
    *data_350,
    *data_400,
    *data_402,
]
len(data)

402

Compare our data with the source dataset

In [11]:
from datasets import load_dataset

dataset = load_dataset("DeepPavlov/d2g_generated", token=True)

# Count the positions at which our example still carries the same graph,
# topic and dialogues as the source example at that index.
compared_keys = ("graph", "topic", "dialogues")
count = sum(
    all(ours[key] == theirs[key] for key in compared_keys)
    for ours, theirs in zip(data, dataset["train"])
)
count

402

Save data

In [13]:
# Store the complete augmented dataset (402 examples) as one pickle.
with open("../data/gen_dataset_augmented_0-402", mode="wb") as out_file:
    pickle.dump(data, out_file)