In [5]:
import os
import json
import pathlib

In [None]:
import dotenv

# Load key/value pairs from a local .env file into the process environment;
# later cells read HUGGINGFACE_TOKEN through os.getenv.
dotenv.load_dotenv()

In [2]:
from datasets import load_dataset, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing import Optional


class EnvSettings(BaseSettings, case_sensitive=True):
    """Typed view of the secrets this notebook needs.

    Values are resolved by pydantic-settings from the process environment and
    from the ``./.env`` file configured in ``model_config``.

    Attributes:
        HUGGINGFACE_TOKEN: Access token passed to the Hugging Face Hub calls,
            or ``None`` when no token is configured.
    """

    model_config = SettingsConfigDict(env_file="./.env", env_file_encoding="utf-8")
    # ``Optional[...]`` on its own only allows ``None`` as a value; pydantic v2
    # still treats a field without a default as required, so constructing the
    # settings without the variable raised a validation error. The explicit
    # default makes the token genuinely optional.
    HUGGINGFACE_TOKEN: Optional[str] = None

In [18]:
# Build the settings object that exposes HUGGINGFACE_TOKEN to later cells.
env_settings = EnvSettings()

In [None]:
# Collect the records from every regular file directly under ../data into one
# flat list. Each file is assumed to hold a JSON array of records (``extend``
# needs an iterable) — TODO confirm against the data files.
data = []
# ``iterdir`` yields entries in arbitrary, filesystem-dependent order; sorting
# keeps the record order (and the ids derived from it later) reproducible.
for file in sorted(p for p in pathlib.Path("../data").iterdir() if p.is_file()):
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, encoding="utf-8") as f:
        data.extend(json.load(f))
# Show only the record count; echoing the whole list floods the cell output.
len(data)

In [None]:
# Remove the per-dialog "topic" and "validate" keys in place. A dialog that
# lacks either key is left alone because ``pop`` is given a default.
for record in data:
    for dialog in record["dialogs"]:
        for stale_key in ("topic", "validate"):
            dialog.pop(stale_key, None)
data

In [10]:
# Give every dialog an id of the form "<topic>_<n>_<position>": n is the
# running 1-based count of records seen so far for that topic, and position is
# the dialog's index inside its record.
counter = {}
for record in data:
    topic = record["topic"]
    counter[topic] = counter.get(topic, 0) + 1
    for position, dialog in enumerate(record["dialogs"]):
        dialog["id"] = f"{topic}_{counter[topic]}_{position}"

In [None]:
# Spot-check a single dialog from the in-memory records.
data[5]["dialogs"][1]

In [14]:
# Wrap the list of record dicts in a `datasets.Dataset`.
dataset = Dataset.from_list(data)

In [15]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['graph', 'topic', 'dialogs'],
    num_rows: 402
})

In [None]:
# Spot-check one dialog as stored in the Dataset.
dataset[0]["dialogs"][1]

In [None]:
# Upload the dataset to the Hub repository as private, authenticating with
# the token held by the settings object.
dataset.push_to_hub(
    "DeepPavlov/d2g_generated", private=True, token=env_settings.HUGGINGFACE_TOKEN
)

In [None]:
# Load the dataset back from the Hub. The token is read from the process
# environment here rather than from env_settings.
# NOTE(review): this rebinds `dataset`, which previously held the locally
# built Dataset.
dataset = load_dataset("DeepPavlov/d2g_generated", token=os.getenv("HUGGINGFACE_TOKEN"))
dataset

In [8]:
# The source and target column names were identical ("dialogs" -> "dialogs"),
# so the rename could never change the schema, and `rename_column` rejects a
# target name that already exists, which makes the call fail on a fresh run.
# Reuse the loaded dataset as-is.
new_dataset = dataset

In [None]:
# Display the summary of the dataset held in `new_dataset`.
new_dataset

In [None]:
# Upload `new_dataset` to the same Hub repository it was loaded from, as
# private, with the token read from the process environment.
new_dataset.push_to_hub(
    "DeepPavlov/d2g_generated", private=True, token=os.getenv("HUGGINGFACE_TOKEN")
)

In [None]:
# Inspect the first record of the "train" split.
dataset["train"][0]

In [None]:
# Load the augmented dataset from the Hub, with the token read from the
# process environment.
# NOTE(review): this rebinds `dataset` again, now to the augmented repository.
dataset = load_dataset(
    "DeepPavlov/d2g_generated_augmented", token=os.getenv("HUGGINGFACE_TOKEN")
)
dataset

In [14]:
# The source and target column names were identical ("dialogs" -> "dialogs"),
# so the rename could never change the schema, and `rename_column` rejects a
# target name that already exists, which makes the call fail on a fresh run.
# Reuse the loaded dataset as-is.
new_dataset = dataset

In [15]:
# Renaming "augmented_dialogs" to itself cannot change the schema, and
# `rename_column` rejects a target name that already exists. Keep only the
# useful half of that call: verify the column is present in every split.
# Assumes `new_dataset` is a DatasetDict (split name -> Dataset) — TODO confirm.
assert all(
    "augmented_dialogs" in columns for columns in new_dataset.column_names.values()
), "expected an 'augmented_dialogs' column in every split"

In [None]:
# Display the summary of the dataset held in `new_dataset`.
new_dataset

In [None]:
# Upload `new_dataset` to the augmented Hub repository as private, with the
# token read from the process environment.
new_dataset.push_to_hub(
    "DeepPavlov/d2g_generated_augmented",
    private=True,
    token=os.getenv("HUGGINGFACE_TOKEN"),
)