In [5]:
from platform import system

from datasets import Dataset, concatenate_datasets
import os
import itertools

In [6]:
# Collect the paths of all preprocessed dataset files.
# File structure: preprocessed/{task}/{dataset_name}.jsonl
# task: CHAT, CORPUS, CORPUS_SPOKEN, DIALECT, TASK_SUMMARIZATION, TASK_TRANSLATION

preprocessed_folder = 'preprocessed'
datasets = []

# os.walk yields directory entries in arbitrary, filesystem-dependent order.
# Sorting the sub-folders in place (which also fixes the traversal order) and
# the file names makes the resulting list identical on every machine, so that
# anything built from it in list order (e.g. concatenation followed by a seeded
# shuffle) is reproducible.
for root, dirs, files in os.walk(preprocessed_folder):
    dirs.sort()
    for file in sorted(files):
        if file.endswith('.jsonl'):
            datasets.append(os.path.join(root, file))

datasets

['preprocessed\\CHAT\\009.전문분야_기술과학_한국어 멀티세션 데이터.jsonl',
 'preprocessed\\CHAT\\010.전문분야_사회과학_한국어 멀티세션 데이터.jsonl',
 'preprocessed\\CHAT\\044.페르소나 대화.jsonl',
 'preprocessed\\CHAT\\045.지식검색 대화.jsonl',
 'preprocessed\\CHAT\\046.공감형 대화.jsonl',
 'preprocessed\\CHAT\\141.한국어 멀티세션 대화.jsonl',
 'preprocessed\\CHAT\\NIKL_DIALOGUE_2020_v1.4.zip.jsonl',
 'preprocessed\\CHAT\\NIKL_DIALOGUE_2021_v1.1.zip.jsonl',
 'preprocessed\\CHAT\\NIKL_DIALOGUE_2022_v1.0_JSON.zip.jsonl',
 'preprocessed\\CORPUS\\016.행정 문서 대상 기계독해 데이터.jsonl',
 'preprocessed\\CORPUS\\017.뉴스 기사 기계독해 데이터.jsonl',
 'preprocessed\\CORPUS\\018.논문자료 요약 데이터.jsonl',
 'preprocessed\\CORPUS\\019.법률, 규정 (판결서, 약관 등) 텍스트 분석 데이터.jsonl',
 'preprocessed\\CORPUS\\021.도서자료 기계독해.jsonl',
 'preprocessed\\CORPUS\\022.요약문 및 레포트 생성 데이터.jsonl',
 'preprocessed\\CORPUS\\023.방송 콘텐츠 대본 요약 데이터.jsonl',
 'preprocessed\\CORPUS\\029.대규모 구매도서 기반 한국어 말뭉치 데이터.jsonl',
 'preprocessed\\CORPUS\\030.웹데이터 기반 한국어 말뭉치 데이터.jsonl',
 'preprocessed\\CORPUS\\048.일반상식 문장 생성 데이터.jsonl'

In [7]:
# Group the dataset paths by task, i.e. by the folder directly under `preprocessed`.
tasks = ['CHAT', 'CORPUS', 'CORPUS_SPOKEN', 'DIALECT', 'TASK_SUMMARIZATION', 'TASK_TRANSLATION']
datasets_by_task = {task: [] for task in tasks}

for dataset in datasets:
    # Normalise separators before splitting: os.path.join emits '\\' on Windows
    # and '/' elsewhere, so splitting on '\\' alone raises IndexError on
    # Linux/macOS paths.
    task = dataset.replace('\\', '/').split('/')[1]
    # A folder name that is not listed in `tasks` raises KeyError here.
    datasets_by_task[task].append(dataset)

datasets_by_task

{'CHAT': ['preprocessed\\CHAT\\009.전문분야_기술과학_한국어 멀티세션 데이터.jsonl',
  'preprocessed\\CHAT\\010.전문분야_사회과학_한국어 멀티세션 데이터.jsonl',
  'preprocessed\\CHAT\\044.페르소나 대화.jsonl',
  'preprocessed\\CHAT\\045.지식검색 대화.jsonl',
  'preprocessed\\CHAT\\046.공감형 대화.jsonl',
  'preprocessed\\CHAT\\141.한국어 멀티세션 대화.jsonl',
  'preprocessed\\CHAT\\NIKL_DIALOGUE_2020_v1.4.zip.jsonl',
  'preprocessed\\CHAT\\NIKL_DIALOGUE_2021_v1.1.zip.jsonl',
  'preprocessed\\CHAT\\NIKL_DIALOGUE_2022_v1.0_JSON.zip.jsonl'],
 'CORPUS': ['preprocessed\\CORPUS\\016.행정 문서 대상 기계독해 데이터.jsonl',
  'preprocessed\\CORPUS\\017.뉴스 기사 기계독해 데이터.jsonl',
  'preprocessed\\CORPUS\\018.논문자료 요약 데이터.jsonl',
  'preprocessed\\CORPUS\\019.법률, 규정 (판결서, 약관 등) 텍스트 분석 데이터.jsonl',
  'preprocessed\\CORPUS\\021.도서자료 기계독해.jsonl',
  'preprocessed\\CORPUS\\022.요약문 및 레포트 생성 데이터.jsonl',
  'preprocessed\\CORPUS\\023.방송 콘텐츠 대본 요약 데이터.jsonl',
  'preprocessed\\CORPUS\\029.대규모 구매도서 기반 한국어 말뭉치 데이터.jsonl',
  'preprocessed\\CORPUS\\030.웹데이터 기반 한국어 말뭉치 데이터.jsonl',
  'preproces

In [13]:
import gc
from tqdm.notebook import tqdm
from datasets import load_dataset, Dataset

datasets_loaded = {}
backslash = '\\'


def _temp_dir_for(task, dataset_path):
    """Return the on-disk Arrow directory used for one JSONL file.

    Args:
        task: Task name, used as the sub-folder under ``temp``.
        dataset_path: Path of the source ``.jsonl`` file; forward slashes and
            backslashes are both accepted as separators.

    Returns:
        ``temp/{task}/{file name}``, where the file name is the last path
        component of ``dataset_path``.
    """
    file_name = dataset_path.split('/')[-1].split(backslash)[-1]
    return f"temp/{task}/{file_name}"


# Pass 1: convert every JSONL file into an Arrow dataset saved under temp/.
# A file is skipped when its target directory already exists, so re-running the
# cell does not repeat finished conversions.
# The loop variables are deliberately not called `datasets` / `dataset`: reusing
# those names would overwrite the global list of file paths and break any cell
# that is re-run afterwards.
# NOTE(review): only the directory's existence is checked; an interrupted
# save_to_disk may leave a partial directory that is then treated as done.
for task, task_paths in tqdm(datasets_by_task.items(), desc='Converting datasets', total=len(datasets_by_task)):
    ds_bar = tqdm(task_paths, desc=task, total=len(task_paths))
    for dataset_path in ds_bar:
        temp_dir = _temp_dir_for(task, dataset_path)
        ds_bar.set_postfix_str(temp_dir)
        if not os.path.exists(temp_dir):
            converted = Dataset.from_json(dataset_path)
            converted.save_to_disk(temp_dir)
            # Drop the reference immediately and collect, to limit memory use
            # between files.
            del converted
            gc.collect()

# Pass 2: load the saved datasets back from disk, grouped by task in the same
# order as the source file lists.
for task, task_paths in tqdm(datasets_by_task.items(), desc='Loading datasets', total=len(datasets_by_task)):
    datasets_loaded[task] = []
    ds_bar = tqdm(task_paths, desc=task, total=len(task_paths))
    for dataset_path in ds_bar:
        temp_dir = _temp_dir_for(task, dataset_path)
        ds_bar.set_postfix_str(temp_dir)
        datasets_loaded[task].append(Dataset.load_from_disk(temp_dir))

Converting datasets:   0%|          | 0/6 [00:00<?, ?it/s]

CHAT:   0%|          | 0/9 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/120097 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/120761 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25572 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14622 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10786 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/184000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2232 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3723 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2129 [00:00<?, ? examples/s]

CORPUS:   0%|          | 0/25 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/197698 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/159968 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/14 shards):   0%|          | 0/678758 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5368 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/146771 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/84364 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

Saving the dataset (0/18 shards):   0%|          | 0/29363383 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4878 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/812992 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/6 shards):   0%|          | 0/28624 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/37 [00:00<?, ?it/s]

Saving the dataset (0/37 shards):   0%|          | 0/2688970 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/331951 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18270 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/96049 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/81127 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/978342 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/580152 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/729280 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/1023431 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

Saving the dataset (0/22 shards):   0%|          | 0/3536491 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/7 shards):   0%|          | 0/10045 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/101844 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/160002 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/325072 [00:00<?, ? examples/s]

CORPUS_SPOKEN:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/497111 [00:00<?, ? examples/s]

DIALECT: 0it [00:00, ?it/s]

TASK_SUMMARIZATION:   0%|          | 0/7 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/15 shards):   0%|          | 0/678774 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/146771 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/84364 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/96049 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/81127 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/160002 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/325072 [00:00<?, ? examples/s]

TASK_TRANSLATION:   0%|          | 0/13 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2267244 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/1200140 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2305332 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/2387510 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1559656 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1560587 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/2322093 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/240414 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/240242 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/241061 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/319997 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/2399935 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1273039 [00:00<?, ? examples/s]

Loading datasets:   0%|          | 0/6 [00:00<?, ?it/s]

CHAT:   0%|          | 0/9 [00:00<?, ?it/s]

CORPUS:   0%|          | 0/25 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/18 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/37 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

CORPUS_SPOKEN:   0%|          | 0/1 [00:00<?, ?it/s]

DIALECT: 0it [00:00, ?it/s]

TASK_SUMMARIZATION:   0%|          | 0/7 [00:00<?, ?it/s]

TASK_TRANSLATION:   0%|          | 0/13 [00:00<?, ?it/s]

In [14]:
# Display the loaded datasets: a dict mapping each task name to its list of Dataset objects.
datasets_loaded

{'CHAT': [Dataset({
      features: ['messages'],
      num_rows: 120097
  }),
  Dataset({
      features: ['messages'],
      num_rows: 120761
  }),
  Dataset({
      features: ['messages'],
      num_rows: 25572
  }),
  Dataset({
      features: ['messages'],
      num_rows: 14622
  }),
  Dataset({
      features: ['messages'],
      num_rows: 10786
  }),
  Dataset({
      features: ['messages'],
      num_rows: 184000
  }),
  Dataset({
      features: ['messages'],
      num_rows: 2232
  }),
  Dataset({
      features: ['messages'],
      num_rows: 3723
  }),
  Dataset({
      features: ['messages'],
      num_rows: 2129
  })],
 'CORPUS': [Dataset({
      features: ['text'],
      num_rows: 197698
  }),
  Dataset({
      features: ['text'],
      num_rows: 159968
  }),
  Dataset({
      features: ['text'],
      num_rows: 678758
  }),
  Dataset({
      features: ['text'],
      num_rows: 16000
  }),
  Dataset({
      features: ['text'],
      num_rows: 5368
  }),
  Dataset({
      f

In [8]:
# Authenticate this kernel with the Hugging Face Hub through the interactive
# login widget (no token is hard-coded in the notebook).
# NOTE(review): presumably required before the private push_to_hub calls later
# in this notebook -- confirm.
import huggingface_hub

huggingface_hub.notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
# Merge each task's datasets into a single dataset and write it to local disk
# (one output directory per task). Tasks without any loaded dataset are skipped.

for task_name, task_datasets in datasets_loaded.items():
    if task_datasets:
        target_dir = f"E:/2024/datasets/{task_name}"
        concatenate_datasets(task_datasets).save_to_disk(target_dir)
    else:
        print(f"Skipping {task_name}")

Saving the dataset (0/3 shards):   0%|          | 0/483922 [00:00<?, ? examples/s]

Saving the dataset (0/125 shards):   0%|          | 0/42159830 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/497111 [00:00<?, ? examples/s]

Skipping DIALECT


Saving the dataset (0/20 shards):   0%|          | 0/1572159 [00:00<?, ? examples/s]

Saving the dataset (0/12 shards):   0%|          | 0/18317250 [00:00<?, ? examples/s]

In [8]:
# Publish a random subset (at most 10k samples) of the CORPUS task to the Hub.

SUBSET_SIZE = 10_000

corpus_full = concatenate_datasets(datasets_loaded["CORPUS"])
# Clamp the sample count to the number of available rows: selecting indices
# beyond the end of the dataset raises IndexError.
subset = corpus_full.shuffle(seed=42).select(range(min(SUBSET_SIZE, len(corpus_full))))

subset.push_to_hub('devngho/korean-corpus', 'corpus-10k', private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/devngho/korean-corpus/commit/975c677a49e628c5fb7e7aa43091e4e38b7a5d1a', commit_message='Upload dataset', commit_description='', oid='975c677a49e628c5fb7e7aa43091e4e38b7a5d1a', pr_url=None, pr_revision=None, pr_num=None)

In [9]:
# Publish a random subset (at most 10k samples) of the DIALECT task to the Hub.
SUBSET_SIZE = 10_000

dialect_parts = datasets_loaded["DIALECT"]
commit_info = None
if dialect_parts:
    dialect_full = concatenate_datasets(dialect_parts)
    # Clamp the sample count to the number of available rows: selecting indices
    # beyond the end of the dataset raises IndexError.
    subset = dialect_full.shuffle(seed=42).select(range(min(SUBSET_SIZE, len(dialect_full))))
    commit_info = subset.push_to_hub('devngho/korean-corpus', 'dialect-10k', private=True)
else:
    # concatenate_datasets raises ValueError on an empty list, so skip the
    # upload when no DIALECT dataset was loaded.
    print("Skipping DIALECT")

# Last expression: shows the CommitInfo after an upload, nothing when skipped.
commit_info

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/devngho/korean-corpus/commit/28acb223364acd661308ef88e97aff5e70dbe455', commit_message='Upload dataset', commit_description='', oid='28acb223364acd661308ef88e97aff5e70dbe455', pr_url=None, pr_revision=None, pr_num=None)