In [1]:
%env HF_DATASETS_CACHE=E:\2024/hf_cache

env: HF_DATASETS_CACHE=E:\2024/hf_cache


In [5]:
from datasets import load_from_disk

tasks = ['corpus', 'corpus_spoken']

corpus = {k: load_from_disk(f"E:\\2024/datasets/{k.upper()}") for k in tasks}

Loading dataset from disk:   0%|          | 0/125 [00:00<?, ?it/s]

In [6]:
# 1. exact deduplication
def dedup_exact(dataset, column):
    mem = set()
    
    def is_dup(x):
        text_hash = hash(x)
        if text_hash in mem:
            return True
        mem.add(text_hash)
        return False
    
    return dataset.filter(lambda x: not is_dup(x[column]))

corpus_dedup = {}

for task in tasks:
    corpus_dedup[task] = dedup_exact(corpus[task], 'text')

In [7]:
for task in tasks:
    print(f"Before deduplication: {task} - {len(corpus[task])}")
    print(f"After deduplication: {task} - {len(corpus_dedup[task])} ({len(corpus[task]) - len(corpus_dedup[task])} duplicates removed)")

Before deduplication: corpus - 42159830
After deduplication: corpus - 35528369 (6631461 duplicates removed)
Before deduplication: corpus_spoken - 497111
After deduplication: corpus_spoken - 458878 (38233 duplicates removed)


In [9]:
def length_filter(dataset, column, min_length=100, min_sentences=5):
    def check_length(x):
        sentences = len(x[column].split('.'))
        return sentences >= min_sentences and len(x[column]) >= min_length
    
    return dataset.filter(check_length, num_proc=32)

corpus_dedup_length = {}

for task in tasks:
    if task == 'corpus':
        corpus_dedup_length[task] = length_filter(corpus_dedup[task], 'text', 100, 5)
    else:
        corpus_dedup_length[task] = corpus_dedup[task]

Filter (num_proc=32):   0%|          | 0/35528369 [00:00<?, ? examples/s]

In [8]:
import re

def remove_email_dataset(dataset, column):
    email_regex = re.compile(r'[a-zA-Z0-9._%+-]+@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})?', re.MULTILINE)
    
    return dataset.map(lambda x: {column: email_regex.sub('', x[column]).strip()}, num_proc=32)

corpus_dedup_email_length = {}
corpus_dedup_email_length_rows = {}

for task in tasks:
    corpus_dedup_email_length[task] = remove_email_dataset(corpus_dedup_length[task], 'text')

Map (num_proc=32):   0%|          | 0/35528369 [00:00<?, ? examples/s]

Map (num_proc=32):   0%|          | 0/458878 [00:00<?, ? examples/s]

In [10]:
print(f"Before length filtering: corpus - {len(corpus_dedup['corpus'])}")
print(f"After length filtering: corpus - {len(corpus_dedup_email_length['corpus'])} ({len(corpus_dedup['corpus']) - len(corpus_dedup_email_length['corpus'])} removed)")

Before length filtering: corpus - 35528369
After length filtering: corpus - 16853440 (18674929 removed)


In [11]:
corpus_dedup_email_length['corpus'].save_to_disk("E:\\2024/datasets_filtered/CORPUS")
corpus_dedup_email_length['corpus_spoken'].save_to_disk("E:\\2024/datasets_filtered/CORPUS_SPOKEN")

Saving the dataset (0/57 shards):   0%|          | 0/16853440 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/458878 [00:00<?, ? examples/s]

In [12]:
!pip install text-dedup

Collecting text-dedup
  Obtaining dependency information for text-dedup from https://files.pythonhosted.org/packages/5d/64/c16c52acc18381387982b6fa8647382b7f3f6f92fe93b61cc0f5b5fdfd74/text_dedup-0.4.0-py3-none-any.whl.metadata
  Using cached text_dedup-0.4.0-py3-none-any.whl.metadata (19 kB)
Collecting bitarray>=2.6.2 (from text-dedup)
  Obtaining dependency information for bitarray>=2.6.2 from https://files.pythonhosted.org/packages/8c/75/e921ada57bb0bcece5eb515927c031f0bc828f702b8f213639358d9df396/bitarray-3.0.0-cp311-cp311-win_amd64.whl.metadata
  Downloading bitarray-3.0.0-cp311-cp311-win_amd64.whl.metadata (33 kB)
Collecting click-option-group<0.6.0,>=0.5.6 (from text-dedup)
  Obtaining dependency information for click-option-group<0.6.0,>=0.5.6 from https://files.pythonhosted.org/packages/af/75/81ea958bc0f7e410257cb2a42531b93a7695a31930cde87192c010a52c50/click_option_group-0.5.6-py3-none-any.whl.metadata
  Using cached click_option_group-0.5.6-py3-none-any.whl.metadata (8.3 kB)
C

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'd:\\dev\\korean-datasets\\venv3\\lib\\site-packages\\numpy\\linalg\\_umath_linalg.cp311-win_amd64.pyd'
Check the permissions.


[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## launch in wsl or linux

In [1]:
# run dedup_by_cluster.sh to dedup corpus. for me, it took 1h 23m and its cost was about $10. <=10% files were deduplicated.

from datasets import load_from_disk

corpus_ds = load_from_disk("/mnt/e/2024/datasets_filtered/CORPUS")

corpus_ds.save_to_disk('/home/devngho/CORPUS')

corpus_ds_wsl = load_from_disk('/home/devngho/CORPUS')

corpus_ds_wsl.to_parquet("gs://devngho-dedup-clusters/input", storage_options={"project": input("project"), "token": '\\\\wsl.localhost\\Ubuntu\\home\\devngho\\.config\\gcloud\\application_default_credentials.json'}, compression="NONE", batch_size=10000)

Loading dataset from disk:   0%|          | 0/57 [00:00<?, ?it/s]

ValueError: Invalid gcloud credentials

In [16]:
!python -m text_dedup.minhash --path "/mnt/e/2024/datasets_filtered/CORPUS_SPOKEN" --local --split "train" --output "/mnt/e/2024/datasets_dedup/CORPUS_SPOKEN" --column "text" --batch_size 10000

D:\dev\korean-datasets\venv3\Scripts\python.exe: Error while finding module specification for 'text_dedup.minhash' (ModuleNotFoundError: No module named 'text_dedup')


## download

In [1]:
import gcsfs

project = input("project")

fs = gcsfs.GCSFileSystem(project, token="\\\\wsl.localhost\\Ubuntu\\home\\devngho\\.config\\gcloud\\application_default_credentials.json")

files = fs.listdir("gs://devngho-dedup-clusters/output")

In [2]:
from datasets import load_dataset
from tqdm.notebook import tqdm

datasets = []
files = list(filter(lambda x: x['storageClass'] == 'DIRECTORY', files))

for file in tqdm(files, total=len(files), desc="Downloading deduplicated datasets"):
    directory = fs.listdir(f"gs://{file['name']}")
    parquet = next(filter(lambda x: x['name'].endswith('.parquet'), directory))
    dataset = load_dataset('parquet', data_files={"train": f"gs://{parquet['name']}"}, split="train", storage_options={"project": project, "token": "\\\\wsl.localhost\\Ubuntu\\home\\devngho\\.config\\gcloud\\application_default_credentials.json"})
    datasets.append(dataset)

Downloading deduplicated datasets:   0%|          | 0/256 [00:00<?, ?it/s]

In [6]:
from datasets import concatenate_datasets

dataset_dedup = concatenate_datasets(datasets)
print(dataset_dedup)

Dataset({
    features: ['__id__', 'text'],
    num_rows: 16048255
})


In [9]:
from datasets import load_from_disk

before = load_from_disk("E:\\2024/datasets_filtered/CORPUS")
after = dataset_dedup

print(f"Before deduplication: {len(before)}")
print(f"After deduplication: {len(after)} ({len(before) - len(after)}({(len(before) - len(after)) / len(before) * 100:.2f}%) duplicates removed)")

Loading dataset from disk:   0%|          | 0/57 [00:00<?, ?it/s]

Before deduplication: 16853440
After deduplication: 16048255 (805185(4.78%) duplicates removed)


In [10]:
dataset_dedup.save_to_disk("E:\\2024/datasets_dedup/CORPUS")

Saving the dataset (0/104 shards):   0%|          | 0/16048255 [00:00<?, ? examples/s]

## upload

In [1]:
from datasets import load_from_disk

tasks = ['corpus', 'corpus_spoken']
# tasks = ['corpus_spoken']

corpus_dedup = {k: load_from_disk(f"E:\\2024/datasets_dedup/{k.upper()}") for k in tasks}

Loading dataset from disk:   0%|          | 0/104 [00:00<?, ?it/s]

In [12]:
corpus_dedup['corpus'].push_to_hub('devngho/korean-corpus', private=True)

Uploading the dataset shards:   0%|          | 0/104 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/devngho/korean-corpus/commit/6006c0b6212beb384688026a6b3985bada0f663f', commit_message='Upload dataset (part 00002-of-00003)', commit_description='', oid='6006c0b6212beb384688026a6b3985bada0f663f', pr_url=None, pr_revision=None, pr_num=None)

In [6]:
corpus_dedup['corpus_spoken'].push_to_hub('devngho/korean-corpus-spoken', private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/372 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/devngho/korean-corpus-spoken/commit/dee5e7ba591af8f755666ed3eca7eb679911df6b', commit_message='Upload dataset', commit_description='', oid='dee5e7ba591af8f755666ed3eca7eb679911df6b', pr_url=None, pr_revision=None, pr_num=None)

## Shuffle

In [2]:
corpus_shuffled = {k: v.shuffle(seed=42).flatten_indices(num_proc=4) for k, v in corpus_dedup.items()}

Flattening the indices (num_proc=4):   0%|          | 0/16048255 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/371895 [00:00<?, ? examples/s]

In [3]:
for task in tasks:
    corpus_shuffled[task].save_to_disk(f"E:\\2024/datasets_final/{task.upper()}")

Saving the dataset (0/104 shards):   0%|          | 0/16048255 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/371895 [00:00<?, ? examples/s]

In [4]:
corpus_shuffled['corpus'].push_to_hub('devngho/korean-corpus-shuffled', private=True)
corpus_shuffled['corpus_spoken'].push_to_hub('devngho/korean-corpus-spoken-shuffled', private=True)

Uploading the dataset shards:   0%|          | 0/104 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/155 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/372 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/devngho/korean-corpus-spoken-shuffled/commit/b5a7c060185d30cae6381b9738247eb8bca466d5', commit_message='Upload dataset', commit_description='', oid='b5a7c060185d30cae6381b9738247eb8bca466d5', pr_url=None, pr_revision=None, pr_num=None)