In [2]:
# 100 K 샘플로 무작위 추출
from datasets import load_dataset
import json
import random
from pathlib import Path

# 🔧 사용할 한국어 데이터셋
datasets_config = {
    "koalpaca": {
        "path": "beomi/KoAlpaca-v1.1a",
        "split": "train"
    },
    "openorca": {
        "path": "kyujinpy/KOR-OpenOrca-Platypus-v3",
        "split": "train"
    },
    "kullm": {
        "path": "nlpai-lab/kullm-v2",
        "split": "train"
    },
    "sharegpt": {
        "path": "FreedomIntelligence/sharegpt-korean",
        "split": "train"
    }
}

merged_data = []
all_valid_entries = []

# ✅ 변환 함수 (KoAlpaca 포맷 통일)
def convert_to_koalpaca_format(name, entry):
    if name == "sharegpt":
        conversations = entry.get("conversations", [])
        if len(conversations) >= 2:
            return {
                "instruction": conversations[0].get("value", "").strip(),
                "input": "",
                "output": conversations[1].get("value", "").strip()
            }
    else:
        instruction = entry.get("instruction") or entry.get("prompt", "")
        output = entry.get("output") or entry.get("completion", "")
        if instruction and output:
            return {
                "instruction": instruction.strip(),
                "input": "",
                "output": output.strip()
            }
    return None

# ✅ 모든 유효 데이터 모으기
for name, config in datasets_config.items():
    print(f"🔄 Loading: {name} from {config['path']}")
    dataset = load_dataset(config["path"], split=config["split"])
    dataset = dataset.shuffle(seed=42)

    valid = []
    for entry in dataset:
        item = convert_to_koalpaca_format(name, entry)
        if item:
            valid.append(item)

    print(f"✅ {name} usable: {len(valid)}")
    all_valid_entries.extend(valid)

# ✅ 100K로 무작위 추출
random.seed(42)
final_dataset = random.sample(all_valid_entries, min(100_000, len(all_valid_entries)))

# 💾 저장
output_path = Path("../data/merged_ko_instruction_100k.jsonl")
with open(output_path, "w", encoding="utf-8") as f:
    for item in final_dataset:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ 최종 저장 완료: {output_path} (총 샘플 수: {len(final_dataset)})")


🔄 Loading: koalpaca from beomi/KoAlpaca-v1.1a
✅ koalpaca usable: 21155
🔄 Loading: openorca from kyujinpy/KOR-OpenOrca-Platypus-v3
✅ openorca usable: 34212
🔄 Loading: kullm from nlpai-lab/kullm-v2
✅ kullm usable: 151346
🔄 Loading: sharegpt from FreedomIntelligence/sharegpt-korean
✅ sharegpt usable: 6011
✅ 최종 저장 완료: ..\data\merged_ko_instruction_100k.jsonl (총 샘플 수: 100000)


In [3]:
import os

file_path = "../data/merged_ko_instruction_100k.jsonl"

if os.path.exists(file_path):
    size_mb = os.path.getsize(file_path) / (1024 * 1024)
    print(f"📦 파일 용량: {size_mb:.2f} MB")
else:
    print("❌ 파일이 존재하지 않습니다.")


📦 파일 용량: 154.54 MB


In [4]:
file_path = "../data/merged_ko_instruction_100k.jsonl"

with open(file_path, "r", encoding="utf-8") as f:
    line_count = sum(1 for _ in f)

print(f"📊 총 샘플 수: {line_count}")


📊 총 샘플 수: 100000


In [None]:
jsonl_path = "../data/merged_ko_instruction_100k.jsonl"
raw_dataset = load_dataset("json", data_files=jsonl_path, split="train")
split_dataset = raw_dataset.train_test_split(test_size=0.05, seed=42)
train_raw = split_dataset["train"]
val_raw = split_dataset["test"]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
from datasets import DatasetDict

# 2. 로컬 JSONL 파일 로드 및 분할
jsonl_path = "../data/merged_ko_instruction_100k.jsonl"
raw_dataset = load_dataset("json", data_files=jsonl_path, split="train")
split_dataset = raw_dataset.train_test_split(test_size=0.05, seed=42)

# 3. DatasetDict로 묶기
dataset_dict = DatasetDict({
    "train": split_dataset["train"],
    "validation": split_dataset["test"]
})

# 4. 데이터셋 푸시 (dataset repo가 없다면 자동 생성됨)
dataset_dict.push_to_hub("jwlee-ai/ko-instruct-mix-100k")


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/95 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jwlee-ai/ko-instruct-100k-merged/commit/79217bd31e6cc0ed3c40635e8722579a606f23fc', commit_message='Upload dataset', commit_description='', oid='79217bd31e6cc0ed3c40635e8722579a606f23fc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jwlee-ai/ko-instruct-100k-merged', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jwlee-ai/ko-instruct-100k-merged'), pr_revision=None, pr_num=None)