# Data Collection


In [None]:
import os
import sys
import json
import argparse
from pathlib import Path
from datasets import load_dataset, disable_caching
from huggingface_hub import login

# JSON serializer used for every output line. orjson is optional: it is used
# when importable, otherwise the standard-library ``json`` module is used.
try:
    import orjson
except ImportError:
    def dumps(x):
        """Return ``x`` serialized as a compact, single-line JSON string."""
        # ensure_ascii=False keeps non-ASCII text unescaped, matching what the
        # orjson path produces, so the file does not change shape depending on
        # which backend happens to be installed.
        return json.dumps(x, separators=(",", ":"), ensure_ascii=False)
else:
    def dumps(x):
        """Return ``x`` serialized as a compact, single-line JSON string."""
        # orjson.dumps returns bytes; decode so callers always get ``str``.
        return orjson.dumps(x).decode()


In [None]:
# Load Hugging Face authentication token from environment or .env file
def load_token():
    """Return the Hugging Face token from the environment or a local ``.env``.

    Lookup order:
      1. The ``HUGGINGFACE_HUB_TOKEN`` environment variable, if non-empty.
      2. The first non-empty ``HUGGINGFACE_HUB_TOKEN=...`` assignment in a
         ``.env`` file in the current working directory.

    ``.env`` lines may be indented, may start with ``export``, may have
    spaces around ``=``, and may wrap the value in matching single or double
    quotes. Blank lines and ``#`` comment lines are skipped.

    If no token is found, a hint is written to stderr and the process exits
    with status 1 (``SystemExit``).
    """
    key = "HUGGINGFACE_HUB_TOKEN"

    token = os.getenv(key)
    if token:
        return token

    env_file = Path(".env")
    if env_file.is_file():
        # utf-8-sig transparently drops a BOM, which would otherwise be glued
        # to the first variable name and make it fail to match.
        for raw in env_file.read_text(encoding="utf-8-sig").splitlines():
            entry = raw.strip()
            if not entry or entry.startswith("#"):
                continue
            if entry.startswith("export "):
                entry = entry[len("export "):]
            name, sep, value = entry.partition("=")
            if not sep or name.strip() != key:
                continue
            value = value.strip()
            # Strip one pair of matching surrounding quotes: KEY="value".
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            # An empty assignment is treated as "not set" instead of being
            # returned as an empty token.
            if value:
                return value

    sys.stderr.write("Set HUGGINGFACE_HUB_TOKEN or add it to .env\n")
    sys.exit(1)


In [None]:
# Display progress bar for data download tracking
def progress(i, n):
    """Redraw a one-line text progress bar on stdout.

    Args:
        i: Number of items processed so far.
        n: Total number of items expected. If falsy (0 or None) nothing is
            printed, which also avoids dividing by zero.

    The line starts with a carriage return and has no trailing newline, so
    successive calls overwrite each other on a terminal.
    """
    if not n:
        return
    width = 40
    # Clamp the filled portion so the bar is always exactly `width` characters
    # wide, even when i overshoots n or is negative. The numeric counters
    # below are left unclamped so they still show the real values.
    filled = min(max(int(width * i / n), 0), width)
    bar = "=" * filled + "-" * (width - filled)
    sys.stdout.write(f"\r[{bar}] {i:,}/{n:,} ({100*i/n:4.1f}%)")
    sys.stdout.flush()


In [None]:
# Stream and save dataset samples to JSONL file with progress tracking
def stream(sample_size, out_path, split, step):
    """Stream up to ``sample_size`` examples of lmsys-chat-1m into a JSONL file.

    Args:
        sample_size: Maximum number of examples to write. Values <= 0 write
            nothing and leave an empty output file.
        out_path: ``pathlib.Path`` of the output file; missing parent
            directories are created and an existing file is overwritten.
        split: Dataset split name passed to ``load_dataset``.
        step: Redraw the progress bar every ``step`` written examples.
            Values <= 0 disable the intermediate updates.

    Returns:
        The number of examples actually written. This is smaller than
        ``sample_size`` when the stream is exhausted first.
    """
    # Function-scope import so this definition does not rely on the import cell.
    from itertools import islice

    disable_caching()
    login(token=load_token())
    ds = load_dataset("lmsys/lmsys-chat-1m", split=split, streaming=True)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    limit = max(sample_size, 0)
    wrote = 0
    with open(out_path, "w", encoding="utf-8", buffering=1_048_576) as f:
        # islice stops after `limit` items without pulling an extra example
        # from the stream, and yields nothing at all when limit is 0.
        for wrote, ex in enumerate(islice(ds, limit), 1):
            f.write(dumps(ex) + "\n")
            if step > 0 and wrote % step == 0:
                progress(wrote, limit)

    # Report what was really written rather than assuming the target was hit.
    progress(wrote, limit)
    if limit:
        # The bar is drawn with "\r" and no newline; end the line so later
        # output does not get appended to it.
        sys.stdout.write("\n")
        sys.stdout.flush()
    return wrote


In [None]:
# Configure data collection parameters and execute download
# Which split to read and how many examples to pull from it.
split = "train"
samples = 10 ** 6

# Destination JSONL file, and how often (in examples) to redraw the bar.
output_path = Path("data") / "raw_conversations.jsonl"
progress_every = 50_000

# Run the download; `n` holds the value returned by stream().
n = stream(
    sample_size=samples,
    out_path=output_path,
    split=split,
    step=progress_every,
)
