In [None]:
import os
import json
import glob
import gzip

In [None]:
# Directory (relative to the working directory) that holds the raw downloaded JSON chunks.
# normpath turns the "/" into the platform's native separator.
path_download = os.path.normpath("data/download")

In [None]:
# Collect the distinct "<part0>_<part1>" prefixes shared by the downloaded chunk files.
#
# Only "*.json" names with at least three "_"-separated parts are considered, because
# the merge step looks chunks up as "<prefix>_*.json". Any other directory entry
# (hidden files, stray artifacts, names without "_") is skipped instead of raising
# IndexError or yielding a prefix that can never match a file.
file_names = os.listdir(path_download)
file_names_prefix = sorted(
    {
        "_".join(parts[:2])
        for parts in (file_name.split("_") for file_name in file_names)
        if len(parts) >= 3 and parts[-1].endswith(".json")
    }
)  # sorted -> deterministic processing order across runs

In [None]:
# Merge every group of "<prefix>_*.json" chunk files into a single gzipped NDJSON file
# named "<prefix>_<number of chunks>.ndjson.gz" under data/output.
path_output = os.path.join("data", "output")
os.makedirs(path_output, exist_ok=True)


def _chunk_sort_key(json_path):
    """Sort key that orders chunk files by their trailing numeric suffix.

    A plain lexicographic sort puts ``..._10.json`` before ``..._2.json``; this key
    compares the last ``_``-separated part of the file stem as an integer instead.
    Files whose suffix is not a decimal number sort after the numbered ones, by path.
    """
    stem = os.path.splitext(os.path.basename(json_path))[0]
    suffix = stem.rsplit("_", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), json_path)
    return (1, 0, json_path)


total_processed = 0

for file_name_prefix in file_names_prefix:
    print(f"Processing: {file_name_prefix}")

    # Escape the literal part so characters such as "[" or "?" in a directory or
    # file name are matched verbatim rather than interpreted as glob wildcards.
    pattern = glob.escape(os.path.join(path_download, file_name_prefix)) + "_*.json"
    json_files = sorted(glob.glob(pattern), key=_chunk_sort_key)

    if not json_files:
        print("  ⚠ No files found")
        continue

    print(f"  Found {len(json_files)} file(s)")

    output_file = os.path.join(
        path_output, f"{file_name_prefix}_{len(json_files)}.ndjson.gz"
    )
    row_count = 0

    with gzip.open(output_file, "wt", encoding="utf-8") as gz_file:
        for json_file in json_files:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            # A top-level list contributes one output row per element;
            # any other JSON value is written as a single row.
            items = data if isinstance(data, list) else [data]
            for item in items:
                gz_file.write(json.dumps(item, ensure_ascii=False) + "\n")
            row_count += len(items)

    total_processed += 1
    print(f"  → Saved: {output_file} (rows: {row_count:,})")

print(f"\n✓ Complete! Total prefixes processed: {total_processed}")

In [None]:
import polars as pl

# Smoke check: read one of the merged archives back to confirm it parses as NDJSON.
sample_archive = os.path.join(path_output, "2024-11-16_2025-02-13_373.ndjson.gz")
test_df = pl.read_ndjson(sample_archive)
test_df