In [3]:
# FILE: 01_migrate_checkpoint.ipynb
import json
import os
import math

# --- CONFIGURATION ---
# !!! EDIT THESE TWO LINES FOR EACH WORKER'S CHECKPOINT YOU MIGRATE !!!
PART_NUMBER = 3
BATCH_SIZE = 1000 # The number of records per batch file. 1000 is a good default.
# -------------------------------------------------------------------

# --- Define Paths ---
# This should be the directory where your MERGED data will eventually go
METADATA_OUTPUT_DIR = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/"
# This is the path to your CURRENT large checkpoint file
EXISTING_CHECKPOINT_FILE = os.path.join(METADATA_OUTPUT_DIR, f"part_{PART_NUMBER}", "song_chars_checkpoint_repaired.json")
# This is the new directory where the small batch files will be created
BATCH_OUTPUT_DIR = os.path.join(METADATA_OUTPUT_DIR, f"part_{PART_NUMBER}", "responses")


def load_checkpoint_records(checkpoint_path):
    """Load a checkpoint JSON file and return its top-level list of records.

    Args:
        checkpoint_path: Path of the JSON checkpoint file to read.

    Returns:
        The parsed top-level JSON array as a Python list.

    Raises:
        TypeError: If the top-level JSON value is not an array. Batching
            relies on list slicing, so any other container cannot be split.
    """
    # Binary mode lets json.load decode the bytes itself (UTF-8/16/32) instead
    # of relying on the platform's locale encoding, which on Windows is usually
    # a legacy code page that mis-decodes non-ASCII text.
    with open(checkpoint_path, "rb") as f:
        records = json.load(f)

    if not isinstance(records, list):
        raise TypeError(
            f"Expected the checkpoint to contain a JSON array of records, "
            f"got {type(records).__name__}: {checkpoint_path}"
        )
    return records


def write_batch_files(records, output_dir, batch_size):
    """Write `records` to `output_dir` as numbered batch_NNNNN.json files.

    Args:
        records: List of JSON-serialisable records to split.
        output_dir: Directory that receives the batch files; created if missing.
        batch_size: Maximum number of records per batch file (must be >= 1).

    Returns:
        The number of batch files written (0 for an empty `records` list).

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    os.makedirs(output_dir, exist_ok=True)

    # Remember which batch files were already present so that leftovers from an
    # earlier run (e.g. one with a smaller batch size) can be reported below.
    preexisting = {
        name for name in os.listdir(output_dir)
        if name.startswith("batch_") and name.endswith(".json")
    }

    num_batches = math.ceil(len(records) / batch_size)
    written = set()
    for i in range(num_batches):
        start_index = i * batch_size
        end_index = start_index + batch_size

        # Format batch number with leading zeros for correct sorting (e.g., batch_00001.json)
        batch_name = f"batch_{i+1:05d}.json"

        with open(os.path.join(output_dir, batch_name), "w", encoding="utf-8") as f:
            json.dump(records[start_index:end_index], f, indent=2)
        written.add(batch_name)

    # Files that existed before and were not overwritten by this run are not
    # part of the current split; they are left in place but reported so they
    # are not mistaken for fresh data later.
    stale = sorted(preexisting - written)
    if stale:
        print(
            f"WARNING: {len(stale)} pre-existing batch file(s) in {output_dir} were not "
            f"written by this run (first: {stale[0]}). Check them before merging."
        )

    return num_batches


print(f"Migrating checkpoint for Worker {PART_NUMBER}...")

if not os.path.exists(EXISTING_CHECKPOINT_FILE):
    print(f"ERROR: Cannot find existing checkpoint file at: {EXISTING_CHECKPOINT_FILE}")
else:
    # Load the entire existing checkpoint file into memory one last time
    print(f"Loading {EXISTING_CHECKPOINT_FILE}...")
    all_responses = load_checkpoint_records(EXISTING_CHECKPOINT_FILE)

    print(f"Loaded {len(all_responses):,} records. Now splitting into batches of {BATCH_SIZE}...")

    # Split the data into chunks and save each chunk as a new batch file
    num_batches = write_batch_files(all_responses, BATCH_OUTPUT_DIR, BATCH_SIZE)

    print(f"Successfully split data into {num_batches} batch files in:\n{BATCH_OUTPUT_DIR}")

Migrating checkpoint for Worker 3...
Loading //bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/part_3\song_chars_checkpoint_repaired.json...
Loaded 25,005 records. Now splitting into batches of 1000...
Successfully split data into 26 batch files in:
//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/part_3\responses
