	# Author: Alexander Staub
	## Last changed: 2025.06.26
	## Purpose: RUN FIRST - devide the relevant dataset into N parts (3 at the moment)


In [8]:
#installing packages
import time
import requests
import logging
import pandas as pd
from pprint import pprint
from datetime import datetime
import json
import os
import numpy as np

In [None]:
# --- CONFIGURATION ---

NUM_WORKERS = 3  # The number of parallel scripts you want to run

# ---  Define the path to your sample dataset ---
#MASTER_INPUT_FILE = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/chartmetric_ids_sample.csv"

# --- CHANGE: Define the path to your full datasets of musicbrainz and charts ---
MASTER_INPUT_FILE_MB = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/chartmetric_ids_mb_matched.csv"
MASTER_INPUT_FILE_CHARTS = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/chartmetric_ids_chart_songs_matched.csv"

# --- Define a new directory where the split input files will be saved ---
WORKER_INPUT_DIR = "//bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/worker_inputs/"

### WHAT THIS CHANGE DOES:
# This separates the logic. This notebook acts as the main setup script.
# It defines a new, dedicated folder to hold the input files for each worker, keeping your project organized.

print(f"Creating worker input directory: {WORKER_INPUT_DIR}")
os.makedirs(WORKER_INPUT_DIR, exist_ok=True)

Creating worker input directory: //bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/worker_inputs/


In [None]:
# Create a file called MASTER_INPUT_FILE which row binds the two input files

# load the two input files
df_mb = pd.read_csv(MASTER_INPUT_FILE_MB)
df_charts = pd.read_csv(MASTER_INPUT_FILE_CHARTS)

# remove the columns in df_charts that are not in df_mb
df_charts = df_charts[df_mb.columns]

#remove the columns in df_mb that are not in df_charts
df_mb = df_mb[df_charts.columns]

#row bind the two dataframes
df_combined = pd.concat([df_mb, df_charts], ignore_index=True)



In [None]:
# --- Load and Prepare the Master Dataset ---
print(f"Loading master dataset from {MASTER_INPUT_FILE}...")
master_df = df_combined

Loading master dataset from //bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/chartmetric_ids_sample.csv...


In [6]:
# Clean the data ONCE before splitting
master_df = master_df.drop_duplicates(subset="chartmetric_ids")
master_df.dropna(subset=['chartmetric_ids'], inplace=True)
master_df.reset_index(drop=True, inplace=True)

print(f"Loaded and cleaned {len(master_df)} unique IDs.")

Loaded and cleaned 4573 unique IDs.


In [9]:
# --- Split the DataFrame into Chunks for Each Worker ---
id_chunks = np.array_split(master_df, NUM_WORKERS)

  return bound(*args, **kwds)


In [10]:
# --- Save Each Chunk to its Own File ---
for i, chunk in enumerate(id_chunks):
    part_number = i + 1
    # --- The output path is now dynamic for each worker part. ---
    output_path = os.path.join(WORKER_INPUT_DIR, f"ids_part_{part_number}.csv")
    
    ### WHAT THIS CHANGE DOES:
    # It creates separate, numbered input files (e.g., ids_part_1.csv, ids_part_2.csv).
    # Each file contains a unique and non-overlapping subset of the original IDs.
    
    chunk.to_csv(output_path, index=False)
    print(f"Saved chunk {part_number} with {len(chunk)} IDs to {output_path}")

print("\nController script finished. You can now run the worker notebooks.")

Saved chunk 1 with 1525 IDs to //bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/worker_inputs/ids_part_1.csv
Saved chunk 2 with 1524 IDs to //bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/worker_inputs/ids_part_2.csv
Saved chunk 3 with 1524 IDs to //bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/worker_inputs/ids_part_3.csv

Controller script finished. You can now run the worker notebooks.
