In [None]:
import numpy as np
import joblib
import os
import json

# Paths
feature_dir = "/kaggle/input/features-ntt123/features"  # Directory containing feature batches (.npz)
kmeans_model_path = "/kaggle/input/in4remodel/kmeans_model.joblib"  # Path to the trained k-means model
output_path = "/kaggle/working/units.json"  # Output file: one JSON object per line

# Load the trained k-means model
kmeans = joblib.load(kmeans_model_path)

# os.listdir() returns entries in arbitrary order. Sort the batch files so the
# order of the records written below is the same on every run; anything that
# later slices the output by position then sees a stable ordering.
feature_files = sorted(
    os.path.join(feature_dir, f) for f in os.listdir(feature_dir) if f.endswith(".npz")
)

# One record per utterance: {"audio": path, "duration": duration, "codes": "u1 u2 ..."}
unit_data = []

for feature_file in feature_files:
    # NOTE(review): allow_pickle=True unpickles arbitrary objects from the archive;
    # only use it on feature files you produced yourself.
    # The `with` block closes the underlying zip handle once the batch is consumed.
    with np.load(feature_file, allow_pickle=True) as batch_data:
        for key in batch_data:
            # Each archive member is unwrapped with .item() and then indexed by
            # string keys -- presumably a 0-d object array holding a dict.
            entry = batch_data[key].item()
            features = entry["features"]
            file_path = entry["path"]
            duration = entry["duration"]

            # Map every feature row to the index of its nearest k-means centroid
            units = kmeans.predict(features)

            unit_data.append({
                "audio": file_path,
                "duration": duration,
                "codes": " ".join(map(str, units)),
            })

    print(f"Processed file {feature_file}")

# Save the result: one JSON document per line (JSON Lines), despite the .json suffix
with open(output_path, "w") as json_file:
    for entry in unit_data:
        json_file.write(json.dumps(entry) + "\n")

print(f"Unit data saved to {output_path}")

Processed file /kaggle/input/features-ntt123/features/features_batch_1.npz
Processed file /kaggle/input/features-ntt123/features/features_batch_15.npz
Processed file /kaggle/input/features-ntt123/features/features_batch_6.npz
Processed file /kaggle/input/features-ntt123/features/features_batch_9.npz
Processed file /kaggle/input/features-ntt123/features/features_batch_12.npz
Processed file /kaggle/input/features-ntt123/features/features_batch_3.npz
Processed file /kaggle/input/features-ntt123/features/features_batch_7.npz
Processed file /kaggle/input/features-ntt123/features/features_batch_2.npz
Processed file /kaggle/input/features-ntt123/features/features_batch_13.npz
Processed file /kaggle/input/features-ntt123/features/features_batch_8.npz
Processed file /kaggle/input/features-ntt123/features/features_batch_14.npz
Processed file /kaggle/input/features-ntt123/features/features_batch_10.npz
Processed file /kaggle/input/features-ntt123/features/features_batch_5.npz
Processed file /kagg

# Creating splits

In [None]:
import json
import math

# Locations of the unit manifest and of the two split files to produce
input_path = "/kaggle/input/in4remodel/units.json"
train_output_path = "/kaggle/working/train.txt"
val_output_path = "/kaggle/working/val.txt"

# Read every manifest record into memory, line terminators included
with open(input_path, "r") as json_file:
    lines = list(json_file)

# The leading 80% of the records (rounded down) form the training split;
# whatever is left becomes the validation split. Order is preserved, no shuffling.
total_lines = len(lines)
train_size = math.floor(total_lines * 0.8)
train_lines, val_lines = lines[:train_size], lines[train_size:]

# Write the training split first, then the validation split
for destination, subset in (
    (train_output_path, train_lines),
    (val_output_path, val_lines),
):
    with open(destination, "w") as split_file:
        split_file.writelines(subset)

print(f"Training data saved to {train_output_path} ({len(train_lines)} lines)")
print(f"Validation data saved to {val_output_path} ({len(val_lines)} lines)")

Training data saved to /kaggle/working/train.txt (11868 lines)
Validation data saved to /kaggle/working/val.txt (2967 lines)
