In [None]:
!pip install --upgrade numpy



In [None]:
import numpy as np
# Report the NumPy version that the running kernel actually imported.
print(f"NumPy version THIS session is using: {np.__version__}")
import os
from google.colab import drive
import zipfile

# Mount Google Drive; the dataset path below lives under this mount point.
drive.mount('/content/drive')

save_path = '/content/drive/MyDrive/AAI511_ML/preprocessed_composer_data_v2.npz'

print(f"Inspecting arrays inside: {save_path}\n")

# The archive is opened as a zip file and only the header of each '.npy' member
# is parsed, so shape and dtype are reported without reading the array data.
NPY_SUFFIX = '.npy'

# Header parser to use for each supported major version of the .npy format.
header_readers = {
    1: np.lib.format.read_array_header_1_0,
    2: np.lib.format.read_array_header_2_0,
}

try:
    with zipfile.ZipFile(save_path) as zf:
        for name in zf.namelist():
            if not name.endswith(NPY_SUFFIX):
                continue

            # Strip only the trailing extension.  str.replace('.npy', '') would
            # also delete any '.npy' that appears inside the array name itself.
            array_name = name[:-len(NPY_SUFFIX)]

            with zf.open(name) as fp:
                # The magic string carries the (major, minor) format version.
                version = np.lib.format.read_magic(fp)

                read_header = header_readers.get(version[0])
                if read_header is None:
                    print(f"--> Array: '{array_name}' has an unsupported format version: {version}")
                    continue

                shape, fortran_order, dtype = read_header(fp)

            print(f"--> Array: '{array_name}'")
            print(f"    Shape: {shape}")
            print(f"    Data Type (dtype): {dtype}")
            print("-" * 20)

except Exception as e:
    # Best-effort inspection: report the problem instead of aborting the cell.
    print(f"An error occurred while inspecting the file: {e}")

NumPy version THIS session is using: 2.0.2
Mounted at /content/drive
Inspecting arrays inside: /content/drive/MyDrive/AAI511_ML/preprocessed_composer_data_v2.npz

--> Array: 'network_input'
    Shape: (6742025, 100, 1)
    Data Type (dtype): float64
--------------------
--> Array: 'network_output'
    Shape: (6742025,)
    Data Type (dtype): int64
--------------------


In [None]:
import tensorflow as tf


def create_tf_dataset(file_path, batch_size=32, cache=True):
    """Build a batched ``tf.data`` pipeline from a preprocessed ``.npz`` archive.

    The archive must contain the arrays ``network_input`` and ``network_output``.
    Both arrays are read fully into memory: ``np.load`` does not apply
    ``mmap_mode`` to ``.npz`` archives, so memory mapping is not available for
    this file format.

    Args:
        file_path: Path to the ``.npz`` archive.
        batch_size: Number of examples per batch.
        cache: If True (default), cache the batched elements in memory after
            the first pass over the dataset.  The source arrays are already
            held in memory, so this keeps an additional copy; pass False when
            RAM is tight.

    Returns:
        A ``(dataset, data)`` tuple.  ``dataset`` yields
        ``(network_input_batch, network_output_batch)`` pairs.  ``data`` is the
        object returned by ``np.load``; the dataset is built from the arrays
        extracted here and does not read from ``data`` afterwards.  Note that
        every ``data[key]`` access on an ``.npz`` archive reads that array from
        disk again.

    Raises:
        KeyError: If either expected array is missing from the archive.
    """
    # mmap_mode is intentionally not passed: it has no effect on .npz archives.
    data = np.load(file_path)
    network_input = data['network_input']
    network_output = data['network_output']

    dataset = tf.data.Dataset.from_tensor_slices((network_input, network_output))
    dataset = dataset.batch(batch_size)

    if cache:
        # Cache before prefetching so later epochs are served from the cache
        # while prefetch still overlaps delivery with the training step.
        dataset = dataset.cache()

    # prefetch must be the last stage of the pipeline to be effective.
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    return dataset, data

# Build the training pipeline from the preprocessed archive, if it exists.
# The batch size is defined once so the pipeline and the status message agree.
BATCH_SIZE = 32

if os.path.exists(save_path):
    print(f"Loading preprocessed data from {save_path}...")

    # Create the TensorFlow dataset (this replaces the old loading method).
    train_dataset, data_file = create_tf_dataset(save_path, batch_size=BATCH_SIZE)

    # Each key access on the loaded archive reads the full array, so fetch each
    # array exactly once and reuse these variables for shape/dtype reporting.
    network_input = data_file['network_input']
    network_output = data_file['network_output']

    print("Data loaded successfully!")
    print(f"Network input shape: {network_input.shape}")
    print(f"Network output shape: {network_output.shape}")
    print(f"Network input dtype: {network_input.dtype}")
    print(f"Network output dtype: {network_output.dtype}")

    print(f"TensorFlow dataset created with batch size: {BATCH_SIZE}")
    print(f"Number of batches: {len(train_dataset)}")

    # Only give usage guidance when train_dataset was actually created.
    print("\n✓ Use 'train_dataset' in your model.fit() instead of network_input/network_output")
    print("Example: model.fit(train_dataset, epochs=10)")

else:
    print(f"File not found at {save_path}")
    print("Please check the file path or re-run your preprocessing code.")

Loading preprocessed data from /content/drive/MyDrive/AAI511_ML/preprocessed_composer_data_v2.npz...
Data loaded successfully!
Network input shape: (6742025, 100, 1)
Network output shape: (6742025,)
Network input dtype: float64
Network output dtype: int64
TensorFlow dataset created with batch size: 64
Number of batches: 210689

✓ Use 'train_dataset' in your model.fit() instead of network_input/network_output
Example: model.fit(train_dataset, epochs=10)
