<a href="https://colab.research.google.com/gist/gyacynuk/735711c82835549f000f82a7271ea06d/csc412-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only).  The dataset paths used
# later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
try:
  import pypianoroll
except ModuleNotFoundError:
  !pip install pypianoroll
  import pypianoroll
from scipy.sparse import csc_matrix
import json
from pathlib import Path
import torch
from torch.utils import data as Data
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset

In [None]:
# Borrowed code from https://github.com/salu133445/lakh-pianoroll-dataset/blob/main/src/pypianoroll/multitrack.py

def load(filepath):
    """
    Read a multitrack pianoroll back from a .npz archive.

    Only archives previously written by
    :meth:`pypianoroll.Multitrack.save` are supported.

    Parameters
    ----------
    filepath : str
        Location of the .npz file to read.

    Returns
    -------
    pypianoroll.Multitrack
        Multitrack built from the archive's name, beat resolution, tempo,
        optional downbeat array, and one StandardTrack per numbered entry
        in 'info.json'.

    Raises
    ------
    ValueError
        If the archive does not contain an 'info.json' entry.
    """
    def dense_from_csc(archive, prefix):
        """
        Rebuild the dense array whose scipy.sparse.csc_matrix components
        are stored in `archive` under keys that start with `prefix`.
        """
        components = (
            archive[prefix + '_csc_data'],
            archive[prefix + '_csc_indices'],
            archive[prefix + '_csc_indptr'],
        )
        sparse = csc_matrix(components, shape=archive[prefix + '_csc_shape'])
        return sparse.toarray()

    with np.load(filepath) as archive:
        if 'info.json' not in archive:
            raise ValueError("Cannot find 'info.json' in the .npz file")

        # 'info.json' is stored as raw bytes inside the archive.
        metadata = json.loads(archive['info.json'].decode('utf-8'))
        name = metadata['name']
        beat_resolution = metadata['beat_resolution']

        tempo = archive['tempo']
        # The downbeat array is optional.
        downbeat = archive['downbeat'] if 'downbeat' in archive.files else None

        # Tracks are described under consecutive string keys "0", "1", ...
        tracks = []
        track_no = 0
        while str(track_no) in metadata:
            track_meta = metadata[str(track_no)]
            tracks.append(
                pypianoroll.StandardTrack(
                    pianoroll=dense_from_csc(
                        archive, 'pianoroll_{}'.format(track_no)
                    ),
                    program=track_meta['program'],
                    is_drum=track_meta['is_drum'],
                    name=track_meta['name'],
                )
            )
            track_no += 1

        return pypianoroll.Multitrack(
            name=name,
            resolution=beat_resolution,
            tempo=tempo,
            downbeat=downbeat,
            tracks=tracks,
        )

In [None]:
# Borrowed code from https://github.com/salu133445/musegan/blob/main/src/collect_data.py

# Instrument families, listed in the same order as the boolean array returned
# by check_which_family (index 4 is the piano family).
FAMILY_NAMES = [
    "drum",
    "bass",
    "guitar",
    "string",
    "piano",
]

# One threshold pair per family, index-aligned with FAMILY_NAMES.  collect_data
# passes the piano pair to segment_quality as (threshold_pitch, threshold_beats).
FAMILY_THRESHOLDS = [
    (2, 24),  # drum
    (1, 96),  # bass
    (2, 156),  # guitar
    (2, 156),  # string
    (2, 48),  # piano
]

# Batch size handed to the torch DataLoader built in get_dataloader.
BATCH_SIZE = 200

In [None]:
# Borrowed code from https://github.com/salu133445/musegan/blob/main/src/collect_data.py

def check_which_family(track):
    """Classify a track into the five instrument families.

    Returns a Numpy array of 5 flags ordered as (drum, bass, guitar, string,
    piano).  The drum flag is the track's own ``is_drum`` attribute; the
    others are derived from the track's ``program`` number.  A track is only
    counted as piano when it is not a drum track.
    """
    program = track.program
    drum_flag = track.is_drum

    def within(low, high):
        # Inclusive range test on the track's program number.
        return low <= program <= high

    piano_flag = (not drum_flag) and (within(0, 7) or within(16, 23))

    family_flags = [
        drum_flag,
        within(32, 39),  # bass
        within(24, 31),  # guitar
        within(40, 51),  # string
        piano_flag,
    ]
    return np.array(family_flags)

In [None]:
# Borrowed code from https://github.com/salu133445/musegan/blob/main/src/collect_data.py

def segment_quality(pianoroll, threshold_pitch, threshold_beats):
    """Score a pianoroll segment and decide whether it is worth keeping.

    Parameters
    ----------
    pianoroll : numpy.ndarray
        2-D array indexed as (time step, pitch).  Any value greater than zero
        counts as "note on", so both binary and velocity-valued rolls work.
    threshold_pitch : int
        Minimum number of distinct pitches that must sound in the segment.
    threshold_beats : int
        Minimum number of time steps in which at least one pitch sounds.

    Returns
    -------
    tuple
        ``(is_ok, (pitch_sum, beat_sum))`` where ``pitch_sum`` is the number
        of pitches used, ``beat_sum`` is the number of active time steps, and
        ``is_ok`` is true when both thresholds are met and the segment passes
        the diversity check described below.
    """
    pitch_sum = np.sum(np.sum(pianoroll, axis=0) > 0)
    beat_sum = np.sum(np.sum(pianoroll, axis=1) > 0)

    # Custom check:  Only select segments in which at least one note is played
    # for less than 50% of the bar, not including the first and last 8
    # timesteps.
    clipped = pianoroll[8:-8]

    # Count *time steps* per pitch.  Summing the raw values instead would add
    # up velocities, which cannot meaningfully be compared with a step count.
    steps_per_pitch = np.sum(clipped > 0, axis=0)
    sounding = steps_per_pitch[np.nonzero(steps_per_pitch)]
    musically_diverse = np.any(sounding < clipped.shape[0] // 2)

    return (
        musically_diverse and (pitch_sum >= threshold_pitch) and (beat_sum >= threshold_beats),
        (pitch_sum, beat_sum),
    )

In [None]:
# Borrowed code from https://github.com/salu133445/musegan/blob/main/src/collect_data.py

def _batch_output_path(output_file, batch_num):
    """Return `output_file` with ``-<batch_num>`` inserted before its extension."""
    path = Path(output_file)
    return str(path.with_name('{}-{}{}'.format(path.stem, batch_num, path.suffix)))


def _save_segments(segments, batch_output_file):
    """Stack the collected bars into one boolean matrix and write it to disk."""
    # Convert pianorolls to a boolean matrix (one flattened bar per row).
    result = np.concatenate(segments, axis=0) > 0
    print(f'Output shape:  {result.shape}')

    if batch_output_file.endswith(".npz"):
        # Sparse format: store only the coordinates of the True cells.
        np.savez_compressed(
            batch_output_file,
            nonzero=np.array(result.nonzero()),
            shape=result.shape
        )
    else:
        np.save(batch_output_file, result)
    print(f"Successfully saved training data to : {batch_output_file}")


def collect_data(input_dir, output_file, songs_per_batch=2000):
    """Collect single-bar piano training samples from .npz multitrack files.

    Searches `input_dir` recursively for .npz files, loads each with `load`,
    and for every bar keeps the best-scoring piano-family track that passes
    `segment_quality`.  The kept bars are flattened and written out in
    batches so that the whole dataset never has to sit in memory at once.

    Parameters
    ----------
    input_dir : str or Path
        Root directory that is searched recursively for .npz files.
    output_file : str or Path
        Output path template.  Batch ``n`` is written to the same path with
        ``-n`` inserted before the extension (``dataset.npz`` becomes
        ``dataset-0.npz``, ``dataset-1.npz``, ...).  A ``.npz`` extension
        selects the sparse format (arrays ``nonzero`` and ``shape``); any
        other extension is written with ``np.save``.
    songs_per_batch : int, optional
        Number of processed songs after which the collected bars are saved
        and the in-memory list is cleared.  Defaults to 2000.

    Returns
    -------
    None
        Results are written to disk; progress is reported with ``print``.
    """
    # Each sample is a single bar.
    num_consecutive_bar = 1

    # Only files with this resolution (time steps per quarter note) are used.
    resolution = 24

    # Resolution is the temporal resolution for a quarter note, so multiply
    # it by 4 to obtain the number of time steps in a full bar (this assumes
    # 4 quarter-note beats per bar).
    steps_per_bar = resolution * 4
    steps_per_segment = num_consecutive_bar * steps_per_bar

    # Index of the piano family in check_which_family / FAMILY_THRESHOLDS.
    piano_family = 4
    threshold_pitch, threshold_beats = FAMILY_THRESHOLDS[piano_family]

    batch_num = 0
    songs_in_batch = 0

    # Build up a list of "good" bars.
    ok_segment_list = []

    # Search directory recursively.
    for filename in Path(input_dir).rglob("*.npz"):
        multitrack = load(filename)

        # Only consider files with the expected resolution.
        if multitrack.resolution != resolution:
            print('Resolution of {} is {}'.format(filename, multitrack.resolution))
            continue

        # `load` yields downbeat=None when the archive has no downbeat array;
        # without it the number of bars cannot be derived, so skip the file.
        if multitrack.downbeat is None:
            print('Skipping {}: no downbeat array'.format(filename))
            continue

        # A track's family does not change from bar to bar, so discard the
        # non-piano tracks once per file rather than once per bar.
        piano_tracks = [
            track for track in multitrack.tracks
            if check_which_family(track)[piano_family]
        ]

        num_bar = len(multitrack.downbeat) // steps_per_bar

        # Loop up until the bar before the final one, in case the track ends
        # before the final bar should end.
        for bidx in range(num_bar - num_consecutive_bar):
            st = bidx * steps_per_segment
            ed = st + steps_per_segment

            # Find the "best" piano track for this bar of this multitrack.
            best_track = None
            best_score = -1
            for track in piano_tracks:
                tmp_pianoroll = track[st:ed]
                if np.count_nonzero(tmp_pianoroll) == 0:
                    continue
                is_ok, score = segment_quality(
                    tmp_pianoroll, threshold_pitch, threshold_beats
                )

                score = sum(score)
                if is_ok and score > best_score:
                    best_track = tmp_pianoroll
                    best_score = score

            if best_track is not None:
                # Flatten the best track and add a new leading axis.
                ok_segment_list.append(np.reshape(best_track, (1, -1)))

        # Save a batch once enough songs have been processed.
        songs_in_batch += 1
        if songs_in_batch >= songs_per_batch and ok_segment_list:
            _save_segments(ok_segment_list, _batch_output_path(output_file, batch_num))

            # Clear segment list and increment the batch number.
            songs_in_batch = 0
            batch_num += 1
            ok_segment_list = []

    # Save final (possibly partial) batch.
    if ok_segment_list:
        _save_segments(ok_segment_list, _batch_output_path(output_file, batch_num))

In [None]:
# Walk the cleansed LPD directory on Drive and write the selected piano bars
# to numbered batch files (dataset-0.npz, dataset-1.npz, ...) in the CSC412
# data folder.
collect_data('/content/drive/MyDrive/CSC412/data/lpd/lpd_cleansed/', '/content/drive/MyDrive/CSC412/data/dataset.npz')

Output shape:  (4519, 12288)
Successfully saved training data to : /content/drive/MyDrive/CSC412/data/dataset-0.npz
Output shape:  (4966, 12288)
Successfully saved training data to : /content/drive/MyDrive/CSC412/data/dataset-1.npz
Output shape:  (5610, 12288)
Successfully saved training data to : /content/drive/MyDrive/CSC412/data/dataset-2.npz
Output shape:  (5148, 12288)
Successfully saved training data to : /content/drive/MyDrive/CSC412/data/dataset-3.npz
Output shape:  (5649, 12288)
Successfully saved training data to : /content/drive/MyDrive/CSC412/data/dataset-4.npz
Output shape:  (4442, 12288)
Successfully saved training data to : /content/drive/MyDrive/CSC412/data/dataset-5.npz
Output shape:  (4212, 12288)
Successfully saved training data to : /content/drive/MyDrive/CSC412/data/dataset-6.npz
Output shape:  (4689, 12288)
Successfully saved training data to : /content/drive/MyDrive/CSC412/data/dataset-7.npz
Output shape:  (4934, 12288)
Successfully saved training data to : /cont

In [None]:
def load_data_from_npz(filename):
    """Load and return the training data from a npz file (sparse format).

    The file must contain two arrays: ``shape``, the shape of the dense
    boolean matrix, and ``nonzero``, the per-axis coordinates of its True
    cells (one row of indices per axis, as produced by
    ``np.array(matrix.nonzero())``).

    Parameters
    ----------
    filename : str or Path
        Path to the .npz file.

    Returns
    -------
    numpy.ndarray
        Dense boolean array with True at every stored coordinate.
    """
    with np.load(filename) as f:
        data = np.zeros(tuple(int(dim) for dim in f['shape']), np.bool_)
        # Index with a *tuple* of per-axis index arrays.  A list of arrays is
        # treated by current NumPy as a single array index on axis 0, which
        # sets the wrong cells or raises IndexError.
        data[tuple(f['nonzero'])] = True
    return data


def get_dataloader(data_filepath):
    """Build a shuffling DataLoader over the bars stored in `data_filepath`.

    The sparse .npz file is expanded with `load_data_from_npz`, wrapped in a
    tensor (sharing memory with the NumPy array), and served in batches of
    BATCH_SIZE rows.
    """
    bars = torch.from_numpy(load_data_from_npz(data_filepath))
    return DataLoader(
        TensorDataset(bars),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )

# collect_data writes numbered batch files (dataset-0.npz, dataset-1.npz, ...)
# and never a bare dataset.npz, so load the first batch here.
# NOTE(review): switch the index (or merge batches) to train on more data.
get_dataloader('/content/drive/MyDrive/CSC412/data/dataset-0.npz')