This notebook is guided by:
-   https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel

In [2]:
import torch
import numpy as np
from pathlib import Path
import os
import random

In [3]:
class DistributedFolderDataset(torch.utils.data.Dataset):
    '''Dataset built from 'song.npy' / 'notes.npy' pairs scattered across nested folders.

    Every folder under root that directly contains files is treated as one
    candidate; its 'song.npy' is appended along the last axis of self.X and its
    'notes.npy' along axis 0 of self.y. Each item is a fixed-width window of
    self.X centred on an index, paired with the label self.y[index].
    '''

    # Window geometry: frames taken before the centre index, and frames taken
    # from the centre index onwards (centre included), i.e. 7 + 8 = 15 frames.
    FRAMES_BEFORE = 7
    FRAMES_AFTER = 8

    def __init__(self, data_size, root):
        '''Parses through data dispersed in distributed folders (root > subroots >> folders containing data >>>).
        Folders are visited in shuffled order until the size budget is exceeded.

        root => path to root of folders
        data_size => amount of data to load into dataset in Gigabytes
                     (measured from on-disk file sizes; the folder that crosses
                     the limit is still loaded in full)

        Folders that lack either 'notes.npy' or 'song.npy' are reported and skipped.
        '''
        self.root = root
        self.data_size = data_size

        # Every directory (at any depth) that directly contains at least one file
        subroot_paths = [dirName for dirName, _, fileList in os.walk(root) if fileList]

        # Shuffle subroots so a partial load is a random sample of folders
        random.shuffle(subroot_paths)

        # Chunks are collected and concatenated once at the end; concatenating
        # inside the loop would re-copy everything loaded so far on each folder.
        # The empty seed arrays keep the result shape/dtype identical to the
        # incremental approach, including the case where nothing is loaded.
        song_chunks = [np.zeros((3, 81, 0))]
        notes_chunks = [np.zeros(0)]

        data_loaded = 0     # Amount of data loaded so far (GB)
        for subroot in subroot_paths:
            # Path to files, in this case src = song and target = notes
            notes_path = Path(subroot) / 'notes.npy'
            song_path = Path(subroot) / 'song.npy'
            try:
                # Stat both files before touching the running total so a folder
                # with only one of the two files does not skew the budget.
                notes_size = notes_path.stat().st_size / 1e9
                song_size = song_path.stat().st_size / 1e9
            except OSError:
                # OSError is what WindowsError aliases on Windows, and it also
                # exists on every other platform (FileNotFoundError derives from it).
                print('Windows Error: Data in {} not found, skipping\n\n'.format(subroot))
                continue  # Actually skip: there is nothing to load here

            # Load numpy arrays
            notes_chunks.append(np.load(notes_path))
            song_chunks.append(np.load(song_path))
            data_loaded += notes_size
            data_loaded += song_size

            print('{:3.2f} / {:3.2f} GB data loaded\n'.format(data_loaded, self.data_size))
            if data_loaded > self.data_size:  # Stop loading once data_size limit is reached
                break

        # Put all the note and all the song data into one big array each
        self.X = np.concatenate(song_chunks, axis=2)
        self.y = np.concatenate(notes_chunks, axis=0)

    def __len__(self):
        '''Number of examples, i.e. the number of labels loaded.'''
        return self.y.shape[0]

    def __getitem__(self, idx):
        '''Return (window, label) for example idx.

        The window covers frames idx-7 .. idx+7 of self.X along its last axis
        (the original comment describes this as a 150 ms window -- TODO confirm
        the frame rate). Frames that would fall outside the loaded data are
        zero-filled, so every item has the same shape and can be batched.

        Raises IndexError when idx is outside [-len(self), len(self)).
        '''
        n_examples = len(self)
        if idx < 0:
            idx += n_examples  # Standard negative-index semantics
        if not 0 <= idx < n_examples:
            raise IndexError('index out of range for dataset of length {}'.format(n_examples))

        n_frames = self.X.shape[2]
        start = idx - self.FRAMES_BEFORE
        stop = idx + self.FRAMES_AFTER

        if start >= 0 and stop <= n_frames:
            # Interior index: the plain slice already has the full width
            window = self.X[:, :, start:stop]
        else:
            # Edge index: a bare slice would be truncated (or, for a negative
            # start, wrap around and come back empty). Copy the valid part into
            # a zero-filled buffer of the full width instead.
            window = np.zeros(self.X.shape[:2] + (stop - start,), dtype=self.X.dtype)
            lo = max(start, 0)
            hi = min(stop, n_frames)
            if lo < hi:
                window[:, :, lo - start:hi - start] = self.X[:, :, lo:hi]

        X = torch.from_numpy(window)
        X = torch.squeeze(X)  # Drops size-1 dims only; no-op for (3, 81, 15)
        y = self.y[idx]
        return X, y



In [4]:
# Location of the pre-processed training folders (machine-specific path).
root = Path(r'X:\Training Data\Processed')

# Load roughly 1 GB of data and wrap it in a shuffling, single-process loader.
my_dataset = DistributedFolderDataset(1, root)
loader = torch.utils.data.DataLoader(
    my_dataset,
    batch_size=10,
    shuffle=True,
    num_workers=0,
)
print(len(my_dataset))

# Smoke test: print the feature shape and labels of the first 51 batches.
i = 0  # Stays 0 when the loader yields no batches
for i, batch in enumerate(loader, start=1):
    print('\nX shape: {}'.format(batch[0].shape))
    print('label: {}'.format(batch[1]))
    if i > 50:
        break
    

        

yeah!
0.08 / 1.00 GB data loaded

0.12 / 1.00 GB data loaded

0.17 / 1.00 GB data loaded

0.23 / 1.00 GB data loaded

0.28 / 1.00 GB data loaded

0.36 / 1.00 GB data loaded

0.41 / 1.00 GB data loaded

0.45 / 1.00 GB data loaded

0.51 / 1.00 GB data loaded

0.56 / 1.00 GB data loaded

0.62 / 1.00 GB data loaded

0.66 / 1.00 GB data loaded

0.70 / 1.00 GB data loaded

0.74 / 1.00 GB data loaded

0.79 / 1.00 GB data loaded

0.84 / 1.00 GB data loaded

0.87 / 1.00 GB data loaded

0.91 / 1.00 GB data loaded

0.94 / 1.00 GB data loaded

1.00 / 1.00 GB data loaded

1.15 / 1.00 GB data loaded

587048
