## Colab Lazy Dataset

In [None]:
import sys
from pathlib import Path
import os
sys.path.insert(1, str(Path.cwd().parent))
from tensor_hero.model import ColabLazyDataset
import torch

# DataLoader settings for a quick smoke test of the lazy dataset.
dl_params = {
    'batch_size': 12,
    'shuffle': True,
    'num_workers': 0,
    'drop_last': True,
}

# NOTE(review): absolute, machine-specific Windows paths -- this cell only runs
# on a machine exposing this X: drive layout; consider deriving both paths from
# a single configurable data root.
train_path = Path(r'X:\Training Data\training_ready\train')
val_path = Path(r'X:\Training Data\training_ready\val')

# Define data loaders
train_data = ColabLazyDataset(train_path, max_src_len=500, max_trg_len=500, pad_idx=434)
train_loader = torch.utils.data.DataLoader(train_data, **dl_params)

# Pull exactly one batch to confirm the dataset -> loader pipeline works.
for batch_idx, batch in enumerate(train_loader):
    # Report the shape of the first element of the batch instead of dumping the
    # raw batch contents (the label says "shape").
    # Assumes the batch is indexable with a tensor in position 0, as in the
    # ColabMemoryDataset cell -- TODO confirm for ColabLazyDataset.
    print(f'batch idx: {batch_idx} | shape: {batch[0].shape}')
    break
    

In [None]:
# Every entry directly under train_path is treated as one song directory.
song_paths = [train_path / entry for entry in os.listdir(train_path)]
notes_dirs = [song / 'notes' for song in song_paths]
specs_dirs = [song / 'spectrograms' for song in song_paths]

# Walk each song's spectrogram directory and collect one list of file paths per
# visited directory; directories that contain no files are skipped.
specs_lists = [
    [Path(walk_root) / file_name for file_name in file_names]
    for spec_root in specs_dirs
    for walk_root, _subdirs, file_names in os.walk(spec_root)
    if file_names
]
def __note_dirs_from_spec_dirs(spec_file):
    '''
    Maps a spectrogram path onto the path of its matching notes array.
    Helper function for ColabLazyDataset __init__()

    The mapping is purely textual: the path is converted to a string and every
    occurrence of the substring 'spectrograms' is swapped for 'notes'. This
    includes occurrences in parent directory names or in the file name itself,
    not only the directory component. The resulting path is not checked for
    existence.

    ~~~~ ARGUMENTS ~~~~
    - spec_file (Path):
        - single path to a spectrogram in colab transformer training data
        - assumes file structure defined in tensor_hero/preprocessing/data.py
            -> preprocess_transformer_data() w/ COLAB=True

    ~~~~ RETURNS ~~~~
    Path: spec_file with 'spectrograms' replaced by 'notes'
    '''
    spec_as_text = str(spec_file)
    notes_as_text = spec_as_text.replace('spectrograms', 'notes')
    return Path(notes_as_text)
    
# Flatten the per-directory lists into one flat list of spectrogram paths.
specs_lists = [spec for spec_list in specs_lists for spec in spec_list]
# Derive the notes path for every spectrogram (same order, same length).
notes_lists = [__note_dirs_from_spec_dirs(x) for x in specs_lists]

# Preview only the first few entries; printing the full list floods the cell
# output when the dataset holds thousands of files.
print(specs_lists[:10])

# Total number of spectrogram files found (displayed as the cell result).
len(specs_lists)

## Colab Memory Dataset

In [4]:
import sys
from pathlib import Path
import os
sys.path.insert(1, str(Path.cwd().parent.parent))
from tensor_hero.model import ColabMemoryDataset
import torch

# DataLoader configuration for the in-memory dataset check.
dl_params = dict(
    batch_size=32,
    shuffle=True,
    num_workers=4,
    drop_last=True,
)

# Training data is located relative to the notebook's working directory
# (two levels up, then Training_Data/colab_training_data/train).
train_path = Path.cwd().parent.parent.joinpath('Training_Data', 'colab_training_data', 'train')

# Value handed to ColabMemoryDataset's max_examples argument.
max_examples = 1000

# Define data loaders
train_data = ColabMemoryDataset(
    train_path,
    max_src_len=500,
    max_trg_len=500,
    pad_idx=434,
    max_examples=max_examples,
)
train_loader = torch.utils.data.DataLoader(train_data, **dl_params)

# Print the shape of the first element of each batch; stops after the batch
# with index 6, i.e. seven batches in total.
for batch_idx, batch in enumerate(train_loader):
    print(f'batch idx: {batch_idx} | shape: {batch[0].shape}')
    if batch_idx >= 6:
        break

    

Checking length of spectrograms and notes...


100%|██████████| 9722/9722 [00:02<00:00, 4820.58it/s]


0 datapoints removed due to exceeding maximum length
Populating 1000 samples into memory


100%|██████████| 1000/1000 [00:02<00:00, 449.51it/s]


self.specs is taking up 1953.13 MB
self.notes is taking up 3.81 MB
batch idx: 0 | shape: torch.Size([32, 512, 500])
batch idx: 1 | shape: torch.Size([32, 512, 500])
batch idx: 2 | shape: torch.Size([32, 512, 500])
batch idx: 3 | shape: torch.Size([32, 512, 500])
batch idx: 4 | shape: torch.Size([32, 512, 500])
batch idx: 5 | shape: torch.Size([32, 512, 500])
batch idx: 6 | shape: torch.Size([32, 512, 500])
