This notebook guided by
-   https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel

In [1]:
import torch
from torch import nn
import numpy as np
from pathlib import Path
import os
import random
import math
import torch.nn.functional as F
import torch.optim as optim

Okay here's how it has to go.

1) Loop through all the files, loading them, recording their length, adding that to self.length, then using that as __len__

2) Load the files in 1gb at a time.

3) Note the length of that 1GB section

4) Every time __getitem__ is called, increase an index that keeps track of how much of the 1GB chunk has been read.

5) When all of it has been read, load in another 1GB chunk, and reset the index that keeps track of chunk consumption.

In [6]:
class DistributedFolderDataset(torch.utils.data.Dataset):
    def __init__(self, subroot_paths):
        '''
        Parses through data dispersed in distributed folders (subroots >> folders containing data >>>). 
        Shuffles and returns single training examples.
        subroot_paths => list containing string paths to subroot directories
        '''
        # self.data_size = data_size          # Amount of data to load
        self.X = np.zeros((3, 81, 0))       # src
        self.y = np.zeros(0)                # target
        self.length = 0                     # total length of all data
        self.chunk_length = 0               # length of individual chunks
        self.chunk_traversed = 0            # Number of samples in chunk that have already been processed
        self.subroot_idx = 0                # index of subroots directory to load into chunk
        self.chunk_loaded = False           # if true, the chunk has been loaded, if false it needs loading 
        self.subroot_paths = subroot_paths  # List of files in root
        self.total_chunk_length = 0         # Updated amount of chunk traversed to modify idx in __getitem__

        # Get length of dataset
        for path in self.subroot_paths:
            fileList = os.listdir(path)  
            if 'notes.npy' not in fileList or 'song.npy' not in fileList:
                continue

            # Load notes tensor to gather length for __len__
            notes = np.load(Path(path) / 'notes.npy')
            self.length += notes.shape[0]

    def load_chunk(self):
        '''
        Loads 1GB of data or the rest of the data in subroot_paths
        '''
        self.X = np.zeros((3, 81, 0))       # src
        self.y = np.zeros(0)                # target

        # Load data
        data_loaded = 0     # Amount of data loaded so far
        chunk_length = 0    # Number of samples in chunk
        while data_loaded < 1:

            if self.subroot_idx > (len(self.subroot_paths)-1):
                print('All subroots have been traversed')
                self.subroot_idx = 0
                break

            # Path to files, in this case src = song and target = notes
            notes_path = Path(self.subroot_paths[self.subroot_idx]) / 'notes.npy'
            song_path = Path(self.subroot_paths[self.subroot_idx]) / 'song.npy'
            self.subroot_idx += 1
            
            # Get data size
            try:
                data_loaded += notes_path.stat().st_size / 1e9  # Measure amount of data input
                data_loaded += song_path.stat().st_size / 1e9
            except WindowsError as err:  # If the files aren't all there
                print('Windows Error: Data in {} not found, skipping\n\n'.format(subroot))
                continue
            
            # Load numpy arrays
            notes = np.load(notes_path)
            song = np.load(song_path)

            # Put all the note and all the song data into one big array
            self.X = np.concatenate((self.X, song), axis=2)
            self.y = np.concatenate((self.y, notes), axis=0)

        chunk_length = self.y.shape[0]
        return chunk_length

            # print('{:3.2f} / {:3.2f} GB data loaded\n'.format(data_loaded, self.data_size))

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        if self.chunk_loaded == False:  # Time to load a new chunk
            print('Loading Chunk')
            self.chunk_length =  self.load_chunk()
            print('Chunk Length = {}\n'.format(self.chunk_length))
            self.chunk_traversed = 0
            self.chunk_loaded = True

        # Split song into 150 ms windows
        X = torch.from_numpy(np.take(self.X, range(idx-7-self.total_chunk_length,idx+8-self.total_chunk_length), axis=2, mode='wrap'))
        X = X.permute(0, 2, 1)
        y = 1 if self.y[idx-self.total_chunk_length] > 0 else 0  # Only care about onsets, losing note information

        # Check whether new chunk should be loaded
        self.chunk_traversed += 1
        if self.chunk_traversed > (self.chunk_length-1):
            self.total_chunk_length += self.chunk_length
            print('Full chunk traversed, {} / {} total samples traversed'.format(self.total_chunk_length, self.length))
            self.chunk_loaded = False
        return X, y



In [3]:
# Split dataset by song
def train_val_test_split(root, data_amount, val = 0.1, test = 0.1, shuffle = False):
    '''
    Takes a directory input and outputs 3 lists of subdirectories of specified size.
    I'm going to operate under the assumption that songs converge on a mean length if you get enough of them.
    - root: root of subdirectories
    - data_amount: amount of data to load
    - val: validation split
    - test: test split
    - shuffle: shuffle names of directories
    '''

    subroot_paths = []
    data_loaded = 0

    # Generate list of song folders
    for dirName, subdirList, fileList in os.walk(root):  
        if 'notes.npy' not in fileList or 'song.npy' not in fileList:
            continue

        # Get data size
        notes_path = Path(dirName) / 'notes.npy'
        song_path = Path(dirName) / 'song.npy'
        try:
            data_loaded += notes_path.stat().st_size / 1e9  # Measure amount of data input
            data_loaded += song_path.stat().st_size / 1e9
        except WindowsError as err:  # If the files aren't all there
            print('Windows Error: Data in {} not found, skipping\n\n'.format(subroot))
            continue
        
        if data_loaded > data_amount:
            break

        subroot_paths.append(dirName)
        
    # Shuffle subroots if applicable
    if shuffle:
        random.shuffle(subroot_paths)

    # Split dataset
    num_val = math.floor(val * len(subroot_paths))
    num_test = math.floor(test * len(subroot_paths))

    train = subroot_paths[num_val:(len(subroot_paths)-num_test)]
    val = subroot_paths[0:num_val]
    test = subroot_paths[-num_test:]

    return train, val, test

In [14]:
root = Path(r'X:\Training Data\Processed')

train_paths, _, _ = train_val_test_split(root, 5)
my_dataset = DistributedFolderDataset(train_paths)
loader = torch.utils.data.DataLoader(my_dataset, batch_size = 10000, shuffle=False, num_workers=0)

for batch_idx, (local_batch, local_labels) in enumerate(loader):
    continue


Loading Chunk
Chunk Length = 528486
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 1, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 1,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 1, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([1, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 1, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 1,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 1, 0,  ..., 0, 0, 0])
tensor([0, 1, 0,  ..., 0, 0, 1])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([1, 0, 0,  ..., 0, 0, 0])
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([0, 1, 0,  ..., 1, 0, 0])
tensor(

KeyboardInterrupt: 

## Calculate kernel dimensions through convolutions and max pools

In [21]:
def kernel_dim(H, W, kernel_size, padding = [0,0], dilation = [1,1], stride = [1,1]):
    H = (H+2*padding[0] - dilation[0]*(kernel_size[0]-1) - 1)/(stride[0]) + 1
    H = math.floor(H)
    print('H = ', H)

    W = (W+2*padding[1] - dilation[1]*(kernel_size[1]-1) - 1)/(stride[1]) + 1
    W = math.floor(W)
    print('W = {}\n'.format(W))
    return H, W

H, W = 15, 81
kernel_size = [7,3]
H, W = kernel_dim(H, W, kernel_size)

kernel_size = [1,3]
H, W = kernel_dim(H, W, kernel_size, stride=[1,3])

kernel_size = [3,3]
H, W = kernel_dim(H, W, kernel_size)

kernel_size = [1,3]
H, W = kernel_dim(H, W, kernel_size, stride=[1,3])



H =  9
W = 79

H =  9
W = 26

H =  7
W = 24

H =  7
W = 8



## Define and initialize model

In [4]:
# Define CNN architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=10, kernel_size=(7,3))
        self.pool = nn.MaxPool2d(kernel_size=(1,3))
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=20, kernel_size=(3,3))
        self.fc1 = nn.Linear(7*8*20, 100)
        self.fc2 = nn.Linear(100, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(7*8*20, -1)
        x = x.permute(1, 0)
        x = self.fc1(x)
        x = self.fc2(x)
        x = torch.squeeze(x)
        # x = self.sigmoid(x)  # Removed sigmoid because using nn.BCEWithLogitsLoss
        return x

# Parameters for dataloader
params = {'batch_size' : 15000,
          'shuffle' : False,
          'num_workers': 0}

model = Net()
criterion = nn.BCEWithLogitsLoss(pos_weight = torch.Tensor([10])) # weight = torch.Tensor([10]).repeat_interleave(params['batch_size']))
optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0.9)

if torch.cuda.is_available():
    print('Using CUDA')
    model = model.cuda()
    criterion = criterion.cuda()
    device = torch.device('cuda:0')

print(model)




Using CUDA
Net(
  (conv1): Conv2d(3, 10, kernel_size=(7, 3), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=(1, 3), stride=(1, 3), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=1120, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


## Train model

In [5]:
max_epochs = 2
root = Path(r'X:\Training Data\Processed')
train_paths, val_paths, test_paths = train_val_test_split(root, 5, shuffle=True)

# Define datasets and loaders
trains = DistributedFolderDataset(train_paths)
train_loader = torch.utils.data.DataLoader(trains, **params)
vals = DistributedFolderDataset(val_paths)
val_loader = torch.utils.data.DataLoader(vals, **params)
tests = DistributedFolderDataset(test_paths)
test_loader = torch.utils.data.DataLoader(tests, **params)

criterion = nn.BCEWithLogitsLoss(pos_weight = torch.Tensor([100])) # Start out with high loss for one epoch, then change afterwards
criterion = criterion.cuda()

train_accs =[]
val_accs = []

for epoch in range(max_epochs):
    model.train()
    num_true = 0

    # Training
    for batch_idx, (local_batch, local_labels) in enumerate(train_loader):
        #  Transfer to GPU
        local_batch, local_labels = local_batch.to(device, dtype = torch.float32), local_labels.to(device, dtype = torch.float32)

        # Model computations
        y_pred = model(local_batch)
        loss = criterion(y_pred, local_labels)
        preds = torch.argmax(y_pred, dim=-1).cpu().numpy()
        
        num_true += np.sum(preds == local_labels.cpu().numpy())

        loss.backward()
        optimizer.step()
        model.zero_grad()

        if batch_idx % 1000 == 0:
            # print training update
            print('\nTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(local_batch), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))


        print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
        print('Training Accuracy: {}\n'.format(num_true / len(trains)))
        train_accs.append(num_true / len(trains))

        # Free up GPU memory
        torch.cuda.empty_cache()

    criterion = nn.BCEWithLogitsLoss(pos_weight = torch.Tensor([10]))
    criterion = criterion.cuda()

    # Validation
    model.eval()  # Put model in evaluation mode
    num_true = 0

    for batch_idx, (local_batch, local_labels) in enumerate(val_loader):
        # Transfer to GPU
        local_batch, local_labels = local_batch.to(device, dtype = torch.float32), local_labels.to(device, dtype = torch.int64)

        # Model computations
        y_pred = model(local_batch)
        preds = torch.argmax(y_pred, dim=-1).cpu().numpy()
        num_true += np.sum(preds == local_labels.cpu().numpy()) 

    # Set model back to training mode
    model.train()

    print('Validation Accuracy: {}\n'.format(num_true / len(vals)))
    val_accs.append(num_true / len(vals))




Loading Chunk
Chunk Length = 536713

Epoch 0: train loss: 5.026576042175293
Training Accuracy: 0.0

Epoch 0: train loss: 2.8234641551971436
Training Accuracy: 0.0

Epoch 0: train loss: 3.636552095413208
Training Accuracy: 0.0

Epoch 0: train loss: 2.426703691482544
Training Accuracy: 0.0

Epoch 0: train loss: 3.020479202270508
Training Accuracy: 0.0

Epoch 0: train loss: 2.981693983078003
Training Accuracy: 0.0

Epoch 0: train loss: 4.416798114776611
Training Accuracy: 0.0

Epoch 0: train loss: 2.7679948806762695
Training Accuracy: 0.0

Epoch 0: train loss: 2.4823718070983887
Training Accuracy: 0.0

Epoch 0: train loss: 3.4664628505706787
Training Accuracy: 0.0

Epoch 0: train loss: 2.3213751316070557
Training Accuracy: 0.0

Epoch 0: train loss: 2.809999942779541
Training Accuracy: 0.0

Epoch 0: train loss: 2.8760323524475098
Training Accuracy: 0.0

Epoch 0: train loss: 2.9589977264404297
Training Accuracy: 0.0

Epoch 0: train loss: 2.8950071334838867
Training Accuracy: 0.0

Epoch 0: t

IndexError: cannot do a non-empty take from an empty axes.

In [9]:
print(str(Path().cwd().parent) + r'\Audio Embeddings')

c:\Users\ewais\Documents\GitHub\tensor-hero\Embedding Generation
