In [1]:
from copy import deepcopy, copy
import os
import pathlib
from random import shuffle

import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, ChainDataset
from livelossplot import PlotLosses

%matplotlib inline
plt.rcParams['figure.figsize'] = (14, 5)

In [5]:
class MirexNet(nn.Module):
    """Fully convolutional network that maps one input patch to 25 class scores.

    Layout (no batch normalisation in this variant):

    * four 3x3 convolutions with padding 1 (1 -> 32 -> 32 -> 32 -> 32 channels),
      then a (2, 1) max-pool and dropout;
    * two unpadded 3x3 convolutions (32 -> 64 -> 64), then the same pool and
      dropout;
    * one unpadded (12, 9) convolution (64 -> 128) followed by dropout;
    * a 1x1 convolution down to 25 channels and a (13, 3) average pool.

    Every convolution except the final 1x1 one is followed by a ReLU.
    """

    def __init__(self):
        super().__init__()
        # Padded stack: spatial size is preserved.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        # Halves the first spatial axis only; shared by both pooling steps.
        self.pool = nn.MaxPool2d(kernel_size=(2, 1))
        # Unpadded stack: each 3x3 convolution trims one cell per border.
        self.conv5 = nn.Conv2d(32, 64, kernel_size=3)
        self.conv6 = nn.Conv2d(64, 64, kernel_size=3)
        self.conv7 = nn.Conv2d(64, 128, kernel_size=(12, 9))
        # 1x1 convolution acting as the per-position classifier (25 outputs).
        self.conv8 = nn.Conv2d(128, 25, kernel_size=1)
        self.dropout = nn.Dropout(p=0.5)
        self.avg_pool = nn.AvgPool2d(kernel_size=(13, 3))

    def forward(self, x):
        """Return the average-pooled 25-channel score map for ``x``."""
        for padded_conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = F.relu(padded_conv(x))
        x = self.dropout(self.pool(x))

        for valid_conv in (self.conv5, self.conv6):
            x = F.relu(valid_conv(x))
        x = self.dropout(self.pool(x))

        x = self.dropout(F.relu(self.conv7(x)))
        return self.avg_pool(self.conv8(x))

In [2]:
def split_iterable_dataset(dataset, train_size=0.8):
    """Randomly split a dataset into train and test copies by its ``ann_list``.

    The dataset itself is not modified: two deep copies are made and each is
    given a disjoint slice of a shuffled copy of ``dataset.ann_list``.

    Args:
        dataset: Object exposing an ``ann_list`` sequence attribute.
        train_size: Fraction of ``ann_list`` entries assigned to the training
            copy; must lie in ``[0, 1]``. The remainder goes to the test copy.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of deep copies of ``dataset``.

    Raises:
        ValueError: If ``train_size`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_size <= 1.0:
        raise ValueError(
            'train_size must be a fraction in [0, 1], got %r' % (train_size,))

    # Shuffle a copy so the caller's annotation order is left untouched.
    ann_labels = copy(dataset.ann_list)
    shuffle(ann_labels)

    # Honour the requested fraction (this was previously hard-coded to 0.8).
    n_train = int(train_size * len(ann_labels))

    train_dataset = deepcopy(dataset)
    train_dataset.ann_list = ann_labels[:n_train]

    test_dataset = deepcopy(dataset)
    test_dataset.ann_list = ann_labels[n_train:]
    return train_dataset, test_dataset

In [3]:
from dataset import MirexDataset, FrameIterableDataset, MirexChainDataset


# Frame-level datasets for the two corpora; both use the same analysis
# window (8192 samples) and hop (4096 samples).
beatles_dataset = FrameIterableDataset(
    audio_dir='data/beatles/mp3s-32k/',
    ann_dir='data/beatles/chordlabs/',
    window_size=8192,
    hop_length=4096,
)
queen_dataset = FrameIterableDataset(
    audio_dir='data/queen/mp3/',
    ann_dir='data/queen/chordlabs/',
    window_size=8192,
    hop_length=4096,
)
dataset = MirexChainDataset([beatles_dataset, queen_dataset])

# NOTE: an earlier revision split with
# torch.utils.data.random_split(dataset, [train_size, test_size]) and used
# shuffling loaders with batch_size=None; the split now goes through
# split_iterable_dataset, which partitions the annotation list instead.
train_dataset, test_dataset = split_iterable_dataset(dataset)

loader_train = DataLoader(train_dataset, batch_size=32, num_workers=0)
loader_val = DataLoader(test_dataset, batch_size=32, num_workers=0)

# Keyed by phase name, as expected by train_model.
dataloaders = dict(train=loader_train, val=loader_val)

In [5]:
def train_model(model, optimizer, dataloaders, device, epochs=1):
    """Train ``model`` and live-plot the per-epoch train/validation loss.

    Each epoch runs a ``'train'`` phase (with parameter updates) followed by a
    ``'val'`` phase (forward passes only). The loss is cross-entropy between
    the model scores and the arg-max class of each label row, averaged over
    all samples seen in the phase.

    Args:
        model: Module whose output is 4-D with singleton axes 2 and 3 (they
            are squeezed away before the loss is computed).
        optimizer: Optimizer already bound to ``model``'s parameters.
        dataloaders: Mapping with ``'train'`` and ``'val'`` keys, each an
            iterable of ``(inputs, labels)`` batches. ``labels`` is 2-D and its
            row-wise arg-max is used as the target class (presumably one-hot
            rows -- confirm against the dataset).
        device: Device the model and every batch are moved to.
        epochs: Number of full train+val passes.

    Returns:
        None. Losses are reported through ``PlotLosses`` under the keys
        ``' log loss'`` and ``'val_ log loss'``.
    """
    liveloss = PlotLosses()
    model = model.to(device=device)

    for _ in range(epochs):
        logs = {}
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            # Switches layers such as dropout / batch-norm between their
            # training and evaluation behaviour.
            model.train(is_training)

            running_loss = 0.0
            dataset_len = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device=device, dtype=torch.float32)
                labels = labels.to(device=device, dtype=torch.float32)

                # Autograd bookkeeping is only needed when backward() will be
                # called; disabling it during validation avoids building (and
                # holding in memory) a graph that is never used.
                with torch.set_grad_enabled(is_training):
                    scores = model(inputs)
                    scores = scores.squeeze(3).squeeze(2)
                    loss = F.cross_entropy(scores, torch.max(labels, 1)[1])

                if is_training:
                    # Clear stale gradients, back-propagate, then update.
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                # Weight by batch size so a short final batch does not skew
                # the epoch average.
                batch_size = inputs.size(0)
                running_loss += loss.detach() * batch_size
                dataset_len += batch_size

            if dataset_len:
                epoch_loss = (running_loss / dataset_len).item()
            else:
                # An empty loader used to raise ZeroDivisionError here; report
                # NaN so the remaining phases and epochs still run.
                epoch_loss = float('nan')

            prefix = 'val_' if phase == 'val' else ''
            logs[prefix + ' log loss'] = epoch_loss

        liveloss.update(logs)
        liveloss.send()

In [6]:
from collections import OrderedDict

def _build_chord_cnn():
    """Assemble the batch-normalised CNN as a named ``nn.Sequential``.

    Layers are created in the same order and under the same names
    (``conv1`` ... ``conv8``, ``relu*``, ``bnorm*``, ``pool*``, ``dropout*``)
    so that ``state_dict`` keys stay stable.
    """
    layers = OrderedDict()

    def add_conv_stage(index, in_channels, out_channels, **conv_kwargs):
        # One stage = convolution -> ReLU -> batch norm, numbered by `index`.
        layers['conv%d' % index] = nn.Conv2d(in_channels, out_channels, **conv_kwargs)
        layers['relu%d' % index] = nn.ReLU()
        layers['bnorm%d' % index] = nn.BatchNorm2d(out_channels)

    # Four padded 3x3 stages that keep the spatial size.
    for index, in_channels in enumerate((1, 32, 32, 32), start=1):
        add_conv_stage(index, in_channels, 32, kernel_size=3, padding=1)
    layers['pool1'] = nn.MaxPool2d(kernel_size=(2, 1))
    layers['dropout1'] = nn.Dropout(0.5)

    # Two unpadded 3x3 stages.
    add_conv_stage(5, 32, 64, kernel_size=3)
    add_conv_stage(6, 64, 64, kernel_size=3)
    layers['pool2'] = nn.MaxPool2d(kernel_size=(2, 1))
    layers['dropout2'] = nn.Dropout(0.5)

    # Wide (12, 9) stage, then the 1x1 classifier and the final average pool.
    add_conv_stage(7, 64, 128, kernel_size=(12, 9))
    layers['dropout3'] = nn.Dropout(0.5)
    layers['conv8'] = nn.Conv2d(128, 25, 1)
    layers['pool3'] = nn.AvgPool2d(kernel_size=(13, 3))

    return nn.Sequential(layers)


model = _build_chord_cnn()

In [7]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# To train the variant without batch normalisation, rebind the model first:
#model = MirexNet()
learning_rate = 1e-3
optimizer = optim.Adam(
    model.parameters(),
    lr=learning_rate,
    betas=(0.9, 0.999),
    weight_decay=1e-7,
)

train_model(model, optimizer, dataloaders, device, epochs=1)

KeyboardInterrupt: 

In [17]:
# The checkpoint was written with:
#torch.save(model.state_dict(), 'models/mirex_cnn.model')
#
# map_location='cpu' lets a checkpoint saved from a CUDA model be read on a
# machine without a GPU; load_state_dict then copies the tensors onto whatever
# device `model` already lives on.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load('models/mirex_cnn.model', map_location='cpu'))
model.eval()

Sequential(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (relu1): ReLU()
  (bnorm1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (relu2): ReLU()
  (bnorm2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (relu3): ReLU()
  (bnorm3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (relu4): ReLU()
  (bnorm4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
  (dropout1): Dropout(p=0.5, inplace=False)
  (conv5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (relu5): ReLU()
  (bnorm5): BatchNorm2

In [27]:
# Evaluation data: the "Greatest_Hits_II" sub-folder of the Queen corpus,
# with the same window/hop settings as the training datasets.
chord_dataset = MirexDataset(
    audio_dir='data/queen/mp3/Greatest_Hits_II/',
    ann_dir='data/queen/chordlabs/Greatest_Hits_II/',
    window_size=8192,
    hop_length=4096,
)
# batch_size=None turns off automatic batching, so each item the loader yields
# is one dataset sample.
loader_chord = DataLoader(chord_dataset, batch_size=None, shuffle=True, num_workers=0)

In [19]:
from container import ContextContainer
from metrics import compute_eval_measures

@torch.no_grad()
def estimate_chords(dataloader, model, device, scores_dir='scores'):
    """Predict a class per chromagram column and print evaluation measures.

    For every sample yielded by ``dataloader`` the model is run on each frame
    produced by ``ContextContainer(chromagram, 7)``, the highest-scoring of the
    25 classes is selected per frame, and the resulting one-hot matrix is
    compared with the sample's ``'ann_matrix'`` via ``compute_eval_measures``.
    One summary line is printed per sample.

    Args:
        dataloader: Iterable of mappings with ``'chromagram'`` and
            ``'ann_matrix'`` entries. ``chromagram.shape[1]`` is taken as the
            number of frames (presumably the time axis -- confirm against the
            dataset).
        model: Network returning 25 scores per input frame.
        device: Device the input frames are moved to before the forward pass.
        scores_dir: Directory that is created if missing. Nothing is written
            to it by this function.

    Returns:
        None.
    """
    model.eval()  # set model to evaluation mode

    # exist_ok replaces the racy "check, then create" sequence.
    os.makedirs(scores_dir, exist_ok=True)

    for sample in dataloader:
        chromagram = sample['chromagram']

        container = ContextContainer(chromagram, 7)
        # Zero-initialised so that any column the container does not fill is
        # deterministic rather than uninitialised memory.
        result = torch.zeros(25, chromagram.shape[1])
        for idx, frame in enumerate(container):
            # Add batch and channel axes in front of the 2-D frame.
            inputs = frame.view(1, 1, *frame.shape).to(device=device, dtype=torch.float32)
            scores = torch.squeeze(model(inputs))
            # Explicit dim: the squeezed scores are 1-D, and relying on the
            # implicit softmax dimension is deprecated (it warns per call).
            result[:, idx] = F.softmax(scores, dim=0)

        # Collapse probabilities to a one-hot (classes x frames) matrix.
        _, preds = torch.max(result, 0)
        result = F.one_hot(preds, result.shape[0]).t_()

        ann_matrix = sample['ann_matrix'].detach().cpu().numpy()
        result = result.detach().cpu().numpy()

        P, R, F1, TP, FP, FN = compute_eval_measures(ann_matrix, result)
        title = 'Evaluation result (N=%d, TP=%d, FP=%d, FN=%d, P=%.3f, R=%.3f, F=%.3f)' % (result.shape[1], TP, FP, FN, P,R,F1)
        print(title)

In [28]:
estimate_chords(loader_chord, model, device)

Evaluation result (N=2786, TP=1238, FP=1548, FN=1453, P=0.444, R=0.460, F=0.452)
Evaluation result (N=3198, TP=424, FP=2774, FN=1068, P=0.133, R=0.284, F=0.181)
Evaluation result (N=2600, TP=1026, FP=1574, FN=854, P=0.395, R=0.546, F=0.458)
Evaluation result (N=2370, TP=1252, FP=1118, FN=162, P=0.528, R=0.885, F=0.662)
Evaluation result (N=2681, TP=919, FP=1762, FN=779, P=0.343, R=0.541, F=0.420)
Evaluation result (N=2822, TP=1266, FP=1556, FN=980, P=0.449, R=0.564, F=0.500)
