# Get GTZAN and MusiCNN-MSD

In [1]:
import os
import tensorflow
import torch
import torchaudio
import numpy as np
import essentia.standard as es
from essentia import Pool
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(42)
###############################################################################################

# Please first specify the path of your GTZAN dataset if it is already downloaded in your system.
    # Otherwise leave 'data' or the desired path where we will download the dataset.

GTZAN_path = 'data'
#GTZAN_path = <your_path>

###############################################################################################

# TensorflowPredictMusiCNN expects mono 16kHz sample rate inputs. Resample needed
resample = es.Resample(inputSampleRate=22050, outputSampleRate=16000, quality=0)

# Download dataset from torchaudio
if not os.path.isdir(GTZAN_path):
    os.mkdir(GTZAN_path)
    train_dataset = torchaudio.datasets.GTZAN(root=GTZAN_path, download=True, subset='training')
else:
    train_dataset = torchaudio.datasets.GTZAN(root=GTZAN_path, subset='training')
val_dataset = torchaudio.datasets.GTZAN(root=GTZAN_path, subset='validation')
test_dataset = torchaudio.datasets.GTZAN(root=GTZAN_path, subset='testing')

# We download the essentia MSD MusiCNN model
if not os.path.isfile('msd-musicnn-1.pb'):
    !curl -SLO https://essentia.upf.edu/models/autotagging/msd/msd-musicnn-1.pb

class Essentia_MusiCNNMSD_GTZAN_Dataset(Dataset):
    """ The embeddings of the GTZAN dataset extracted with Essentia-Tensorflow's MusiCNN-MSD model. """
    def __init__(self, GTZAN_dataset, embeddings):
        self.GTZAN_dataset = GTZAN_dataset
        self.embeddings = embeddings
        self.GTZAN_genres = [
            "blues",
            "classical",
            "country",
            "disco",
            "hiphop",
            "jazz",
            "metal",
            "pop",
            "reggae",
            "rock",
        ]
        
    def __len__(self):
        return len(self.GTZAN_dataset)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        inputs = torch.from_numpy(self.embeddings[idx]).mean(0) #comment mean for original method
        labels = torch.tensor(self.GTZAN_genres.index(self.GTZAN_dataset[idx][2]))
        
        return inputs, labels
# We define a shallow model

class shallowClassifier(nn.Module):
    def __init__(self):
        super(shallowClassifier, self).__init__()
        self.dense1 = nn.Linear(200, 100) #change to 19*200 if commenting .mean()avobe
        self.dense2 = nn.Linear(100, 10)
        
    def forward(self,x):
        x = x.view(-1, 200) #change to 200*19 if commenting .mean() above
        x = F.relu(self.dense1(x))
        x = self.dense2(x)
        
        return x
    

# Build embeddings with GTZAN time

In [2]:
# Compute and store the embeddings for each subset
if not os.path.isfile('train_embeddings.npy'):
    i=0
    train_embeddings = []
    for track in train_dataset:
        i+=1
        if i%100==0:
            print('Processing track '+str(i)+' of '+str(len(train_dataset)))
        train_embeddings.append(es.TensorflowPredictMusiCNN(
            graphFilename='msd-musicnn-1.pb', output='model/dense/BiasAdd')(resample(track[0].numpy()[0])))
    train_embeddings = np.array(train_embeddings)
    np.save('train_embeddings.npy',train_embeddings)

    val_embeddings = []
    for track in val_dataset:
        val_embeddings.append(es.TensorflowPredictMusiCNN(
            graphFilename='msd-musicnn-1.pb', output='model/dense/BiasAdd')(resample(track[0].numpy()[0])))
    val_embeddings=np.array(val_embeddings)    
    np.save('val_embeddings.npy',val_embeddings)

    test_embeddings = []
    for track in test_dataset:
        test_embeddings.append(es.TensorflowPredictMusiCNN(
            graphFilename='msd-musicnn-1.pb', output='model/dense/BiasAdd')(resample(track[0].numpy()[0])))
    test_embeddings=np.array(test_embeddings)
    np.save('test_embeddings.npy',np.array(test_embeddings))
else:
    train_embeddings=np.load('train_embeddings.npy')
    val_embeddings=np.load('val_embeddings.npy')
    test_embeddings=np.load('test_embeddings.npy')

In [3]:
# Embedding shapes...
train_embeddings[0].shape

(19, 200)

In [4]:
# Embedding types
train_embeddings[0].dtype

dtype('float32')

In [6]:
embtrain_dataset = Essentia_MusiCNNMSD_GTZAN_Dataset(train_dataset, train_embeddings)

# Build train loop

In [5]:
def train_test(train_embeddings, val_embeddings, test_embeddings):
    # We compute the distance of the embeddings between all songs in the training set
    emb_distance = np.zeros((len(train_embeddings), len(train_embeddings)))

    for indxA, trackA in enumerate(train_embeddings):
        for indxB, trackB in enumerate(train_embeddings):
            emb_distance[indxA, indxB] = np.linalg.norm(trackA - trackB)        

    embtrain_dataset = Essentia_MusiCNNMSD_GTZAN_Dataset(train_dataset, train_embeddings)
    train_loader = torch.utils.data.DataLoader(embtrain_dataset, batch_size=16, shuffle=True, num_workers=4)

    embval_dataset = Essentia_MusiCNNMSD_GTZAN_Dataset(val_dataset, val_embeddings)
    val_loader = torch.utils.data.DataLoader(embval_dataset, batch_size=16, shuffle=False, num_workers=4)

    embtest_dataset = Essentia_MusiCNNMSD_GTZAN_Dataset(test_dataset, test_embeddings)
    test_loader = torch.utils.data.DataLoader(embtest_dataset, batch_size=16, shuffle=False, num_workers=4)

    model = shallowClassifier()

    if torch.cuda.is_available():
        model = model.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)

    num_epochs = 2000
    train_losses = torch.zeros(num_epochs)
    val_losses = torch.zeros(num_epochs)

    bestloss = 100000.0
    for epoch in range(num_epochs):
        #The train loop
        model.train()
        for inputs, labels in train_loader:
            # Send data to the GPU
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()

            # Clear gradient and forward + loss + backward
            optimizer.zero_grad()
            outputs = model(inputs) 
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_losses[epoch] += loss.item()
        train_losses[epoch] /= len(train_loader)

        model.eval()
        with torch.no_grad():
            for inputs, labels in val_loader:
                # Send data to the GPU
                if torch.cuda.is_available():
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                outputs = model(inputs) 
                loss = criterion(outputs, labels)

                val_losses[epoch] += loss.item()
            val_losses[epoch] /= len(val_loader)
            scheduler.step(val_losses[epoch])

            # If best epoch, we save parameters
            if val_losses[epoch] < bestloss :
                bestloss = val_losses[epoch]
                torch.save(model.state_dict(), 'model.pth')
                
        if epoch%100==0:
            print('Epoch '+str(epoch)+': Train Loss = '+str(train_losses[epoch].item())+'. Val Loss = '+str(val_losses[epoch].item())+'.')
    print('Best validation loss :' + str(bestloss.item()))

    # Finally we compute accuracy with the test set
    model.load_state_dict(torch.load('model.pth'));
    model.eval()
    confusion_matrix = torch.zeros(len(embtrain_dataset.GTZAN_genres), len(embtrain_dataset.GTZAN_genres))
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Send data to the GPU
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            for t, p in zip(labels.view(-1), predicted.view(-1)):
                confusion_matrix[t.long(), p.long()] += 1

    # Per-class Accuracy
    pclass_acc = confusion_matrix.diag()/confusion_matrix.sum(1)
    return torch.mean(pclass_acc).item()*100

# Train GTZAN_time

In [7]:
baseline = train_test(train_embeddings, val_embeddings, test_embeddings)

Epoch 0: Train Loss = 2.396742582321167. Val Loss = 2.302741050720215.
Epoch 100: Train Loss = 1.0398520231246948. Val Loss = 1.171513557434082.
Epoch 200: Train Loss = 0.6912970542907715. Val Loss = 0.885692298412323.
Epoch 300: Train Loss = 0.5544649362564087. Val Loss = 0.7747247219085693.
Epoch 400: Train Loss = 0.47854354977607727. Val Loss = 0.7215855717658997.
Epoch 500: Train Loss = 0.4272844195365906. Val Loss = 0.6938523054122925.
Epoch 600: Train Loss = 0.3927702009677887. Val Loss = 0.6783539056777954.
Epoch 700: Train Loss = 0.3665103316307068. Val Loss = 0.6705629825592041.
Epoch 800: Train Loss = 0.34445664286613464. Val Loss = 0.667378842830658.
Epoch 900: Train Loss = 0.3370794653892517. Val Loss = 0.6668924689292908.
Epoch 1000: Train Loss = 0.334261029958725. Val Loss = 0.6668926477432251.
Epoch 1100: Train Loss = 0.33406737446784973. Val Loss = 0.6668928861618042.
Epoch 1200: Train Loss = 0.3340103328227997. Val Loss = 0.6668930053710938.
Epoch 1300: Train Loss = 0.

In [8]:
print(baseline)

80.49517869949341


# Baseline is 80.50%

In [6]:
audio = resample(train_dataset[0][0].numpy()[0])
original = es.TensorflowPredictMusiCNN(graphFilename='msd-musicnn-1.pb', output='model/dense/BiasAdd')(audio)

In [7]:
original.shape

(19, 200)

In [8]:
def melspectrogram(audio):
    """
    From a 16kH sample, computes the mel spectrogram with the same signature as done in MelonPlaylist dataset.
    
    Input:
    audio (samples) sampled at 16kHz and between [-1,1]
    Output:(frames, 48bands)
    """    
    windowing = es.Windowing(type='hann', normalized=False, zeroPadding=0)
    spectrum = es.Spectrum()
    melbands = es.MelBands(numberBands=48,
                                   sampleRate=16000,
                                   lowFrequencyBound=0,
                                   highFrequencyBound=16000/2,
                                   inputSize=(512+0)//2+1,
                                   weighting='linear',
                                   normalize='unit_tri',
                                   warpingFormula='slaneyMel',
                                   type='power')
    amp2db = es.UnaryOperator(type='lin2db', scale=2)
    result = []
    for frame in es.FrameGenerator(audio, frameSize=512, hopSize=256,
                                   startFromZero=False):
        spectrumFrame = spectrum(windowing(frame))

        melFrame = melbands(spectrumFrame)
        result.append(amp2db(melFrame))
    return np.array(result)

In [11]:
melon_sample=melspectrogram(audio)

In [19]:
melon_sample.shape

(1877, 48)

In [17]:
oversampled= torch.nn.functional.interpolate(input=torch.from_numpy(melon_sample).unsqueeze(0).unsqueeze(0), size=[melon_sample.shape[0], melon_sample.shape[1]*2], mode='nearest').squeeze()

In [18]:
oversampled.shape

torch.Size([1877, 96])

In [20]:
def adapt_melonInput_TensorflowPredict(melon_sample, mode):
    """
    Adapts (by treating the spectrogram as an image and using Computer 
    Vision interpolation methods) the MelonPlaylist mel spectrograms to patches
    suitable for using the Essentia-Tensorflow TensorflowPredict algorithm.

    Input:
    melon_samples (frames, 48bands) dtype=np.float32
    mode: 'linear', 'nearest'  'bilinear', 'bicubic', , 'area', 'trilinear'
    Output:(batch, 187, 1, 96bands)
    """
    db2amp = es.UnaryOperator(type='db2lin', scale=2)
    if mode == 'linear':
        oversampled = np.zeros((len(melon_sample), melon_sample.shape[1]*2)).astype(np.float32)
    else:
        renormalized = np.zeros_like(melon_sample).astype(np.float32)
    for k in range(len(melon_sample)):
        if mode == 'linear':
            sample = np.log10(1 + (db2amp(melon_sample[k])*10000))
            oversampled[k,:]=np.interp(np.arange(96)/2, np.arange(48), sample)
        else:
            renormalized[k,:] = np.log10(1 + (db2amp(melon_sample[k])*10000))
    if mode != 'linear':
        renormalized = torch.from_numpy(renormalized).unsqueeze(0).unsqueeze(0)
        if mode == 'trilinear':
            oversampled=torch.nn.functional.interpolate(input=renormalized.unsqueeze(0), 
                                            size=[1,melon_sample.shape[0],melon_sample.shape[1]*2],
                                                        mode=mode).squeeze()
        else:
            oversampled=torch.nn.functional.interpolate(input=renormalized, 
                                        size=[melon_sample.shape[0],melon_sample.shape[1]*2], 
                                                        mode=mode).squeeze()
        oversampled = oversampled.numpy()
    
    # Now we cut again, but with hop size of 93 frames as in default TensorflowPredictMusiCNN
    new = np.zeros((int(len(oversampled) / 93) - 1, 187, 96)).astype(np.float32)
    for k in range(int(len(oversampled) / 93) - 1):
        new[k]=oversampled[k*93:k*93+187]
    return np.expand_dims(new, 2)

In [12]:
modelName='msd-musicnn-1.pb'
output_layer='model/dense/BiasAdd'
input_layer='model/Placeholder'
predict = es.TensorflowPredict(graphFilename=modelName,
                               inputs=[input_layer],
                               outputs=[output_layer])


In [14]:
# TensorflowPredictMusiCNN expects mono 16kHz sample rate inputs. Resample needed
# Compute and store the embeddings for each subset
if not os.path.isfile('train_embeddings_linear.npy'):
    i=0
    train_embeddings = []
    for track in train_dataset:
        i+=1
        if i%100==0:
            print('Processing track '+str(i)+' of '+str(len(train_dataset)))
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='linear'))
        output = predict(in_pool)
        train_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    train_embeddings = np.array(train_embeddings)
    np.save('train_embeddings_linear.npy', train_embeddings)
    
    val_embeddings = []
    for track in val_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='linear'))
        output = predict(in_pool)
        val_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    val_embeddings = np.array(val_embeddings)
    np.save('val_embeddings_linear.npy', val_embeddings)
    
    test_embeddings = []
    for track in test_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='linear'))
        output = predict(in_pool)
        test_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    test_embeddings = np.array(test_embeddings)
    np.save('test_embeddings_linear.npy', test_embeddings)    

else:
    train_embeddings=np.load('train_embeddings_linear.npy')
    val_embeddings=np.load('val_embeddings_linear.npy')
    test_embeddings=np.load('test_embeddings_linear.npy')

Processing track 100 of 443
Processing track 200 of 443
Processing track 300 of 443
Processing track 400 of 443


In [13]:
# Embedding shapes...
train_embeddings[0].shape

(19, 200)

In [14]:
# Embedding types
train_embeddings[0].dtype

dtype('float32')

# Re-train, now with interpolated melspecs

In [17]:
interpolated = train_test(train_embeddings, val_embeddings, test_embeddings)

Epoch 0: Train Loss = 2.3252789974212646. Val Loss = 2.2550265789031982.
Epoch 100: Train Loss = 1.4404184818267822. Val Loss = 1.5836281776428223.
Epoch 200: Train Loss = 1.0612469911575317. Val Loss = 1.2955150604248047.
Epoch 300: Train Loss = 0.8743215799331665. Val Loss = 1.1578919887542725.
Epoch 400: Train Loss = 0.7598857879638672. Val Loss = 1.0832873582839966.
Epoch 500: Train Loss = 0.6838250756263733. Val Loss = 1.0374938249588013.
Epoch 600: Train Loss = 0.6335861086845398. Val Loss = 1.0076253414154053.
Epoch 700: Train Loss = 0.5872496962547302. Val Loss = 0.9864686131477356.
Epoch 800: Train Loss = 0.5609604120254517. Val Loss = 0.9710853099822998.
Epoch 900: Train Loss = 0.535830020904541. Val Loss = 0.9588419198989868.
Epoch 1000: Train Loss = 0.5032868385314941. Val Loss = 0.9489818215370178.
Epoch 1100: Train Loss = 0.4847221374511719. Val Loss = 0.9403696656227112.
Epoch 1200: Train Loss = 0.46988943219184875. Val Loss = 0.9338710308074951.
Epoch 1300: Train Loss =

In [18]:
print(interpolated)

74.57183599472046


# Now with random embeddings

In [15]:
np.random.seed(42)

In [20]:
rand = train_test(np.reshape(5*np.random.randn(train_embeddings.size).astype(np.float32), train_embeddings.shape), 
                 np.reshape(5*np.random.randn(val_embeddings.size).astype(np.float32), val_embeddings.shape), 
                 np.reshape(5*np.random.randn(test_embeddings.size).astype(np.float32), test_embeddings.shape))

Epoch 0: Train Loss = 2.32436203956604. Val Loss = 2.3439722061157227.
Epoch 100: Train Loss = 2.2854135036468506. Val Loss = 2.339341402053833.
Epoch 200: Train Loss = 2.2608494758605957. Val Loss = 2.3376495838165283.
Epoch 300: Train Loss = 2.25998854637146. Val Loss = 2.337649345397949.
Epoch 400: Train Loss = 2.2603085041046143. Val Loss = 2.337649345397949.
Epoch 500: Train Loss = 2.260315179824829. Val Loss = 2.3376495838165283.
Epoch 600: Train Loss = 2.2611031532287598. Val Loss = 2.3376495838165283.
Epoch 700: Train Loss = 2.26021146774292. Val Loss = 2.337649345397949.
Epoch 800: Train Loss = 2.2611281871795654. Val Loss = 2.3376495838165283.
Epoch 900: Train Loss = 2.25942063331604. Val Loss = 2.33764910697937.
Epoch 1000: Train Loss = 2.260310173034668. Val Loss = 2.33764910697937.
Epoch 1100: Train Loss = 2.259648084640503. Val Loss = 2.337649345397949.
Epoch 1200: Train Loss = 2.2606582641601562. Val Loss = 2.33764910697937.
Epoch 1300: Train Loss = 2.258835554122925. Va

In [21]:
print(rand)

9.379075467586517


# Now with random musiCNN

In [16]:
class Conv_V(nn.Module):
    # vertical convolution
    def __init__(self, input_channels, output_channels, filter_shape):
        super(Conv_V, self).__init__()
        self.conv = nn.Conv2d(input_channels, output_channels, filter_shape,
                              padding=(0, filter_shape[1]//2))
        self.bn = nn.BatchNorm2d(output_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.bn(self.conv(x)))
        freq = x.size(2)
        out = nn.MaxPool2d((freq, 1), stride=(freq, 1))(x)
        out = out.squeeze(2)
        return out


class Conv_H(nn.Module):
    # horizontal convolution
    def __init__(self, input_channels, output_channels, filter_length):
        super(Conv_H, self).__init__()
        self.conv = nn.Conv1d(input_channels, output_channels, filter_length,
                              padding=filter_length//2)
        self.bn = nn.BatchNorm1d(output_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        freq = x.size(2)
        out = nn.AvgPool2d((freq, 1), stride=(freq, 1))(x)
        out = out.squeeze(2)
        out = self.relu(self.bn(self.conv(out)))
        return out

class Conv_1d(nn.Module):
    def __init__(self, input_channels, output_channels, shape=3, stride=1, pooling=2):
        super(Conv_1d, self).__init__()
        self.conv = nn.Conv1d(input_channels, output_channels, shape, stride=stride, padding=shape//2)
        self.bn = nn.BatchNorm1d(output_channels)
        self.relu = nn.ReLU()
        self.mp = nn.MaxPool1d(pooling)
    def forward(self, x):
        out = self.mp(self.relu(self.bn(self.conv(x))))
        return out
    
class Musicnn(nn.Module):
    '''
    Pons et al. 2017
    End-to-end learning for music audio tagging at scale.
    This is the updated implementation of the original paper. Referred to the Musicnn code.
    https://github.com/jordipons/musicnn
    '''
    def __init__(self,
                sample_rate=16000,
                n_fft=512,
                f_min=0.0,
                f_max=8000.0,
                n_mels=96,
                n_class=50,
                dataset='mtat'):
        super(Musicnn, self).__init__()

        # Spectrogram
        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
                                                         n_fft=n_fft,
                                                         f_min=f_min,
                                                         f_max=f_max,
                                                         n_mels=n_mels)
        self.to_db = torchaudio.transforms.AmplitudeToDB()
        self.spec_bn = nn.BatchNorm2d(1)

        # Pons front-end
        m1 = Conv_V(1, 204, (int(0.7*96), 7))
        m2 = Conv_V(1, 204, (int(0.4*96), 7))
        m3 = Conv_H(1, 51, 129)
        m4 = Conv_H(1, 51, 65)
        m5 = Conv_H(1, 51, 33)
        self.layers = nn.ModuleList([m1, m2, m3, m4, m5])

        # Pons back-end
        backend_channel= 512 if dataset=='msd' else 64
        self.layer1 = Conv_1d(561, backend_channel, 7, 1, 1)
        self.layer2 = Conv_1d(backend_channel, backend_channel, 7, 1, 1)
        self.layer3 = Conv_1d(backend_channel, backend_channel, 7, 1, 1)

        # Dense
        dense_channel = 500 if dataset=='msd' else 200
        self.dense1 = nn.Linear((561+(backend_channel*3))*2, dense_channel)
        self.bn = nn.BatchNorm1d(dense_channel)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.dense2 = nn.Linear(dense_channel, n_class)

    def forward(self, x):
        # Spectrogram
        x = self.spec(x)
        x = self.to_db(x)
        x = x.unsqueeze(1)
        x = self.spec_bn(x)

        # Pons front-end
        out = []
        for layer in self.layers:
            out.append(layer(x))
        out = torch.cat(out, dim=1)

        # Pons back-end
        length = out.size(2)
        res1 = self.layer1(out)
        res2 = self.layer2(res1) + res1
        res3 = self.layer3(res2) + res2
        out = torch.cat([out, res1, res2, res3], 1)

        mp = nn.MaxPool1d(length)(out)
        avgp = nn.AvgPool1d(length)(out)
        out = torch.cat([mp, avgp], dim=1)
        out = out.squeeze(2)

        out = self.relu(self.bn(self.dense1(out)))
        out = self.dropout(out)
        #out = self.dense2(out)
        #out = nn.Sigmoid()(out)

        return out
rand_musiCNN = Musicnn()
rand_musiCNN.eval();

In [23]:
# Compute and store the embeddings for each subset
if not os.path.isfile('train_embeddings_random.npy'):
    i=0
    train_embeddings = []
    for track in train_dataset:
        i+=1
        if i%100==0:
            print('Processing track '+str(i)+' of '+str(len(train_dataset)))
        train_embeddings.append(rand_musiCNN(torch.from_numpy(resample(track[0].numpy()[0])).unsqueeze(0)).detach().numpy())
    train_embeddings = np.array(train_embeddings)
    np.save('train_embeddings_random.npy', train_embeddings)
    
    val_embeddings = []
    for track in val_dataset:
        val_embeddings.append(rand_musiCNN(torch.from_numpy(resample(track[0].numpy()[0])).unsqueeze(0)).detach().numpy())
    val_embeddings = np.array(val_embeddings)
    np.save('val_embeddings_random.npy', val_embeddings)
    
    test_embeddings = []
    for track in test_dataset:
        test_embeddings.append(rand_musiCNN(torch.from_numpy(resample(track[0].numpy()[0])).unsqueeze(0)).detach().numpy())
    test_embeddings = np.array(test_embeddings)
    np.save('test_embeddings_random.npy', test_embeddings)    

else:
    train_embeddings=np.load('train_embeddings_random.npy')
    val_embeddings=np.load('val_embeddings_random.npy')
    test_embeddings=np.load('test_embeddings_random.npy')

In [24]:
random_net = train_test(train_embeddings, val_embeddings, test_embeddings)

Epoch 0: Train Loss = 3.1070022583007812. Val Loss = 2.6462361812591553.
Epoch 100: Train Loss = 1.8885945081710815. Val Loss = 2.019928216934204.
Epoch 200: Train Loss = 1.7573341131210327. Val Loss = 1.966556429862976.
Epoch 300: Train Loss = 1.6742095947265625. Val Loss = 1.937293291091919.
Epoch 400: Train Loss = 1.6005276441574097. Val Loss = 1.910744547843933.
Epoch 500: Train Loss = 1.5343894958496094. Val Loss = 1.8919563293457031.
Epoch 600: Train Loss = 1.506278157234192. Val Loss = 1.8848636150360107.
Epoch 700: Train Loss = 1.5050195455551147. Val Loss = 1.8848612308502197.
Epoch 800: Train Loss = 1.5083385705947876. Val Loss = 1.8848590850830078.
Epoch 900: Train Loss = 1.5047283172607422. Val Loss = 1.88485586643219.
Epoch 1000: Train Loss = 1.5083122253417969. Val Loss = 1.8848528861999512.
Epoch 1100: Train Loss = 1.505696415901184. Val Loss = 1.884851336479187.
Epoch 1200: Train Loss = 1.5050700902938843. Val Loss = 1.8848495483398438.
Epoch 1300: Train Loss = 1.513064

In [25]:
random_net

40.58379530906677

# We repeat but with image reshaping (torch) methods

In [27]:
#'nearest'  'bilinear', 'bicubic', , 'area', 'trilinear'
if not os.path.isfile('train_embeddings_nearest.npy'):
    i=0
    train_embeddings = []
    for track in train_dataset:
        i+=1
        if i%100==0:
            print('Processing track '+str(i)+' of '+str(len(train_dataset)))
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='nearest'))
        output = predict(in_pool)
        train_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    train_embeddings = np.array(train_embeddings)
    np.save('train_embeddings_nearest.npy', train_embeddings)
    
    val_embeddings = []
    for track in val_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='nearest'))
        output = predict(in_pool)
        val_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    val_embeddings = np.array(val_embeddings)
    np.save('val_embeddings_nearest.npy', val_embeddings)
    
    test_embeddings = []
    for track in test_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='nearest'))
        output = predict(in_pool)
        test_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    test_embeddings = np.array(test_embeddings)
    np.save('test_embeddings_nearest.npy', test_embeddings)    

else:
    train_embeddings=np.load('train_embeddings_nearest.npy')
    val_embeddings=np.load('val_embeddings_nearest.npy')
    test_embeddings=np.load('test_embeddings_nearest.npy')

Processing track 100 of 443
Processing track 200 of 443
Processing track 300 of 443
Processing track 400 of 443


In [28]:
nearest_net = train_test(train_embeddings, val_embeddings, test_embeddings)

Epoch 0: Train Loss = 2.481893301010132. Val Loss = 2.4405312538146973.
Epoch 100: Train Loss = 1.4496593475341797. Val Loss = 1.5998427867889404.
Epoch 200: Train Loss = 1.099965214729309. Val Loss = 1.320712924003601.
Epoch 300: Train Loss = 0.9284553527832031. Val Loss = 1.1870777606964111.
Epoch 400: Train Loss = 0.824478268623352. Val Loss = 1.105027675628662.
Epoch 500: Train Loss = 0.7539178133010864. Val Loss = 1.0494338274002075.
Epoch 600: Train Loss = 0.7045766115188599. Val Loss = 1.0080229043960571.
Epoch 700: Train Loss = 0.6616665124893188. Val Loss = 0.9768028259277344.
Epoch 800: Train Loss = 0.6306905746459961. Val Loss = 0.9525780081748962.
Epoch 900: Train Loss = 0.5976877212524414. Val Loss = 0.9338906407356262.
Epoch 1000: Train Loss = 0.5823160409927368. Val Loss = 0.9181756973266602.
Epoch 1100: Train Loss = 0.552892804145813. Val Loss = 0.9049414396286011.
Epoch 1200: Train Loss = 0.5392934679985046. Val Loss = 0.8934511542320251.
Epoch 1300: Train Loss = 0.525

In [29]:
nearest_net

77.28104591369629

In [30]:
#'nearest'  'bilinear', 'bicubic', , 'area', 'trilinear'
if not os.path.isfile('train_embeddings_bilinear.npy'):
    i=0
    train_embeddings = []
    for track in train_dataset:
        i+=1
        if i%100==0:
            print('Processing track '+str(i)+' of '+str(len(train_dataset)))
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='bilinear'))
        output = predict(in_pool)
        train_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    train_embeddings = np.array(train_embeddings)
    np.save('train_embeddings_bilinear.npy', train_embeddings)
    
    val_embeddings = []
    for track in val_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='bilinear'))
        output = predict(in_pool)
        val_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    val_embeddings = np.array(val_embeddings)
    np.save('val_embeddings_bilinear.npy', val_embeddings)
    
    test_embeddings = []
    for track in test_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='bilinear'))
        output = predict(in_pool)
        test_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    test_embeddings = np.array(test_embeddings)
    np.save('test_embeddings_bilinear.npy', test_embeddings)    

else:
    train_embeddings=np.load('train_embeddings_bilinear.npy')
    val_embeddings=np.load('val_embeddings_bilinear.npy')
    test_embeddings=np.load('test_embeddings_bilinear.npy')

  "See the documentation of nn.Upsample for details.".format(mode)


Processing track 100 of 443
Processing track 200 of 443
Processing track 300 of 443
Processing track 400 of 443


In [31]:
bilinear_net = train_test(train_embeddings, val_embeddings, test_embeddings)

Epoch 0: Train Loss = 2.740442991256714. Val Loss = 2.613727569580078.
Epoch 100: Train Loss = 1.6834008693695068. Val Loss = 1.8165361881256104.
Epoch 200: Train Loss = 1.3514814376831055. Val Loss = 1.5413243770599365.
Epoch 300: Train Loss = 1.145402193069458. Val Loss = 1.38668954372406.
Epoch 400: Train Loss = 1.0153820514678955. Val Loss = 1.2937651872634888.
Epoch 500: Train Loss = 0.921638548374176. Val Loss = 1.23270583152771.
Epoch 600: Train Loss = 0.8532682061195374. Val Loss = 1.1908600330352783.
Epoch 700: Train Loss = 0.7976685166358948. Val Loss = 1.1608574390411377.
Epoch 800: Train Loss = 0.7582550048828125. Val Loss = 1.1391427516937256.
Epoch 900: Train Loss = 0.7223774194717407. Val Loss = 1.1221624612808228.
Epoch 1000: Train Loss = 0.6852974891662598. Val Loss = 1.1098588705062866.
Epoch 1100: Train Loss = 0.662321150302887. Val Loss = 1.0993478298187256.
Epoch 1200: Train Loss = 0.6405248641967773. Val Loss = 1.0926884412765503.
Epoch 1300: Train Loss = 0.640422

In [32]:
bilinear_net

69.29597854614258

In [33]:
#'nearest'  'bilinear', 'bicubic', , 'area', 'trilinear'
if not os.path.isfile('train_embeddings_bicubic.npy'):
    i=0
    train_embeddings = []
    for track in train_dataset:
        i+=1
        if i%100==0:
            print('Processing track '+str(i)+' of '+str(len(train_dataset)))
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='bicubic'))
        output = predict(in_pool)
        train_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    train_embeddings = np.array(train_embeddings)
    np.save('train_embeddings_bicubic.npy', train_embeddings)
    
    val_embeddings = []
    for track in val_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='bicubic'))
        output = predict(in_pool)
        val_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    val_embeddings = np.array(val_embeddings)
    np.save('val_embeddings_bicubic.npy', val_embeddings)
    
    test_embeddings = []
    for track in test_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='bicubic'))
        output = predict(in_pool)
        test_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    test_embeddings = np.array(test_embeddings)
    np.save('test_embeddings_bicubic.npy', test_embeddings)    

else:
    train_embeddings=np.load('train_embeddings_bicubic.npy')
    val_embeddings=np.load('val_embeddings_bicubic.npy')
    test_embeddings=np.load('test_embeddings_bicubic.npy')

  "See the documentation of nn.Upsample for details.".format(mode)


Processing track 100 of 443
Processing track 200 of 443
Processing track 300 of 443
Processing track 400 of 443


In [34]:
bicubic_net = train_test(train_embeddings, val_embeddings, test_embeddings)

Epoch 0: Train Loss = 2.4317805767059326. Val Loss = 2.4642066955566406.
Epoch 100: Train Loss = 1.5333970785140991. Val Loss = 1.6570769548416138.
Epoch 200: Train Loss = 1.162260890007019. Val Loss = 1.3547444343566895.
Epoch 300: Train Loss = 0.9632114171981812. Val Loss = 1.2005283832550049.
Epoch 400: Train Loss = 0.8469470143318176. Val Loss = 1.114084005355835.
Epoch 500: Train Loss = 0.7700648307800293. Val Loss = 1.0595790147781372.
Epoch 600: Train Loss = 0.7038282752037048. Val Loss = 1.0229744911193848.
Epoch 700: Train Loss = 0.6673607230186462. Val Loss = 0.99671870470047.
Epoch 800: Train Loss = 0.629788875579834. Val Loss = 0.9775038957595825.
Epoch 900: Train Loss = 0.5979045033454895. Val Loss = 0.9635715484619141.
Epoch 1000: Train Loss = 0.5765382647514343. Val Loss = 0.9528799653053284.
Epoch 1100: Train Loss = 0.5535461902618408. Val Loss = 0.9432661533355713.
Epoch 1200: Train Loss = 0.5331271886825562. Val Loss = 0.9378153085708618.
Epoch 1300: Train Loss = 0.53

In [35]:
bicubic_net

72.82840013504028

In [36]:
#'nearest'  'bilinear', 'bicubic', , 'area', 'trilinear'
if not os.path.isfile('train_embeddings_area.npy'):
    i=0
    train_embeddings = []
    for track in train_dataset:
        i+=1
        if i%100==0:
            print('Processing track '+str(i)+' of '+str(len(train_dataset)))
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='area'))
        output = predict(in_pool)
        train_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    train_embeddings = np.array(train_embeddings)
    np.save('train_embeddings_area.npy', train_embeddings)
    
    val_embeddings = []
    for track in val_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='area'))
        output = predict(in_pool)
        val_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    val_embeddings = np.array(val_embeddings)
    np.save('val_embeddings_area.npy', val_embeddings)
    
    test_embeddings = []
    for track in test_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='area'))
        output = predict(in_pool)
        test_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    test_embeddings = np.array(test_embeddings)
    np.save('test_embeddings_area.npy', test_embeddings)    

else:
    train_embeddings=np.load('train_embeddings_area.npy')
    val_embeddings=np.load('val_embeddings_area.npy')
    test_embeddings=np.load('test_embeddings_area.npy')

Processing track 100 of 443
Processing track 200 of 443
Processing track 300 of 443
Processing track 400 of 443


In [37]:
area_net = train_test(train_embeddings, val_embeddings, test_embeddings)

Epoch 0: Train Loss = 2.5877630710601807. Val Loss = 2.5556771755218506.
Epoch 100: Train Loss = 1.540202260017395. Val Loss = 1.6914875507354736.
Epoch 200: Train Loss = 1.1952879428863525. Val Loss = 1.4031914472579956.
Epoch 300: Train Loss = 1.0138425827026367. Val Loss = 1.2572230100631714.
Epoch 400: Train Loss = 0.9012598991394043. Val Loss = 1.1692289113998413.
Epoch 500: Train Loss = 0.8224290609359741. Val Loss = 1.111919641494751.
Epoch 600: Train Loss = 0.7670912742614746. Val Loss = 1.071157455444336.
Epoch 700: Train Loss = 0.7237657308578491. Val Loss = 1.0406895875930786.
Epoch 800: Train Loss = 0.6879711151123047. Val Loss = 1.017800211906433.
Epoch 900: Train Loss = 0.6537861824035645. Val Loss = 0.9994897842407227.
Epoch 1000: Train Loss = 0.6328492164611816. Val Loss = 0.9851677417755127.
Epoch 1100: Train Loss = 0.6084261536598206. Val Loss = 0.974022626876831.
Epoch 1200: Train Loss = 0.5936712622642517. Val Loss = 0.9675305485725403.
Epoch 1300: Train Loss = 0.59

In [38]:
area_net

75.03759264945984

In [39]:
#'nearest'  'bilinear', 'bicubic', , 'area', 'trilinear'
if not os.path.isfile('train_embeddings_trilinear.npy'):
    i=0
    train_embeddings = []
    for track in train_dataset:
        i+=1
        if i%100==0:
            print('Processing track '+str(i)+' of '+str(len(train_dataset)))
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='trilinear'))
        output = predict(in_pool)
        train_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    train_embeddings = np.array(train_embeddings)
    np.save('train_embeddings_trilinear.npy', train_embeddings)
    
    val_embeddings = []
    for track in val_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='trilinear'))
        output = predict(in_pool)
        val_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    val_embeddings = np.array(val_embeddings)
    np.save('val_embeddings_trilinear.npy', val_embeddings)
    
    test_embeddings = []
    for track in test_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(melspectrogram(resample(track[0].numpy()[0])),mode='trilinear'))
        output = predict(in_pool)
        test_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    test_embeddings = np.array(test_embeddings)
    np.save('test_embeddings_trilinear.npy', test_embeddings)    

else:
    train_embeddings=np.load('train_embeddings_trilinear.npy')
    val_embeddings=np.load('val_embeddings_trilinear.npy')
    test_embeddings=np.load('test_embeddings_trilinear.npy')

  "See the documentation of nn.Upsample for details.".format(mode)


Processing track 100 of 443
Processing track 200 of 443
Processing track 300 of 443
Processing track 400 of 443


In [40]:
trilinear_net = train_test(train_embeddings, val_embeddings, test_embeddings)

Epoch 0: Train Loss = 2.307088851928711. Val Loss = 2.2478086948394775.
Epoch 100: Train Loss = 1.5482195615768433. Val Loss = 1.6184629201889038.
Epoch 200: Train Loss = 1.2306087017059326. Val Loss = 1.381377100944519.
Epoch 300: Train Loss = 1.05232572555542. Val Loss = 1.259857416152954.
Epoch 400: Train Loss = 0.943603515625. Val Loss = 1.187566876411438.
Epoch 500: Train Loss = 0.8589625358581543. Val Loss = 1.1405445337295532.
Epoch 600: Train Loss = 0.7986156344413757. Val Loss = 1.106937050819397.
Epoch 700: Train Loss = 0.754830539226532. Val Loss = 1.0840868949890137.
Epoch 800: Train Loss = 0.7156203985214233. Val Loss = 1.0678960084915161.
Epoch 900: Train Loss = 0.681488573551178. Val Loss = 1.0560524463653564.
Epoch 1000: Train Loss = 0.652154266834259. Val Loss = 1.046676516532898.
Epoch 1100: Train Loss = 0.6399688124656677. Val Loss = 1.0439099073410034.
Epoch 1200: Train Loss = 0.6442651152610779. Val Loss = 1.043907642364502.
Epoch 1300: Train Loss = 0.6395034193992

In [41]:
trilinear_net

70.40250897407532

# Now sonifying (inverting the Mel spectograms)

In [17]:
import librosa

In [45]:
# Compute and store the embeddings for each subset
if not os.path.isfile('train_embeddings_sonify.npy'):
    i=0
    train_embeddings = []
    for track in train_dataset:
        i+=1
        print('Processing track '+str(i)+' of '+str(len(train_dataset)))
        #We de-normalize the melonstuff
        ess=melspectrogram(resample(track[0].numpy()[0]))
        db2amp = es.UnaryOperator(type='db2lin', scale=2)
        for k in range(len(ess)):
            ess[k] = db2amp(ess[k])
        sonified = librosa.feature.inverse.mel_to_audio(M=ess.T, sr=16000, n_fft=512, hop_length=256, center=True, norm='slaney', htk=False)
        train_embeddings.append(es.TensorflowPredictMusiCNN(
            graphFilename='msd-musicnn-1.pb', output='model/dense/BiasAdd')(sonified))
    train_embeddings = np.array(train_embeddings)
    np.save('train_embeddings_sonify.npy',train_embeddings)
    i=0
    val_embeddings = []
    for track in val_dataset:
        i+=1
        print('Processing track '+str(i)+' of '+str(len(val_dataset)))
        #We de-normalize the melonstuff
        ess=melspectrogram(resample(track[0].numpy()[0]))
        db2amp = es.UnaryOperator(type='db2lin', scale=2)
        for k in range(len(ess)):
            ess[k] = db2amp(ess[k])
        sonified = librosa.feature.inverse.mel_to_audio(M=ess.T, sr=16000, n_fft=512, hop_length=256, center=True, norm='slaney', htk=False)
        val_embeddings.append(es.TensorflowPredictMusiCNN(
            graphFilename='msd-musicnn-1.pb', output='model/dense/BiasAdd')(sonified))
    val_embeddings = np.array(val_embeddings)
    np.save('val_embeddings_sonify.npy',train_embeddings)
    i=0                                
    test_embeddings = []
    for track in test_dataset:
        i+=1
        print('Processing track '+str(i)+' of '+str(len(test_dataset)))
        #We de-normalize the melonstuff
        ess=melspectrogram(resample(track[0].numpy()[0]))
        db2amp = es.UnaryOperator(type='db2lin', scale=2)
        for k in range(len(ess)):
            ess[k] = db2amp(ess[k])
        sonified = librosa.feature.inverse.mel_to_audio(M=ess.T, sr=16000, n_fft=512, hop_length=256, center=True, norm='slaney', htk=False)
        test_embeddings.append(es.TensorflowPredictMusiCNN(
            graphFilename='msd-musicnn-1.pb', output='model/dense/BiasAdd')(sonified))
    test_embeddings = np.array(test_embeddings)
    np.save('test_embeddings_sonify.npy',test_embeddings)


else:
    train_embeddings=np.load('train_embeddings_sonify.npy')
    val_embeddings=np.load('val_embeddings_sonify.npy')
    test_embeddings=np.load('test_embeddings_sonify.npy')

In [47]:
sonify_net = train_test(train_embeddings, val_embeddings, test_embeddings)

Epoch 0: Train Loss = 2.4772136211395264. Val Loss = 2.598193883895874.
Epoch 100: Train Loss = 1.9324365854263306. Val Loss = 2.5817947387695312.
Epoch 200: Train Loss = 1.9332129955291748. Val Loss = 2.5817973613739014.
Epoch 300: Train Loss = 1.932881236076355. Val Loss = 2.5817999839782715.
Epoch 400: Train Loss = 1.9336708784103394. Val Loss = 2.581803321838379.
Epoch 500: Train Loss = 1.9348188638687134. Val Loss = 2.581805944442749.
Epoch 600: Train Loss = 1.9329769611358643. Val Loss = 2.5818088054656982.
Epoch 700: Train Loss = 1.9353917837142944. Val Loss = 2.5818114280700684.
Epoch 800: Train Loss = 1.934230089187622. Val Loss = 2.5818142890930176.
Epoch 900: Train Loss = 1.9341288805007935. Val Loss = 2.581817150115967.
Epoch 1000: Train Loss = 1.9353611469268799. Val Loss = 2.581820011138916.
Epoch 1100: Train Loss = 1.9351215362548828. Val Loss = 2.5818228721618652.
Epoch 1200: Train Loss = 1.9320887327194214. Val Loss = 2.5818252563476562.
Epoch 1300: Train Loss = 1.9326

In [44]:
sonify_net

18.74985694885254

# Repeat BILINEAR but with align-corners True

In [18]:
def adapt_melonInput_TensorflowPredict_aligned(melon_sample, mode):
    """
    Adapts (by treating the spectrogram as an image and using Computer 
    Vision interpolation methods) the MelonPlaylist mel spectrograms to patches
    suitable for using the Essentia-Tensorflow TensorflowPredict algorithm.

    Input:
    melon_samples (frames, 48bands)
    mode: 'linear', 'nearest'  'bilinear', 'bicubic', , 'area', 'trilinear'
    Output:(batch, 187, 1, 96bands)
    """
    db2amp = es.UnaryOperator(type='db2lin', scale=2)
    if mode == 'linear':
        oversampled = np.zeros((len(melon_sample), melon_sample.shape[1]*2)).astype(np.float32)
    else:
        renormalized = np.zeros_like(melon_sample).astype(np.float32)
    for k in range(len(melon_sample)):
        if mode == 'linear':
            sample = np.log10(1 + (db2amp(melon_sample[k])*10000))
            oversampled[k,:]=np.interp(np.arange(96)/2, np.arange(48), sample)
        else:
            renormalized[k,:] = np.log10(1 + (db2amp(melon_sample[k])*10000))
    if mode != 'linear':
        renormalized = torch.from_numpy(renormalized).unsqueeze(0).unsqueeze(0)
        if mode == 'trilinear':
            oversampled=torch.nn.functional.interpolate(input=renormalized.unsqueeze(0), 
                                            size=[1,melon_sample.shape[0],melon_sample.shape[1]*2], 
                                                        mode=mode).squeeze()
        else:
            oversampled=torch.nn.functional.interpolate(input=renormalized, 
                                        size=[melon_sample.shape[0],melon_sample.shape[1]*2], 
                                                        mode=mode, align_corners=True).squeeze()
        oversampled = oversampled.numpy()
    
    # Now we cut again, but with hop size of 93 frames as in default TensorflowPredictMusiCNN
    new = np.zeros((int(len(oversampled) / 93) - 1, 187, 96)).astype(np.float32)
    for k in range(int(len(oversampled) / 93) - 1):
        new[k]=oversampled[k*93:k*93+187]
    return np.expand_dims(new, 2)

In [24]:
#'nearest'  'bilinear', 'bicubic', , 'area', 'trilinear'
if not os.path.isfile('train_embeddings_bicubic_aligned.npy'):
    i=0
    train_embeddings = []
    for track in train_dataset:
        i+=1
        if i%100==0:
            print('Processing track '+str(i)+' of '+str(len(train_dataset)))
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict_aligned(melspectrogram(resample(track[0].numpy()[0])),mode='bicubic'))
        output = predict(in_pool)
        train_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    train_embeddings = np.array(train_embeddings)
    np.save('train_embeddings_bicubic_aligned.npy', train_embeddings)
    
    val_embeddings = []
    for track in val_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict_aligned(melspectrogram(resample(track[0].numpy()[0])),mode='bicubic'))
        output = predict(in_pool)
        val_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    val_embeddings = np.array(val_embeddings)
    np.save('val_embeddings_bicubic_aligned.npy', val_embeddings)
    
    test_embeddings = []
    for track in test_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict_aligned(melspectrogram(resample(track[0].numpy()[0])),mode='bicubic'))
        output = predict(in_pool)
        test_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    test_embeddings = np.array(test_embeddings)
    np.save('test_embeddings_bicubic_aligned.npy', test_embeddings)    

else:
    train_embeddings=np.load('train_embeddings_bicubic_aligned.npy')
    val_embeddings=np.load('val_embeddings_bicubic_aligned.npy')
    test_embeddings=np.load('test_embeddings_bicubic_aligned.npy')

In [25]:
bicubic_aligned_net = train_test(train_embeddings, val_embeddings, test_embeddings)

Epoch 0: Train Loss = 2.3671836853027344. Val Loss = 2.415376901626587.
Epoch 100: Train Loss = 1.4830845594406128. Val Loss = 1.652207851409912.
Epoch 200: Train Loss = 1.1473748683929443. Val Loss = 1.3631185293197632.
Epoch 300: Train Loss = 0.9641218185424805. Val Loss = 1.2145800590515137.
Epoch 400: Train Loss = 0.8493464589118958. Val Loss = 1.1284267902374268.
Epoch 500: Train Loss = 0.7706289887428284. Val Loss = 1.0719891786575317.
Epoch 600: Train Loss = 0.711894690990448. Val Loss = 1.0329068899154663.
Epoch 700: Train Loss = 0.6667733788490295. Val Loss = 1.0051604509353638.
Epoch 800: Train Loss = 0.6325655579566956. Val Loss = 0.9841317534446716.
Epoch 900: Train Loss = 0.6035242676734924. Val Loss = 0.9692971706390381.
Epoch 1000: Train Loss = 0.5758257508277893. Val Loss = 0.9578670263290405.
Epoch 1100: Train Loss = 0.5529043078422546. Val Loss = 0.947456955909729.
Epoch 1200: Train Loss = 0.5486071705818176. Val Loss = 0.9470803141593933.
Epoch 1300: Train Loss = 0.5

In [26]:
bicubic_aligned_net

74.23251271247864

# Try going back to STFT, then to new mel with librosa

In [20]:
def adapt_melonInput_TensorflowPredict_STFT(ess):
    """
    Adapts (by treating the spectrogram as an image and using Computer 
    Vision interpolation methods) the MelonPlaylist mel spectrograms to patches
    suitable for using the Essentia-Tensorflow TensorflowPredict algorithm.

    Input:
    melon_samples (frames, 48bands)
    mode: 'linear', 'nearest'  'bilinear', 'bicubic', , 'area', 'trilinear'
    Output:(batch, 187, 1, 96bands)
    """
    db2amp = es.UnaryOperator(type='db2lin', scale=2)
    for k in range(len(ess)):
        ess[k] = db2amp(ess[k])
    stft = librosa.feature.inverse.mel_to_stft(M=ess.T, sr=16000, n_fft=512,norm='slaney', htk=False)
    libro = librosa.feature.melspectrogram(sr=16000, S=stft, n_fft=512, hop_length=256, n_mels=96, norm='slaney', htk=False).T
    libro = np.log10(1+libro*10000)
    new = np.zeros((int(len(libro) / 93) - 1, 187, 96)).astype(np.float32)
    for k in range(int(len(libro) / 93) - 1):
        new[k]=libro[k*93:k*93+187]
    new = np.expand_dims(new, 2)
    return new

In [21]:
#'nearest'  'bilinear', 'bicubic', , 'area', 'trilinear'
if not os.path.isfile('train_embeddings_stft.npy'):
    i=0
    train_embeddings = []
    for track in train_dataset:
        i+=1
        print('Processing track '+str(i)+' of '+str(len(train_dataset)))
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict_STFT(melspectrogram(resample(track[0].numpy()[0]))))
        output = predict(in_pool)
        train_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    train_embeddings = np.array(train_embeddings)
    np.save('train_embeddings_stft.npy', train_embeddings)
    
    val_embeddings = []
    for track in val_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict_STFT(melspectrogram(resample(track[0].numpy()[0]))))
        output = predict(in_pool)
        val_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    val_embeddings = np.array(val_embeddings)
    np.save('val_embeddings_stft.npy', val_embeddings)
    
    test_embeddings = []
    for track in test_dataset:
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict_STFT(melspectrogram(resample(track[0].numpy()[0]))))
        output = predict(in_pool)
        test_embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    test_embeddings = np.array(test_embeddings)
    np.save('test_embeddings_stft.npy', test_embeddings)    

else:
    train_embeddings=np.load('train_embeddings_stft.npy')
    val_embeddings=np.load('val_embeddings_stft.npy')
    test_embeddings=np.load('test_embeddings_stft.npy')

Processing track 1 of 443
Processing track 2 of 443
Processing track 3 of 443
Processing track 4 of 443
Processing track 5 of 443
Processing track 6 of 443
Processing track 7 of 443
Processing track 8 of 443
Processing track 9 of 443
Processing track 10 of 443
Processing track 11 of 443
Processing track 12 of 443
Processing track 13 of 443
Processing track 14 of 443
Processing track 15 of 443
Processing track 16 of 443
Processing track 17 of 443
Processing track 18 of 443
Processing track 19 of 443
Processing track 20 of 443
Processing track 21 of 443
Processing track 22 of 443
Processing track 23 of 443
Processing track 24 of 443
Processing track 25 of 443
Processing track 26 of 443
Processing track 27 of 443
Processing track 28 of 443
Processing track 29 of 443
Processing track 30 of 443
Processing track 31 of 443
Processing track 32 of 443
Processing track 33 of 443
Processing track 34 of 443
Processing track 35 of 443
Processing track 36 of 443
Processing track 37 of 443
Processing

Processing track 298 of 443
Processing track 299 of 443
Processing track 300 of 443
Processing track 301 of 443
Processing track 302 of 443
Processing track 303 of 443
Processing track 304 of 443
Processing track 305 of 443
Processing track 306 of 443
Processing track 307 of 443
Processing track 308 of 443
Processing track 309 of 443
Processing track 310 of 443
Processing track 311 of 443
Processing track 312 of 443
Processing track 313 of 443
Processing track 314 of 443
Processing track 315 of 443
Processing track 316 of 443
Processing track 317 of 443
Processing track 318 of 443
Processing track 319 of 443
Processing track 320 of 443
Processing track 321 of 443
Processing track 322 of 443
Processing track 323 of 443
Processing track 324 of 443
Processing track 325 of 443
Processing track 326 of 443
Processing track 327 of 443
Processing track 328 of 443
Processing track 329 of 443
Processing track 330 of 443
Processing track 331 of 443
Processing track 332 of 443
Processing track 333

In [22]:
stft_net = train_test(train_embeddings, val_embeddings, test_embeddings)

Epoch 0: Train Loss = 2.4730193614959717. Val Loss = 2.4873480796813965.
Epoch 100: Train Loss = 1.8230901956558228. Val Loss = 1.9641129970550537.
Epoch 200: Train Loss = 1.5462123155593872. Val Loss = 1.7267212867736816.
Epoch 300: Train Loss = 1.3712682723999023. Val Loss = 1.586881160736084.
Epoch 400: Train Loss = 1.2451542615890503. Val Loss = 1.4988665580749512.
Epoch 500: Train Loss = 1.160767674446106. Val Loss = 1.4396755695343018.
Epoch 600: Train Loss = 1.0896888971328735. Val Loss = 1.39529287815094.
Epoch 700: Train Loss = 1.0339666604995728. Val Loss = 1.361782193183899.
Epoch 800: Train Loss = 0.9949813485145569. Val Loss = 1.3371638059616089.
Epoch 900: Train Loss = 0.9583620429039001. Val Loss = 1.316237211227417.
Epoch 1000: Train Loss = 0.9327408075332642. Val Loss = 1.30291748046875.
Epoch 1100: Train Loss = 0.9289027452468872. Val Loss = 1.3029148578643799.
Epoch 1200: Train Loss = 0.9297040104866028. Val Loss = 1.3029118776321411.
Epoch 1300: Train Loss = 0.93373

In [23]:
stft_net

63.41491937637329

| Model        | Loss           | GTZAN accuracy  |
| ------------- |:-------------:| -----:|
| Random embeddings      | 2.31 | 9.37% |
| Random musiCNN      | 1.88 | 40,58% |
| musiCNN waveform      |    0.67   |   80.50% |
| musiCNN linear interpolation | 0.92      |    74.57% |
| <strong>musiCNN nearest interpolation</strong> | <strong>0.87 </strong>     |   <strong> 77.28% </strong>|
| musiCNN bilinear interpolation | 1.09     |    69.30% |
| musiCNN bicubic interpolation |   0.94   |    72.83% |
| musiCNN bicubic aligned_corners |   0.95   |    74.23% |
| musiCNN area interpolation |      0.97 |    75.04% |
| musiCNN trilinear interpolation |  1.04     |    70.40% |
| musiCNN MEL_to_audio librosa |  2.58     |    18.75% |
| musiCNN MEL_to_STFT librosa |    1.30   |    63.41% |