# Get GTZAN and MusiCNN-MSD

In [1]:
import os
import tensorflow
import torch
import torchaudio
import numpy as np
import essentia.standard as es
from essentia import Pool
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(42)
###############################################################################################

# If the GTZAN dataset is already downloaded on your system, set GTZAN_path to its location.
# Otherwise keep 'data' (or choose another path) and the dataset will be downloaded there.

GTZAN_path = 'data'
#GTZAN_path = <your_path>

###############################################################################################

# Load the three GTZAN subsets through torchaudio. The archive is only
# requested when GTZAN_path does not exist yet; an existing directory is
# taken to already contain the dataset.
needs_download = not os.path.isdir(GTZAN_path)
if needs_download:
    os.mkdir(GTZAN_path)

train_dataset = torchaudio.datasets.GTZAN(
    root=GTZAN_path, download=needs_download, subset='training')
val_dataset, test_dataset = (
    torchaudio.datasets.GTZAN(root=GTZAN_path, subset=split)
    for split in ('validation', 'testing')
)

# Fetch the Essentia MSD-MusiCNN frozen graph into the working directory,
# unless a file with that name is already present.
# NOTE: the `!curl` line is IPython shell magic, so this cell only works when
# executed from a notebook / IPython session, not as a plain Python script.
if not os.path.isfile('msd-musicnn-1.pb'):
    !curl -SLO https://essentia.upf.edu/models/autotagging/msd/msd-musicnn-1.pb

class Essentia_MusiCNNMSD_GTZAN_Dataset(Dataset):
    """Pairs precomputed MusiCNN-MSD embeddings with GTZAN genre labels.

    Args:
        GTZAN_dataset: indexable collection whose items expose the genre name
            at position 2 (only that field is read here).
        embeddings: indexable collection of NumPy arrays, aligned by index
            with ``GTZAN_dataset``.
    """

    def __init__(self, GTZAN_dataset, embeddings):
        self.GTZAN_dataset = GTZAN_dataset
        self.embeddings = embeddings
        # The position of a genre in this list is its integer class id.
        self.GTZAN_genres = [
            "blues", "classical", "country", "disco", "hiphop",
            "jazz", "metal", "pop", "reggae", "rock",
        ]

    def __len__(self):
        # The dataset length is dictated by the label source, not the embeddings.
        return len(self.GTZAN_dataset)

    def __getitem__(self, idx):
        """Return ``(embedding_tensor, class_id_tensor)`` for item ``idx``."""
        position = idx.tolist() if torch.is_tensor(idx) else idx

        embedding = torch.from_numpy(self.embeddings[position])
        genre_name = self.GTZAN_dataset[position][2]
        class_id = self.GTZAN_genres.index(genre_name)

        return embedding, torch.tensor(class_id)
# We define a shallow model

class shallowClassifier(nn.Module):
    """Two-layer perceptron applied to flattened embeddings.

    The defaults reproduce the original fixed architecture
    (19*200 inputs -> 100 hidden units -> 10 classes); the sizes are now
    parameters so the same class can be reused with other embedding shapes
    or label sets. Layer names are unchanged, so existing state dicts
    still load.

    Args:
        input_dim: number of values per example after flattening.
        hidden_dim: width of the hidden layer.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim=19*200, hidden_dim=100, num_classes=10):
        super().__init__()
        self.dense1 = nn.Linear(input_dim, hidden_dim)
        self.dense2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return unnormalised class scores of shape ``(batch, num_classes)``."""
        # Flatten every example to the width the first layer was built for.
        x = x.view(-1, self.dense1.in_features)
        x = F.relu(self.dense1(x))
        x = self.dense2(x)

        return x
    

# Build embeddings from the GTZAN audio (time-domain input)

In [2]:
# TensorflowPredictMusiCNN expects mono 16kHz sample rate inputs. Resample needed
resample = es.Resample(inputSampleRate=22050, outputSampleRate=16000, quality=0)


def _musicnn_embeddings(dataset, cache_file):
    """Return the MusiCNN embeddings of every track in ``dataset``.

    The result is loaded from ``cache_file`` when that file exists; otherwise
    it is computed, saved to ``cache_file`` and returned. Each subset is
    checked on its own, so a missing validation or test cache is rebuilt
    instead of failing at load time.

    Args:
        dataset: iterable of GTZAN items; ``item[0]`` is the waveform tensor,
            of which the first channel is used.
        cache_file: path of the ``.npy`` file used as cache.

    Returns:
        NumPy array stacking one embedding per track.
    """
    if os.path.isfile(cache_file):
        return np.load(cache_file)

    # Build the predictor once per subset rather than once per track, so the
    # TensorFlow graph is not reloaded for every file.
    musicnn = es.TensorflowPredictMusiCNN(
        graphFilename='msd-musicnn-1.pb', output='model/dense/BiasAdd')

    total = len(dataset)
    embeddings = []
    for number, track in enumerate(dataset, start=1):
        print('Processing track '+str(number)+' of '+str(total))
        embeddings.append(musicnn(resample(track[0].numpy()[0])))
    embeddings = np.array(embeddings)
    np.save(cache_file, embeddings)
    return embeddings


# Compute (or load) and store the embeddings for each subset
train_embeddings = _musicnn_embeddings(train_dataset, 'train_embeddings.npy')
val_embeddings = _musicnn_embeddings(val_dataset, 'val_embeddings.npy')
test_embeddings = _musicnn_embeddings(test_dataset, 'test_embeddings.npy')

In [3]:
# Shape of the first training track's embedding (recorded output below: (19, 200)).
train_embeddings[0].shape

(19, 200)

In [4]:
# Data type of the first training track's embedding (recorded output below: float32).
train_embeddings[0].dtype

dtype('float32')

# Build train loop

In [5]:
def train_test(train_embeddings, val_embeddings, test_embeddings, *,
               num_epochs=200, batch_size=16, learning_rate=1e-4,
               checkpoint_path='model.pth'):
    """Train a ``shallowClassifier`` on embeddings and score it on the test set.

    Labels come from the module-level ``train_dataset``, ``val_dataset`` and
    ``test_dataset``; each embeddings argument must be index-aligned with the
    corresponding dataset. After every epoch the validation loss drives a
    ``ReduceLROnPlateau`` scheduler, and the weights of the best epoch so far
    are written to ``checkpoint_path``. The best weights saved during this
    call are restored before testing.

    Args:
        train_embeddings: embeddings of the training tracks.
        val_embeddings: embeddings of the validation tracks.
        test_embeddings: embeddings of the test tracks.
        num_epochs: number of passes over the training set.
        batch_size: batch size used by all three loaders.
        learning_rate: SGD learning rate.
        checkpoint_path: file that receives the best model's state dict.

    Returns:
        Mean per-class accuracy on the test set, as a percentage. Classes
        with no test example are left out of the mean instead of turning it
        into NaN.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    embtrain_dataset = Essentia_MusiCNNMSD_GTZAN_Dataset(train_dataset, train_embeddings)
    train_loader = torch.utils.data.DataLoader(
        embtrain_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

    embval_dataset = Essentia_MusiCNNMSD_GTZAN_Dataset(val_dataset, val_embeddings)
    val_loader = torch.utils.data.DataLoader(
        embval_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    embtest_dataset = Essentia_MusiCNNMSD_GTZAN_Dataset(test_dataset, test_embeddings)
    test_loader = torch.utils.data.DataLoader(
        embtest_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    model = shallowClassifier().to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)

    train_losses = torch.zeros(num_epochs)
    val_losses = torch.zeros(num_epochs)

    # Kept as a plain float so it can be printed even if no epoch improves it.
    best_loss = float('inf')
    # Guards against restoring a stale checkpoint left by an earlier run.
    checkpoint_saved = False

    for epoch in range(num_epochs):
        # The train loop
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Clear gradient and forward + loss + backward
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()

            train_losses[epoch] += loss.item()
        train_losses[epoch] /= len(train_loader)

        # The validation loop
        model.eval()
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                val_losses[epoch] += criterion(model(inputs), labels).item()
        val_losses[epoch] /= len(val_loader)

        epoch_val_loss = val_losses[epoch].item()
        scheduler.step(epoch_val_loss)

        # If best epoch, we save parameters
        if epoch_val_loss < best_loss:
            best_loss = epoch_val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

        print(f'Epoch {epoch}: Train Loss = {train_losses[epoch].item()}. Val Loss = {epoch_val_loss}.')
    print(f'Best validation loss :{best_loss}')

    # Finally we compute accuracy with the test set, using the best weights
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()

    num_classes = len(embtrain_dataset.GTZAN_genres)
    # Rows are true classes, columns are predicted classes.
    confusion_matrix = torch.zeros(num_classes, num_classes)
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs.to(device))
            _, predicted = torch.max(outputs, 1)
            for t, p in zip(labels.view(-1).tolist(), predicted.view(-1).cpu().tolist()):
                confusion_matrix[t, p] += 1

    # Per-class accuracy, restricted to classes that occur in the test set
    support = confusion_matrix.sum(1)
    present = support > 0
    pclass_acc = confusion_matrix.diag()[present] / support[present]
    return torch.mean(pclass_acc).item()*100

# Train GTZAN_time

In [6]:
# Train and evaluate on the embeddings currently held in the three arrays;
# `baseline` receives train_test's return value (mean per-class test accuracy, in percent).
baseline = train_test(train_embeddings, val_embeddings, test_embeddings)

Epoch 0: Train Loss = 2.424870729446411. Val Loss = 2.297943592071533.
Epoch 1: Train Loss = 2.139758348464966. Val Loss = 2.0840837955474854.
Epoch 2: Train Loss = 1.914249062538147. Val Loss = 1.9156197309494019.
Epoch 3: Train Loss = 1.7302229404449463. Val Loss = 1.778722882270813.
Epoch 4: Train Loss = 1.5837901830673218. Val Loss = 1.6634392738342285.
Epoch 5: Train Loss = 1.4550983905792236. Val Loss = 1.564780592918396.
Epoch 6: Train Loss = 1.3491288423538208. Val Loss = 1.4800515174865723.
Epoch 7: Train Loss = 1.2539209127426147. Val Loss = 1.4058626890182495.
Epoch 8: Train Loss = 1.1741970777511597. Val Loss = 1.3407841920852661.
Epoch 9: Train Loss = 1.1042193174362183. Val Loss = 1.2840946912765503.
Epoch 10: Train Loss = 1.0480878353118896. Val Loss = 1.2334843873977661.
Epoch 11: Train Loss = 0.9921631813049316. Val Loss = 1.1886451244354248.
Epoch 12: Train Loss = 0.9436535239219666. Val Loss = 1.1486024856567383.
Epoch 13: Train Loss = 0.8994914293289185. Val Loss = 

Epoch 111: Train Loss = 0.24788442254066467. Val Loss = 0.6701147556304932.
Epoch 112: Train Loss = 0.24596083164215088. Val Loss = 0.6701090931892395.
Epoch 113: Train Loss = 0.24580921232700348. Val Loss = 0.670116126537323.
Epoch 114: Train Loss = 0.24779216945171356. Val Loss = 0.670165479183197.
Epoch 115: Train Loss = 0.24604608118534088. Val Loss = 0.6701746582984924.
Epoch 116: Train Loss = 0.246580109000206. Val Loss = 0.6701809763908386.
Epoch 117: Train Loss = 0.24421189725399017. Val Loss = 0.6701965928077698.
Epoch 118: Train Loss = 0.24490077793598175. Val Loss = 0.6701787114143372.
Epoch 119: Train Loss = 0.24515381455421448. Val Loss = 0.670154333114624.
Epoch 120: Train Loss = 0.24534526467323303. Val Loss = 0.670141339302063.
Epoch 121: Train Loss = 0.24695320427417755. Val Loss = 0.6701605916023254.
Epoch 122: Train Loss = 0.24461986124515533. Val Loss = 0.670172929763794.
Epoch 123: Train Loss = 0.24536485970020294. Val Loss = 0.6701720952987671.
Epoch 124: Train Lo

In [7]:
# Show the score returned by train_test for the first run.
print(baseline)

78.96032929420471


# Baseline is 78.96%

In [8]:
# Reference embedding for the first training track, obtained by feeding the
# resampled waveform straight into TensorflowPredictMusiCNN.
first_waveform = train_dataset[0][0].numpy()[0]
audio = resample(first_waveform)
reference_model = es.TensorflowPredictMusiCNN(
    graphFilename='msd-musicnn-1.pb',
    output='model/dense/BiasAdd',
)
original = reference_model(audio)

In [9]:
# Shape of the reference embedding (recorded output below: (19, 200)).
original.shape

(19, 200)

In [10]:
def melspectrogram(audio):
    """Return the dB-scaled mel spectrogram of ``audio``, one row per frame.

    Frames of 512 samples taken every 256 samples are Hann-windowed,
    converted to a spectrum, reduced to 48 mel bands and passed through a
    ``lin2db`` operator. Per the original author's note, these settings
    mirror the MelonPlaylist dataset.

    Args:
        audio: mono signal; the mel filterbank is configured for 16 kHz.

    Returns:
        ``np.ndarray`` built from the per-frame mel-band vectors.
    """
    frame_size, hop_size, zero_padding = 512, 256, 0
    sample_rate = 16000

    window = es.Windowing(type='hann', normalized=False, zeroPadding=zero_padding)
    to_spectrum = es.Spectrum()
    to_mel = es.MelBands(
        numberBands=48,
        sampleRate=sample_rate,
        lowFrequencyBound=0,
        highFrequencyBound=sample_rate/2,
        inputSize=(frame_size+zero_padding)//2+1,
        weighting='linear',
        normalize='unit_tri',
        warpingFormula='slaneyMel',
        type='power',
    )
    to_db = es.UnaryOperator(type='lin2db', scale=2)

    frames = es.FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                               startFromZero=False)
    return np.array([to_db(to_mel(to_spectrum(window(frame)))) for frame in frames])

In [11]:
def adapt_melonInput_TensorflowPredict(melon_sample):
    """Convert a Melon-style mel spectrogram into MusiCNN input patches.

    Each frame is mapped back from dB with a ``db2lin`` operator, compressed
    as ``log10(1 + 10000 * x)`` and linearly interpolated to twice the number
    of bands. The frames are then cut into patches of 187 frames taken every
    93 frames (values the original author attributes to the default
    TensorflowPredictMusiCNN settings). Only complete patches are kept.

    Args:
        melon_sample: 2-D array ``(frames, bands)`` of dB mel bands, as
            produced by ``melspectrogram``.

    Returns:
        float32 array of shape ``(patches, 187, 1, 2 * bands)``. ``patches``
        is 0 when fewer than 187 frames are available.
    """
    patch_size, patch_hop = 187, 93
    n_frames = len(melon_sample)
    n_bands = melon_sample.shape[1]

    db2amp = es.UnaryOperator(type='db2lin', scale=2)

    # Interpolation grid: half-band steps over the original band indices.
    source_bins = np.arange(n_bands)
    target_bins = np.arange(2 * n_bands) / 2

    oversampled = np.zeros((n_frames, 2 * n_bands), dtype=np.float32)
    for k in range(n_frames):
        sample = np.log10(1 + (db2amp(melon_sample[k])*10000))
        oversampled[k, :] = np.interp(target_bins, source_bins, sample)

    # Count only patches that fit entirely inside the signal. The previous
    # int(n_frames / 93) - 1 went negative for very short inputs and was one
    # too large whenever n_frames was an exact multiple of 93, which made the
    # last (186-row) slice fail to fit a 187-row patch.
    num_patches = max(0, (n_frames - patch_size) // patch_hop + 1)

    new = np.zeros((num_patches, patch_size, 2 * n_bands), dtype=np.float32)
    for k in range(num_patches):
        start = k * patch_hop
        new[k] = oversampled[start:start + patch_size]
    return np.expand_dims(new, 2)

In [12]:
# Frozen-graph file and the tensor names used to drive it through the generic
# TensorflowPredict wrapper; the input tensor is fed from, and the output
# tensor read back into, an essentia Pool keyed by these names.
modelName='msd-musicnn-1.pb'
output_layer='model/dense/BiasAdd'
input_layer='model/Placeholder'
predict = es.TensorflowPredict(graphFilename=modelName,
                               inputs=[input_layer],
                               outputs=[output_layer])


In [13]:
# Single-track check of the melspectrogram pipeline: build the input patches
# for `audio`, run them through `predict`, and keep the embedding axes.
melon_patches = adapt_melonInput_TensorflowPredict(melspectrogram(audio))

in_pool = Pool()
in_pool.set('model/Placeholder', melon_patches)

output = predict(in_pool)
prediction = output['model/dense/BiasAdd'][:, 0, 0, :]

In [14]:
def _melon_embeddings(dataset, cache_file):
    """Return embeddings of ``dataset`` computed via the melspectrogram path.

    Each track is resampled, turned into a Melon-style mel spectrogram,
    adapted into input patches and passed through ``predict``. The result is
    loaded from ``cache_file`` when that file exists; otherwise it is
    computed, saved to ``cache_file`` and returned. Each subset is checked on
    its own, so a missing validation or test cache is rebuilt instead of
    failing at load time.

    Args:
        dataset: iterable of GTZAN items; ``item[0]`` is the waveform tensor,
            of which the first channel is used.
        cache_file: path of the ``.npy`` file used as cache.

    Returns:
        NumPy array stacking one embedding per track.
    """
    if os.path.isfile(cache_file):
        return np.load(cache_file)

    # Progress is counted per subset, against that subset's own size.
    total = len(dataset)
    embeddings = []
    for number, track in enumerate(dataset, start=1):
        print('Processing track '+str(number)+' of '+str(total))
        in_pool = Pool()
        in_pool.set('model/Placeholder', adapt_melonInput_TensorflowPredict(
            melspectrogram(resample(track[0].numpy()[0]))))
        output = predict(in_pool)
        embeddings.append(output['model/dense/BiasAdd'][:,0,0,:])
    embeddings = np.array(embeddings)
    np.save(cache_file, embeddings)
    return embeddings


# Compute (or load) and store the melspectrogram-based embeddings for each
# subset. These replace the waveform-based arrays held in the same variables.
train_embeddings = _melon_embeddings(train_dataset, 'train_embeddings_melon.npy')
val_embeddings = _melon_embeddings(val_dataset, 'val_embeddings_melon.npy')
test_embeddings = _melon_embeddings(test_dataset, 'test_embeddings_melon.npy')

Processing track 1 of 443


In [None]:
# Shape of the first training embedding; at this point train_embeddings holds
# the melspectrogram-based embeddings assigned in the previous cell.
train_embeddings[0].shape

In [None]:
# Data type of the first training embedding (melspectrogram-based at this point).
train_embeddings[0].dtype

# Re-train, now with interpolated melspecs

In [None]:
# Repeat training and evaluation with the melspectrogram-based embeddings.
# NOTE(review): this overwrites `baseline` from the first run (and train_test
# rewrites 'model.pth'), so the two scores cannot be compared afterwards —
# consider storing this result under a separate name.
baseline = train_test(train_embeddings, val_embeddings, test_embeddings)