In [1]:
import os
import random
import torch
import numpy as np
import soundfile as sf
from torch.utils import data
from torchaudio_augmentations import (
    RandomResizedCrop,
    RandomApply,
    PolarityInversion,
    Noise,
    Gain,
    HighLowPass,
    Delay,
    PitchShift,
    Reverb,
    Compose,
)


GTZAN_GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']


class GTZANDataset(data.Dataset):
    def __init__(self, data_path, split, num_samples, num_chunks, is_augmentation):
        self.data_path =  data_path if data_path else ''
        self.split = split
        self.num_samples = num_samples
        self.num_chunks = num_chunks
        self.is_augmentation = is_augmentation
        self.genres = GTZAN_GENRES
        self._get_song_list()
        if is_augmentation:
            self._get_augmentations()

    def _get_song_list(self):
        list_filename = os.path.join(self.data_path, '%s_filtered.txt' % self.split)
        with open(list_filename) as f:
            lines = f.readlines()
        self.song_list = [line.strip() for line in lines]

    def _get_augmentations(self):
        transforms = [
            RandomResizedCrop(n_samples=self.num_samples),
            RandomApply([PolarityInversion()], p=0.8),
            RandomApply([Noise(min_snr=0.3, max_snr=0.5)], p=0.3),
            RandomApply([Gain()], p=0.2),
            RandomApply([HighLowPass(sample_rate=22050)], p=0.8),
            RandomApply([Delay(sample_rate=22050)], p=0.5),
            RandomApply([PitchShift(n_samples=self.num_samples, sample_rate=22050)], p=0.4),
            RandomApply([Reverb(sample_rate=22050)], p=0.3),
        ]
        self.augmentation = Compose(transforms=transforms)

    def _adjust_audio_length(self, wav):
        if self.split == 'train':
            random_index = random.randint(0, len(wav) - self.num_samples - 1)
            wav = wav[random_index : random_index + self.num_samples]
        else:
            hop = (len(wav) - self.num_samples) // self.num_chunks
            wav = np.array([wav[i * hop : i * hop + self.num_samples] for i in range(self.num_chunks)])
        return wav

    def __getitem__(self, index):
        line = self.song_list[index]

        # get genre
        genre_name = line.split('/')[0]
        genre_index = self.genres.index(genre_name)

        # get audio
        audio_filename = os.path.join(self.data_path, 'genres', line)
        wav, fs = sf.read(audio_filename)

        # adjust audio length
        wav = self._adjust_audio_length(wav).astype('float32')

        # data augmentation
        if self.is_augmentation:
            wav = self.augmentation(torch.from_numpy(wav).unsqueeze(0)).squeeze(0).numpy()

        return wav, genre_index

    def __len__(self):
        return len(self.song_list)

def get_dataloader(data_path=None, 
                   split='train', 
                   num_samples=22050 * 29, 
                   num_chunks=1, 
                   batch_size=16, 
                   num_workers=0, 
                   is_augmentation=False):
    is_shuffle = True if (split == 'train') else False
    batch_size = batch_size if (split == 'train') else (batch_size // num_chunks)
    data_loader = data.DataLoader(dataset=GTZANDataset(data_path, 
                                                       split, 
                                                       num_samples, 
                                                       num_chunks, 
                                                       is_augmentation),
                                  batch_size=batch_size,
                                  shuffle=is_shuffle,
                                  drop_last=False,
                                  num_workers=num_workers)
    return data_loader

In [2]:
train_loader = get_dataloader(split='train', is_augmentation=True)
iter_train_loader = iter(train_loader)
train_wav, train_genre = next(iter_train_loader)

valid_loader = get_dataloader(split='valid')
test_loader = get_dataloader(split='test')
iter_test_loader = iter(test_loader)
test_wav, test_genre = next(iter_test_loader)
print('training data shape: %s' % str(train_wav.shape))
print('validation/test data shape: %s' % str(test_wav.shape))
print(train_genre)

training data shape: torch.Size([64, 639450])
validation/test data shape: torch.Size([64, 1, 639450])
tensor([4, 4, 2, 4, 0, 8, 4, 9, 9, 4, 6, 1, 5, 0, 1, 3, 2, 7, 5, 2, 7, 3, 8, 6,
        8, 9, 3, 4, 1, 6, 1, 1, 5, 0, 2, 9, 4, 0, 4, 9, 6, 1, 2, 6, 6, 9, 0, 2,
        8, 0, 6, 1, 2, 8, 5, 6, 9, 4, 3, 6, 4, 3, 4, 0])


In [3]:
from torch import nn


class Conv_2d(nn.Module):
    def __init__(self, input_channels, output_channels, shape=3, pooling=2, dropout=0.1):
        super(Conv_2d, self).__init__()
        self.conv = nn.Conv2d(input_channels, output_channels, shape, padding=shape//2)
        self.bn = nn.BatchNorm2d(output_channels)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(pooling)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, wav):
        out = self.conv(wav)
        out = self.bn(out)
        out = self.relu(out)
        out = self.maxpool(out)
        out = self.dropout(out)
        return out

In [4]:
import torchaudio


class CNN(nn.Module):
    def __init__(self, num_channels=16, 
                       sample_rate=22050, 
                       n_fft=1024, 
                       f_min=0.0, 
                       f_max=11025.0, 
                       num_mels=128, 
                       num_classes=10):
        super(CNN, self).__init__()

        # mel spectrogram
        self.melspec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, 
                                                            n_fft=n_fft, 
                                                            f_min=f_min, 
                                                            f_max=f_max, 
                                                            n_mels=num_mels)
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
        self.input_bn = nn.BatchNorm2d(1)

        # convolutional layers
        self.layer1 = Conv_2d(1, num_channels, pooling=(2, 3))
        self.layer2 = Conv_2d(num_channels, num_channels, pooling=(3, 4))
        self.layer3 = Conv_2d(num_channels, num_channels * 2, pooling=(2, 5))
        self.layer4 = Conv_2d(num_channels * 2, num_channels * 2, pooling=(3, 3))
        self.layer5 = Conv_2d(num_channels * 2, num_channels * 4, pooling=(3, 4))

        # dense layers
        self.dense1 = nn.Linear(num_channels * 4, num_channels * 4)
        self.dense_bn = nn.BatchNorm1d(num_channels * 4)
        self.dense2 = nn.Linear(num_channels * 4, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, wav):
        # input Preprocessing
        out = self.melspec(wav)
        out = self.amplitude_to_db(out)

        # input batch normalization
        out = out.unsqueeze(1)
        out = self.input_bn(out)

        # convolutional layers
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        
        # reshape. (batch_size, num_channels, 1, 1) -> (batch_size, num_channels)
        out = out.reshape(len(out), -1)

        # dense layers
        out = self.dense1(out)
        out = self.dense_bn(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.dense2(out)

        return out


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
import random

#if torch.backends.mps.is_available():
#    device = torch.device('mps')
#else:
#    print ("MPS device not found.")
device = torch.device('cpu')
LearningRate=[1.0,0.5,0.1,0.05,0.01,0.05,0.001,0.0005,0.0001,0.00005]
Accuracies_lr=torch.FloatTensor(10, 30) 
Loss_lr=torch.FloatTensor(10, 30)

for i in range(6):
    print(LearningRate[i])
    cnn = CNN().to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(cnn.parameters(), lr=LearningRate[i])
    valid_losses = []
    num_epochs = 30

    for epoch in range(num_epochs):
        losses = []

        # Train
        cnn.train()
        for (wav, genre_index) in train_loader:
            wav = wav.to(device)
            genre_index = genre_index.to(device)

            # Forward
            out = cnn(wav)
            loss = loss_function(out, genre_index)

            # Backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        print('Epoch: [%d/%d], Train loss: %.4f' % (epoch+1, num_epochs, np.mean(losses)))

        # Validation
        cnn.eval()
        y_true = []
        y_pred = []
        losses = []
        for wav, genre_index in valid_loader:
            wav = wav.to(device)
            genre_index = genre_index.to(device)

            # reshape and aggregate chunk-level predictions
            b, c, t = wav.size()
            logits = cnn(wav.view(-1, t))
            logits = logits.view(b, c, -1).mean(dim=1)
            loss = loss_function(logits, genre_index)
            losses.append(loss.item())
            _, pred = torch.max(logits.data, 1)

            # append labels and predictions
            y_true.extend(genre_index.tolist())
            y_pred.extend(pred.tolist())
        accuracy = accuracy_score(y_true, y_pred)
        Accuracies_lr[i][epoch]=accuracy
        valid_loss = np.mean(losses)
        Loss_lr[i][epoch]=valid_loss
        print('Epoch: [%d/%d], Valid loss: %.4f, Valid accuracy: %.4f' % (epoch+1, num_epochs, valid_loss, accuracy))
        print('Epoch: [%d/%d], Loss_lr[%d,%d]: %.4f Accuracies_lr[%d,%d]: %.4f' % (epoch+1, num_epochs, i, epoch, Loss_lr[i,epoch].item(), i, epoch, Accuracies_lr[i,epoch].item()))
        # Save model
        valid_losses.append(valid_loss.item())
        if np.argmin(valid_losses) == epoch:
            print('Saving the best model at %d epochs!' % epoch)
            torch.save(cnn.state_dict(), 'best_model.ckpt')

1.0
Epoch: [1/30], Train loss: 11.5715
Epoch: [1/30], Valid loss: 21909.4243, Valid accuracy: 0.1117
Epoch: [1/30], Loss_lr[0,0]: 21909.4238 Accuracies_lr[0,0]: 0.1117
Saving the best model at 0 epochs!
Epoch: [2/30], Train loss: 3.1947
Epoch: [2/30], Valid loss: 5576.3226, Valid accuracy: 0.1218
Epoch: [2/30], Loss_lr[0,1]: 5576.3228 Accuracies_lr[0,1]: 0.1218
Saving the best model at 1 epochs!
Epoch: [3/30], Train loss: 2.5292
Epoch: [3/30], Valid loss: 2838.5176, Valid accuracy: 0.0863
Epoch: [3/30], Loss_lr[0,2]: 2838.5176 Accuracies_lr[0,2]: 0.0863
Saving the best model at 2 epochs!
Epoch: [4/30], Train loss: 2.4290
Epoch: [4/30], Valid loss: 1017.0182, Valid accuracy: 0.0863
Epoch: [4/30], Loss_lr[0,3]: 1017.0182 Accuracies_lr[0,3]: 0.0863
Saving the best model at 3 epochs!
Epoch: [5/30], Train loss: 2.3854
Epoch: [5/30], Valid loss: 259.5604, Valid accuracy: 0.0457
Epoch: [5/30], Loss_lr[0,4]: 259.5604 Accuracies_lr[0,4]: 0.0457
Saving the best model at 4 epochs!
Epoch: [6/30], 

Epoch: [18/30], Train loss: 2.3240
Epoch: [18/30], Valid loss: 2.3008, Valid accuracy: 0.0914
Epoch: [18/30], Loss_lr[1,17]: 2.3008 Accuracies_lr[1,17]: 0.0914
Epoch: [19/30], Train loss: 2.3240
Epoch: [19/30], Valid loss: 2.3793, Valid accuracy: 0.0863
Epoch: [19/30], Loss_lr[1,18]: 2.3793 Accuracies_lr[1,18]: 0.0863
Epoch: [20/30], Train loss: 2.3254
Epoch: [20/30], Valid loss: 2.2694, Valid accuracy: 0.1218
Epoch: [20/30], Loss_lr[1,19]: 2.2694 Accuracies_lr[1,19]: 0.1218
Epoch: [21/30], Train loss: 2.3278
Epoch: [21/30], Valid loss: 2.3220, Valid accuracy: 0.1015
Epoch: [21/30], Loss_lr[1,20]: 2.3220 Accuracies_lr[1,20]: 0.1015
Epoch: [22/30], Train loss: 2.3376
Epoch: [22/30], Valid loss: 2.2872, Valid accuracy: 0.0863
Epoch: [22/30], Loss_lr[1,21]: 2.2872 Accuracies_lr[1,21]: 0.0863
Epoch: [23/30], Train loss: 2.3394
Epoch: [23/30], Valid loss: 2.3502, Valid accuracy: 0.0914
Epoch: [23/30], Loss_lr[1,22]: 2.3502 Accuracies_lr[1,22]: 0.0914
Epoch: [24/30], Train loss: 2.3331
Epoch

Epoch: [6/30], Train loss: 2.1964
Epoch: [6/30], Valid loss: 2.2113, Valid accuracy: 0.1574
Epoch: [6/30], Loss_lr[3,5]: 2.2113 Accuracies_lr[3,5]: 0.1574
Saving the best model at 5 epochs!
Epoch: [7/30], Train loss: 2.2073
Epoch: [7/30], Valid loss: 2.5622, Valid accuracy: 0.0964
Epoch: [7/30], Loss_lr[3,6]: 2.5622 Accuracies_lr[3,6]: 0.0964
Epoch: [8/30], Train loss: 2.2093
Epoch: [8/30], Valid loss: 2.3843, Valid accuracy: 0.1624
Epoch: [8/30], Loss_lr[3,7]: 2.3843 Accuracies_lr[3,7]: 0.1624
Epoch: [9/30], Train loss: 2.2157
Epoch: [9/30], Valid loss: 2.2992, Valid accuracy: 0.1523
Epoch: [9/30], Loss_lr[3,8]: 2.2992 Accuracies_lr[3,8]: 0.1523
Epoch: [10/30], Train loss: 2.1623
Epoch: [10/30], Valid loss: 2.2584, Valid accuracy: 0.1726
Epoch: [10/30], Loss_lr[3,9]: 2.2584 Accuracies_lr[3,9]: 0.1726
Epoch: [11/30], Train loss: 2.1947
Epoch: [11/30], Valid loss: 2.0696, Valid accuracy: 0.1980
Epoch: [11/30], Loss_lr[3,10]: 2.0696 Accuracies_lr[3,10]: 0.1980
Saving the best model at 10

Epoch: [25/30], Train loss: 1.6596
Epoch: [25/30], Valid loss: 4.6432, Valid accuracy: 0.1421
Epoch: [25/30], Loss_lr[4,24]: 4.6432 Accuracies_lr[4,24]: 0.1421
Epoch: [26/30], Train loss: 1.5499
Epoch: [26/30], Valid loss: 1.4876, Valid accuracy: 0.5381
Epoch: [26/30], Loss_lr[4,25]: 1.4876 Accuracies_lr[4,25]: 0.5381
Epoch: [27/30], Train loss: 1.5734
Epoch: [27/30], Valid loss: 1.9770, Valid accuracy: 0.3198
Epoch: [27/30], Loss_lr[4,26]: 1.9770 Accuracies_lr[4,26]: 0.3198
Epoch: [28/30], Train loss: 1.5617
Epoch: [28/30], Valid loss: 3.2201, Valid accuracy: 0.2234
Epoch: [28/30], Loss_lr[4,27]: 3.2201 Accuracies_lr[4,27]: 0.2234
Epoch: [29/30], Train loss: 1.5307
Epoch: [29/30], Valid loss: 1.8563, Valid accuracy: 0.4010
Epoch: [29/30], Loss_lr[4,28]: 1.8563 Accuracies_lr[4,28]: 0.4010
Epoch: [30/30], Train loss: 1.5572
Epoch: [30/30], Valid loss: 3.6353, Valid accuracy: 0.1980
Epoch: [30/30], Loss_lr[4,29]: 3.6353 Accuracies_lr[4,29]: 0.1980
0.05
Epoch: [1/30], Train loss: 2.5118
E

In [None]:
# Load the best model
S = torch.load('best_model.ckpt')
cnn.load_state_dict(S)
print('loaded!')

# Run evaluation
cnn.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for wav, genre_index in test_loader:
        wav = wav.to(device)
        genre_index = genre_index.to(device)

        # reshape and aggregate chunk-level predictions
        b, c, t = wav.size()
        logits = cnn(wav.view(-1, t))
        logits = logits.view(b, c, -1).mean(dim=1)
        _, pred = torch.max(logits.data, 1)

        # append labels and predictions
        y_true.extend(genre_index.tolist())
        y_pred.extend(pred.tolist())

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

accuracylist=[]

accuracy = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, xticklabels=GTZAN_GENRES, yticklabels=GTZAN_GENRES, cmap='YlGnBu')
print('Accuracy: %.4f' % accuracy)
accuracylist.append(accuracy)

In [None]:
from random import shuffle
shuffle(y_true)
accuracy = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, xticklabels=GTZAN_GENRES, yticklabels=GTZAN_GENRES, cmap='YlGnBu')
print('Randomness: %.4f' % accuracy)

In [18]:
Accuracies[0][:]

[0.09644670050761421,
 0.15228426395939088,
 0.10152284263959391,
 0.29441624365482233,
 0.10152284263959391,
 0.10152284263959391,
 0.10152284263959391,
 0.19796954314720813,
 0.18781725888324874,
 0.20812182741116753,
 0.13705583756345177,
 0.16243654822335024,
 0.233502538071066,
 0.15228426395939088,
 0.07614213197969544,
 0.19796954314720813,
 0.18274111675126903,
 0.26903553299492383,
 0.27918781725888325,
 0.27918781725888325,
 0.16751269035532995,
 0.20304568527918782,
 0.2131979695431472,
 0.17766497461928935,
 0.23857868020304568,
 0.10152284263959391,
 0.24873096446700507,
 0.17766497461928935,
 0.1319796954314721,
 0.233502538071066]

In [21]:
Accuracies[2][:][1]

0.15228426395939088