In [1]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import sys
sys.path.insert(0,'/content/drive/My Drive/MSc_Project_Colab/BAD_PyTorch/')

In [3]:
!pip install torchaudio

Collecting torchaudio
  Downloading torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 5.2 MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.9.0


**CNN**

In [4]:
from torch import nn
from torchsummary import summary


class CNNNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=3)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=3)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3), padding=0)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3), padding=0)
        )
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(p=0.5)
        self.linearA = nn.Linear(448, 256)
        self.batchnormA = nn.BatchNorm1d(256)
        self.leakyrelu = nn.LeakyReLU(0.001)
        self.linearB = nn.Linear(256, 32)
        self.batchnormB = nn.BatchNorm1d(32)

        self.linear = nn.Linear(32, 1)
        # self.sigmoid = nn.Sigmoid()

    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        x = self.dropout(x)
        x = self.linearA(x)
        x = self.batchnormA(x)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        x = self.linearB(x)
        x = self.batchnormB(x)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        logits = self.linear(x)

        # predictions = self.sigmoid(logits)
        return logits


if __name__ == "__main__":
    cnn = CNNNetwork()
    summary(cnn.cuda(), (1, 80, 698))



----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 16, 78, 696]             160
       BatchNorm2d-2          [-1, 16, 78, 696]              32
         LeakyReLU-3          [-1, 16, 78, 696]               0
         MaxPool2d-4          [-1, 16, 26, 232]               0
            Conv2d-5          [-1, 16, 24, 230]           2,320
       BatchNorm2d-6          [-1, 16, 24, 230]              32
         LeakyReLU-7          [-1, 16, 24, 230]               0
         MaxPool2d-8            [-1, 16, 8, 76]               0
            Conv2d-9            [-1, 16, 6, 74]           2,320
      BatchNorm2d-10            [-1, 16, 6, 74]              32
        LeakyReLU-11            [-1, 16, 6, 74]               0
        MaxPool2d-12            [-1, 16, 6, 24]               0
           Conv2d-13            [-1, 16, 4, 22]           2,320
      BatchNorm2d-14            [-1, 16

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Train the model

In [6]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchaudio

from dcasedataset import DCASE_Dataset

ANNOTATIONS_FILE = '/content/drive/My Drive/DCASE_Datasets/labels/BirdVox-DCASE20k.csv'
AUDIO_DIR = '/content/drive/My Drive/DCASE_Datasets/audio/'
SAMPLE_RATE = 22050
DURATION = 10
NUM_SAMPLES = 22050 * DURATION


BATCH_SIZE = 16
EPOCHS = 30
LEARNING_RATE = 0.001

def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    return train_dataloader


def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    for input, target in data_loader:
        input, target = input.to(device), target.to(device)
        print(target.shape)

        # calculate loss
        prediction = model(input)
        # print(prediction.shape)
        sigmoid = nn.Sigmoid()
        # prediction.unsqueeze(1)
        target = target.unsqueeze_(1)
        target = target.type(torch.cuda.FloatTensor)
        loss = loss_fn(sigmoid(prediction), target)
        print(loss)

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f"loss: {loss.item()}")


def train(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")


if __name__ == "__main__":

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using {device}")

    # instantiate dataset object and create data loader
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=316,
        n_mels=80,
        power=1,
        f_min=50,
        f_max=12000
    )

    dcase = DCASE_Dataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            device)

    train_dataloader = create_data_loader(dcase, BATCH_SIZE)

    cnn = CNNNetwork().to(device)
    print(cnn)

    # initialise loss funtion + optimiser
    loss_fn = nn.BCELoss()
    optimiser = torch.optim.Adam(cnn.parameters(), 
                                 lr=LEARNING_RATE)

    # train model
    train(cnn, train_dataloader, loss_fn, optimiser, device, EPOCHS)

    # save model
    torch.save(cnn.state_dict(), "/content/drive/My Drive/MSc_Project_Colab/BAD_PyTorch/cnn.pth")
    print("Trained cnn saved at cnn.pth")

Using cuda


  "At least one mel filterbank has all zero values. "


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/bd41a1c0-28d0-4fe9-a7d5-6cf60d3831f5.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/76acbab0-27c3-4f34-a699-a5ad46a50af9.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/7a6bdf11-8099-4822-add0-05e98b1b9dae.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/f3961906-1294-44b3-a404-bf905c7eb06e.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/990cf98b-27aa-4644-ab12-7dfe78d4a5da.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/7132ea91-f345-4c1c-a80d-9363eb6b4b3a.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/a71c6be8-18b0-4fb4-a391-1f7f943d36ca.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/c2070566-99b7-4ec9-aa8a-84a81218d9ff.wav
torch.Size([16])
tensor(0.3879, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
/content/drive/M