In [1]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
sys.path.insert(0,'/content/drive/My Drive/MSc_Project_Colab/BAD_PyTorch/')

In [3]:
!pip install torchaudio

Collecting torchaudio
  Downloading torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[?25l[K     |▏                               | 10 kB 38.6 MB/s eta 0:00:01[K     |▍                               | 20 kB 19.1 MB/s eta 0:00:01[K     |▌                               | 30 kB 15.9 MB/s eta 0:00:01[K     |▊                               | 40 kB 14.6 MB/s eta 0:00:01[K     |▉                               | 51 kB 7.9 MB/s eta 0:00:01[K     |█                               | 61 kB 7.7 MB/s eta 0:00:01[K     |█▏                              | 71 kB 8.1 MB/s eta 0:00:01[K     |█▍                              | 81 kB 9.1 MB/s eta 0:00:01[K     |█▌                              | 92 kB 9.5 MB/s eta 0:00:01[K     |█▊                              | 102 kB 7.6 MB/s eta 0:00:01[K     |██                              | 112 kB 7.6 MB/s eta 0:00:01[K     |██                              | 122 kB 7.6 MB/s eta 0:00:01[K     |██▎                             | 133 kB 7

In [4]:
from torch import nn
from torchsummary import summary


class CNNNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=3)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=3)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3), padding=0)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3), padding=0)
        )
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(p=0.5)
        self.linearA = nn.Linear(448, 256)
        self.batchnormA = nn.BatchNorm1d(256)
        self.leakyrelu = nn.LeakyReLU(0.001)
        self.linearB = nn.Linear(256, 32)
        self.batchnormB = nn.BatchNorm1d(32)

        self.linear = nn.Linear(32, 1)
        # self.sigmoid = nn.Sigmoid()

    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        x = self.dropout(x)
        x = self.linearA(x)
        x = self.batchnormA(x)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        x = self.linearB(x)
        x = self.batchnormB(x)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        logits = self.linear(x)

        # predictions = self.sigmoid(logits)
        return logits


if __name__ == "__main__":
    cnn = CNNNetwork()
    summary(cnn.cuda(), (1, 80, 698))



----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 16, 78, 696]             160
       BatchNorm2d-2          [-1, 16, 78, 696]              32
         LeakyReLU-3          [-1, 16, 78, 696]               0
         MaxPool2d-4          [-1, 16, 26, 232]               0
            Conv2d-5          [-1, 16, 24, 230]           2,320
       BatchNorm2d-6          [-1, 16, 24, 230]              32
         LeakyReLU-7          [-1, 16, 24, 230]               0
         MaxPool2d-8            [-1, 16, 8, 76]               0
            Conv2d-9            [-1, 16, 6, 74]           2,320
      BatchNorm2d-10            [-1, 16, 6, 74]              32
        LeakyReLU-11            [-1, 16, 6, 74]               0
        MaxPool2d-12            [-1, 16, 6, 24]               0
           Conv2d-13            [-1, 16, 4, 22]           2,320
      BatchNorm2d-14            [-1, 16

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [12]:
import torch
import torchaudio
from torch import nn

from dcasedatasetcpu import DCASE_Dataset
# from cnnbinary_uky import CNNNetwork
# from train_binary import ANNOTATIONS_FILE, AUDIO_DIR, SAMPLE_RATE, DURATION, NUM_SAMPLES

ANNOTATIONS_FILE = '/content/drive/My Drive/DCASE_Datasets/labels/warblrb10k.csv'
AUDIO_DIR = '/content/drive/My Drive/DCASE_Datasets/audio/'
SAMPLE_RATE = 22050
DURATION = 10
NUM_SAMPLES = 22050 * DURATION


class_mapping = [
    "no-bird",
    "bird"
]


def predict(model, input, target, class_mapping):
    model.eval()
    with torch.no_grad():
        predictions = model(input).cuda()
        sigmoid = nn.Sigmoid()
        predictions = sigmoid(predictions)
        print(predictions)
        # Tensor (1, 10) -> [ [0.1, 0.01, ..., 0.6] ]
        if predictions[0] > 0.5:

          # predicted_index = predictions[0].argmax(0)
          predicted_index = 1
        elif predictions[0] < 0.5:
          predicted_index = 0
        print(predicted_index)
        predicted = class_mapping[predicted_index]
        expected = class_mapping[target]
    return predicted, expected


if __name__ == "__main__":
    # load back the model
    cnn = CNNNetwork()
    state_dict = torch.load("/content/drive/My Drive/MSc_Project_Colab/BAD_PyTorch/cnn.pth", map_location=torch.device('cpu'))
    cnn.load_state_dict(state_dict)

    # load DCASE dataset
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=316,
        n_mels=80,
        power=1,
        f_min=50,
        f_max=12000
    )

    dcase = DCASE_Dataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            "cpu")


    # get a sample from the dcase dataset for inference
    index = 0
    correct = 0
    while index < 1000:
      input, target = dcase[index][0], dcase[index][1]
      input.unsqueeze_(0)
      index += 1

    # make an inference
      predicted, expected = predict(cnn, input, target,
                                  class_mapping)
      if predicted == expected:
        correct += 1
      print(f"Predicted: '{predicted}', expected: '{expected}'")
      print()

    accuracy = correct / 1000
    print(accuracy)

  "At least one mel filterbank has all zero values. "


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/d50427cc-cfad-4116-b7eb.wav
/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/d50427cc-cfad-4116-b7eb.wav
tensor([[0.9130]], device='cuda:0')
1
Predicted: 'bird', expected: 'bird'

/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/dd78994e-0d63-4f39-9511.wav
/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/dd78994e-0d63-4f39-9511.wav
tensor([[0.9044]], device='cuda:0')
1
Predicted: 'bird', expected: 'bird'

/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/2710f25b-5de0-460a-b187.wav
/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/2710f25b-5de0-460a-b187.wav
tensor([[0.9933]], device='cuda:0')
1
Predicted: 'bird', expected: 'bird'

/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/b555a6fc-8559-4bd9-875e.wav
/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/b555a6fc-8559-4bd9-875e.wav
tensor([[0.2757]], device='cuda:0')
0