In [1]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
sys.path.insert(0,'/content/drive/My Drive/MSc_Project_Colab/BAD_PyTorch/')

In [3]:
!pip install torchaudio

Collecting torchaudio
  Downloading torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 4.2 MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.9.0


In [4]:
from torch import nn
from torchsummary import summary


class CNNNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # 4 conv blocks / flatten / linear / softmax
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=3)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=3)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=(3,1), stride=(3,1), padding=0)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=(3,1), stride=(3,1), padding=0)
        )
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(p=0.5)
        self.linearA = nn.Linear(752, 256)
        self.batchnormA = nn.BatchNorm1d(256)
        self.leakyrelu = nn.LeakyReLU(0.001)
        self.linearB = nn.Linear(256, 32)
        self.batchnormB = nn.BatchNorm1d(32)

        self.linear = nn.Linear(32, 1)
        # self.sigmoid = nn.Sigmoid()

    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        x = self.dropout(x)
        x = self.linearA(x)
        x = self.batchnormA(x)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        x = self.linearB(x)
        x = self.batchnormB(x)
        x = self.leakyrelu(x)
        x = self.dropout(x)


        logits = self.linear(x)
        # predictions = self.sigmoid(logits)
        return logits


if __name__ == "__main__":
    cnn = CNNNetwork()
    summary(cnn.cuda(), (1, 64, 431))



----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 16, 62, 429]             160
       BatchNorm2d-2          [-1, 16, 62, 429]              32
         LeakyReLU-3          [-1, 16, 62, 429]               0
         MaxPool2d-4          [-1, 16, 20, 143]               0
            Conv2d-5          [-1, 16, 18, 141]           2,320
       BatchNorm2d-6          [-1, 16, 18, 141]              32
         LeakyReLU-7          [-1, 16, 18, 141]               0
         MaxPool2d-8            [-1, 16, 6, 47]               0
            Conv2d-9            [-1, 16, 4, 45]           2,320
      BatchNorm2d-10            [-1, 16, 4, 45]              32
        LeakyReLU-11            [-1, 16, 4, 45]               0
        MaxPool2d-12            [-1, 16, 1, 45]               0
           Conv2d-13            [-1, 16, 3, 47]           2,320
      BatchNorm2d-14            [-1, 16

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [8]:
import torch
import torchaudio
from torch import nn

from dcasedatasetcpu import DCASE_Dataset
# from cnnbinary_uky import CNNNetwork
# from train_binary import ANNOTATIONS_FILE, AUDIO_DIR, SAMPLE_RATE, DURATION, NUM_SAMPLES

ANNOTATIONS_FILE = '/content/drive/My Drive/DCASE_Datasets/labels/BirdVox-DCASE20k.csv'
AUDIO_DIR = '/content/drive/My Drive/DCASE_Datasets/audio/'
SAMPLE_RATE = 22050
DURATION = 10
NUM_SAMPLES = 22050 * DURATION


class_mapping = [
    "no-bird",
    "bird"
]


def predict(model, input, target, class_mapping):
    model.eval()
    with torch.no_grad():
        predictions = model(input).cuda()
        sigmoid = nn.Sigmoid()
        predictions = sigmoid(predictions)
        print(predictions)
        # Tensor (1, 10) -> [ [0.1, 0.01, ..., 0.6] ]
        if predictions[0] > 0.5:

          # predicted_index = predictions[0].argmax(0)
          predicted_index = 1
        elif predictions[0] < 0.5:
          predicted_index = 0
        print(predicted_index)
        predicted = class_mapping[predicted_index]
        expected = class_mapping[target]
    return predicted, expected


if __name__ == "__main__":
    # load back the model
    cnn = CNNNetwork()
    state_dict = torch.load("/content/drive/My Drive/MSc_Project_Colab/BAD_PyTorch/cnn.pth", map_location=torch.device('cpu'))
    cnn.load_state_dict(state_dict)

    # load DCASE dataset
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

    dcase = DCASE_Dataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            "cpu")

    # get a sample from the dcase dataset for inference
    index = 0
    correct = 0
    while index < 100:
      input, target = dcase[index][0], dcase[index][1]
      input.unsqueeze_(0)
      index += 1

    # make an inference
      predicted, expected = predict(cnn, input, target,
                                  class_mapping)
      if predicted == expected:
        correct += 1
      print(f"Predicted: '{predicted}', expected: '{expected}'")
      print()

    accuracy = correct / 100
    print(accuracy)

/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/00053d90-e4b9-4045-a2f1-f39efc90cfa9.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/00053d90-e4b9-4045-a2f1-f39efc90cfa9.wav
tensor([[0.4987]], device='cuda:0')
0
Predicted: 'no-bird', expected: 'bird'

/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/000db435-a40f-4ad9-a74e-d1af284d2c44.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/000db435-a40f-4ad9-a74e-d1af284d2c44.wav
tensor([[0.4392]], device='cuda:0')
0
Predicted: 'no-bird', expected: 'no-bird'

/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/001059c0-e04f-42fc-a8e2-11aad24dc6fb.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/001059c0-e04f-42fc-a8e2-11aad24dc6fb.wav
tensor([[0.3921]], device='cuda:0')
0
Predicted: 'no-bird', expected: 'bird'

/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/00106202-f61e-467d-a80f-070d90421952.wav
/content/drive/My Drive/DCASE_Datas