In [2]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import sys
sys.path.insert(0,'/content/drive/My Drive/MSc_Project_Colab/BAD_PyTorch/')

In [4]:
!pip install torchaudio

Collecting torchaudio
  Downloading torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 7.4 MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.9.0


**CNN**

In [5]:
from torch import nn
from torchsummary import summary


class CNNNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=3)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=3)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3), padding=0)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding='valid'
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(0.001),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3), padding=0)
        )
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(p=0.5)
        self.linearA = nn.Linear(448, 256)
        self.batchnormA = nn.BatchNorm1d(256)
        self.leakyrelu = nn.LeakyReLU(0.001)
        self.linearB = nn.Linear(256, 32)
        self.batchnormB = nn.BatchNorm1d(32)

        self.linear = nn.Linear(32, 1)
        # self.sigmoid = nn.Sigmoid()

    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        x = self.dropout(x)
        x = self.linearA(x)
        x = self.batchnormA(x)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        x = self.linearB(x)
        x = self.batchnormB(x)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        logits = self.linear(x)

        # predictions = self.sigmoid(logits)
        return logits


if __name__ == "__main__":
    cnn = CNNNetwork()
    summary(cnn.cuda(), (1, 80, 698))



----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 16, 78, 696]             160
       BatchNorm2d-2          [-1, 16, 78, 696]              32
         LeakyReLU-3          [-1, 16, 78, 696]               0
         MaxPool2d-4          [-1, 16, 26, 232]               0
            Conv2d-5          [-1, 16, 24, 230]           2,320
       BatchNorm2d-6          [-1, 16, 24, 230]              32
         LeakyReLU-7          [-1, 16, 24, 230]               0
         MaxPool2d-8            [-1, 16, 8, 76]               0
            Conv2d-9            [-1, 16, 6, 74]           2,320
      BatchNorm2d-10            [-1, 16, 6, 74]              32
        LeakyReLU-11            [-1, 16, 6, 74]               0
        MaxPool2d-12            [-1, 16, 6, 24]               0
           Conv2d-13            [-1, 16, 4, 22]           2,320
      BatchNorm2d-14            [-1, 16

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Train the model

In [10]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchaudio

from random import seed
from random import random
import matplotlib
import matplotlib.pyplot as plt

from dcasedataset import DCASE_Dataset

ANNOTATIONS_FILE = '/content/drive/My Drive/DCASE_Datasets/labels/ffwarblr_7000.csv'
AUDIO_DIR = '/content/drive/My Drive/DCASE_Datasets/audio/'
SAMPLE_RATE = 22050
DURATION = 10
NUM_SAMPLES = 22050 * DURATION


BATCH_SIZE = 16
EPOCHS = 30
LEARNING_RATE = 0.001

def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    return train_dataloader


def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    size = len(data_loader.dataset)
    stages_per_epoch = len(data_loader)
    average_loss = 0
    print("Dataset size: {}".format(size))
    print("Stages per epoch {}".format(stages_per_epoch))
    
    for input, target in data_loader:
        input, target = input.to(device), target.to(device)
        # print(input.shape)
        # print(target.shape)

        # for index, signal in enumerate(input):
        #   # seed(1)
        #   value = random()

        #   if value > 0.75:
        #     signal = t1masking(signal)
        #     signal = t2masking(signal)
        #     signal = fmasking(signal)
        #     input[index] = signal


        # def plot_spectrogram(spec, title=None, ylabel='freq_bin', aspect='auto', xmax=None):
        #   fig, axs = plt.subplots(1, 1)
        #   axs.set_title(title or 'Spectrogram')
        #   axs.set_ylabel(ylabel)
        #   axs.set_xlabel('frame')
        #   spec = spec.cpu()
        #   spec = spec[0,:,:]
        #   im = axs.imshow(spec, origin='lower', aspect=aspect)
        #   if xmax:
        #     axs.set_xlim((0, xmax))
        #   fig.colorbar(im, ax=axs)
        #   plt.show(block=False)

        # for signal_mod in input:
        #   plot_spectrogram(signal_mod)

        # calculate loss
        prediction = model(input)
        sigmoid = nn.Sigmoid()
        target = target.unsqueeze_(1)
        target = target.type(torch.cuda.FloatTensor)
        loss = loss_fn(sigmoid(prediction), target)
        print(f"loss: {loss.item()}")
        average_loss += loss.item()

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f"Average loss: {average_loss / stages_per_epoch}")


def train(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")


if __name__ == "__main__":

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using {device}")

    # instantiate dataset object and create data loader
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=316,
        n_mels=80,
        power=0.33
    )

    t1masking = torchaudio.transforms.TimeMasking(time_mask_param=40)
    t2masking = torchaudio.transforms.TimeMasking(time_mask_param=40)
    fmasking = torchaudio.transforms.FrequencyMasking(freq_mask_param=10)

    dcase = DCASE_Dataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            device)
    
    train_dataloader = create_data_loader(dcase, BATCH_SIZE)

    cnn = CNNNetwork().to(device)
    print(cnn)

    # initialise loss funtion + optimiser
    loss_fn = nn.BCELoss()
    optimiser = torch.optim.Adam(cnn.parameters(), 
                                 lr=LEARNING_RATE)

    # train model
    train(cnn, train_dataloader, loss_fn, optimiser, device, EPOCHS)

    # save model
    torch.save(cnn.state_dict(), "/content/drive/My Drive/MSc_Project_Colab/BAD_PyTorch/cnn.pth")
    print("Trained cnn saved at cnn.pth")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
/content/drive/My Drive/DCASE_Datasets/audio/ff1010bird/190606.wav
/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/287e02ba-5a05-47f2-b556.wav
/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/98ea5aaf-f3f7-4cc8-9fa3.wav
loss: 0.4893181324005127
/content/drive/My Drive/DCASE_Datasets/audio/ff1010bird/146217.wav
/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/3a283266-225c-4b59-94e7.wav
/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/a2e5de5e-2e94-4077-bde6.wav
/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/30a7c3cf-92b9-4d78-8e05.wav
/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/7f93fc49-71e4-4384-98e8.wav
/content/drive/My Drive/DCASE_Datasets/audio/ff1010bird/145530.wav
/content/drive/My Drive/DCASE_Datasets/audio/ff1010bird/167434.wav
/content/drive/My Drive/DCASE_Datasets/audio/warblrb10k/bb4f64b8-78f8-4de0-a78b.wav
/content/drive/My Drive/DCASE_Datasets/audio/warblrb10

In [12]:
import torch
import torchaudio
from torch import nn

from random import seed
from random import random
import matplotlib
import matplotlib.pyplot as plt

from dcasedatasetcpu import DCASE_Dataset
# from cnnbinary_uky import CNNNetwork
# from train_binary import ANNOTATIONS_FILE, AUDIO_DIR, SAMPLE_RATE, DURATION, NUM_SAMPLES

ANNOTATIONS_birdvox = '/content/drive/My Drive/DCASE_Datasets/labels/BirdVox-DCASE20k.csv'
ANNOTATIONS_warblr = '/content/drive/My Drive/DCASE_Datasets/labels/warblrb10k.csv'
ANNOTATIONS_freefield = '/content/drive/My Drive/DCASE_Datasets/labels/ff1010bird.csv'
ANNOTATIONS_mini = '/content/drive/My Drive/DCASE_Datasets/labels/mini_metadata.csv'
AUDIO_DIR = '/content/drive/My Drive/DCASE_Datasets/audio/'
SAMPLE_RATE = 22050
DURATION = 10
NUM_SAMPLES = 22050 * DURATION
THRESHOLD = 0.5


class_mapping = [
    "no-bird",
    "bird"
]


def predict(model, input, target, class_mapping):
    model.eval()
    with torch.no_grad():
        predictions = model(input).cuda()
        sigmoid = nn.Sigmoid()
        predictions = sigmoid(predictions)
        print(predictions)
        # Tensor (1, 10) -> [ [0.1, 0.01, ..., 0.6] ]
        if predictions[0] > THRESHOLD:

          # predicted_index = predictions[0].argmax(0)
          predicted_index = 1
        elif predictions[0] < THRESHOLD:
          predicted_index = 0
        print(predicted_index)
        predicted = class_mapping[predicted_index]
        expected = class_mapping[target]
    return predicted, expected


if __name__ == "__main__":
    # load back the model
    cnn = CNNNetwork()
    state_dict = torch.load("/content/drive/My Drive/MSc_Project_Colab/BAD_PyTorch/cnn.pth", map_location=torch.device('cpu'))
    cnn.load_state_dict(state_dict)

    # load DCASE dataset
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=316,
        n_mels=80,
        power=0.33
    )

    t1masking = torchaudio.transforms.TimeMasking(time_mask_param=40)
    t2masking = torchaudio.transforms.TimeMasking(time_mask_param=40)
    fmasking = torchaudio.transforms.FrequencyMasking(freq_mask_param=10)

    dcase = DCASE_Dataset(ANNOTATIONS_birdvox,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            "cpu")


    # get a sample from the dcase dataset for inference
    num_files = 1000

    count = 0
    index = 0
    correct = 0
    countnobird = 0
    countbird = 0
    while count < num_files:
  
      value = random()
      val_label = random()

      input, target = dcase[index][0], dcase[index][1]


      # if target == 0:
      #   if val_label < 0.667:
      #     index += 1
      #     continue

      # if value > 0.75:
      #   input = t1masking(input)
      #   input = t2masking(input)
      #   input = fmasking(input)
      #   input = input

      # def plot_spectrogram(spec, title=None, ylabel='freq_bin', aspect='auto', xmax=None):
      #     fig, axs = plt.subplots(1, 1)
      #     axs.set_title(title or 'Spectrogram')
      #     axs.set_ylabel(ylabel)
      #     axs.set_xlabel('frame')
      #     spec = spec.cpu()
      #     spec = spec[0,:,:]
      #     im = axs.imshow(spec, origin='lower', aspect=aspect)
      #     if xmax:
      #       axs.set_xlim((0, xmax))
      #     fig.colorbar(im, ax=axs)
      #     plt.show(block=False)

      
      # plot_spectrogram(input)


      input.unsqueeze_(0)

      index += 1
      count += 1

      if target == 1:
        countbird += 1
      else:
        countnobird += 1

    # make an inference
      predicted, expected = predict(cnn, input, target,
                                  class_mapping)
      if predicted == expected:
        correct += 1
      print(f"Predicted: '{predicted}', expected: '{expected}'")
      print()

    accuracy = correct / num_files
    print(accuracy)
    print("bird: {}".format(countbird))
    print("no-bird: {}".format(countnobird))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/024a895c-e99d-4699-adb9-538cda9ea48c.wav
tensor([[0.3048]], device='cuda:0')
0
Predicted: 'no-bird', expected: 'bird'

/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/024aebf9-3685-4328-ab7b-dc31f71bd2d1.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/024aebf9-3685-4328-ab7b-dc31f71bd2d1.wav
tensor([[0.8274]], device='cuda:0')
1
Predicted: 'bird', expected: 'bird'

/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/024bb492-c465-4b74-a2fe-2c5d0b6d7ee2.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/024bb492-c465-4b74-a2fe-2c5d0b6d7ee2.wav
tensor([[0.0205]], device='cuda:0')
0
Predicted: 'no-bird', expected: 'no-bird'

/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/024c381c-62e4-4dea-a95d-aee7d2041610.wav
/content/drive/My Drive/DCASE_Datasets/audio/BirdVox-DCASE-20k/024c381c-62e4-