## Constructing the datasets

In [1]:
import librosa
import pandas as pd
import numpy as np
from IPython.lib.display import Audio
from matplotlib import pyplot as plt
import multiprocessing
import scipy.signal

import torch

The audio has been recorded with a sampling rate of 44100 Hz. There are 686 labeled recordings in the dataset.

**shortest recording:** 1.01 seconds<br>
**longest recording:** 5.004 seconds<br>
**mean length:** 4.22 seconds<br>
**median length:** 5.004 seconds<br>

424 recordings are of maximum length and 217 are shorter than 4 seconds.

In [21]:
# Sampling rate of the recordings in Hz; `1*SAMPLE_RATE` frames therefore span one second of audio.
SAMPLE_RATE = 44100

We can read the audio in and store it in the annotations dataframe for quicker iteration through the dataset during training.

In [34]:
%%time

anno = pd.read_csv('data/annotations.csv')

audio = []

for _, row in anno.iterrows():
    recording, sr = librosa.load(f'data/audio/{row.filename}', sr=None)
    audio.append(recording)
    
anno['audio'] = audio

CPU times: user 436 ms, sys: 108 ms, total: 544 ms
Wall time: 545 ms


There are 87 possible labels that we can one hot encode as our targets.

In [36]:
# Preview the annotations now that the waveforms are attached in the `audio` column.
anno.head()

Unnamed: 0,filename,Aegcau_call,Alaarv_song,Anttri_song,Butbut_call,Carcan_call,Carcan_song,Carcar_call,Carcar_song,Cerbra_call,...,Sylund_call,Sylund_song,Tetpyg_song,Tibtom_song,Trotro_song,Turmer_call,Turmer_song,Turphi_call,Turphi_song,audio
0,nips4b_birds_trainfile001.wav,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0.0, -0.00048828125, -0.0009765625, 0.0004882..."
1,nips4b_birds_trainfile002.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0.0, 0.00048828125, 0.0, 0.00048828125, 0.000..."
2,nips4b_birds_trainfile003.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[-0.00048828125, 0.00048828125, 0.001953125, 0..."
3,nips4b_birds_trainfile004.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,"[-0.00048828125, -0.00048828125, -0.0034179688..."
4,nips4b_birds_trainfile005.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,"[0.0024414062, 0.0053710938, 0.00390625, 0.004..."


The only annotations we have available for this dataset are the individual codenames. We will use these as our labels.

Given this, we provide 4 options for this dataset:
* sample a random 1-second window from each call for each example (the **sample** option)
* cut each example into examples of 1 second duration (the **cut** option); this will produce some number of new examples that depends on the total length of the recordings
* pad each example to the length of the longest example in the dataset (the **pad** option)
* take just the first 1 second of each call (the **first** option)

In [98]:
class ExampleProcessor:
    """Callable wrapper around one of the named strategies in ``options``.

    ``example_length`` is the key of the strategy to apply. It is checked with an
    ``assert`` against the module-level ``options`` dict at construction time.
    """

    def __init__(self, example_length):
        assert example_length in options
        self.example_length = example_length

    def __call__(self, example):
        """Apply the selected strategy to ``example`` and return its result."""
        # The lookup happens on every call, so the current contents of `options` are used.
        transform = options[self.example_length]
        return transform(example)

def first(example):
    """Return the leading ``1*SAMPLE_RATE`` frames of ``example``.

    Plain slicing is used, so an input shorter than that comes back unchanged in length.
    """
    n_frames = 1 * SAMPLE_RATE
    return example[:n_frames]

def sample(example, length=None):
    """Return a random contiguous window of ``length`` frames from ``example``.

    Parameters
    ----------
    example : array with the frames along axis 0.
    length : int, optional
        Window length in frames. Defaults to ``1*SAMPLE_RATE`` (one second).

    Returns
    -------
    A slice of ``example`` holding exactly ``length`` frames.

    Raises
    ------
    ValueError
        If ``example`` has fewer than ``length`` frames.
    """
    if length is None:
        length = 1 * SAMPLE_RATE
    n_frames = example.shape[0]
    if n_frames < length:
        raise ValueError(
            f'cannot sample {length} frames from an example of {n_frames} frames'
        )
    # randint's upper bound is exclusive: the +1 makes the last valid start
    # (n_frames - length) reachable and lets an example of exactly `length`
    # frames through instead of raising on randint(0).
    start_frame = np.random.randint(n_frames - length + 1)
    return example[start_frame:start_frame + length]

def pad(example, length=None):
    """Zero-pad ``example`` at the end so that it has ``length`` frames.

    Parameters
    ----------
    example : 1-D array of frames.
    length : int, optional
        Target length in frames. Defaults to 5.005 seconds at ``SAMPLE_RATE``
        (rounded up), which is big enough to house the biggest example we have
        in the dataset.

    Returns
    -------
    A new array of ``length`` frames with the dtype of ``example``: the input
    followed by zeros.

    Raises
    ------
    ValueError
        If ``example`` is longer than ``length``.
    """
    if length is None:
        # np.zeros needs an integer shape; SAMPLE_RATE*5.005 is a float.
        length = int(np.ceil(SAMPLE_RATE * 5.005))
    n_frames = example.shape[0]
    if n_frames > length:
        raise ValueError(
            f'example of {n_frames} frames does not fit in {length} frames'
        )
    # Keep the input dtype so padded examples match what `first`/`sample` return.
    out = np.zeros(length, dtype=example.dtype)
    out[:n_frames] = example
    return out

# Registry of the per-example strategies, keyed by the name passed to ExampleProcessor.
options = dict(first=first, sample=sample, pad=pad)

In [48]:
# NOTE(review): the saved output below lists rows 105, 145, 632, ... rather than 0-4, so this
# cell appears to have been run after `anno` was shuffled further down — re-run in order to confirm.
anno.head()

Unnamed: 0,filename,Aegcau_call,Alaarv_song,Anttri_song,Butbut_call,Carcan_call,Carcan_song,Carcar_call,Carcar_song,Cerbra_call,...,Sylund_call,Sylund_song,Tetpyg_song,Tibtom_song,Trotro_song,Turmer_call,Turmer_song,Turphi_call,Turphi_song,audio
105,nips4b_birds_trainfile106.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0.0, 0.0, 0.00048828125, 0.0, 0.00048828125, ..."
145,nips4b_birds_trainfile146.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[-0.0014648438, -0.0014648438, 0.0014648438, 0..."
632,nips4b_birds_trainfile633.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[-0.00048828125, 0.0, -0.0014648438, -0.001953..."
194,nips4b_birds_trainfile195.wav,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[-0.0024414062, -0.0029296875, -0.0034179688, ..."
89,nips4b_birds_trainfile090.wav,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[-0.0068359375, -0.0068359375, -0.0063476562, ..."


In [50]:
from IPython.core.debugger import set_trace

In [146]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(waveform, labels)`` pairs from an annotations frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per recording, with a ``filename`` column, the label columns at
        positions ``LABEL_COLUMNS`` and an ``audio`` column holding the waveform.
    example : str
        ``'cut'`` splits every recording into consecutive, non-overlapping windows
        of ``SAMPLE_RATE`` frames (a trailing remainder shorter than that is
        dropped) and serves each window through the ``'first'`` strategy. Any
        other value keeps one example per recording and is handed to
        ``ExampleProcessor`` as the per-item strategy.
    """

    # Positions of the label columns, both in the rows of `df` and in `self.examples`.
    # NOTE(review): the `anno.head()` output lists `Unnamed: 0` before `filename`; if
    # that is a real column rather than the index, these positions are off by one
    # for `df` — confirm against the frame actually passed in.
    LABEL_COLUMNS = slice(1, 88)

    def __init__(self, df, example='sample'):
        cut = example == 'cut'
        if cut:
            # The windows are already one second long, so 'first' returns them whole.
            example = 'first'
        # Built before the loop so an unknown option fails before any work is done.
        self.example_processor = ExampleProcessor(example)

        filenames, labels, audio = [], [], []
        for _, row in df.iterrows():
            recording = row['audio']
            if cut:
                n_segments = recording.shape[0] // SAMPLE_RATE
                segments = [
                    recording[i * SAMPLE_RATE:(i + 1) * SAMPLE_RATE]
                    for i in range(n_segments)
                ]
            else:
                # The strategy is applied lazily in __getitem__; keep the full recording.
                segments = [recording]

            row_labels = row.iloc[self.LABEL_COLUMNS]
            for segment in segments:
                filenames.append(row['filename'])
                labels.append(row_labels)
                audio.append(segment)

        # Column layout: filename, the label columns, audio — so the labels sit at
        # LABEL_COLUMNS again, which is what __getitem__ relies on.
        self.examples = pd.concat(
            (
                pd.DataFrame({'filename': filenames}),
                pd.DataFrame(labels).reset_index(drop=True),
                pd.DataFrame({'audio': audio}),
            ),
            axis=1,
        )

    def __getitem__(self, index):
        """Return ``(x, y)``: the processed waveform and the labels as a float32 array."""
        example = self.examples.iloc[index]
        x = self.example_processor(example['audio'])
        y = example.iloc[self.LABEL_COLUMNS].values.astype(np.float32)
        return x, y

    def __len__(self):
        """Number of examples (windows for 'cut', recordings otherwise)."""
        return self.examples.shape[0]

In [147]:
# Shuffle the recordings before the positional train/validation split below.
# A fixed random_state makes the split the same on every fresh run of the notebook.
anno = anno.sample(frac=1, random_state=42)

In [148]:
# Split by recording first, then cut: the first 600 rows of the frame feed the
# training set and the remaining rows feed the validation set.
train_ds = Dataset(anno.iloc[:600], example='cut')
valid_ds = Dataset(anno.iloc[600:], example='cut')

Here is what constitutes a single example - amplitude values as an array being the independent variable, and one hot encoded labels.

In [153]:
# Index the dataset directly to inspect one (waveform, label vector) pair.
train_ds[0]

(array([ 0.00146484, -0.00048828, -0.00195312, ..., -0.00097656,
        -0.00146484, -0.00146484], dtype=float32),
 array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.], dtype=float32))

Let's now construct the dataloaders to ensure everything works as expected.

In [150]:
# Settings common to both loaders; all but one of the reported CPUs are used as workers.
loader_kwargs = dict(
    batch_size=32,
    num_workers=multiprocessing.cpu_count() - 1,
)

# Only the training loader shuffles; validation keeps a fixed order.
train_dl = torch.utils.data.DataLoader(dataset=train_ds, shuffle=True, **loader_kwargs)
valid_dl = torch.utils.data.DataLoader(dataset=valid_ds, shuffle=False, **loader_kwargs)

In [151]:
# Smoke test: walk through every batch of both loaders once. `batch` is left
# bound to the final validation batch afterwards.
for batch in train_dl:
    pass

for batch in valid_dl:
    pass

In [152]:
# `batch` is whatever the loops in the previous cell left behind, i.e. the last batch of `valid_dl`.
batch[0].shape, batch[1].shape # we are on the final batch, there were not enough examples to fill it

(torch.Size([9, 44100]), torch.Size([9, 87]))