First, let us load all the data that we need. Thanks to `nbdev`, we can reuse the functionality we defined in `01_gettin_started`.

In [1]:
import soundfile as sf

In [2]:
from birdcall.data import *

# Load the items and class list (the meaning of the argument 100 is defined
# by `get_items` — TODO confirm against its definition).
items, classes = get_items(100)

# Use only the first train/validation split returned by the helper.
first_split = trn_val_split_items(items, 10)[0]
trn_idxs, val_idxs = first_split

# Normalisation statistics are computed from the training indices only and
# then shared by both datasets.
mean, std = calculate_mean_and_std(items, trn_idxs)
trn_ds, val_ds = (
    AudioDataset(items[idxs], classes, mean, std)
    for idxs in (trn_idxs, val_idxs)
)

In [3]:
# Number of examples in the training and validation datasets.
len(trn_ds), len(val_ds)

(23734, 2638)

In [4]:
from fastai2.vision.all import *

We need some sort of architecture to get started. The one adapted from this [paper](https://www.groundai.com/project/end-to-end-environmental-sound-classification-using-a-1d-convolutional-neural-network/1) seems like a good place to start.

In [5]:
# Show the worker count passed to the data loaders below. The name is not
# defined in this notebook; presumably it comes from a star import — confirm.
NUM_WORKERS

8

In [6]:
BS = 128

# Settings shared by the training and validation loaders.
loader_kwargs = dict(bs=BS, num_workers=NUM_WORKERS)

# Only the training loader shuffles; the validation loader keeps its default order.
trn_dl = DataLoader(dataset=trn_ds, shuffle=True, **loader_kwargs)
val_dl = DataLoader(dataset=val_ds, **loader_kwargs)

# Bundle both loaders and move them to the GPU.
dls = DataLoaders(trn_dl, val_dl).cuda()

In [7]:
# Sanity check: pull a single training batch and look at the shape of its
# first element (presumably the model inputs — confirm against AudioDataset).
b = dls.train.one_batch()
b[0].shape

torch.Size([128, 240000])

Let's define our architecture

In [8]:
def get_arch():
    """Build a fresh, untrained 1D-convolutional classifier for raw waveforms.

    The network takes a 2D batch, adds a channel axis, applies a stack of
    eight 1D `ConvLayer`s, flattens the result and feeds it through four
    `LinBnDrop` blocks. The last block has one output per entry in the
    module-level `classes`.

    Returns:
        nn.Sequential: a new model instance on every call.

    Note:
        The first linear block is hard-coded to 7424 input features. That is
        256 channels x 29 time steps, which is what this convolution stack
        produces for the 240000-sample inputs used in this notebook (assuming
        `ConvLayer`'s default padding). Inputs of a different length need a
        different value here.
    """
    return nn.Sequential(
        # Insert a channel axis at dim 1 so the 1D convolutions get (N, C, L) input.
        Lambda(lambda x: x.unsqueeze(1)),
        ConvLayer(1, 16, ks=64, stride=2, ndim=1),
        ConvLayer(16, 16, ks=8, stride=8, ndim=1),
        ConvLayer(16, 32, ks=32, stride=2, ndim=1),
        ConvLayer(32, 32, ks=8, stride=8, ndim=1),
        ConvLayer(32, 64, ks=16, stride=2, ndim=1),
        ConvLayer(64, 128, ks=8, stride=2, ndim=1),
        ConvLayer(128, 256, ks=4, stride=2, ndim=1),
        ConvLayer(256, 256, ks=4, stride=4, ndim=1),
        Flatten(),
        # 7424 = 256 channels * 29 steps; see the docstring note.
        LinBnDrop(7424, 512, p=0.25, act=nn.ReLU()),
        LinBnDrop(512, 512, p=0.25, act=nn.ReLU()),
        LinBnDrop(512, 256, p=0.25, act=nn.ReLU()),
        # Final block: no activation is passed, so it emits raw scores (one per class).
        LinBnDrop(256, len(classes)),
    )

A couple of functions to help us calculate metrics for diagnostics

In [9]:
def preds_to_tp_fp_fn(preds, targs):
    """Count true positives, false positives and false negatives.

    A prediction counts as positive when the sigmoid of its logit is strictly
    greater than 0.5; a target counts as positive when it equals 1.

    Args:
        preds: tensor of raw logits.
        targs: tensor of targets with the same shape as `preds`.

    Returns:
        A `(tp, fp, fn)` tuple of scalar count tensors.
    """
    is_target = targs == 1
    predicted = preds.sigmoid() > 0.5

    tp = predicted[is_target].sum()
    fp = predicted[~is_target].sum()
    # Missed targets: positions that are positive in `targs` but not predicted.
    fn = (~predicted)[is_target].sum()
    return tp, fp, fn

In [13]:
def _safe_ratio(numerator, denominator):
    """Return `numerator / denominator` as a Python float, or 0.0 if the denominator is zero."""
    if denominator == 0:
        # 0/0 would otherwise yield NaN and poison every metric derived from it.
        return 0.0
    return (numerator.float() / denominator).item()


def precision(preds, targs):
    """Fraction of positive predictions that are correct: tp / (tp + fp).

    Args:
        preds: tensor of raw logits (thresholded by `preds_to_tp_fp_fn`).
        targs: tensor of targets with the same shape as `preds`.

    Returns:
        float: the precision, or 0.0 when nothing was predicted positive.
    """
    tp, fp, _ = preds_to_tp_fp_fn(preds, targs)
    return _safe_ratio(tp, tp + fp)


def recall(preds, targs):
    """Fraction of positive targets that were predicted: tp / (tp + fn).

    Args:
        preds: tensor of raw logits (thresholded by `preds_to_tp_fp_fn`).
        targs: tensor of targets with the same shape as `preds`.

    Returns:
        float: the recall, or 0.0 when there are no positive targets.
    """
    tp, _, fn = preds_to_tp_fp_fn(preds, targs)
    return _safe_ratio(tp, tp + fn)


def f1(preds, targs, eps=1e-8):
    """Harmonic mean of precision and recall.

    Args:
        preds: tensor of raw logits (thresholded by `preds_to_tp_fp_fn`).
        targs: tensor of targets with the same shape as `preds`.
        eps: small constant added to the denominator so the result is 0.0
            rather than a division error when precision and recall are both 0.

    Returns:
        float: 2 * precision * recall / (precision + recall + eps).
    """
    # Compute the counts once and derive both ratios from them.
    tp, fp, fn = preds_to_tp_fp_fn(preds, targs)
    prec = _safe_ratio(tp, tp + fp)
    rec = _safe_ratio(tp, tp + fn)
    return 2 * (prec * rec) / (prec + rec + eps)

In [14]:
# Diagnostics reported after each epoch; each function is wrapped in
# AccumMetric before being handed to the Learner.
metric_fns = (precision, recall, f1)

learn = Learner(
    dls,
    get_arch(),
    loss_func=BCEWithLogitsLossFlat(),
    metrics=[AccumMetric(fn) for fn in metric_fns],
)

In [15]:
# Train for 3 epochs; the second argument (1e-3) is the learning rate.
learn.fit(3, 1e-3)

epoch,train_loss,valid_loss,precision,recall,f1,time
0,0.09549,0.031127,0.120482,0.003791,0.00735,00:30
1,0.023005,0.019363,0.31746,0.007582,0.014809,00:30
2,0.013935,0.010793,0.954373,0.095148,0.173044,00:30
