In [1]:
from birdcall.data import *
from birdcall.metrics import *
from birdcall.ops import *

import torch
import torchvision
from torch import nn
import numpy as np
import pandas as pd
from pathlib import Path
import soundfile as sf

In [2]:
splits = pd.read_pickle('data/splits.pkl')
positive_class_items = pd.read_pickle('data/positive_class_items.pkl')
negative_class_items = pd.read_pickle('data/negative_class_items.pkl')
north_american_birds_common = pd.read_pickle('data/north_american_birds_common.pkl')
all_classes = pd.read_pickle('data/classes.pkl')

In [3]:
train_items = np.array(positive_class_items)[splits[0][0]].tolist()
val_items = np.array(positive_class_items)[splits[0][1]].tolist()
negative_class_items = [(-1, item[1], item[2]) for item in negative_class_items]

train_items = translate_class(train_items, all_classes, north_american_birds_common)
val_items = translate_class(val_items, all_classes, north_american_birds_common)

In [4]:
classes = pd.read_pickle('data/classes.pkl')
north_american_birds_common = pd.read_pickle('data/north_american_birds_common.pkl')

train_ds = MelspecPoolDatasetNegativeClass(train_items, negative_class_items, north_american_birds_common, len_mult=300, reshape_to_3ch=False)
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, num_workers=NUM_WORKERS, pin_memory=True, shuffle=True)

In [5]:
val_items_binned = bin_items_negative_class(val_items)

np.random.shuffle(negative_class_items)
negative_class_items = negative_class_items[:2500]
negative_class_items_binned = bin_items_negative_class(negative_class_items)

In [6]:
class FrontEnd(nn.Module):
    def __init__(self):
        super().__init__()
        self.bn = nn.BatchNorm1d(80, affine=False)
        self.register_parameter('alpha', torch.nn.Parameter(torch.tensor(0.)))
        
    def forward(self, x):
        bs, ch, y_dim, x_dim = x.shape
        x = x ** torch.sigmoid(self.alpha)
        x = x.view(-1, y_dim, x_dim)
        x = self.bn(x)
        return x.view(bs, ch, y_dim, x_dim)

In [7]:
class Body(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.convs = nn.ModuleList([
            nn.Conv2d(1, 64, 3),
            nn.Conv2d(64, 64, 3),
            nn.Conv2d(64, 128, 3),
            nn.Conv2d(128, 128, 3),
            nn.Conv2d(128, 128, (17,3)),
            nn.Conv2d(128, 1024, (1,21)),
            nn.Conv2d(1024, 1024, (1,1)),
            nn.Conv2d(1024, num_classes, (1,1))
        ])
        self.bns = nn.ModuleList([
            nn.BatchNorm1d(78),
            nn.BatchNorm1d(76),
            nn.BatchNorm1d(23),
            nn.BatchNorm1d(21),
            nn.BatchNorm1d(5),
            nn.BatchNorm1d(1),
            nn.BatchNorm1d(1),
        ])
    def forward(self, x):
        bs, ch = x.shape[:2]
        x = x.view(-1, 1, x.shape[-2], x.shape[-1])
        for i in range(2):
            x = self.convs[i](x)
            x = nn.functional.leaky_relu_(x)
            x = x.view(-1, x.shape[-2], x.shape[-1])
            x = self.bns[i](x)
            x = x.view(bs * ch, -1, x.shape[-2], x.shape[-1])

        x = nn.functional.max_pool2d(x, (3,3))
        
        for i in range(2, 5):
            x = self.convs[i](x)
            x = nn.functional.leaky_relu_(x)
            if i == 4: x = nn.functional.dropout2d(x, 0.5)
            x = x.view(-1, x.shape[-2], x.shape[-1])
            x = self.bns[i](x)
            x = x.view(bs * ch, -1, x.shape[-2], x.shape[-1])
        
        x = nn.functional.max_pool2d(x, (5,3))

        for i in range(5, 7):
            x = self.convs[i](x)
            x = nn.functional.leaky_relu_(x)
            x = nn.functional.dropout2d(x, 0.5)
            x = x.view(-1, x.shape[-2], x.shape[-1])
            x = self.bns[i](x)
            x = x.view(bs * ch, -1, x.shape[-2], x.shape[-1])
        
        x = self.convs[-1](x)
        return x.view(bs, ch, -1)

In [8]:
class Model(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.frontend = FrontEnd()
        self.body = Body(num_classes)

    def forward(self, x):
        x = self.frontend(x)
        x = self.body(x)
        x = lme_pool(x)
        return x

In [9]:
model = Model(num_classes=len(north_american_birds_common)).cuda()

In [10]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score
import time

In [11]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), 1e-3)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 5)

In [None]:
t0 = time.time()
for epoch in range(130):
    running_loss = 0.0
    for i, data in enumerate(train_dl, 0):
        model.train()
        inputs, labels = data[0].cuda(), data[1].cuda()
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        if np.isnan(loss.item()): 
            print(f'!!! nan encountered in loss !!! alpha: epoch: {epoch}\n')
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    model.eval();
    preds = []
    targs = []

    for num_specs in val_items_binned.keys():
        valid_ds = MelspecShortishValidatioDataset(val_items_binned[num_specs], north_american_birds_common, negative_class_items_binned[num_specs], reshape_to_3ch=False)
        valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=2*16, num_workers=NUM_WORKERS, pin_memory=True)

        with torch.no_grad():
            for data in valid_dl:
                inputs, labels = data[0].cuda(), data[1].cuda()
                outputs = model(inputs)
                preds.append(outputs.cpu().detach())
                targs.append(labels.cpu().detach())

    preds = torch.cat(preds)
    targs = torch.cat(targs)

    accuracy = accuracy_score(preds.sigmoid() > 0.5, targs)
    f1 = f1_score(preds.sigmoid() > 0.5, targs, average='micro')
    print(f'[{epoch + 1}, {(time.time() - t0)/60:.1f}] loss: {running_loss / (len(train_dl)-1):.3f}, acc: {accuracy:.3f}, f1: {f1:.3f}')
    running_loss = 0.0

    if epoch % 5 == 4: torch.save(model.state_dict(), f'models/{epoch+1}_lmepool_frontend_neg_nocnn_{round(f1, 2)}.pth')

[1, 16.6] loss: 0.051, acc: 0.687, f1: 0.000
[2, 32.8] loss: 0.047, acc: 0.687, f1: 0.002
[3, 48.9] loss: 0.044, acc: 0.680, f1: 0.010
[4, 65.1] loss: 0.041, acc: 0.656, f1: 0.037
[5, 81.3] loss: 0.038, acc: 0.644, f1: 0.080
[6, 97.5] loss: 0.034, acc: 0.645, f1: 0.129
[7, 113.7] loss: 0.032, acc: 0.645, f1: 0.203
[8, 129.9] loss: 0.029, acc: 0.647, f1: 0.207
[9, 146.1] loss: 0.027, acc: 0.694, f1: 0.268
[10, 162.3] loss: 0.025, acc: 0.689, f1: 0.326
[11, 178.5] loss: 0.024, acc: 0.695, f1: 0.334
[12, 194.7] loss: 0.023, acc: 0.691, f1: 0.379
[13, 210.9] loss: 0.021, acc: 0.704, f1: 0.367
[14, 227.1] loss: 0.020, acc: 0.711, f1: 0.400
[15, 243.3] loss: 0.019, acc: 0.704, f1: 0.412
[16, 259.5] loss: 0.018, acc: 0.720, f1: 0.406
[17, 275.7] loss: 0.017, acc: 0.721, f1: 0.430
[18, 291.9] loss: 0.017, acc: 0.714, f1: 0.430
[19, 308.1] loss: 0.016, acc: 0.729, f1: 0.470
[20, 324.3] loss: 0.016, acc: 0.731, f1: 0.470
[21, 340.5] loss: 0.015, acc: 0.741, f1: 0.495
[22, 356.7] loss: 0.015, acc

In [None]:
f1s = []
ts = []
for t in np.linspace(0.4, 1, 61):
    f1s.append(f1_score(preds.sigmoid() > t, targs, average='micro'))
    ts.append(t)

In [None]:
max(f1s), accuracy_score(preds.sigmoid() > ts[np.argmax(f1s)], targs)

In [None]:
ts[np.argmax(f1s)]

In [None]:
from birdcall.metrics import *

preds_to_tp_fp_fn(preds, targs)