In [1]:
from birdcall.data import *
from birdcall.metrics import *
from birdcall.ops import *

import torch
import torchvision
from torch import nn
import numpy as np
import pandas as pd

In [None]:
classes = pd.read_pickle('data/classes.pkl')

In [None]:
north_american_birds = train[train.country.isin(['United States', 'Canada', 'Mexico'])].ebird_code.value_counts()
north_american_birds_common = north_american_birds[north_american_birds == 100].index.tolist()

In [None]:
from sklearn.model_selection import StratifiedKFold

In [38]:
north_american_birds_common

['daejun',
 'houfin',
 'cacwre',
 'blujay',
 'marwre',
 'astfly',
 'purfin',
 'spotow',
 'pasfly',
 'warvir',
 'foxspa',
 'comgra',
 'ruckin',
 'mouchi',
 'whtspa',
 'eastow',
 'bushti',
 'whcspa',
 'gnttow',
 'dowwoo',
 'wesmea',
 'rebwoo',
 'sonspa',
 'carwre',
 'brdowl',
 'evegro',
 'bnhcow',
 'fiespa',
 'indbun',
 'swaspa',
 'bulori',
 'rebnut',
 'whbnut',
 'amecro',
 'annhum',
 'rewbla',
 'herthr',
 'bkhgro',
 'bkcchi',
 'orcwar',
 'linspa',
 'amegfi',
 'vesspa',
 'logshr',
 'buggna',
 'tuftit',
 'gockin',
 'savspa',
 'bewwre',
 'pilwoo',
 'canwre',
 'norcar',
 'scoori',
 'brespa',
 'comyel',
 'amerob',
 'pinsis']

In [2]:
from pathlib import Path
import soundfile as sf

In [68]:
positive_class_items  = []

for ebird in north_american_birds_common:
    paths = list(Path(f'data/train_resampled/{ebird}').iterdir())
    for path in paths:
        positive_class_items.append((classes.index(ebird), path, sf.info(path).duration))

In [63]:
negative_class_items = []

for directory in Path(f'data/train_resampled/').iterdir():
    if directory.name not in north_american_birds_common:
        for path in directory.iterdir():
            negative_class_items.append((classes.index(directory.name), path, sf.info(path).duration))

In [69]:
np.random.shuffle(positive_class_items)

In [71]:
pd.to_pickle(positive_class_items, 'data/positive_class_items.pkl')
pd.to_pickle(negative_class_items, 'data/negative_class_items.pkl')

In [78]:
sk = StratifiedKFold()
splits = list(sk.split([item[0] for item in positive_class_items], [item[0] for item in positive_class_items]))

In [86]:
pd.to_pickle(splits, 'data/splits.pkl')

In [90]:
splits[0][0].shape

(4560,)

In [97]:
pd.to_pickle(north_american_birds_common, 'data/north_american_birds_common.pkl')

In [102]:
len(north_american_birds_common)

57

In [3]:
splits = pd.read_pickle('data/splits.pkl')
positive_class_items = pd.read_pickle('data/positive_class_items.pkl')
negative_class_items = pd.read_pickle('data/negative_class_items.pkl')
north_american_birds_common = pd.read_pickle('data/north_american_birds_common.pkl')

In [4]:
from collections import defaultdict

class MelspecPoolDatasetNegativeClass(torch.utils.data.Dataset):
    def __init__(self, items, items_neg_class, classes, north_american_birds_common, len_mult=20, specs_per_example=30):
        self.recs = defaultdict(list)
        for item in items:
            self.recs[item[0]].append(item)
        self.items = items
        self.items_neg_class = items_neg_class
        self.all_classes = classes
        self.vocab = north_american_birds_common
        self.specs_per_example = specs_per_example
        self.len_mult = len_mult
        
    def __getitem__(self, idx):
        if np.random.rand() > 0.56:
            cls_idx = idx % len(self.vocab)
            ebird_idx = self.all_classes.index(self.vocab[cls_idx])
            recs = self.recs[ebird_idx]
            _, path, duration = recs[np.random.randint(0, len(recs))]
        else:
            cls_idx = -1
            _, path, duration = self.items_neg_class[np.random.randint(len(self.items_neg_class))]
            
        example = self.sample_specs(path, duration, self.specs_per_example)
        imgs = example.reshape(-1, 3, 80, 212)
        return imgs.astype(np.float32), self.one_hot_encode(cls_idx)
    
    def sample_specs(self, path, duration, count):
        x, _ = sf.read(path)

        if x.shape[0] < 1.66*SAMPLE_RATE:
            x =  np.tile(x, 5) # the shortest rec in the train set is 0.39 sec

        xs = []
        for _ in range(count):
            start_frame = int(np.random.rand() * (x.shape[0] - 1.66 * SAMPLE_RATE))
            xs.append(x[start_frame:start_frame+int(1.66*SAMPLE_RATE)])

        specs = []
        for x in xs:
            specs.append(audio_to_melspec(x))
        return np.stack(specs)
    
    def show(self, idx):
        x = self[idx][0][0]
        return plt.imshow(x.transpose(1,2,0)[:, :, 0])
        
    def one_hot_encode(self, y):
        one_hot = np.zeros((len(self.vocab)))
        if y != -1:
            one_hot[y] = 1
        return one_hot
    
    def __len__(self):
        return self.len_mult * len(self.vocab)

In [5]:
train_items = np.array(positive_class_items)[splits[0][0]].tolist()
val_items = np.array(positive_class_items)[splits[0][1]].tolist()

In [6]:
classes = pd.read_pickle('data/classes.pkl')
north_american_birds_common = pd.read_pickle('data/north_american_birds_common.pkl')

train_ds = MelspecPoolDatasetNegativeClass(train_items, negative_class_items, classes, north_american_birds_common, len_mult=300)
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, num_workers=NUM_WORKERS, pin_memory=True, shuffle=True)

In [7]:
class MelspecShortishValidatioDataset(torch.utils.data.Dataset):
    def __init__(self, items, vocab, items_neg_class):
        self.vocab = vocab
        self.items = items
        self.items_neg_class = items_neg_class
        
    def __len__(self): return 2*len(self.items)
    
    def __getitem__(self, idx):
        if idx < len(self.items):
            return self.create_example(self.items[idx])
        else:
            return self.create_example(self.items_neg_class[np.random.randint(len(self.items_neg_class))], True)
    
    def create_example(self, item, neg_class=False):
        cls_idx, path, num_specs = item
        if neg_class: cls_idx = -1
        
        x, _ = sf.read(path)

        example_duration = num_specs * 5 * SAMPLE_RATE
        if x.shape[0] < example_duration:
            x = np.tile(x, example_duration // x.shape[0] + 1)
            
        start_frame = 0
        x = x[start_frame:example_duration]

        xs = []
        for i in range(num_specs):
            for j in range(3):
                start_frame = int((i * 3 + j) * 1.66 * SAMPLE_RATE)
                xs.append(x[start_frame:start_frame+int(1.66*SAMPLE_RATE)])

        specs = []
        for x in xs:
            specs.append(audio_to_melspec(x))
        specs = np.stack(specs)
        imgs = specs.reshape(-1, 3, 80, 212)

        one_hot = np.zeros((len(self.vocab)))
        if cls_idx != -1: one_hot[cls_idx] = 1

        return imgs.astype(np.float32), one_hot

In [8]:
def bin_items_negative_class(items, vocab, all_classes):
    val_recs = defaultdict(list)
    for item in val_items:
        val_recs[item[0]].append(item)
        
    binned_items = defaultdict(list)
    for key in val_recs.keys():
        cls_idx = vocab.index(all_classes[key])
        for _, path, duration in val_recs[key]:
            if duration < 7.5: binned_items[1].append((cls_idx, path, 1))
            elif duration < 12.5: binned_items[2].append((cls_idx, path, 2))
            elif duration < 25: binned_items[4].append((cls_idx, path, 4))
            elif duration < 45: binned_items[6].append((cls_idx, path, 6))
            else: binned_items[10].append((cls_idx, path, 10))
    return binned_items

In [9]:
val_items_binned = bin_items_negative_class(val_items, north_american_birds_common, classes)

In [10]:
negative_class_items_num_specs = defaultdict(list)
for _, path, duration in negative_class_items:
    if duration < 7.5: bin_num = 1
    elif duration < 12.5: bin_num = 2
    elif duration < 25: bin_num = 4
    elif duration < 45: bin_num = 6
    else: bin_num = 10
    negative_class_items_num_specs[bin_num].append((1000, path, bin_num))

In [11]:
class FrontEnd(nn.Module):
    def __init__(self):
        super().__init__()
        self.bn = nn.BatchNorm1d(80, affine=False)
        self.register_parameter('alpha', torch.nn.Parameter(torch.tensor(0.)))
        
    def forward(self, x):
        bs, im_num, ch, y_dim, x_dim = x.shape
        x = x ** torch.sigmoid(self.alpha)
        x = x.view(-1, y_dim, x_dim)
        x = self.bn(x)
        return x.view(bs, im_num, ch, y_dim, x_dim)

In [12]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.frontend = FrontEnd()
        self.cnn = nn.Sequential(*list(torchvision.models.resnet34(True).children())[:-2])
        self.classifier = nn.Sequential(*[
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(p=0.5), nn.BatchNorm1d(512),
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(p=0.5), nn.BatchNorm1d(512),
            nn.Linear(512, len(north_american_birds_common))
        ])
    
    def forward(self, x):
        bs, im_num, ch, y_dim, x_dim = x.shape
        x = self.frontend(x)
        x = self.cnn(x.view(-1, ch, y_dim, x_dim))
        x = x.mean((2,3))
        x = self.classifier(x)
        x = x.view(bs, im_num, -1)
        x = lme_pool(x)
        return x

In [13]:
model = Model().cuda()

In [14]:
state_dict = torch.load('models/130_lmepool_frontend_0.72.pth')

del state_dict['classifier.8.weight']
del state_dict['classifier.8.bias']

model.load_state_dict(state_dict, strict=False)

In [17]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score
import time

In [18]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), 1e-3)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 5)

In [19]:
t0 = time.time()
for epoch in range(130):
    running_loss = 0.0
    for i, data in enumerate(train_dl, 0):
        model.train()
        inputs, labels = data[0].cuda(), data[1].cuda()
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        if np.isnan(loss.item()): 
            print(f'!!! nan encountered in loss !!! alpha: epoch: {epoch}\n')
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()


    if epoch % 5 == 4:
        model.eval();
        preds = []
        targs = []
        fns = []

        for num_specs in val_items_binned.keys():
            valid_ds = MelspecShortishValidatioDataset(val_items_binned[num_specs], north_american_birds_common, negative_class_items_num_specs[num_specs])
            valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=2*16, num_workers=NUM_WORKERS, pin_memory=True)

            fns += [item[1].name for item in valid_ds.items]

            with torch.no_grad():
                for data in valid_dl:
                    inputs, labels = data[0].cuda(), data[1].cuda()
                    outputs = model(inputs)
                    preds.append(outputs.cpu().detach())
                    targs.append(labels.cpu().detach())

        preds = torch.cat(preds)
        targs = torch.cat(targs)

        accuracy = accuracy_score(preds.sigmoid() > 0.5, targs)
        f1 = f1_score(preds.sigmoid() > 0.5, targs, average='micro')
        print(f'[{epoch + 1}, {(time.time() - t0)/60:.1f}] loss: {running_loss / (len(train_dl)-1):.3f}, acc: {accuracy:.3f}, f1: {f1:.3f}')
        running_loss = 0.0

        torch.save(model.state_dict(), f'models/{epoch+1}_lmepool_frontend_neg_class_{round(f1, 2)}.pth')

[5, 20.0] loss: 0.005, acc: 0.775, f1: 0.753
[10, 39.9] loss: 0.003, acc: 0.782, f1: 0.768
[15, 59.9] loss: 0.002, acc: 0.825, f1: 0.805
[20, 79.9] loss: 0.002, acc: 0.776, f1: 0.759
[25, 99.9] loss: 0.001, acc: 0.783, f1: 0.764
[30, 119.9] loss: 0.001, acc: 0.811, f1: 0.785
[35, 139.9] loss: 0.001, acc: 0.809, f1: 0.788
[40, 159.9] loss: 0.001, acc: 0.769, f1: 0.740
[45, 179.9] loss: 0.001, acc: 0.784, f1: 0.769
[50, 199.9] loss: 0.001, acc: 0.788, f1: 0.774
[55, 219.8] loss: 0.001, acc: 0.800, f1: 0.774
[60, 239.8] loss: 0.002, acc: 0.823, f1: 0.795
[65, 259.8] loss: 0.001, acc: 0.796, f1: 0.768
[70, 279.8] loss: 0.001, acc: 0.755, f1: 0.744
[75, 299.8] loss: 0.001, acc: 0.786, f1: 0.762
[80, 319.7] loss: 0.001, acc: 0.763, f1: 0.747
[85, 339.7] loss: 0.001, acc: 0.809, f1: 0.781
[90, 359.6] loss: 0.001, acc: 0.779, f1: 0.756
[95, 379.6] loss: 0.001, acc: 0.756, f1: 0.737
[100, 399.5] loss: 0.001, acc: 0.785, f1: 0.761
[105, 419.5] loss: 0.001, acc: 0.785, f1: 0.767
[110, 439.5] loss

In [20]:
f1s = []
ts = []
for t in np.linspace(0.4, 1, 61):
    f1s.append(f1_score(preds.sigmoid() > t, targs, average='micro'))
    ts.append(t)

In [21]:
max(f1s), accuracy_score(preds.sigmoid() > ts[np.argmax(f1s)], targs)

(0.767713004484305, 0.7956140350877193)

In [22]:
ts[np.argmax(f1s)]

0.8

In [23]:
from birdcall.metrics import *

preds_to_tp_fp_fn(preds, targs)

(tensor(872), tensor(286), tensor(268))

In [24]:
t0 = time.time()
for epoch in range(130, 260):
    running_loss = 0.0
    for i, data in enumerate(train_dl, 0):
        model.train()
        inputs, labels = data[0].cuda(), data[1].cuda()
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        if np.isnan(loss.item()): 
            print(f'!!! nan encountered in loss !!! alpha: epoch: {epoch}\n')
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()


    if epoch % 5 == 4:
        model.eval();
        preds = []
        targs = []
        fns = []

        for num_specs in val_items_binned.keys():
            valid_ds = MelspecShortishValidatioDataset(val_items_binned[num_specs], north_american_birds_common, negative_class_items_num_specs[num_specs])
            valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=2*16, num_workers=NUM_WORKERS, pin_memory=True)

            fns += [item[1].name for item in valid_ds.items]

            with torch.no_grad():
                for data in valid_dl:
                    inputs, labels = data[0].cuda(), data[1].cuda()
                    outputs = model(inputs)
                    preds.append(outputs.cpu().detach())
                    targs.append(labels.cpu().detach())

        preds = torch.cat(preds)
        targs = torch.cat(targs)

        accuracy = accuracy_score(preds.sigmoid() > 0.5, targs)
        f1 = f1_score(preds.sigmoid() > 0.5, targs, average='micro')
        print(f'[{epoch + 1}, {(time.time() - t0)/60:.1f}] loss: {running_loss / (len(train_dl)-1):.3f}, acc: {accuracy:.3f}, f1: {f1:.3f}')
        running_loss = 0.0

        torch.save(model.state_dict(), f'models/{epoch+1}_lmepool_frontend_neg_class_{round(f1, 2)}.pth')

[135, 19.9] loss: 0.001, acc: 0.795, f1: 0.762
[140, 39.9] loss: 0.001, acc: 0.793, f1: 0.765
[145, 59.8] loss: 0.001, acc: 0.782, f1: 0.752
[150, 79.8] loss: 0.001, acc: 0.777, f1: 0.762
[155, 99.7] loss: 0.001, acc: 0.774, f1: 0.745
[160, 119.7] loss: 0.000, acc: 0.733, f1: 0.722
[165, 139.6] loss: 0.001, acc: 0.762, f1: 0.751
[170, 159.6] loss: 0.000, acc: 0.795, f1: 0.768
[175, 179.5] loss: 0.001, acc: 0.793, f1: 0.775


KeyboardInterrupt: 

In [25]:
model.load_state_dict(torch.load('models/175_lmepool_frontend_neg_class_0.77.pth'))

<All keys matched successfully>

In [26]:
model.eval();
preds = []
targs = []
fns = []

for num_specs in val_items_binned.keys():
    valid_ds = MelspecShortishValidatioDataset(val_items_binned[num_specs], north_american_birds_common, negative_class_items_num_specs[num_specs])
    valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=2*16, num_workers=NUM_WORKERS, pin_memory=True)

    fns += [item[1].name for item in valid_ds.items]

    with torch.no_grad():
        for data in valid_dl:
            inputs, labels = data[0].cuda(), data[1].cuda()
            outputs = model(inputs)
            preds.append(outputs.cpu().detach())
            targs.append(labels.cpu().detach())

preds = torch.cat(preds)
targs = torch.cat(targs)

In [27]:
max(f1s), accuracy_score(preds.sigmoid() > ts[np.argmax(f1s)], targs)

(0.767713004484305, 0.793859649122807)

In [28]:
ts[np.argmax(f1s)]

0.8

In [29]:
from birdcall.metrics import *

preds_to_tp_fp_fn(preds, targs)

(tensor(860), tensor(223), tensor(280))