In [1]:
from birdcall.data import *
from birdcall.metrics import *

import pandas as pd

In [2]:
classes = pd.read_pickle('data/classes.pkl')
train_ds = SpectrogramDataset(pd.read_pickle('data/train_set.pkl'), classes, len_mult=100, spec_max=80, spec_min=-100)
valid_ds = SpectrogramDataset(pd.read_pickle('data/val_set.pkl'), classes, len_mult=20)

In [3]:
len(train_ds), len(valid_ds)

(26400, 5280)

In [4]:
import torch
import torchvision
from torch import nn

In [5]:
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=120, shuffle=True, num_workers=NUM_WORKERS)
valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=2*120, shuffle=False, num_workers=NUM_WORKERS)

In [6]:
for b in train_dl: break
b[0].shape, b[1]

(torch.Size([120, 3, 90, 714]),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64))

In [7]:
b[0].mean(), b[0].std()

(tensor(-0.0143), tensor(0.9837))

In [8]:
res50 = torchvision.models.resnet50(True)

In [9]:
bottom = nn.Sequential(*list(res50.children())[:6])
mid = nn.Sequential(*list(res50.children())[6:-2])

In [10]:
from IPython.core.debugger import set_trace

In [11]:
class Head(nn.Module):
    def __init__(self):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.layers = nn.ModuleList(
            [
                nn.Linear(2048, 1024), nn.ReLU(), nn.Dropout(p=0.2),
                nn.Linear(1024, 1024), nn.ReLU(), nn.Dropout(p=0.2),
                nn.Linear(1024, len(classes))
            ]
        )
    def forward(self, x):
#         set_trace()
        x = self.avg_pool(x)
        x = x.view(x.shape[0], -1)
        for l in self.layers:
            x = l(x)
        return x

In [12]:
model = nn.Sequential(bottom, mid, Head())

In [13]:
model.cuda();

In [14]:
model(b[0].cuda()).shape

torch.Size([120, 264])

In [15]:
b[0].shape

torch.Size([120, 3, 90, 714])

In [16]:
import torch.optim as optim

# criterion = nn.BCEWithLogitsLoss()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), 1e-3)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 10)

In [17]:
from sklearn.metrics import accuracy_score, f1_score

In [18]:
import time

In [None]:
for epoch in range(100):
    t0 = time.time()
    running_loss = 0.0
    for i, data in enumerate(train_dl, 0):
        model.train()
        inputs, labels = data[0].cuda(), data[1].cuda()
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels.argmax(1))
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        running_loss += loss.item()
        
        
        if i % len(train_dl) == len(train_dl)-1:
            model.eval();
            preds = []
            targs = []

            with torch.no_grad():
                for data in valid_dl:
                    inputs, labels = data[0].cuda(), data[1].cuda()
                    outputs = model(inputs)
                    preds.append(outputs.cpu().detach())
                    targs.append(labels.cpu().detach())

                preds = torch.cat(preds)
                targs = torch.cat(targs)
            
            accuracy = (targs.argmax(1) == preds.softmax(-1).argmax(1)).float().mean().item()
            print(f'[{epoch + 1}, {time.time() - t0:.1f}] loss: {running_loss / (len(train_dl)-1):.3f}, accuracy: {accuracy:.3f}')
            running_loss = 0.0

[1, 91.0] loss: 5.608, accuracy: 0.004
[2, 90.3] loss: 5.421, accuracy: 0.005
[3, 90.5] loss: 5.072, accuracy: 0.006
[4, 90.4] loss: 4.833, accuracy: 0.006
[6, 90.4] loss: 4.460, accuracy: 0.010
[7, 90.3] loss: 4.306, accuracy: 0.007
[8, 90.5] loss: 4.197, accuracy: 0.008
[9, 90.4] loss: 4.066, accuracy: 0.008
[10, 90.3] loss: 3.915, accuracy: 0.007
[13, 90.4] loss: 3.597, accuracy: 0.005
[14, 90.3] loss: 3.482, accuracy: 0.008
[15, 90.5] loss: 3.377, accuracy: 0.007
[16, 90.3] loss: 3.291, accuracy: 0.004
[17, 90.6] loss: 3.190, accuracy: 0.005
[18, 90.5] loss: 3.138, accuracy: 0.005
[19, 90.1] loss: 2.967, accuracy: 0.006
[20, 90.3] loss: 2.908, accuracy: 0.004
[21, 90.5] loss: 2.851, accuracy: 0.006
[22, 90.3] loss: 2.765, accuracy: 0.007
[23, 90.4] loss: 2.676, accuracy: 0.004
[24, 90.3] loss: 2.614, accuracy: 0.006
[25, 90.4] loss: 2.548, accuracy: 0.004
[26, 90.0] loss: 2.478, accuracy: 0.005
[27, 90.5] loss: 2.425, accuracy: 0.003
[28, 90.3] loss: 2.310, accuracy: 0.004
[29, 90.

I attempted to train with sigmoid activations and the BCE loss, but to no avail. Subsequently, I positioned the task in a way that should be eaiser for the model to train on (softmax, cross entropy loss), but again no go, same results.

At this point I am becoming convinced that the model fails to generalize to unseen data. It could also explain some of the results I am seeing on kaggle - the disparity between the scores people get locally (with probably not the greatest way of sampling the validation set) vs their performance when they submit to the LB.

To undertand this better, one additional datapoint I could collect is my model's performance on the train set. It achieves a low loss but what are its accuracy and f1?