In [1]:
import pandas as pd
import os
import random
import torch
import csv
import librosa
import numpy as np
from skimage.transform import resize
from PIL import Image

import warnings


In [2]:
base_dir = "/courses/EDS232/rainforest"

In [3]:
fft = 2048
hop = 512
# Less rounding errors this way
sr = 48000
length = 10 * sr

with open('/courses/EDS232/rainforest/train_tp.csv') as f:
    reader = csv.reader(f)
    data = list(reader)

# data = pd.read_csv(os.path.join(base_dir, 'train_tp.csv'))

In [4]:
# Check minimum/maximum frequencies for bird calls
# Not neccesary, but there are usually plenty of noise in low frequencies, and removing it helps
fmin = 24000
fmax = 0


# Skip header row (recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max) and start from 1 instead of 0
for i in range(1, len(data)):
    if fmin > float(data[i][4]):
        fmin = float(data[i][4])
    if fmax < float(data[i][6]):
        fmax = float(data[i][6])

# Get some safety margin
fmin = int(fmin * 0.9)

fmax = int(fmax * 1.1)
print('Minimum frequency: ' + str(fmin) + ', maximum frequency: ' + str(fmax))

Minimum frequency: 84, maximum frequency: 15056


In [5]:
warnings.filterwarnings('ignore')

print('Starting spectrogram generation')
for i in range(1, len(data)):
    # All sound files are 48000 bitrate, no need to slowly resample
    wav, sr = librosa.load('/courses/EDS232/rainforest/train/' + data[i][0] + '.flac', sr=None)
    
    t_min = float(data[i][3]) * sr
    t_max = float(data[i][5]) * sr
    
    # Positioning sound slice
    center = np.round((t_min + t_max) / 2)
    beginning = center - length / 2
    if beginning < 0:
        beginning = 0
    
    ending = beginning + length
    if ending > len(wav):
        ending = len(wav)
        beginning = ending - length
        
    slice = wav[int(beginning):int(ending)]
    
    # Mel spectrogram generation
    # Default settings were bad, parameters are adjusted to generate somewhat 
    # reasonable quality images
    # The better your images are, the better your neural net would perform
    # You can also use librosa.stft + librosa.amplitude_to_db instead
    mel_spec = librosa.feature.melspectrogram(slice, n_fft=fft, hop_length=hop, sr=sr, fmin=fmin, fmax=fmax, power=1.5)
    mel_spec = resize(mel_spec, (224, 400))
    
    # Normalize to 0...1 - this is what goes into neural net
    mel_spec = mel_spec - np.min(mel_spec)
    mel_spec = mel_spec / np.max(mel_spec)

    # And this 0...255 is for the saving in bmp format
    mel_spec = mel_spec * 255
    mel_spec = np.round(mel_spec)    
    mel_spec = mel_spec.astype('uint8')
    mel_spec = np.asarray(mel_spec)
    
    bmp = Image.fromarray(mel_spec, 'L')
    bmp.save('/courses/EDS232/rainforest/working/' + data[i][0] + '_' + data[i][1] + '_' + str(center) + '.bmp')
    
    if i % 100 == 0:
        print('Processed ' + str(i) + ' train examples from ' + str(len(data)))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Starting spectrogram generation
Processed 100 train examples from 1217
Processed 200 train examples from 1217
Processed 300 train examples from 1217
Processed 400 train examples from 1217
Processed 500 train examples from 1217
Processed 600 train examples from 1217
Processed 700 train examples from 1217
Processed 800 train examples from 1217
Processed 900 train examples from 1217
Processed 1000 train examples from 1217
Processed 1100 train examples from 1217
Processed 1200 train examples from 1217


In [6]:
num_birds = 24
# 6GB GPU-friendly (~4 GB used by model)
# Increase if neccesary
batch_size = 16

# This is enough to exactly reproduce results on local machine (Windows / Turing GPU)
# Kaggle GPU kernels (Linux / Pascal GPU) are not deterministic even with random seeds set
# Your score might vary a lot (~up to 0.05) on a different runs 
# due to picking different epochs to submit
rng_seed = 1234
random.seed(rng_seed)
np.random.seed(rng_seed)
os.environ['PYTHONHASHSEED'] = str(rng_seed)
torch.manual_seed(rng_seed)
torch.cuda.manual_seed(rng_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [7]:
import torch.utils.data as torchdata

class RainforestDataset(torchdata.Dataset):
    def __init__(self, filelist):
        self.specs = []
        self.labels = []
        for f in filelist:
            # Easier to pass species in filename at the start; 
            # worth changing later to more capable method
            label = int(str.split(f, '_')[1])
            label_array = np.zeros(num_birds, dtype=np.single)
            label_array[label] = 1.
            self.labels.append(label_array)
            
            # Open and save spectrogram to memory
            
            # If you use more spectrograms (add train_fp, for example), 
            # then they would not all fit to memory
            # In this case you should load them on the fly in __getitem__
            img = Image.open('/courses/EDS232/rainforest/working/' + f)
            mel_spec = np.array(img)
            img.close()
            
            # Transforming spectrogram from bmp to 0..1 array
            mel_spec = mel_spec / 255
            # Stacking for 3-channel image for resnet
            mel_spec = np.stack((mel_spec, mel_spec, mel_spec))
            
            self.specs.append(mel_spec)
    
    def __len__(self):
        return len(self.specs)
    
    def __getitem__(self, item):
        # Augment here if you want
        return self.specs[item], self.labels[item]

In [8]:
file_list = []
label_list = []

for f in os.listdir('/courses/EDS232/rainforest/working/'):
    if '.bmp' in f:
        file_list.append(f)
        label = str.split(f, '_')[1]
        label_list.append(label)


from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rng_seed)

train_files = []
val_files = []

for fold_id, (train_index, val_index) in enumerate(skf.split(file_list, label_list)):
    # Picking only first fold to train/val on
    # This means loss of 20% training data
    # To avoid this, you can train 5 different models on 5 folds and average predictions
    if fold_id == 0:
        train_files = np.take(file_list, train_index)
        val_files = np.take(file_list, val_index)

print('Training on ' + str(len(train_files)) + ' examples')
print('Validating on ' + str(len(val_files)) + ' examples')

Training on 972 examples
Validating on 244 examples


In [15]:
!pip install resnest > /dev/null/

/usr/bin/sh: 1: cannot create /dev/null/: Is a directory


In [19]:
import torch.nn as nn
from resnest.torch import resnest50
import torch.nn.functional as F
!cd /courses/EDS232/rainforest && curl -O https://download.pytorch.org/models/resnet50-19c8e357.pth

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, 3)
        self.conv2 = nn.Conv2d(64, 64, 3)
        self.pool = nn.MaxPool2d(2, 2)
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(F.relu(self.conv2(x)))
        return x

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 97.7M  100 97.7M    0     0   103M      0 --:--:-- --:--:-- --:--:--  103M


In [20]:
train_dataset = RainforestDataset(train_files)
val_dataset = RainforestDataset(val_files)

train_loader = torchdata.DataLoader(train_dataset, 
                                    batch_size=batch_size, 
                                    sampler=torchdata.RandomSampler(train_dataset))

val_loader = torchdata.DataLoader(val_dataset, 
                                  batch_size=batch_size, 
                                  sampler=torchdata.RandomSampler(val_dataset))

# ResNeSt: Split-Attention Networks
# https://arxiv.org/abs/2004.08955
# Significantly outperforms standard Resnet

model = resnest50(pretrained=False)

model.fc = nn.Sequential(
    nn.Linear(2048, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, num_birds)
)

# Picked for this notebook; pick new ones after major changes 
# (such as adding train_fp to train data)
optimizer = torch.optim.SGD(model.parameters(), 
                            lr=0.01, 
                            weight_decay=0.0001, 
                            momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 
                                            step_size=7, 
                                            gamma=0.4)

# This loss function is not exactly suited for competition metric, 
# which only cares about ranking of predictions
# Exploring different loss fuctions would be a good idea
pos_weights = torch.ones(num_birds)
pos_weights = pos_weights * num_birds
loss_function = nn.BCEWithLogitsLoss(pos_weight=pos_weights)

if torch.cuda.is_available():
    model = model.cuda()
    loss_function = loss_function.cuda()

In [21]:
best_corrects = 0

# Train loop
print('Starting training loop')
for e in range(0, 40):
    # Stats
    train_loss = []
    train_corr = []
    
    # Single epoch - train
    model.train()
    for batch, (data, target) in enumerate(train_loader):
        data = data.float()
        if torch.cuda.is_available():
            data, target = data.cuda(), target.cuda()
            
        optimizer.zero_grad()
        
        output = model(data)
        loss = loss_function(output, target)
        
        loss.backward()
        optimizer.step()
        
        # Stats
        vals, answers = torch.max(output, 1)
        vals, targets = torch.max(target, 1)
        corrects = 0
        for i in range(0, len(answers)):
            if answers[i] == targets[i]:
                corrects = corrects + 1
        train_corr.append(corrects)
        
        train_loss.append(loss.item())
    
    # Stats
    for g in optimizer.param_groups:
        lr = g['lr']
    print('Epoch ' + str(e) + ' training end. LR: ' + str(lr) + 
          ', Loss: ' + str(sum(train_loss) / len(train_loss)) +
          ', Correct answers: ' + str(sum(train_corr)) + '/' + 
          str(train_dataset.__len__()))
    
    # Single epoch - validation
    with torch.no_grad():
        # Stats
        val_loss = []
        val_corr = []
        
        model.eval()
        for batch, (data, target) in enumerate(val_loader):
            data = data.float()
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()
            
            output = model(data)
            loss = loss_function(output, target)
            
            # Stats
            vals, answers = torch.max(output, 1)
            vals, targets = torch.max(target, 1)
            corrects = 0
            for i in range(0, len(answers)):
                if answers[i] == targets[i]:
                    corrects = corrects + 1
            val_corr.append(corrects)
        
            val_loss.append(loss.item())
    
    # Stats
    print('Epoch ' + str(e) + ' validation end. LR: ' + str(lr) + 
          ', Loss: ' + str(sum(val_loss) / len(val_loss)) +
          ', Correct answers: ' + str(sum(val_corr)) + '/' + str(val_dataset.__len__()))
    
    # If this epoch is better than previous on validation, save model
    # Validation loss is the more common metric, 
    # but in this case our loss is misaligned with competition metric, 
    # making accuracy a better metric
    if sum(val_corr) > best_corrects:
        print('Saving new best model at epoch ' + str(e) + ' (' + 
              str(sum(val_corr)) + '/' + str(val_dataset.__len__()) + ')')
        torch.save(model, '/courses/EDS232/rainforest/best_model.pt')
        best_corrects = sum(val_corr)
        
    # Call every epoch
    scheduler.step()

# Free memory
del model

Starting training loop
Epoch 0 training end. LR: 0.01, Loss: 1.3638783028868378, Correct answers: 65/972
Epoch 0 validation end. LR: 0.01, Loss: 1.3503396734595299, Correct answers: 20/244
Saving new best model at epoch 0 (20/244)
Epoch 1 training end. LR: 0.01, Loss: 1.3188678651559549, Correct answers: 83/972
Epoch 1 validation end. LR: 0.01, Loss: 1.3053582459688187, Correct answers: 25/244
Saving new best model at epoch 1 (25/244)
Epoch 2 training end. LR: 0.01, Loss: 1.21498306071172, Correct answers: 90/972
Epoch 2 validation end. LR: 0.01, Loss: 1.2024907544255257, Correct answers: 20/244
Epoch 3 training end. LR: 0.01, Loss: 1.1887264857526685, Correct answers: 89/972
Epoch 3 validation end. LR: 0.01, Loss: 1.3292143195867538, Correct answers: 21/244
Epoch 4 training end. LR: 0.01, Loss: 1.1440586819023382, Correct answers: 94/972
Epoch 4 validation end. LR: 0.01, Loss: 1.2274692356586456, Correct answers: 20/244
Epoch 5 training end. LR: 0.01, Loss: 1.125576355418221, Correct 

In [26]:
def load_test_file(f):
    wav, sr = librosa.load('/courses/EDS232/rainforest/test/' + f, sr=None)

    # Split for enough segments to not miss anything
    segments = len(wav) / length
    segments = int(np.ceil(segments))
    
    mel_array = []
    
    for i in range(0, segments):
        # Last segment going from the end
        if (i + 1) * length > len(wav):
            slice = wav[len(wav) - length:len(wav)]
        else:
            slice = wav[i * length:(i + 1) * length]
        
        # Same mel spectrogram as before
        mel_spec = librosa.feature.melspectrogram(slice, 
                                                  n_fft=fft, 
                                                  hop_length=hop, 
                                                  sr=sr, 
                                                  fmin=fmin, 
                                                  fmax=fmax, 
                                                  power=1.5)
        mel_spec = resize(mel_spec, (224, 400))
    
        mel_spec = mel_spec - np.min(mel_spec)
        mel_spec = mel_spec / np.max(mel_spec)
          
        mel_spec = np.stack((mel_spec, mel_spec, mel_spec))

        mel_array.append(mel_spec)
    
    return mel_array

In [28]:
save_to_disk = 0

# Loading model back
model = resnest50(pretrained=False)

model.fc = nn.Sequential(
    nn.Linear(2048, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, num_birds)
)

model = torch.load('/courses/EDS232/rainforest/best_model.pt')
model.eval()

# Scoring does not like many files:(
if save_to_disk == 0:
    for f in os.listdir('/courses/EDS232/rainforest/working/'):
        os.remove('/courses/EDS232/rainforest/working/' + f)

if torch.cuda.is_available():
    model.cuda()
    
# Prediction loop
print('Starting prediction loop')
with open('submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    submission_writer.writerow(['recording_id','s0','s1','s2','s3','s4','s5','s6','s7','s8','s9','s10',
                                's11', 's12','s13','s14','s15','s16','s17','s18','s19','s20','s21',
                                's22','s23'])
    
    test_files = os.listdir('/courses/EDS232/rainforest/test/')
    print(len(test_files))
    
    # Every test file is split on several chunks and prediction is made for each chunk
    for i in range(0, len(test_files)):
        data = load_test_file(test_files[i])
        data = torch.tensor(data)
        data = data.float()
        if torch.cuda.is_available():
            data = data.cuda()

        output = model(data)

        # Taking max prediction from all slices per bird species
        # Usually you want Sigmoid layer here to convert output to probabilities
        # In this competition only relative ranking matters, 
        # and not the exact value of prediction, so we can use it directly
        maxed_output = torch.max(output, dim=0)[0]
        maxed_output = maxed_output.cpu().detach()
        
        file_id = str.split(test_files[i], '.')[0]
        write_array = [file_id]
        
        for out in maxed_output:
            write_array.append(out.item())
    
        submission_writer.writerow(write_array)
        
        if i % 100 == 0 and i > 0:
            print('Predicted for ' + str(i) + ' of ' + str(len(test_files) + 1) + ' files')

print('Submission generated')

Starting prediction loop
1992
Predicted for 100 of 1993 files
Predicted for 200 of 1993 files
Predicted for 300 of 1993 files
Predicted for 400 of 1993 files
Predicted for 500 of 1993 files
Predicted for 600 of 1993 files
Predicted for 700 of 1993 files
Predicted for 800 of 1993 files
Predicted for 900 of 1993 files
Predicted for 1000 of 1993 files
Predicted for 1100 of 1993 files
Predicted for 1200 of 1993 files
Predicted for 1300 of 1993 files
Predicted for 1400 of 1993 files
Predicted for 1500 of 1993 files
Predicted for 1600 of 1993 files
Predicted for 1700 of 1993 files
Predicted for 1800 of 1993 files
Predicted for 1900 of 1993 files
Submission generated
