In [None]:
!pip freeze

In [None]:
import numpy as np
import pandas as pd
import os
import gc
import cv2
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import time

from pydub import AudioSegment as AS
import librosa
from librosa.feature import melspectrogram
from librosa.core import power_to_db as ptdb

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as op
import torchvision.models as models
from torch.optim import Adam

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

from sklearn.utils import shuffle

from scipy.ndimage.morphology import binary_dilation, binary_erosion
from keras.utils import to_categorical
from albumentations import Normalize
from tensorflow.keras.preprocessing.sequence import pad_sequences as pad

In [None]:
from torch import FloatTensor

In [None]:
# Hyper-parameters and input geometry shared by the cells below.
CHUNKS = 1  # repeat count applied to each one-hot label row in MelDataset.__getitem__
train_batch = 16  # batch size given to the training DataLoader
default_input = 512  # NOTE(review): not referenced in the visible cells - confirm it is still needed
drop = 0.2  # dropout probability used by the ResNet classifier head
epochs = 2  # NOTE: reassigned to 5 at the top of the training cell
n_mels = 256  # second entry of the `dim` tuple in the training cell (presumably image height - confirm)
mel_len = 1954  # third entry of the `dim` tuple in the training cell (presumably image width - confirm)

In [None]:
# Root folder of the birdsong-recognition data. Set the BIRDSONG_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original location.
DATA_DIR = os.environ.get(
    'BIRDSONG_DATA_DIR',
    r'C:\Users\jonat\Desktop\Career\Springboard\Capstone 3\birdsong-recognition',
)

# Metadata tables.
test_path = os.path.join(DATA_DIR, 'test.csv')
train_path = os.path.join(DATA_DIR, 'train.csv')

# Audio folders keep the trailing '/' of the original literals, so plain string
# concatenation with a file name keeps working.
test_audio_path = os.path.join(DATA_DIR, 'example_test_audio') + '/'
train_audio_path = os.path.join(DATA_DIR, 'train_audio') + '/'

In [None]:
# Switch the working directory to the spectrogram image folder: the following cells
# refer to 'train_file_1' and 'train_file_2' by relative name, so they only resolve
# after this cell has run.
os.chdir(r'C:\Users\jonat\Desktop\Career\Springboard\Capstone 3\birdsong-recognition\mel spec images')

In [None]:
# Show only a small sorted sample of file names; printing the whole directory
# floods the cell output without adding information.
sorted(os.listdir('train_file_1'))[:10]

In [None]:
# Read the two metadata tables (test first, then train) into DataFrames.
test_d, train_d = (pd.read_csv(csv_file) for csv_file in (test_path, train_path))

In [None]:
# Assign an integer class id to every distinct ebird_code, numbered in sorted order.
keys = set(train_d.ebird_code)
values = np.arange(len(keys))
dict_code = {code: class_id for code, class_id in zip(sorted(keys), values)}

In [None]:
# Build a lookup from image file name to the folder prefix ('<folder>/') that holds it.
# A name present in several folders ends up mapped to the last folder listed.
img_paths = ['train_file_1', 'train_file_2']
path_dict = {}
for folder_path in tqdm(img_paths):
    path_dict.update(dict.fromkeys(os.listdir(folder_path), folder_path + '/'))

In [None]:
def tensor_convert(data):
    """Return a list holding a ``FloatTensor`` built from each element of ``data``."""
    return list(map(FloatTensor, data))

In [None]:
def normalize(x):
    """Cast ``x`` to float32 and divide it by 2**15 (32768).

    NOTE(review): presumably meant to scale 16-bit PCM samples into [-1, 1) - confirm.
    """
    full_scale = 1 << 15
    return np.float32(x) / full_scale

In [None]:
class MelDataset(Dataset):
    """Dataset pairing spectrogram JPEG images with one-hot class labels.

    Each row of ``df`` must provide a ``filename`` and an ``ebird_code`` column.
    The image for a row is ``<filename>.jpg`` inside the folder recorded for it in
    the module-level ``path_dict``; the class id comes from the module-level
    ``dict_code``.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata rows to serve.
    """

    def __init__(self, df):
        self.aug = Normalize(p=1)
        self.code_dict = dict_code
        self.classes = len(dict_code)
        self.df, self.dataset_length = df, len(df)

    def __len__(self):
        """Number of rows (and therefore samples) in the dataset."""
        return self.dataset_length

    def __getitem__(self, i):
        """Return ``[image, label]`` as float tensors for the ``i``-th row.

        ``image`` is the normalised picture exactly as laid out by OpenCV
        (height, width, channel); ``label`` is the one-hot row repeated ``CHUNKS``
        times along axis 0.

        Raises
        ------
        KeyError
            If the image name is missing from ``path_dict`` or the species code is
            missing from ``dict_code``.
        FileNotFoundError
            If OpenCV cannot read the image file.
        """
        # Positional access: correct even when the frame's index was not reset.
        image_name = self.df['filename'].iloc[i] + '.jpg'
        num_code = self.code_dict[self.df['ebird_code'].iloc[i]]

        image_file = path_dict[image_name] + image_name
        image = cv2.imread(image_file)
        if image is None:
            # cv2.imread signals a missing or unreadable file by returning None.
            raise FileNotFoundError('Could not read image: ' + image_file)
        # OpenCV loads BGR, while Normalize's default mean/std are given in RGB order.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        code = to_categorical([num_code], num_classes=self.classes)
        normalised = self.aug(image=image)['image']
        return tensor_convert([normalised, np.repeat(code, CHUNKS, 0)])

In [None]:
# Work on the first 4275 metadata rows only (positional slice of train_d).
train_trunc = train_d.iloc[:4275]

In [None]:
# Shuffle the rows (with a fixed seed, so the split is reproducible) BEFORE cutting
# the 80/20 split. Taking the last 20% of an unshuffled table is only a fair hold-out
# if the rows are in random order.
# NOTE(review): train.csv looks like it is grouped by species - confirm. If so, an
# unshuffled tail would contain classes the model never saw during training.
SPLIT_SEED = 0
shuffled_rows = train_trunc.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

split = int(0.8 * len(shuffled_rows))
train_t = shuffled_rows[:split].reset_index(drop=True)
test_t = shuffled_rows[split:].reset_index(drop=True)

train_set = MelDataset(train_t)
test_set = MelDataset(test_t)

# Both loaders use the batch size from the configuration cell.
train_loader = DataLoader(train_set, batch_size=train_batch, shuffle=True)
test_loader = DataLoader(test_set, batch_size=train_batch)

# `train_data` used to be a second, identical training loader; keep the name as an alias.
train_data = train_loader

In [None]:
def shuffle_ids(tensor):
    """Return the positions ``0 .. len(tensor) - 1`` in shuffled order."""
    positions = np.arange(len(tensor))
    return shuffle(positions)

In [None]:
# 1 kaggle resnet
class ResNet(nn.Module):
    """Pretrained ResNet-34 feature extractor followed by dropout and a linear classifier.

    Parameters
    ----------
    input : int
        Number of features per sample produced by the backbone; also the input
        width of the linear classifier.
    out : int
        Number of output classes.
    """

    def __init__(self, input, out):
        super(ResNet, self).__init__()
        # Remember the feature width so forward() does not rely on a notebook global.
        self.in_features = input
        self.dropout = nn.Dropout(p=drop)
        self.dense_output = nn.Linear(input, out)
        self.resnet = models.resnet34(pretrained=True)
        # Every child of the backbone except the last one (its own classification layer).
        self.resnet_head = nn.Sequential(*list(self.resnet.children())[:-1])

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        features = self.resnet_head(x)
        # Flatten to (samples, in_features) before the classifier.
        features = features.view(-1, self.in_features)
        return self.dense_output(self.dropout(features))

In [None]:
F = 512  # feature width passed as `input` when the ResNet model is built in the next cell
LR = 1e-3, 1e-2  # LR[0] is meant for the resnet parameter group, LR[1] for the dense_output group
# Built from the full train_d (not train_trunc), so its size equals len(dict_code),
# the width of the one-hot labels produced by MelDataset.
keys_trunc = set(train_d.ebird_code)

In [None]:
# Put the model on the same device the training loop sends its batches to.
network = ResNet(input=F, out=len(keys_trunc)).to(device)

# Per-group learning rates must be given under the key 'lr'. Any other key (such as
# 'learning rate') is stored but ignored, leaving every group at Adam's default rate.
# Only layers that take part in forward() are optimised: resnet_head is the backbone
# without its original classification layer.
optimizer = Adam([
    {'params': network.resnet_head.parameters(), 'lr': LR[0]},
    {'params': network.dense_output.parameters(), 'lr': LR[1]},
])

In [None]:
def cross_entropy(y, y_pred):
    """Cross-entropy loss between one-hot targets ``y`` and logits ``y_pred``.

    Parameters
    ----------
    y : torch.Tensor
        One-hot labels whose last axis is the class axis.
    y_pred : torch.Tensor
        Logits of shape (samples, classes).

    Returns
    -------
    torch.Tensor
        Scalar mean loss.
    """
    # reshape(-1) always yields a 1-D target, including for a one-sample batch,
    # where squeeze() would collapse it to a 0-d tensor and break the loss.
    target = torch.argmax(y, dim=-1).reshape(-1)
    return nn.CrossEntropyLoss()(y_pred, target)

def accuracy(y, y_pred):
    """Fraction of samples whose arg-max prediction equals the arg-max of the label.

    Parameters
    ----------
    y : torch.Tensor
        One-hot labels whose last axis is the class axis.
    y_pred : torch.Tensor
        Scores or logits with the same layout.

    Returns
    -------
    torch.Tensor
        Scalar tensor in [0, 1].
    """
    # reshape(-1) keeps a 1-D result even for a single sample; squeeze() would
    # produce a 0-d tensor, on which len() raises.
    target = torch.argmax(y, dim=-1).reshape(-1)
    predicted = torch.argmax(y_pred, dim=-1).reshape(-1)
    return (target == predicted).float().mean()

In [None]:
def metrics(data, batch, epoch, begin, end, metric, group):
    """Print one progress line with a metric value and the elapsed time.

    Parameters
    ----------
    data : number
        Metric value to show.
    batch : int
        Batch counter; the line shows ``batch - 1`` (used when ``group == 'Train'``).
    epoch : int
        Zero-based epoch; the line shows ``epoch + 1`` (used when ``group == 'Test'``).
    begin, end : float
        Timestamps in seconds; their difference is printed rounded to 0.1 s.
    metric : str
        Metric name, e.g. ``'Acc'``.
    group : str
        Either ``'Train'`` or ``'Test'``.

    Raises
    ------
    ValueError
        If ``group`` is neither ``'Train'`` nor ``'Test'``.
    """
    if group == 'Train':
        name = 'Batch ' + str(batch - 1) + ' '
    elif group == 'Test':
        # Trailing space so the output reads "Epoch 1 Test ..." rather than "Epoch 1Test ...".
        name = 'Epoch ' + str(epoch + 1) + ' '
    else:
        raise ValueError("group must be 'Train' or 'Test', got {!r}".format(group))

    # Named `elapsed` so the `time` module is not shadowed inside the function.
    elapsed = 'Time: {} s'.format(np.round(end - begin, 1))
    t = group, metric, ' ', data, ' '
    print(name + '{} {}: {}{}{}'.format(*t) + '  ' + elapsed)

In [None]:
# Expected (channels, height, width) of one model input. Only the channel count is
# used below; height and width are kept for reference.
dim = (3, n_mels, mel_len)
epochs = 5
n_classes = len(keys_trunc)


def to_channels_first(images):
    """Turn a (batch, height, width, channel) image batch into (batch, channel, height, width).

    The dataset yields images in OpenCV's channels-last layout while the ResNet
    expects channels first. A plain ``view`` to the target shape would reinterpret
    the memory and scramble the pixels, so the axes are permuted instead.
    """
    if images.dim() != 4 or images.shape[-1] != dim[0]:
        raise ValueError(
            'Expected a (batch, height, width, {}) image batch, got shape {}'.format(
                dim[0], tuple(images.shape)))
    return images.permute(0, 3, 1, 2).contiguous()


# The batches are sent to `device`, so the model has to live there too (idempotent).
network.to(device)

begin = time.time()
print('Beginning Training ...\n')

for epoch in range(epochs):
    print('EPOCH ' + str(epoch+1))

    network.train()
    for step, mini_batch in enumerate(train_loader, start=1):
        train_X, train_y = mini_batch
        train_y = train_y.view(-1, n_classes)
        train_X = to_channels_first(train_X)
        ids = shuffle_ids(train_X)

        train_X = train_X[ids].to(device)
        train_y = train_y[ids].to(device)
        # Call the module, not .forward(), so that nn.Module hooks are honoured.
        train_pred = network(train_X)
        train_loss = cross_entropy(train_y, train_pred)
        train_accuracy = accuracy(train_y, train_pred)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Report every 100th batch. metrics() prints `batch - 1`, hence the + 1.
        if step % 100 == 0:
            end = time.time()
            round_acc = np.round(train_accuracy.item(), 3)
            metrics(round_acc, step + 1, epoch, begin, end, 'Acc', 'Train')

    test_loss, test_points, test_accuracy = 0, 0, 0

    network.eval()
    with torch.no_grad():
        for mini_batch in test_loader:
            test_X, test_y = mini_batch
            # No shuffling here: sample order cannot influence evaluation results.
            test_y = test_y.view(-1, n_classes).to(device)
            test_X = to_channels_first(test_X).to(device)

            test_pred = network(test_X)
            # Weight by batch size so the final values are per-sample averages.
            test_points = test_points + len(test_y)
            test_loss += cross_entropy(test_y, test_pred).item()*len(test_y)
            test_accuracy += accuracy(test_y, test_pred).item()*len(test_y)

    end = time.time()
    if test_points:  # avoid a division by zero when the test loader is empty
        test_loss /= test_points
        test_accuracy /= test_points
    metrics(np.round(test_loss, 3), 0, epoch, begin, end, 'Loss', 'Test')
    metrics(np.round(test_accuracy, 3), 0, epoch, begin, end, 'Acc', 'Test')
    print('')

print('Ending Training Session...')

In [None]:
## This code is far more expensive to run on a laptop CPU than on Kaggle's GPU.
# As such, I ran it on Kaggle and will report the validation accuracy (with the complete dataset and 20 epochs instead of 5) separately in the write-up.
# Unfortunately, the overall accuracy stayed very low (<1%), which is disappointing even if it is an improvement over a random guess (1/264).