In [93]:
import sys
import os
import glob
import pandas as pd
import soundfile as sf
import numpy as np
import pickle
from sklearn import preprocessing
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import librosa
import torch
from torch.utils.data import DataLoader
from Datasets.SimpleDataset import SimpleDataset
import torch.nn.functional as F
from torch.utils.data import random_split
import torch.nn as nn

## Create a dictionary containing file path and its label ##

In [95]:
file_to_label_dict = {}

In [1]:
# Modified from:

"""
Created by Francisco Bravo Sanchez July 2021
This scripts reads the NIPS4B wav files and splits them according to the
csv annotations from NIPS4Bplus (Morfi V, Bas Y, Pamula H, Glotin H,
Stowell D. 2019. NIPS4Bplus: a richly annotated birdsong audio dataset.
PeerJ Comput. Sci. 5:e223 http://doi.org/10.7717/peerj-cs.223)

NIPS4B wav files:
http://sabiod.univ-tln.fr/nips4b/media/birds/NIPS4B_BIRD_CHALLENGE_TRAIN_TEST_WAV.tar.gz

NIPS4Bplus annotations:
https://doi.org/10.6084/m9.figshare.6798548

Instructions
https://github.com/fbravosanchez/NIPS4Bplus#readme
"""

import sys
import os
import glob
import pandas as pd
import soundfile as sf
import numpy as np


#Set directories
#path to NIPS4B_BIRD wav files
wav_path = "Data\\wav\\train"
#path to NIPS4Bplus csv annotation files
csv_path = "Data\\temporal_annotations_nips4b"
#output path for generated cut files
output_path = "Data\\cutfiles"

if not os.path.exists(output_path):
    os.makedirs(output_path)


#read csv label file list
lbl_files = pd.DataFrame(glob.glob(os.path.join(csv_path , '')+ '*.csv'))
lbl_files.columns = ['csv']
lbl_files['wav'] = 'nips4b_birds_trainfile' + lbl_files['csv'].str[-7:-4]


#process by csv file
for i, j in lbl_files.iterrows():

    #skip empty files
    try:
        k = pd.read_csv(j['csv'], header=None)
        tags = True
    except pd.errors.EmptyDataError:
        tags = False

    #for each valid csv file process wavefile
    if tags:
        [signal, fs] = sf.read(os.path.join(wav_path , '') + j['wav'] + '.wav')
        signal = signal.astype(np.float64)
        # print(signal.shape)

        # Signal normalization
        signal = signal/np.abs(np.max(signal))

        #cut signal according to tag
        for l, m in k.iterrows():
            beg_sig = int(m[0]*fs)
            end_sig = int((m[0]+m[1])*fs)
            signal_cut = signal[beg_sig:end_sig]

            # Save cut signal as a new wavefile
            file_out = os.path.join(output_path, '') + str(j['wav']) +'_'+ str(l) + '.wav'
            sf.write(file_out, signal_cut, fs)


            # Add to dictionary if it is a bird call            
            label = m[2]
            if "_call" in label or "_song" in label:
                file_to_label_dict[file_out] = label


(220672,)


NameError: name 'file_to_label_dict' is not defined

Pickle the file->label dictionary so we don't have to run that again
**NOTE: v1 version of the dict has _call and _song specifications removed, v2 has them still in (e.g. in v1 you would have Erirub and Erirub, in v2 you would have Erirub_call, Erirub_song)**

In [97]:
import pickle

with open('pickles/filelabeldictv2.pkl', 'wb') as fp:
    pickle.dump(file_to_label_dict, fp)
    print("Pickle Successful")

Pickle Successful


Load pickle

In [98]:
with open('pickles/filelabeldictv2.pkl', 'rb') as fp:
    file_to_label_dict = pickle.load(fp)
    print("Pickle loaded")

Pickle loaded


# Create dataset from file label dict #

### Convert labels into class indices ###

In [99]:
lables_int = None

In [100]:
labels = list(file_to_label_dict.values())
le = preprocessing.LabelEncoder()
le.fit(labels)
labels_int = le.transform(labels)

keys = list(file_to_label_dict.keys())

file_to_class_int = dict(zip(keys, labels_int))

Pickle Labels_int

In [101]:
with open('./pickles/labels_int.pkl', 'wb') as fp:
    pickle.dump(labels_int, fp)
    print("Pickle Successful")

Pickle Successful


In [102]:
with open('pickles/labels_int.pkl', 'rb') as fp:
    labels_int = pickle.load(fp)
    print("Pickle loaded")

Pickle loaded


In [3]:
one_hot_labels = None

In [27]:
num_classes = np.max(labels_int)
label_binarizer = preprocessing.LabelBinarizer()
label_binarizer.fit(range(max(labels_int)+1))
one_hot_labels = label_binarizer.transform(labels_int)

[ 3 20 20 ... 57 57 57]
[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0]


Pickling of one hots

In [28]:
with open('./pickles/one_hot.pkl', 'wb') as fp:
    pickle.dump(one_hot_labels, fp)
    print("Pickle Successful")

Pickle Successful


In [4]:
with open('pickles/one_hot.pkl', 'rb') as fp:
    one_hot_labels = pickle.load(fp)
    print("Pickle loaded")

Pickle loaded


## Convert audio files to a simple numerical representation for the baseline ##

In [None]:
baseline_features = []

In [None]:
from torchaudio import load

for file in keys:
# audio file is decoded on the fly
    

## Convert audio files into wav2vec embeddings ##

In [103]:
features = []

In [104]:
'''
Modified from https://bagustris.wordpress.com/2022/08/23/acoustic-feature-extraction-with-transformers/
'''

from transformers import Wav2Vec2Processor, Wav2Vec2Model
import librosa
import torch

# load model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

for file in keys:
# audio file is decoded on the fly
    print(file)
    array, fs = librosa.load(file, sr=16000)
    # print(len(array))
    input = processor(array.squeeze(), sampling_rate=fs, return_tensors="pt")

    # apply the model to the input array from wav
    with torch.no_grad():
        outputs = model(**input)

    # extract last hidden state, compute average, convert to numpy
    last_hidden_states = outputs.last_hidden_state.squeeze().mean(axis=0).numpy()
    print(f"Hidden state shape: {last_hidden_states.shape}")
    features.append(last_hidden_states)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2Model: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Data\cutfiles\nips4b_birds_trainfile001_1.wav
Hidden state shape: (768,)
Data\cutfiles\nips4b_birds_trainfile001_2.wav


RuntimeError: Calculated padded input size per channel: (1). Kernel size: (2). Kernel size can't be greater than actual input size

Pickle the datapoint list

In [80]:
with open('./pickles/datapoints.pkl', 'wb') as fp:
    pickle.dump(features, fp)
    print("Pickle Successful")

Pickle Successful


Load datapoint list

In [6]:
with open('pickles/datapoints.pkl', 'rb') as fp:
    features = pickle.load(fp)
    print("Pickle loaded")

features = np.array(features)
features = torch.from_numpy(features).unsqueeze(1)
print(features.shape)

Pickle loaded
torch.Size([5459, 1, 768])


Testing that datset creates properly

In [78]:
# features = np.array(features)
# features = torch.from_numpy(features).unsqueeze(1)
# print(features.shape)
test = SimpleDataset(features, one_hot_labels)
test_loader = DataLoader(test, batch_size=5, shuffle=True)

data, label = next(iter(test_loader))
print(data.shape)

torch.Size([5, 1, 768])


## Create Neural Network ##

In [68]:
# import sys
# import os
# import glob
# import pandas as pd
# import soundfile as sf
import numpy as np
import pickle
# from sklearn import preprocessing
# from transformers import Wav2Vec2Processor, Wav2Vec2Model
# import librosa
import torch
from torch.utils.data import DataLoader
from Datasets.SimpleDataset import SimpleDataset
# import torch.nn.functional as F
from torch.utils.data import random_split
import torch.nn as nn

In [69]:
labels_int = None
features = None

with open('pickles/labels_int.pkl', 'rb') as fp:
    labels_int = pickle.load(fp)
    print("Pickle loaded")

with open('pickles/datapoints.pkl', 'rb') as fp:
    features = pickle.load(fp)
    print("Pickle loaded")

features = np.array(features)
features = np.pad(features, ((0, 0), (0, 16)))
features = np.array(np.split(features, 28, axis=1))
features = features.reshape((features.shape[1], features.shape[0], -1))
print(features.shape)
features = torch.from_numpy(features).unsqueeze(1)
features = features.repeat(1, 3, 1, 1)
print(features.shape)

Pickle loaded
Pickle loaded
(5459, 28, 28)
torch.Size([5459, 3, 28, 28])


In [70]:
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

True
cuda


In [71]:
from resnet1d.resnet1d import ResNet1D
from model.test_net import CNN
from model.Mod_Resnet import Mod_Resnet

# model = ResNet1D(1, 2, 3, 1, 1, 8, 59).to(device)
model = Mod_Resnet().to(device)
model.fc = nn.Linear(512, 59) # assuming that the fc7 layer has 512 neurons, otherwise change it 
model.cuda()
print(model)

Using cache found in C:\Users\Thats/.cache\torch\hub\pytorch_vision_v0.10.0


Mod_Resnet(
  (model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_ru

In [72]:
# print(type(labels_int))
bird_call_dataset = SimpleDataset(features, labels_int)
train_size = int(0.8*len(bird_call_dataset))
test_size = len(bird_call_dataset) - train_size
train_set, test_set = random_split(bird_call_dataset, [train_size, test_size])

train_dataloader = DataLoader(train_set, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_set, batch_size=8, shuffle=True)

In [73]:
loss_fn = nn.CrossEntropyLoss()
# Try different learning rate
learning_rate = 0.01

# Try different optimizer
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [74]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        y = y.type(torch.LongTensor)
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        # print(X.shape)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [75]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            y = y.type(torch.LongTensor)
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Valid Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [76]:
epochs = 100
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 12.516876  [    0/ 4367]
loss: 8.134588  [  800/ 4367]
loss: 4.776534  [ 1600/ 4367]
loss: 4.498838  [ 2400/ 4367]
loss: 4.286469  [ 3200/ 4367]
loss: 4.251761  [ 4000/ 4367]
Valid Error: 
 Accuracy: 4.1%, Avg loss: 10.626030 

Epoch 2
-------------------------------
loss: 4.294632  [    0/ 4367]
loss: 4.394954  [  800/ 4367]
loss: 3.476369  [ 1600/ 4367]
loss: 4.528619  [ 2400/ 4367]
loss: 4.345991  [ 3200/ 4367]
loss: 4.274442  [ 4000/ 4367]
Valid Error: 
 Accuracy: 5.3%, Avg loss: 4.697166 

Epoch 3
-------------------------------
loss: 4.266598  [    0/ 4367]
loss: 3.819308  [  800/ 4367]
loss: 4.057833  [ 1600/ 4367]
loss: 4.642469  [ 2400/ 4367]
loss: 3.445732  [ 3200/ 4367]
loss: 4.598976  [ 4000/ 4367]
Valid Error: 
 Accuracy: 5.9%, Avg loss: 4.290036 

Epoch 4
-------------------------------
loss: 4.126251  [    0/ 4367]
loss: 3.865439  [  800/ 4367]
loss: 4.347623  [ 1600/ 4367]
loss: 4.036867  [ 2400/ 4367]
loss: 4.310561  [ 3200