In [1]:
# import pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import torchvision
import torchvision.transforms as transforms
import os
import numpy as np
import glob
import shutil
import matplotlib.pyplot as plt
import scipy.io

## Pre-Processing

#### Deleting Wav File to Save Space and Combine Folders Across Days

In [2]:
root_data = "/home/george-vengrovski/Documents/canary_song_detector/USA5207"

folders = os.listdir(root_data)

if os.path.exists(root_data + "/joined") == False:
    os.mkdir(root_data + "/joined")
dst = root_data + "/joined"

for folder in folders:
    if folder == "joined":
        continue 

    files = os.listdir(os.path.join(root_data, folder))
    
    for file in files:
        if file.endswith('.wav'):
            os.remove(os.path.join(root_data, folder, file))
        else:
            for matrix in os.listdir(os.path.join(root_data, folder, file)):
                src = os.path.join(root_data, folder, file, matrix)
                shutil.move(src, dst)

#### Test and Train Split

In [3]:
src = "/home/george-vengrovski/Documents/canary_song_detector/USA5207/joined"
split = 0.8
files = os.listdir(src)
true_root_dir = "/home/george-vengrovski/Documents/canary_song_detector"
train_dir = os.path.join(true_root_dir, "train")
test_dir = os.path.join(true_root_dir, "test")

if not os.path.exists(test_dir):
    os.mkdir(test_dir)

if not os.path.exists(train_dir):
    os.mkdir(train_dir)

for file in files:
    x = np.random.uniform()
    if x > split:
        dest_dir = test_dir
    else:
        dest_dir = train_dir

    dest_path = os.path.join(dest_dir, file)

    # If destination path does not exist, move the file
    if not os.path.exists(dest_path):
        shutil.move(os.path.join(src, file), dest_path)


#### Dataloader

In [66]:
class DetectorDataClass():
    def __init__(self, dir, spec=513):
        self.data = []
        self.spec = spec

        for file in os.listdir(dir):
            self.data.append(os.path.join(dir, file))

    def __getitem__(self, index):
        data = self.data[index]
        mat_data = scipy.io.loadmat(data)
        mat_data = mat_data["song_data"]
        mat_data = mat_data[0][0]

        arr1 = mat_data[0]
        arr2 = mat_data[1]

        # beware if spec shape changes, this might cause error
        if arr1.shape[0] == self.spec:
            spec = torch.Tensor(arr1)
            raw_labels = torch.Tensor(arr2).int()
        else:
            spec = torch.Tensor(arr2)
            raw_labels = torch.Tensor(arr1).int()

        if raw_labels.shape == (0,0):
            song = False
        else:
            song = True

        # labels will be the same length as the song, but it will be filled with 1s between indcies of start and stops
        labels = torch.zeros(size=(spec.shape[1],))

        if song == True:
            num_entries = raw_labels.shape[1]

            for i in range(num_entries):
                if i % 2 == 0:
                    labels[raw_labels[i]:raw_labels[i+1]].fill_(1)
        
        spec = spec.unsqueeze(0)
        
        return spec, labels, song

    def __len__(self):
        return len(self.data) 

def collate_fn(batch):
    # Extract sequences and labels
    sequences = [x[0] for x in batch]
    labels = [x[1] for x in batch]
    
    # Find the max length for padding
    max_len_spec = max([s.size(2) for s in sequences])
    max_len_labels = max([l.size(0) for l in labels])
    
    # Pad each sequence to max length
    sequences_padded = []
    for s in sequences:
        pad_size = max_len_spec - s.size(2)
        sequences_padded.append(F.pad(s, (0, pad_size)))
    
    # Pad each label to max length
    labels_padded = []
    for l in labels:
        pad_size = max_len_labels - l.size(0)
        labels_padded.append(F.pad(l, (0, pad_size)))
    
    # Convert lists to tensors
    sequences_padded = torch.stack(sequences_padded)
    labels_padded = torch.stack(labels_padded)
    
    # Get the song flags
    songs = [x[2] for x in batch]
    
    return sequences_padded, labels_padded, songs

train_dataset = DetectorDataClass(train_dir)
test_dataset = DetectorDataClass(test_dir)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [67]:
class aviaBERT(nn.Module):
    def __init__(self, d_transformer, nhead_transformer, embedding_dim, num_labels, tau=0.1, dropout=0.1, transformer_layers=1,dim_feedforward=256):
        super(aviaBERT, self).__init__()
        self.tau = tau
        self.num_labels = num_labels
        self.dropout = dropout

        # TweetyNet Front End
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(5, 5), stride=1, padding=2)
        self.pool1 = nn.MaxPool2d(kernel_size=(8, 1), stride=(8, 1))
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(5, 5), stride=1, padding=2)
        self.pool2 = nn.MaxPool2d(kernel_size=(8, 1), stride=(8, 1))

        # Positional Encoding
        self.pos_conv1 = nn.Conv1d(d_transformer, d_transformer, kernel_size=3, padding=1, dilation=1)
        self.pos_conv2 = nn.Conv1d(d_transformer, d_transformer, kernel_size=3, padding=2, dilation=2)

        # transformer
        self.transformerProjection = nn.Linear(512, d_transformer)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_transformer, nhead=nhead_transformer, batch_first=True, dim_feedforward=dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=transformer_layers)
        self.transformerDeProjection = nn.Linear(d_transformer, embedding_dim)

        # label embedding
        self.label_embedding = nn.Embedding(num_labels, embedding_dim)

    def convolutional_positional_encoding(self, x):
        pos = F.relu(self.pos_conv1(x))
        pos = F.relu(self.pos_conv2(pos))
        return pos

    def feature_extractor_forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = x.flatten(1,2)
        return x

    def transformer_forward(self, x):
        # project the input to the transformer dimension
        x = x.permute(0,2,1)
        x = self.transformerProjection(x)
        x = x.permute(0,2,1)

        # add convolutional positional encoding
        pos_enc = self.convolutional_positional_encoding(x)
        x = x + pos_enc
        x = x.permute(0,2,1)
        x = self.transformer_encoder(x)
        x = self.transformerDeProjection(x)
        return x

    def forward(self, x):
        x = self.feature_extractor_forward(x)
        x = self.transformer_forward(x)
        return x

    def BCE_loss(self, y_pred, y_true):
        """loss function for TweetyNet
        Parameters
        ----------
        y_pred : torch.Tensor
            output of TweetyNet model, shape (batch, classes, timebins)
        y_true : torch.Tensor
            one-hot encoded labels, shape (batch, classes, timebins)
        Returns
        -------
        loss : torch.Tensor
            mean cross entropy loss
        """
        loss = torch.nn.BCEWithLogitsLoss()
        return loss(input = y_pred, target = y_true)

In [68]:
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt

def detailed_count_parameters(model):
    """Print details of layers with the number of trainable parameters in the model."""
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        param = parameter.numel()
        total_params += param
        # print(f"Layer: {name} | Parameters: {param:,} | Shape: {list(parameter.shape)}")
    print(f"\nTotal Trainable Parameters: {total_params:,}")

epochs = 50
learning_rate = 1e-4
max_batches = 10  # maximum number of batches per epoch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

loss_list = []
model = aviaBERT(d_transformer=64, nhead_transformer=2, embedding_dim=1, num_labels=1, tau=0.1, dropout=0.00)
detailed_count_parameters(model)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.01)


Total Trainable Parameters: 209,666


In [70]:
def sum_squared_weights(model):
    pass

def frame_error_rate(y_pred, y_true):
    """
    Compute the frame error rate.
    y_pred: Tensor of shape (batch_size, time_steps) - typically the output of a softmax
    y_true: Tensor of shape (batch_size, time_steps) - ground truth labels
    Returns the frame error rate.
    """

    y_pred = torch.round(y_pred)
    mismatches = (y_pred != y_true)
    error = mismatches.sum() / y_true.size(0) / y_true.size(1)
    return error * 100  # return error as percentage

# Modify the validation function to also compute and return the frame error rate
def validate_model(model, test_loader):
    model.eval()
    total_val_loss = 0
    total_frame_error = 0
    num_val_batches = 0
    with torch.no_grad():
        for i, (spec, label, song) in enumerate(test_loader):
            if i > 10:
              break
            spec = spec.to(device)
            label = label.to(device)

            output = model.forward(spec)

            loss = model.BCE_loss(y_pred = output.squeeze(2), y_true=label)

            total_frame_error += frame_error_rate(y_pred = output.squeeze(2), y_true=label).item()

            total_val_loss += loss.item()
            num_val_batches += 1

    return total_val_loss / num_val_batches, total_frame_error / num_val_batches

initial_val_loss, initial_frame_error = validate_model(model, test_loader)
print(f'Initial Validation Loss: {initial_val_loss:.2e}, Initial Frame Error Rate: {initial_frame_error:.2f}%')

loss_list = []
val_loss_list = []

for epoch in range(epochs):
    total_loss = 0
    num_batches = 0

    # Training Loop
    model.train()
    for i, (spec, label, song) in enumerate(train_loader):
        if i > 10:
          break
        spec = spec.to(device)
        label = label.to(device)

        output = model.forward(spec)

        loss = model.BCE_loss(y_pred = output.squeeze(2), y_true=label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    avg_train_loss = total_loss / num_batches
    loss_list.append(avg_train_loss)

    avg_val_loss, avg_frame_error = validate_model(model, test_loader)
    val_loss_list.append(avg_val_loss)
    print(f'Epoch [{epoch+1}/{epochs}], Training Loss: {avg_train_loss:.2e}, Validation Loss: {avg_val_loss:.2e}, Frame Error Rate: {avg_frame_error:.2f}%')

# print loss curve
plt.plot(loss_list, label='Training Loss')
plt.plot(val_loss_list, label='Validation Loss')
plt.legend()
plt.show()

print(f"final loss {loss_list[-1]}")

Initial Validation Loss: 4.46e-02, Initial Frame Error Rate: 98.46%
Epoch [1/50], Training Loss: 5.22e-02, Validation Loss: 3.82e-02, Frame Error Rate: 98.69%
Epoch [2/50], Training Loss: 6.16e-02, Validation Loss: 4.66e-02, Frame Error Rate: 98.54%
Epoch [3/50], Training Loss: 5.47e-02, Validation Loss: 4.65e-02, Frame Error Rate: 98.48%
Epoch [4/50], Training Loss: 6.48e-02, Validation Loss: 4.14e-02, Frame Error Rate: 98.46%
Epoch [5/50], Training Loss: 6.66e-02, Validation Loss: 4.68e-02, Frame Error Rate: 98.93%
Epoch [6/50], Training Loss: 6.30e-02, Validation Loss: 4.20e-02, Frame Error Rate: 98.21%
Epoch [7/50], Training Loss: 6.05e-02, Validation Loss: 4.04e-02, Frame Error Rate: 98.79%
Epoch [8/50], Training Loss: 5.85e-02, Validation Loss: 3.96e-02, Frame Error Rate: 98.92%
Epoch [9/50], Training Loss: 5.31e-02, Validation Loss: 4.01e-02, Frame Error Rate: 98.58%
Epoch [10/50], Training Loss: 6.52e-02, Validation Loss: 3.88e-02, Frame Error Rate: 99.24%
Epoch [11/50], Traini

KeyboardInterrupt: 