In [1]:
import torch
import torchaudio
import torchaudio.transforms as transforms 
import torch.nn as nn

import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import glob
import os
import numpy as np
import matplotlib.pyplot as plt

import scipy.signal
import librosa
import soundfile as sf
import shutil

from sklearn.cluster import KMeans
from tqdm import tqdm

torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data wrangling

In [3]:
import os
import numpy as np
import shutil
from sklearn.model_selection import train_test_split
import time

class DataSplitter:
    def __init__(self, data_dir, bird_name, test_size=0.2):
        self.data_dir = data_dir
        self.bird_name = bird_name
        self.test_size = test_size

    def split_data(self, root_dir):  # Add root_dir parameter here
        # Get the filenames
        filenames = os.listdir(os.path.join(self.data_dir, f"{self.bird_name}_songs"))
        filenames = [f for f in filenames if f.endswith('.wav')]
        
        # Generate train and test split
        train_files, test_files = train_test_split(filenames, test_size=self.test_size, random_state=42)

        # Create directories if they do not exist
        train_dir = os.path.join(root_dir, "train")  # Use root_dir instead of self.data_dir
        test_dir = os.path.join(root_dir, "test")  # Use root_dir instead of self.data_dir
        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(test_dir, exist_ok=True)

        # Copy the files
        start = time.time()
        num_train_files = self._copy_files(train_files, train_dir)
        num_test_files = self._copy_files(test_files, test_dir)
        end = time.time()
        
        print(f"Copied {num_train_files} train files and {num_test_files} test files")
        print(f"Total copy duration: {end - start} seconds")

    def _copy_files(self, file_list, target_dir):
        count = 0
        for f in file_list:
            # Check if corresponding npz file exists
            src_npz = os.path.join(self.data_dir, f"{self.bird_name}_data_matrices", f"{f}.npz")
            if os.path.exists(src_npz):
                # Copy wav file
                src = os.path.join(self.data_dir, f"{self.bird_name}_songs", f)
                dst = os.path.join(target_dir, f)
                shutil.copy(src, dst)
                
                # Copy npz file
                dst_npz = os.path.join(target_dir, f"{f}.npz")
                shutil.copy(src_npz, dst_npz)
                count += 1

        return count

root_dir = "/home/george-vengrovski/Documents/classifier_test"
data_dir = "/home/george-vengrovski/Documents/canary_data"
bird_name = "llb3"

splitter = DataSplitter(data_dir, bird_name)
splitter.split_data(root_dir)  # Pass root_dir as an argument here

Copied 2108 train files and 547 test files
Total copy duration: 75.93291974067688 seconds


In [90]:
from scipy.signal import butter, lfilter

# Create a Butterworth highpass filter.
def butter_highpass(cutoff, fs, order=5):
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='high', analog=False)
    return b, a

def butter_highpass_filter(data, cutoff, fs, order=5):
    b, a = butter_highpass(cutoff, fs, order=order)
    y = lfilter(b, a, data)
    return y

class SongDataSet(Dataset):
    def __init__(self, train_or_test, root_dir):
        self.root_dir = root_dir
        self.file_dirs = []
        self.label_dirs = []
        
        if train_or_test not in ["train", "test"]:
            raise ValueError("train_or_test must be either train or test")

        self.file_dir = os.path.join(self.root_dir, train_or_test)

        # Collect all files in the given directories
        for file in os.listdir(self.file_dir):
            if file.endswith('.wav'):
                file_path = os.path.join(self.file_dir, file)
                label_path = os.path.join(self.file_dir, f"{os.path.splitext(file)[0]}.wav.npz")
                if os.path.exists(label_path):
                    self.file_dirs.append(file_path)
                    self.label_dirs.append(label_path)

    def __getitem__(self, index):
        # Load waveforms and labels
        waveform, _ = librosa.load(self.file_dirs[index], sr=36200)  
        
        # Apply the high-pass filter
        waveform = butter_highpass_filter(waveform, cutoff=500, fs=36200)
        
        # If the waveform's length is less than 1 second, recursively call this function on the next index
        if len(waveform) < 36200:  # For 1 second of data at 36200 Hz sample rate
            return self.__getitem__(index + 1 if index + 1 < self.__len__() else 0)

        label_data = np.load(self.label_dirs[index])
        label = label_data['labels']

        # Slice the first second of waveform and first 362 labels
        waveform = waveform[:36200]
        label = label[:,:,:362]

        # Convert to torch tensors
        waveform = torch.from_numpy(waveform).float()
        label = torch.from_numpy(label).long()

        return waveform, label

    def __len__(self):
        return len(self.file_dirs)
    
data_root = "/home/george-vengrovski/Documents/classifier_test"
train_dataset = SongDataSet("train", root_dir=data_root)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

test_dataset = SongDataSet("test", root_dir=data_root)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)

In [9]:
class Classifier(nn.Module):
    def __init__(self, feature_extractor_dim, hidden_size, num_layers, projection_dim):
        super(Classifier, self).__init__()

        # feature extractor
        self.conv1 = nn.Conv1d(1, feature_extractor_dim, kernel_size=10, stride=6)
        self.conv2 = nn.Conv1d(feature_extractor_dim, feature_extractor_dim, kernel_size=5, stride=2)
        self.conv3 = nn.Conv1d(feature_extractor_dim, feature_extractor_dim, kernel_size=4, stride=2)
        self.conv4 = nn.Conv1d(feature_extractor_dim, feature_extractor_dim, kernel_size=3, stride=2)
        self.conv5 = nn.Conv1d(feature_extractor_dim, feature_extractor_dim, kernel_size=3, stride=2)
        self.conv6 = nn.Conv1d(feature_extractor_dim, feature_extractor_dim, kernel_size=3, stride=1)
        self.conv7 = nn.Conv1d(feature_extractor_dim, feature_extractor_dim, kernel_size=2, stride=1)
        self.adapatvie_pool = nn.AdaptiveAvgPool1d(362)

        # GRU Layer
        self.gru = nn.GRU(input_size=feature_extractor_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.projection_matrix = nn.Linear(hidden_size*2, 1)
        

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.relu(self.conv5(x))
        x = F.relu(self.conv6(x))
        x = F.relu(self.conv7(x))
        x = self.adapatvie_pool(x)

        # Reshape for GRU
        x = x.permute(0, 2, 1)

        # x is the output for each timestep of the GRU while h is the final hidden state
        x, h = self.gru(x)
        # # x = x.permute(0, 2, 1)
        x = self.projection_matrix(x)
        # # argmax dim 2 
        x = x.squeeze(2)

        return x 
    
    def loss(self, predictions, labels):
        labels = labels.squeeze(1)

        pred_length = predictions.shape[1]

        labels = labels[:, :pred_length]
        # cross entropy loss
        loss = F.cross_entropy(predictions.float(), labels.float())
        return loss

In [29]:
# get cuda devices
devices=torch.cuda.device_count()
print("cuda devices: ",devices)

# set device to 2 
device = torch.device( "cpu")
print(device)

cuda devices:  2
cpu


In [126]:
# Given spoof waveform and converting it to the device
spoof_waveform = torch.randn(1, 1, 36200)
spoof_waveform = spoof_waveform.to(device)

# in one second on of input, there are 362 time bins 

# Your classifier model (I'm assuming you have it defined already)
model = Classifier(feature_extractor_dim=128, hidden_size=128, num_layers=1, projection_dim=1)

# Passing the spoof waveform through the model
output = model.forward(spoof_waveform)

# Printing the shape of the output
print(output.shape)


torch.Size([1, 362])


In [99]:
# get a random sample from dataloader

waveform, label = next(iter(train_loader))

epochs = 100
learning_rate = 1e-3
max_batches = 150

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
model = Classifier(feature_extractor_dim=256, hidden_size=128, num_layers=1, projection_dim=32).to(device)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

loss_list = []

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
model.train()
for epoch in range(epochs):
    total_loss = 0  # reset total loss for each epoch
    num_batches = 0  # reset num_batches for each epoch
    
    for i, (waveform, label) in enumerate(train_loader):
        if i >= max_batches:  # only use max_batches batches
            break
        # move to device
        waveform = waveform.unsqueeze(1).to(device)
        label = label.to(device)  

        print(waveform.shape)

        # forward pass
        output = model.forward(waveform)
        print(output.shape)
        print(label.shape)
        break

        # Compute loss
        # loss = model.loss(predictions=output, labels=label)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # accumulate loss
        total_loss += loss.item()
        num_batches += 1

    avg_loss = total_loss / num_batches  # compute average loss
    loss_list.append(avg_loss)
    print ('Epoch [{}/{}], Average Loss: {:.4f}'.format(epoch+1, epochs, avg_loss))

    # # Save model every 10 epochs
    # if (epoch+1) % 10 == 0:
    #     torch.save(model.state_dict(), f'hubert_model_epoch_{epoch+1}.pth')

# print loss curve
plt.plot(loss_list)

The model has 1,611,777 trainable parameters
torch.Size([1, 1, 36200])
torch.Size([1, 362])
torch.Size([1, 1, 6674])


ZeroDivisionError: division by zero