In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torchsummary import summary
from torchmetrics.functional import accuracy

import os
from tqdm.notebook import tqdm
import pydub
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef



In [2]:
learning_rate = 1e-4
eps = 1e-8
batch_size = 32
epochs = 10

# Load and prepare data

In [3]:
df = pd.read_csv('data/train_test_split.csv')

In [4]:
train_wav_list = np.concatenate([
    df['train_cat'].tolist(),
    df[~df['train_dog'].isnull()]['train_dog'].tolist()
])

In [5]:
test_wav_list = np.concatenate([
    df['test_cat'].tolist(),
    df['test_dog'].tolist()
])

In [6]:
class WavDataset(Dataset):
    def __init__(self, wave_list):
        super(WavDataset, self).__init__()
        self.wav_list = wave_list
        self.labels_index = {
            'cat': 0,
            'dog': 1
        }
        self.data_path = 'data/cats_dogs/'
        self.max_ms = 5000
        
    def __len__(self):
        return len(self.wav_list)
    
    def __getitem__(self, index):
        wav_file = self.wav_list[index]
        wav_file_path = os.path.join(self.data_path, wav_file)
        
        labels = self.labels_index[wav_file[:3]]
        sig, sr = torchaudio.load(wav_file_path)
        
        # Resample to 441khz
        resig = torchaudio.transforms.Resample(sr, 44100)(sig[:1, :])
        sr = 44100
        
        # Convert to 2 channel
        resig = torch.cat([resig, resig])
        
        # Pad trunc
        num_rows , sig_len = resig.shape
        max_len = sr // 1000 * self.max_ms
        
        if sig_len > max_len:
            resig = resig[:, :max_len]
        elif sig_len < max_len:
            # Length of padding to add at the beginning and end of the signal
            pad_begin_len = np.random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len
            
            # Pad with 0s
            pad_begin = torch.zeros((num_rows, pad_begin_len))
            pad_end = torch.zeros((num_rows, pad_end_len))
            
            resig = torch.cat((pad_begin, resig, pad_end), 1)
            
        # Convert to mel spectogram
        spec = torchaudio.transforms.MelSpectrogram(
            sr, n_fft=1024, hop_length=None, n_mels=64
        )(resig)
        spec = torchaudio.transforms.AmplitudeToDB(top_db=80)(spec)
        
        return spec, torch.tensor(labels)

In [7]:
train_dataset = WavDataset(train_wav_list)
test_dataset = WavDataset(test_wav_list)

In [8]:
train_sampler = RandomSampler(train_dataset)
test_sampler = SequentialSampler(test_dataset)

train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

# Build a simple model

In [9]:
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(2, 32, 3)
        self.relu = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(2)
        
        self.conv2 = nn.Conv2d(32, 64, 3)
        self.maxpool2 = nn.MaxPool2d(2)
        
        self.flatten = nn.Flatten()
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
        self.fc1 = nn.Linear(64 * 1 * 1, 256)
        self.dropout = nn.Dropout(0.25)
        self.pred = nn.Linear(256, 2)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool1(x)
        
        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool2(x)
        
        x = self.avgpool(x)
        x = self.flatten(x)
        
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.pred(x)
        
        return x

In [10]:
model = SimpleCNN()
model.to('cuda')

SimpleCNN(
  (conv1): Conv2d(2, 32, kernel_size=(3, 3), stride=(1, 1))
  (relu): ReLU()
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (avgpool): AdaptiveAvgPool2d(output_size=1)
  (fc1): Linear(in_features=64, out_features=256, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
  (pred): Linear(in_features=256, out_features=2, bias=True)
)

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, eps=eps)

In [12]:
summary(model, (2, 64, 430))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 32, 62, 428]             608
              ReLU-2          [-1, 32, 62, 428]               0
         MaxPool2d-3          [-1, 32, 31, 214]               0
            Conv2d-4          [-1, 64, 29, 212]          18,496
              ReLU-5          [-1, 64, 29, 212]               0
         MaxPool2d-6          [-1, 64, 14, 106]               0
 AdaptiveAvgPool2d-7             [-1, 64, 1, 1]               0
           Flatten-8                   [-1, 64]               0
            Linear-9                  [-1, 256]          16,640
             ReLU-10                  [-1, 256]               0
          Dropout-11                  [-1, 256]               0
           Linear-12                    [-1, 2]             514
Total params: 36,258
Trainable params: 36,258
Non-trainable params: 0
---------------------------------

# Training Loop

In [13]:
for epoch in tqdm(range(epochs)):
    train_loss = []
    test_loss = []
    
    # Train
    model.train()
    for index, (spec, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        batch_spec = spec.to('cuda')
        batch_label = label.to('cuda')
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        logits = model(batch_spec)
        
        # Back pass
        loss = criterion(logits, batch_label)
        loss.backward()
        optimizer.step()
        
        train_loss.append(loss.item())
        
    # Testing
    model.eval()
    for index, (test_spec, test_label) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        test_batch_spec = test_spec.to('cuda')
        test_batch_label = test_label.to('cuda')
        
        with torch.no_grad():
            test_logits = model(test_batch_spec)
        
        loss = criterion(test_logits, test_batch_label)
        test_loss.append(loss.item())

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore





HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))





KeyError: 'nan'

# Simple Evaluation