In [1]:
import torch
import pytorch_lightning as pl
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
import torchaudio
from src.dataloader import MultiFXDataset
from src.model.resnet import resnet18
import pytorch_lightning as pl
from argparse import ArgumentParser
from logging import Logger
from warnings import simplefilter

In [4]:
# --- Run configuration -------------------------------------------------------
# Root directory of the generated dataset; the 'train' / 'valid' splits are
# selected later when the datasets are constructed.
data = "dataset/generated/gen_multiFX_10102021"

# Whether the model is trained with the clean signal as an extra input channel.
with_clean = True

# Optimisation hyper-parameters.
batch_size = 64
learning_rate = 1e-3

# Keyword arguments forwarded verbatim to pl.Trainer(**trainer_args).
trainer_args = dict(
    gpus=1,
    max_epochs=10,
)

In [5]:
# Mel-spectrogram front end handed to the datasets, which apply it to the
# audio tensors they load.
transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=44100,
    n_fft=2048,
    n_mels=128,
)

# Build the train/validation splits from the same dataset root and wrap them
# in loaders; only the training loader shuffles.
train_set = MultiFXDataset(data, 'train', transform)
valid_set = MultiFXDataset(data, 'valid', transform)
train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=1, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size, num_workers=1)

print("=> Start training")
# `with_clean` is taken from the configuration cell and deliberately NOT
# reassigned here, so changing the configuration actually takes effect.
# Branching on truthiness keeps `in_channels` consistent with the flag that is
# passed to the model below.
if with_clean:
    print("=> Training with clean")
    # presumably one channel each for the clean and the processed signal -- TODO confirm
    in_channels = 2
else:
    print("=> Training no clean")
    in_channels = 1

# Number of output classes is read from the training set's own settings.
model = resnet18(in_channels, train_set.settings["n_classes"], with_clean, learning_rate)

trainer = pl.Trainer(**trainer_args)
trainer.fit(model, train_loader, valid_loader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


=> Start training
=> Training with clean


2021-10-10 00:14:48.260902: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0

  | Name    | Type              | Params
----------------------------------------------
0 | conv1   | Conv2d            | 800   
1 | bn1     | BatchNorm2d       | 32    
2 | relu    | ReLU              | 0     
3 | maxpool | MaxPool2d         | 0     
4 | layer_1 | Sequential        | 33.1 K
5 | layer_2 | Sequential        | 21.8 K
6 | layer_3 | Sequential        | 131 K 
7 | layer_4 | Sequential        | 525 K 
8 | avgpool | AdaptiveAvgPool2d | 0     
9 | fc      | Linear            | 1.7 K 
----------------------------------------------
714 K     Trainable params
0         Non-trainable params
714 K     Total params
2.859     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
    data = fetcher.fetch(index)
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/jovyan/workspace/SFXlearner/src/dataloader.py", line 109, in __getitem__
    datas = self.transform(torch.cat([audio_clean, audio_wet]))
RuntimeError: Sizes of tensors must match except in dimension 0. Got 220500 and 110250 in dimension 1 (The offending index is 1)


In [32]:
import glob
import os

from tqdm import tqdm

# Sample count this scan treats as the correct clip length. The DataLoader
# failure recorded earlier in this notebook reported tensors of 220500 and
# 110250 samples.
EXPECTED_NUM_SAMPLES = 220500

# Collect the file names (not full paths) of every training clip whose length
# differs from the expected one.
error_files = []
for file in tqdm(glob.glob("dataset/generated/gen_multiFX_10102021/train/audio/*.wav")):
    audio, sr = torchaudio.load(file)
    # Dimension 1 of the loaded tensor is compared against the expected length.
    if audio.shape[1] != EXPECTED_NUM_SAMPLES:
        # os.path.basename handles the platform's path separator, unlike a
        # manual split on "/"; later cells parse the numeric stem of this name.
        error_files.append(os.path.basename(file))

print(error_files)


  0%|          | 0/544 [00:00<?, ?it/s][A
 11%|█▏        | 62/544 [00:00<00:00, 612.88it/s][A
 23%|██▎       | 124/544 [00:00<00:00, 579.67it/s][A
 34%|███▍      | 186/544 [00:00<00:00, 595.34it/s][A
 47%|████▋     | 255/544 [00:00<00:00, 631.75it/s][A
 59%|█████▊    | 319/544 [00:00<00:00, 623.87it/s][A
 70%|███████   | 382/544 [00:00<00:00, 588.92it/s][A
 81%|████████▏ | 442/544 [00:00<00:00, 589.94it/s][A
100%|██████████| 544/544 [00:00<00:00, 593.55it/s][A

['95.wav', '27.wav', '163.wav', '231.wav', '299.wav', '367.wav', '435.wav', '503.wav']





In [33]:
# Label tensor for the training split; it is indexed below by the numeric stem
# of each flagged file name.
labels = torch.load("dataset/generated/gen_multiFX_10102021/train/label_tensor.pt")

# Start from a placeholder row of ones so there is something to stack onto.
# The first row of the result is therefore NOT a real label.
error_labels = torch.ones(13)
for wav_name in tqdm(error_files):
    # "95.wav" -> "95" -> 95
    stem = wav_name.partition(".")[0]
    label_row = labels[int(stem)]
    error_labels = torch.vstack((error_labels, label_row))



100%|██████████| 8/8 [00:00<00:00, 6562.57it/s]


In [35]:
# Show the labels gathered for the flagged clips. Row 0 is skipped because it
# is the torch.ones(13) placeholder the stacking loop started from.
print(error_labels[1:])

tensor([[0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]])


In [21]:
# Display label rows 92 through 96 for manual inspection.
# NOTE(review): this window includes index 95, which matches one of the
# flagged file names ("95.wav") -- presumably that is why it was chosen.
print(labels[92:97])

tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])


In [22]:
# Per-class mean over the labels of the flagged clips. Row 0 of `error_labels`
# is the torch.ones(13) placeholder that the stacking loop starts from, so it
# is excluded here; including it would pull every class mean towards 1.
torch.mean(error_labels[1:], axis=0)

tensor(1.)

In [59]:
from IPython.display import Audio

In [61]:
# Render an inline audio player for a single clip.
# NOTE(review): this path is under gen_multiFX_10062021, whereas the rest of
# the notebook works with gen_multiFX_10102021 -- confirm this is intentional.
Audio("dataset/generated/gen_multiFX_10062021/train/audio/11600.wav")