# Audio Autoencoder

In [1]:
import os
import numpy as np
import IPython.display as ipd

audio_files = []
for root, _, files in os.walk(os.path.expanduser('~/FMA/fma_small/fma_small/000/')):
    for f in files:
        audio_files.append(root+f)
audio_files[:5]

['/home/b073040018/FMA/fma_small/fma_small/000/000002.mp3',
 '/home/b073040018/FMA/fma_small/fma_small/000/000005.mp3',
 '/home/b073040018/FMA/fma_small/fma_small/000/000010.mp3',
 '/home/b073040018/FMA/fma_small/fma_small/000/000140.mp3',
 '/home/b073040018/FMA/fma_small/fma_small/000/000141.mp3']

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import Dataset, DataLoader

class FMA(Dataset):
    def __init__(self, audio_files):
        self.audio_files = audio_files

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        x, sr = torchaudio.load(audio_files[idx])
        if x.shape[0] != 2:
            x = x.repeat(2, 1)
        return x[:, :5*sr], sr

  '"sox" backend is being deprecated. '


In [3]:
dataset_train = FMA(audio_files)
print(f'Number of samples: {len(dataset_train)}')
x, sr = dataset_train[0]
print('First item:')
print(x)
print(x.shape)
ipd.Audio(x, rate=sr)

Number of samples: 62
First item:
tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  4.2779e-03,
         -3.0075e-03, -7.6644e-03],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  1.2519e-02,
          5.7099e-03,  1.1541e-05]])
torch.Size([2, 220500])


In [4]:
train_loader = DataLoader(dataset_train, batch_size=4, shuffle=True)

In [5]:
class UNetDown(nn.Module):
    def __init__(self, in_size, out_size, normalize=True, dropout=0.0):
        super(UNetDown, self).__init__()
        layers = [nn.Conv1d(in_size, out_size, 4, 2, 1, bias=False)]
        if normalize:
            layers.append(nn.InstanceNorm1d(out_size))
        layers.append(nn.Tanh())
        if dropout:
            layers.append(nn.Dropout(dropout))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


class UNetUp(nn.Module):
    def __init__(self, in_size, out_size, dropout=0.0):
        super(UNetUp, self).__init__()
        layers = [
            nn.ConvTranspose1d(in_size, out_size, 4, 2, 1, bias=False),
            nn.InstanceNorm1d(out_size),
            nn.Tanh(),
        ]
        if dropout:
            layers.append(nn.Dropout(dropout))

        self.model = nn.Sequential(*layers)

    def forward(self, x, skip_input):
        x = self.model(x)
        if x.shape[-1] != skip_input.shape[-1]:
            x = F.pad(x, (0, 1))
        x = torch.cat((x, skip_input), 1)

        return x
    
class GeneratorUNet(nn.Module):
    def __init__(self, in_channels=2, out_channels=2):
        super(GeneratorUNet, self).__init__()

        self.down1 = UNetDown(in_channels, 64, normalize=False)
        self.down2 = UNetDown(64, 128)
        self.down3 = UNetDown(128, 256)
        self.down4 = UNetDown(256, 512, dropout=0.5)
        self.down5 = UNetDown(512, 512, dropout=0.5)
        self.down6 = UNetDown(512, 512, dropout=0.5)
        self.down7 = UNetDown(512, 512, dropout=0.5)
        self.down8 = UNetDown(512, 512, normalize=False, dropout=0.5)

        self.up1 = UNetUp(512, 512, dropout=0.5)
        self.up2 = UNetUp(1024, 512, dropout=0.5)
        self.up3 = UNetUp(1024, 512, dropout=0.5)
        self.up4 = UNetUp(1024, 512, dropout=0.5)
        self.up5 = UNetUp(1024, 256)
        self.up6 = UNetUp(512, 128)
        self.up7 = UNetUp(256, 64)

        self.final = nn.Sequential(
            nn.Upsample(scale_factor=2),
            nn.Conv1d(128, out_channels, 4, padding=1),
            nn.ConstantPad1d((0, 1), 0),
        )

    def forward(self, x):
        # U-Net generator with skip connections from encoder to decoder
        d1 = self.down1(x)
        d2 = self.down2(d1)
        d3 = self.down3(d2)
        d4 = self.down4(d3)
        d5 = self.down5(d4)
        d6 = self.down6(d5)
        d7 = self.down7(d6)
        d8 = self.down8(d7)
        u1 = self.up1(d8, d7)
        u2 = self.up2(u1, d6)
        u3 = self.up3(u2, d5)
        u4 = self.up4(u3, d4)
        u5 = self.up5(u4, d3)
        u6 = self.up6(u5, d2)
        u7 = self.up7(u6, d1)

        return self.final(u7)


In [6]:
def weights_init_normal(m):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        torch.nn.init.normal_(m.weight.data, 0.0, 0.02)

# Num epochs
start_epoch, max_epoch = 1, 20
log_step = 5

# Loss functions
criterion = torch.nn.MSELoss()

# Initialize generator
generator = GeneratorUNet()

if torch.cuda.is_available():
    generator = generator.cuda()
    criterion.cuda()

generator.apply(weights_init_normal)

# Optimizers
optimizer = torch.optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))

In [7]:
from tqdm import tqdm
from IPython.display import clear_output
import time

t = time.time()
loader = train_loader

for epoch in range(start_epoch, max_epoch + 1):
    for data in loader:
        x = data[0].cuda()
        # ------------------
        #  Train Generators
        # ------------------
        optimizer.zero_grad()
        gen = generator(x)
        loss = criterion(gen, x)
        loss.backward()
        optimizer.step()
        
    print(epoch)
    print(loss.cpu().detach().numpy())

1
0.015474649
2
0.0034349547
3
0.0035092626
4
0.07747233
5
0.0041097156
6
0.010015577
7
0.002680377
8
0.0020022816
9
0.001545522
10
0.0013035225
11
0.0020604183
12
0.00042945545
13
0.0016565135
14
0.0017092576
15
0.000426073
16
0.00047501025
17
0.0005726219
18
0.0007639767
19
0.001096747
20
0.003775067


In [17]:
x, sr = dataset_train[1]
ipd.Audio(x, rate=sr)

In [18]:
gen = generator(torch.unsqueeze(x, 0).cuda()).cpu().detach().numpy()[0]
ipd.Audio(gen, rate=sr)