# Audio Autoencoder

In [1]:
import os, re
import numpy as np
import IPython.display as ipd
SPEED = '0.5x'

audio_files = []
for root, _, files in os.walk(os.path.expanduser('~/FMA/fma_small/fma_small/000/')):
    for f in files:
        audio_files.append(root+f)
audio_files[:5]

['/home/b073040018/FMA/fma_small/fma_small/000/000002.mp3',
 '/home/b073040018/FMA/fma_small/fma_small/000/000005.mp3',
 '/home/b073040018/FMA/fma_small/fma_small/000/000010.mp3',
 '/home/b073040018/FMA/fma_small/fma_small/000/000140.mp3',
 '/home/b073040018/FMA/fma_small/fma_small/000/000141.mp3']

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import Dataset, DataLoader

class FMA(Dataset):
    def __init__(self, audio_files):
        self.audio_files = audio_files

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        x, sr = torchaudio.load(audio_files[idx])
        x_stretched, _ = torchaudio.load(re.sub(
            'fma_small/fma_small',
            'fma_small/fma_small_' + SPEED,
            audio_files[idx]
        ))
        # might be mono
        if x.shape[0] != 2:
            x = x.repeat(2, 1)
            x_stretched = x_stretched.repeat(2, 1)
            
        x = x[:, :5*sr] # extract first 5 sec
        x_stretched = x_stretched[:, :10*sr]
        
        return x, x_stretched, sr

  '"sox" backend is being deprecated. '


In [3]:
dataset_train = FMA(audio_files)
print(f'Number of samples: {len(dataset_train)}')
x, x_stretched, sr = dataset_train[0]
print('First item:')
print('Original speed:')
print(x)
ipd.Audio(x[:, :sr*4], rate=sr)

Number of samples: 62
First item:
Original speed:
tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  4.2779e-03,
         -3.0075e-03, -7.6644e-03],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  1.2519e-02,
          5.7099e-03,  1.1541e-05]])


In [4]:
print('Stretched speed:')
print(x_stretched)
ipd.Audio(x_stretched[:, :sr*4], rate=sr)

Stretched speed:
tensor([[ 0.0000,  0.0000,  0.0000,  ..., -0.0343, -0.0420, -0.0403],
        [ 0.0000,  0.0000,  0.0000,  ..., -0.0279, -0.0361, -0.0354]])


In [5]:
train_loader = DataLoader(dataset_train, batch_size=4, shuffle=True)

In [6]:
class UNetDown(nn.Module):
    def __init__(self, in_size, out_size, normalize=True, dropout=0.0):
        super(UNetDown, self).__init__()
        layers = [nn.Conv1d(in_size, out_size, 4, 2, 1, bias=False)]
        if normalize:
            layers.append(nn.InstanceNorm1d(out_size))
        layers.append(nn.Tanh())
        if dropout:
            layers.append(nn.Dropout(dropout))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


class UNetUp(nn.Module):
    def __init__(self, in_size, out_size, dropout=0.0):
        super(UNetUp, self).__init__()
        layers = [
            nn.ConvTranspose1d(in_size, out_size, 4, 2, 1, bias=False),
            nn.InstanceNorm1d(out_size),
            nn.Tanh(),
        ]
        if dropout:
            layers.append(nn.Dropout(dropout))

        self.model = nn.Sequential(*layers)

    def forward(self, x, skip_input):
        x = self.model(x)
        if x.shape[-1] != skip_input.shape[-1]:
            x = F.pad(x, (0, 1))
        
        x = torch.cat((x, skip_input), 1)
        return x
    
class GeneratorUNet(nn.Module):
    def __init__(self, in_channels=2, out_channels=2):
        super(GeneratorUNet, self).__init__()

        self.down1 = UNetDown(in_channels, 64, normalize=False)
        self.down2 = UNetDown(64, 128)
        self.down3 = UNetDown(128, 256)
        self.down4 = UNetDown(256, 512, dropout=0.5)
        self.down5 = UNetDown(512, 512, dropout=0.5)
        self.down6 = UNetDown(512, 512, dropout=0.5)
        self.down7 = UNetDown(512, 512, dropout=0.5)
        self.down8 = UNetDown(512, 512, normalize=False, dropout=0.5)

        self.up1 = UNetUp(512, 512, dropout=0.5)
        self.up2 = UNetUp(1024, 512, dropout=0.5)
        self.up3 = UNetUp(1024, 512, dropout=0.5)
        self.up4 = UNetUp(1024, 512, dropout=0.5)
        self.up5 = UNetUp(1024, 256)
        self.up6 = UNetUp(512, 128)
        self.up7 = UNetUp(256, 64)

        self.final = nn.Sequential(
            nn.Upsample(scale_factor=4),
            nn.Conv1d(128, out_channels, 4, padding=1),
            nn.ConstantPad1d((0, 1), 0),
        )

    def forward(self, x):
        # U-Net generator with skip connections from encoder to decoder
        d1 = self.down1(x)
        d2 = self.down2(d1)
        d3 = self.down3(d2)
        d4 = self.down4(d3)
        d5 = self.down5(d4)
        d6 = self.down6(d5)
        d7 = self.down7(d6)
        d8 = self.down8(d7)
        u1 = self.up1(d8, d7)
        u2 = self.up2(u1, d6)
        u3 = self.up3(u2, d5)
        u4 = self.up4(u3, d4)
        u5 = self.up5(u4, d3)
        u6 = self.up6(u5, d2)
        u7 = self.up7(u6, d1)

        return self.final(u7)


In [7]:
def weights_init_normal(m):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        torch.nn.init.normal_(m.weight.data, 0.0, 0.02)

# Num epochs
start_epoch, max_epoch = 1, 5
log_step = 5

# Loss functions
criterion = torch.nn.MSELoss()

# Initialize generator
generator = GeneratorUNet()

if torch.cuda.is_available():
    generator.cuda()
    criterion.cuda()

generator.apply(weights_init_normal)

# Optimizers
optimizer = torch.optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))

In [8]:
from tqdm import tqdm
from IPython.display import clear_output
import time

t = time.time()
loader = train_loader

for epoch in tqdm(range(start_epoch, max_epoch + 1)):
    for data in loader:
        x = data[0].cuda()
        y = data[1].cuda()
        # ------------------
        #  Train Generators
        # ------------------
        optimizer.zero_grad()
        gen = generator(x)
        loss = criterion(gen, y)
        loss.backward()
        optimizer.step()
        
    print(epoch)
    print(loss.cpu().detach().numpy())

  0%|          | 0/5 [00:00<?, ?it/s]

1


 20%|██        | 1/5 [00:16<01:06, 16.57s/it]

0.08404387
2


 40%|████      | 2/5 [00:32<00:48, 16.24s/it]

0.03482226
3


 60%|██████    | 3/5 [00:47<00:31, 15.99s/it]

0.118692234
4


 80%|████████  | 4/5 [01:02<00:15, 15.82s/it]

0.046151694
5


100%|██████████| 5/5 [01:19<00:00, 15.80s/it]

0.07510684





In [9]:
x, streched, sr = dataset_train[0]
ipd.Audio(x[:, :sr*4], rate=sr)

In [10]:
ipd.Audio(streched[:, :sr*4], rate=sr)

In [11]:
gen = generator(torch.unsqueeze(x, 0).cuda()).cpu().detach().numpy()[0]
ipd.Audio(gen[:, :sr*4], rate=sr)