In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as ds
import torchvision.transforms as trans
import torchvision
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import datetime
import math, random
import librosa
import soundfile as sf
import waveform_tooling as wt
import os
from torch.autograd import Variable

# Report which compute backends are usable.
# torch.cuda.init() raises when no CUDA device/driver is present, which would
# abort this cell before either flag is printed, so only touch the CUDA
# runtime when it is actually available.
if torch.cuda.is_available():
    torch.cuda.init()
    torch.cuda.empty_cache()
print(torch.cuda.is_available())
print(torch.backends.mkl.is_available())

True
True


In [2]:
# Split input file into a set of waveforms

inputFile = "./LJ001-0005-44100.wav"
outputPath = "./waveforms/human-voice/wav"

# sr=None asks librosa to keep the file's own sample rate.
y, sr = librosa.load(inputFile, sr = None)

# Peak-normalise the signal. A completely silent file has a peak of 0 and
# dividing by it would turn every sample into NaN, so it is left untouched.
peak = np.amax(np.absolute(y))
if peak > 0:
    y = y / peak

# NOTE(review): the meaning of the literal 66 is defined by
# waveform_tooling.getWavelengthMap - confirm against that module.
wmap = wt.getWavelengthMap(y, 66)
waveforms, wavelengths = wt.split(y, wmap, oversampling = 2, encoderWidth = 600, mode = wt.lastCrossingPoint)

os.makedirs(outputPath, exist_ok = True)

# One file per waveform, named by its zero-padded position in the sequence.
for index, waveform in enumerate(waveforms):
    sf.write(outputPath + '/%04d.wav' % index, waveform, 44100, 'PCM_24')

In [3]:
# Utility functions

class Flatten(nn.Module):
    """Collapse every dimension after the first into a single dimension."""

    def forward(self, input):
        # Keep the leading dimension and let view() infer the size of the
        # merged trailing dimensions.
        leading = input.shape[0]
        flattened = input.view(leading, -1)
        return flattened

class Reshape(nn.Module):
    """View the input with a fixed trailing shape, keeping its first dimension.

    The positional arguments given to the constructor are the sizes of every
    dimension after the first one.
    """

    def __init__(self, *args):
        super().__init__()
        # Stored under `shape`; forward() reads it back on every call.
        self.shape = args

    def forward(self, x):
        target_shape = (x.size(0), *self.shape)
        return x.view(target_shape)

In [4]:
# Autoencoder

# Width of the bottleneck vector produced by the encoder.
n_latent_dim = 32

class Autoencoder(nn.Module):
    """1-D convolutional autoencoder with a tanh-bounded latent vector.

    The encoder is five Conv1d -> Tanh -> MaxPool1d(2) stages followed by two
    Linear -> Tanh layers that end in `n_latent_dim` values. The decoder
    mirrors it with Linear layers, nearest-neighbour upsampling and
    ConvTranspose1d layers, and has no activation on its final layer.

    The first Linear layer of the encoder expects the convolutional stack to
    leave 14 frames per channel, which is what a 600-sample input produces;
    the decoder always emits 600 samples.
    """

    def __init__(self):
        super().__init__()

        n_channels = 256
        # Kernel widths of the encoder stages, in order; the decoder walks
        # them backwards.
        kernel_sizes = (25, 25, 5, 5, 3)
        # Frames per channel left after the last pooling stage.
        n_frames = 14

        encoder_layers = []
        in_channels = 1
        for kernel_size in kernel_sizes:
            encoder_layers.append(nn.Conv1d(in_channels, n_channels, kernel_size, 1, padding=0))
            encoder_layers.append(nn.Tanh())
            encoder_layers.append(nn.MaxPool1d(2))
            in_channels = n_channels
        encoder_layers.extend([
            Flatten(),
            nn.Linear(n_channels * n_frames, n_channels),
            nn.Tanh(),
            nn.Linear(n_channels, n_latent_dim),
            nn.Tanh(),
        ])
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(n_latent_dim, n_channels),
            nn.Tanh(),
            nn.Linear(n_channels, n_channels * n_frames),
            nn.Tanh(),
            Reshape(n_channels, n_frames),
        ]
        decoder_kernels = kernel_sizes[::-1]
        final_stage = len(decoder_kernels) - 1
        for stage, kernel_size in enumerate(decoder_kernels):
            is_final = stage == final_stage
            out_channels = 1 if is_final else n_channels
            decoder_layers.append(nn.Upsample(scale_factor = 2, mode='nearest'))
            decoder_layers.append(nn.ConvTranspose1d(n_channels, out_channels, kernel_size, 1, padding=0))
            # The last stage produces the waveform directly, without Tanh.
            if not is_final:
                decoder_layers.append(nn.Tanh())
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode `x` to the latent vector and decode it back."""
        return self.decoder(self.encoder(x))

    def num_flat_features(self, x):
        """Return the product of all dimensions of `x` except the first."""
        total = 1
        for dim in x.shape[1:]:
            total *= dim
        return total
    
def wav_loader(path):
    """Load the audio file at `path` and return its samples as a tensor.

    The file is read with `sr=None`, i.e. librosa is asked not to resample;
    the sample rate it reports is discarded.
    """
    samples, _ = librosa.load(path, sr = None)
    return torch.tensor(samples)

def npy_loader(path):
    """Load the NumPy array stored at `path` and wrap it as a tensor."""
    return torch.from_numpy(np.load(path))

def load(encoder, decoder, map_location):
    """Build an Autoencoder whose two halves come from saved files.

    Args:
        encoder: path (or file object) of the saved encoder module.
        decoder: path (or file object) of the saved decoder module.
        map_location: forwarded to torch.load; decides on which device the
            loaded tensors end up.

    Returns:
        An Autoencoder whose `encoder` and `decoder` attributes are the
        loaded objects.
    """
    # The freshly built wrapper only serves as a container: both sub-modules
    # are replaced below and it owns no other parameters. It is therefore not
    # moved to the GPU here - the device is decided by `map_location` alone,
    # which also lets map_location='cpu' work on hosts without CUDA.
    autoencoder = Autoencoder()

    # SECURITY: the loaded objects are used directly as modules, so the files
    # are unpickled as full objects. Unpickling can execute arbitrary code -
    # only load files from a trusted source.
    # NOTE(review): recent PyTorch releases default torch.load to
    # weights_only=True, which may refuse such files - confirm against the
    # installed version.
    autoencoder.encoder = torch.load(encoder, map_location=map_location)
    autoencoder.decoder = torch.load(decoder, map_location=map_location)
    return autoencoder

def _spectrum(signal):
    """Return the one-sided FFT over the last dimension as a real tensor.

    The result has a trailing dimension of size 2 holding the real and
    imaginary parts, matching the layout of the legacy `torch.rfft(x, 1)`.
    """
    if hasattr(torch, 'rfft'):
        # Legacy API, removed in PyTorch 1.8.
        return torch.rfft(signal, 1)
    return torch.view_as_real(torch.fft.rfft(signal, dim=-1))


def train(datasetFolder):
    """Train the module-level `autoencoder` on the .wav files of a folder.

    The loss is the L1 distance between the spectrum of the reconstruction
    and the spectrum of the input. Every tenth epoch (except the first) a
    fixed test waveform and its reconstruction are plotted.

    Relies on a global named `autoencoder` being defined before the call, and
    on a CUDA device: batches are moved to the GPU with `.cuda()`.

    Args:
        datasetFolder: root directory handed to torchvision's DatasetFolder.

    Raises:
        ValueError: if the dataset holds fewer samples than one batch, in
            which case no optimisation step could ever run.
    """
    batchSize = 500

    data = ds.DatasetFolder(datasetFolder,
        loader=wav_loader,
        extensions='.wav')

    # Only full batches are used; with fewer samples than one batch every
    # epoch would be empty and the average loss below would divide by zero.
    if len(data) < batchSize:
        raise ValueError(
            'dataset %r holds %d samples, fewer than one batch of %d'
            % (datasetFolder, len(data), batchSize))

    # drop_last discards the trailing partial batch.
    sampleloader = torch.utils.data.DataLoader(data, batch_size=batchSize,
                                              shuffle=True, num_workers=0,
                                              drop_last=True)

    spectral_loss_criterion = nn.L1Loss().cuda()
    autoencoder_optimizer = optim.Adam(autoencoder.parameters(), lr=0.0001, betas=(0.5, 0.999))

    for epoch in range(10000):
        running_spectral_loss = 0.0
        counter = 0

        if epoch == 50:
            # Continue with a lower learning rate. Building a new optimizer
            # also starts from fresh optimizer state.
            autoencoder_optimizer = optim.Adam(autoencoder.parameters(), lr=0.00002, betas=(0.5, 0.999))

        for waveforms, _ in sampleloader:
            # Add the channel dimension that Conv1d expects.
            batch = waveforms.unsqueeze(1).cuda()

            autoencoder_optimizer.zero_grad()
            reconstructed = autoencoder(batch)

            spectral_loss = spectral_loss_criterion(
                _spectrum(reconstructed.squeeze(1)),
                _spectrum(batch.squeeze(1)))

            spectral_loss.backward()
            autoencoder_optimizer.step()

            running_spectral_loss += spectral_loss.item()
            counter += 1

        print('[%s] [%d] loss: %.5f' %
          (datetime.datetime.now(),
            epoch + 1,
            running_spectral_loss / counter))

        if(epoch % 10 == 0 and epoch > 0):
            testInput = wav_loader('./waveforms/human-voice-test/1570.wav').unsqueeze(0).unsqueeze(0).cuda()

            # Preview only: no gradients are needed for this forward pass.
            with torch.no_grad():
                testEncoded = autoencoder(testInput)

            plt.figure(1)
            plt.title("Original")
            plt.plot(testInput.squeeze(0).squeeze(0).cpu().detach().numpy())
            plt.show()

            plt.figure(2)
            plt.title("Encoded")
            plt.plot(testEncoded.squeeze(0).squeeze(0).cpu().detach().numpy())
            plt.show()

In [6]:
# Weights
#
# Either start from a fresh model and train it, or (as below) load a
# previously saved encoder/decoder pair.

#autoencoder = Autoencoder().cuda()
#train("./waveforms/human-voice")

encoder_path = "./human-voice-last-crossing-point-encoder.net"
decoder_path = "./human-voice-last-crossing-point-decoder.net"
autoencoder = load(encoder_path, decoder_path, "cuda")

In [7]:
# Process the waveforms

inputPath = './waveforms/human-voice/wav/'
outputPath = './waveforms/human-voice-output/'

os.makedirs(outputPath, exist_ok=True)

# Running latent vector, starting from all zeros. Each new encoding is
# blended into it half-and-half before decoding.
latent_state = torch.zeros(1, 1, n_latent_dim).cuda()

# os.listdir() returns names in arbitrary order, but the blending below
# depends on the chunks arriving in sequence, so iterate in name order.
# Entries that are not .wav files are skipped.
with torch.no_grad():  # inference only
    for f in sorted(os.listdir(inputPath)):
        if not f.endswith('.wav'):
            continue

        # Add batch and channel dimensions.
        chunk = wav_loader(inputPath + f).unsqueeze(0).unsqueeze(0).cuda()

        latent = autoencoder.encoder(chunk)
        latent_state = latent_state.lerp(latent, 0.5)
        decoded = autoencoder.decoder(latent_state)

        # Written under the same file name as the input chunk.
        sf.write(outputPath + f, decoded.squeeze(0).squeeze(0).cpu().numpy(), 44100, 'PCM_24')

In [8]:
# Merge back

inputPath = './waveforms/human-voice-output/'

outputFilename = 'render-human-voice.wav'

# os.listdir() returns names in arbitrary order; the chunks must be joined in
# sequence, so load them in name order. Entries that are not .wav files are
# skipped.
waveforms = [
    librosa.load(inputPath + name, sr = None)[0]
    for name in sorted(os.listdir(inputPath))
    if name.endswith('.wav')
]

output = wt.merge(waveforms, oversampling = 2, mode = wt.lastCrossingPoint)
sf.write('./' + outputFilename, output, 44100, 'PCM_24')