In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numba as nb
import librosa as lib
import time
import warnings
warnings.filterwarnings("ignore")
import librosa.display
import IPython.display
from scipy.fftpack import dct
from scipy.fftpack import idct
from math import pi
from math import e
from sklearn.metrics import mean_squared_error as mse
from collections import defaultdict

Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


In [2]:
train_voice = pd.read_csv('common-voice/cv-valid-train.csv')
train_noise = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')

In [3]:
voice = list(train_voice.filename.values)

In [4]:
fold_nums = [1,2,3,4,5,6,7,8,9,10]

In [5]:
def organize_data(voice, noise_data, fold_nums):
    result = {}
    result['voice'] = []
    result['combined'] = []
    count = 0
    for fold_num in fold_nums:
        noise = list(noise_data[noise_data.fold == fold_num][noise_data['class'] == 'children_playing'].slice_file_name)
        for i in range(len(noise)):
            sample_voice = list(lib.load('common-voice/cv-valid-train/' + voice[count])[0])
            sample_noise = list(lib.load('UrbanSound8K/audio/fold{}/'.format(fold_num) + noise[i])[0])
            extended_noise = sample_noise

            while len(sample_voice) > len(extended_noise):
                extended_noise.extend(sample_noise)

            sample_combined = []
            for val1, val2 in zip(sample_voice, extended_noise):
                sample_combined.append(val1 + val2)

            result['voice'].append(sample_voice)
            result['combined'].append(sample_combined)
            count += 1
        print('Success')
        
    return result


In [6]:
def MLT_window(N):
    helper = []
    
    for n in range(N):
        helper.append(np.sin(pi*((n+.5)/N)))
        
    return helper

In [7]:
def forward_stdct(sample, W, m, L, N):
    s_hat = []
    
    for i in range(len(m)):
        W_sample = np.multiply(W, sample[m[i]:m[i] + N])
        DCT = dct(W_sample, norm = 'ortho')
        s_hat.append(DCT)
    
    return s_hat

In [8]:
def backward_stdct(s_hat, W, m, L, N):
    s_new = [0.0]*L
    
    for i in range(len(m)):
        IDCT = idct(s_hat[i].to('cpu').detach().numpy(), norm = 'ortho')
        helper = np.multiply(W, IDCT)
        s_new[m[i]:m[i]+N] += helper
        
    return s_new

In [9]:
def transform_data(data, labels, W, N, data_str, label_str, cuda):
    data_dict = {}
    data_dict[data_str] = defaultdict(list)
    data_dict[label_str] = defaultdict(list)
    for n in range(len(data)):
        L = len(data[n])
        m = [int(i*(N/2)) for i in range(int((2*L/N) - 1))]
        s_hat = forward_stdct(data[n], W, m, L, N)
        label_hat = forward_stdct(labels[n], W, m, L, N)
        length = len(s_hat)
        for i in range(length):
            if (i > 1):
                prev2_window = s_hat[i - 2]
                prev1_window = s_hat[i-1]
                cur_window = s_hat[i]
                new_cur_vector = np.array([prev2_window, prev1_window, cur_window]).flatten()
                current_torch_vector = torch.tensor(new_cur_vector, dtype = torch.double, device = cuda)
                data_dict[data_str]['noisy_sample_{}'.format(n)].append(current_torch_vector)
                
                current_torch_vector_label = torch.tensor(label_hat[i], dtype = torch.double, device = cuda)
                data_dict[label_str]['voice_sample_{}'.format(n)].append(current_torch_vector_label)
                
            elif i == 0:
                zeros = [0]*N
                
                first_window = s_hat[0]
                first_vector = np.array([zeros, zeros, first_window]).flatten()
                torch_vector = torch.tensor(first_vector, dtype = torch.double, device = cuda)
                data_dict[data_str]['noisy_sample_{}'.format(n)].append(torch_vector)
                
                torch_vector_label = torch.tensor(label_hat[0], dtype = torch.double, device = cuda)
                data_dict[label_str]['voice_sample_{}'.format(n)].append(torch_vector_label)
            
            elif i == 1:
                zeros = [0]*N
                
                first_window = s_hat[0]
                second_window = s_hat[1]
                second_vector = np.array([zeros, first_window, second_window]).flatten()
                second_torch_vector = torch.tensor(second_vector, dtype = torch.double, device = cuda)
                data_dict[data_str]['noisy_sample_{}'.format(n)].append(second_torch_vector)
                
                second_torch_vector_label = torch.tensor(label_hat[1], dtype = torch.double, device = cuda)
                data_dict[label_str]['voice_sample_{}'.format(n)].append(second_torch_vector_label)
        
    return data_dict

In [10]:
class FourLayerNet(nn.Module):
    
    def __init__(self, Din, H1, H2, H3, Dout):
        super(FourLayerNet, self).__init__()
        
        self.linear1 = nn.Linear(Din, H1)
        self.linear2 = nn.Linear(H1, H2)
        self.linear3 = nn.Linear(H2, H3)
        self.linear4 = nn.Linear(H3, Dout)
        
    def forward(self, x):
        first = F.elu(self.linear1(x))
        second = F.selu(self.linear2(first))
        third = F.gelu(self.linear3(second))
        fourth = self.linear4(third)
        
        return fourth

In [11]:
class ThreeLayerNet(nn.Module):
    
    def __init__(self, Din, H1, H2, Dout):
        super(ThreeLayerNet, self).__init__()
        
        self.linear1 = nn.Linear(Din, H1)
        self.linear2 = nn.Linear(H1, H2)
        self.linear3 = nn.Linear(H2, Dout)
        
    def forward(self, x):
        first = F.selu(self.linear1(x))
        second = F.gelu(self.linear2(first))
        third = self.linear3(second)
        
        return third

In [12]:
class TwoLayerNet(nn.Module):
    
    def __init__(self, Din, H1, Dout):
        super(TwoLayerNet, self).__init__()
        
        self.linear1 = nn.Linear(Din, H1)
        self.linear2 = nn.Linear(H1, Dout)
        
    def forward(self, x):
        first = F.tanh(self.linear1(x))
        second = self.linear2(first)
        
        return second

In [13]:
def train_network(model, num_epochs, loss_fn, optimizer, train_data):
    for t in range(num_epochs):
        total_loss = []
        for index in range(len(train_data['train_data'])):
            x = torch.stack(train_data['train_data']['noisy_sample_{}'.format(index)]).to(cuda)
            y = torch.stack(train_data['train_labels']['voice_sample_{}'.format(index)]).to(cuda)
            y_pred = model.forward(x)

            loss = loss_fn(y_pred, y)
            total_loss.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if t%10 == 0:
            print(t, np.mean(total_loss))
            
    return model

In [14]:
def validation(validation_data, validation_data_dict, model):
    mse_original = []
    mse_dirty = []
    mse_clean = []
    for i in range(len(validation_data_dict['validation_data'])):
        L = len(validation_data['combined'][i])
        N = 128
        W = MLT_window(N)
        m = [int(i*(N/2)) for i in range(int((2*L/N)-1))]
        
        changed_shat_val = []
        for el in validation_data_dict['validation_data']['noisy_sample_{}'.format(i)]:
            changed_shat_val.append(model(el))
        
        s_new_val = backward_stdct(changed_shat_val, W, m, L, N)
        s_new_clean = backward_stdct(validation_data_dict['validation_labels']['voice_sample_{}'.format(i)], W, m, L, N)
        s_new_dirty = validation_data['combined'][i]
        
        mse_original.append(mse(s_new_dirty[N:len(s_new_val)-N], s_new_clean[N:len(s_new_val)-N]))
        mse_dirty.append(mse(s_new_val[N:len(s_new_val)-N], s_new_dirty[N:len(s_new_val)-N]))
        mse_clean.append(mse(s_new_val[N:len(s_new_val)-N], s_new_clean[N:len(s_new_val)-N]))
        
    print('Average MSE between the original noisy and clean signal: {}'.format(np.mean(mse_original)))
    print('Average MSE between model output and noisy singal: {}'.format(np.mean(mse_dirty)))
    print('Average MSE between model output and clean signal: {}'.format(np.mean(mse_clean)))

## Data Preparation

In [15]:
cuda = torch.device('cuda')

In [16]:
train_data = organize_data(voice, train_noise, fold_nums[:6])

Success
Success
Success
Success
Success
Success


In [21]:
train_data_dict = transform_data(train_data['combined'], train_data['voice'], W, N, 'train_data', 'train_labels', cuda)

## Neural Network Model Instantiation and Training

In [22]:
model1 = FourLayerNet(128*3, int(128*2.5), 128*2, int(128*1.5), 128).double().to(cuda)
optimizer1 = torch.optim.Adam(model1.parameters())

In [23]:
model2 = ThreeLayerNet(128*3, int(128*2.5), int(128*1.5), 128).double().to(cuda)
optimizer2 = torch.optim.Adam(model2.parameters())

In [24]:
model3 = TwoLayerNet(128*3, 128*2, 128).double().to(cuda)
optimizer3 = torch.optim.Adam(model3.parameters())

In [25]:
loss_function = nn.MSELoss(reduction = 'mean')

In [26]:
two_layer_net = train_network(model3, 200, loss_function, optimizer3, train_data_dict)

0 0.001188422360564077
10 0.0007634426073312159
20 0.0007556456462319581
30 0.0007498989407183789
40 0.0007448335659721771
50 0.0007420943784799458
60 0.0007398044751288581
70 0.0007379940314625577
80 0.000736326497608683
90 0.000734881670237516
100 0.0007340140764064712
110 0.0007324249396463482
120 0.0007314613267109296
130 0.0007311715470258406
140 0.0007303210944592649
150 0.0007294768820255279
160 0.0007288913247468209
170 0.0007283497067206976
180 0.0007277409462604447
190 0.000726860762847975


In [27]:
three_layer_net = train_network(model2, 200, loss_function, optimizer2, train_data_dict)

0 0.001221295324761317
10 0.000736445912133225
20 0.0006751473652961667
30 0.0006482751691584887
40 0.0006329092781529569
50 0.0006201950451189576
60 0.0006085090982240824
70 0.0006012241386874764
80 0.0005942088706933905
90 0.0005884122826419335
100 0.0005822178414069758
110 0.0005756504298278776
120 0.0005698992752151779
130 0.0005650527128793198
140 0.0005590968373351507
150 0.0005547059383862573
160 0.0005515159772711032
170 0.000549775316513866
180 0.0005463910012885757
190 0.0005439800172332671


In [28]:
four_layer_net = train_network(model1, 200, loss_function, optimizer1, train_data_dict)

0 0.0013330905394985696
10 0.0007585145118679859
20 0.0006759971341091145
30 0.0006373938851638815
40 0.0006116250622458589
50 0.0005943550895553765
60 0.0005817334352666519
70 0.0005696276244184572
80 0.0005579512547373587
90 0.0005505735765437926
100 0.000546246113433163
110 0.0005394817035597474
120 0.0005354037170536995
130 0.0005290663780511821
140 0.00052627970405805
150 0.0005185233534035745
160 0.0005149426988342548
170 0.000511807926000944
180 0.0005061297978466665
190 0.0005022031844768415


## Model Validation

In [34]:
validation_data = organize_data(voice, train_noise, fold_nums[6:8])

Success
Success


In [47]:
validation_data_dict = transform_data(validation_data['combined'], validation_data['voice'], W, N, 'validation_data', 'validation_labels', cuda)


In [43]:
validation(validation_data, validation_data_dict, two_layer_net)

Average MSE between the original noisy and clean signal: 0.004261348032147487
Average MSE between model output and noisy singal: 0.002390543012499608
Average MSE between model output and clean signal: 0.0020025144240509484


In [44]:
validation(validation_data, validation_data_dict, three_layer_net)

Average MSE between the original noisy and clean signal: 0.004261348032147487
Average MSE between model output and noisy singal: 0.0021596526064831013
Average MSE between model output and clean signal: 0.0017421923142928585


In [45]:
validation(validation_data, validation_data_dict, four_layer_net)

Average MSE between the original noisy and clean signal: 0.004261348032147487
Average MSE between model output and noisy singal: 0.0023164105534484677
Average MSE between model output and clean signal: 0.0016182336587833041
