In [1]:
from denoiser import pretrained
from IPython.display import display, Audio
import torch
import torchaudio
import os
from scipy.signal import convolve
from denoiser.dsp import convert_audio
import pandas as pd
import random
import librosa
import math
import numpy as np

# Sample rate (Hz) requested from librosa.load by the helper functions below.
SAMPLE_RATE = 16000

# Impulse-response files; get_convolved_audio indexes this list by context label.
# NOTE(review): the [1:] slice drops whichever entry os.listdir reports first --
# presumably a hidden or non-audio file. os.listdir order is arbitrary; confirm.
RIR_files = os.listdir('RIR/')[1:]

# Background recordings; get_noisy_audio picks BG_files[2 - label].
BG_files = os.listdir('Bg_sound')

# Candidate signal-to-noise ratios: the integers 0 through 9.
SNR = list(range(10))

In [7]:
def get_audio_list(directory):
    """Return the path of every entry found in ``directory``.

    Each name reported by ``os.listdir`` is joined onto ``directory``. The
    entries are neither filtered nor sorted, so the order is whatever the
    operating system reports.
    """
    paths = []
    for entry in os.listdir(directory):
        paths.append(os.path.join(directory, entry))
    return paths

def get_org_audfname(aud_dir, fname):
    """Map a labelled file path to the matching original recording in ``aud_dir``.

    The last ``/``-separated component of ``fname`` is taken and its first two
    characters (the label digit and its separator, e.g. ``0_``) are dropped;
    what remains is joined onto ``aud_dir``.
    """
    labelled_name = fname.rsplit('/', 1)[-1]
    original_name = labelled_name[2:]
    return os.path.join(aud_dir, original_name)

def get_context_label(fname):
    """Return the context label encoded as the first character of the file name.

    Only the leading character of the last ``/``-separated component is
    parsed, so a label is always a single digit.
    """
    basename = fname.rsplit('/', 1)[-1]
    return int(basename[0])

def normalise_npArray(arr):
    """Min-max scale ``arr`` into the range [0, 1].

    Parameters
    ----------
    arr : array-like
        Numeric data of any shape; the global minimum and maximum are used.

    Returns
    -------
    numpy.ndarray
        ``(arr - min) / (max - min)``. A constant input (zero range) yields
        all zeros instead of NaN, and an empty input is returned as an empty
        floating-point array.
    """
    arr = np.asarray(arr)
    # Degenerate results keep a floating dtype, as the division would produce.
    out_dtype = arr.dtype if np.issubdtype(arr.dtype, np.floating) else np.float64
    if arr.size == 0:
        return arr.astype(out_dtype)

    # ndarray reductions run in C; the builtin min()/max() iterate element by
    # element in Python and fail on multi-dimensional input.
    lowest = arr.min()
    span = arr.max() - lowest
    if span == 0:
        # Constant signal: avoid 0/0, which would fill the result with NaN.
        return np.zeros(arr.shape, dtype=out_dtype)
    return (arr - lowest) / span

def get_convolved_audio(aud_dir, filename):
    """Convolve the original recording behind ``filename`` with an impulse response.

    The context label parsed from ``filename`` selects the entry of
    ``RIR_files`` to use. Both the recording and the impulse response are
    loaded at ``SAMPLE_RATE``; only the first ``len(recording)`` samples of
    the convolution are returned. No normalisation is applied at any stage.
    """
    context = get_context_label(filename)
    speech_path = get_org_audfname(aud_dir, filename)
    rir_path = os.path.join('RIR/', RIR_files[context])

    # librosa.load returns (samples, rate); the rate is discarded because it
    # was requested explicitly.
    speech, _ = librosa.load(speech_path, sr=SAMPLE_RATE)
    impulse, _ = librosa.load(rir_path, sr=SAMPLE_RATE)

    n_samples = speech.shape[0]
    return convolve(speech, impulse)[:n_samples]

def get_noise_from_sound(signal, noise, SNR):
    """Rescale ``noise`` so that its RMS lies ``SNR`` dB below the RMS of ``signal``.

    Parameters
    ----------
    signal : numpy.ndarray
        Reference signal whose RMS sets the target level.
    noise : numpy.ndarray
        Noise samples to rescale; the input array is not modified.
    SNR : number
        Desired signal-to-noise ratio in decibels.

    Returns
    -------
    numpy.ndarray
        ``noise`` multiplied by ``required_rms / current_rms``.
    """
    signal_rms = math.sqrt(np.mean(signal**2))
    # RMS the noise must have to hit the requested ratio.
    target_rms = math.sqrt(signal_rms**2 / 10 ** (SNR / 10))

    current_rms = math.sqrt(np.mean(noise**2))
    # All-zero noise: divide by 1 instead of 0 (the product stays all zeros).
    divisor = current_rms if current_rms != 0 else 1

    return noise * (target_rms / divisor)

def add_noise(audio, noise):
    """Mix ``noise`` into ``audio`` at a randomly chosen SNR of 0 to 9 dB.

    Parameters
    ----------
    audio : numpy.ndarray
        Signal to corrupt; its length along the first axis sets the output length.
    noise : numpy.ndarray
        Background recording. It is cropped to the length of ``audio``, or
        repeated end-to-end first when it is shorter.

    Returns
    -------
    numpy.ndarray
        ``audio`` plus the noise rescaled by ``get_noise_from_sound``.

    The chosen SNR is printed so a listener can relate it to the result.
    """
    SNR_choice = random.choice(range(0, 10))
    print(SNR_choice)

    n_samples = audio.shape[0]
    if 0 < noise.shape[0] < n_samples:
        # Loop a short background clip until it covers the audio; a plain
        # slice would leave it short and the sum below could not be formed.
        repeats = -(-n_samples // noise.shape[0])  # ceiling division
        noise = np.concatenate([noise] * repeats)
    noise = noise[:n_samples]

    # The rescaled noise is added as-is. Min-max normalising it afterwards
    # would cancel the SNR scaling (min-max is invariant to positive scaling)
    # and shift the noise by a DC offset.
    return audio + get_noise_from_sound(audio, noise, SNR_choice)

def get_noisy_audio(aud_dir, aud_fname):
    """Build the reverberant, noisy version of the recording behind ``aud_fname``.

    The recording is first convolved via ``get_convolved_audio``. A background
    file is then selected as ``BG_files[2 - label]``, loaded at
    ``SAMPLE_RATE`` and mixed in with ``add_noise``. The result is returned
    without further normalisation.
    """
    context = get_context_label(aud_fname)
    reverberant = get_convolved_audio(aud_dir, aud_fname)

    bg_path = os.path.join('Bg_sound/', BG_files[2 - context])
    background, _ = librosa.load(bg_path, sr=SAMPLE_RATE)

    return add_noise(reverberant, background)

In [5]:
# Directory holding the original recordings, and the labelled files in its n1n2/ subfolder.
aud_dir = '../../LibriVox_Kaggle/achtgesichterambiwasse/'
aud_list = get_audio_list('../../LibriVox_Kaggle/achtgesichterambiwasse/n1n2/')

# Sanity check on one entry: (parsed label, labelled path, derived original path).
(
    get_context_label(aud_list[1]),
    aud_list[1],
    get_org_audfname(aud_dir, aud_list[1]),
)

(0,
 '../../LibriVox_Kaggle/achtgesichterambiwasse/n1n2/0_achtgesichterambiwasse_0009.wav',
 '../../LibriVox_Kaggle/achtgesichterambiwasse/achtgesichterambiwasse_0009.wav')

In [8]:
# Degrade one randomly chosen labelled file (reverberation + background noise).
audio = get_noisy_audio(aud_dir, random.choice(aud_list))

# Inline player for the degraded signal.
display(Audio(data=audio, rate=SAMPLE_RATE))

6


In [None]:
from huggingsound import SpeechRecognitionModel

In [96]:
# NOTE(review): `choi` is not read anywhere else in the visible notebook -- confirm it is still needed.
choi = 2

# List the background-noise directory; the result is shown as the cell output.
os.listdir('Bg_sound')

['restaurant2.wav', 'supermarkt.wav', 'train2.wav']

In [60]:
# Listen to one fixed example before and after reverberation.
sample = aud_list[3]
print(sample, get_context_label(sample))

# The recording and a randomly chosen impulse response are both loaded at
# 22050 Hz here (not SAMPLE_RATE).
wav, sr = librosa.load(get_org_audfname(aud_dir, sample), sr=22050)
rir_wav, sr_rir = librosa.load(os.path.join('RIR/', random.choice(RIR_files)), sr=22050)

# Convolve, keeping only as many samples as the dry recording has.
con_wav = convolve(wav, rir_wav)[:len(wav)]

# Dry recording first, reverberant version second.
display(Audio(wav, rate=sr), Audio(con_wav, rate=sr))

../../LibriVox_Kaggle/achtgesichterambiwasse/n1n2/2_achtgesichterambiwasse_0015.wav 2


In [22]:
# Show the raw contents of the labelled-audio directory (the full listing is
# echoed as the cell output, in the order os.listdir reports it).
os.listdir('../../LibriVox_Kaggle/achtgesichterambiwasse/n1n2/')

['1_achtgesichterambiwasse_0007.wav',
 '0_achtgesichterambiwasse_0009.wav',
 '0_achtgesichterambiwasse_0013.wav',
 '2_achtgesichterambiwasse_0015.wav',
 '2_achtgesichterambiwasse_0022.wav',
 '1_achtgesichterambiwasse_0023.wav',
 '2_achtgesichterambiwasse_0027.wav',
 '0_achtgesichterambiwasse_0029.wav',
 '2_achtgesichterambiwasse_0030.wav',
 '0_achtgesichterambiwasse_0032.wav',
 '1_achtgesichterambiwasse_0034.wav',
 '2_achtgesichterambiwasse_0038.wav',
 '2_achtgesichterambiwasse_0039.wav',
 '1_achtgesichterambiwasse_0045.wav',
 '2_achtgesichterambiwasse_0047.wav',
 '2_achtgesichterambiwasse_0049.wav',
 '0_achtgesichterambiwasse_0055.wav',
 '1_achtgesichterambiwasse_0057.wav',
 '2_achtgesichterambiwasse_0058.wav',
 '0_achtgesichterambiwasse_0061.wav',
 '2_achtgesichterambiwasse_0066.wav',
 '1_achtgesichterambiwasse_0070.wav',
 '2_achtgesichterambiwasse_0072.wav',
 '2_achtgesichterambiwasse_0075.wav',
 '2_achtgesichterambiwasse_0078.wav',
 '2_achtgesichterambiwasse_0082.wav',
 '0_achtgesi

In [17]:
# Run the pretrained DNS64 denoiser on `wav` and play the result.
# Use the GPU when one is available instead of failing on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
denoiser_model = pretrained.dns64().to(device)

# `wav` was produced by librosa.load, i.e. it is a 1-D NumPy array with no
# .cuda() method. Convert it to a float tensor and add the leading channel
# axis before resampling (convert_audio is documented for (channels, time)
# tensors -- see the denoiser README).
wav_tensor = torch.as_tensor(wav, dtype=torch.float32)
if wav_tensor.dim() == 1:
    wav_tensor = wav_tensor[None]

# Resample / remix to what the model expects; `wav` is rebound to the result.
wav = convert_audio(wav_tensor.to(device), sr, denoiser_model.sample_rate, denoiser_model.chin)
with torch.no_grad():
    # wav[None] adds the batch axis; [0] removes it again from the output.
    denoised = denoiser_model(wav[None])[0]

display(Audio(denoised.cpu().numpy(), rate=denoiser_model.sample_rate))