In [1]:
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
import numpy as np
import pandas as pd
import os
import random
import torch
import torch.nn as nn
from IPython.display import display, Audio

SAMPLE_RATE = 16000
N_MELS = 40
N_FFT = int(SAMPLE_RATE * 0.04)
HOP_LEN = int(SAMPLE_RATE * 0.02)

mel_spectrogram = T.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    hop_length=HOP_LEN,
    n_mels=N_MELS
)

In [4]:
def load_audio(aud_filename):
    wav, sr = torchaudio.load(aud_filename, normalize=True)
    if wav.shape[0] > 1:
        wav = torch.mean(wav, dim=0, keepdim=True)
    if sr != SAMPLE_RATE:
        resampler = T.Resample(sr, SAMPLE_RATE)
        wav = resampler(wav)

    return wav

def get_duration(aud_filename):
    audio_info = torchaudio.info(aud_filename)
    return (audio_info.num_frames/audio_info.sample_rate)

def round_up(audio):
    rem = SAMPLE_RATE - audio.shape[1]%SAMPLE_RATE
    baggage = torch.zeros([1,rem], dtype=audio.dtype)
    audio = torch.cat((audio,baggage), 1)
    while audio.shape[1] < SAMPLE_RATE*5:
        audio = audio.repeat(1,2)
    return audio

def same_dur_as(audio0, audio1):
    while audio0.shape[1] > audio1.shape[1]:
        audio1 = audio1.repeat(1,2)
        
    return audio1[:,0:audio0.shape[1]]

def add_two_noise(aud_filename, snr_list):
    
    snr = torch.tensor(snr_list)
    wav0 = round_up(load_audio(aud_filename[0]))
    wav1 = round_up(load_audio(aud_filename[1]))
    
    if get_duration(aud_filename[0])>get_duration(aud_filename[1]):
        wav1 = same_dur_as(wav0, wav1)
    else:
        wav0 = same_dur_as(wav1, wav0)
    
    noisy = F.add_noise(wav0, wav1, snr)
    noisy = nn.functional.normalize(noisy)
    return noisy

def get_log_melSpectrogram(audio):
    mel_feats = mel_spectrogram(audio)
    log_mel_feats = T.AmplitudeToDB()(mel_feats)
    return mel_feats

def get_random_audioFeatures(audio):
    
    t_audio = (int)(audio.shape[1]/SAMPLE_RATE)
    rand_sec = random.choice([i for i in range(0,t_audio-5)])
    start_sample = rand_sec*SAMPLE_RATE
    end_sample = (rand_sec+5)*SAMPLE_RATE
    log_mel_features = get_log_melSpectrogram(audio[:,start_sample:end_sample])

    return log_mel_features

In [15]:
sampl = '../audioData/NIGENS/NIGENS/phone/TELEPHONE-ELECTRONIC_GEN-HDF-22874.wav'
sample_ = '../audioData/NIGENS/NIGENS/piano/Piano+2017_58_5.wav'
noise = add_two_noise([sampl, sample_], [5])
feats = get_random_audioFeatures(noise)
noise.isnan().any(), torch.isnan(noise).any(), feats.mean()


(tensor(False), tensor(False), tensor(0.0048))

../audioData/NIGENS/NIGENS/phone/TelephoneElectronic+1025_61.wav

../audioData/NIGENS/NIGENS/phone/Telephone_DIGIP02-63.wav

In [11]:
display(Audio(noise, rate=SAMPLE_RATE))

In [88]:
sample_ = '../../LibriVox_Kaggle/achtgesichterambiwasse/achtgesichterambiwasse_0022.wav'
#sample_1 = '../../LibriVox_Kaggle/achtgesichterambiwasse/achtgesichterambiwasse_0001.wav'
noise = '../audioData/NIGENS/NIGENS/footsteps/FootstepsWood+6017_16_1.wav'
snrlist = [5]

noisy = add_two_noise([sample_, noise], snrlist)
display(Audio(noisy, rate=SAMPLE_RATE))
feats = get_random_audioFeatures(noisy)

In [92]:
inverse_mel_scale_transform = T.InverseMelScale(n_mels=N_MELS, n_stft=N_FFT // 2 + 1)
linear_spectrogram = inverse_mel_scale_transform(feats)

# Inverse Griffin-Lim HOP_LEN
griffin_lim_transform = T.GriffinLim(n_fft=N_FFT, hop_length=HOP_LEN)
reconstructed_waveform = griffin_lim_transform(linear_spectrogram)

display(Audio(reconstructed_waveform, rate=SAMPLE_RATE))