In [96]:
import torch
import torchaudio
import numpy as np
import pandas as pd
import librosa
from IPython.display import display, Audio
import math
import random
import os
from scipy.signal import convolve
import torch.nn as nn
import torch.optim as optim

SAMPLE_RATE = 16000

In [3]:
def get_seconds(audio):

    duration = (int)(audio.shape[0]/SAMPLE_RATE)
    audio_list = []
    
    for i in range(0, duration*SAMPLE_RATE, SAMPLE_RATE):
        audio_list.append(audio[i:i+SAMPLE_RATE])
    return audio_list

In [5]:
def round_up_audio(audio):
    
    rem = audio.shape[0]%SAMPLE_RATE
    zero_len = SAMPLE_RATE-rem
    added_arr = np.zeros(zero_len, audio.dtype)
    ext_audio = np.concatenate((audio, added_arr), axis=None)

    return ext_audio

In [125]:
def add_echo_from_file(filename, audio):

    rir_wav,sr_rir = librosa.load(filename, sr=SAMPLE_RATE)
    echo_audio = convolve(audio, rir_wav, mode='full')

    return echo_audio[0:SAMPLE_RATE]

In [7]:
def get_noise_from_sound(signal,noise,SNR):
    
    RMS_s=math.sqrt(np.mean(signal**2))
    #required RMS of noise
    RMS_n=math.sqrt(RMS_s**2/(pow(10,SNR/10)))
    
    #current RMS of noise
    RMS_n_current=math.sqrt(np.mean(noise**2))
    noise=noise*(RMS_n/RMS_n_current)
    
    return noise

In [33]:
def add_noise(audio, noise):

    SNR_list = [i for i in range(0,10)]
    SNR_choice = random.choice(SNR_list)
    
    noise = get_noise_from_sound(audio, noise, SNR_choice)
    noisy_audio = audio + noise
    return noisy_audio, noise

In [159]:
audio_sample = '../../LibriVox_Kaggle/achtgesichterambiwasse/achtgesichterambiwasse_0003.wav'
bg_dir = '../../LibriVox_Kaggle/BGnoise/'
rir_dir = '../../RIR/MIT_IR_Survey/Audio/'

bg_files = os.listdir(bg_dir)
rir_files = os.listdir(rir_dir)[1:]


def get_random_audio_sec(audio_filename):
    
    # Choosing a random background and echo filename
    bg_file = bg_dir + random.choice(bg_files)
    rir_file = rir_dir + random.choice(rir_files)

    # Extracting audio data
    wav, sr = librosa.load(audio_filename, sr=SAMPLE_RATE)
    bg_wav,sr =librosa.load(bg_file, sr=SAMPLE_RATE)

    # Randomising and normalising audio data
    wav = round_up_audio(wav)
    wav /= np.max(np.abs(wav), axis=0)
    bg_wav /= np.max(np.abs(bg_wav), axis=0)

    # Getting a random audio and bg second
    wav_duration = (int)(wav.shape[0]/SAMPLE_RATE)
    sec_choice = random.choice([i for i in range(0, wav_duration-1)])
    rand_audio_sec = wav[sec_choice*SAMPLE_RATE:(sec_choice+1)*SAMPLE_RATE]
    bg_duration = (int)(bg_wav.shape[0]/SAMPLE_RATE)
    temp = [i for i in range(0, bg_duration-1)]
    bg_random_sec = random.choice(temp)
    bg_random_wav = bg_wav[bg_random_sec*SAMPLE_RATE:((bg_random_sec+1)*SAMPLE_RATE)]
    
    # Adding echo and bg noise to the audio
    echo_audio = add_echo_from_file(rir_file, rand_audio_sec)
    #print(echo_audio.shape, bg_wav.shape)
    noisy_audio, noise = add_noise(echo_audio, bg_random_wav)

    noisy_audio = torch.from_numpy(noisy_audio).unsqueeze(0)
    noise = torch.from_numpy(noise).unsqueeze(0)

    return noisy_audio, noise



In [82]:
csv_file = '../../LibriVox_Kaggle/LibriVox_Kaggle_out.csv'

df = pd.read_csv(csv_file)
drop_list = ['orgText', 'outText', 'WER']
train_len = (int)(0.8*len(df))
df2 = df.drop(drop_list, axis=1)
df2 = df2.sample(frac=1)
df2_train = df2[0:train_len]
df2_test = df2[train_len:len(df2)-1]
#os.path.join(bg_dir, df.iloc[2,0])
df2_train.to_csv('only_audioFname_train.csv', index=False)
df2_test.to_csv('only_audioFname_test.csv', index=False)

In [160]:
from torch.utils.data import Dataset, DataLoader

class audioDataset(Dataset):

    def __init__(self, audio_csvfile, audio_dir):
        self.audio_df = pd.read_csv(audio_csvfile)
        self.audio_dir = audio_dir

    def __len__(self):
        return len(self.audio_df)
    
    def __getitem__(self, index):
        audio_path = os.path.join(self.audio_dir, self.audio_df.iloc[index, 0])

        audio, label = get_random_audio_sec(audio_path)

        return audio, label
        

In [161]:
aud_dir = '../../LibriVox_Kaggle/'
train_dataset = audioDataset('only_audioFname_train.csv', aud_dir)
test_dataset = audioDataset('only_audioFname_test.csv', aud_dir)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [120]:
audio, labs = next(iter(train_dataloader))
not(torch.isnan(audio).any()), torch.isnan(labs).any(), type(audio), type(labs)


(True, tensor(False), torch.Tensor, torch.Tensor)

In [162]:
torch.manual_seed(13)
torch.cuda.manual_seed(13)

class speechRemoval00(nn.Module):
    
    def __init__(self):
        super(speechRemoval00, self).__init__()
        self.encoder = nn.Sequential(
            nn.Conv1d(1, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(64, 32, kernel_size=3, padding=1),
            nn.ReLU()
            )
        
        self.decoder = nn.Sequential(
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(64, 1, kernel_size=3, padding=1),
            nn.ReLU()
            )
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)

        return x


In [163]:
torch.manual_seed(13)
torch.cuda.manual_seed(13)

device = 'cuda:1' if torch.cuda.is_available() else 'cpu'

model = speechRemoval00().to(device)

loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [164]:
epochs = 10

for epoch in range(0,epochs):

    for data in train_dataloader:

        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(inputs)
        
        # Compute loss
        loss = loss_fn(outputs, inputs)

        # BP and optim
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch + 1}/{epochs}] Loss: {loss.item()}")


Epoch [1/10] Loss: 0.03603314235806465
Epoch [2/10] Loss: 0.7521934509277344
Epoch [3/10] Loss: 0.04060957953333855
Epoch [4/10] Loss: 0.13763341307640076
Epoch [5/10] Loss: 0.014758163131773472
Epoch [6/10] Loss: 0.026880130171775818
Epoch [7/10] Loss: 0.06877470761537552
Epoch [8/10] Loss: 0.08155541867017746
Epoch [9/10] Loss: 0.01393034216016531
Epoch [10/10] Loss: 0.03397713601589203


In [181]:
torch.save(model.state_dict(), 'model_conf00.pt')

In [179]:
noisy_audio, noise = get_random_audio_sec(audio_sample)

display(Audio(noisy_audio, rate=SAMPLE_RATE))
display(Audio(noise, rate=SAMPLE_RATE))

model = model.to('cpu')
model.eval()
with torch.inference_mode():
    out_aud = model(noisy_audio)

display(Audio(out_aud, rate=SAMPLE_RATE))