In [12]:
import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import Convolve
import torchaudio.functional as F
from IPython.display import display, Audio
import numpy as np
import pandas as pd
import random
import os

In [6]:
aud_dir = '../../LibriVox_Kaggle/'
bg_dir = '../../LibriVox_Kaggle/BGnoise/'
rir_dir = '../../RIR/MIT_IR_Survey/Audio/'
train_csv_file = 'only_audioFname_train.csv'
test_csv_file = 'only_audioFname_test.csv'

bg_files = os.listdir(bg_dir)
rir_files = os.listdir(rir_dir)[1::]

SAMPLE_RATE = 16000

In [38]:
def resample_audio(audio, sr):
    resampled_audio = F.resample(audio, sr, SAMPLE_RATE)
    return resampled_audio

def stereo_to_mono(audio):
    new_audio = torch.mean(audio, dim=0).unsqueeze(0)
    return new_audio


def load_audio(aud_fname):
    
    raw_wav, sampleRate = torchaudio.load(aud_fname)
    if raw_wav.shape[0] == 2:
        raw_wav = stereo_to_mono(raw_wav)
    if sampleRate != SAMPLE_RATE:
        raw_wav = resample_audio(raw_wav, sampleRate)
    return raw_wav


def add_noise(audio, rir, noise_wav, snr):
    echo_audio = F.fftconvolve(audio, rir)[:,0:audio.shape[1]]
    noisy_audio = F.add_noise(echo_audio, noise_wav[:,0:audio.shape[1]], torch.Tensor([snr]))
    return noisy_audio

def random_second_choice(audio):
    duration = (int)(audio.shape[1]/SAMPLE_RATE)
    random_sec = random.choice([i for i in range(0, duration-1)])
    return random_sec

In [46]:
def get_data(filename):

    rir_fname = os.path.join(rir_dir,random.choice(rir_files))
    bg_fname = os.path.join(bg_dir, random.choice(bg_files))
    snr_choice = random.choice([5,10,20])

    wav = load_audio(filename)
    rir_ = load_audio(rir_fname)
    bg = load_audio(bg_fname)

    rand_wav_sec = random_second_choice(wav)
    rand_bg_sec = random_second_choice(bg)

    wav_sec = wav[:,rand_wav_sec*SAMPLE_RATE:(rand_wav_sec+1)*SAMPLE_RATE]
    bg_sec = bg[:,rand_bg_sec*SAMPLE_RATE:(rand_bg_sec+1)*SAMPLE_RATE]

    noisy_audio = add_noise(wav_sec, rir_, bg_sec, snr_choice)

    return noisy_audio, bg_sec


In [49]:
sample_aud = '../../LibriVox_Kaggle/achtgesichterambiwasse/achtgesichterambiwasse_0009.wav'

noisy, noise = get_data(sample_aud)


In [51]:
class audioDataset(Dataset):

    def __init__(self, audio_csvfile, aud_dir):
        self.audio_df = pd.read_csv(audio_csvfile)
        self.aud_dir = aud_dir

    def __len__(self):
        return len(self.audio_df)
    
    def __getitem__(self, index):
        audio_path = os.path.join(self.aud_dir, self.audio_df.iloc[index, 0])

        audio, label = get_data(audio_path)

        return audio, label

In [61]:
aud_dir = '../../LibriVox_Kaggle/'
train_csv_file = 'only_audioFname_train.csv'
test_csv_file = 'only_audioFname_test.csv'

train_dataset = audioDataset('only_audioFname_train.csv', aud_dir)
test_dataset = audioDataset('only_audioFname_test.csv', aud_dir)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [53]:
audio, labs = next(iter(train_dataloader))

(torch.Size([32, 1, 16000]), torch.Size([32, 1, 16000]))

In [54]:
torch.manual_seed(13)
torch.cuda.manual_seed(13)

class speechRemoval00(nn.Module):
    
    def __init__(self):
        super(speechRemoval00, self).__init__()
        self.encoder = nn.Sequential(
            nn.Conv1d(1, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(64, 32, kernel_size=3, padding=1),
            nn.ReLU()
            )
        
        self.decoder = nn.Sequential(
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(64, 1, kernel_size=3, padding=1),
            nn.ReLU()
            )
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)

        return x


In [62]:
torch.manual_seed(13)
torch.cuda.manual_seed(13)

device = 'cuda:2' if torch.cuda.is_available() else 'cpu'

model = speechRemoval00().to(device)

loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [63]:
epochs = 10

for epoch in range(0,epochs):

    loss_ten = torch.Tensor([])
    for data in train_dataloader:
        print('pass')
        model.train()
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(inputs)
        
        # Compute loss
        loss = loss_fn(outputs, inputs)

        # BP and optim
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_ten = torch.cat((loss_ten,torch.Tensor([loss.item()])),0)
    
    print(f"Epoch [{epoch + 1}/{epochs}] Loss: {torch.mean(loss_ten)}")


pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass


IndexError: Cannot choose from an empty sequence