In [1]:
import torch
import torchaudio
import soundfile as sf 
import IPython.display as ipd
import pandas as pd
from utils import h_params

# Mix two audio samples

In [2]:
# Example inputs used throughout the notebook: one speech clip (FLAC) and one
# ambient-noise clip (WAV). Both paths are relative, so they resolve against
# the kernel's current working directory.
speech_path = "tests/test_data/speech/1462-170142-0001.flac"
ambient_path = "tests/test_data/ambient/14387-9-0-19.wav"

## Play Audio files
**speech**

In [3]:
# Decode the speech clip with soundfile; sf.read returns the sample data
# together with the sample rate stored in the file.
speech_wav, sr = sf.read(speech_path)
# Last expression of the cell, so the audio player is rendered as its output.
# The rate must be passed explicitly because raw samples carry no rate.
ipd.Audio(speech_wav, rate=sr)

**ambience**

In [4]:
# Here a path string is passed instead of decoded samples, so no `rate`
# argument is given; the player is expected to read the file itself.
ipd.Audio(ambient_path)

## Mix the audio files

In [5]:
# Mix the speech clip with the ambient clip and play the result.
# NOTE(review): `stereo_to_mono` and `match_sample_rate` are imported but not
# used in any cell visible here, and `h_params` is already imported in the
# first cell — consider consolidating these imports in the top cell.
from preprocessing import create_match_8k_libri, mix_samples, stereo_to_mono, match_sample_rate
from utils import h_params

# torchaudio.load returns a (waveform tensor, sample rate) pair.
speech_sample, sr_s = torchaudio.load(speech_path)
ambient_sample, sr_a = torchaudio.load(ambient_path)

# Shapes before any processing (recorded output: [1, 153360] for speech and
# [2, 176400] for ambient, i.e. the ambient clip has two channels).
print(speech_sample.shape)
print(ambient_sample.shape)
# Build the ambient->speech matching transform from the *configured* rates.
# NOTE(review): the rate actually read from the ambient file (`sr_a`) is not
# used; this presumably assumes the file matches h_params.sr_urban and that
# h_params.sr_libri equals `sr_s` — confirm against `preprocessing`.
match_8k_libri = create_match_8k_libri(h_params.sr_urban, h_params.sr_libri)
# Rebinds `ambient_sample` to the transformed tensor, so re-running this line
# alone (without reloading above) would apply the transform a second time.
ambient_sample = match_8k_libri(ambient_sample)

res = mix_samples(speech_sample, ambient_sample)

# Play the mix at the speech clip's sample rate.
ipd.Audio(res.numpy(), rate=sr_s)

torch.Size([1, 153360])
torch.Size([2, 176400])


## Check LibriSpeech metadata DataFrame

In [6]:
# Load the speech metadata table from the CSV path configured in h_params.
libri_meta = pd.read_csv(h_params.libri_meta)
# Show only the first rows; the trailing expression is rendered as a table.
libri_meta.head()

Unnamed: 0,ID,SEX,SUBSET,MINUTES,NAME,PATH
0,14,F,train-clean-360,25.03,Kristin LeMoine,train-clean-360/14/212/14-212-0005.flac
1,14,F,train-clean-360,25.03,Kristin LeMoine,train-clean-360/14/212/14-212-0052.flac
2,14,F,train-clean-360,25.03,Kristin LeMoine,train-clean-360/14/212/14-212-0044.flac
3,14,F,train-clean-360,25.03,Kristin LeMoine,train-clean-360/14/212/14-212-0013.flac
4,14,F,train-clean-360,25.03,Kristin LeMoine,train-clean-360/14/212/14-212-0029.flac


## Check UrbanSound8K metadata DataFrame

In [7]:
# Load the ambient-sound metadata table from the CSV path configured in h_params.
urban_meta = pd.read_csv(h_params.urban_meta)
# Show only the first rows; the trailing expression is rendered as a table.
urban_meta.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,PATH
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark,audio/fold5/100032-3-0-0.wav
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing,audio/fold5/100263-2-0-117.wav
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing,audio/fold5/100263-2-0-121.wav
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing,audio/fold5/100263-2-0-126.wav
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing,audio/fold5/100263-2-0-137.wav


In [8]:
# Count the clips whose `salience` value is 2 (reported below as "only
# background") and compare against the total number of rows.
is_background = urban_meta["salience"] == 2
n_background = int(is_background.sum())
n_total = len(urban_meta)
print(f'only background: {n_background}\nall samples: {n_total}')

only background: 3030
all samples: 8732


In [9]:
from utils import CWD

# Row of `urban_meta` to audition.
idx = 4
# The clip lives at <CWD>/<urban_path>/audio/fold<N>/<slice_file_name>, where
# both N and the file name come from the selected metadata row.
fold_dir = f"fold{urban_meta.loc[idx, 'fold']}"
file_name = str(urban_meta.loc[idx, "slice_file_name"])
path = CWD / h_params.urban_path / "audio" / fold_dir / file_name
# Note: this rebinds the notebook-level `sr` that was set when the speech clip was read.
ambient, sr = torchaudio.load(path)
# Trailing expression: render a player for the loaded clip at its own rate.
ipd.Audio(ambient.numpy(), rate=sr)