In [1]:
import masp as srs
import numpy as np
import soundfile as sf
from IPython.display import Audio
import scipy
import copy
import pandas as pd
import os
from os.path import join as pjoin
from multiprocessing import Pool
import matplotlib.pyplot as plt
import mat73
import tqdm
import pyrubberband as pyrb


In [2]:
# import my modules (helpers.py where I stored all the functions):
import helpers as hlp
import importlib
importlib.reload(hlp);

In [3]:
# WE DEFINE THE CHUNK LENGHT TO 4 SECONDS (DEFAULT TO USING SUDO-RM-RF)
# SET PATHS:
mls_path = '/home/ubuntu/Data/mls_spanish'
wham_path = '/home/ubuntu/Data/wham'
output_path = '/home/ubuntu/Data/ha_scenes_sounds/'
fs = 16000
fs_n = 'wav16k'
mode = 'min'

In [4]:
def assign_chunks_by_duration(seconds_list):
    # Files that can reasonably be padded will be padded.
    # Otherwise we crop
    out = []
    chunk = []
    low_bound = np.array([(x+1)*2 for x in list(range(11))])   
    hi_bound = np.array([(x+2)*2 for x in list(range(11))])
    n_chunks = np.repeat(np.array([x+1 for x in list(range(6))]), 2)
    n_chunks = n_chunks[:len(low_bound)]
    for s in seconds_list:
        x = n_chunks[np.argmax(np.logical_and(s > low_bound, s < hi_bound))]
        out.append(x)
        chunk.append(list(range(x)))
    return out, chunk   

In [5]:
'''
noise_info = []
for split in ['tr', 'cv', 'tt']:
    split_path = pjoin(pjoin(wham_path, 'wham_noise'), split)
    for audio_path in  tqdm.tqdm(os.listdir(split_path)):
        audio, fs = sf.read(pjoin(split_path, audio_path))
        noise_info.append({'split': split, 'audio_path': pjoin(split_path, audio_path), 'size_mb': os.stat(pjoin(split_path, audio_path)).st_size / (1024 * 1024), 'fs': fs, 'shape':audio.shape})
        

ometa = pd.read_csv(pjoin(mls_path, 'metainfo.txt'), sep= '|')
ometa.columns = ometa.columns.str.strip()
ometa['PARTITION'] = ometa['PARTITION'].str.strip()

info = []
for split in ['train', 'test', 'dev']:
    split_path = pjoin(pjoin(mls_path, split), 'audio')
    speakers = os.listdir(split_path)
    for speaker in speakers:
        speaker_path = pjoin(split_path, speaker)
        books = os.listdir(speaker_path)
        for book in books:
            book_path = pjoin(speaker_path, book)
            audio_paths = os.listdir(book_path)
            for audio_path in audio_paths:
                gender = list(ometa[ometa['SPEAKER']==int(speaker)]['GENDER'])[0].strip()
                info.append({'split': split, 'speaker': speaker, 'book': book, 'audio_path': pjoin(book_path, audio_path), 'gender': gender})

df_s = pd.DataFrame(info)
df_n = pd.DataFrame(noise_info)


lens = list(df_n['shape'])

df_n.iloc[0]['audio_path']

df_s.to_pickle("mls_info.pkl")
df_n.to_pickle("wham_info.pkl")
''';

In [6]:
df_s = pd.read_pickle("mls_info.pkl")
df_n = pd.read_pickle("wham_info.pkl")

In [7]:
def augment_wham_train_set(tr, speech_len, fs):
    tr = tr.reset_index(drop=True)
    # Store lenght (in samples and seconds) from shape
    tr.insert(2, "len_samp", [x[0] for x in list(tr['shape'])], True)
    tr.insert(2, "len_s", [x[0]/fs for x in list(tr['shape'])], True)
    
    # Assign a number of chunks. Chunks from [2,4]s are expanded to one 4s chunk. 
    # Chunks from [4,6]s are cropped to one 4s chunk
    nchunks, chunk = assign_chunks_by_duration(list(tr['len_s']))
    tr.insert(2, "num_chunks", nchunks, True)
    lentr = len(tr)
    tr = tr.reindex(tr.index.repeat(tr.num_chunks))
    tr = tr.reset_index(drop=True)
    tr.insert(2, "chunk", [item for sublist in chunk for item in sublist], True)
    # One copy will have phase inversion (*(-1))
    L = [False, True]
    tr = (pd.DataFrame(np.repeat(tr.values, 2, axis=0), columns=tr.columns)
               .assign(phase_inv = np.tile(L, len(tr))))
    # The other copy will have a swap of left and right channels
    tr = (pd.DataFrame(np.repeat(tr.values, 2, axis=0), columns=tr.columns)
               .assign(lr_inv = np.tile(L, len(tr))))
    # The rest of utterances we have to augment will randomly time-streched
    stretch_utt = speech_len - len(tr)
    stretch  = np.concatenate((np.zeros(len(tr)), np.random.uniform(low=0.9, high=1.1, size=(stretch_utt))))
    tr = pd.concat([tr, tr[0:stretch_utt]])
    tr.insert(11, "stretch", stretch, True)
    tr = tr.reset_index(drop=True)
    tr = tr.rename(columns={'audio_path': 'noise_path'})
    tr = tr.rename(columns={'split': 'wham_split'})
    return tr

def crop_wham_test_set(tr, speech_len, fs):
    tr = tr.reset_index(drop=True)
    tr.insert(2, "len_samp", [x[0] for x in list(tr['shape'])], True)
    tr.insert(2, "len_s", [x[0]/fs for x in list(tr['shape'])], True)
    tr = tr[tr.len_s > 4.]
    tr = tr[:speech_len]
    tr = tr.reset_index(drop=True)
    tr.insert(2, "chunk", [0]*len(tr))
    tr.insert(3, "num_chunks", [1]*len(tr))
    tr.insert(9, "phase_inv", [False]*len(tr))
    tr.insert(10, "lr_inv", [False]*len(tr))
    tr.insert(11, "stretch", np.zeros(len(tr)))
    tr = tr.rename(columns={'audio_path': 'noise_path'})
    tr = tr.rename(columns={'split': 'wham_split'})
    return tr

In [8]:
tr = augment_wham_train_set(df_n[df_n['split']=='tr'], len(df_s[df_s['split']=='train']), fs)
cv = crop_wham_test_set(df_n[df_n['split']=='cv'], len(df_s[df_s['split']=='dev']), fs)
tt = crop_wham_test_set(df_n[df_n['split']=='tt'], len(df_s[df_s['split']=='test']), fs)

In [9]:
aug_wham = pd.concat([tr, tt, cv], axis=0)
aug_wham = aug_wham.reset_index(drop=True)

In [10]:
# now we merge the augmented wham metadata with the MLS-spanish metadata
df = pd.concat([df_s, aug_wham], axis=1)

In [11]:
df

Unnamed: 0,split,speaker,book,audio_path,gender,wham_split,noise_path,chunk,num_chunks,len_s,len_samp,size_mb,fs,shape,phase_inv,lr_inv,stretch
0,train,8688,8509,/home/ubuntu/Data/mls_spanish/train/audio/8688...,M,tr,/home/ubuntu/Data/wham/wham_noise/tr/012o030z_...,0,2,9.290938,148655,1.134232,16000,"(148655, 2)",False,False,0.0
1,train,8688,8509,/home/ubuntu/Data/mls_spanish/train/audio/8688...,M,tr,/home/ubuntu/Data/wham/wham_noise/tr/012o030z_...,0,2,9.290938,148655,1.134232,16000,"(148655, 2)",False,True,0.0
2,train,8688,8509,/home/ubuntu/Data/mls_spanish/train/audio/8688...,M,tr,/home/ubuntu/Data/wham/wham_noise/tr/012o030z_...,0,2,9.290938,148655,1.134232,16000,"(148655, 2)",True,False,0.0
3,train,8688,8509,/home/ubuntu/Data/mls_spanish/train/audio/8688...,M,tr,/home/ubuntu/Data/wham/wham_noise/tr/012o030z_...,0,2,9.290938,148655,1.134232,16000,"(148655, 2)",True,True,0.0
4,train,8688,8509,/home/ubuntu/Data/mls_spanish/train/audio/8688...,M,tr,/home/ubuntu/Data/wham/wham_noise/tr/012o030z_...,1,2,9.290938,148655,1.134232,16000,"(148655, 2)",False,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225489,dev,2592,1378,/home/ubuntu/Data/mls_spanish/dev/audio/2592/1...,F,cv,/home/ubuntu/Data/wham/wham_noise/cv/01vo030s_...,0,1,10.816313,173061,1.320435,16000,"(173061, 2)",False,False,0.0
225490,dev,2592,1378,/home/ubuntu/Data/mls_spanish/dev/audio/2592/1...,F,cv,/home/ubuntu/Data/wham/wham_noise/cv/40no0304_...,0,1,13.52325,216372,1.650871,16000,"(216372, 2)",False,False,0.0
225491,dev,2592,1378,/home/ubuntu/Data/mls_spanish/dev/audio/2592/1...,F,cv,/home/ubuntu/Data/wham/wham_noise/cv/01ko030u_...,0,1,14.284625,228554,1.743813,16000,"(228554, 2)",False,False,0.0
225492,dev,2592,1378,/home/ubuntu/Data/mls_spanish/dev/audio/2592/1...,F,cv,/home/ubuntu/Data/wham/wham_noise/cv/014c0208_...,0,1,6.931,110896,0.846153,16000,"(110896, 2)",False,False,0.0


In [12]:
# import my modules (helpers.py where I stored all the functions):
import helpers as hlp
import importlib
importlib.reload(hlp);

head_orient_azi = np.random.uniform(low = -45, high = 45, size = len(df))
head_orient_ele = np.random.uniform(low = -10, high = 10, size = len(df))

angle = np.random.uniform(low = -45, high = 45, size = len(df))
dist = np.random.uniform(low = 0.5, high = 3, size = len(df))
snr = np.random.uniform(low = -6, high = 6, size = len(df))

In [13]:
room_x = np.random.uniform(low = 2., high = 30., size = len(df))
room_y = np.random.uniform(low = 2., high = 30., size = len(df))
room_z = np.random.uniform(low = 2.5, high = 5., size = len(df))
np.random.shuffle(room_x)
np.random.shuffle(room_y)
np.random.shuffle(room_z)

In [14]:
t60s =  np.random.uniform(low = .1, high = 1., size = len(df))
t60s = np.sort(t60s)
volumes = room_x * room_y * room_z
volumes = np.sort(volumes)
dist = np.sort(dist)
perm = np.random.permutation(len(volumes))
room_x = room_x[perm]
room_y = room_y[perm]
room_z = room_z[perm]
dist = dist[perm]
t60s = t60s[perm]

In [15]:
head_pos = []
for k in range(len(room_x)):
    head_pos.append(np.array([np.random.uniform(low = 0.35*room_x[k], high = 0.65*room_x[k]),
                        np.random.uniform(low = 0.35*room_y[k], high = 0.65*room_y[k]),
                        np.random.uniform(low = 1., high = 2.)]))

In [16]:
head_pos = np.array(head_pos)

In [17]:
room = np.array((room_x, room_y, room_z)).T

In [18]:
target_pos = []
for k in range(len(room_x)):
    new_target_pos, new_head_pos = hlp.place_on_circle_in_room(head_pos[k], dist[k], 
                                                               angle[k]+head_orient_azi[k], room[k])
    head_pos[k] = new_head_pos
    target_pos.append(new_target_pos)


In [19]:
target_pos = np.squeeze(np.array(target_pos))

In [20]:
# Checks:
np.all(target_pos < room) # all targets are in the room

True

In [21]:
np.all(head_pos < room) # all heads are in the room

True

In [22]:
# now let's check the ears:
ears_pos = []
for k in range(head_pos.shape[0]):
    ears_pos.append(np.array(hlp.head_2_ku_ears(head_pos[k], np.array([head_orient_azi[k],head_orient_ele[k]]))))

In [23]:
ears_pos = np.array(ears_pos)

In [24]:
ears_pos.shape

(225494, 2, 3)

In [25]:
np.all(ears_pos[:, 0, :] < room) # all left ears are in the room

True

In [26]:
np.all(ears_pos[:, 1, :] < room) # all right are in the room

True

In [27]:
# final MINIMUM distance between head and target (check we don't have an intra-craneal target)
min(np.sqrt(np.sum((target_pos - head_pos)**2, axis=1))) > 0.0875 * 2

True

In [28]:
# minimum distance of ears against a wall
min ( min(room[:, 0] - ears_pos[:, 0, 0]), min(room[:, 0] - ears_pos[:, 1, 0]))

0.6874933440489601

In [29]:
min ( min(room[:, 1] - ears_pos[:, 0, 1]), min(room[:, 1] - ears_pos[:, 1, 1]))

0.6255256520512633

In [30]:
min ( min(room[:, 2] - ears_pos[:, 0, 2]), min(room[:, 2] - ears_pos[:, 1, 2]))

0.5024764954279237

In [31]:
target_pos.shape

(225494, 3)

In [32]:
room.shape

(225494, 3)

In [33]:
# minimum distance of targets against a wall
min(min(room[:, 0] - target_pos[:, 0]), min(room[:, 1] - target_pos[:, 1]), min(room[:, 2] - target_pos[:, 2]))

0.20000164551057864

In [34]:
df.insert(17, "room_x", room[:, 0])
df.insert(18, "room_y", room[:, 1])
df.insert(19, "room_z", room[:, 2])
df.insert(20, "rt60", t60s)
df.insert(21, "headC_x", head_pos[:,0])
df.insert(22, "headC_y", head_pos[:,1])
df.insert(23, "headC_z", head_pos[:,2])
df.insert(24, "src_x", target_pos[:,0])
df.insert(25, "src_y", target_pos[:,1])
df.insert(26, "src_z", target_pos[:,2])
df.insert(27, "headOrient_azi", head_orient_azi)
df.insert(28, "headOrient_ele", head_orient_ele)

In [35]:
'''
# generate a figure for each situation:
for k in tqdm.tqdm(range(head_pos.shape[0])):
    hlp.plot_scene(room[k], head_pos[k], np.array([head_orient_azi[k], head_orient_ele[k]])
                   , ears_pos[k],[target_pos[k]], perspective="xy")
    plt.title(str(head_orient_azi[k])+ '_' + str(angle[k]))
    plt.savefig(pjoin('situation_plots_rot', os.path.splitext(os.path.basename(df.iloc[k].audio_path))[0]+'.pdf'))
    plt.close('all')
''';
