# Meta-data generation script for Interspeech2024 speech enhancement dataset

Define the RIRs for the dataset: 60,000 utterances, split 70% / 15% / 15% into train / val / test.

In [1]:
import masp as srs
import numpy as np
import soundfile as sf
from IPython.display import Audio
import scipy
import copy
import pandas as pd
import os
from os.path import join as pjoin
from multiprocessing import Pool
import matplotlib.pyplot as plt
#import mat73
import librosa as lsa
import scipy.signal as sig

In [2]:
def get_6band_rt60_vector(rng=None):
    """Draw one random six-band RT60 vector.

    Each band's RT60 is sampled independently from a gamma distribution whose
    shape (alpha) and scale (beta) are hard-coded below. Check
    'notebooks/RT60_analysis_AEC.ipynb' for more info on these parameters.

    Args:
        rng: optional ``numpy.random.Generator`` to draw from. When omitted, a
            new generator seeded from fresh entropy is created on every call,
            so multiprocessing workers do not yield the same RT60s.

    Returns:
        numpy array of shape (6,) holding one RT60 sample per band.
    """
    # A private generator is used instead of np.random.seed() so that the
    # global NumPy RNG state (and any seed the caller set on it) is left
    # untouched by this function.
    if rng is None:
        rng = np.random.default_rng()
    alphas = np.array([1.7196874268124676,
                       1.6152228672267106,
                       1.9318203836226113,
                       2.55718115999814,
                       4.176814897493042,
                       2.4892656080814346])
    betas = np.array([0.38685390302225775,
                      0.24453641709737417,
                      0.14321372785643122,
                      0.10453218827453133,
                      0.08678871224845529,
                      0.18290733668646034])
    # Element-wise broadcast: one draw per (alpha, beta) pair.
    return rng.gamma(alphas, betas)


def place_on_circle(head_pos,r,azi, ele):
    """Place a source around a reference point (like the head).

    Args:
        head_pos: reference point, indexable as [x, y, z].
        r: distance from the reference point to the source.
        azi: azimuth in degrees, measured in the xy plane from the +x axis.
        ele: elevation in degrees, measured from the xy plane towards +z.

    Returns:
        numpy array [x, y, z] with the source position.
    """
    azi_rad = azi * (np.pi / 180)
    ele_rad = ele * np.pi / 180
    # Length of the offset once projected onto the horizontal (xy) plane.
    horizontal = r * np.cos(ele_rad)
    offset = (horizontal * np.cos(azi_rad),
              horizontal * np.sin(azi_rad),
              r * np.sin(ele_rad))
    return np.array([head_pos[axis] + offset[axis] for axis in range(3)])

In [3]:
np.random.seed(0)

In [4]:
n_utterances = 60000

In [5]:
# Draw one six-band RT60 vector per utterance and stack them row-wise,
# giving an array with one row per utterance and one column per band.
rt60s = np.array([get_6band_rt60_vector() for _ in range(n_utterances)])

In [6]:
rt60s.shape

(60000, 6)

In [7]:
# Centre frequencies of the six bands: start at 125 and double five times
# (125, 250, 500, 1000, 2000, 4000).
band_centerfreqs = 125 * 2.0 ** np.arange(6)

In [8]:
len(rt60s)

60000

In [9]:
# Single RT60 value per utterance: the arithmetic mean over its six bands.
rt60 = np.array([np.mean(x) for x in rt60s])

In [10]:
np.min(rt60)

0.09242059579147861

In [11]:
np.max(rt60)

1.2455474143266876

In [None]:
# Overlay the RT60 histogram of every band, plus the histogram of the
# band-averaged RT60, on a single figure.
for band in range(rt60s.shape[1]):
    plt.hist(rt60s[:, band], 500, density=True, alpha=0.8)
plt.hist(rt60, 500, density=True, alpha=0.8)

plt.legend([str(int(x))+'Hz' for x in band_centerfreqs] + ['mean'])
#plt.title('RT60 histogram')
plt.xlabel('RT60[s]')
# density=True normalises every histogram to a probability density, so the
# vertical axis is a density rather than a raw count.
plt.ylabel('density')
plt.xlim([0, 2])
plt.grid(True)

In [None]:
def head_2_ku_ears(head_pos,head_orient):
    """Compute the coordinates of the two ears from head position and orientation.

    Only head_orient[0] (the azimuth, in degrees) is used. Both ears keep the
    head's z coordinate and sit 0.0875 away from the head centre (presumably
    metres -- confirm), on opposite sides, along the horizontal direction
    perpendicular to the azimuth.

    Args:
        head_pos: head centre, indexable as [x, y, z].
        head_orient: head orientation, indexable; element 0 is the azimuth.

    Returns:
        [L_ear, R_ear], each a list [x, y, z]. L_ear is the point offset by
        (+sin(azimuth), -cos(azimuth)) in (x, y); R_ear is the opposite one.
    """
    ear_distance_ku100 = 0.0875
    theta = (head_orient[0]) * np.pi / 180
    # Offsets of the ear axis along x and y.
    shift_x = ear_distance_ku100 * np.sin(theta)
    shift_y = ear_distance_ku100 * np.cos(theta)
    L_ear = [head_pos[0] + shift_x, head_pos[1] - shift_y, head_pos[2]]
    R_ear = [head_pos[0] - shift_x, head_pos[1] + shift_y, head_pos[2]]
    return [L_ear, R_ear]
    
def plot_scene(room_dims,head_pos,head_orient,l_mic_pos,l_src_pos, src_orient, perspective="xy"):
    """Plot the designed scene as a 2-D projection on a new matplotlib figure.

    Args:
        room_dims: dimensions of the room [x, y, z].
        head_pos: head position [x, y, z].
        head_orient: head orientation [az, el]; only the azimuth is drawn.
        l_mic_pos: two ear positions [[x, y, z], [x, y, z]]; the first is
            drawn in blue (left ear), the second in red (right ear).
        l_src_pos: list of source positions [[x, y, z], ..., [x, y, z]].
        src_orient: source orientation [az, el]; only the azimuth is drawn,
            and only at the last source of l_src_pos.
        perspective: which two dimensions to show: "xy", "yz" or "xz".

    Raises:
        ValueError: if perspective is not one of "xy", "yz", "xz".
    """
    # (coordinate on the horizontal plot axis, coordinate on the vertical one)
    axes_by_perspective = {"xy": (1, 0), "yz": (2, 1), "xz": (2, 0)}
    if perspective not in axes_by_perspective:
        # Fail early instead of hitting an unbound dim1/dim2 further down.
        raise ValueError(
            "perspective must be one of 'xy', 'yz', 'xz', got %r" % (perspective,))
    dim1, dim2 = axes_by_perspective[perspective]

    fig = plt.figure()
    ax = fig.add_subplot()
    plt.xlim((0,room_dims[dim1]))
    plt.ylim((0,room_dims[dim2]))
    plt.axvline(head_pos[dim1], color='y') # vertical line through the head
    plt.axhline(head_pos[dim2], color='y') # horizontal line through the head
    plt.grid(True)
    # plot the head centre
    plt.plot(head_pos[dim1],head_pos[dim2], "o", ms=10, mew=2, color="black")
    # plot ears
    plt.plot(l_mic_pos[0][dim1],l_mic_pos[0][dim2], "o", ms=3, mew=2, color="blue")# left ear in blue
    plt.plot(l_mic_pos[1][dim1],l_mic_pos[1][dim2], "o", ms=3, mew=2, color="red")# right ear in red

    # plot the sources, each annotated with its index
    last_src = None
    for i,src_pos in enumerate(l_src_pos):
        plt.plot(src_pos[dim1],src_pos[dim2], "o", ms=10, mew=2, color="red")
        plt.annotate(str(i), (src_pos[dim1],src_pos[dim2]))
        last_src = src_pos
    # plot head and source orientation if looking from above
    if perspective=="xy":
        plt.plot(head_pos[dim1],head_pos[dim2], marker=(1, 1, -head_orient[0]), ms=20, mew=2,color="black")
        # With no sources there is nothing to attach the source marker to.
        if last_src is not None:
            plt.plot(last_src[dim1],last_src[dim2], marker=(1, 1, -src_orient[0]), ms=20, mew=2,color="red")

    ax.set_aspect('equal', adjustable='box')


In [None]:
# Random head orientation per utterance, in degrees (the azimuth is converted
# with pi/180 in head_2_ku_ears): azimuth uniform in [-180, 175), elevation
# uniform in [-25, 20).
head_orient_azi = np.random.uniform(low = -180, high = 175, size = n_utterances)
head_orient_ele = np.random.uniform(low = -25, high = 20, size = n_utterances)


#angle = np.random.uniform(low = -45, high = 45, size = len(df))
# Head-to-source distance per utterance, uniform in [0.5, 3); used as the
# radius passed to place_on_circle further down.
dist = np.random.uniform(low = 0.5, high = 3, size = n_utterances)
#snr = np.random.uniform(low = 0, high = 6, size = len(df))

In [None]:
# Random room dimensions per utterance: x uniform in [3, 30), y between 50%
# and 100% of x, z (height) uniform in [2.5, 5).
room_x = np.random.uniform(low = 3., high = 30., size = n_utterances)
room_y = room_x * np.random.uniform(low=0.5, high=1, size=n_utterances) #avoid tunnels
room_z = np.random.uniform(low = 2.5, high = 5., size = n_utterances)

In [None]:
# Order the rooms by volume. The dimension arrays themselves have to be
# reordered: sorting only the volume values would leave room_x / room_y /
# room_z in random order, so the rank-ordered RT60s and distances built in
# the following cells would not be paired with rooms of matching size when
# the common permutation is applied.
volumes = room_x * room_y * room_z
perm_vol = np.argsort(volumes)
room_x = room_x[perm_vol]
room_y = room_y[perm_vol]
room_z = room_z[perm_vol]
volumes = volumes[perm_vol]

In [None]:
# We also sort the RT60 vectors, by their maximum value across bands.

In [None]:
volumes

In [None]:
# Indices that order the utterances by the largest RT60 among their six bands.
perm_rt60s = np.argsort(np.max(rt60s, 1))

In [None]:
# Reorder the band-averaged RT60 with the max-RT60 ordering computed above.
# NOTE(review): re-running this cell applies the ordering a second time.
rt60 = rt60[perm_rt60s]

In [None]:
rt60s.shape

In [None]:
# Reorder whole rows (all six bands at once) with the max-RT60 ordering.
rt60s = rt60s[perm_rt60s]

In [None]:
# Sort the distances in ascending order, then draw one random permutation of
# the utterance indices; the next cell applies it to rooms, distances and RT60s.
dist = np.sort(dist)
perm = np.random.permutation(len(volumes))

In [None]:
# Apply the same shuffle to every per-utterance quantity.
room_x, room_y, room_z = room_x[perm], room_y[perm], room_z[perm]
dist = dist[perm]
rt60 = rt60[perm]
rt60s = rt60s[perm]

# Head centre per utterance: x and y uniformly inside the central 35%-65% of
# the room, z uniform in [1, 2). The draws are made utterance by utterance in
# x, y, z order.
head_pos = np.array([
    [np.random.uniform(low = 0.35*room_x[k], high = 0.65*room_x[k]),
     np.random.uniform(low = 0.35*room_y[k], high = 0.65*room_y[k]),
     np.random.uniform(low = 1., high = 2.)]
    for k in range(len(room_x))
])
# One row [x, y, z] of room dimensions per utterance.
room = np.stack((room_x, room_y, room_z), axis=1)

In [None]:
srcs = []

# Direction of the target relative to where the head is looking, in degrees.
src_target_azi = np.random.uniform(low = -45., high = 45., size = n_utterances)
src_target_ele = np.random.uniform(low = -20., high = 20., size = n_utterances)

for k in range(len(room_x)):
    azi_k = head_orient_azi[k] + src_target_azi[k]
    ele_k = head_orient_ele[k] + src_target_ele[k]
    while True:
        src_pos = place_on_circle(head_pos[k], dist[k], azi_k, ele_k)
        # Keep every coordinate at least 0.2 away from the walls at 0.
        src_pos[src_pos < 0.2] = 0.2
        if not np.any(src_pos > room[k] - 0.2):
            break
        # Too close to (or beyond) a far wall: shrink the distance by 10% and retry.
        dist[k]*=0.9
        print('correcting to r ', dist[k])
    srcs.append(src_pos)
srcs = np.array(srcs)

In [None]:
# Random offsets (degrees) added to the head orientation when the source
# orientation is computed below: azimuth in [-45, 45), elevation in [-20, 20).
src_azi_dis = np.random.uniform(low = -45., high = 45., size = n_utterances)
src_ele_dis = np.random.uniform(low = -20., high = 20., size = n_utterances)

In [None]:
src_azi = np.zeros_like(src_azi_dis)
src_ele = np.zeros_like(src_ele_dis)

In [None]:
# Source orientation: head orientation plus the random offset, shifted by 180
# degrees in azimuth and by 90 degrees in elevation.
src_azi = head_orient_azi + src_azi_dis + 180
src_ele = head_orient_ele + src_ele_dis + 90

# Bring both angles back into [-180, 180) with a single 360-degree wrap
# (not a modulo), then map exactly 180 to -180.
for angles in (src_azi, src_ele):
    too_low = angles < -180
    too_high = ~too_low & (angles > 180)
    angles[too_low] += 360
    angles[too_high] -= 360
    angles[angles == 180] = -180

In [None]:
head_orient_azi

In [None]:
src_azi

In [None]:
src_azi_dis

In [None]:
# Checks:
np.all(srcs < room) # all targets are in the room

In [None]:
np.all(head_pos < room) # all heads are in the room

In [None]:
# now let's check the ears:
# now let's check the ears: ear positions for every utterance, indexed as
# [utterance, ear (0 = left, 1 = right), xyz]
ears_pos = np.array([
    head_2_ku_ears(pos, np.array([azi, ele]))
    for pos, azi, ele in zip(head_pos, head_orient_azi, head_orient_ele)
])

In [None]:
np.all(ears_pos[:, 0, :] < room) # all left ears are in the room

In [None]:
np.all(ears_pos[:, 1, :] < room) # all right ears are in the room

In [None]:
np.all(ears_pos > 0)

In [None]:
# Final MINIMUM distance between head centre and target: it must exceed
# 2 * 0.0875 (twice the ear offset used in head_2_ku_ears), so that no target
# ends up inside the head (no intracranial target).
min(np.sqrt(np.sum((srcs - head_pos)**2, axis=1))) > 0.0875 * 2

In [None]:
# minimum distance of ears against a wall
min ( min(room[:, 0] - ears_pos[:, 0, 0]), min(room[:, 0] - ears_pos[:, 1, 0]))

In [None]:
min ( min(room[:, 1] - ears_pos[:, 0, 1]), min(room[:, 1] - ears_pos[:, 1, 1]))

In [None]:
min ( min(room[:, 2] - ears_pos[:, 0, 2]), min(room[:, 2] - ears_pos[:, 1, 2]))

In [None]:
# minimum distance of targets against a wall
min(min(room[:, 0] - srcs[:, 0]), min(room[:, 1] - srcs[:, 1]), min(room[:, 2] - srcs[:, 2]))

In [None]:
np.array([src_azi, src_ele]).shape

In [None]:
np.array([head_orient_azi[k], head_orient_ele[k]]).shape

In [None]:
# Visual sanity check: top view (xy) of the first 30 scenes.
for k in range(30):
    head_orient_k = np.array([head_orient_azi[k], head_orient_ele[k]])
    src_orient_k = np.array([src_azi[k], src_ele[k]])
    ears_k = head_2_ku_ears(head_pos[k], head_orient_k)
    plot_scene(room[k], head_pos[k], head_orient_k, ears_k,
               [srcs[k]], src_orient_k, perspective="xy")

In [None]:
# We split into train val and test set:

In [None]:
# Split sizes: 70% train, 15% val, and the test set takes whatever remains.
# Truncating all three shares independently can sum to fewer than
# n_utterances labels (whenever a share is not a whole number), which would
# make the permutation indexing below fail.
n_train = int(n_utterances*0.7)
n_val = int(n_utterances*0.15)
n_test = n_utterances - n_train - n_val
sets = np.array(['train'] * n_train + ['val'] * n_val + ['test'] * n_test)

# Assign the split labels to utterances at random.
perm2 = np.random.permutation(n_utterances)
sets = sets[perm2]

# store data
df = pd.DataFrame({'id': range(n_utterances), 'room_x': room_x, 'room_y': room_y, 'room_z': room_z, 
                   'rt60': rt60, 'rt60_125hz': rt60s[:, 0], 'rt60_250hz': rt60s[:, 1], 'rt60_500hz': rt60s[:, 2],
                   'rt60_1000hz': rt60s[:, 3], 'rt60_2000hz': rt60s[:, 4], 'rt60_4000hz': rt60s[:, 5],
                   'headC_x' : head_pos[:, 0], 'headC_y' : head_pos[:,1], 'headC_z' : head_pos[:, 2],
                   'head_azi' : head_orient_azi, 'head_ele' : head_orient_ele,
                   'src_x' : srcs[:, 0], 'src_y' : srcs[:, 1], 'src_z' : srcs[:, 2],
                   'src_azi' : src_azi, 'src_ele' : src_ele, 'set': sets})

In [None]:
#df.to_csv('meta_ins24.csv', index=False, compression='infer')