In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib
%matplotlib inline

In [None]:
import paderbox as pb

In [None]:
from collections import defaultdict
import itertools
def plot_meeting(ex):
    """Plot the speaker activity of a single mixture/meeting example.

    Delegates to `plot_meetings` with a one-element dataset so that the
    activity computation lives in one place. The figure layout is the same as
    before: one column, figure size (10, 3).

    Args:
        ex: Example dict. `plot_meetings` reads its `offset`, `speaker_id`
            and `num_samples` entries.
    """
    plot_meetings([ex], number=1, columns=1, figure_width=10)
        
def plot_meetings(generator_dataset, number=6, columns=3, figure_width=10):
    """Plot one speaker-activity diagram per example of a dataset.

    Args:
        generator_dataset: Iterable of example dicts. Each example must
            provide `offset`, `speaker_id` and `num_samples`.
        number: Maximum number of examples taken from `generator_dataset`.
        columns: Forwarded as `columns` to `pb.visualization.axes_context`.
        figure_width: First entry of the `figure_size` passed to
            `pb.visualization.axes_context`; the second entry is fixed to 3.
    """
    figure_size = (figure_width, 3)
    with pb.visualization.axes_context(columns=columns, figure_size=figure_size) as axes:
        for example in itertools.islice(generator_dataset, number):
            # Utterance lengths: try `num_samples.original_source` first and
            # fall back to `num_samples.speech_source` on a KeyError.
            try:
                lengths = pb.utils.nested.get_by_path(
                    example, 'num_samples.original_source', allow_early_stopping=True)
            except KeyError:
                lengths = pb.utils.nested.get_by_path(
                    example, 'num_samples.speech_source', allow_early_stopping=True)

            # One interval array per speaker, created on first access.
            per_speaker = defaultdict(pb.array.interval.zeros)
            for start, length, speaker in zip(example['offset'], lengths, example['speaker_id']):
                stop = start + length
                per_speaker[speaker][start:stop] = True

            pb.visualization.plot.activity(per_speaker, ax=axes.new)

In [None]:
# `autoreload` is already loaded and configured in the first cell of this notebook,
# so only the generator import is needed here.
from padercontrib.database.sms_librispeech_meeting import mixture_generator as g

## Preparation: Prepare input dataset
The mixture/meeting generators are generic, i.e., they work with any database that contains examples of single-speaker speech.
The input database has to have its examples in the correct format, i.e., they have to contain the correct keys.

The examples have to have the following format:
 - `example_id` (`str`): The ID of the input example. Has to be unique in the input dataset
 - `num_samples` (`int`): The number of samples in the example
 - `speaker_id` (`str`): The ID of the speaker that uttered the speech in this example
 - `audio_path.observation` (`str`): The path to the audio, will later be in `audio_path.original_source` (**TODO: the generator does not use this key, it only copies it. Should it be mandatory?**)
 
For meeting data additionally:
 - `scenario` (`str`): An identifier for the "scenario" of an utterance. The scenario should not change for a single speaker within a meeting. E.g., in LibriSpeech the scenario should combine the speaker ID and the chapter ID: `f"{speaker_id}_{chapter_id}"`

All other keys are simply copied over from the input examples, so all information present in the input examples will be present in the generated mixtures.

In [None]:
# Prepare input datasets: instantiate the `WSJ_8kHz` database and choose the
# dataset name that is used for both the speech examples and the RIR scenarios below
from padercontrib.database.wsj import WSJ_8kHz
db = WSJ_8kHz()
dataset_name = 'test_eval92'

def wsj_format_fn(example):
    """Bring a WSJ example into the input format expected by the generators.

    `num_samples` is replaced by its `observation` entry and `scenario` is set
    to the speaker ID. A shallow copy is returned, so the input example is not
    modified and the function can safely be applied to the same example again.

    Args:
        example: Example dict with `num_samples['observation']` and `speaker_id`.

    Returns:
        A new dict with all keys of `example`, the flattened `num_samples`
        and the additional key `scenario`.
    """
    return {
        **example,
        'num_samples': example['num_samples']['observation'],
        'scenario': example['speaker_id'],
    }

# Apply `wsj_format_fn` to every example of the selected dataset
input_ds = db.get_dataset(dataset_name).map(wsj_format_fn)

# If we want to have reverberation, prepare the RIR dataset
# NOTE(review): the scenarios file is an absolute, site-specific path; adjust it for your environment
rir_ds = g.rir_dataset_from_scenarios('/net/db/sms_wsj/rirs/scenarios.json', dataset_name)

In [None]:
# Show the first example of the formatted input dataset
input_ds[0]

## Fully overlapped mixtures

In [None]:
# Deterministic, anechoic
# TODO: num_speakers -> list

ds = (
    # Compute a composition of base examples. This makes sure that the speaker distribution
    # in the mixtures is equal to the speaker distribution in the original database.
    g.get_composition_dataset(input_dataset=input_ds, num_speakers=2)
    # If required: Offset the utterances
    .map(g.ConstantOffsetSampler())
    # If required: Add log_weights to simulate volume differences
    .map(g.UniformLogWeightSampler(max_weight=5))
)

# Show the number of mixtures together with the first mixture
(len(ds), ds[0])

In [None]:
# Plot the speaker activity of the first examples in `ds` (`plot_meetings` defaults: 6 examples, 3 columns)
plot_meetings(ds)

In [None]:
# Deterministic, anechoic, no offset, like WSJ0-2mix
# TODO: num_speakers -> list

ds = (
    # Compute a composition of base examples. This makes sure that the speaker distribution
    # in the mixtures is equal to the speaker distribution in the original database.
    g.get_composition_dataset(input_dataset=input_ds, num_speakers=2)
    # If required: Offset the utterances (a constant offset of 0 here)
    .map(g.ConstantOffsetSampler(0))
    # If required: Add log_weights to simulate volume differences
    .map(g.UniformLogWeightSampler(max_weight=5))
)

# Show the number of mixtures together with the first mixture
(len(ds), ds[0])

In [None]:
# Plot the speaker activity of the first examples in `ds` (`plot_meetings` defaults: 6 examples, 3 columns)
plot_meetings(ds)

In [None]:
# If required: Add reverberation
# Note: this extends the `ds` built in the cells above; re-running this cell
# maps `RIRSampler` onto the already mapped dataset a second time.
ds = ds.map(g.RIRSampler(rir_ds))
ds[0]

In [None]:
# Load an example
from padercontrib.database.sms_librispeech_meeting.scenario import multi_channel_scenario_map_fn
def load_audio(example):
    """Load the audio referenced by `example['audio_path']`.

    The result of `pb.io.audioread.recursive_load_audio` is stored under the
    key `audio_data`. The example is modified in place and returned.
    """
    audio_data = pb.io.audioread.recursive_load_audio(example['audio_path'])
    example['audio_data'] = audio_data
    return example
# Load the audio first, then apply the multi-channel scenario map function
ds = ds.map(load_audio).map(multi_channel_scenario_map_fn)
ex = ds[0]
# Play the observation of the first example with a sample rate of 8000 Hz
pb.io.play(ex['audio_data']['observation'], sample_rate=8000)

In [None]:
# Dynamic mixing: Set the rng argument to `True` to get a non-deterministic dataset that changes its contents
# every time it is iterated. Useful if you want to train on an infinite stream of randomly generated examples
# TODO: dynamic_ -> rng
ds = g.get_composition_dataset(input_dataset=input_ds, num_speakers=2, rng=True)
# only the function above this line changed from the deterministic case
# -------------------------------------------------------------------------------------------------------------------
# the part below this line is deterministic and equal to the cell above
# NOTE(review): the fully overlapped cells above use `ConstantOffsetSampler`, while this cell uses
# `SMSWSJOffsetSampler` -- confirm which offset sampler is intended
ds = ds.map(g.SMSWSJOffsetSampler())
ds = ds.map(g.UniformLogWeightSampler(max_weight=5))

In [None]:
# Check that iterating two times gives different examples
for _ in range(2):
    # Start a fresh iteration in every pass and look at its first example only
    for e in itertools.islice(ds, 1):
        print(e, end='\n\n')

## Generate Meetings

In [None]:
# Deterministic, anechoic, use the same base function as for SMS-WSJ, i.e., we have the same initial examples as SMS-WSJ
ds = (
    g.get_composition_dataset(input_dataset=input_ds, num_speakers=[3, 4, 5])
    .map(g.UniformLogWeightSampler(max_weight=5))
    # The configured `MeetingSampler` is called with the input dataset; its result is the map function.
    # duration=60*8000 samples, presumably 60 s at 8 kHz
    .map(g.MeetingSampler(duration=60*8000)(input_ds))
)
# len(ds)

In [None]:
# Show the first generated meeting example
ds[0]

In [None]:
# Plot the first 6 meetings with 2 axes columns and a figure width of 20
plot_meetings(ds, columns=2, figure_width=20, number=6)

In [None]:
# Deterministic, reverberant (RIRs are sampled from `rir_ds`), use the same base function as for SMS-WSJ,
# i.e., we have the same initial examples as SMS-WSJ
import functools  # import kept: other cells may rely on `functools` being in scope
ds = g.get_composition_dataset(input_dataset=input_ds, num_speakers=[3, 4])
ds = ds.map(g.UniformLogWeightSampler(max_weight=5))
# `functools.partial` without bound arguments only wraps the callable, so the sampler is mapped directly
ds = ds.map(g.RIRSampler(rir_ds))
ds = ds.map(g.MeetingSampler(duration=60*8000)(input_ds))
# len(ds)

In [None]:
# Librispeech, this takes a long time to load the VAD information
from padercontrib.database.librispeech import LibriSpeech

# Note: this rebinds `db`, which referred to the WSJ database before
db = LibriSpeech()

# Map scenario: scenario is composed of speaker-ID and chapter-ID because the environment changes heavily between chapters
def format_fn(example):
    """Add the `scenario` key (`"<speaker_id>_<chapter_id>"`) to an example.

    A shallow copy is returned, so the input example is not modified. The IDs
    are formatted with an f-string, so non-string IDs work as well.

    Args:
        example: Example dict with the keys `speaker_id` and `chapter_id`.

    Returns:
        A new dict with all keys of `example` plus `scenario`.
    """
    scenario = f"{example['speaker_id']}_{example['chapter_id']}"
    return {**example, 'scenario': scenario}

# Add the `scenario` key to every example of the `test_clean` dataset
librispeech_input_ds = db.get_dataset('test_clean').map(format_fn)

In [None]:
# Show the first example of the formatted LibriSpeech input dataset
librispeech_input_ds[0]

In [None]:
# This code is exactly the same as for WSJ, except for the input dataset and some config
# TODO: rng arg (int or bool), random start seed for dynamic mixing
# NOTE(review): 600*8000 samples are 600 s only at 8 kHz -- confirm the sample rate of this LibriSpeech data
ds = (
    g.get_composition_dataset(input_dataset=librispeech_input_ds, num_speakers=(5, 8))
    .map(g.UniformLogWeightSampler())
    .map(g.MeetingSampler(600*8000)(librispeech_input_ds))
)

In [None]:
# Show the first generated LibriSpeech meeting example
ds[0]

In [None]:
# Plot the speaker activity of the first examples in `ds` (`plot_meetings` defaults: 6 examples, 3 columns)
plot_meetings(ds)

## Class-based interface idea

In [None]:
# Instantiate the database class from `g.wsj_full_overlap`; note that this rebinds `db` once more
db = g.wsj_full_overlap.WSJ8_kHz_FullOverlap()

In [None]:
# List the dataset names offered by this database
db.dataset_names

In [None]:
# Show the first example of the `cv_dev93` dataset
db.get_dataset('cv_dev93')[0]

## Generate JSON

In [None]:
from tqdm.notebook import tqdm
# Collect the items of every dataset of `db` into plain dicts (one progress bar
# per dataset) and dump the resulting structure to a JSON file
database_dict = {
    'datasets': {
        name: dict(tqdm(db.get_dataset(name).items(), desc=name))
        for name in db.dataset_names
    }
}
pb.io.dump(database_dict, 'wsj_full_overlap.json')