In [None]:
# Change the working directory to the parent folder. The relative paths used further down
# (e.g. "data/grid/...", "output/synth-samples/...") are presumably meant to be resolved
# from there -- TODO confirm against the repository layout.
# NOTE: this cell is not idempotent; running it a second time moves up one more level.
%cd ..

In [None]:
from IPython import display

import math
import os
import random

import librosa
import numpy as np

from matplotlib import pyplot as plt

import tabulate

from train import PATH_LOADERS, ROOT
from evaluate.quality import evaluate

In [None]:
# Markup for a "Toggle code on/off" button.
# The script defines `code_toggle()`, which hides or shows every `div.input` element
# (it relies on `$`, i.e. jQuery, being available on the page) and then flips the
# `code_show` flag. `code_toggle` is also registered on document-ready, so with
# `code_show` starting as true the inputs get hidden once the page has loaded.
_CODE_TOGGLE_HTML = '''<script>
  code_show=true;
  function code_toggle() {
    if (code_show){
      $('div.input').hide();
    } else {
      $('div.input').show();
    }
    code_show = !code_show
  }
  $( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Toggle code on/off"></form>'''

# Last expression of the cell, so the notebook renders the button.
display.HTML(_CODE_TOGGLE_HTML)

## Seen data

We have four training speakers and test on the same four speakers.

**Qualitative results.**

In [None]:
# Settings for the seen-speaker comparison.
dataset = "grid"
filelist = "k-seen"
split = "test"

# The loader is looked up by dataset name and built for the "<filelist>-<split>" file list.
path_loader = PATH_LOADERS[dataset](ROOT, f"{filelist}-{split}")

# Folder with the predicted audio samples of each method.
path_prediction_dict = dict(
    k="data/grid/samples-konstantinos/seen",
    ours="output/synth-samples/grid-k-seen-test-magnus-best",
)

In [None]:
def _scores_by_id(method, metric):
    """Evaluate `metric` on the predictions of `method` and key the scores by sample id."""
    sample_ids = path_loader.ids
    scores = evaluate(metric, path_loader, path_prediction_dict[method])
    return dict(zip(sample_ids, scores))


# results[method][metric][sample_id] -> score
results = {
    method: {metric: _scores_by_id(method, metric) for metric in ("pesq", "stoi")}
    for method in ("k", "ours")
}

In [None]:
def get_audio_html(path, sr):
    """Return the HTML markup of an audio player for the file at `path`.

    Parameters
    ----------
    path : str
        Location of the audio file to load.
    sr : int
        Sampling rate handed to `librosa.load`; it is also used as the
        playback rate of the audio widget.

    Returns
    -------
    str
        The result of `IPython.display.Audio(...)._repr_html_()`, suitable for
        embedding in a larger HTML fragment (e.g. a table cell).
    """
    # Pass `sr` by keyword: newer librosa releases accept only `path` positionally,
    # so the former `load(path, sr)` call raises a TypeError there. The keyword form
    # is accepted by older releases as well.
    audio, _ = librosa.load(path, sr=sr)
    return display.Audio(audio, rate=sr)._repr_html_()

def get_row(id1):
    """Assemble the table row for sample `id1` of the seen-speaker comparison.

    The row holds, in order: the audio file name, the players for the ground
    truth, "ours" and "k" audio, then the PESQ scores (ours, k) and the STOI
    scores (ours, k) read from `results`.
    """
    filename = path_loader.id_to_filename(id1, "audio")

    # (folder, sampling rate) of each audio column; note that the "k" samples
    # are loaded at 50 kHz while the other two columns use 16 kHz.
    audio_sources = (
        (path_loader.folders["audio"], 16_000),
        (path_prediction_dict["ours"], 16_000),
        (path_prediction_dict["k"], 50_000),
    )
    players = [
        get_audio_html(os.path.join(folder, filename), rate)
        for folder, rate in audio_sources
    ]

    # Metric-major order, matching the column headers: pesq ours, pesq k, stoi ours, stoi k.
    scores = [
        results[method][metric][id1]
        for metric in ("pesq", "stoi")
        for method in ("ours", "k")
    ]
    return [filename, *players, *scores]

In [None]:
# Show a random selection of test utterances side by side, one row per utterance.
# `random.sample` raises ValueError when asked for more items than the population
# holds, so cap the request at the number of available ids.
num_samples = min(16, len(path_loader.ids))
selected_ids = random.sample(path_loader.ids, num_samples)

# Order the rows by file name (first column) so the table is easy to scan.
table = sorted((get_row(i) for i in selected_ids), key=lambda row: row[0])
headers = ["filename", "groundtruth", "ours", "k", "pesq ↑ ours", "pesq ↑ k", "stoi ↑ ours", "stoi ↑ k"]

# Newer tabulate releases HTML-escape cell contents for the "html" format, which would
# print the <audio> markup as literal text; they provide "unsafehtml" for raw cells.
# Older releases only know "html", so pick whichever the installed version supports.
tablefmt = "unsafehtml" if "unsafehtml" in tabulate.tabulate_formats else "html"
display.display(display.HTML(tabulate.tabulate(table, tablefmt=tablefmt, headers=headers)))

**Quantitative results.**

Methods:
- `K`: the method of Vougioukas _et al._ (Interspeech, 2019)
- `B / spk`: baseline model trained independently for each speaker
- `B`: baseline model trained on all four speakers at once
- `SI`: baseline model augmented with speaker ID information
- `SI + D`: baseline model with speaker ID and dispel branch

Notes:
- Test data consists of four seen speakers: `s1`, `s2`, `s4`, `s29`
- ? For MCD I was not able to obtain values comparable to those reported in the paper (not even for their method)
- ? What kind of FSG should I use for the ASR? Currently forcing six words at the output.

| method | STOI ↑ | PESQ ↑ | MCD ↓ | WER ↓ |
|--------|--------|--------|-------|-------|
| `K` _paper_ | 0.518 | 1.71 | 22.29 | 26.6 |
| `K` _recomputed_ | 0.525 | 1.72 | 673.3 | 27.1 |
| `B / spk` | 0.452 | 1.82 | 882.7 | 17.8 |
| `B` | 0.470 | 1.88 | 882.5 | 21.8 |
| `SI` | 0.468 | 1.85 | 864.7 | 19.9 |
| `SI + D` | 0.449 | 1.78 | 866.8 | 24.5 |

Code to run:

```bash
# K
python evaluate/quality.py -m stoi -d grid --filelist k-seen -p data/grid/samples-konstantinos/seen
# B / spk
python evaluate/quality.py -m stoi -d grid --filelist k-seen -p output/synth-samples/grid-k-seen-test-magnus-best
# B
python evaluate/quality.py -m stoi -d grid --filelist k-seen -p output/synth-samples/grid-k-seen-test-magnus-indep-best
# SI
python evaluate/quality.py -m stoi -d grid --filelist k-seen -p output/synth-samples/grid-k-seen-test-magnus-multi-speaker-best
# SI + D
python evaluate/quality.py -m stoi -d grid --filelist k-seen -p output/synth-samples/grid-k-seen-test-magnus-multi-speaker-dispel-best

# For WER
bash scripts/evaluate-seen-wer.sh
```

## Unseen data

- 14 seen speakers and 9 unseen speakers
- We use a subset of the samples: 50 samples for each of the 9 unseen speakers, that is, 450 samples in total
- For the methods that rely on speaker identity we use a mean embedding of the speakers seen at train time


| method | STOI ↑ | PESQ ↑ | MCD ↓ | WER ↓ |
|--------|--------|--------|-------|-------|
| `K` _paper_ | 0.445 | 1.24 | 24.29 | 40.5 |
| `K` _recomputed_ | 0.449 | 1.24 | 752.5 | 40.1 |
| `B` | 0.374 | 1.28 | 896.9 | 36.4 |
| `SI` | 0.352 | 1.15 | 883.7 | 39.4 |
| `SI + D` | 0.315 | 1.07 | 995.7 | 62.1 |
| `SE` | 0.365 | 1.25 | 908.2 | 34.9 |
| `SE + D` | 0.359 | 1.17 | 974.3 | 42.3 |

**Qualitative results.**
In this section we listen to the samples generated by each method.

In [None]:
# Settings for the unseen-speaker comparison. These rebind the names used for the
# seen-speaker section (`path_loader`, `path_prediction_dict`, ...).
dataset = "grid"
filelist = "unseen-k-small"
split = "test"

# The loader is looked up by dataset name and built for the "<filelist>-<split>" file list.
path_loader = PATH_LOADERS[dataset](ROOT, f"{filelist}-{split}")

# Folder with the predicted audio samples of each method.
path_prediction_dict = {
    "k": "data/grid/samples-konstantinos/unseen",
    "B": "output/synth-samples/grid-multi-speaker-unseen-k-small-test-magnus-best",
    "SI": "output/synth-samples/grid-multi-speaker-unseen-k-small-test-magnus-multi-speaker-best-emb-mean",
    "SI+D": "output/synth-samples/grid-multi-speaker-unseen-k-small-test-magnus-multi-speaker-dispel-best-emb-mean",
}

In [None]:
def get_row(id1):
    """Assemble the table row for sample `id1` of the unseen-speaker comparison.

    The row holds, in order: the audio file name followed by the players for
    the ground truth, "k", "B", "SI" and "SI+D" audio. This definition replaces
    the `get_row` of the seen-speaker section.
    """
    filename = path_loader.id_to_filename(id1, "audio")

    row = [filename]
    row.append(get_audio_html(os.path.join(path_loader.folders["audio"], filename), 16_000))
    # The "k" samples are loaded at 50 kHz; every other column uses 16 kHz.
    row.append(get_audio_html(os.path.join(path_prediction_dict["k"], filename), 50_000))
    for method in ("B", "SI", "SI+D"):
        row.append(get_audio_html(os.path.join(path_prediction_dict[method], filename), 16_000))
    return row

In [None]:
# Show a random selection of test utterances side by side, one row per utterance.
# `random.sample` raises ValueError when asked for more items than the population
# holds, so cap the request at the number of available ids.
num_samples = min(16, len(path_loader.ids))
selected_ids = random.sample(path_loader.ids, num_samples)

# Order the rows by file name (first column) so the table is easy to scan.
table = sorted((get_row(i) for i in selected_ids), key=lambda row: row[0])
headers = ["filename", "groundtruth", "k", "B", "SI", "SI+D"]

# Newer tabulate releases HTML-escape cell contents for the "html" format, which would
# print the <audio> markup as literal text; they provide "unsafehtml" for raw cells.
# Older releases only know "html", so pick whichever the installed version supports.
tablefmt = "unsafehtml" if "unsafehtml" in tabulate.tabulate_formats else "html"
display.display(display.HTML(tabulate.tabulate(table, tablefmt=tablefmt, headers=headers)))