Commit
Merge pull request #329 from claritychallenge/319-bring-cad1-recipes-improvements-to-main

Bringing CAD1 recipe from v0.3.4 to main
groadabike committed Oct 26, 2023
2 parents 7cc2651 + 8ec9c7b commit daed423
Showing 20 changed files with 777 additions and 483 deletions.
15 changes: 7 additions & 8 deletions clarity/evaluator/haspi/eb.py
@@ -1288,13 +1288,10 @@ def env_smooth(envelopes: np.ndarray, segment_size: int, sample_rate: float) ->
"""

# Compute the window
n_samples = int(
np.around(segment_size * (0.001 * sample_rate))
) # Segment size in samples
test = n_samples - 2 * np.floor(n_samples / 2) # 0=even, 1=odd
if test > 0:
# Force window length to be even
n_samples = n_samples + 1
# Segment size in samples
n_samples = int(np.around(segment_size * (0.001 * sample_rate)))
n_samples += n_samples % 2

window = np.hanning(n_samples) # Raised cosine von Hann window
wsum = np.sum(window) # Sum for normalization
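As a quick standalone check (illustration only, not part of the diff), the new one-liner matches the old floor-based parity test: `n_samples % 2` is 1 exactly when the old test fired, so odd window lengths are rounded up to the next even number.

```python
import numpy as np

# Sanity check (illustration only): the new parity one-liner is
# equivalent to the old floor-based test for forcing an even length.
for n in range(1, 12):
    old = n + 1 if (n - 2 * np.floor(n / 2)) > 0 else n  # removed logic
    new = n + n % 2                                       # added logic
    assert old == new
```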

@@ -1848,6 +1845,7 @@ def bm_covary(
correlation = correlation[
int(len(reference_seg) - 1 - maxlag) : int(maxlag + len(reference_seg))
]

unbiased_cross_correlation = np.max(np.abs(correlation * half_corr))
if (ref_mean_square > small) and (proc_mean_squared > small):
# Normalize cross-covariance
@@ -1877,6 +1875,7 @@ def bm_covary(
correlation = correlation[
int(len(reference_seg) - 1 - maxlag) : int(maxlag + len(reference_seg))
]

unbiased_cross_correlation = np.max(np.abs(correlation * win_corr))
if (ref_mean_square > small) and (proc_mean_squared > small):
# Normalize cross-covariance
@@ -1900,7 +1899,7 @@ def bm_covary(
ref_mean_square = np.sum(reference_seg**2) * halfsum2
proc_mean_squared = np.sum(processed_seg**2) * halfsum2

correlation = np.correlate(reference_seg, processed_seg, "full")
correlation = correlate(reference_seg, processed_seg, "full")
correlation = correlation[
int(len(reference_seg) - 1 - maxlag) : int(maxlag + len(reference_seg))
]
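The slice around these calls keeps only lags in `[-maxlag, +maxlag]` of the full cross-correlation, and the switch from `np.correlate` to a bare `correlate` presumably picks up `scipy.signal.correlate`, which can select an FFT-based method for long segments. A small sketch of the slice's effect, with assumed segment sizes:

```python
import numpy as np
from scipy.signal import correlate

# Illustration with assumed sizes: "full" mode returns lags from
# -(N-1) to +(N-1); the slice keeps only lags in [-maxlag, +maxlag].
reference_seg = np.random.randn(128)
processed_seg = np.random.randn(128)
maxlag = 16

correlation = correlate(reference_seg, processed_seg, "full")
correlation = correlation[
    len(reference_seg) - 1 - maxlag : maxlag + len(reference_seg)
]
assert len(correlation) == 2 * maxlag + 1  # one value per retained lag
```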
1 change: 1 addition & 0 deletions clarity/utils/signal_processing.py
@@ -1,4 +1,5 @@
"""Signal processing utilities."""
# pylint: disable=import-error
from __future__ import annotations

# pylint: disable=import-error
55 changes: 30 additions & 25 deletions recipes/cad1/README.md
@@ -21,37 +21,42 @@ The performance of each system on the validation set is reported below.

### Task 1 - Listening to music via headphones

**The overall HAAQI score is 0.3608.**
The overall HAAQI scores are:

- Demucs: **0.2592**
- Open-Unmix: **0.2273**

#### Average HAAQI score per song

| Song | HAAQI |
|:------------------------------------------------|:----------:|
| Actions - One Minute Smile | 0.3066 |
| Alexander Ross - Goodbye Bolero | 0.4257 |
| ANiMAL - Rockshow | 0.2389 |
| Clara Berry And Wooldog - Waltz For My Victims | 0.4202 |
| Fergessen - Nos Palpitants | 0.4554 |
| James May - On The Line | 0.3889 |
| Johnny Lokke - Promises & Lies | 0.3395 |
| Leaf - Summerghost | 0.3595 |
| Meaxic - Take A Step | 0.3470 |
| Patrick Talbot - A Reason To Leave | 0.4545 |
| Skelpolu - Human Mistakes | 0.3055 |
| Triviul - Angelsaint | 0.2883 |
| Song                                            | Demucs | Open-Unmix |
|:-----------------------------------------------|:------:|:----------:|
| Actions - One Minute Smile | 0.2485 | 0.2257 |
| Alexander Ross - Goodbye Bolero | 0.3084 | 0.2574 |
| ANiMAL - Rockshow | 0.1843 | 0.1864 |
| Clara Berry And Wooldog - Waltz For My Victims | 0.3094 | 0.2615 |
| Fergessen - Nos Palpitants | 0.3542 | 0.2592 |
| James May - On The Line | 0.2778 | 0.2398 |
| Johnny Lokke - Promises & Lies | 0.2544 | 0.2261 |
| Leaf - Summerghost | 0.2513 | 0.2105 |
| Meaxic - Take A Step | 0.2455 | 0.2239 |
| Patrick Talbot - A Reason To Leave | 0.2673 | 0.2331 |
| Skelpolu - Human Mistakes | 0.2123 | 0.1951 |
| Traffic Experiment - Sirens | 0.2558 | 0.2339 |
| Triviul - Angelsaint | 0.2101 | 0.1955 |
| Young Griffo - Pennies | 0.2499 | 0.2297 |

### Task 2 - Listening to music in a car in the presence of noise

**The overall HAAQI score is 0.1248.**
**The overall HAAQI score is 0.1423.**

#### Average HAAQI score per genre

| Genre | HAAQI |
|:---------------|:----------:|
| Classical | 0.1240 |
| Hip-Hop | 0.1271 |
| Instrumental | 0.1250 |
| International | 0.1267 |
| Orchestral | 0.1121 |
| Pop | 0.1339 |
| Rock | 0.1252 |
| Genre | HAAQI |
|:---------------|:------:|
| Classical | 0.1365 |
| Hip-Hop | 0.1462 |
| Instrumental | 0.1416 |
| International | 0.1432 |
| Orchestral | 0.1329 |
| Pop | 0.1498 |
| Rock | 0.1460 |
51 changes: 32 additions & 19 deletions recipes/cad1/task1/baseline/README.md
@@ -15,8 +15,8 @@ To download the data, please visit [here](https://forms.gle/UQkuCxqQVxZtGggPA).
Alternatively, you can download the MUSDB18-HQ dataset from the official [SigSep website](https://sigsep.github.io/datasets/musdb.html#musdb18-hq-uncompressed-wav).
If you opt for this alternative, be sure to download the uncompressed wav version. Note that you will need both packages to run the baseline system.

If you need additional music data for training your model, please restrict to the use of [MedleyDB](https://medleydb.weebly.com/) [4] [5],
[BACH10](https://labsites.rochester.edu/air/resource.html) [6] and [FMA-small](https://github.com/mdeff/fma) [7].
If you need additional music data for training your model, please restrict yourself to the use of [MedleyDB](https://medleydb.weebly.com/) [[4](#4-references)] [[5](#4-references)],
[BACH10](https://labsites.rochester.edu/air/resource.html) [[6](#4-references)] and [FMA-small](https://github.com/mdeff/fma) [[7](#4-references)].
These are shared as `cadenza_cad1_task1_augmentation_medleydb.tar.gz`, `cadenza_cad1_task1_augmentation_bach10.tar.gz`
and `cadenza_cad1_task1_augmentation_fma_small.tar.gz`.
**Keeping the augmentation data restricted to these datasets will ensure that the evaluation is fair for all participants**.
@@ -56,7 +56,7 @@ cadenza_data

### 1.2 Additional optional data

* **MedleyDB** contains both MedleyDB versions 1 [[4](#references)] and 2 [[5](#references)] datasets.
* **MedleyDB** contains both MedleyDB versions 1 [[4](#4-references)] and 2 [[5](#4-references)] datasets.

Tracks from the MedleyDB dataset are not included in the evaluation set.
However, it is your responsibility to exclude any song that may already be contained in the training set.
@@ -70,7 +70,7 @@ cadenza_data
└───Metadata
```

* **BACH10** contains the BACH10 dataset [[6](#references)].
* **BACH10** contains the BACH10 dataset [[6](#4-references)].

Tracks from the BACH10 dataset are not included in MUSDB18-HQ and can all be used as training augmentation data.

@@ -84,7 +84,7 @@ cadenza_data
├───...
```

* **FMA Small** contains the FMA small subset of the FMA dataset [[7](references)].
* **FMA Small** contains the FMA small subset of the FMA dataset [[7](#4-references)].

Tracks from the FMA small dataset are not included in the MUSDB18-HQ.
This dataset does not provide independent stems but only the full mix.
@@ -123,18 +123,26 @@ Note that we use [hydra](https://hydra.cc/docs/intro/) for config handling.
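For orientation, a minimal hydra entry point reading this recipe's `config.yaml` might look like the sketch below (assumed file layout; the recipe's own scripts may differ in detail):

```python
import hydra
from omegaconf import DictConfig

# Minimal hydra entry point (sketch, assumed layout). Any key can be
# overridden on the command line, e.g. `separator.model=openunmix`.
@hydra.main(config_path=".", config_name="config", version_base=None)
def main(cfg: DictConfig) -> None:
    print(cfg.separator.model)  # "demucs" or "openunmix"
    print(cfg.path.exp_folder)  # e.g. "./exp_demucs"


if __name__ == "__main__":
    main()
```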

### 2.1 Enhancement

The baseline enhance simply takes the out-of-the-box [Hybrid Demucs](https://github.com/facebookresearch/demucs) [1]
We offer two baseline systems:

1. Using the out-of-the-box time-domain [Hybrid Demucs](https://github.com/facebookresearch/demucs) [[1](#4-references)]
source separation model distributed on [TorchAudio](https://pytorch.org/audio/main/tutorials/hybrid_demucs_tutorial.html)
and applies a simple NAL-R [2] fitting amplification to each VDBO (`vocals`, `drums`, `bass` and `others`) stem.
2. Using the out-of-the-box spectrogram-based [Open-Unmix](https://github.com/sigsep/open-unmix-pytorch)
source separation model (version `umxhq`) distributed through [PyTorch Hub](https://pytorch.org/hub/).

The remixing is performed by summing the amplified VDBO stems.
Both systems use the same enhancement strategy: using the music separation model, the baseline estimates the
VDBO (`vocals`, `drums`, `bass` and `others`) stems. Then it applies a simple NAL-R [[2](#4-references)] fitting amplification to each of them.
This results in eight mono signals (four from the left channel and four from the right). Finally, each signal is downsampled to 24000 Hz, converted to 16-bit precision and
encoded using lossless FLAC compression. These eight signals are then used for the objective evaluation (HAAQI).
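Both separators are available through public APIs; a hedged sketch of how they might be loaded (not the recipe's exact code):

```python
import torch
import torchaudio

# Sketch (assumptions noted): TorchAudio ships Hybrid Demucs as the
# HDEMUCS_HIGH_MUSDB_PLUS pipeline, and Open-Unmix "umxhq" is published
# on PyTorch Hub. The recipe's own loading code may differ.
def load_separator(name: str) -> torch.nn.Module:
    if name == "demucs":
        return torchaudio.pipelines.HDEMUCS_HIGH_MUSDB_PLUS.get_model()
    if name == "openunmix":
        return torch.hub.load("sigsep/open-unmix-pytorch", "umxhq")
    raise ValueError(f"Unknown separator: {name}")
```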

The baseline generates a left and right signal for each VDBO stem and a remixed signal, totalling 9 signals per song-listener.
The baselines also provide a remixing strategy to generate a stereo signal for each listener. This is done by summing
the amplified VDBO stems, where each output channel (left and right) is the sum of the corresponding
four stems. This stereo remixed signal is then used for the subjective evaluation (listening panel).
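A minimal sketch of this remix-and-export step, assuming each amplified stem is a stereo `(n_samples, 2)` array and using the sample rates from the baseline config:

```python
import numpy as np
import soundfile as sf
from scipy.signal import resample_poly

# Sketch (assumed array layout and stem names): remix by summing the
# four amplified VDBO stems per channel, then resample and write FLAC.
def remix_and_export(
    stems: dict[str, np.ndarray],  # each of shape (n_samples, 2)
    sample_rate: int = 44100,
    remix_sample_rate: int = 32000,
    out_path: str = "remix.flac",
) -> None:
    remix = sum(stems[name] for name in ("vocals", "drums", "bass", "other"))
    remix = resample_poly(remix, up=remix_sample_rate, down=sample_rate, axis=0)
    sf.write(out_path, remix, remix_sample_rate, subtype="PCM_16")
```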

To run the baseline enhancement system, first make sure that `paths.root` in `config.yaml` points to
where you have installed the Cadenza data. This parameter defaults to the working directory.
You can also define your own `path.exp_folder` to store enhanced
signals and evaluated results.
You can also define your own `path.exp_folder` to store the enhanced signals and evaluated results, and select which
music separation model you want to employ.

Then run:

@@ -158,9 +166,8 @@ The folder `enhanced_signals` will appear in the `exp` folder.

### 2.2 Evaluation

The `evaluate.py` simply takes the signals stored in `enhanced_signals` and computes the HAAQI [[3](#references)] score
for each of the eight left and right VDBO stems.
The average of these eight scores is computed and returned for each signal.
The `evaluate.py` script takes the eight VDBO signals stored in `enhanced_signals` and computes the
HAAQI [[3](#4-references)] score for each. The final score for a sample is the average of its eight per-stem scores.

To run the evaluation stage, make sure that `path.root` is set in the `config.yaml` file and then run

@@ -172,13 +179,19 @@ A csv file containing the eight HAAQI scores and the combined score will be generated.

To check the HAAQI code, see [here](../../../../clarity/evaluator/haaqi).

Please note: you will not get identical HAAQI scores for the same signals if the random seed is not defined
(in the given recipe, the random seed for each signal is set as the last eight digits of the song md5).
As there are random noises generated within HAAQI, but the differences should be sufficiently small.
Please note: you will not get identical HAAQI scores for the same signals if the random seed is not defined.
This is due to the random noise generated within HAAQI, but the differences should be sufficiently small.
For reproducibility, in the given recipe, the random seed for each signal is set to the last eight digits
of the song md5.
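One plausible reading of that seeding scheme, as a hedged sketch (the recipe's exact hashing input may differ):

```python
import hashlib

import numpy as np

# Hedged illustration: derive a per-song seed from the last eight hex
# digits of the song name's MD5, as the note above describes.
def song_seed(song_name: str) -> int:
    return int(hashlib.md5(song_name.encode("utf-8")).hexdigest()[-8:], 16)

rng = np.random.default_rng(song_seed("Actions - One Minute Smile"))
```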

## 3. Results

The overall HAAQI score for each baseline is:

The score for the baseline is 0.3608 HAAQI overall.
* Demucs: **0.2592**
* Open-Unmix: **0.2273**

## References
## 4. References

* [1] Défossez, A. "Hybrid Spectrogram and Waveform Source Separation". Proceedings of the ISMIR 2021 Workshop on Music Source Separation. [doi:10.48550/arXiv.2111.03600](https://arxiv.org/abs/2111.03600)
* [2] Byrne, Denis, and Harvey Dillon. "The National Acoustic Laboratories' (NAL) new procedure for selecting the gain and frequency response of a hearing aid." Ear and Hearing 7.4 (1986): 257-265. [doi:10.1097/00003446-198608000-00007](https://doi.org/10.1097/00003446-198608000-00007)
17 changes: 9 additions & 8 deletions recipes/cad1/task1/baseline/config.yaml
@@ -2,18 +2,20 @@ path:
root: ../../cadenza_data_demo/cad1/task1
metadata_dir: ${path.root}/metadata
music_dir: ${path.root}/audio/musdb18hq
music_train_file: ${path.metadata_dir}/musdb18.train.json
music_valid_file: ${path.metadata_dir}/musdb18.valid.json
listeners_train_file: ${path.metadata_dir}/listeners.train.json
listeners_valid_file: ${path.metadata_dir}/listeners.valid.json
exp_folder: ./exp # folder to store enhanced signals and final results
music_file: ${path.metadata_dir}/musdb18.valid.json
listeners_file: ${path.metadata_dir}/listeners.valid.json
music_segments_test_file: ${path.metadata_dir}/musdb18.segments.test.json
exp_folder: ./exp_${separator.model} # folder to store enhanced signals and final results

team_id: T001

sample_rate: 44100
sample_rate: 44100 # sample rate of the input mixture
stem_sample_rate: 24000 # sample rate of the output stems
remix_sample_rate: 32000 # sample rate of the output remixed signal

nalr:
nfir: 220
fs: ${sample_rate}
sample_rate: ${sample_rate}

apply_compressor: False
compressor:
@@ -27,7 +29,6 @@ soft_clip: True

separator:
model: demucs # demucs or openunmix
sources: [drums, bass, other, vocals]
device: ~

evaluate: