-
Notifications
You must be signed in to change notification settings - Fork 47
/
hasqi.py
220 lines (188 loc) · 7.61 KB
/
hasqi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Final
import numpy as np
from clarity.evaluator.haspi import eb
from clarity.utils.audiogram import Audiogram, Listener
if TYPE_CHECKING:
from numpy import ndarray
# HASQI assumes the following audiogram frequencies:
HASQI_AUDIOGRAM_FREQUENCIES: Final = np.array([250, 500, 1000, 2000, 4000, 6000])
def hasqi_v2(
    reference: ndarray,
    reference_sample_rate: float,
    processed: ndarray,
    processed_sample_rate: float,
    audiogram: Audiogram,
    equalisation: int = 1,
    level1: float = 65.0,
    silence_threshold: float = 2.5,
    add_noise: float = 0.0,
    segment_covariance: int = 16,
) -> tuple[float, float, float, list[float]]:
    """
    Compute the HASQI version 2 quality index.

    Runs the auditory model on the reference/processed pair, then combines
    an envelope cepstral-correlation + basilar-membrane (BM) short-time
    coherence term (nonlinear) with spectral level/slope difference terms
    (linear). The reference presentation level for NH listeners is assumed
    to be 65 dB SPL. The same model is used for both normal and impaired
    hearing.

    Arguments:
        reference (np.ndarray): Clean input reference speech signal with no
            noise or distortion. If a hearing loss is specified, NAL-R
            equalization is optional.
        reference_sample_rate (float): Sampling rate in Hz for reference signal.
        processed (np.ndarray): Output signal with noise, distortion, HA gain,
            and/or processing.
        processed_sample_rate (float): Sampling rate in Hz for processed signal.
        audiogram (Audiogram): Listener audiogram. Must provide (or will be
            interpolated to) the 6 audiometric frequencies
            [250, 500, 1000, 2000, 4000, 6000] Hz.
        equalisation (int): Mode to use when equalising the reference signal:
            1 = no EQ has been provided, the function will add NAL-R
            2 = NAL-R EQ has already been added to the reference signal
        level1 (float): Optional input specifying level in dB SPL that
            corresponds to a signal RMS = 1. Default is 65 dB SPL.
        silence_threshold (float): Silence threshold sum across bands, dB above
            audio threshold, applied both to the cepstral correlation and the
            segment cross-covariance averaging. Default: 2.5
        add_noise (float): Additive noise in dB SL to conditional
            cross-covariance. Default is 0.0
        segment_covariance (int): Segment size for the covariance calculation.
            Default is 16

    Returns:
        tuple(Combined, Nonlin, Linear, raw)
            Combined: Quality estimate is the product of the nonlinear and
                linear terms
            Nonlin: Nonlinear quality component = (cepstral corr)^2 x seg BM
                coherence
            Linear: Linear quality component = std of spectrum and spectrum
                slope
            raw: Vector of raw values = [CepCorr, BMsync5, Dloud, Dslope]

    James M. Kates, 5 August 2013.
    Translated from MATLAB to Python by Gerardo Roa Dabike, October 2022.
    """
    if not audiogram.has_frequencies(HASQI_AUDIOGRAM_FREQUENCIES):
        # Fix: original message concatenated with no separator
        # ("...measurementsMeasurements...").
        logging.warning(
            "Audiogram does not have all HASQI frequency measurements. "
            "Measurements will be interpolated"
        )
    audiogram = audiogram.resample(HASQI_AUDIOGRAM_FREQUENCIES)

    # Auditory model for quality
    # Reference is no processing or NAL-R, impaired hearing
    (
        reference_db,
        reference_basilar_membrane,
        processed_db,
        processed_basilar_membrane,
        reference_sl,
        processed_sl,
        sample_rate,
    ) = eb.ear_model(
        reference,
        reference_sample_rate,
        processed,
        processed_sample_rate,
        audiogram.levels,
        equalisation,
        level1,
    )

    # Envelope and long-term average spectral features
    # Smooth the envelope outputs: 125 Hz sub-sampling rate
    reference_smooth = eb.env_smooth(reference_db, segment_covariance, sample_rate)
    processed_smooth = eb.env_smooth(processed_db, segment_covariance, sample_rate)

    # Mel cepstrum correlation using smoothed envelopes
    (
        average_cepstral_correlation,
        _individual_cepstral_correlations,
    ) = eb.mel_cepstrum_correlation(
        reference_smooth, processed_smooth, silence_threshold, add_noise
    )

    # Linear changes in the long-term spectra
    # dloud vector: [sum abs diff, std dev diff, max diff] spectra
    # dnorm vector: [sum abs diff, std dev diff, max diff] norm spectra
    # dslope vector: [sum abs diff, std dev diff, max diff] slope
    d_loud_stats, _d_norm_stats, d_slope_stats = eb.spectrum_diff(
        reference_sl, processed_sl
    )

    # Temporal fine structure correlation measurements
    # Compute the time-frequency segment covariances
    (
        signal_cross_covariance,
        reference_mean_square,
        _processed_mean_square,
    ) = eb.bm_covary(
        reference_basilar_membrane,
        processed_basilar_membrane,
        segment_covariance,
        sample_rate,
    )

    # Average signal segment cross-covariance
    # avecov=weighted ave of cross-covariances, using only data above threshold
    # syncov=ave cross-covariance with added IHC loss of synchronization at HF
    # Fix: the original re-assigned silence_threshold = 2.5 here, silently
    # discarding any caller-supplied value for this step; the parameter
    # (default 2.5) is now used consistently.
    _, ihc_sync_covariance = eb.ave_covary2(
        signal_cross_covariance, reference_mean_square, silence_threshold
    )
    basilar_membrane_sync5 = ihc_sync_covariance[
        4
    ]  # Ave segment coherence with IHC loss of sync

    # Extract and normalize the spectral features
    # Dloud:std
    d_loud = d_loud_stats[1] / 2.5  # Loudness difference std
    d_loud = 1.0 - d_loud  # 1=perfect, 0=bad
    d_loud = max(min(d_loud, 1.0), 0.0)  # clamp to [0, 1]

    # Dslope:std
    d_slope = d_slope_stats[1]  # Slope difference std
    d_slope = 1.0 - d_slope
    d_slope = max(min(d_slope, 1.0), 0.0)  # clamp to [0, 1]

    # Construct the models
    # Nonlinear model
    non_linear = (
        average_cepstral_correlation**2
    ) * basilar_membrane_sync5  # Combined envelope and temporal fine structure
    # Linear model
    linear = 0.579 * d_loud + 0.421 * d_slope  # Linear fit
    # Combined model
    combined = non_linear * linear  # Product of nonlinear x linear

    # Raw data
    raw = [average_cepstral_correlation, basilar_membrane_sync5, d_loud, d_slope]
    return combined, non_linear, linear, raw
def hasqi_v2_better_ear(
    reference_left: ndarray,
    reference_right: ndarray,
    processed_left: ndarray,
    processed_right: ndarray,
    sample_rate: float,
    listener: Listener,
    level: float = 100.0,
) -> float:
    """Better ear HASQI.

    Evaluates HASQI v2 independently for the left and right ears (using the
    listener's per-ear audiograms, with NAL-R equalisation applied by the
    metric) and returns the better of the two combined scores.

    Args:
        reference_left (np.ndarray): left channel of reference signal
        reference_right (np.ndarray): right channel of reference signal
        processed_left (np.ndarray): left channel of processed signal
        processed_right (np.ndarray): right channel of processed signal
        sample_rate (float): sampling rate shared by all four signals
        listener (Listener): listener providing left and right ear audiograms
        level (float): level in dB SPL corresponding to RMS=1

    Returns:
        float: beHASQI score

    Gerardo Roa Dabike, November 2022
    """
    per_ear_scores: list[float] = []
    # Evaluate left ear first, then right, mirroring the monaural metric's
    # call signature for each channel pair.
    ear_inputs = (
        (reference_left, processed_left, listener.audiogram_left),
        (reference_right, processed_right, listener.audiogram_right),
    )
    for ear_reference, ear_processed, ear_audiogram in ear_inputs:
        combined_score, _, _, _ = hasqi_v2(
            ear_reference,
            sample_rate,
            ear_processed,
            sample_rate,
            ear_audiogram,
            equalisation=1,
            level1=level,
        )
        per_ear_scores.append(combined_score)
    return max(per_ear_scores)