-
Notifications
You must be signed in to change notification settings - Fork 47
/
hasqi.py
220 lines (188 loc) · 7.61 KB
/
hasqi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Final
import numpy as np
from clarity.evaluator.haspi import eb
from clarity.utils.audiogram import Audiogram, Listener
if TYPE_CHECKING:
from numpy import ndarray
# HASQI assumes the following audiogram frequencies:
HASQI_AUDIOGRAM_FREQUENCIES: Final = np.array([250, 500, 1000, 2000, 4000, 6000])
def hasqi_v2(
    reference: ndarray,
    reference_sample_rate: float,
    processed: ndarray,
    processed_sample_rate: float,
    audiogram: Audiogram,
    equalisation: int = 1,
    level1: float = 65.0,
    silence_threshold: float = 2.5,
    add_noise: float = 0.0,
    segment_covariance: int = 16,
) -> tuple[float, float, float, list[float]]:
    """
    Compute the HASQI version 2 quality index.

    Runs the auditory model on the reference/processed pair, then combines
    an envelope cepstral-correlation + basilar-membrane (BM) short-time
    coherence term (nonlinear) with spectral level/slope difference terms
    (linear). The reference presentation level for NH listeners is assumed
    to be 65 dB SPL. The same model is used for both normal and impaired
    hearing.

    Arguments:
        reference (np.ndarray): Clean input reference speech signal with no
            noise or distortion. If a hearing loss is specified, NAL-R
            equalization is optional.
        reference_sample_rate (float): Sampling rate in Hz for reference signal.
        processed (np.ndarray): Output signal with noise, distortion, HA gain,
            and/or processing.
        processed_sample_rate (float): Sampling rate in Hz for processed signal.
        audiogram (Audiogram): Listener audiogram. Must provide (or will be
            interpolated to) the 6 audiometric frequencies
            [250, 500, 1000, 2000, 4000, 6000] Hz.
        equalisation (int): Mode to use when equalising the reference signal:
            1 = no EQ has been provided, the function will add NAL-R
            2 = NAL-R EQ has already been added to the reference signal
        level1 (float): Optional input specifying level in dB SPL that
            corresponds to a signal RMS = 1. Default is 65 dB SPL.
        silence_threshold (float): Silence threshold sum across bands, dB above
            audio threshold, applied both to the cepstral correlation and the
            segment cross-covariance averaging. Default: 2.5
        add_noise (float): Additive noise in dB SL to conditional
            cross-covariance. Default is 0.0
        segment_covariance (int): Segment size for the covariance calculation.
            Default is 16

    Returns:
        tuple(Combined, Nonlin, Linear, raw)
            Combined: Quality estimate is the product of the nonlinear and
                linear terms
            Nonlin: Nonlinear quality component = (cepstral corr)^2 x seg BM
                coherence
            Linear: Linear quality component = std of spectrum and spectrum
                slope
            raw: Vector of raw values = [CepCorr, BMsync5, Dloud, Dslope]

    James M. Kates, 5 August 2013.
    Translated from MATLAB to Python by Gerardo Roa Dabike, October 2022.
    """
    if not audiogram.has_frequencies(HASQI_AUDIOGRAM_FREQUENCIES):
        # Fix: original message concatenated with no separator
        # ("...measurementsMeasurements...").
        logging.warning(
            "Audiogram does not have all HASQI frequency measurements. "
            "Measurements will be interpolated"
        )
    audiogram = audiogram.resample(HASQI_AUDIOGRAM_FREQUENCIES)

    # Auditory model for quality
    # Reference is no processing or NAL-R, impaired hearing
    (
        reference_db,
        reference_basilar_membrane,
        processed_db,
        processed_basilar_membrane,
        reference_sl,
        processed_sl,
        sample_rate,
    ) = eb.ear_model(
        reference,
        reference_sample_rate,
        processed,
        processed_sample_rate,
        audiogram.levels,
        equalisation,
        level1,
    )

    # Envelope and long-term average spectral features
    # Smooth the envelope outputs: 125 Hz sub-sampling rate
    reference_smooth = eb.env_smooth(reference_db, segment_covariance, sample_rate)
    processed_smooth = eb.env_smooth(processed_db, segment_covariance, sample_rate)

    # Mel cepstrum correlation using smoothed envelopes
    (
        average_cepstral_correlation,
        _individual_cepstral_correlations,
    ) = eb.mel_cepstrum_correlation(
        reference_smooth, processed_smooth, silence_threshold, add_noise
    )

    # Linear changes in the long-term spectra
    # dloud vector: [sum abs diff, std dev diff, max diff] spectra
    # dnorm vector: [sum abs diff, std dev diff, max diff] norm spectra
    # dslope vector: [sum abs diff, std dev diff, max diff] slope
    d_loud_stats, _d_norm_stats, d_slope_stats = eb.spectrum_diff(
        reference_sl, processed_sl
    )

    # Temporal fine structure correlation measurements
    # Compute the time-frequency segment covariances
    (
        signal_cross_covariance,
        reference_mean_square,
        _processed_mean_square,
    ) = eb.bm_covary(
        reference_basilar_membrane,
        processed_basilar_membrane,
        segment_covariance,
        sample_rate,
    )

    # Average signal segment cross-covariance
    # avecov=weighted ave of cross-covariances, using only data above threshold
    # syncov=ave cross-covariance with added IHC loss of synchronization at HF
    # Fix: the original re-assigned silence_threshold = 2.5 here, silently
    # discarding any caller-supplied value for this step; the parameter
    # (default 2.5) is now used consistently.
    _, ihc_sync_covariance = eb.ave_covary2(
        signal_cross_covariance, reference_mean_square, silence_threshold
    )
    basilar_membrane_sync5 = ihc_sync_covariance[
        4
    ]  # Ave segment coherence with IHC loss of sync

    # Extract and normalize the spectral features
    # Dloud:std
    d_loud = d_loud_stats[1] / 2.5  # Loudness difference std
    d_loud = 1.0 - d_loud  # 1=perfect, 0=bad
    d_loud = max(min(d_loud, 1.0), 0.0)  # clamp to [0, 1]

    # Dslope:std
    d_slope = d_slope_stats[1]  # Slope difference std
    d_slope = 1.0 - d_slope
    d_slope = max(min(d_slope, 1.0), 0.0)  # clamp to [0, 1]

    # Construct the models
    # Nonlinear model
    non_linear = (
        average_cepstral_correlation**2
    ) * basilar_membrane_sync5  # Combined envelope and temporal fine structure
    # Linear model
    linear = 0.579 * d_loud + 0.421 * d_slope  # Linear fit
    # Combined model
    combined = non_linear * linear  # Product of nonlinear x linear

    # Raw data
    raw = [average_cepstral_correlation, basilar_membrane_sync5, d_loud, d_slope]
    return combined, non_linear, linear, raw
def hasqi_v2_better_ear(
    reference_left: ndarray,
    reference_right: ndarray,
    processed_left: ndarray,
    processed_right: ndarray,
    sample_rate: float,
    listener: Listener,
    level: float = 100.0,
) -> float:
    """Better ear HASQI.

    Evaluates HASQI v2 independently for the left and right ears (using the
    listener's per-ear audiograms, with NAL-R equalisation applied by the
    metric) and returns the better of the two combined scores.

    Args:
        reference_left (np.ndarray): left channel of reference signal
        reference_right (np.ndarray): right channel of reference signal
        processed_left (np.ndarray): left channel of processed signal
        processed_right (np.ndarray): right channel of processed signal
        sample_rate (float): sampling rate shared by all four signals
        listener (Listener): listener providing left and right ear audiograms
        level (float): level in dB SPL corresponding to RMS=1

    Returns:
        float: beHASQI score

    Gerardo Roa Dabike, November 2022
    """
    per_ear_scores: list[float] = []
    # Evaluate left ear first, then right, mirroring the monaural metric's
    # call signature for each channel pair.
    ear_inputs = (
        (reference_left, processed_left, listener.audiogram_left),
        (reference_right, processed_right, listener.audiogram_right),
    )
    for ear_reference, ear_processed, ear_audiogram in ear_inputs:
        combined_score, _, _, _ = hasqi_v2(
            ear_reference,
            sample_rate,
            ear_processed,
            sample_rate,
            ear_audiogram,
            equalisation=1,
            level1=level,
        )
        per_ear_scores.append(combined_score)
    return max(per_ear_scores)