**Audio Separation Loss**

$$\mathcal{L}_{AS} = \sum_{f}^{\text{all frames}} \Big( \sum_{n=1}^{N+1} A_{sep_{n,f}} - A_{ori_{f}} \Big)^2 $$

In [None]:
### AS Loss
def AS_loss(audio_gt, audio_sp):
    """Sum of squared errors between the remixed sources and the original audio.

    For every index of ``audio_gt`` the entries of ``audio_sp`` are added
    together along the first axis and compared with the reference value.

    Parameters
    ----------
    audio_gt : sequence
        Reference (original) audio, indexed by frame.
    audio_sp : array-like exposing ``.shape``
        Separated audio, indexed as ``audio_sp[source][frame]``.

    Returns
    -------
    The accumulated squared difference over all frames (``0`` when
    ``audio_gt`` is empty).
    """
    total = 0
    for frame, target in enumerate(audio_gt):
        # Re-mix every separated source at this frame before comparing.
        remix = sum(audio_sp[src][frame] for src in range(audio_sp.shape[0]))
        total += (remix - target) ** 2
    return total



**Less Separation Loss**

$$ hann\_func(x) := \tfrac{1}{2}\left(1 - \cos(2\pi x)\right) \quad \text{(종 모양 함수)} $$

spectral flatness × power 값이 0 또는 1에 가까울 때 손실이 0에 가까워지도록 설계한다.

In [None]:
### LS Loss
import numpy as np
import librosa

def hann_func(x):
    """Evaluate the raised-cosine (Hann) curve ``0.5 * (1 - cos(2*pi*x))``.

    The curve is 0 at ``x = 0`` and ``x = 1`` and peaks at 1 for ``x = 0.5``.
    Works element-wise on scalars and numpy arrays.
    """
    phase = 2 * np.pi * x
    return 0.5 * (1 - np.cos(phase))
def power_to_db(S, *, ref=1.0, amin: float = 1e-10, top_db=80.0):
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    Self-contained copy of ``librosa.power_to_db``. It computes the scaling
    ``10 * log10(S / ref)`` in a numerically stable way. Unlike the librosa
    original, invalid parameters raise ``ValueError`` because librosa's
    ``ParameterError`` is not available in this file.

    Parameters
    ----------
    S : np.ndarray
        input power
    ref : scalar or callable
        If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``::

            10 * log10(S / ref)

        Zeros in the output correspond to positions where ``S == ref``.

        If callable, the reference value is computed as ``ref(S)``.
    amin : float > 0 [scalar]
        minimum threshold for ``abs(S)`` and ``ref``
    top_db : float >= 0 [scalar] or None
        threshold the output at ``top_db`` below the peak:
        ``max(10 * log10(S/ref)) - top_db``.
        ``None`` disables the thresholding.

    Returns
    -------
    S_db : np.ndarray
        ``S_db ~= 10 * log10(S) - 10 * log10(ref)``

    Raises
    ------
    ValueError
        If ``amin`` is not strictly positive or ``top_db`` is negative.

    Examples
    --------
    >>> power_to_db(np.array([1.0, 10.0, 100.0]))
    array([ 0., 10., 20.])

    Compute dB relative to peak power

    >>> power_to_db(np.array([1.0, 10.0, 100.0]), ref=np.max)
    array([-20., -10.,   0.])
    """
    S = np.asarray(S)

    if amin <= 0:
        raise ValueError("amin must be strictly positive")

    if np.issubdtype(S.dtype, np.complexfloating):
        # Imported here because the surrounding cell does not import it.
        import warnings

        warnings.warn(
            "power_to_db was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call power_to_db(np.abs(D)**2) instead.",
            stacklevel=2,
        )
        magnitude = np.abs(S)
    else:
        magnitude = S

    if callable(ref):
        # User supplied a function to calculate reference power
        ref_value = ref(magnitude)
    else:
        ref_value = np.abs(ref)

    # Clamping to amin keeps log10 finite for zero-valued bins.
    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))

    if top_db is not None:
        if top_db < 0:
            raise ValueError("top_db must be non-negative")
        log_spec = np.maximum(log_spec, log_spec.max() - top_db)

    return log_spec

def LS_loss(x):
    """Less-separation loss: Hann response of (spectral flatness * power).

    Parameters
    ----------
    x : np.ndarray
        The audio to score. It is handed to ``spectral_flatness`` as the
        time series ``y`` and to ``power_to_db`` unchanged.

    Returns
    -------
    ``hann_func(sf * p)`` where

    sf: spectral_flatness of the audio
    p: power of the audio (in dB, as returned by ``power_to_db``)
    """
    # spectral_flatness is keyword-only; a positional call raises TypeError.
    # The audio is bound to its first declared parameter, ``y``.
    sf = spectral_flatness(y=x)
    p = power_to_db(x)
    # NOTE(review): sf has one value per STFT frame while p has the shape of
    # x, so this product only broadcasts when those shapes are compatible --
    # confirm the intended representation of x (waveform vs. spectrogram).
    # NOTE(review): hann_func is zero at 0 and 1, but p is in dB and is not
    # confined to [0, 1] -- confirm whether p should be normalised first.
    y = sf * p
    return hann_func(y)

**Audio Dominance Loss**

$$ \mathcal{L}_{AD} = \sum_{n=1}^N \left(-\log{spectral\_flatness(A_{sep_n})}\right) \cdot \frac{\langle [max\_pooling({AF}_n)] \cdot D_n \rangle}{\Vert [max\_pooling({AF}_n)] \Vert \cdot \Vert D_n \Vert} $$

In [None]:
import numpy as np

def spectral_flatness(
    *,
    y=None,
    S=None,
    n_fft: int = 2048,
    hop_length: int = 512,
    win_length=None,
    window="hann",
    center: bool = True,
    pad_mode="constant",
    amin: float = 1e-10,
    power: float = 2.0,
) -> np.ndarray:
    """Compute spectral flatness

    Self-contained copy of ``librosa.feature.spectral_flatness``. When a
    spectrogram ``S`` is supplied only numpy is needed; when only the time
    series ``y`` is supplied the magnitude STFT is computed with
    ``librosa.stft``. Invalid arguments raise ``ValueError`` because
    librosa's ``ParameterError`` is not available in this file.

    Spectral flatness (or tonality coefficient) is a measure to
    quantify how much noise-like a sound is, as opposed to being
    tone-like [#]_. A high spectral flatness (closer to 1.0)
    indicates the spectrum is similar to white noise.
    It is often converted to decibel.

    .. [#] Dubnov, Shlomo  "Generalization of spectral flatness
           measure for non-gaussian linear processes"
           IEEE Signal Processing Letters, 2004, Vol. 11.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        audio time series. Multi-channel is supported.
    S : np.ndarray [shape=(..., d, t)] or None
        (optional) pre-computed spectrogram magnitude. Takes precedence
        over ``y`` when both are given.
    n_fft : int > 0 [scalar]
        FFT window size
    hop_length : int > 0 [scalar]
        hop length for STFT. See `librosa.stft` for details.
    win_length : int <= n_fft [scalar]
        Each frame of audio is windowed by `window()`.
        The window will be of length `win_length` and then padded
        with zeros to match ``n_fft``.
        If unspecified, defaults to ``win_length = n_fft``.
    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a vector or array of length ``n_fft``
        .. see also:: `librosa.filters.get_window`
    center : boolean
        - If `True`, the signal ``y`` is padded so that frame
          ``t`` is centered at ``y[t * hop_length]``.
        - If `False`, then frame `t` begins at ``y[t * hop_length]``
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.
    amin : float > 0 [scalar]
        minimum threshold for ``S`` (=added noise floor for numerical stability)
    power : float > 0 [scalar]
        Exponent for the magnitude spectrogram.
        e.g., 1 for energy, 2 for power, etc.
        Power spectrogram is usually used for computing spectral flatness.

    Returns
    -------
    flatness : np.ndarray [shape=(..., 1, t)]
        spectral flatness for each frame.
        The returned value is in [0, 1] and often converted to dB scale.

    Raises
    ------
    ValueError
        If ``amin`` is not strictly positive, if neither ``y`` nor ``S`` is
        given, or if the spectrogram is complex-valued or has negative
        entries.

    Examples
    --------
    From spectrogram input

    >>> spectral_flatness(S=np.ones((4, 3)))
    array([[1., 1., 1.]])

    From power spectrogram input

    >>> spectral_flatness(S=np.ones((4, 3)) ** 2, power=1.0)
    array([[1., 1., 1.]])
    """
    if amin <= 0:
        raise ValueError("amin must be strictly positive")

    if S is None:
        if y is None:
            raise ValueError("Either y or S must be provided")
        # Imported lazily so the spectrogram path works without librosa.
        import librosa

        S = np.abs(
            librosa.stft(
                y,
                n_fft=n_fft,
                hop_length=hop_length,
                win_length=win_length,
                window=window,
                center=center,
                pad_mode=pad_mode,
            )
        )
    else:
        S = np.asarray(S)

    if not np.isrealobj(S):
        raise ValueError(
            "Spectral flatness is only defined with real-valued input"
        )
    if np.any(S < 0):
        raise ValueError(
            "Spectral flatness is only defined with non-negative energies"
        )

    # Flatness is the geometric mean over the arithmetic mean, both taken
    # across the frequency axis (second to last).
    S_thresh = np.maximum(amin, S**power)
    gmean = np.exp(np.mean(np.log(S_thresh), axis=-2, keepdims=True))
    amean = np.mean(S_thresh, axis=-2, keepdims=True)
    flatness = gmean / amean
    return flatness

In [None]:
from numpy import dot
from numpy.linalg import norm
from math import log
def cos_sim(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    This is their dot product divided by the product of their norms.
    """
    numerator = dot(a, b)
    denominator = norm(a) * norm(b)
    return numerator / denominator

def AD_loss(N, audio_features, audio_sp, D):
    """Audio dominance loss accumulated over the separated sources.

    Each source ``n`` contributes ``-log(spectral_flatness(audio_sp[n]))``
    multiplied by the cosine similarity between ``audio_features[n]``
    max-pooled along axis 1 and ``D[n]``.

    Parameters
    ----------
    N : int or sized collection
        Number of sources. A sized object is also accepted, in which case
        ``len(N)`` sources are used.
    audio_features : indexable
        Per-source feature arrays; each is reduced with ``np.max(axis=1)``.
    audio_sp : indexable
        Per-source separated audio, passed to ``spectral_flatness`` as ``y``.
    D : indexable
        Per-source vectors compared against the pooled features.

    Returns
    -------
    The accumulated loss. It has the shape returned by
    ``spectral_flatness`` (one value per frame); no reduction over frames
    is applied here.
    """
    # The formula sums n = 1..N, so N is normally a plain count; keep
    # supporting sized collections for callers that pass one.
    try:
        n_sources = len(N)
    except TypeError:
        n_sources = N

    loss = 0
    for n in range(n_sources):
        # spectral_flatness is keyword-only; bind the audio to its first
        # declared parameter, ``y``.
        flatness = spectral_flatness(y=audio_sp[n])
        pooled = np.max(audio_features[n], axis=1)
        # np.log rather than math.log: flatness is an array with one entry
        # per frame, which math.log cannot handle.
        # NOTE(review): confirm whether the per-frame values should be
        # averaged or summed into a single scalar.
        loss += -np.log(flatness) * cos_sim(pooled, D[n])

    return loss

**Scoremap Loss**

$$ \mathcal{L}_{Sm} = -\log \frac{{MAX}(Score_{u,v})}{\sum^{N,H,W}_{m,u,v}Score_{m,u,v}} $$

In [None]:
def Sm_loss(N, n_scoremap):
    """Score-map loss: ``-log`` of the summed per-position max/sum ratios.

    For every spatial position ``(u, v)`` the maximum over axis 0 of
    ``n_scoremap`` is divided by the sum over axis 0; the ratios of all
    positions are added up and the negative natural log is returned.

    Parameters
    ----------
    N : any
        Unused; kept for interface compatibility.
    n_scoremap : np.ndarray
        Score maps indexed as ``[map, u, v]``.

    Returns
    -------
    float
        ``-log(sum_{u,v} max_m(score) / sum_m(score))``.
    """
    # The reductions over the map axis do not depend on (u, v), so compute
    # them once instead of inside a per-pixel loop.
    peak = np.max(n_scoremap, axis=0)
    total = np.sum(n_scoremap, axis=0)
    return -log(np.sum(peak / total))

**Contrastive Loss**

<img src = "./IMGs/Clip_Fig1.png">

Given a batch of N (image, text) pairs, CLIP is trained to predict which of the N × N possible (image, text) pairings across a batch actually occurred. To do this, CLIP learns a multi-modal embedding space by jointly training an image encoder and text encoder to maximize the cosine similarity of the image and text embeddings of the N real pairs in the batch while minimizing the cosine similarity of the embeddings of the N² − N incorrect pairings. We optimize a symmetric cross entropy loss over these similarity scores. In Figure 3 we include pseudocode of the core of an implementation of CLIP.

<!-- <img src = "./IMGs/Clip_Fig2.png">{width="50%"} -->

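$$
\begin{aligned}
\text{Pos}: \; S_{i\rightarrow i} &= avg(per\_pixel\_embedding_i \cdot mask\_embedding_i) \\
\text{Neg}: \; S_{i\rightarrow j} &= avg(per\_pixel\_embedding_i \cdot mask\_embedding_j) \\
\mathcal{L}_{C} &= \frac{1}{2}\Big( CE(S, y) + CE(S^{\top}, y) \Big), \quad y_i = i
\end{aligned}
$$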

6분 35초 참조

In [None]:
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    """Cross-entropy of ``logits`` against the diagonal.

    Row ``i`` is treated as a classification over the columns whose correct
    class is ``i``; the targets are created on the same device as ``logits``.
    """
    targets = torch.arange(len(logits), device=logits.device)
    return nn.functional.cross_entropy(logits, targets)

def clip_loss(mask_embedding: torch.Tensor, mask_features: torch.Tensor, batch_size) -> torch.Tensor:
    """Symmetric contrastive loss between mask embeddings and mask features.

    Entry ``[i, j]`` of the score matrix is the sum over all elements of
    ``einsum("qc,chw->qhw", mask_embedding[j], mask_features[i])``. Rows are
    L1-normalised, and the loss is the mean of the cross-entropy over rows
    (features -> embeddings) and over columns (embeddings -> features).

    Parameters
    ----------
    mask_embedding : torch.Tensor
        Indexed as ``[batch, q, c]``.
    mask_features : torch.Tensor
        Indexed as ``[batch, c, h, w]``.
    batch_size : int
        Number of leading samples to compare.

    Returns
    -------
    torch.Tensor
        Scalar loss on the inputs' device.
    """
    # Summing the einsum output over q, h and w equals a dot product over c
    # of the embedding summed over q and the feature map summed over (h, w).
    # One matrix product therefore replaces the batch_size**2 einsum calls
    # and keeps the scores on the inputs' device and dtype.
    embed_sum = mask_embedding[:batch_size].sum(dim=1)
    feature_sum = mask_features[:batch_size].sum(dim=(2, 3))
    logit_scores = feature_sum @ embed_sum.t()

    logit_scores = logit_scores / logit_scores.norm(p=1, dim=-1, keepdim=True)

    feature_loss = contrastive_loss(logit_scores)
    embed_loss = contrastive_loss(logit_scores.t())
    return (feature_loss + embed_loss) / 2.0