In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import Callable, Dict, Iterable, List, Tuple

  from tqdm.autonotebook import tqdm, trange


In [19]:
# Instantiate the sentence-embedding model identified by the name "all-mpnet-base-v2".
model = SentenceTransformer("all-mpnet-base-v2")

In [20]:
# Two sample input strings whose embeddings are compared further down.
str_a = "When discussing the client's concerns and presentation during intake, it would be best to discuss suicidal ideation first."
str_b = "I hate you"

In [21]:
# Encode each sample string with the model; the results are used as the
# embedding vectors for the similarity check.
emb_a = model.encode(str_a)
emb_b = model.encode(str_b)

In [22]:
def cosine_similarity_scaled(list1: np.ndarray, list2: np.ndarray) -> float:
    """
    Cosine similarity of two 1-D vectors, rescaled from [-1, 1] to [0, 1].

    The inputs do not have to be unit-length: the dot product is divided by
    the product of both Euclidean norms before rescaling.

    Args:
        list1: first embedding vector.
        list2: second embedding vector, same length as ``list1``.

    Returns:
        ``(1 + cos) / 2`` as a builtin ``float``: 1.0 for vectors pointing the
        same way, 0.5 for orthogonal vectors, 0.0 for opposite vectors.
        If either vector has zero norm (all zeros or empty) the cosine is
        undefined; it is treated as 0, so 0.5 is returned instead of NaN.
    """
    norm_product = np.linalg.norm(list1) * np.linalg.norm(list2)
    if norm_product == 0:
        # Avoid 0/0 -> NaN (and a RuntimeWarning) for zero-norm vectors.
        return 0.5
    cosine_sim = np.dot(list1, list2) / norm_product
    # Floating-point rounding can push the cosine a hair outside [-1, 1];
    # clip so the documented [0, 1] range always holds. np.clip leaves NaN
    # untouched, so NaN inputs are still visible to the caller.
    return float(np.clip((1.0 + cosine_sim) / 2.0, 0.0, 1.0))


def embedding_alignment(ref_emb: np.ndarray, hypo_emb: np.ndarray) -> List[float]:
    """
    Best reference match for every usable hypothesis embedding.

    ref_emb: list of reference embeddings
    hypo_emb: list of hypothesis embeddings

    For each non-empty hypothesis embedding, the highest scaled cosine
    similarity against the non-empty reference embeddings is collected.
    A hypothesis that is empty, or that has no non-empty reference to be
    compared with, contributes nothing, so the returned list can be shorter
    than hypo_emb.
    """
    best_matches = []
    for hypo_vec in hypo_emb:
        # Embeddings can be empty, e.g. for latex-style equations or an empty string.
        if len(hypo_vec) == 0:
            continue
        top_score = max(
            (
                cosine_similarity_scaled(hypo_vec, ref_vec)
                for ref_vec in ref_emb
                if len(ref_vec) > 0
            ),
            default=None,
        )
        # None means there was no non-empty reference to score against.
        if top_score is not None:
            best_matches.append(top_score)
    return best_matches

In [23]:
# Display the scaled cosine similarity of the two sample embeddings.
cosine_similarity_scaled(emb_a, emb_b)

0.53459033370018