# Evaluate embedding models

## load and format test cases

In [1]:
import re
from typing import Dict, List

In [2]:
with open('../data/test_cases.txt') as f:
    test_cases_raw = f.readlines()

In [3]:
test_cases_raw[:7]

['问：什么是我国第一部编年史著作？\n',
 '\n',
 '答：《左传》。\n',
 '\n',
 '资料：附：《左传》是我国第一部编年史著作。\n',
 '\n',
 '问：什么是我国第一部编年国别史？\n']

In [4]:
len(test_cases_raw)

245

In [5]:
def test_cases_preprocessing(raw_texts: str) -> Dict[str, str]:
    # combining into a single string, remove all "\n" in between
    texts_split = "".join(raw_texts[0::2]).split('问：')[1:]
    # separate by keyword 答, 资料
    texts_split = [re.split(r'\n答：|\n资料：', x) for x in texts_split]
    # remove all \xa0 in between
    texts_split = [[x.replace('\xa0', '') for x in sublist] for sublist in texts_split]
    # format into a dict with q, a, and ref keys
    test_cases = [{'q': x[0], 'a': x[1], 'ref': x[2:]} for x in texts_split]
    
    return test_cases

test_cases = test_cases_preprocessing(test_cases_raw)
test_cases[:3]

[{'q': '什么是我国第一部编年史著作？', 'a': '《左传》。', 'ref': ['附：《左传》是我国第一部编年史著作。\n']},
 {'q': '什么是我国第一部编年国别史？', 'a': '《国语》。', 'ref': ['附：《国语》是我国第一部编年国别史。\n']},
 {'q': '“寡人之于国也”下一句是什么？来自哪里？',
  'a': '“寡人之于国也”下一句是“尽心焉耳矣”。这个句子来自《孟子》。',
  'ref': ['梁惠王曰：“寡人之于国也，尽心焉耳矣。河内凶，则移其民于河东，移其粟于河内；河东凶亦然。察邻国之政，无如寡人之用心者。邻国之民不加少，寡人之民不加多，何也？”',
   '《寡人之于国也》（孟子）\n']}]

In [6]:
len(test_cases)

37

# Evaluate embedding models

* We want our `q` questions to be as close to `ref` refernced documents as possible, as `ref` are text chunks directly retrieved from the documents that are to be put into vector databases.
* Evaluated options from SentenceTransformer registries

In [7]:
# # not free
# from langchain_openai import OpenAIEmbeddings
# embeddings = OpenAIEmbeddings()

In [8]:
from sentence_transformers import SentenceTransformer

In [9]:
# from transformers import file_utils
# print(file_utils.default_cache_path)

Model name | Provider | Model size (#pamras) | Model Size (disk) | Download past month | Highlights | Time Load/Inference (online compute) | Mean difference paired & unpaired Q & Ref | HF Link |
--|--|--|--|--|--|--|--|--|
intfloat/multilingual-e5-large | Microsoft | 560M | 2.2G | 93K |24 layers and the embedding size is 1024| 5.0s/1920s | 0.062 |https://huggingface.co/intfloat/multilingual-e5-large|
intfloat/multilingual-e5-base| Microsoft | 278M | 1.1G | 42K |12 layers and the embedding size is 768| 3.4s/531s| 0.063 | https://huggingface.co/intfloat/multilingual-e5-base|
sentence-transformers/LaBSE | Google | | 1.9G | 88K | the embedding size is 768 | 5.7s/620s | 0.19 | https://huggingface.co/sentence-transformers/LaBSE|
maidalun1020/bce-embedding-base_v1 | NetEase-Youdao |  279M | 1.1G | 111K | optimized for RAG | 3.0s/495s | 0.23 | https://huggingface.co/maidalun1020/bce-embedding-base_v1
BAAI/bge-large-zh-v1.5|Beijing Academy of Artificial Intelligence| 326M | 1.3G | 22K | | 1.6s/1730s| 0.26 |  https://huggingface.co/BAAI/bge-large-zh-v1.5#usage|
uer/sbert-base-chinese-nli| Tencent | | 409M  | 8K | 12 layers and the embedding size is 768 | 0.6s/1350s | 0.22 | https://huggingface.co/uer/sbert-base-chinese-nli |
sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2| Sentence Transformer | | 449M | 38K | 384 embedding size | 1.4s/392s | 0.25 | https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 |
sentence-transformers/distiluse-base-multilingual-cased-v1 | Sentence Transformer | | 539M | 31K | 768 embedding size | 1.3s/163s | 0.28 | https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v1 |
sentence-transformers/distiluse-base-multilingual-cased-v2 | Sentence Transformer | | 539M | 43K | 768 enbedding size | 1.2s/164s | 0.25 | https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2 |
sentence-transformers/paraphrase-multilingual-mpnet-base-v2 | Sentence Transformer | | 1.1G | 24K | 768 embedding size | 2.7s/463s | 0.21 | https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2 |



In [10]:
# validate GPU/Metal acceleration on Mac (no action requires, should be enabled with environment build)
# https://developer.apple.com/metal/pytorch/
# The output should show: tensor([1.], device='mps:0')

import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [11]:
# all embeedings supported in sentence-transformers library
# https://huggingface.co/models?library=sentence-transformers

# cached model objects in ~/.cache/torch/sentence_transformers

sentence_transformer_model_lists = [
    # Microsoft
    'intfloat/multilingual-e5-large',
    'intfloat/multilingual-e5-base',

    # Google
    'sentence-transformers/LaBSE',

    # Chinese companies
    'maidalun1020/bce-embedding-base_v1', # this requires direct download through Git LFS from HF as it is gated https://huggingface.co/maidalun1020/bce-embedding-base_v1/tree/main
    'BAAI/bge-large-zh-v1.5',
    'uer/sbert-base-chinese-nli',

    # Sentence Transformer native
    # https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
    'sentence-transformers/distiluse-base-multilingual-cased-v1',
    'sentence-transformers/distiluse-base-multilingual-cased-v2',
    'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
]

# # download the models
# for model_name in sentence_transformer_model_lists:
#     print(model_name)
#     _ = SentenceTransformer(model_name)

In [12]:
docs = [x['q'] for x in test_cases] + [x['a'] for x in test_cases]
len(docs)

74

In [13]:
from loguru import logger
import time

In [14]:
import numpy as np

def profile_docs(docs: List[str]):
    characters = [len(x) for x in docs]
    len_docs = len(docs)
    logger.info(f"total Chinese characters: {sum(characters)}, total docs: {len_docs}")
    logger.info(f"Min/Mean/Max characters per doc: {min(characters)}, {np.mean(characters):.2f}, {max(characters)}")

profile_docs(docs)

[32m2024-01-21 09:31:03.790[0m | [1mINFO    [0m | [36m__main__[0m:[36mprofile_docs[0m:[36m6[0m - [1mtotal Chinese characters: 3820, total docs: 74[0m
[32m2024-01-21 09:31:03.791[0m | [1mINFO    [0m | [36m__main__[0m:[36mprofile_docs[0m:[36m7[0m - [1mMin/Mean/Max characters per doc: 3, 51.62, 523[0m


In [15]:
MODEL_DIR = '/Users/fred/.cache/torch/sentence_transformers/'

def _get_model_path(model_dir, model_name):
    return model_dir+model_name.replace('/', '_')+'/'

In [16]:
# # Test the querying time for each embedding model using questions in the test cases

# def timer_embedding_model(model_name, sentences, model_dir) -> None:
#     time_start = time.time()
#     model = SentenceTransformer.load(_get_model_path(model_dir, model_name))
#     time_end = time.time()
#     logger.info(f'Time taken loading {model_name}: {time_end - time_start:.2f}s')
    
#     time_start = time.time()
#     model.encode(sentences)
#     time_end = time.time()
#     logger.info(f"Time taken for {model_name}: {time_end - time_start:.2f}s")

#     print()

# for model in sentence_transformer_model_lists:
#     timer_embedding_model(model, docs, MODEL_DIR)

"contrastive loss" measure for test cases

* paired `ref` and `q` should have high similarity
* unpaired `ref` and `q` should have low similarity
* scores can be computed as the difference between the two as "contrastive" measure

In [17]:
def _flatten_list(ll: List[List[str]]) -> List[str]:
    return [item for sublist in ll for item in sublist]

_flatten_list([[1,2], [3, 4, 5], [6]])

[1, 2, 3, 4, 5, 6]

In [18]:
# model = SentenceTransformer('sentence-transformers/distilu4se-base-multilingual-cased-v1')

In [19]:
%%time
from sentence_transformers import util as st_utils

scores = dict()

for model_name in sentence_transformer_model_lists:
    
    time_start = time.time()
    model = SentenceTransformer.load(_get_model_path(MODEL_DIR, model_name))
    time_end = time.time()
    logger.info(f'Time taken loading {model_name}: {time_end - time_start:.2f}s')
    
    score_list = list()
    
    time_start = time.time()
    for qa in test_cases:
        
        query = qa['q']
        docs_paired = qa['ref']
        docs_unpaired = _flatten_list([x['ref'] for x in test_cases if x is not qa])

        # skip if empty records found
        if query == "" or docs_paired == []:
            continue
        
        query_embedding = model.encode(query, convert_to_tensor=True)
        docs_paired_embeddings = model.encode(docs_paired, convert_to_tensor=True)
        docs_unpaired_embeddings = model.encode(docs_unpaired, convert_to_tensor=True)
        
        cos_scores_paired = st_utils.cos_sim(query_embedding, docs_paired_embeddings)[0] # cos_sim is used in multi-dim computation, so it returns nested result [[]]
        cos_scored_unpaired = st_utils.cos_sim(query_embedding, docs_unpaired_embeddings)[0]
        score_list.append({'score_paired': cos_scores_paired.numpy(), 'score_unpaired': cos_scored_unpaired.numpy()})

    time_end = time.time()
    logger.info(f'Time taken to score by {model_name}: {time_end-time_start:.2f}s')
    print()
    
    scores[model_name] = score_list

[32m2024-01-21 09:31:08.754[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTime taken loading intfloat/multilingual-e5-large: 4.95s[0m
[32m2024-01-21 10:03:09.163[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mTime taken to score by intfloat/multilingual-e5-large: 1920.41s[0m





[32m2024-01-21 10:03:12.586[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTime taken loading intfloat/multilingual-e5-base: 3.42s[0m
[32m2024-01-21 10:12:04.361[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mTime taken to score by intfloat/multilingual-e5-base: 531.77s[0m





[32m2024-01-21 10:12:10.018[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTime taken loading sentence-transformers/LaBSE: 5.66s[0m
[32m2024-01-21 10:22:29.862[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mTime taken to score by sentence-transformers/LaBSE: 619.84s[0m





[32m2024-01-21 10:22:32.818[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTime taken loading maidalun1020/bce-embedding-base_v1: 2.96s[0m
[32m2024-01-21 10:30:48.336[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mTime taken to score by maidalun1020/bce-embedding-base_v1: 495.52s[0m





[32m2024-01-21 10:30:49.948[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTime taken loading BAAI/bge-large-zh-v1.5: 1.61s[0m
[32m2024-01-21 10:59:39.496[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mTime taken to score by BAAI/bge-large-zh-v1.5: 1729.55s[0m
No sentence-transformers model found with name /Users/fred/.cache/torch/sentence_transformers/uer_sbert-base-chinese-nli/. Creating a new one with MEAN pooling.





[32m2024-01-21 10:59:40.109[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTime taken loading uer/sbert-base-chinese-nli: 0.61s[0m
[32m2024-01-21 11:22:09.834[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mTime taken to score by uer/sbert-base-chinese-nli: 1349.72s[0m





[32m2024-01-21 11:22:11.204[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTime taken loading sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2: 1.37s[0m
[32m2024-01-21 11:28:43.177[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mTime taken to score by sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2: 391.97s[0m





[32m2024-01-21 11:28:44.487[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTime taken loading sentence-transformers/distiluse-base-multilingual-cased-v1: 1.31s[0m
[32m2024-01-21 11:31:27.702[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mTime taken to score by sentence-transformers/distiluse-base-multilingual-cased-v1: 163.21s[0m





[32m2024-01-21 11:31:28.916[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTime taken loading sentence-transformers/distiluse-base-multilingual-cased-v2: 1.21s[0m
[32m2024-01-21 11:34:13.391[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mTime taken to score by sentence-transformers/distiluse-base-multilingual-cased-v2: 164.47s[0m





[32m2024-01-21 11:34:16.101[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTime taken loading sentence-transformers/paraphrase-multilingual-mpnet-base-v2: 2.71s[0m
[32m2024-01-21 11:41:59.225[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mTime taken to score by sentence-transformers/paraphrase-multilingual-mpnet-base-v2: 463.12s[0m



CPU times: user 16h 53min 27s, sys: 23min 6s, total: 17h 16min 34s
Wall time: 2h 10min 55s


In [21]:
# to save
# use np.load('../data/embedding_models_evaluation.npy', allow_pickle=True)
np.save('../data/embedding_models_evaluation.npy', scores, allow_pickle=True)

### Process scores

In [36]:
def aggregate_scores(scores: Dict[str, List[Dict[str, np.array]]]):

    result = dict()
    for model_name, score in scores.items():
        # inner loop aggregation to average all scores of examples for each q
        # result looks like [{'score_paired': arr, 'score_unpaired': arr}, {same fotmat}, {same}, ...]
        score_agg = [{'score_paired' : np.mean(x['score_paired']), 'score_unpaired': np.mean(x['score_unpaired'])} for x in score]
        
        # outer loop aggregation to average all scores of all qs
        score_paired = np.mean([x['score_paired'] for x in score_agg])
        score_unpaired = np.mean([x['score_unpaired'] for x in score_agg])

        # final result per model
        diff = score_paired - score_unpaired
        result[model_name] = diff

    return result

aggregate_scores(scores)

{'intfloat/multilingual-e5-large': 0.0616228,
 'intfloat/multilingual-e5-base': 0.062966585,
 'sentence-transformers/LaBSE': 0.18936068,
 'maidalun1020/bce-embedding-base_v1': 0.23052517,
 'BAAI/bge-large-zh-v1.5': 0.25686648,
 'uer/sbert-base-chinese-nli': 0.21703584,
 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2': 0.24728146,
 'sentence-transformers/distiluse-base-multilingual-cased-v1': 0.2788659,
 'sentence-transformers/distiluse-base-multilingual-cased-v2': 0.24307375,
 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2': 0.21112707}

Final recommendation: `sentence-transformers/distiluse-base-multilingual-cased-v1`