# Testing embedding models

## load and format test cases

In [50]:
import os
import re
from typing import Dict, List

from loguru import logger

In [51]:
# Read the raw question / answer / reference test cases, one list entry per
# line (newlines kept). The file holds Chinese text, so decode it explicitly
# as UTF-8 instead of relying on the platform's default encoding.
with open('../data/test_cases.txt', encoding='utf-8') as f:
    test_cases_raw = f.readlines()

In [52]:
# Peek at the first few raw lines to see the layout that has to be parsed.
test_cases_raw[:7]

['问：什么是我国第一部编年史著作？\n',
 '\n',
 '答：《左传》。\n',
 '\n',
 '资料：附：《左传》是我国第一部编年史著作。\n',
 '\n',
 '问：什么是我国第一部编年国别史？\n']

In [53]:
# Number of raw lines read from the file.
len(test_cases_raw)

245

In [54]:
def test_cases_preprocessing(raw_texts: str) -> Dict[str, str]:
    # combining into a single string, remove all "\n" in between
    texts_split = "".join(raw_texts[0::2]).split('问：')[1:]
    # separate by keyword 答, 资料
    texts_split = [re.split(r'\n答：|\n资料：', x) for x in texts_split]
    # remove all \xa0 in between
    texts_split = [[x.replace('\xa0', '') for x in sublist] for sublist in texts_split]
    # format into a dict with q, a, and ref keys
    test_cases = [{'q': x[0], 'a': x[1], 'ref': x[2:]} for x in texts_split]
    
    return test_cases

# Parse the raw lines into structured records and show the first few.
test_cases = test_cases_preprocessing(test_cases_raw)
test_cases[:3]

[{'q': '什么是我国第一部编年史著作？', 'a': '《左传》。', 'ref': ['附：《左传》是我国第一部编年史著作。\n']},
 {'q': '什么是我国第一部编年国别史？', 'a': '《国语》。', 'ref': ['附：《国语》是我国第一部编年国别史。\n']},
 {'q': '“寡人之于国也”下一句是什么？来自哪里？',
  'a': '“寡人之于国也”下一句是“尽心焉耳矣”。这个句子来自《孟子》。',
  'ref': ['梁惠王曰：“寡人之于国也，尽心焉耳矣。河内凶，则移其民于河东，移其粟于河内；河东凶亦然。察邻国之政，无如寡人之用心者。邻国之民不加少，寡人之民不加多，何也？”',
   '《寡人之于国也》（孟子）\n']}]

In [6]:
# Number of parsed test cases.
len(test_cases)

37

# Evaluate embedding models

* We want our `q` questions to be as close as possible to the `ref` referenced documents, because the `ref` entries are text chunks retrieved directly from the documents that will be put into the vector database.

* langchain_community.embeddings
    * from langchain_openai import OpenAIEmbeddings
    * SentenceTransformer registries
    * from langchain_community.embeddings import OllamaEmbeddings
    * from langchain_community.embeddings import LlamaCppEmbeddings
    * from langchain_community.embeddings import GPT4AllEmbeddings
    * from langchain_google_vertexai import VertexAIEmbeddings
    * from langchain_community.embeddings import CohereEmbeddings
    * from langchain_community.embeddings import QianfanEmbeddingsEndpoint

In [7]:
# # not free
# from langchain_openai import OpenAIEmbeddings
# embeddings = OpenAIEmbeddings()

In [8]:
from sentence_transformers import SentenceTransformer

In [9]:
# from transformers import file_utils
# print(file_utils.default_cache_path)

Model name | Provider | Model size (#params) | Model Size (disk) | Download past month | Highlights | Time Load/Inference | HF Link |
--|--|--|--|--|--|--|--|
intfloat/multilingual-e5-large | Microsoft | 560M | 2.2G | 93K |24 layers and the embedding size is 1024| 4.2s/55.2s |https://huggingface.co/intfloat/multilingual-e5-large|
intfloat/multilingual-e5-base| Microsoft | 278M | 1.1G | 42K |12 layers and the embedding size is 768| 2.7s/19.6s| https://huggingface.co/intfloat/multilingual-e5-base|
sentence-transformers/LaBSE | Google | | 1.9G | 88K | the embedding size is 768 | 4.7s/14.9s | https://huggingface.co/sentence-transformers/LaBSE|
maidalun1020/bce-embedding-base_v1 | NetEase-Youdao |  279M | XG | 111K | optimized for RAG | 2.7s/20.5s | https://huggingface.co/maidalun1020/bce-embedding-base_v1
BAAI/bge-large-zh-v1.5|Beijing Academy of Artificial Intelligence| 326M | 1.3G | 22K | | 1.4s/76.7s| https://huggingface.co/BAAI/bge-large-zh-v1.5#usage|
uer/sbert-base-chinese-nli| Tencent | | 409M  | 8K | 12 layers and the embedding size is 768 | 0.6s/25.0s | https://huggingface.co/uer/sbert-base-chinese-nli |
sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2| Sentence Transformer | | 449M | 38K | 384 embedding size | 1.4s/11.8s | https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 |
sentence-transformers/distiluse-base-multilingual-cased-v1 | Sentence Transformer | | 539M | 31K | 768 embedding size | 1.2s/7.1s | https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v1 |
sentence-transformers/distiluse-base-multilingual-cased-v2 | Sentence Transformer | | 539M | 43K | 768 embedding size | 1.2s/5.7s | https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2 |
sentence-transformers/paraphrase-multilingual-mpnet-base-v2 | Sentence Transformer | | 1.1G | 24K | 768 embedding size | 2.6s/11.0s | https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2 |



In [10]:
# Sanity-check GPU/Metal acceleration on Mac. No action is required here: it
# should already be enabled by the environment build.
# https://developer.apple.com/metal/pytorch/
# Expected output: tensor([1.], device='mps:0')

import torch

if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    # Allocate a one-element tensor on the MPS device and show it.
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

tensor([1.], device='mps:0')


In [11]:
# All embeddings supported by the sentence-transformers library:
# https://huggingface.co/models?library=sentence-transformers

# Cached model objects live in ~/.cache/torch/sentence_transformers

sentence_transformer_model_lists = [
    # Microsoft
    'intfloat/multilingual-e5-large',
    'intfloat/multilingual-e5-base',

    # Google
    'sentence-transformers/LaBSE',

    # Chinese companies
    'maidalun1020/bce-embedding-base_v1', # gated on HF, so it has to be downloaded directly through Git LFS: https://huggingface.co/maidalun1020/bce-embedding-base_v1/tree/main
    'BAAI/bge-large-zh-v1.5',
    'uer/sbert-base-chinese-nli',

    # Sentence Transformer native
    # https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
    'sentence-transformers/distiluse-base-multilingual-cased-v1',
    'sentence-transformers/distiluse-base-multilingual-cased-v2',
    'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
]

# # download the models
# for model_name in sentence_transformer_model_lists:
#     print(model_name)
#     _ = SentenceTransformer(model_name)

In [14]:
# Flat list of strings: every question first, then every answer.
docs = [case[field] for field in ('q', 'a') for case in test_cases]
len(docs)

74

In [15]:
import numpy as np

def profile_docs(docs: List[str]) -> None:
    """Log size statistics for a collection of documents.

    Emits INFO records through ``logger``: first the total character count
    together with the number of documents, then the min / mean / max
    characters per document.

    Args:
        docs: Strings to profile. May be empty, in which case only the
            totals record is logged.
    """
    # len() counts every character of a string, whatever its script.
    characters = [len(x) for x in docs]
    len_docs = len(docs)
    logger.info(f"total Chinese characters: {sum(characters)}, total docs: {len_docs}")
    if not characters:
        # min()/max() raise ValueError on an empty sequence and the mean is
        # undefined, so there is nothing more to report.
        return
    logger.info(f"Min/Mean/Max characters per doc: {min(characters)}, {np.mean(characters):.2f}, {max(characters)}")

# Log the size statistics of the question/answer strings collected in `docs`.
profile_docs(docs)

[32m2024-01-20 22:36:46.997[0m | [1mINFO    [0m | [36m__main__[0m:[36mprofile_docs[0m:[36m6[0m - [1mtotal Chinese characters: 3746, total docs: 74[0m
[32m2024-01-20 22:36:46.998[0m | [1mINFO    [0m | [36m__main__[0m:[36mprofile_docs[0m:[36m7[0m - [1mMin/Mean/Max characters per doc: 2, 50.62, 522[0m


In [12]:
# Directory holding the locally cached sentence-transformers models. Resolved
# from the current user's home directory rather than a hardcoded absolute
# path, so the notebook is not tied to one machine or account.
MODEL_DIR = os.path.expanduser('~/.cache/torch/sentence_transformers/')


def _get_model_path(model_dir, model_name):
    """Return the cache directory for a model, ending with a path separator.

    Args:
        model_dir: Root cache directory, with or without a trailing separator.
        model_name: Model identifier such as ``'org/name'``; every ``/`` is
            replaced by ``_`` to form the directory name.

    Returns:
        ``model_dir`` joined with the flattened model name, with a trailing
        separator.
    """
    # The empty last component makes os.path.join append the trailing separator.
    return os.path.join(model_dir, model_name.replace('/', '_'), '')

In [13]:
from loguru import logger
import time

def timer_embedding_model(model_name, sentences, model_dir) -> None:
    """Log how long a model takes to load and to encode ``sentences``.

    Two INFO records are emitted through ``logger`` (load time, then encode
    time), followed by a blank line on stdout. The embeddings themselves are
    discarded.

    Args:
        model_name: Model identifier, resolved to a local directory with
            ``_get_model_path``.
        sentences: Input passed unchanged to ``model.encode``.
        model_dir: Root directory of the locally cached models.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump if the system clock is adjusted.
    time_start = time.perf_counter()
    model = SentenceTransformer.load(_get_model_path(model_dir, model_name))
    time_end = time.perf_counter()
    logger.info(f'Time taken loading {model_name}: {time_end - time_start:.2f}s')

    time_start = time.perf_counter()
    model.encode(sentences)
    time_end = time.perf_counter()
    logger.info(f"Time taken for {model_name}: {time_end - time_start:.2f}s")

    # Blank line to visually separate consecutive models in the cell output.
    print()

In [16]:
# Time the loading and encoding of each embedding model on `docs`
# (the questions and answers from the test cases)

for model in sentence_transformer_model_lists:
    timer_embedding_model(model, docs, MODEL_DIR)

[32m2024-01-20 22:36:51.328[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m8[0m - [1mTime taken loading intfloat/multilingual-e5-large: 4.33s[0m
[32m2024-01-20 22:37:49.244[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m13[0m - [1mTime taken for intfloat/multilingual-e5-large: 57.91s[0m





[32m2024-01-20 22:37:52.141[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m8[0m - [1mTime taken loading intfloat/multilingual-e5-base: 2.70s[0m
[32m2024-01-20 22:38:11.407[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m13[0m - [1mTime taken for intfloat/multilingual-e5-base: 19.27s[0m





[32m2024-01-20 22:38:16.160[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m8[0m - [1mTime taken loading sentence-transformers/LaBSE: 4.59s[0m
[32m2024-01-20 22:38:31.408[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m13[0m - [1mTime taken for sentence-transformers/LaBSE: 15.25s[0m





[32m2024-01-20 22:38:34.163[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m8[0m - [1mTime taken loading maidalun1020/bce-embedding-base_v1: 2.68s[0m
[32m2024-01-20 22:38:54.369[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m13[0m - [1mTime taken for maidalun1020/bce-embedding-base_v1: 20.20s[0m





[32m2024-01-20 22:38:55.936[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m8[0m - [1mTime taken loading BAAI/bge-large-zh-v1.5: 1.41s[0m
[32m2024-01-20 22:40:13.134[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m13[0m - [1mTime taken for BAAI/bge-large-zh-v1.5: 77.20s[0m
No sentence-transformers model found with name /Users/fred/.cache/torch/sentence_transformers/uer_sbert-base-chinese-nli/. Creating a new one with MEAN pooling.





[32m2024-01-20 22:40:13.824[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m8[0m - [1mTime taken loading uer/sbert-base-chinese-nli: 0.66s[0m
[32m2024-01-20 22:40:38.920[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m13[0m - [1mTime taken for uer/sbert-base-chinese-nli: 25.10s[0m





[32m2024-01-20 22:40:40.298[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m8[0m - [1mTime taken loading sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2: 1.36s[0m
[32m2024-01-20 22:40:51.449[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m13[0m - [1mTime taken for sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2: 11.15s[0m





[32m2024-01-20 22:40:52.768[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m8[0m - [1mTime taken loading sentence-transformers/distiluse-base-multilingual-cased-v1: 1.16s[0m
[32m2024-01-20 22:40:58.399[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m13[0m - [1mTime taken for sentence-transformers/distiluse-base-multilingual-cased-v1: 5.63s[0m





[32m2024-01-20 22:40:59.591[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m8[0m - [1mTime taken loading sentence-transformers/distiluse-base-multilingual-cased-v2: 1.17s[0m
[32m2024-01-20 22:41:05.408[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m13[0m - [1mTime taken for sentence-transformers/distiluse-base-multilingual-cased-v2: 5.82s[0m





[32m2024-01-20 22:41:08.045[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m8[0m - [1mTime taken loading sentence-transformers/paraphrase-multilingual-mpnet-base-v2: 2.61s[0m
[32m2024-01-20 22:41:19.547[0m | [1mINFO    [0m | [36m__main__[0m:[36mtimer_embedding_model[0m:[36m13[0m - [1mTime taken for sentence-transformers/paraphrase-multilingual-mpnet-base-v2: 11.50s[0m





In [39]:
from sentence_transformers import util as st_utils

# Cosine similarity between each question and its reference passages, per model.
# `scores` maps model name -> list holding one numpy array per scored test case
# (one similarity value per reference). Skipped test cases leave no placeholder,
# so positions in that list do not line up with indices into `test_cases`.
scores = {}

for model_name in sentence_transformer_model_lists:
    model = SentenceTransformer.load(_get_model_path(MODEL_DIR, model_name))
    score_list = []

    # Only scoring is timed; model loading above is excluded. perf_counter()
    # is the monotonic clock intended for measuring durations.
    time_start = time.perf_counter()
    for qa in test_cases:

        query = qa['q']
        # Deliberately not named `docs`: that name holds the question/answer
        # corpus used by the timing benchmark and must not be overwritten.
        ref_docs = qa['ref']

        # skip if empty records found
        if query == "" or ref_docs == []:
            continue

        docs_embeddings = model.encode(ref_docs, convert_to_tensor=True)
        query_embedding = model.encode(query, convert_to_tensor=True)

        # cos_sim works on batches and returns a nested (2-D) result; row 0
        # holds the scores of our single query against every reference.
        cos_scores = st_utils.cos_sim(query_embedding, docs_embeddings)[0]
        # Copy to host memory first: .numpy() raises for tensors that live on
        # an accelerator device (e.g. MPS or CUDA). No-op for CPU tensors.
        score_list.append(cos_scores.cpu().numpy())

    time_end = time.perf_counter()
    logger.info(f'Time taken to score by {model_name}: {time_end-time_start:.2f}s')

    scores[model_name] = score_list

[32m2024-01-20 22:59:13.936[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m26[0m - [1mTime taken to score by intfloat/multilingual-e5-large: 58.66s[0m
