# Evaluate vector databases

In [1]:
from typing import Dict, List
from langchain_core.documents.base import Document

## Load evaluation dataset and docs

In [2]:
from langchain_community.document_loaders import NotionDBLoader
import tomllib

In [3]:
with open('../.tokens.toml', 'rb') as f:
    _TOKENS = tomllib.load(f)

with open('../.notion_databases.toml', 'rb') as f:
    _DATABASES_NOTION = tomllib.load(f)

In [4]:
def load_notion_dbs(dbs, id):
    loader = NotionDBLoader(
        integration_token=_TOKENS['notion'],
        database_id=dbs[id],
        request_timeout_sec=300,  # optional, defaults to 10
    )
    data = loader.load()
    return data

In [5]:
%%time
from concurrent.futures import ThreadPoolExecutor

te = ThreadPoolExecutor()
results = list(te.map(lambda x: load_notion_dbs(_DATABASES_NOTION, x), _DATABASES_NOTION.keys()))

CPU times: user 10.5 s, sys: 842 ms, total: 11.3 s
Wall time: 1min 44s


In [6]:
docs_from_notion: Dict[str, List[Document]] = dict(zip(_DATABASES_NOTION.keys(), results))

In [7]:
# optional pickle step so we don't need to query notionDB again
import pickle

with open('../data/notion_offline.pkl', 'wb') as f:
    pickle.dump(docs_from_notion, f)

In [5]:
import pickle
with open('../data/notion_offline.pkl', 'rb') as f:
    docs_from_notion = pickle.load(f)

In [6]:
docs_from_notion['写作'][0].metadata

{'date': {'start': '2013-10-26', 'end': '2013-10-26', 'time_zone': None},
 'name': '2013-OCT-26 师说',
 'tags': ['日常记趣'],
 'id': '273ea76f-a35c-474e-bfe0-41a3daae5c96'}

In [7]:
def add_source_property(docs_from_notion: Dict[str, List[Document]]) -> List[Document]:
    docs_list = list()
    
    for db_name, docs in docs_from_notion.items():
        for doc in docs:
            # because our data are gathered from multiple databases
            # we are going to throw the database names as one property
            # into the docs' metadata field
            # and return as a list
            doc.metadata['source'] = db_name

            # vector dbs don't allow complex metadata types like dict and list
            # chroma is explicity about this, faiss is implicit about it
            # but both "filter" arg assumes simple string match
            # we'll convert into flattened date
            if 'date' in doc.metadata:
                if 'start' in doc.metadata['date']:
                    doc.metadata['date_start'] = doc.metadata['date']['start']
                if 'end' in doc.metadata['date']:
                    doc.metadata['date_end'] = doc.metadata['date']['end']
                    del doc.metadata['date']

            if 'tags' in doc.metadata:
                doc.metadata['tags'] = ", ".join(doc.metadata['tags'])
                

        docs_list.extend(docs)
        
    return docs_list

In [None]:
# Let an automated process takes care the rest
from langchain_community.vectorstores.utils import filter_complex_metadata
filter_complex_metadata(docs_from_notion)

In [None]:
from langchain_community.vectorstores import FAISS, Chroma

In [19]:
help(FAISS.similarity_search_with_score)

Help on function similarity_search_with_score in module langchain_community.vectorstores.faiss:

similarity_search_with_score(self, query: 'str', k: 'int' = 4, filter: 'Optional[Dict[str, Any]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Tuple[Document, float]]'
    Return docs most similar to query.
    
    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
    
    Returns:
        List of documents most similar to the query text with
        L2 distance in float. Lower score represents more similarity.



In [18]:
help(Chroma.similarity_search_with_score)

Help on function similarity_search_with_score in module langchain_community.vectorstores.chroma:

similarity_search_with_score(self, query: 'str', k: 'int' = 4, filter: 'Optional[Dict[str, str]]' = None, where_document: 'Optional[Dict[str, str]]' = None, **kwargs: 'Any') -> 'List[Tuple[Document, float]]'
    Run similarity search with Chroma with distance.
    
    Args:
        query (str): Query text to search for.
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
    
    Returns:
        List[Tuple[Document, float]]: List of documents most similar to
        the query text and cosine distance in float for each.
        Lower score represents more similarity.



In [8]:
docs_from_notion = add_source_property(docs_from_notion)
docs_from_notion[0].metadata

{'date': {'start': '2013-10-26', 'end': '2013-10-26', 'time_zone': None},
 'name': '2013-OCT-26 师说',
 'tags': ['日常记趣'],
 'id': '273ea76f-a35c-474e-bfe0-41a3daae5c96'}

## Testing ideas

* use langchain.text_splitterRecursiveCharacterTextSplitter
* storage: test 2 vector databses: chroma & faiss
* retriever:
  * `Self-querying retriever` --  use an LLM to construct new queries that can question the structured data/metadata of the document
  * `MultiQueryRetriever` -- allow an LLM to paraphrase the query to get hopefully a diverse set of docs
  * `Contextual compression` -- use an LLM to pre-filter and compress the docs retrieved before feeding the contexts to another LLM to answer
  * https://python.langchain.com/docs/modules/data_connection/retrievers/
* retrieval methods: cos/dot; llm-aided; MMR (Maximum marginal relevance)

In [9]:
# Presumably docs in NotionDB fits more with MarkdownHeaderTextSplitter
# however, most of the documents in my personal databases don't have such header-text structure
# and they are not important for my use cases (I won't ask it to reason on a specific section 
# in a doc). Thus I'll use the regular RecursiveCharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
chunk_params = {
    # 1 Chinese characters = 2 english character
    # most paragraphs/sections are within 500 Chinese words/chars
    'chunk_size': 1000, 
    'chunk_overlap': 250,
}

rc_splitter = RecursiveCharacterTextSplitter(**chunk_params)

splits = rc_splitter.split_documents(docs_from_notion)
len(docs_from_notion), len(splits)

(224, 1136)

In [12]:
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=_TOKENS['huggingface'], 
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v1"
)

In [36]:
%%time
vs_chroma = Chroma.from_documents(
    documents=splits, 
    embedding=embeddings, 
    persist_directory='../database/vs_chroma'
)

CPU times: user 1.36 s, sys: 111 ms, total: 1.48 s
Wall time: 2.96 s


In [13]:
%%time

# faiss is taking a lazy operation here to ingest document
vs_faiss = FAISS.from_documents(
    documents=splits, 
    embedding=embeddings, 
)

vs_faiss.save_local('../database/vs_faiss')

CPU times: user 273 ms, sys: 60.6 ms, total: 333 ms
Wall time: 1.8 s


In [52]:
# both don't allow fuzzy matches on filter
# have to be 100% match
vs_faiss.similarity_search_with_score(
    '谁说过陌生贵己？', 
    filter={
        'author': '冯友兰',
    }
)

vs_chroma.similarity_search_with_score(
    '谁说过陌生贵己？', 
    filter={
        'author': '冯友兰',
    }
)

[]

In [14]:
# faiss applies filter after semantic search
# it also has an additional fetch_k arg for senmantic search
vs_faiss.similarity_search_with_score(
    '谁说过陌生贵己？', 
    filter={
        'author': '【中】冯友兰',
    },
    k=2,
)

[(Document(page_content='为我，轻物重生\n《孟子》中说：“杨子取为我，拔一毛而利天下，不为也。”；《吕氏春秋》中说：“陌生贵己。”；《淮南子》中写：“全性保真，不以物累形：杨子所立也。”这些是同时代的著作中对杨朱思想的记录和反映\n在道家更后期的《老子》和《庄子》中也有相同的体现。《老子》中写到：“名与身：孰亲？身与货：孰多？”《庄子》中写到：“山木自寇也。膏火自煎也，桂可食，故伐之。漆可用，故割之。”\n无用是全生的方法。善于全生的人，一定不能多为恶，但也一定不能多为善。他一定要生活在善恶之间，力求无用。到头来，无用却对于他有大用\n从为我到无我：先秦道家发展三阶段\n先秦道家都是为我的，但是随着思考的深入，后来的发展使这种为我走向反面，取消了它自身\n第一阶段杨朱，出发点是全生避害\n第二阶段老子，开始企图揭示宇宙事物变化的规律。一个人如果懂得了这些规律，并且遵循规律而调整行动，那么他就能够使事物转向对他有利的方向。但即便一个人懂得自然规律，预料之外的因素仍然会发挥作用，并带来可能的危害（“吾所以有大患者，为吾有身，及吾无身，吾有何患！”）\n第三阶段庄子，因为没有办法避免受到外界事物的影响，庄子转而从一种更高的观点看待事物，产生”齐生死，一物我“的理论\n孟子：儒家的理想主义派\n人性本善', metadata={'author': '【中】冯友兰', 'date': {'start': '2023-05-06', 'end': None, 'time_zone': None}, 'tags': ['人文'], 'name': '中国哲学简史', 'id': '0419517a-59be-47a2-a4b9-bb6f21630614'}),
  1.0759109)]

In [63]:
# chroma applies filter before semantic sesarch
vs_chroma.similarity_search_with_score(
    '谁说过陌生贵己？', 
    filter={
        'author': '【中】冯友兰',
    },
    k=2,
)

[(Document(page_content='为我，轻物重生\n《孟子》中说：“杨子取为我，拔一毛而利天下，不为也。”；《吕氏春秋》中说：“陌生贵己。”；《淮南子》中写：“全性保真，不以物累形：杨子所立也。”这些是同时代的著作中对杨朱思想的记录和反映\n在道家更后期的《老子》和《庄子》中也有相同的体现。《老子》中写到：“名与身：孰亲？身与货：孰多？”《庄子》中写到：“山木自寇也。膏火自煎也，桂可食，故伐之。漆可用，故割之。”\n无用是全生的方法。善于全生的人，一定不能多为恶，但也一定不能多为善。他一定要生活在善恶之间，力求无用。到头来，无用却对于他有大用\n从为我到无我：先秦道家发展三阶段\n先秦道家都是为我的，但是随着思考的深入，后来的发展使这种为我走向反面，取消了它自身\n第一阶段杨朱，出发点是全生避害\n第二阶段老子，开始企图揭示宇宙事物变化的规律。一个人如果懂得了这些规律，并且遵循规律而调整行动，那么他就能够使事物转向对他有利的方向。但即便一个人懂得自然规律，预料之外的因素仍然会发挥作用，并带来可能的危害（“吾所以有大患者，为吾有身，及吾无身，吾有何患！”）\n第三阶段庄子，因为没有办法避免受到外界事物的影响，庄子转而从一种更高的观点看待事物，产生”齐生死，一物我“的理论\n孟子：儒家的理想主义派\n人性本善', metadata={'author': '【中】冯友兰', 'date_start': '2023-05-06', 'id': '0419517a-59be-47a2-a4b9-bb6f21630614', 'name': '中国哲学简史', 'source': '读书笔记（文学）', 'tags': '人文'}),
  1.0759108066558838),
 (Document(page_content='忠与恕\n实行仁的具体方法就是推己及人（“己欲立而立人，己欲达而达人……可谓仁之方也”）\n“忠”是推己及人的肯定方面；“恕”是其否定方面\n忠恕之道是人的道德生活的开端和终结，实行忠恕就是行仁（“夫子之道，忠恕而已矣”）\n知命\n“无所为而为”：儒家认为，一个人不可能“无为”，因为每个人都有他应该做的事。然而他做这些事都是“无所为”，因为做这些事的价值在于做的本身之内，而不是在于外在的结果\n知命是承认世界本来存在的必然性，这样，对

In [None]:
# TODO: use SelfQueryRetriever to allow in metadata context
# https://python.langchain.com/docs/modules/data_connection/retrievers/self_query/
retriever = vectorstore.as_retriever()