# Evaluate vector databases

In [9]:
from typing import Dict, List
from langchain_core.documents.base import Document

## Load evaluation dataset and docs

In [10]:
from langchain_community.document_loaders import NotionDBLoader
import tomllib

In [11]:
with open('../.tokens.toml', 'rb') as f:
    _TOKENS = tomllib.load(f)

with open('../.notion_databases.toml', 'rb') as f:
    _DATABASES_NOTION = tomllib.load(f)

In [12]:
# def load_notion_dbs(dbs, id):
#     loader = NotionDBLoader(
#         integration_token=_TOKENS['notion'],
#         database_id=dbs[id],
#         request_timeout_sec=300,  # optional, defaults to 10
#     )
#     data = loader.load()
#     return data

In [13]:
# %%time
# from concurrent.futures import ThreadPoolExecutor

# te = ThreadPoolExecutor()
# results = list(te.map(lambda x: load_notion_dbs(_DATABASES_NOTION, x), _DATABASES_NOTION.keys()))

# docs_from_notion: Dict[str, List[Document]] = dict(zip(_DATABASES_NOTION.keys(), results))

# # optional pickle step so we don't need to query notionDB again
# import pickle

# with open('../data/notion_offline.pkl', 'wb') as f:
#     pickle.dump(docs_from_notion, f)

In [14]:
import pickle
with open('../data/notion_offline.pkl', 'rb') as f:
    docs_from_notion = pickle.load(f)

In [15]:
docs_from_notion['写作'][0].metadata

{'date': {'start': '2013-10-26', 'end': '2013-10-26', 'time_zone': None},
 'name': '2013-OCT-26 师说',
 'tags': ['日常记趣'],
 'id': '273ea76f-a35c-474e-bfe0-41a3daae5c96'}

In [16]:
def add_source_property(docs_from_notion: Dict[str, List[Document]]) -> List[Document]:
    docs_list = list()
    
    for db_name, docs in docs_from_notion.items():
        for doc in docs:
            # because our data are gathered from multiple databases
            # we are going to throw the database names as one property
            # into the docs' metadata field
            # and return as a list
            doc.metadata['source'] = db_name

            # vector dbs don't allow complex metadata types like dict and list
            # chroma is explicity about this, faiss is implicit about it
            # but both "filter" arg assumes simple string match
            # we'll convert into flattened date
            if 'date' in doc.metadata:
                if 'start' in doc.metadata['date']:
                    doc.metadata['date_start'] = doc.metadata['date']['start']
                if 'end' in doc.metadata['date']:
                    doc.metadata['date_end'] = doc.metadata['date']['end']
                    del doc.metadata['date']

            if 'tags' in doc.metadata:
                doc.metadata['tags'] = ", ".join(doc.metadata['tags'])
                

        docs_list.extend(docs)
        
    return docs_list

In [17]:
docs_from_notion = add_source_property(docs_from_notion)
docs_from_notion[0].metadata

{'name': '2013-OCT-26 师说',
 'tags': '日常记趣',
 'id': '273ea76f-a35c-474e-bfe0-41a3daae5c96',
 'source': '写作',
 'date_start': '2013-10-26',
 'date_end': '2013-10-26'}

In [18]:
# Let an automated process takes care the rest
from langchain_community.vectorstores.utils import filter_complex_metadata
docs_from_notion = filter_complex_metadata(docs_from_notion)

In [20]:
from langchain_community.vectorstores import Redis

## Testing ideas

* use langchain.text_splitterRecursiveCharacterTextSplitter
* storage: test 2 vector databses: Redis, Superbase

In [21]:
# Presumably docs in NotionDB fits more with MarkdownHeaderTextSplitter
# however, most of the documents in my personal databases don't have such header-text structure
# and they are not important for my use cases (I won't ask it to reason on a specific section 
# in a doc). Thus I'll use the regular RecursiveCharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [22]:
chunk_params = {
    # 1 Chinese characters = 2 english character
    # most paragraphs/sections are within 500 Chinese words/chars
    'chunk_size': 1000, 
    'chunk_overlap': 250,
}

rc_splitter = RecursiveCharacterTextSplitter(**chunk_params)

splits = rc_splitter.split_documents(docs_from_notion)
len(docs_from_notion), len(splits)

(225, 1136)

In [23]:
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=_TOKENS['huggingface'], 
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v1"
)

In [32]:
metadata_set = set()

for x in [x.metadata for x in docs_from_notion]:
    metadata_set = metadata_set.union(list(x.keys()))

metadata_set

{'author', 'date_end', 'date_start', 'id', 'name', 'source', 'tags'}

In [None]:
!redis-stack-server

In [33]:
%%time

index_schema = {
    # "tag": [{"name": "genre"}],
    "text": [
        {"name": "author"},
        {"name": "date_end"},
        {"name": "date_start"},
        {"name": "id"},
        {"name": "name"},
        {"name": "source"},
        {"name": "tags"},
    ],
}

vectorstore = Redis.from_documents(
    documents=docs_from_notion,
    embedding=embeddings,
    redis_url="redis://localhost:6379",
    index_name="notiondb",
    index_schema=index_schema,
)

`index_schema` does not match generated metadata schema.
If you meant to manually override the schema, please ignore this message.
index_schema: {'text': [{'name': 'author'}, {'name': 'date_end'}, {'name': 'date_start'}, {'name': 'id'}, {'name': 'name'}, {'name': 'source'}, {'name': 'tags'}]}
generated_schema: {'text': [{'name': 'name'}, {'name': 'tags'}, {'name': 'id'}, {'name': 'source'}, {'name': 'date_start'}, {'name': 'date_end'}], 'numeric': [], 'tag': []}



CPU times: user 112 ms, sys: 21.6 ms, total: 134 ms
Wall time: 1.11 s


In [35]:
!pip show langchain

Name: langchain
Version: 0.1.7
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Users/fred/micromamba/envs/my-notion-companion/lib/python3.12/site-packages
Requires: aiohttp, dataclasses-json, jsonpatch, langchain-community, langchain-core, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


In [37]:
# the 'filter' keyword assumes exact match
f = {'author': '*冯友兰*',}

vectorstore.similarity_search_with_score(
    '谁说过陌生贵己？', 
    # filter=f
)

Metadata key date_end not found in metadata. Setting to None. 
Metadata fields defined for this instance: ['author', 'date_end', 'date_start', 'id', 'name', 'source', 'tags']
Metadata key date_end not found in metadata. Setting to None. 
Metadata fields defined for this instance: ['author', 'date_end', 'date_start', 'id', 'name', 'source', 'tags']
Metadata key author not found in metadata. Setting to None. 
Metadata fields defined for this instance: ['author', 'date_end', 'date_start', 'id', 'name', 'source', 'tags']
Metadata key author not found in metadata. Setting to None. 
Metadata fields defined for this instance: ['author', 'date_end', 'date_start', 'id', 'name', 'source', 'tags']


[(Document(page_content='第一次阅读：2019.05.19 杨伯峻《论语译注》\n第二次阅读：2023.12.01 罗晓晖《论语译释》\n\n\n学而第一\n\n1.1 子曰：“学而时习之，不亦说乎？有朋自远方来，不亦乐乎？人不知而不愠，不亦君子乎？\n【”说“和“乐”的区别——《说文》：“说，释也。”因此”说“有”说开“、”开解“的意思，”说“通”悦“时，表示心情舒张。而”乐“是乐器，因此这种情绪是被音乐激发的、感动于物的。】\n\n1.2 有子曰：“其为人也孝弟，而好犯上者，鲜矣；不好犯上，而好作乱者，未之有也。君子务本，本立而道生。孝弟也者，其为仁之本与！\n\n1.3\xa0子曰：“巧言令色，鲜矣仁！”\n\n1.4 曾子曰：“吾日三省吾身：为人谋而不忠乎？与朋友交而不信乎？传不习乎？“\n【曾子对修身的理解谈到了品德和学习。而孔子论学（1.1）的站位更高，指向生命的愉悦和不假外求的自我圆满（“人不知而不愠”）】\n\n1.6 子曰：“弟子入则孝，出则弟，谨而信，泛爱众，而亲仁。行有余力，则以学文。”\n\n1.7 子夏曰：“贤贤易色；事父母，能竭其力；事君，能致其身；与朋友交，言而有信。虽曰未学，吾必谓之学矣。”\n\n1.8 子曰：“君子不重则不威，学则不固。主忠信，无友不如己者，过则勿惮改。”\n【不如己者：与自己志趣不同的人，而不应理解为不如自己的人。这五句话没有什么联系，可能是孔子在不同场合说的。】\n\n1.10 子禽问于子贡曰：“夫子至于是邦也，必闻其政，求之与，抑与之与？”子贡曰：“夫子温、良、恭、俭、让以得之。夫子之求之也，其诸异乎人之求之与？”\n\n1.12 有子曰：“礼之用，和为贵。先王之道，斯为美，小大由之。有所不行，知和而和，不以礼节之，亦不可行也。”\n\n1.14 子曰：“君子食无求饱，居无求安，敏于事而慎于言，就有道而正焉。可谓好学也已。”\n\n1.15 子贡曰：“贫而无谄，富而无骄，何如？”子曰：“可也。未若贫而乐，富而好礼者也。”子贡曰：“《诗》云：‘如切如磋，如琢如磨’，其斯之谓与？”子曰：“赐也，始可与言《诗》已矣，告诸往而知来者。”\n【“贫而无谄，富而无骄”属于自我节制，其中还有贫富的差异；“贫而乐，富而好礼”则是建设性的精神发展，这种人对贫富不以为意。从前者到后者就需要提升境界，向加工

In [28]:
# faiss applies filter after semantic search
# it also has an additional fetch_k arg for senmantic search
vectorstore.similarity_search_with_score(
    '谁说过陌生贵己？', 
    filter={
        'author': '【中】冯友兰',
    },
    k=2,
)

ValueError: Query failed with syntax error. This is likely due to malformation of filter, vector, or query argument

In [None]:
# TODO: use SelfQueryRetriever to allow in metadata context
# https://python.langchain.com/docs/modules/data_connection/retrievers/self_query/
retriever = vectorstore.as_retriever()