# Evaluate vector databases

In [1]:
from typing import Dict, List
from langchain_core.documents.base import Document

## Load evaluation dataset and docs

In [2]:
from langchain_community.document_loaders import NotionDBLoader
import tomllib

In [3]:
# Load local settings from TOML files one directory above the notebook.
# tomllib.load() needs the file opened in binary mode, hence 'rb'.

# API tokens looked up by service name (this notebook reads the 'huggingface' entry).
with open('../.tokens.toml', 'rb') as f:
    _TOKENS = tomllib.load(f)

# Notion databases to ingest — presumably a mapping of database name -> database id; verify.
with open('../.notion_databases.toml', 'rb') as f:
    _DATABASES_NOTION = tomllib.load(f)

# General settings; later cells read 'redis_url', 'index_name' and 'redis_schema' from it.
with open('../.config.toml', 'rb') as f:
    _CONFIGS = tomllib.load(f)


In [4]:
# def load_notion_dbs(dbs, id):
#     loader = NotionDBLoader(
#         integration_token=_TOKENS['notion'],
#         database_id=dbs[id],
#         request_timeout_sec=300,  # optional, defaults to 10
#     )
#     data = loader.load()
#     return data

In [5]:
# %%time
# from concurrent.futures import ThreadPoolExecutor

# te = ThreadPoolExecutor()
# results = list(te.map(lambda x: load_notion_dbs(_DATABASES_NOTION, x), _DATABASES_NOTION.keys()))

# docs_from_notion: Dict[str, List[Document]] = dict(zip(_DATABASES_NOTION.keys(), results))

# # optional pickle step so we don't need to query notionDB again
# import pickle

# with open('../data/notion_offline.pkl', 'wb') as f:
#     pickle.dump(docs_from_notion, f)

In [6]:
# Load the cached Notion documents from disk instead of querying Notion again.
# NOTE: pickle.load() can execute arbitrary code — only load a file you generated yourself.
import pickle
with open('../data/notion_offline.pkl', 'rb') as f:
    docs_from_notion = pickle.load(f)

In [7]:
# Peek at the raw metadata of the first document in the '写作' database.
docs_from_notion['写作'][0].metadata

{'date': {'start': '2013-10-26', 'end': '2013-10-26', 'time_zone': None},
 'name': '2013-OCT-26 师说',
 'tags': ['日常记趣'],
 'id': '273ea76f-a35c-474e-bfe0-41a3daae5c96'}

In [8]:
def _notion_date_to_int(value):
    """Convert a Notion ISO date/datetime string into a YYYYMMDD integer.

    Only the leading ``YYYY-MM-DD`` part is used, so datetime values such as
    ``2021-08-28T10:00:00.000+08:00`` are accepted as well.
    Returns ``None`` when the value is missing or empty.
    """
    if not value:
        return None
    return int(value[:10].replace("-", ""))


def process_property(docs_from_notion: Dict[str, List[Document]]) -> List[Document]:
    """Flatten per-database documents into one list and normalize their metadata.

    The documents are modified in place and the same objects are returned.

    For every document:
      * ``source`` is set to the name of the database it came from, because
        the data is gathered from multiple databases;
      * a ``date`` mapping is replaced by ``date_start`` / ``date_end`` integers
        in YYYYMMDD form, which allows GT/LT/EQ comparison. A bound that is
        missing or empty is simply omitted, and a ``date`` that is not a
        mapping (e.g. ``None``) is dropped;
      * a ``tags`` list is joined into one comma-separated string. A value
        that is already a string is left alone, so running the function twice
        on the same documents does not corrupt the tags.

    Args:
        docs_from_notion: mapping of database name -> documents of that database.

    Returns:
        All documents, in database order, as a single list.
    """
    docs_list = []

    for db_name, docs in docs_from_notion.items():
        for doc in docs:
            metadata = doc.metadata
            metadata['source'] = db_name

            if 'date' in metadata:
                date = metadata.pop('date')
                if isinstance(date, dict):
                    for bound in ('start', 'end'):
                        as_int = _notion_date_to_int(date.get(bound))
                        if as_int is not None:
                            metadata[f'date_{bound}'] = as_int

            # Only join real sequences: joining a str would split it into characters.
            tags = metadata.get('tags')
            if isinstance(tags, (list, tuple)):
                metadata['tags'] = ", ".join(str(tag) for tag in tags)

        docs_list.extend(docs)

    return docs_list

In [9]:
# Flatten the per-database dict into a single list of documents with normalized metadata.
# NOTE(review): this rebinds `docs_from_notion` from a dict to a list, so re-running this cell
# without first re-running the pickle-loading cell fails (process_property calls `.items()`).
docs_from_notion = process_property(docs_from_notion)
docs_from_notion[0].metadata

{'name': '2013-OCT-26 师说',
 'tags': '日常记趣',
 'id': '273ea76f-a35c-474e-bfe0-41a3daae5c96',
 'source': '写作',
 'date_start': 20131026,
 'date_end': 20131026}

In [10]:
# Let an automated process take care of the rest.
# presumably this drops metadata values the vector store cannot hold — verify against the langchain docs
from langchain_community.vectorstores.utils import filter_complex_metadata
docs_from_notion = filter_complex_metadata(docs_from_notion)

In [11]:
from langchain_community.vectorstores import Redis

## Testing ideas

* use langchain.text_splitter.RecursiveCharacterTextSplitter
* storage: test 2 vector databases: Redis, Supabase

In [12]:
# Docs from NotionDB would presumably be a better fit for MarkdownHeaderTextSplitter.
# However, most of the documents in my personal databases don't have such a header-text structure,
# and headers are not important for my use cases (I won't ask it to reason on a specific section
# in a doc). Thus I'll use the regular RecursiveCharacterTextSplitter.
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [13]:
# Splitter settings, kept in a dict so they are passed to the splitter in one place.
chunk_params = {
    # Sizing assumption: 1 Chinese character = 2 English characters, and
    # most paragraphs/sections are within 500 Chinese words/chars.
    # NOTE(review): as far as I can tell the splitter measures length with len() by default,
    # which counts a Chinese character as 1 — so chunk_size=1000 would allow ~1000 Chinese
    # characters per chunk rather than ~500. Confirm the intended size.
    'chunk_size': 1000, 
    'chunk_overlap': 250,
}

rc_splitter = RecursiveCharacterTextSplitter(**chunk_params)

# Split every document into chunks, then compare the document count with the chunk count.
splits = rc_splitter.split_documents(docs_from_notion)
len(docs_from_notion), len(splits)

(225, 1136)

In [14]:
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# Embeddings client for the HuggingFace Inference API, authenticated with the
# 'huggingface' token and using a multilingual sentence-transformers model.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=_TOKENS['huggingface'], 
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v1"
)

In [15]:
# Union of every metadata key that appears on at least one document.
metadata_set = set().union(*(doc.metadata.keys() for doc in docs_from_notion))

metadata_set

{'author', 'date_end', 'date_start', 'id', 'name', 'source', 'tags'}

In [16]:
# Make sure the schema in the _CONFIGS file declares exactly the metadata keys found
# in the documents (text + numeric fields); on mismatch, report which names differ.
schema_fields = {
    field['name']
    for kind in ('text', 'numeric')
    for field in _CONFIGS['redis_schema'][kind]
}
assert metadata_set == schema_fields, (
    "redis_schema is out of sync with the document metadata: "
    f"missing from schema={sorted(metadata_set - schema_fields)}, "
    f"not present in documents={sorted(schema_fields - metadata_set)}"
)

In [17]:
# Re-check the first document's metadata after filter_complex_metadata has run.
docs_from_notion[0].metadata

{'name': '2013-OCT-26 师说',
 'tags': '日常记趣',
 'id': '273ea76f-a35c-474e-bfe0-41a3daae5c96',
 'source': '写作',
 'date_start': 20131026,
 'date_end': 20131026}

In [17]:
# NOTE(review): this appears to run the server in the foreground, so the cell would block
# until interrupted and stall "Run All" — consider starting the server outside the notebook; confirm.
!redis-stack-server # init redis-stack server

Starting redis-stack-server, database path /opt/homebrew/var/db/redis-stack
2581:C 18 Feb 2024 16:57:47.406 * oO0OoO0OoO0Oo Redis is starting oO0OoO0OoO0Oo
2581:C 18 Feb 2024 16:57:47.406 * Redis version=7.2.4, bits=64, commit=d2c8a4b9, modified=0, pid=2581, just started
2581:C 18 Feb 2024 16:57:47.406 * Configuration loaded
2581:M 18 Feb 2024 16:57:47.406 * Increased maximum number of open files to 10032 (it was originally set to 4864).
2581:M 18 Feb 2024 16:57:47.406 * monotonic clock: POSIX clock_gettime
                _._                                                  
           _.-``__ ''-._                                             
      _.-``    `.  `_.  ''-._           Redis 7.2.4 (d2c8a4b9/0) 64 bit
  .-`` .-```.  ```\/    _.,_ ''-._                                  
 (    '      ,       .-`  | `,    )     Running in standalone mode
 |`-._`-...-` __...-.``-._|'` _.-'|     Port: 6379
 |    `-._   `._    /     _.-'    |     PID: 2581
  `-._    `-._  `-./  _.-'    _.-'    

In [18]:
# Drop the configured index together with its stored documents (delete_documents=True),
# presumably so the load that follows starts from an empty index.
Redis.drop_index(
    redis_url=_CONFIGS['redis_url'],
    index_name=_CONFIGS['index_name'], 
    delete_documents=True
)

True

In [19]:
%%time

# Redis supports a "tag" field type alongside "text" and "numeric".
# At first glance it looks like a better match for the "tags" property,
# but we classify it as "text" anyway, to stay consistent with
# how the downstream self-query writes filter queries.
# ref: https://redis.io/docs/interact/search-and-query/advanced-concepts/tags

# Embed the chunks and store them in the configured index, using the schema from the config file.
vectorstore = Redis.from_documents(
    documents=splits,
    embedding=embeddings,
    redis_url=_CONFIGS['redis_url'],
    index_name=_CONFIGS['index_name'],
    index_schema=_CONFIGS['redis_schema'],
)

`index_schema` does not match generated metadata schema.
If you meant to manually override the schema, please ignore this message.
index_schema: {'text': [{'name': 'author'}, {'name': 'id'}, {'name': 'name'}, {'name': 'source'}, {'name': 'tags'}], 'numeric': [{'name': 'date_start'}, {'name': 'date_end'}]}
generated_schema: {'text': [{'name': 'name'}, {'name': 'tags'}, {'name': 'id'}, {'name': 'source'}], 'numeric': [{'name': 'date_start'}, {'name': 'date_end'}], 'tag': []}



CPU times: user 295 ms, sys: 55 ms, total: 350 ms
Wall time: 1min 4s


In [20]:
!rvl index info -i notiondb



Index Information:
╭──────────────┬────────────────┬──────────────────┬─────────────────┬────────────╮
│ Index Name   │ Storage Type   │ Prefixes         │ Index Options   │   Indexing │
├──────────────┼────────────────┼──────────────────┼─────────────────┼────────────┤
│ notiondb     │ HASH           │ ['doc:notiondb'] │ []              │          0 │
╰──────────────┴────────────────┴──────────────────┴─────────────────┴────────────╯
Index Fields:
╭────────────────┬────────────────┬─────────┬────────────────┬────────────────┬────────────────┬────────────────┬────────────────┬────────────────┬─────────────────┬────────────────╮
│ Name           │ Attribute      │ Type    │ Field Option   │ Option Value   │ Field Option   │ Option Value   │ Field Option   │   Option Value │ Field Option    │ Option Value   │
├────────────────┼────────────────┼─────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼─────────────────┼──────────────

In [22]:
# Imported from langchain_community, the same package the other vector-store imports
# in this notebook come from.
from langchain_community.vectorstores.redis import RedisFilter, RedisNum, RedisText

# Filters are built by applying operators to RedisText / RedisNum; for the comparator logic see
# ref: https://github.com/langchain-ai/langchain/blob/d7c26c89b2d4f5ff676ba7c3ad4f9075d50a8ab7/libs/community/langchain_community/vectorstores/redis/filters.py#L313

# Restrict the search to documents whose "author" matches the wildcard pattern
# (`%` is taken here to be the pattern-match operator — see the reference above).
f = RedisText("author") % "*冯友兰*"
vectorstore.similarity_search_with_score('谁说过陌生贵己？', filter=f, k=3)

Metadata key date_end not found in metadata. Setting to None. 
Metadata fields defined for this instance: ['author', 'id', 'name', 'source', 'tags', 'date_start', 'date_end']
Metadata key date_end not found in metadata. Setting to None. 
Metadata fields defined for this instance: ['author', 'id', 'name', 'source', 'tags', 'date_start', 'date_end']
Metadata key date_end not found in metadata. Setting to None. 
Metadata fields defined for this instance: ['author', 'id', 'name', 'source', 'tags', 'date_start', 'date_end']


[(Document(page_content='为我，轻物重生\n《孟子》中说：“杨子取为我，拔一毛而利天下，不为也。”；《吕氏春秋》中说：“陌生贵己。”；《淮南子》中写：“全性保真，不以物累形：杨子所立也。”这些是同时代的著作中对杨朱思想的记录和反映\n在道家更后期的《老子》和《庄子》中也有相同的体现。《老子》中写到：“名与身：孰亲？身与货：孰多？”《庄子》中写到：“山木自寇也。膏火自煎也，桂可食，故伐之。漆可用，故割之。”\n无用是全生的方法。善于全生的人，一定不能多为恶，但也一定不能多为善。他一定要生活在善恶之间，力求无用。到头来，无用却对于他有大用\n从为我到无我：先秦道家发展三阶段\n先秦道家都是为我的，但是随着思考的深入，后来的发展使这种为我走向反面，取消了它自身\n第一阶段杨朱，出发点是全生避害\n第二阶段老子，开始企图揭示宇宙事物变化的规律。一个人如果懂得了这些规律，并且遵循规律而调整行动，那么他就能够使事物转向对他有利的方向。但即便一个人懂得自然规律，预料之外的因素仍然会发挥作用，并带来可能的危害（“吾所以有大患者，为吾有身，及吾无身，吾有何患！”）\n第三阶段庄子，因为没有办法避免受到外界事物的影响，庄子转而从一种更高的观点看待事物，产生”齐生死，一物我“的理论\n孟子：儒家的理想主义派\n人性本善', metadata={'id': 'doc:notiondb:85ff8b7811ba4274a12a5033fd705ed6', 'author': '【中】冯友兰', 'name': '中国哲学简史', 'source': '读书笔记（文学）', 'tags': '人文', 'date_start': '20230506', 'date_end': None}),
  0.703),
 (Document(page_content='忠与恕\n实行仁的具体方法就是推己及人（“己欲立而立人，己欲达而达人……可谓仁之方也”）\n“忠”是推己及人的肯定方面；“恕”是其否定方面\n忠恕之道是人的道德生活的开端和终结，实行忠恕就是行仁（“夫子之道，忠恕而已矣”）\n知命\n“无所为而为”：儒家认为，一个人不可能“无为”，因为每个人都有他应该做的事。然而他做这些事都是“无所为”，因为做这些事的价值在于做的本身之内，而不是在于外在的结果\n知命是承认世界本

In [23]:
# Same kind of search, this time filtering on the "name" field with a wildcard pattern.
f = RedisText("name") % "*三岛由纪夫*"
vectorstore.similarity_search_with_score('清显与本多', filter=f, k=3)

Metadata key date_end not found in metadata. Setting to None. 
Metadata fields defined for this instance: ['author', 'id', 'name', 'source', 'tags', 'date_start', 'date_end']
Metadata key date_end not found in metadata. Setting to None. 
Metadata fields defined for this instance: ['author', 'id', 'name', 'source', 'tags', 'date_start', 'date_end']
Metadata key date_end not found in metadata. Setting to None. 
Metadata fields defined for this instance: ['author', 'id', 'name', 'source', 'tags', 'date_start', 'date_end']


[(Document(page_content='2\n清显有个不好的倾向，那就是他轻蔑爱慕自己的人，起止轻蔑，甚至近于冷酷。这一点，本多早就察觉出来了。他的友人当中再没有谁比本多更敏锐的了。本多估计，这种倨傲，就是从清显十三岁那年知道别人对自己的俊美喝彩以后，从心底里悄悄地培育起来的好像霉菌一样的感情。\n那是一朵银白色的霉菌花，一碰，仿佛就会响起铃声来。\n\n\n3\n本多突然从正面问道：“松枝，你近来怎么搞的？我说什么，你都心不在焉。”\n“哪儿的话。”清显猝不及防含糊其辞地回答了一句。他用美丽而明亮的眼睛望了望他的朋友。被友人知道自己傲慢倒并不难为情，最怕就是被他了解自己的苦恼。\n清显知道，这时他若敞开胸怀，本多就会鲁莽地闯进自己的心房，这种作为无论是谁都绝对不能容许的。这么一来，清显很可能转瞬间就会失去唯一的挚友。\n但是，这时候本多马上理解了清显的内心活动。他知道要继续维持他同清显的友谊，就必须舍弃卑俗的关系，不应该一不留神就触摸刚涂上油漆的墙而留下手痕。必要时，连友人的死苦也必须视而不见。\n特别是，倘若这是一种特殊的死苦，通过隐藏已然达成优雅的话。\n\n\n4\n“因为神圣的东西全部是由与梦和回忆相同的要素形成的，会使由于时间和空间的关系而与我们相隔的东西奇迹般地呈现在我们眼前。而且，这三种东西的共同的特点是：无论哪一种都是用手触摸不到的。手触摸得到的东西，一旦离开它一步，它就可能变成神圣的东西，变成奇迹，变成不可能有的美的东西。一切事物都具有其神圣性，可是我们的手指触摸了它，它就变成污浊的了。我们人类真是个不可思议的存在啊。一方面玷污手指触摸得到的所有东西，一方面自己内里又偏偏具备能成为神圣的东西的素质。”\n\n\n5\n现实不同于梦境，是多么缺乏可塑性的素材啊。现实不是模模糊糊、飘飘忽忽的感觉，\n它就像一颗黑色的药丸，一旦痛快地凝缩起来，便会立即发挥效力。', metadata={'id': 'doc:notiondb:ab062d1e9e614654837ed93756778692', 'author': '【日】三岛由纪夫', 'name': '春雪 【日】三岛由纪夫', 'source': '读书笔记（文学）', 'tags': '小说', 'date_start': '20140304', 'date_end': None}),

In [24]:
# Numeric range filter on date_start (YYYYMMDD): strictly after 2020-12-31 and strictly before 2021-12-31.
# NOTE(review): `< 20211231` leaves out 2021-12-31 itself — if the whole of 2021 is meant,
# the upper bound should probably be `<= 20211231`; confirm.
# The query text is empty, so only the filter expresses the intent of this search.
f = (RedisNum("date_start") > 20201231) & (RedisNum("date_start") < 20211231)
vectorstore.similarity_search_with_score("", filter=f, k=2)

Metadata key date_end not found in metadata. Setting to None. 
Metadata fields defined for this instance: ['author', 'id', 'name', 'source', 'tags', 'date_start', 'date_end']
Metadata key date_end not found in metadata. Setting to None. 
Metadata fields defined for this instance: ['author', 'id', 'name', 'source', 'tags', 'date_start', 'date_end']


[(Document(page_content='他又想起阿切尔·斯隆，回忆起将近二十年前，那种渐渐强大到盖过那张喜欢冷嘲热讽的脸的慢性痛苦，以及驱散了那种严峻本身的慢慢腐蚀的绝望——他想，他现在明白了，说来微不足道，斯隆忧虑的某种徒劳感。\n\n\n格蕾丝满满胖起来。在那年冬天喝十三岁生日这段时间，她体重增加了五十磅，脸蛋满满鼓起来，而且很干燥，就像正在发酵的面团，四肢也渐渐柔软，动作变得缓慢、笨拙。她吃得比以前还少，但非常喜欢甜食，房间里总放着一盒糖果，好像体内的某种东西开始松弛、柔软和绝望了，好像体内某种没有形体的东西在搏斗着，忽然松懈了，现在说服她的肉体明确指定过那种阴暗和隐秘的生活。\n斯通纳心怀伤感地眼睁睁看着这种变化。这种伤感掩饰了他显现给世人的那张冷漠的脸。他不允许自己产生那种轻松、奢侈的内疚感。考虑到他的天性和伊迪斯生活的环境，他完全束手无策。这种想法强化了他的悲伤，这种悲伤是内疚都无法引发的，让他对女儿的爱更加彻底、更加深刻。\n斯通纳知道——而且很早就知道，他认为——女儿属于那种极其稀有而且永远那么漂亮可爱的人类中的一员，这种人的道德质地是那么娇柔，必须认真养护和关心，这样它才能称心如意。由于跟这个世界格格不入，它只好生存在一个不可能是自己家园的地方。渴望温柔和安静、不得已要生存的地方，也没有蛮力击退反对它的残暴势力，只有退缩到一个静谧之地，那里荒凉、狭小而柔静。', metadata={'id': 'doc:notiondb:b47804cf0d1f4214aaa03fb7e7fc4f9d', 'author': '【美】约翰·威廉斯', 'name': '斯通纳 【美】约翰·威廉斯', 'source': '读书笔记（文学）', 'tags': '小说', 'date_start': '20210828', 'date_end': None}),
  0.9503),
 (Document(page_content='retainers have don’t-bite-the-hand-that-feeds considerations, thus there’s indutry-wide “2 years rule” such that any ethical retainers won’t recruit away any people