In [3]:
from dotenv import find_dotenv, load_dotenv

# Locate a .env file and load its variables into the process environment.
# Presumably this supplies the OpenAI credentials used further down — confirm.
# The result is bound to `_` so the cell shows no output.
_ = load_dotenv(dotenv_path=find_dotenv())

这个检索器结合了语义相似性和时间衰减。

评分算法为:

`semantic_similarity + (1.0 - decay_rate) ^ hours_passed`

值得注意的是，`hours_passed`指的是自上次访问检索器中的对象以来经过的小时数，而不是自创建对象以来经过的小时数。这意味着频繁访问的对象保持“新鲜”。

In [4]:
from datetime import datetime, timedelta

import faiss
from langchain.docstore import InMemoryDocstore
from langchain.retrievers import TimeWeightedVectorStoreRetriever
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

## 低衰减率
较低的`decay_rate`(在本例中，我们极端地将其设置为接近0)意味着记忆会被“记住”更长时间。`decay_rate`为0意味着记忆永远不会被遗忘，此时该检索器等价于普通的向量检索。

In [5]:
# Embedding model used to vectorise both the stored documents and the queries
embeddings_model = OpenAIEmbeddings()
# Start from an empty FAISS store. The index is created for 1536-dimensional
# vectors (presumably the size produced by the default OpenAI embedding model —
# confirm if a different model is configured).
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(
    embedding_function=embeddings_model,
    index=index,
    docstore=InMemoryDocstore({}),
    index_to_docstore_id={},
)
# decay_rate is practically 0, so the recency term barely fades over time;
# k=1 limits each query to a single returned document.
retriever = TimeWeightedVectorStoreRetriever(
    vectorstore=vectorstore,
    decay_rate=0.0000000000000000000000001,
    k=1,
)

In [6]:
# Back-date the first document: mark it as last accessed exactly one day ago.
yesterday = datetime.now() - timedelta(days=1)
retriever.add_documents(
    [
        Document(
            page_content="hello world",
            metadata={"last_accessed_at": yesterday},
        )
    ]
)
# The second document carries no explicit timestamp metadata.
# Keeping this call as the last expression makes the cell display the returned ids.
retriever.add_documents(
    [
        Document(page_content="hello foo"),
    ]
)

['2dbd95ad-6aac-4d75-ab8d-d58bdb16c8af']

In [7]:
# "hello world" is returned: it is the closest semantic match, and with a decay
# rate near 0 its day-old access time costs it almost nothing in recency.
# `invoke` is the current retriever entry point; `get_relevant_documents` is
# deprecated in recent LangChain releases.
retriever.invoke("hello world")

[Document(page_content='hello world', metadata={'last_accessed_at': datetime.datetime(2024, 1, 23, 14, 33, 30, 188156), 'created_at': datetime.datetime(2024, 1, 23, 14, 32, 17, 200733), 'buffer_idx': 0})]

## 高衰减率
如果`decay_rate`很高(例如0.999这样带有多个9的值)，`recency score`(时效性得分)很快就会衰减到0！如果直接将其设为1，所有对象的`recency`都为0，检索器同样等价于普通的向量检索。

In [8]:
# Embedding model used to vectorise both the stored documents and the queries
embeddings_model = OpenAIEmbeddings()
# Fresh, empty FAISS store so this section does not see the documents added
# earlier. The index is created for 1536-dimensional vectors (presumably the
# size produced by the default OpenAI embedding model — confirm if changed).
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(
    embedding_function=embeddings_model,
    index=index,
    docstore=InMemoryDocstore({}),
    index_to_docstore_id={},
)
# decay_rate close to 1: the recency term (1 - decay_rate) ^ hours_passed
# collapses towards 0 after very little time; k=1 returns a single document.
retriever = TimeWeightedVectorStoreRetriever(
    vectorstore=vectorstore,
    decay_rate=0.999,
    k=1,
)

In [9]:
# Back-date the first document: mark it as last accessed exactly one day ago.
yesterday = datetime.now() - timedelta(days=1)
retriever.add_documents(
    [
        Document(
            page_content="hello world",
            metadata={"last_accessed_at": yesterday},
        )
    ]
)
# The second document carries no explicit timestamp metadata.
# Keeping this call as the last expression makes the cell display the returned ids.
retriever.add_documents(
    [
        Document(page_content="hello foo"),
    ]
)

['38800b55-4906-4bad-82ce-d33bafa5b039']

In [10]:
# "hello foo" is returned: with decay_rate=0.999 the day-old "hello world" has
# lost nearly all of its recency score, so the freshly added document outranks
# it despite being a weaker semantic match.
# `invoke` is the current retriever entry point; `get_relevant_documents` is
# deprecated in recent LangChain releases.
retriever.invoke("hello world")

[Document(page_content='hello foo', metadata={'last_accessed_at': datetime.datetime(2024, 1, 23, 14, 36, 42, 87641), 'created_at': datetime.datetime(2024, 1, 23, 14, 36, 22, 103654), 'buffer_idx': 1})]

## 虚拟时间
借助LangChain提供的一些工具函数，您可以模拟(mock)时间组件，即让检索器把指定的时间当作“当前时间”。

In [11]:
# NOTE(review): `import datetime` rebinds the name `datetime` from the class
# (imported earlier via `from datetime import datetime, timedelta`) to the
# module. After this cell has run, re-running an earlier cell that calls
# `datetime.now()` fails until that earlier import cell is executed again.
import datetime

# mock_now is used below as a context manager that overrides the current time.
from langchain.utils import mock_now

In [12]:
# While mock_now is active the retriever treats the mocked timestamp as the
# current time, so the returned document's last_accessed_at is set to it.
# `datetime` here is the module (bound by `import datetime`), hence
# `datetime.datetime(...)`. print() is needed because a bare expression inside
# a `with` block is not displayed as cell output.
# `invoke` replaces the deprecated `get_relevant_documents`.
with mock_now(datetime.datetime(2024, 2, 3, 10, 11)):
    print(retriever.invoke("hello world"))

[Document(page_content='hello world', metadata={'last_accessed_at': MockDateTime(2024, 2, 3, 10, 11), 'created_at': datetime.datetime(2024, 1, 23, 14, 36, 19, 615484), 'buffer_idx': 0})]
