# faiss
`pip install -U langchain-community faiss-cpu langchain-openai tiktoken`

In [2]:
from dotenv import load_dotenv, find_dotenv
from langchain.globals import set_debug

load_dotenv(find_dotenv())
set_debug(False)

In [13]:

import math
import time
import faiss
import numpy as np


## 增加数据

In [14]:
d = 768  # 向量维数
 
data = [[i] * d for i in range(2000)]
data = np.array(data).astype('float32')  # 注意，只能插入float32类型的向量 
ids = np.arange(0, 2000)
data_length=len(ids)   # 自定义向量的I


In [16]:
ids

array([   0,    1,    2, ..., 1997, 1998, 1999])

In [15]:

nlist = int(4 * math.sqrt(data_length))  # 聚类中心的个数
time1 = time.time()
nlist

178

In [17]:
quantizer = faiss.IndexFlatL2(d)  # 内部的索引方式依然不变
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)  # 倒排索引
index.train(data)  # 注意，倒排索引一定要进行train
index.add_with_ids(data,ids)
print(index.is_trained)
time2 = time.time()
print(f'构建索引插入数据的时间为{time2 - time1}')



True
构建索引插入数据的时间为113.68068528175354




In [18]:

query_vector = np.array([[1] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)  # 1代表返回的结果数
print(f'全1向量的最近的向量id为{ind}')
print(dis)



全1向量的最近的向量id为[[1]]
[[0.]]


In [19]:

add_data = np.array([[1000] * 768]).astype('float32')
add_id = np.array([3000])
index.add_with_ids(add_data, add_id)
print(f'\n注意插入数据后的样本总数为{index.ntotal}')




注意插入数据后的样本总数为2001


In [20]:
query_vector = np.array([[1000] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'新插入的全1000向量的最近的向量id为{ind}')
print(dis)
 
query_vector = np.array([[1] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'\n全1向量的最近的向量id为{ind}')
print(dis)


新插入的全1000向量的最近的向量id为[[1000]]
[[0.]]

全1向量的最近的向量id为[[1]]
[[0.]]


## 删除

In [21]:
d = 768  # 维数
 
data = [[i] * d for i in range(500)]
data = np.array(data).astype('float32')
ids = np.arange(2000, 2500)
 
 
time1 = time.time()
 
index = faiss.index_factory(d, "IDMap,Flat", faiss.METRIC_L2)  # 不使用倒排索引，暴力搜索，此时不需要进行train
index.add_with_ids(data,ids)
print(index.is_trained)
time2 = time.time()
print(f'构建索引插入数据的时间为{time2 - time1}')


True
构建索引插入数据的时间为0.0008194446563720703


In [22]:
 
query_vector = np.array([[2] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'\n初始状态下，全2向量的最近的向量id为{ind}')
print(dis)
 




初始状态下，全2向量的最近的向量id为[[2002]]
[[0.]]


In [23]:
query_vector = np.array([[1] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'\n初始状态下，全1向量的最近的向量id为{ind}')
print(dis)
 
query_vector = np.array([[0] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'\n初始状态下，全0向量的最近的向量id为{ind}')
print(dis)
 



初始状态下，全1向量的最近的向量id为[[2001]]
[[0.]]

初始状态下，全0向量的最近的向量id为[[2000]]
[[0.]]


In [24]:
print('\n下面删除id为2000的数据,也就是全0的那条vector')
index.remove_ids(np.array([2000,2001]))
 
query_vector = np.array([[0] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'\n删除操作后，全0向量的最近的向量id为{ind}')
print(dis)
 
 
query_vector = np.array([[1] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'\n进行删除操作后，全1向量的最近的向量id为{ind}')
print(dis)
 
query_vector = np.array([[2] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'\n初始状态下，全2向量的最近的向量id为{ind}')
print(dis)


下面删除id为2000的数据,也就是全0的那条vector

删除操作后，全0向量的最近的向量id为[[2002]]
[[3072.]]

进行删除操作后，全1向量的最近的向量id为[[2002]]
[[768.]]

初始状态下，全2向量的最近的向量id为[[2002]]
[[0.]]


## 更新

In [25]:

  
 
d = 768  # 维数
 
data = [[i] * d for i in range(500)]
data = np.array(data).astype('float32')
# centers = int(8 * math.sqrt(len(data)))
ids = np.arange(2000, 2500)
 
 
 
nlist = 100
m = 8
 
time1 = time.time()
 
quantizer = faiss.IndexFlatL2(d)  # 内部的索引方式依然不变
# index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)  # PQ方式，每个向量都被编码为8个字节大小
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)  # 这个索引支持add_with_ids
index.train(data)
index.add_with_ids(data,ids)
print(index.is_trained)
time2 = time.time()
print(f'构建索引插入数据的时间为{time2 - time1}')
 

 


True
构建索引插入数据的时间为0.012265682220458984




In [26]:
# index.nprobe = 10  # 选择n个维诺空间进行索引,
 
query_vector = np.array([[1] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'初始状态全1向量的最近的向量id为{ind}')
print(dis)
 
query_vector = np.array([[1000] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'\n初始状态全1000向量的最近的向量id为{ind}')
print(dis)
 
 
query_vector = np.array([[0] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'\n删除全0向量之前全0向量最近的index为{ind}')
print(dis)
 

 

初始状态全1向量的最近的向量id为[[2001]]
[[0.]]

初始状态全1000向量的最近的向量id为[[2499]]
[[1.9276854e+08]]

删除全0向量之前全0向量最近的index为[[2000]]
[[0.]]


In [27]:
index.remove_ids(np.array([2000]))
print('\n注意删除的向量id为2000，将全0向量进行删除')
print(f'样本的总数为{index.ntotal}')
 
 
query_vector = np.array([[0] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'\n删除全0向量之后全0向量最近的index为{ind}')
print(dis)
 
add_data = np.array([[1000] * 768]).astype('float32')
add_id = np.array([2000])
index.add_with_ids(add_data, add_id)
print(f'\n注意，此时将index为2000的数据进行了更新，更新的数据为全1000，插入数据后的样本总数为{index.ntotal}')
 
 
query_vector = np.array([[1000] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'\n更新后的全1000向量的最近的向量id为{ind}')
print(dis)
 
query_vector = np.array([[1] * 768]).astype('float32')
dis, ind = index.search(query_vector, 1)
print(f'\n全1向量的最近的向量id为{ind}')
print(dis)


注意删除的向量id为2000，将全0向量进行删除
样本的总数为499

删除全0向量之后全0向量最近的index为[[2001]]
[[768.]]

注意，此时将index为2000的数据进行了更新，更新的数据为全1000，插入数据后的样本总数为500

更新后的全1000向量的最近的向量id为[[2000]]
[[0.]]

全1向量的最近的向量id为[[2001]]
[[0.]]


## FAISS 在lanchain中的使用

In [3]:
# Uncomment the following line if you need to initialize FAISS with no AVX2 optimization
# os.environ['FAISS_NO_AVX2'] = '1'

from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

loader = TextLoader("../data/state_of_the_union.txt")
documents = loader.load()
documents

[Document(metadata={'source': '../data/state_of_the_union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their 

In [4]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
docs

[Document(metadata={'source': '../data/state_of_the_union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their 

In [5]:
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(docs, embeddings)
print(db.index.ntotal)

42


In [7]:
db.index_to_docstore_id

{0: 'b90770f5-489d-46e9-8fbd-47dbef45d441',
 1: '088c862b-7f8f-46c1-b848-94c56801843c',
 2: '82cbb346-23e0-4642-86e3-137f03121d7e',
 3: '1a2ddc1d-5cbe-40c4-b10f-36545b3091a6',
 4: 'f66f294d-ef79-4d53-9ccd-dff1fd236409',
 5: '928b7d56-b1d2-47d0-9c34-775dc9d519c8',
 6: '1b355446-9360-404b-8bca-608da2b8c2d7',
 7: 'ba13b7f3-1e39-42d7-aa2f-72c7f4afe951',
 8: '999d79a8-9df3-426f-8fae-a4063b3e1e30',
 9: 'c48be4f6-35e3-4fdd-83ba-21a85955f2b2',
 10: '886ea901-8802-4c53-ae67-5d2a58226abe',
 11: '1d7a8afd-de66-44bf-8a90-7f3242b5a4c4',
 12: '2f0c5b3e-e956-4075-a99d-a3ab2b3c2356',
 13: '4a71da86-e950-4c06-9d2f-6a877a3e8c98',
 14: '1927af97-8500-49a4-ad2d-22509dc3859b',
 15: '8345a629-222c-4c31-ada6-3802d292cae5',
 16: '80bdc318-fc71-42ae-b8b6-c343904c1b01',
 17: '48c15ef3-8e77-4972-8975-2d60da19c9e9',
 18: 'b4886a04-ab30-4955-91bd-c11ba7d7a9c0',
 19: '7fdf301b-dee8-41a8-87d4-d14709e40e2c',
 20: '138505ba-b8ba-4898-a15b-a061b223d746',
 21: 'a1543b90-c1a7-4502-985e-c42b2bc28efd',
 22: '6ec9e854-6476-

In [8]:
# Querying
query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query)

In [9]:
docs

[Document(metadata={'source': '../data/state_of_the_union.txt'}, page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.'),
 Document(metadata={'source': '../data/state_of_the_union.txt'}, page_content='A former top litigator in private practice. A

In [10]:
# 我们还可以将vectorstore转换为retriver类。这使得我们可以很容易地在其他LangChain方法中使用它，这些方法主要与检索器一起工作
retriever = db.as_retriever()
retriever.invoke(query)

[Document(metadata={'source': '../data/state_of_the_union.txt'}, page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.'),
 Document(metadata={'source': '../data/state_of_the_union.txt'}, page_content='A former top litigator in private practice. A

In [11]:
# Similarity Search with score
# 一些 FAISS 特定的方法。其中之一是similarity_search_with_score，它不仅允许您返回文档，还允许您返回查询到它们的距离分数。
# 返回的距离分数是 L2 距离。因此，分数越低越好。

docs_and_scores = db.similarity_search_with_score(query)
docs_and_scores[0]

(Document(metadata={'source': '../data/state_of_the_union.txt'}, page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.'),
 0.36930057)

您还可以搜索与给定嵌入向量相似的文档，并使用嵌入向量作为参数，而不是带有 `similacy_search_by_vector` 的字符串。

In [12]:
embedding_vector = embeddings.embed_query(query)
docs_and_scores = db.similarity_search_by_vector(embedding_vector)

In [13]:
docs_and_scores

[Document(metadata={'source': '../data/state_of_the_union.txt'}, page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.'),
 Document(metadata={'source': '../data/state_of_the_union.txt'}, page_content='A former top litigator in private practice. A

In [17]:
## Saving and loading
# 您还可以保存和加载FAISS索引。这很有用，所以您不必每次使用它都重新创建它。
db.save_local("/slurm/home/admin/nlp/DL/tests/faiss/faiss_index")


In [19]:
new_db = FAISS.load_local("/slurm/home/admin/nlp/DL/tests/faiss/faiss_index", embeddings, allow_dangerous_deserialization=True)
docs = new_db.similarity_search(query)
docs[0]

Document(metadata={'source': '../data/state_of_the_union.txt'}, page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.')

In [27]:
db.docstore.search("b90770f5-489d-46e9-8fbd-47dbef45d441")

Document(metadata={'source': '../data/state_of_the_union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their d

In [29]:
db.docstore._dict

{'b90770f5-489d-46e9-8fbd-47dbef45d441': Document(metadata={'source': '../data/state_of_the_union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, t

In [None]:
## Serializing and De-Serializing to bytes
# 您可以通过这些功能定制faiss索引。如果使用90 MB（句子转换器/All-Minilm-L6-V2或任何其他模型）的嵌入式模型，
# 则最终的尺寸将超过90 MB。该模型的大小也包括在整体尺寸中。为了克服这一点，请使用以下功能。
# 这些功能仅序列化faiss索引，大小将要少得多。
# 如果您希望将索引存储在SQL（例如SQL）中，这可能会有所帮助。

from langchain_huggingface import HuggingFaceEmbeddings

pkl = db.serialize_to_bytes()  # serializes the faiss
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

db = FAISS.deserialize_from_bytes(
    embeddings=embeddings, serialized=pkl
)  # Load the index

In [28]:
## Merging
db1 = FAISS.from_texts(["foo"], embeddings)
db2 = FAISS.from_texts(["bar"], embeddings)

db1.docstore._dict

{'5f239446-bd4a-44ae-93d5-475b8f94d667': Document(page_content='foo')}

In [30]:
db2.docstore._dict

{'744fbb4c-59cc-400e-8e8e-76b3cabc6050': Document(page_content='bar')}

In [31]:
db1.merge_from(db2)

In [32]:
db1.docstore._dict

{'5f239446-bd4a-44ae-93d5-475b8f94d667': Document(page_content='foo'),
 '744fbb4c-59cc-400e-8e8e-76b3cabc6050': Document(page_content='bar')}

In [33]:
# FAISS vectorstore也支持过滤，因为FAISS本身不支持过滤，所以我们必须手动进行过滤。这是通过首先获取多于k的结果，
# 然后对它们进行过滤来实现的。这个过滤器要么是一个可调用对象，它接受一个元数据字典作为输入并返回一个bool值，要么是一个元数据字典，
# 其中每个缺失的键都被忽略，每个存在的k必须在一个值列表中。
# 您还可以在调用任何搜索方法时设置fetch k参数，以设置在过滤之前要获取多少文档。这里有一个小例子

from langchain_core.documents import Document

list_of_documents = [
    Document(page_content="foo", metadata=dict(page=1)),
    Document(page_content="bar", metadata=dict(page=1)),
    Document(page_content="foo", metadata=dict(page=2)),
    Document(page_content="barbar", metadata=dict(page=2)),
    Document(page_content="foo", metadata=dict(page=3)),
    Document(page_content="bar burr", metadata=dict(page=3)),
    Document(page_content="foo", metadata=dict(page=4)),
    Document(page_content="bar bruh", metadata=dict(page=4)),
]
db = FAISS.from_documents(list_of_documents, embeddings)
results_with_scores = db.similarity_search_with_score("foo")
for doc, score in results_with_scores:
    print(f"Content: {doc.page_content}, Metadata: {doc.metadata}, Score: {score}")

Content: foo, Metadata: {'page': 1}, Score: 0.0
Content: foo, Metadata: {'page': 2}, Score: 0.0
Content: foo, Metadata: {'page': 3}, Score: 0.0
Content: foo, Metadata: {'page': 4}, Score: 0.0


In [34]:
results_with_scores = db.similarity_search_with_score("foo", filter=dict(page=1))
# Or with a callable:
# results_with_scores = db.similarity_search_with_score("foo", filter=lambda d: d["page"] == 1)
for doc, score in results_with_scores:
    print(f"Content: {doc.page_content}, Metadata: {doc.metadata}, Score: {score}")

Content: foo, Metadata: {'page': 1}, Score: 0.0
Content: bar, Metadata: {'page': 1}, Score: 0.31471875309944153


In [35]:
results = db.max_marginal_relevance_search("foo", filter=dict(page=1))
for doc in results:
    print(f"Content: {doc.page_content}, Metadata: {doc.metadata}")

Content: foo, Metadata: {'page': 1}
Content: bar, Metadata: {'page': 1}


In [36]:
results = db.similarity_search("foo", filter=dict(page=1), k=1, fetch_k=4)
for doc in results:
    print(f"Content: {doc.page_content}, Metadata: {doc.metadata}")

Content: foo, Metadata: {'page': 1}


In [37]:
## 您还可以从 vectorstore 中删除记录。下面的示例中db.index_to_docstore_id表示包含 FAISS 索引元素的字典。
print("count before:", db.index.ntotal)
db.delete([db.index_to_docstore_id[0]])
print("count after:", db.index.ntotal)

count before: 8
count after: 7
