# FAISS
- Meta 的開源向量數據庫

In [3]:
import os 
import logging 

from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

True

In [4]:
from langchain.document_loaders import PyPDFLoader
loaders = [
    PyPDFLoader("docs/01.pdf"),
    PyPDFLoader("docs/02.pdf"),
    PyPDFLoader("docs/03.pdf"),
    PyPDFLoader("docs/04.pdf"),
]

docs = []

for loader in loaders:
    docs.extend(loader.load())

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap=100,
    length_function = len,
    separators=["\n\n", '\n', ' ', '']
    )

chunks = text_splitter.split_documents(docs)



In [5]:
from langchain_openai import OpenAIEmbeddings
import os
embeddings = OpenAIEmbeddings(
    base_url = os.environ["EMBEDDINGS_BASE_URL"]
)

In [6]:
from langchain.llms import Ollama

chat = Ollama(model="openchat:latest")

In [7]:
from langchain.vectorstores import FAISS


In [8]:
vectordb = FAISS.from_documents(docs, embeddings)

In [9]:
query = "在香港有什麼美味的食物? 如果有請給出該店舖的地址"

docs = vectordb.similarity_search(query)
docs[0].page_content[:200]

'1110\n小店美食\n13  添好運點心專門店\n全球最便宜的米芝蓮星級餐廳之一，雖然在台\n灣已經開了多間分店，但要品嚐正港風味的點\n心，還是要到本地的街坊小店。菜單上的選擇\n或許不如茶樓多，但價格親民，招牌點心酥皮\n焗叉燒包，保證讓人一試愛上。\n地址：深水埗福榮街9至11號 \n電話：+852 2788 1226\n網址：www.timhowan.com.hk\n14  新香園\n來到這家老字號茶餐廳，必吃'

## RetrivalQA

In [10]:
from langchain.chains import RetrievalQA # 檢索+回答問題
retriever = vectordb.as_retriever()
model = RetrievalQA.from_chain_type(
    retriever = retriever,
    llm = chat,
    chain_type = "stuff", # 表示檢索到的文檔,直接讓llm去回答問題
    verbose= True
)

In [11]:
model

RetrievalQA(verbose=True, combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=Ollama(model='openchat:latest')), document_variable_name='context'), retriever=VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x128682dc0>))

In [12]:
query = "在香港有什麼美味的食物? 如果有請給出該店舖的地址"

response = model(
    {"query":query}
)
print("回答:", response['result'])

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
回答:  以下是一些在香港吃得到的美味食物：

1. 蘇記（Siu Kee）：逐漸式微的餐廳，主打經典在地美食如法式吐司、豬排煎蛋麵和港式奶茶等。
   地址：深水埗耀東街15至16號
   電話：+852 2779 1182

2. 文記車仔麵（Wun Kwong Cheung Fun）：傳統香港車仔麵，有三家店設在同一條街。提供各式麵食、湯頭和配料。
   地址：深水埗福榮街109、111-117E、121及123C號
   電話：+852 9059 5104

3. 飛鷹餐廳（Fung Shing Restaurant）：豪華的港式牛排館，營造出濃濃的復古情調。除了鐵板牛排外，羅宋湯、德國豬腳等也是不錯的選擇。
   地址：深水埗荔枝角道256號
   電話：+852 2395 2576

4. 綠林甜品（Green Line Desserts）：出品的甜點有包括紅豆、堅果和黑芝麻等熬煮的傳統甜湯，以及豆腐布丁、榴槤煎餅等創新甜點。
   地址：深水埗元州街77至79號
   電話：+852 2361 4205


## 如何保存與加載 FAISS

In [13]:
!rm -rf ./db

In [16]:
persist_directory = "./db"

vectordb.save_local(persist_directory)

new_db = FAISS.load_local(
    folder_path = persist_directory,
    embeddings = embeddings,
    allow_dangerous_deserialization = True
)

### 查看VectorDB的文黨


In [17]:
import pandas as pd 

In [41]:
def store_to_df(vectordb):
   v_dict = vectordb.docstore._dict
   data_rows =[]
   for k in v_dict.keys():
      doc_name = v_dict[k].metadata["source"].split("/")[-1]
      page_number = v_dict[k].metadata["page"] +1
      content = v_dict[k].page_content
      data_rows.append({"chunk_id": k, "doc_name":doc_name, "page_number":page_number, "content":content})
   vector_df = pd.DataFrame(data_rows)
   return vector_df 

def show_vectorStore(vectordb):
    vector_df = store_to_df(vectordb)
    display(vector_df)


In [42]:
show_vectorStore(new_db)

Unnamed: 0,chunk_id,doc_name,page_number,content
0,05a6ae78-1dbb-497f-986b-79d6af54a636,01.pdf,1,衝\n一\n波\n$$07&3JOEE$$07&3...
1,59f451a6-656d-435e-bfd4-f0e8a7d976f2,01.pdf,2,網路報導專區\n下載PDF手冊\n閱讀\n動態電子書\n23　中　環\t 蘭桂坊Bar Ho...
2,73cc4762-340f-4793-9359-431bbd90d75b,01.pdf,3,04\n地址：香港九龍太子砵蘭街418號地下 電話：＋852-2392-9283\n營業時間...
3,267b4fe3-fb82-4125-beb2-520cad6d7340,01.pdf,4,06\n地址：香港九龍旺角通菜街1A-1L 威達 Deli2商業大廈1字樓 B 舖 電話：＋...
4,27cd5652-bc39-4f63-b6d7-1ce5c96eb05f,01.pdf,5,08\n地址：香港銅鑼灣謝斐道477-481號肇明大廈地下及1樓 A 室 電話：＋852-2...
...,...,...,...,...
83,5495ed0c-13c0-4234-9b37-4b5a556a9a8a,04.pdf,25,4746\n添好運點心專門店 \np.11\n寶華扎作 \np.34\n汝州街 \n（珠仔街...
84,3da10a70-4bfe-46f2-8e5e-c246f318885b,04.pdf,26,4948\n 4948\nDoughnut \np.9\n福榮街 \np.31\n合益泰小...
85,14df7c3e-a29c-4779-a0ce-b3d6e1e47550,04.pdf,27,5150\n福榮街 \np.31\n公和荳品廠 \np.10\n1\n2\n3\n汝州街 \...
86,dc3708bd-475d-453c-93d9-f6b8f5469de7,04.pdf,28,香港旅遊發展局旅客服務\nHONG KONG TRAVEL BUDDY \n歡迎旅客掃描QR...


### 添加和刪除VectorDB中的文檔

In [44]:
# 刪除對應的PDF的文檔
def delete_document(store, document_name):
    vector_df = store_to_df(store)
    chunks_list = vector_df.loc[vector_df["doc_name"] == document_name]["chunk_id"].tolist()
    store.delete(ids=chunks_list)

# 更新VectorDB, 同時更新RetrievalQA
def refresh_model(new_store):
    retriever = new_store.as_retriever()
    model = RetrievalQA.from_chain_type(
    retriever = retriever,
    llm = chat,
    chain_type = "stuff", # 表示檢索到的文檔,直接讓llm去回答問題
    verbose= True
)
    return model 

In [51]:
# delete_document(new_db, '02.pdf')
# delete_document(new_db, '01.pdf')
delete_document(new_db, '03.pdf')
delete_document(new_db, '04.pdf')

In [52]:
show_vectorStore(new_db)

In [53]:
model = refresh_model(new_db)

query = "泰昌餅家的地址是哪裡? "

response = model(
    {"query":query}
)
print("回答:", response['result'])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
回答:  我不知道泰昌餅家的地址。


In [54]:
## 添加PDF文檔
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
def add_vectorStore(store, directory):
    loader = PyPDFLoader(directory)

    doc = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap=100,
        length_function = len,
        separators=["\n\n", '\n', ' ', '']
        )

    chunks = text_splitter.split_documents(doc)

    extension = FAISS.from_documents(chunks, embeddings)

    store.merge_from(extension)


In [55]:
add_vectorStore(new_db, "docs/05.pdf")

In [57]:
# show_vectorStore(new_db)

In [59]:
model = refresh_model(new_db)

query = "和昌飯店有什麼好吃的? 和昌飯店地址和電話是什麼? "

response = model(
    {"query":query}
)
print("回答:", response['result'])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
回答:  和昌飯店是一家著名的粵質餐廳，有許多美味菜品可選。它的特色菜單包括各種煎餘、燉湯、炒菜等，都充滿本地風味。和昌飯店地址在香港灣仔軒尼詩道48-62號上海實業大廈地下及地庫（港鐵灣仔站B1出口步行3分鐘），電話號碼是+852-2834-9963。
