## 向量數據庫
- 主要目的是比對相似度
- Chroma

In [1]:
import os 
import logging 

from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())
api_key = os.environ.get('OPENAI_API_KEY')
if api_key is None:
    raise ValueError("The OPENAI_API_KEY environment variable is not set.")

In [2]:
import os 
from langchain_openai import ChatOpenAI
chat = ChatOpenAI(
    openai_api_base=os.environ["CHATGPT_API_ENDPOINT"],
    openai_api_key=os.environ["OPENAI_API_KEY"])



In [3]:
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

loader = ReadTheDocsLoader("htmldocs")
docs = loader.load()
print(len(docs))

tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo") #分詞器
def token_count(text):
    tokens = tokenizer.encode(
        text, 
        disallowed_special = ()
    )
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 400,
    chunk_overlap = 20,
    length_function = token_count,
    separators= ["\n\n","\n", " ",""]
)
chunks = text_splitter.split_text(docs[0].page_content)

token_count(chunks[0]),token_count(chunks[1]),token_count(chunks[2]),token_count(chunks[3]),token_count(chunks[4]),

2


(383, 373, 345, 376, 105)

In [4]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(
    base_url = os.environ["EMBEDDINGS_BASE_URL"]
)

In [5]:
setence1 = "I like cats."
setence2 = "I like dogs."
setence3 = "The weather is ugly outside" 

embedding1 = embeddings.embed_query(setence1)
embedding2 = embeddings.embed_query(setence2)
embedding3 = embeddings.embed_query(setence3)

In [6]:
import numpy as np 
# 點積
print(np.dot(embedding1, embedding2))
print(np.dot(embedding1, embedding3))
print(np.dot(embedding2, embedding3))

0.9176366334801493
0.7469114508261729
0.7529033051533119


## Chroma

In [7]:
## method 1 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import ReadTheDocsLoader
import tiktoken


loader = ReadTheDocsLoader("htmldocs")
docs = loader.load()

tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo") #分詞器

def token_count(text):
    tokens = tokenizer.encode(
        text, 
        disallowed_special = ()
    )
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 400,
    chunk_overlap = 20,
    length_function = token_count,
    separators= ["\n\n","\n", " ",""]
)

chunks = text_splitter.split_text(docs[0].page_content)

In [8]:
from langchain.vectorstores import Chroma
persist_directory = "./db"
!rm -rf ./db

In [9]:
vectordb = Chroma.from_texts(
    texts = chunks,
    embedding = embeddings,
    persist_directory = persist_directory
)

In [10]:
## method 2

doc_chunks = text_splitter.create_documents(chunks)

vectordb = Chroma.from_documents(
    documents = doc_chunks,
    embedding = embeddings,
    persist_directory = persist_directory
)

In [11]:
vectordb._collection.count()

10

### 相似度搜索

In [1]:
from langchain.document_loaders import PyPDFLoader
loaders = [
    PyPDFLoader("docs/01.pdf"),
    PyPDFLoader("docs/02.pdf"),
    PyPDFLoader("docs/03.pdf"),
    PyPDFLoader("docs/04.pdf"),
]



In [2]:
docs = []

for loader in loaders:
    docs.extend(loader.load())

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap=100,
    length_function = len,
    separators=["\n\n", '\n', ' ', '']
    )

splits = text_splitter.split_documents(docs)
len(splits) # 109 方塊

109

In [5]:
from langchain_openai import OpenAIEmbeddings
import os
embeddings = OpenAIEmbeddings(
    base_url = os.environ["EMBEDDINGS_BASE_URL"]
)

In [6]:
from langchain.vectorstores import Chroma
persist_directory = "./db"
!rm -rf ./db

In [7]:
vectordb = Chroma.from_documents(
    documents = splits,
    embedding = embeddings,
    persist_directory = persist_directory
)

In [10]:
print(vectordb._collection.count())

109


In [23]:
question = "有什麼西式美食推薦?"

docs_ss = vectordb.similarity_search(question, k=3)
docs_mmr = vectordb.max_marginal_relevance_search(question, k=2, fetch_k=3) #不會有重複的embedding出現

print(len(docs_ss))
print(len(docs_mmr))
print(docs_ss[0].page_content[:20])
print(docs_mmr[0].page_content[:20])

3
2
1110
小店美食
13  添好運點心專
1110
小店美食
13  添好運點心專


In [27]:
question = "有什麼景色優美的景點可以推薦?"

docs_ss = vectordb.similarity_search(question, k=3, filter={"source": "docs/03.pdf"})

for d in docs_ss:
    print(d.metadata)


{'page': 8, 'source': 'docs/03.pdf'}
{'page': 11, 'source': 'docs/03.pdf'}
{'page': 9, 'source': 'docs/03.pdf'}
