## `BM25Retriever`와 `FAISS` 검색기를 결합
- BM25: 키워드 유사도
- FAISS: 의미기반 유사도

In [2]:
!pip install -qU langchain_openai langchain-community rank_bm25 faiss-cpu

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m412.2/412.2 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.8/50.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

In [4]:
# Toy corpus indexed by both retrievers below. The sentences deliberately mix
# "apple" the fruit with "Apple" the company and its products, so keyword
# matching and embedding similarity can disagree on the best hit.
doc_list = [
    "I like apples",
    "I like apple company",
    "I like apple's iphone",
    "Apple is my favorite company",
    "I like apple's ipad",
    "I like apple's macbook",
]

In [8]:
# --- Keyword-based retriever (BM25) ---------------------------------------
bm25_retriever = BM25Retriever.from_texts(doc_list)
bm25_retriever.k = 1  # return only the single best BM25 hit

# --- Embedding-based retriever (FAISS over OpenAI embeddings) --------------
# NOTE(review): OpenAIEmbeddings() presumably needs an OpenAI API key in the
# environment; confirm it is set before running this cell.
embedding = OpenAIEmbeddings()
faiss_vectorstore = FAISS.from_texts(doc_list, embedding)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 1})

# --- Ensemble ---------------------------------------------------------------
# Weights are given in the same order as `retrievers`:
# 0.3 for the BM25 retriever, 0.7 for the FAISS retriever.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.3, 0.7],
)

In [9]:
# Run the same query through the ensemble and through each retriever on its
# own, so the individual rankings can be compared with the combined one.
query = "my favorite fruit is apple"
ensemble_result = ensemble_retriever.invoke(query)
bm25_result = bm25_retriever.invoke(query)
faiss_result = faiss_retriever.invoke(query)

# One labelled section per retriever: BM25 is the keyword-based retriever,
# FAISS the semantic (embedding-based) one.
for section_title, section_docs in (
    ("[Ensemble Retriever]", ensemble_result),
    ("[BM25 Retriever]", bm25_result),
    ("[FAISS Retriever]", faiss_result),
):
    print(section_title)
    for doc in section_docs:
        print(f"Content: {doc.page_content}")
        print()

[Ensemble Retriever]
Content: I like apples

Content: Apple is my favorite company

[BM25 Retriever]
Content: Apple is my favorite company

[FAISS Retriever]
Content: I like apples



In [10]:
# Second comparison: a query about the company and its product rather than
# the fruit, run through the ensemble and through each retriever separately.
query = "Apple company makes my favorite iphone"
ensemble_result = ensemble_retriever.invoke(query)
bm25_result = bm25_retriever.invoke(query)
faiss_result = faiss_retriever.invoke(query)

# Print one labelled section per retriever, each document followed by a
# blank line.
for section_title, section_docs in (
    ("[Ensemble Retriever]", ensemble_result),
    ("[BM25 Retriever]", bm25_result),
    ("[FAISS Retriever]", faiss_result),
):
    print(section_title)
    for doc in section_docs:
        print(f"Content: {doc.page_content}")
        print()

[Ensemble Retriever]
Content: I like apple's iphone

Content: Apple is my favorite company

[BM25 Retriever]
Content: Apple is my favorite company

[FAISS Retriever]
Content: I like apple's iphone



In [11]:
from langchain_core.runnables import ConfigurableField


# Describe `weights` as a field that can be overridden per call via
# config["configurable"]["ensemble_weights"].
weights_field = ConfigurableField(
    id="ensemble_weights",  # key used inside the "configurable" config dict
    name="Ensemble Weights",  # display name of the field
    description="Ensemble Weights",  # description of the field
)

# Rebuild the ensemble over the same two retrievers (BM25 first, FAISS
# second), this time without fixed weights, and expose `weights` as a
# runtime-configurable field.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
).configurable_fields(weights=weights_field)

In [12]:
# Per-call override of the ensemble weights, keyed by the configurable field
# id "ensemble_weights". Values follow the order of the retrievers list
# (BM25 first, FAISS second).
config = {
    "configurable": {
        "ensemble_weights": [0.3, 0.7],
    },
}

# Pass the config to invoke() so this search runs with the weights above.
docs = ensemble_retriever.invoke("my favorite fruit is apple", config=config)
docs

[Document(id='ed9d6edd-62dd-49fc-b180-cfb3c14515c3', metadata={}, page_content='I like apples'),
 Document(metadata={}, page_content='Apple is my favorite company')]

In [13]:
# Per-call override of the ensemble weights, keyed by the configurable field
# id "ensemble_weights".
# NOTE(review): this cell uses the same weights and query as the previous
# cell, so it returns the same documents; presumably a different weighting
# (e.g. favouring one retriever) was meant to be demonstrated. Confirm.
config = {
    "configurable": {
        "ensemble_weights": [0.3, 0.7],
    },
}

# Pass the config to invoke() so this search runs with the weights above.
docs = ensemble_retriever.invoke("my favorite fruit is apple", config=config)
docs

[Document(id='ed9d6edd-62dd-49fc-b180-cfb3c14515c3', metadata={}, page_content='I like apples'),
 Document(metadata={}, page_content='Apple is my favorite company')]