In [None]:
# Install the third-party packages this notebook relies on: LlamaIndex, its file-reader,
# OpenAI-LLM and BM25-retriever integrations, plus PyMuPDF and rank-bm25.
# NOTE(review): versions are unpinned, so results may change as these packages are released.
%pip install rank-bm25 llama-index llama-index-readers-file pymupdf llama-index-llms-openai llama-index-retrievers-bm25

In [None]:
import nest_asyncio

# Patch asyncio so that event loops can be nested. Presumably required because the
# fusion retriever later in this notebook is built with use_async=True while the
# notebook kernel already runs its own event loop.
nest_asyncio.apply()

In [None]:
!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"

In [None]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
import os
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.retrievers.bm25 import BM25Retriever
# Read the PDF saved at ./data/llama2.pdf with the PyMuPDF-based reader.
loader = PyMuPDFReader()
documents = loader.load(file_path="./data/llama2.pdf")

In [None]:
# Echo the loaded documents as the cell output.
# NOTE(review): this displays the whole list; slicing (e.g. documents[:2]) would keep the output short.
documents

In [None]:
# Use the OpenAI API key already present in the environment; prompt for it only when
# it is missing. This avoids overwriting a configured key with a placeholder and keeps
# the secret out of the saved notebook.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Define the chunking strategy, then build the vector index from the loaded documents.
splitter = SentenceSplitter(chunk_size=256)
index = VectorStoreIndex.from_documents(
    documents,
    transformations=[splitter],
)

In [None]:
# Define the base model.
llm = OpenAI(model="gpt-3.5-turbo")

In [None]:
# Define the two retrievers: BM25 sparse search and semantic (vector) search.
bm25_retriever = BM25Retriever.from_defaults(
    docstore=index.docstore,
    similarity_top_k=2,
)
vector_retriever = index.as_retriever(similarity_top_k=2)

In [None]:
# Check the retrieval result of each retriever, using the same question for both.
question = 'Tell me what llama2 and gpt are, and provide key differences between them. provide it with bullet points for better readability'
bm_result = bm25_retriever.retrieve(question)
vector_result = vector_retriever.retrieve(question)

In [None]:
# BM25 retrieval results: print the score and text of every retrieved node.
for node in bm_result:
    print("Score: {:.2f} - {}...\n-----\n".format(node.score, node.text))

In [None]:
# Semantic retrieval results: print the score and text of every retrieved node.
for node in vector_result:
    print(
        "Score: {score:.2f} - {text}...\n-----\n".format(
            score=node.score,
            text=node.text,
        )
    )

In [None]:
# Define the query-fusion retriever used for hybrid search. It combines the vector and
# BM25 retrievers and merges their results in "reciprocal_rerank" mode.
from llama_index.core.retrievers import QueryFusionRetriever

retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    # Pass the model defined earlier explicitly; without this the `llm` object is never
    # used and query generation silently falls back to the library-wide default.
    llm=llm,
    similarity_top_k=2,
    num_queries=4,
    mode="reciprocal_rerank",
    # Async execution inside a notebook relies on the nest_asyncio patch applied at the top.
    use_async=True,
    verbose=True,
)

In [None]:
# Query generation, followed by hybrid retrieval for each generated query.
nodes_with_scores = retriever.retrieve('Tell me what llama2 and gpt are, and provide key differences between them. provide it with bullet points for better readability')

In [None]:
# Final hybrid-search retrieval results: print the score and text of every fused node.
for node in nodes_with_scores:
    print("Score: {0.score:.2f} - {0.text}...\n-----\n".format(node))

In [None]:
# Put a query engine (the augmented generator) on top of each retrieval module.
from llama_index.core.query_engine import RetrieverQueryEngine

# Pass the model defined earlier explicitly so that all three engines answer with the
# same, intended LLM instead of the library-wide default.
query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)  # hybrid query engine
query_engine_0 = RetrieverQueryEngine.from_args(bm25_retriever, llm=llm)  # BM25 query engine
query_engine_1 = RetrieverQueryEngine.from_args(vector_retriever, llm=llm)  # semantic query engine

In [None]:
# Ask the BM25-only and the semantic-only engines the same question.
question = 'Tell me what llama2 and gpt are, and provide key differences between them. provide it with bullet points for better readability.'
response_0 = query_engine_0.query(question)
response_1 = query_engine_1.query(question)

In [None]:
# Ask the hybrid engine exactly the same question as the BM25 and semantic engines
# (the original literal was missing the space after "them."), so the three answers
# are comparable.
response = query_engine.query('Tell me what llama2 and gpt are, and provide key differences between them. provide it with bullet points for better readability.')

In [None]:
# BM25 query result
from llama_index.core.response.notebook_utils import display_response
display_response(response_0)


In [None]:
# Semantic query result
display_response(response_1)

In [None]:
# Hybrid (rerank applied) query result
display_response(response)