In [None]:
import getpass, os, pymongo, pprint
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext, ServiceContext,load_index_from_storage
from llama_index.core.settings import Settings
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, ExactMatchFilter, FilterOperator
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from sentence_transformers import SentenceTransformer

In [None]:
import yaml
import pymongo
import json
import numpy as np

In [None]:
# Read the YAML configuration file and pull the MongoDB connection settings
# out of its "mongodb" section.
with open('../config.yaml', 'r') as stream:
    config = yaml.safe_load(stream)

mongo_settings = config['mongodb']
mongo_uri = mongo_settings['mongo_uri']
db_name = mongo_settings['db_name']
collection_name = mongo_settings['ucce_collection_name']

In [None]:
# Collect the path of every file in the data directory whose name ends in
# ".pdf". os.listdir returns entries in arbitrary order, so the paths are
# sorted to give later cells a stable, reproducible file order.
directory_path = '../data'
pdf_files = sorted(
    os.path.join(directory_path, f)
    for f in os.listdir(directory_path)
    if f.endswith('.pdf')
)


In [None]:
# Parse the collected PDF files into documents.
pdf_reader = SimpleDirectoryReader(input_files=pdf_files)
sample_data = pdf_reader.load_data()

In [None]:
# Display the first loaded document as a quick check of what the reader produced.
sample_data[0]

In [None]:
# Open a client connection to the Atlas cluster using the URI from the config file.
mongodb_client = pymongo.MongoClient(host=mongo_uri)

In [None]:
import os

# Set TOKENIZERS_PARALLELISM to "true" for this process
# (presumably read by the Hugging Face tokenizers backend; confirm).
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})

In [None]:
# Never store the API key in the notebook: reuse OPENAI_API_KEY when it is
# already exported and prompt for it otherwise. The template placeholder
# (with or without the literal quotes that the %env magic would have kept as
# part of the value) is treated as "not set".
if os.environ.get("OPENAI_API_KEY", "").strip('"\'') in ("", "open_ai_key"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Keep the API key out of the notebook: only prompt when OPENAI_API_KEY is
# missing or still holds the template placeholder (possibly wrapped in literal
# quotes), so a real key that is already exported is never overwritten.
if os.environ.get("OPENAI_API_KEY", "").strip('"\'') in ("", "open_ai_key"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Resolve the configured database first, then the target collection inside it.
database = mongodb_client[db_name]
collection = database[collection_name]

In [None]:
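# Optional, destructive reset: uncomment to delete every document in the
# collection before ingesting again.
# collection.delete_many({})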

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
# Register the global defaults: an OpenAI LLM with its default arguments and
# a Hugging Face embedding model loaded by name.
default_llm = OpenAI()
Settings.llm = default_llm

embedding_model = HuggingFaceEmbedding(model_name='nli-mpnet-base-v2')
Settings.embed_model = embedding_model

In [None]:
# Bind a MongoDB Atlas vector store to the configured database, collection and
# search index, then wrap it in a storage context for index construction.
atlas_store_options = {
    "db_name": db_name,
    "collection_name": collection_name,
    "index_name": "vector_llama_index",
}
atlas_vector_search = MongoDBAtlasVectorSearch(mongodb_client, **atlas_store_options)
vector_store_context = StorageContext.from_defaults(vector_store=atlas_vector_search)

In [None]:
# Build an index from the loaded documents through the Atlas-backed storage
# context, with progress reporting enabled.
vector_store_index = VectorStoreIndex.from_documents(
    documents=sample_data,
    storage_context=vector_store_context,
    show_progress=True,
)

In [None]:
# Build a second index from the same documents without passing a storage
# context; the retrievers and the basic query engine below are created from it.
index = VectorStoreIndex.from_documents(documents=sample_data)

In [None]:
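# Uncomment to install the BM25 retriever package if it is missing
# (%pip installs into the environment of the running kernel).
# %pip install llama-index-retrievers-bm25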

### Hybrid Fusion Retriever using Relative Score Fusion

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever

# Vector retriever over the index, returning the 5 most similar nodes.
vector_retriever = index.as_retriever(similarity_top_k=5)

# BM25 retriever built from the same index's docstore, returning 10 nodes.
bm25_docstore = index.docstore
bm25_retriever = BM25Retriever.from_defaults(docstore=bm25_docstore, similarity_top_k=10)

In [None]:
# Patch asyncio so that event loops can be nested. Presumably required because
# the notebook kernel already runs a loop while the fusion retrievers below
# are created with use_async=True.
import nest_asyncio

nest_asyncio.apply()

In [None]:
from llama_index.core.retrievers import QueryFusionRetriever

# Fuse the vector and BM25 retrievers with equal weights in "relative_score"
# mode, then run a sample retrieval and preview each hit.
fused_retrievers = [vector_retriever, bm25_retriever]
fusion_options = dict(
    retriever_weights=[0.5, 0.5],
    similarity_top_k=10,
    num_queries=1,  # 1 disables query generation
    mode="relative_score",
    use_async=True,
    verbose=True,
)
retriever = QueryFusionRetriever(fused_retrievers, **fusion_options)

nodes_with_scores = retriever.retrieve("query")

# Print the fused score and the first 100 characters of every retrieved node.
for node in nodes_with_scores:
    print(f"Score: {node.score:.2f} - {node.text[:100]}...\n-----")

In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Wrap the current fusion retriever in a query engine.
query_engine = RetrieverQueryEngine.from_args(retriever=retriever)

In [None]:
# Send the sample question through the query engine.
query = "query"
response = query_engine.query(query)

In [None]:
from llama_index.core.response.notebook_utils import display_response

# Render the response in the notebook.
display_response(response=response)

### Hybrid Fusion Retriever using Distribution Based Score Fusion

In [None]:
from llama_index.core.retrievers import QueryFusionRetriever

# Same two retrievers and equal weights as before, but fused in
# "dist_based_score" mode; run a sample retrieval and preview each hit.
fused_retrievers = [vector_retriever, bm25_retriever]
fusion_options = dict(
    retriever_weights=[0.5, 0.5],
    similarity_top_k=10,
    num_queries=1,  # 1 disables query generation
    mode="dist_based_score",
    use_async=True,
    verbose=True,
)
retriever = QueryFusionRetriever(fused_retrievers, **fusion_options)

nodes_with_scores = retriever.retrieve("query")

# Print the fused score and the first 100 characters of every retrieved node.
for node in nodes_with_scores:
    print(f"Score: {node.score:.2f} - {node.text[:100]}...\n-----")

In [None]:
# Rebuild the query engine around the current `retriever` (the
# dist_based_score one in this section). The existing `query_engine` was
# created from the earlier relative_score retriever, so reusing it would
# answer with the wrong fusion mode.
query_engine = RetrieverQueryEngine.from_args(retriever)
response = query_engine.query("query")

In [None]:
from llama_index.core.response.notebook_utils import display_response

# Render the response in the notebook.
display_response(response=response)

### Basic RAG

In [None]:
import pprint
from llama_index.core.response.notebook_utils import display_response

# Basic RAG: query the index directly with the 3 most similar nodes, render
# the answer, then pretty-print the source nodes it was built from.
query = "query"
query_engine = index.as_query_engine(similarity_top_k=3)
response = query_engine.query(query)

display_response(response)
source_nodes = response.source_nodes
pprint.pprint(source_nodes)