# RAG Using Different LLM Endpoints in PCAI

## Importing the Libraries

In [1]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from langchain_nvidia_ai_endpoints.reranking import NVIDIARerank
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.text_splitter import CharacterTextSplitter
import weaviate

## Fetching the Secret Token for RAG Essentials

In [2]:
import weaviate, os
from weaviate.classes.init import Auth

#getting the auth token
secret_file_path = "/etc/secrets/ezua/.auth_token"

with open(secret_file_path, "r") as file:
    token = file.read().strip()

## Connecting to Weaviate

In [5]:
domain = ".cluster.local"
http_host = "weaviate.hpe-weaviate.svc.cluster.local"
grpc_host = "weaviate-grpc.hpe-weaviate.svc" + domain
weaviate_headers = {"x-auth-token": token}
#weaviate_headers = {"x-auth-token": "wrong token"}

client = weaviate.connect_to_custom(
    http_host=http_host,        # Hostname for the HTTP API connection
    http_port=80,              # Default is 80, WCD uses 443
    http_secure=False,           # Whether to use https (secure) for the HTTP API connection
    grpc_host=grpc_host,        # Hostname for the gRPC API connection
    grpc_port=50051,              # Default is 50051, WCD uses 443
    grpc_secure=False,           # Whether to use a secure channel for the gRPC API connection
    headers=weaviate_headers,
    skip_init_checks=False
)

print(client.is_ready())

            We encourage you to update your code to use the async client instead when running inside async def functions!
HTTP Request: GET http://weaviate.hpe-weaviate.svc.cluster.local/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
HTTP Request: GET http://weaviate.hpe-weaviate.svc.cluster.local/v1/meta "HTTP/1.1 200 OK"
HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
HTTP Request: GET http://weaviate.hpe-weaviate.svc.cluster.local/v1/.well-known/ready "HTTP/1.1 200 OK"


True


## Connecting to LLM through MLIS

In [3]:
_api_key = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NDgzNDIxNzEsImlhdCI6MTc0MzE1ODE3NCwiaXNzIjoiYWlvbGlAaHBlLmNvbSIsInN1YiI6IjAwN2ExM2Q3LWM3ZGYtNDkxNy1iODNiLTc5NjkyMzUxYzk4OCIsInVzZXIiOiJoYXJpcy1jcmltc29uY2xvdWQuaW4ifQ.PkuygOyMTg3vHwIUGMfn6ymca4eEBWCQbA9b4xPJ1O27egSP0LVAOPhIG_B4NxKs3wq3r-Xfl-ZVyAGh_ihTSvjjHbondTNTW1EJK5lMXRQ1cWYw4xCLakwHpCGR4-MBz6UgTTZx5mSHvd_X_ZR9wsPV4ab532q1j1JKnGSNz_oVvoDLfghmLsjg3fo7Qr1sd81fra39C0bMtUl55EpcfKkjwR092XO3q1JTjOu1ZQaar7yRGrI7TN63A534xfX6dpWbKx0goDOwP4lBpduzrUnbkxFQeJdIZFFKJDXvPCMgMl2n2rDgOmZ9cC_PYfWMh8Jvh-iZ4wronY6eTcuj9g" 

llm = ChatNVIDIA(
    base_url="https://nvidia-nim-model-predictor-haris-crimsoncl-3444a1bb.pcai1.genai1.hou",
    model="meta/llama3-8b-instruct",
    api_key=_api_key,
    temperature=0.5,
    max_tokens=1024,
    top_p=1.0,
)




## Data Extraction and Processing

In [14]:
# Replace with the path to your PDF
pdf_path = "docs/HPE.pdf"

# Load PDF file
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Split into manageable chunks
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(documents)

for doc in docs:
    doc.metadata={}

## Vector Store Initialization

In [15]:
from langchain_ollama import OllamaEmbeddings

vector = WeaviateVectorStore.from_documents(docs, embedding=OllamaEmbeddings(model = "nomic-embed-text:latest", base_url="https://ollama.pcai1.genai1.hou"), client=client, index_name="RAG", text_key="Rag".lower() + "_key")


HTTP Request: GET http://weaviate.hpe-weaviate.svc.cluster.local/v1/schema/RAG "HTTP/1.1 200 OK"
HTTP Request: GET http://weaviate.hpe-weaviate.svc.cluster.local/v1/schema/RAG "HTTP/1.1 200 OK"
HTTP Request: POST https://ollama.pcai1.genai1.hou/api/embed "HTTP/1.1 200 OK"
HTTP Request: GET http://weaviate.hpe-weaviate.svc.cluster.local/v1/schema "HTTP/1.1 200 OK"
HTTP Request: GET http://weaviate.hpe-weaviate.svc.cluster.local/v1/nodes "HTTP/1.1 200 OK"
HTTP Request: GET http://weaviate.hpe-weaviate.svc.cluster.local/v1/nodes "HTTP/1.1 200 OK"


## Retriever Initialization

In [7]:
retriever=vector.as_retriever()

## Reranking

In [None]:
compressor = NVIDIARerank(model="nvidia/llama-3.2-nv-rerankqa-1b-v1",
                          base_url="https://reranker-5c3f14b5-predictor-ezai-services.pcai1.genai1.hou",
                          api_key=token)

compressor.get_available_models()



In [None]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever
) 

## User Query

In [12]:
query = "What is HPE Proliant Compute DL384 Gen12"

## Output

In [13]:
chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
chain.invoke(query)

HTTP Request: POST https://ollama.pcai1.genai1.hou/api/embed "HTTP/1.1 200 OK"


{'query': 'What is HPE Proliant Compute DL384 Gen12',
 'result': 'According to the provided QuickSpecs document, the HPE ProLiant Compute DL384 Gen12 is a 2U standard 19” rack design, air-cooled server. It is a type of rack mount server manufactured by Hewlett Packard Enterprise (HPE).'}