In [1]:
import os
from langchain.document_loaders import PyPDFLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OCIGenAIEmbeddings
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, END, MessagesState, StateGraph
from langchain.schema import AIMessage, HumanMessage, SystemMessage

In [2]:
from dotenv import load_dotenv
load_dotenv()

# Step 1: Load the PDF document (CSVLoader is imported but not used by this function)
def load_documents(pdf_path="demo.pdf"):
    """Load a PDF file through ``PyPDFLoader``.

    Args:
        pdf_path: Path of the PDF to read. Defaults to ``"demo.pdf"``, the
            value that used to be hard-coded, so ``load_documents()`` keeps
            working unchanged.

    Returns:
        Whatever ``PyPDFLoader(pdf_path).load()`` returns.
    """
    return PyPDFLoader(pdf_path).load()
 
# Step 2: Split the documents into smaller chunks for better processing
def split_documents(documents, chunk_size=1000, chunk_overlap=100):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split; passed straight to the splitter's
            ``split_documents`` method.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 1000,
            the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        The result of the splitter's ``split_documents(documents)`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# Load and process documents
# Both calls run when the cell executes: load_documents() reads the PDF and
# split_documents() chunks the result.
# NOTE(review): in the visible cells, `chunks` is only referenced by the
# commented-out FAISS code — confirm whether this step is still needed.
documents = load_documents()
chunks = split_documents(documents)

In [2]:
import os
import oracledb
from typing import List, Dict
from langchain_core.messages import BaseMessage, AIMessage
from langchain.prompts import ChatPromptTemplate
from langchain_community.vectorstores import OracleVS
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import  MessagesPlaceholder
from langchain.chains import (
    create_history_aware_retriever,
    create_retrieval_chain
)
from langchain_community.chat_models import ChatOCIGenAI
from langchain_core.messages import HumanMessage
from langchain_community.embeddings import OCIGenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()

True

In [3]:

# embeddings = OCIGenAIEmbeddings(
#     model_id=os.getenv("CON_GEN_AI_EMB_MODEL_ID"),
#     service_endpoint=os.getenv("CON_GEN_AI_SERVICE_ENDPOINT"),
#     compartment_id=os.getenv("CON_GEN_AI_COMPARTMENT_ID"),
#     auth_type="API_KEY",
#     model_kwargs={"input_type": "SEARCH_DOCUMENT"}
# )

# db = FAISS.from_documents(chunks, embeddings)
# retriever = db.as_retriever(
#     search_type="mmr",
#     search_kwargs={'k': 50, 'fetch_k': 150}
# )

# Path prefix and profile name used to locate the OCI config file.
default_path = ""
CONFIG_PROFILE = "DEFAULT"

# Chat model client; model, endpoint, compartment and auth type all come from
# environment variables.
llm = ChatOCIGenAI(
    model_id=os.getenv("CON_GEN_AI_CHAT_MODEL_ID"),
    service_endpoint=os.getenv("CON_GEN_AI_SERVICE_ENDPOINT"),
    compartment_id=os.getenv("CON_GEN_AI_COMPARTMENT_ID"),
    provider="meta",
    is_stream=True,
    auth_type=os.getenv("CON_GEN_AI_AUTH_TYPE"),
    auth_file_location=f"{default_path}oci/config",
    auth_profile=CONFIG_PROFILE,
    # Generation settings forwarded to the model.
    model_kwargs=dict(
        max_tokens=1024,
        temperature=0.6,
        top_p=0.7,
        top_k=20,
        frequency_penalty=0,
    ),
)



# Embedding model client; it reads the same endpoint, compartment, auth type,
# config file and profile settings as the chat model defined above.
embeddings = OCIGenAIEmbeddings(
    model_id=os.getenv("CON_GEN_AI_EMB_MODEL_ID"),
    service_endpoint=os.getenv("CON_GEN_AI_SERVICE_ENDPOINT"),
    compartment_id=os.getenv("CON_GEN_AI_COMPARTMENT_ID"),
    auth_type=os.getenv("CON_GEN_AI_AUTH_TYPE"),
    auth_profile=CONFIG_PROFILE,
    auth_file_location=f"{default_path}oci/config",
    truncate="NONE",
)

# Open the Oracle database connection that is handed to the vector store below.
# User, password, DSN and wallet password are read from environment variables;
# config_dir and wallet_location both point at the "oci" directory under
# default_path.
# NOTE(review): default_path is re-assigned here to the same "" value it is
# given earlier in this cell, so the assignment is redundant.
# NOTE(review): nothing in the visible notebook closes this connection.
default_path = ""
connection = oracledb.connect(
    user=os.getenv('CON_ADB_DEV_USER_NAME'), 
    password=os.getenv('CON_ADB_DEV_PASSWORD'), 
    dsn=os.getenv('CON_ADB_DEV_SERVICE_NAME'),
    config_dir=default_path+"oci",
    wallet_location=default_path+"oci",
    wallet_password=os.getenv('DB_WALLET_PASSWORD')
    )

# Build a vector store over the MY_DOCS table using the connection and
# embedding client created above, then expose it as `retriever`, the global
# that the graph's retrieval node calls.
# NOTE(review): nothing in the visible notebook inserts documents into
# MY_DOCS — presumably the table is populated elsewhere; confirm.
table_name = "MY_DOCS"
vector_store = OracleVS(connection, embeddings, table_name)
retriever = vector_store.as_retriever(
    # search_kwargs={ 'k': 100} <- parameter to define the number of documents to retrieve
)

In [4]:
# chat = ChatOCIGenAI(
#             model_id=os.getenv("CON_GEN_AI_CHAT_MODEL_ID"),
#             service_endpoint=os.getenv("CON_GEN_AI_SERVICE_ENDPOINT"),
#             compartment_id=os.getenv("CON_GEN_AI_COMPARTMENT_ID"),
#             provider="meta",
#             is_stream=True,
#             auth_type=os.getenv("CON_GEN_AI_AUTH_TYPE"),
#             model_kwargs={
#                 "max_tokens": 1024,
#                 "temperature": 0.6,
#                 "top_p": 0.7,
#                 "top_k": 20,
#                 "frequency_penalty": 0,
#             },
#         )

# Graph builder whose state schema is MessagesState; the nodes and edges are
# registered further down in this cell.
workflow = StateGraph(state_schema=MessagesState)

# Define the retriever function
def retrieve_relevant_documents(state: MessagesState):
    """Retrieve documents for the latest message and emit them as context.

    The content of the last entry in ``state["messages"]`` is used as the
    query for the module-level ``retriever``. The ``page_content`` of every
    retrieved document is joined with newlines and wrapped in one
    ``SystemMessage`` whose content starts with ``"Context: "``.

    Args:
        state: Graph state holding the conversation under ``"messages"``.

    Returns:
        A partial state update ``{"messages": [SystemMessage]}``. The incoming
        ``state`` is left untouched; merging the new message into the
        conversation is the job of the state schema's ``messages`` reducer.

    Raises:
        IndexError: If ``state["messages"]`` is empty.
    """
    query = state["messages"][-1].content  # last message is the query
    retrieved_docs = retriever.invoke(query)
    retrieved_content = "\n".join(doc.page_content for doc in retrieved_docs)
    # Return only the new message. The previous version appended to the
    # caller's list and returned the whole state, mutating graph state in
    # place instead of going through the reducer.
    return {"messages": [SystemMessage(content=f"Context: {retrieved_content}")]}

# Define the function that calls the model
def call_model(state: MessagesState):
    """Send the conversation, prefixed with the system prompt, to the LLM.

    Args:
        state: Graph state holding the conversation under ``"messages"``.

    Returns:
        ``{"messages": <llm response>}`` as a state update.
    """
    instructions = "".join(
        [
            "You are a helpful assistant that call me by my name if posible. ",
            "Use the following retrieved documents as context to provide accurate responses. ",
            "Answer all questions to the best of your ability and if the answer is not in the context, respond with 'The question is out of my context.' .",
        ]
    )

    # The system prompt always goes first, ahead of everything already in state.
    conversation = [SystemMessage(content=instructions), *state["messages"]]
    return {"messages": llm.invoke(conversation)}

# Define the nodes and edges
# Linear pipeline: START -> "retriever" -> "model" -> END.
# "retriever" runs retrieve_relevant_documents and "model" runs call_model.
workflow.add_node("retriever", retrieve_relevant_documents)
workflow.add_node("model", call_model)
workflow.add_edge(START, "retriever")
workflow.add_edge("retriever", "model")
workflow.add_edge("model", END)  # Add end node



<langgraph.graph.state.StateGraph at 0x18ec73027b0>

In [12]:
def translate_chat_history(lista):
    """Convert role/message dicts into chat message objects.

    Args:
        lista: Iterable of dicts, each with a ``'role'`` and a ``'message'``
            key. Roles are matched case-insensitively: ``'human'`` and
            ``'user'`` become ``HumanMessage``; ``'assistant'`` and ``'ai'``
            become ``AIMessage``.

    Returns:
        A list of message objects in the same order as ``lista``. Entries
        with any other role are skipped.
    """
    # 'user' is the role name the sample payload in this notebook uses; it
    # used to be dropped silently because only 'human' was recognised.
    message_types = {
        'human': HumanMessage,
        'user': HumanMessage,
        'assistant': AIMessage,
        'ai': AIMessage,
    }
    translated_history = []
    for item in lista:
        message_type = message_types.get(item['role'].lower())
        if message_type is not None:
            translated_history.append(message_type(content=item['message']))
    return translated_history

# Example usage:
# NOTE(review): in the visible cells json is only needed by the commented-out
# json.loads call further down.
import json
# body = {
#     "p_query":"what are the top risks mentioned in the document?",
#     "p_role" :"private",
#     "p_history": "[{\"role\":\"Assistant\",\"message\":\"Hello, what is your name?\"},\
#     {\"role\":\"Human\",\"message\":\"Hello, my name is Evelyn\"}]"
# }
# Sample request: "query" is the new question, "context" holds the prior turns
# as role/message dicts.
data = {
    "query": "what is Oracle AI Vector Search?",
    # "query": "in what applications can I use it?",
    "context": [
       {
            "role": "user",
            "message": "what is Oracle AI Vector Search?"
        },
        {"role": "assistant",
            "message": "Oracle AI Vector Search is designed for Artificial Intelligence (AI) workloads and allows you to query data based on semantics, rather than keywords. It stores vector embeddings, which are mathematical vector representations of data points, and these vector embeddings describe the semantic meaning behind content such as words, documents, audio tracks, or images."}
]
}
body = data
# p_history = json.loads(body['context'])
# Convert the prior turns into message objects for the graph input.
chat_history = translate_chat_history(body['context'])

# Add simple in-memory checkpointer
memory = MemorySaver()
app = workflow.compile(checkpointer=memory)
# Invoke the compiled graph with the prior turns followed by the new question.
# NOTE(review): thread_id presumably identifies the conversation for the
# checkpointer — confirm against the LangGraph persistence docs.
response  = app.invoke(
    {
        "messages": chat_history
        + [HumanMessage(content=body["query"])]
    },
    config={"configurable": {"thread_id": "1"}},
)
# Show only the last message of the resulting state.
response["messages"][-1].pretty_print()



Oracle AI Vector Search is designed for Artificial Intelligence (AI) workloads and allows you to query data based on semantics, rather than keywords. It stores vector embeddings, which are mathematical vector representations of data points, and these vector embeddings describe the semantic meaning behind content such as words, documents, audio tracks, or images.
