In [1]:
from build_embedder import *
from build_vectordb import create_retriever
from build_llm import *

config - INFO - Current Working Directory: c:\Users\Rahul Gupta\Documents\RG\GenAI\0.self_explore
config - INFO - Folder separator used w.r.t OS: \
config - INFO - Log File: c:\Users\Rahul Gupta\Documents\RG\GenAI\0.self_explore\logs\coe_demo_2024_05_18_12_43_57.log
config - INFO - Documents Directory: c:\Users\Rahul Gupta\Documents\RG\GenAI\0.self_explore\documents
config - INFO - Documents folder being read: c:\Users\Rahul Gupta\Documents\RG\GenAI\0.self_explore\documents\coe_demo


In [None]:
#
# This one is to be used with Streamlit
#

import streamlit as st

from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import FAISS
# Luigi's OracleVectorStore wrapper for LangChain
from oracle_vector_db_lc import OracleVectorStore

In [2]:
from langchain.prompts import ChatPromptTemplate

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
# Module-level placeholder so the global name exists before any chain is built.
# create_rag_chain() rebinds this global to a fully configured memory instance,
# and clear_conv_memory() clears whichever instance is currently bound.
my_conv_memory = ConversationBufferMemory()

In [11]:
def print_response(my_dict):
    """Log selected fields of a chain response in a fixed, readable order.

    The keys 'question', 'generated_question', 'answer', 'chat_history' and
    'source_documents' are logged (in that order) through the module-level
    ``logger`` at INFO level. List values are logged one item per line;
    any other value is logged on a single indented line.

    Keys that are absent from ``my_dict`` are skipped instead of raising
    ``KeyError``, so this debug helper cannot break the caller when the
    response does not carry every field.

    Args:
        my_dict: Mapping holding the response fields to log.
    """
    desired_order = ['question', 'generated_question', 'answer', 'chat_history', 'source_documents']
    for key in desired_order:
        if key not in my_dict:
            # Nothing to report for this field; keep logging the rest.
            continue

        value = my_dict[key]
        logger.info(f"{key}:")
        if isinstance(value, list):
            # One log line per element keeps long lists readable.
            for item in value:
                logger.info(f'{item}')
        else:
            logger.info(f"    {value}")

In [4]:
#
# def: get_answer  from LLM
#
def get_answer(rag_chain, question):
    """Send ``question`` to ``rag_chain`` and hand back whatever it returns.

    Args:
        rag_chain: Object exposing ``invoke(question)``.
        question: The value passed straight through to ``rag_chain.invoke``.

    Returns:
        The unmodified result of ``rag_chain.invoke(question)``.

    When the ``DEBUG`` flag is truthy, the result is also passed to
    ``print_response`` before being returned.
    """
    answer = rag_chain.invoke(question)

    if not DEBUG:
        return answer

    print_response(answer)
    return answer

In [None]:
#
# Initialize_rag_chain
#
def _restore_vectorstore(embedder):
    """Restore the vector store selected by ``VECTOR_STORE`` or exit with status 1.

    Args:
        embedder: Embeddings object returned by ``create_embedder()``. It is
            passed as ``embedding_function`` to Chroma, and its ``embed_query``
            method is passed as ``embedding_function`` to OracleVectorStore.

    Returns:
        The restored vector store instance.

    Raises:
        SystemExit: With status 1 when ``VECTOR_STORE`` is neither "CHROMA"
            nor "ORACLE", or when constructing the store raises.
    """
    # Local import so this block does not rely on `sys` leaking in through
    # one of the notebook's star imports.
    import sys

    if VECTOR_STORE == "CHROMA":
        # restore persistent chromadb
        def build_store():
            return Chroma(persist_directory=f"./vectorstore/{VECTORDB_FOLDER}_chromadb", embedding_function=embedder)
    elif VECTOR_STORE == "ORACLE":
        # restore oracle vectordb
        def build_store():
            return OracleVectorStore(embedding_function=embedder.embed_query, verbose=True)
    else:
        # A configuration error must not terminate with a success status, and
        # the `exit()` helper from `site` is meant for interactive shells only.
        logger.error('Please check the vector store to be used as part of configuration')
        sys.exit(1)

    try:
        vectorstore = build_store()
        logger.info(f'{VECTOR_STORE} vectordb restored')
    except Exception:
        logger.exception("Exception occured")
        sys.exit(1)

    return vectorstore


# to run it only once
@st.cache_resource
def create_rag_chain():
    """Build the conversational retrieval (RAG) chain.

    Steps: create the embedder, restore the configured vector store, create a
    retriever from it, build the LLM, build the prompt, create the
    conversation memory and finally assemble the ConversationalRetrievalChain.

    Side effects:
        Rebinds the module-level ``my_conv_memory`` to the memory instance
        used by the returned chain, so ``clear_conv_memory()`` acts on it.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``; it is
        configured to return source documents and the generated question.

    Raises:
        SystemExit: With status 1 when the vector store cannot be restored
            or ``VECTOR_STORE`` holds an unsupported value.

    NOTE(review): the chain is wrapped in ``st.cache_resource`` and the memory
    is a module global, so presumably every Streamlit session shares the same
    conversation history - confirm this is intended.
    """
    logger.debug('#### ENTER create_rag_chain() function ####')

    # 1. Load embeddings model
    embedder = create_embedder()

    # 2. restore vectordb
    vectorstore = _restore_vectorstore(embedder)

    # 3. Create a retriever
    retriever = create_retriever(vectorstore)

    # 4. Build the LLM
    llm = build_llm()

    # 5. Build prompt template
    template = """You are a helpful assistant who searches for answer by going through provided context below. \
    Answer the question as truthfully as possible using only the context provided. \
    If the answer is not contained within the context below, say "I don't know". \
    Do not end your answer with a question.
    Context: {context}
    Question: {question}
    """

    rag_prompt = ChatPromptTemplate.from_template(template)

    # 6. Build conversation memory
    logger.info("Initializing Conversation Buffer Memory")
    global my_conv_memory
    my_conv_memory = ConversationBufferMemory(
        memory_key="chat_history",
        # The chain returns several outputs, so the memory is told explicitly
        # which input and output keys to record.
        input_key='question', output_key='answer',
        return_messages=True,
        verbose = True
    )

    # 7. Create a conversation chain
    logger.info("Initializing Conversation Retrieval Chain")
    qa = ConversationalRetrievalChain.from_llm(
        llm,
        chain_type="stuff",
        retriever=retriever,
        memory=my_conv_memory,
        return_source_documents=True,
        return_generated_question=True,
        rephrase_question=False,
        combine_docs_chain_kwargs={'prompt': rag_prompt}
    )

    logger.info("\nRAG Chain created successfully")

    logger.debug('#### EXIT create_rag_chain() function ####')
    return qa

In [None]:
# 
# Reset Conversation Buffer Memory
# 
def clear_conv_memory():
    """Empty the conversation memory currently bound to ``my_conv_memory``.

    Calls ``clear()`` on the module-level ``my_conv_memory`` object and logs
    entry, completion and exit through the module-level ``logger``.
    """
    logger.debug('#### ENTER clear_conv_memory() function ####')

    # The global is only read here, never rebound, so no `global` statement
    # is required to reach it.
    my_conv_memory.clear()

    logger.info('Conversation Memory cleared')
    logger.debug('#### EXIT clear_conv_memory() function ####')

In [6]:
# qa = create_rag_chain()

config - INFO - Documents being processed:
config - INFO - (1) data-science-lifecycle-ebook.pdf
config - INFO - (2) oracle-autonomous-database-technical-overview.pdf
config - INFO - Loading document: data-science-lifecycle-ebook.pdf...
config - INFO - Loading document: oracle-autonomous-database-technical-overview.pdf...
config - INFO - 
Loaded 32 docs...

config - INFO - Splitted the document in 145 chunks...
config - INFO - Number of non-empty chunks: 145
config - INFO - Loading OCI GenAI Cohere Embeddings Model: cohere.embed-english-v3.0
config - INFO - Using CHROMA as Vector Store...
config - INFO - persistent vector store being created
config - INFO - Directory called coe_demo_chromadb has been deleted from vectorstore folder
config - INFO - A new directory called coe_demo_chromadb will be created under vectorstore folder
config - INFO - No. of document splits: 145
config - INFO - Document embeddings being created in a batch size of 96 docs at a time
config - INFO - No reranking..

In [None]:
# result = qa.invoke("what is machine learning?")
# print_response(result)

In [None]:
# result = qa.invoke("what are different steps needed to build it?")
# print_response(result)

In [None]:
# result = qa.invoke("what is oracle autonomous database?")
# print_response(result)

In [None]:
# result = qa.invoke("what are the different offerings that it provides?")
# print_response(result)


In [12]:
# result = qa.invoke("Provide more details about Oracle Autonomous Data Warehouse")
# print_response(result)


config - INFO - question:
config - INFO -     Provide more details about Oracle Autonomous Data Warehouse
config - INFO - generated_question:
config - INFO -      What are the key features of the Oracle Autonomous Data Warehouse and how do they make it suitable for data warehousing, data marts, data lakes, and machine learning workloads?
config - INFO - answer:
config - INFO -     Oracle Autonomous Data Warehouse is a solution designed for specific data warehousing, data mart, and data lake business requirements. It is tailored to streamline machine learning and data warehousing operations by utilizing data modeling techniques such as star schema to ensure optimal data structures for business analysts and data scientists. Given the large volumes of data typically housed in data warehouses, Oracle ADW uses summary data representations and highly parallel SQL to provide quick responses times, enabling efficient data processing and streaming into the database. 

Oracle Autonomous Data War