In [1]:
import os
import oracledb
from langchain_community.embeddings import OCIGenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain_community.vectorstores import OracleVS
from langchain_community.vectorstores.utils import DistanceStrategy

# Load settings from a .env file into the process environment; the database and
# embedding cells below read them with os.getenv(...).
from dotenv import load_dotenv
load_dotenv()

True

# Create knowledge base table

### Create database connection

In [None]:

# Prefix for the folder holding the wallet and config files; with the empty
# string, "oci" is resolved relative to the current working directory.
default_path = ""
wallet_dir = default_path + "oci"  # one folder is used for both config_dir and wallet_location

# User, password, service name and wallet password come from environment
# variables so that no secret is written into the notebook.
connection = oracledb.connect(
    dsn=os.getenv('CON_ADB_DEV_SERVICE_NAME'),
    user=os.getenv('CON_ADB_DEV_USER_NAME'),
    password=os.getenv('CON_ADB_DEV_PASSWORD'),
    config_dir=wallet_dir,
    wallet_location=wallet_dir,
    wallet_password=os.getenv('DB_WALLET_PASSWORD'),
)

### Create knowledge table

In [3]:
table_name = 'my_docs'

# DDL for the knowledge table: two numeric ids, the chunk text and its metadata
# as CLOBs, and a mandatory VECTOR column for the embedding.
# NOTE(review): a later cell builds the store with OracleVS.from_documents on
# this same table name and is commented as creating the table — confirm whether
# this manual CREATE TABLE is still needed or is replaced by that call.
create_table_sql = f"""
        CREATE TABLE IF NOT EXISTS {table_name}
        (	
            DOCS_ID NUMBER, 
            FILE_ID NUMBER, 
            TEXT CLOB , 
            METADATA CLOB , 
            EMBEDDING VECTOR NOT NULL ENABLE
        ) 
        """

with connection.cursor() as cursor:
    cursor.execute(create_table_sql)

### Split the text and update metadata

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
# Step 1: Load the PDF document
def load_documents(file_name):
    """Load a PDF file through PyPDFLoader.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        The documents produced by ``PyPDFLoader(file_name).load()``.
    """
    return PyPDFLoader(file_name).load()
 
# Step 2: Split the documents into smaller chunks for better processing
def split_documents(documents, chunk_size=1000, chunk_overlap=100):
    """Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split, e.g. the result of ``load_documents``.
        chunk_size: ``chunk_size`` handed to the splitter. Defaults to 1000,
            the value this notebook previously hard-coded.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            100, the value this notebook previously hard-coded.

    Returns:
        The chunk documents returned by the splitter's ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

# Load the PDF and cut it into chunks
file_name ="data/database-concepts-23ai.pdf"
documents = load_documents(file_name)
chunks = split_documents(documents)

# Loader-provided metadata fields that are copied over unchanged.
carried_keys = ('source', 'total_pages', 'page', 'page_label', 'title')

for idx, chunk in enumerate(chunks):
    # Replace the chunk's metadata with a fixed layout: running id, file id and
    # role first, then the carried-over fields. A field missing from the loader
    # metadata raises KeyError, just as direct indexing does.
    chunk.metadata = {
        "docs_id": idx,
        "file_id": 1,
        "user_role": 'tester',
        **{key: chunk.metadata[key] for key in carried_keys},
    }
# chunks

In [5]:
# Spot-check one chunk to confirm its metadata now has the rewritten layout.
chunks[100].metadata

{'docs_id': 100,
 'file_id': 1,
 'user_role': 'tester',
 'source': 'data/database-concepts-23ai.pdf',
 'total_pages': 794,
 'page': 42,
 'page_label': '1-19',
 'title': 'Database Concepts'}

In [6]:
# OCI authentication settings for the embedding client.
default_path = ""
AUTH_TYPE = "API_KEY"
CONFIG_PROFILE = "DEFAULT"

# Location of the OCI config file, relative to default_path.
oci_config_file = default_path + "oci/config"

# Model id, endpoint and compartment are taken from environment variables.
# NOTE(review): truncate="NONE" presumably means over-long inputs are not
# truncated by the service — confirm against the OCI GenAI documentation.
embeddings = OCIGenAIEmbeddings(
    auth_type=AUTH_TYPE,
    auth_profile=CONFIG_PROFILE,
    auth_file_location=oci_config_file,
    model_id=os.getenv('CON_GEN_AI_EMB_MODEL_ID'),
    service_endpoint=os.getenv('CON_GEN_AI_SERVICE_ENDPOINT'),
    compartment_id=os.getenv('CON_GEN_AI_COMPARTMENT_ID'),
    truncate="NONE",
)

# documents = ["This is a sample document", "and here is another one"]
# response = embeddings.embed_documents(documents)
# print(response)

In [17]:
# Number of chunks that will be embedded and stored.
len(chunks)

2038

In [24]:
# Build the vector store with from_documents, which creates the table and
# inserts the first 10 chunks; cosine distance is the similarity measure.
# NOTE(review): confirm how this table creation interacts with the my_docs
# table created manually earlier in the notebook.
initial_chunks = chunks[:10]
vector_store = OracleVS.from_documents(
    initial_chunks,
    embeddings,
    client=connection,
    table_name=table_name,
    distance_strategy=DistanceStrategy.COSINE,
)




In [None]:
# Use add_documents to insert the remaining chunks (everything after the first 10)
vector_store.add_documents(chunks[10:])

In [28]:
# Create an HNSW vector index on the vector store's table
from langchain_community.vectorstores import oraclevs

# Index settings passed through to oraclevs.create_index.
hnsw_index_params = {
    "idx_name": "hnsw_idx1",
    "idx_type": "HNSW",
    "accuracy": 97,
    "parallel": 16,
}
oraclevs.create_index(connection, vector_store, params=hnsw_index_params)

In [30]:
# Run a similarity search against the vector store
query = "Oracle AI Vector Search?"
top_k = 2  # number of results requested
retrieved_docs = vector_store.similarity_search(query, top_k)
retrieved_docs


[Document(metadata={'docs_id': Decimal('42'), 'file_id': Decimal('1'), 'user_role': 'tester', 'source': 'data/database-concepts-23ai.pdf', 'total_pages': Decimal('794'), 'page': Decimal('19'), 'page_label': 'xx', 'title': 'Database Concepts'}, page_content='Distributed SQL 20-35\nDatabase Links 20-35\nInformation Sharing 20-36\nOracle GoldenGate 20-36\nOracle Database Advanced Queuing (AQ) 20-36\n21  \n \nArtificial Intelligence in the Oracle Database\nOverview of Oracle AI Vector Search 21-1\nUnderstand Hierarchical Navigable Small World Indexes 21-5\nUnderstand Inverted File Flat Vector Indexes 21-9\nVector Distance Metrics 21-15\nArtificial Intelligence in the Oracle Database 21-15\nPerform Exact Similarity Search 21-15\nUnderstand Approximate Similarity Search Using Vector Indexes 21-17\nxx'),
 Document(metadata={'docs_id': Decimal('1618'), 'file_id': Decimal('1'), 'user_role': 'tester', 'source': 'data/database-concepts-23ai.pdf', 'total_pages': Decimal('794'), 'page': Decimal('61