# RAG Application: LangChain, SolarLLM, Oracle Database 23ai

## Step 01. 사전 준비

## 01.01 환경 변수 설정

In [21]:
!pip install langchain-experimental
%load_ext dotenv
%dotenv

Collecting langchain-experimental
  Downloading langchain_experimental-0.0.62-py3-none-any.whl.metadata (1.5 kB)
Downloading langchain_experimental-0.0.62-py3-none-any.whl (202 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m202.7/202.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-experimental
Successfully installed langchain-experimental-0.0.62

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


### 01.02 관련 패키지 로딩

In [2]:
import sys
import array
import time
import os
from dotenv import load_dotenv

import oracledb
from langchain_community.vectorstores import oraclevs
from langchain_community.vectorstores.oraclevs import OracleVS

from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_core.documents import BaseDocumentTransformer, Document

from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

import warnings
warnings.filterwarnings("ignore")

print("Successfully imported libraries and modules")

Successfully imported libraries and modules


### 01.03 데이터베이스 연결 

In [3]:
username=os.environ["DB_USER"]
password=os.environ["DB_PASSWORD"]
dsn=os.environ["DSN"]

con = oracledb.connect(user=username, password=password, dsn=dsn)

try: 
    conn23c = oracledb.connect(user=username, password=password, dsn=dsn)
    print("Connection successful!", conn23c.version)
except Exception as e:
    print("Connection failed!")

Connection successful! 23.4.1.24.6


## Step 02. Load the document 

### 텍스트 추출: UpstageLayoutAnalysisLoader

In [5]:
from langchain_upstage import UpstageLayoutAnalysisLoader

file_path = "./pdfs/(Cambridge Texts in Biomedical Engineering) W. Mark Saltzman - Biomedical Engineering_ Bridging Medicine and Technology-Cambridge University Press (2009).pdf"

layzer = UpstageLayoutAnalysisLoader(file_path, split="page")

# For improved memory efficiency, consider using the lazy_load method to load documents page by page.
docs = layzer.load()  # or layzer.lazy_load()

In [6]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

text_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_size=1500, chunk_overlap=200, language=Language.HTML
)
docs = text_splitter.split_documents(docs)
for doc in docs:
    doc.metadata['title']="Biomedical Engineering_ Bridging Medicine and Technology"

In [8]:
len(docs)

2735

In [9]:
for doc in docs:
    doc.metadata['title']="Oracle Database 23ai New Features"

### SolarLLM 임베딩 모델 & 데이터 적재

In [13]:
from langchain_upstage import UpstageEmbeddings
 
upstage_embeddings = UpstageEmbeddings(model="solar-embedding-1-large")
     
# Configure the vector store with the model, table name, and using the indicated distance strategy for the similarity search and vectorize the chunks
s1time = time.time()

knowledge_base = OracleVS.from_documents(docs, upstage_embeddings, client=conn23c, 
                    table_name="biomedical_table", 
                    distance_strategy=DistanceStrategy.DOT_PRODUCT)    

s2time =  time.time()      
print( f"Vectorizing and inserting chunks duration: {round(s2time - s1time, 1)} sec.")


Vectorizing and inserting chunks duration: 547.1 sec.


## Step 03. Retriever 생성 및 쿼리 테스트

### Oracle Database를 위한 Retriever 

In [14]:
vector_store = OracleVS(client=conn23c, 
                        embedding_function=upstage_embeddings, 
                        table_name="biomedical_table", 
                        distance_strategy=DistanceStrategy.DOT_PRODUCT)

retriever = vector_store.as_retriever()

In [37]:
from langchain_experimental.text_splitter import SemanticChunker
quiz_splitter = SemanticChunker(upstage_embeddings)

prb_file_path = "./pdfs/(Prb)Bis200(2024spring)_Quiz1-20240320.pdf"
sol_file_path = "./pdfs/(Ans)Bis200(2024spring)_Quiz1-20240320.pdf"

prb_lazyer = UpstageLayoutAnalysisLoader(prb_file_path, split="page")
prb_docs = prb_lazyer.load()  # or layzer.lazy_load()
prb_docs = text_splitter.split_documents(prb_docs)

sol_lazyer = UpstageLayoutAnalysisLoader(sol_file_path, split="page")
sol_docs = sol_lazyer.load()  # or layzer.lazy_load()
sol_docs = text_splitter.split_documents(sol_docs)



In [38]:
print(prb_docs[1].page_content)

<p id='5' data-category='paragraph' style='font-size:14px'>1. Starting from the familiar image of DNA in metaphase cells, name all the structural levels through<br>which DNA is organized within our cells.<br>2. What are the key enzymes involved in DNA replication in eukaryotic cells and what are their specific<br>functions?<br>3. What are the three principal techniques used in recombinant DNA technology?<br>4. What is the function of the RNA-induced silencing complex (RISC) in the mechanism of siRNA-mediated<br>RNA interference?<br>5. RISC is composed of multiple proteins. What levels of protein structure are present in this complex?</p>


## Step 04. LangChain 애플리케이션 준비

In [52]:
from langchain_upstage import ChatUpstage
from langchain_core.messages import HumanMessage, SystemMessage
 
llm = ChatUpstage()
template = """Please explain the solutions from quiz referencing the below knowledge:
              ----------
              {context}
              ----------
              Quiz: {quiz}
              Sol: {sol} 
              """
prompt = PromptTemplate.from_template(template)
retriever = vector_store.as_retriever()
s5time = time.time()
print("We are sending the prompt and RAG context to the LLM, wait a few seconds for the response...")
chain = (
  {"context": retriever, "quiz": RunnablePassthrough(), "sol": RunnablePassthrough()}
    | prompt
    | llm
    # | StrOutputParser()
    )
response = chain.invoke({"quiz":prb_docs[0], "sol":sol_docs[0]})
print(response.content)

s6time = time.time()
print("")
print( f"Send user question and ranked chunks to LLM and get answer duration: {round(s6time - s5time, 1)} sec.")

We are sending the prompt and RAG context to the LLM, wait a few seconds for the response...
The solutions from the quiz are as follows:

1. Water-soluble molecules are referred to as hydrophilic, molecules that are not easily dissolved in water are called hydrophobic, and molecules that contain both water-soluble and water-insoluble parts are called amphiphilic.
2. The enzyme responsible for transcription is RNA polymerase, and the complex that catalyzes translation is the ribosome.
3. Introns are the sequences that are spliced or removed during RNA splicing, resulting in an mRNA transcript consisting only of exons.
4. The four primary tissue types are muscle, nervous, epithelial, and connective tissues.

Send user question and ranked chunks to LLM and get answer duration: 2.8 sec.


In [55]:
response.__dict__

{'content': 'The solutions from the quiz are as follows:\n\n1. Water-soluble molecules are referred to as hydrophilic, molecules that are not easily dissolved in water are called hydrophobic, and molecules that contain both water-soluble and water-insoluble parts are called amphiphilic.\n2. The enzyme responsible for transcription is RNA polymerase, and the complex that catalyzes translation is the ribosome.\n3. Introns are the sequences that are spliced or removed during RNA splicing, resulting in an mRNA transcript consisting only of exons.\n4. The four primary tissue types are muscle, nervous, epithelial, and connective tissues.',
 'additional_kwargs': {},
 'response_metadata': {'token_usage': {'completion_tokens': 168,
   'prompt_tokens': 2719,
   'total_tokens': 2887},
  'model_name': 'solar-1-mini-chat-240612',
  'system_fingerprint': None,
  'finish_reason': 'stop',
  'logprobs': None},
 'type': 'ai',
 'name': None,
 'id': 'run-19ceac35-025b-4e0e-b195-aa9110a34094-0',
 'example'