# RAG Application: LangChain, SolarLLM, Oracle Database 23ai

## Step 01. 사전 준비

### 01.01 환경 변수 설정

In [1]:
!pip install langchain-experimental
%load_ext dotenv
%dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### 01.02 관련 패키지 로딩

In [2]:
import sys
import array
import time
import os
from dotenv import load_dotenv

import oracledb
from langchain_community.vectorstores import oraclevs
from langchain_community.vectorstores.oraclevs import OracleVS

from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_core.documents import BaseDocumentTransformer, Document

from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_upstage import UpstageEmbeddings, UpstageLayoutAnalysisLoader
from langchain_upstage import ChatUpstage
from langchain_core.messages import HumanMessage, SystemMessage

import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation notices from the libraries imported
# above; consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")

# Printed only when every import above succeeded.
print("Successfully imported libraries and modules")

Successfully imported libraries and modules


### 01.03 데이터베이스 연결 

In [3]:
# Connection settings are read from the environment; a missing variable raises
# KeyError here instead of failing later with a less obvious error.
username = os.environ["DB_USER"]
password = os.environ["DB_PASSWORD"]
dsn = os.environ["DSN"]

# Open a single session. The connect call is the only statement guarded by the
# try block, so a failure is reported together with its cause and then
# re-raised: every later cell needs `conn23c`, and continuing without it would
# only surface as a NameError further down.
try:
    conn23c = oracledb.connect(user=username, password=password, dsn=dsn)
except Exception as e:
    print("Connection failed!", e)
    raise
else:
    print("Connection successful!", conn23c.version)

# `con` used to be a second, independent session that nothing in this notebook
# used. It is kept as an alias so any code that still refers to `con` resolves,
# without holding an extra database session open.
con = conn23c

Connection successful! 23.4.1.24.6


## Step 02. Load the document 

### 텍스트 추출: UpstageLayoutAnalysisLoader

In [None]:

# Path of the source textbook PDF, relative to the notebook's working directory.
file_path = "./pdfs/(Cambridge Texts in Biomedical Engineering) W. Mark Saltzman - Biomedical Engineering_ Bridging Medicine and Technology-Cambridge University Press (2009).pdf"

# Layout-analysis loader for the PDF; split="page" requests page-level splitting.
layzer = UpstageLayoutAnalysisLoader(file_path, split="page")

# For improved memory efficiency, consider using the lazy_load method to load documents page by page.
# load() is used here so that `docs` holds the whole result for the next cells.
docs = layzer.load()  # or layzer.lazy_load()

In [None]:
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# The loader output is HTML markup, so use the HTML-aware separators.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.HTML,
    chunk_size=1500,
    chunk_overlap=200,
)

# `docs` is rebound from the loaded documents to the resulting chunks.
docs = text_splitter.split_documents(docs)

# Tag every chunk with the book title.
for chunk in docs:
    chunk.metadata["title"] = "Biomedical Engineering_ Bridging Medicine and Technology"

In [None]:
# Number of chunks produced by the splitter cell above.
len(docs)

2735

### SolarLLM 임베딩 모델 & 데이터 적재

In [7]:
# Embedding client used to vectorize the chunks.
upstage_embeddings = UpstageEmbeddings(model="solar-embedding-1-large")

# Vectorize the chunks and store them in the named table, using dot product as
# the distance strategy for similarity search. The call is timed end to end.
s1time = time.time()
knowledge_base = OracleVS.from_documents(
    docs,
    upstage_embeddings,
    client=conn23c,
    table_name="biomedical_table",
    distance_strategy=DistanceStrategy.DOT_PRODUCT,
)
s2time = time.time()

elapsed_sec = round(s2time - s1time, 1)
print(f"Vectorizing and inserting chunks duration: {elapsed_sec} sec.")


NameError: name 'docs' is not defined

## Step 03. Retriever 생성 및 쿼리 테스트

### Oracle Database를 위한 Retriever 

In [4]:
# Embedding client for query vectors; same model name as the one used when the
# table was populated.
upstage_embeddings = UpstageEmbeddings(model="solar-embedding-1-large")

# Attach to the existing table instead of re-inserting the documents.
vector_store = OracleVS(
    client=conn23c,
    table_name="biomedical_table",
    embedding_function=upstage_embeddings,
    distance_strategy=DistanceStrategy.DOT_PRODUCT,
)

# Retriever with default search settings.
retriever = vector_store.as_retriever()

In [5]:
# Quiz problem sheet: build the loader, then load it.
prb_file_path = "./pdfs/(Prb)Bis200(2024spring)_Quiz1-20240320.pdf"
prb_lazyer = UpstageLayoutAnalysisLoader(prb_file_path, split="page")
prb_docs = prb_lazyer.load()

# Matching answer sheet, loaded the same way.
sol_file_path = "./pdfs/(Ans)Bis200(2024spring)_Quiz1-20240320.pdf"
sol_lazyer = UpstageLayoutAnalysisLoader(sol_file_path, split="page")
sol_docs = sol_lazyer.load()

In [6]:
# Show the extracted markup of the first quiz page.
print(prb_docs[0].page_content)

<h1 id='0' style='font-size:18px'>BiS200 Quiz#1 (2024-03-20)</h1> <br><p id='1' data-category='paragraph' style='font-size:14px'>Answer in English, Closed book, 15 minutes<br>There will be a class after the quiz.</p> <br><h1 id='2' style='font-size:14px'>Fill-in-the-blank questions:</h1> <br><p id='3' data-category='paragraph' style='font-size:14px'>1. Water-soluble molecules are referred to as ( ), molecules that are not easily dissolved in water are<br>called ( ), and molecules that contain both water-soluble and water-insoluble parts are called ( ).<br>2. The enzyme responsible for transcription is ( ), and the complex that catalyzes translation is the ( ).<br>3. ( ) are the sequences that are spliced or removed during RNA splicing, resulting in an mRNA transcript<br>consisting only of ( ).<br>4. The four primary tissue types are muscle, nervous, epithelial, and ( ).</p> <h1 id='4' style='font-size:14px'>Short-answer questions:</h1> <br><p id='5' data-category='paragraph' style='fon

In [29]:
# NOTE(review): this whole cell is disabled (commented out); the output stored
# beneath it comes from an earlier run of this code.
# In that run the model echoed the full input dict back as the "label", which
# is consistent with both keys being wired to RunnablePassthrough(): each key
# then receives the entire invoke() input rather than its own field. If this
# cell is re-enabled, select the fields explicitly (for example with
# operator.itemgetter("label") / operator.itemgetter("context")).
# The second timing message also measures from s5time, so it includes the
# duration of the first call.
# parser = ChatUpstage()
# template = """Please extract only the quiz {label} from the provided content.
#               -----
#               {context}
#               -----
#               """
# prompt = PromptTemplate.from_template(template)
# s5time = time.time()
# print("We are sending the prompt and RAG context to the LLM, wait a few seconds for the response...")
# chain = (
#   {"label": RunnablePassthrough(), "context": RunnablePassthrough()}
#     | prompt
#     | parser
#     | StrOutputParser()
#     )
# prb_response = chain.invoke({"label":"problems", "context": prb_docs[0].page_content})
# print(prb_response)

# s6time = time.time()
# print("")
# print( f"Send user question and ranked chunks to LLM and get answer duration: {round(s6time - s5time, 1)} sec.")

# print("We are sending the prompt and RAG context to the LLM, wait a few seconds for the response...")
# sol_response = chain.invoke({"label": "solutions","context":sol_docs[0].page_content})
# print(sol_response)

# s6time = time.time()
# print("")
# print( f"Send user question and ranked chunks to LLM and get answer duration: {round(s6time - s5time, 1)} sec.")

We are sending the prompt and RAG context to the LLM, wait a few seconds for the response...
I have extracted the quiz with the label 'problems' from the provided content. Here it is:
```vbnet
{'label': 'problems', 'context': "<h1 id='0' style='font-size:18px'>BiS200 Quiz#1 (2024-03-20)</h1> <br><p id='1' data-category='paragraph' style='font-size:14px'>Answer in English, Closed book, 15 minutes<br>There will be a class after the quiz.</p> <br><h1 id='2' style='font-size:14px'>Fill-in-the-blank questions:</h1> <br><p id='3' data-category='paragraph' style='font-size:14px'>1. Water-soluble molecules are referred to as ( ), molecules that are not easily dissolved in water are<br>called ( ), and molecules that contain both water-soluble and water-insoluble parts are called ( ).<br>2. The enzyme responsible for transcription is ( ), and the complex that catalyzes translation is the ( ).<br>3. ( ) are the sequences that are spliced or removed during RNA splicing, resulting in an mRNA transc

## Step 04. LangChain 애플리케이션 준비

In [13]:
from langchain_upstage import UpstageGroundednessCheck

# Answer the quiz with RAG, then verify the answer against the retrieved
# chunks. While the answer is not judged grounded, double the number of
# retrieved chunks (k) and try again, up to MAX_K.
groundedness_check = UpstageGroundednessCheck()
llm = ChatUpstage()
template = """Please give me the solutions from quiz referencing the below knowledge:
              ----------
              {context}
              ----------
              Quiz: {quiz}
              """
prompt = PromptTemplate.from_template(template)

# The quiz text is both the retrieval query and the {quiz} prompt variable.
quiz_text = prb_docs[0].page_content

# Upper bound for k so the loop always terminates, even if the groundedness
# check never accepts an answer.
MAX_K = 64
k = 4
grounded = False
while not grounded and k <= MAX_K:
  # The number of chunks to retrieve is configured through search_kwargs;
  # passing k directly to as_retriever() is not the documented way to set it.
  retriever = vector_store.as_retriever(search_kwargs={"k": k})
  s5time = time.time()
  print("We are sending the prompt and RAG context to the LLM, wait a few seconds for the response...")
  chain = (
    {"context": retriever, "quiz": RunnablePassthrough()}
      | prompt
      | llm
      | StrOutputParser()
      )
  # Invoke with the plain string: the retriever needs a text query, and
  # RunnablePassthrough forwards the same string as {quiz}. Passing a dict
  # would put the dict's repr into the prompt.
  response = chain.invoke(quiz_text)
  print(response)

  s6time = time.time()
  print("")
  print( f"Send user question and ranked chunks to LLM and get answer duration: {round(s6time - s5time, 1)} sec.")

  # Build the reference text from the same query and k. Iterate over what was
  # actually returned, because the store may hold fewer than k chunks.
  result_chunks = vector_store.similarity_search(quiz_text, k=k)
  ref = "".join(chunk.page_content + "\n" for chunk in result_chunks)
  gc_result = groundedness_check.invoke({"context": ref, "answer": response})

  # print("GC check result: ", gc_result)
  if gc_result.lower().startswith("grounded"):
    print(f"Answer is grounded\n[ref] {ref}")
    grounded = True
  else:
    print(f"answer is not grounded with k={k}")
    k *= 2

if not grounded:
  print(f"No grounded answer found with k up to {MAX_K}")



We are sending the prompt and RAG context to the LLM, wait a few seconds for the response...
Quiz Solutions:

1. Water-soluble molecules are referred to as "soluble," molecules that are not easily dissolved in water are called "insoluble," and molecules that contain both water-soluble and water-insoluble parts are called "amphiphilic."
2. The enzyme responsible for transcription is "RNA polymerase," and the complex that catalyzes translation is the "ribosome."
3. "Introns" are the sequences that are spliced or removed during RNA splicing, resulting in an mRNA transcript consisting only of "exons."
4. The fourth primary tissue type is "connective tissue."

Short-answer questions:

1. The structural levels of DNA organization within our cells are: chromosomes, chromatin, nucleosomes, DNA-histone complexes, and DNA double helix.
2. The key enzymes involved in DNA replication in eukaryotic cells are DNA polymerase, primase, helicase, and topoisomerase. DNA polymerase adds nucleotides to th