In [21]:
import configparser

parser = configparser.ConfigParser()
parser.read('.secrets')
config = parser['OPENAI']['api.key']
#keke1123@naver.com
OPENAI_KEY = config

In [68]:
from openai import OpenAI

client = OpenAI(
    api_key=OPENAI_KEY,
)

# OpenAI 임베딩 생성
def generate_embeddings(text):
    embedding = client.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    ).data[0].embedding
    return embedding

In [None]:
import pdfplumber

# PDF 파일을 텍스트로 변환
def extract_text_from_pdf(pdf_path):
    result = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            data = {
                "page_number": page.page_number,
                "text": page.extract_text(),
            }
            result.append(data)
    return result

In [70]:
import tiktoken  # OpenAI의 토큰화 라이브러리

# 최대 토큰 길이 설정 (모델에 따라 다름)
MAX_TOKENS = 8192  # text-embedding-3-small의 최대 토큰 수
CHUNK_SIZE = 1000  # 분할된 텍스트의 최대 토큰 수

# 텍스트를 토큰 단위로 분할
def split_text_into_chunks(text, chunk_size=CHUNK_SIZE, overlap=500):
    tokenizer = tiktoken.get_encoding("cl100k_base")  # text-embedding-3-small에 적합한 토크나이저
    tokens = tokenizer.encode(text)
    chunks = []
    start = 0

    i = 0
    while start < len(tokens):
        end = start + chunk_size
        chunk = tokens[start:end]
        data = {
            "input": "".join(tokenizer.decode(chunk)),
            "chunk_num":i,
            "text": text[start:end],
        }
        chunks.append(data)
        start = end - overlap  # overlap 만큼 겹치도록 설정
        i += 1
    return chunks

In [76]:
import os
import datetime
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from datetime import datetime

# 최대 토큰 길이 설정 (모델에 따라 다름)
MAX_TOKENS = 8192  # text-embedding-3-small의 최대 토큰 수
CHUNK_SIZE = 1000  # 분할된 텍스트의 최대 토큰 수

# Elasticsearch 설정
es = Elasticsearch("http://localhost:9200")  # Elasticsearch 서버 주소
now = datetime.now()
formattedDate = now.strftime("%Y%m%d")

index_name = "insurance-" + formattedDate

# Elasticsearch에 벡터 데이터 삽입
def index_documents(folder_path):
    for company in os.listdir(folder_path):
        company_path = os.path.join(folder_path, company)
        if os.path.isdir(company_path):
            for product in os.listdir(company_path):
                product_path = os.path.join(company_path, product)
                for file in os.listdir(product_path):    
                    if file.endswith(".pdf"):
                        index_document(os.path.join(product_path, file), company, product)

def index_document(file_path, company, product):
    print(f'{product} start convert task pdf to text')
    # pages = extract_text_from_pdf(file_path)
    pages = "".join(page["text"] for page in extract_text_from_pdf(file_path))
    print('text convert complete')
    chunks = split_text_into_chunks(pages, chunk_size=MAX_TOKENS - 100)
    print('chunks created')
    # 분할된 텍스트를 OpenAI 임베딩으로 변환
    for chunk in chunks:
        document = {
            "company": company,
            "product": product,
            "model" : "text-embedding-3-small",
            "file_name": os.path.basename(file_path),
            "file_path": file_path,
            "content": chunk["text"],
            "embedding": generate_embeddings(chunk["input"]),
            "chunk_number": chunk["chunk_num"],
            # "page_number": chunk["page_number"],
        }
        print('start save')
        es.index(index=index_name, body=document)
    

In [85]:
index_document(f"ai-암보험약관/DB손해보험/무배당 프로미 라이프 New I'mOK 암보험/약관_30652(09)_20250401.pdf", 'DB손해보험', "무배당 프로미 라이프 New I'mOK 암보험")

무배당 프로미 라이프 New I'mOK 암보험 start convert task pdf to text
text convert complete
chunks created
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save


In [None]:
index_document(f"ai-암보험약관/MG손해보험/(무)다이렉트 건강 보험 갱신형/(무)다이렉트건강보험(24.04)_갱신형_약관.pdf", 'MG손해보험', "(무)다이렉트 건강 보험 갱신형")

(무)다이렉트 건강 보험 갱신형 start convert task pdf to text
text convert complete
chunks created
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save
start save


In [98]:
# Elasticsearch에서 벡터 검색
def search_documents(query, top_k=10):
    # OpenAI 임베딩 생성
    embedding = client.embeddings.create(
        input=query,
        model="text-embedding-3-small"
    ).data[0].embedding
    
    query = {
            "size": top_k,
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                        "params": {"query_vector": embedding}
                    },
                    "min_score": 0.8
                }
            },
            "sort": [
                {
                    "_score": {
                        "order": "desc"
                    }
                },
                {
                    "chunk_number": {
                        "order": "asc"
                    }
                }
            ]
        }
    print(query)
    # Elasticsearch 벡터 검색 쿼리
    response = es.search(
        index=index_name,
        body=query
    )
    return [hit["_source"] for hit in response["hits"]["hits"]]

In [99]:
#index_documents('ai-암보험약관')  # PDF 파일을 Elasticsearch에 색인

In [100]:
import json

# OpenAI API 호출
def query_openai(prompt):
    response = client.chat.completions.create(
        model="o1-mini",  # 사용할 OpenAI 모델
        messages=[
            {"role": "user", "content": prompt}
        ],            
    )
    return response.choices[0].message.content

# RAG 시스템
def query_rag(query):
    # Elasticsearch에서 관련 문서 검색
    documents = search_documents(query)
    docs = []
    for doc in documents :
        context_data = {
            "company": doc["company"],
            "product": doc["product"],
            "file_name": doc["file_name"],
            "content": doc["content"],
        }
        docs.append(context_data)
    context_json = json.dumps(docs, ensure_ascii=False, indent=2)

    # OpenAI에 질의
    prompt = f"""
    Temparature: 0.3
    ## Role
    - You are a friendly insurance agent.
    ## Instructions
    - You should actively use the information delivered in 'Context'.
    - Answer the question in 'Question'.
    - Answers should be in Korean
    - Do not give uncertain information that may confuse the user.
    - Avoid saying unnecessary things, and respond in as much detail as possible to what the user needs.
    - If the information passed in 'Context' doesn't provide enough information for the user to answer, answer "Sorry, I don't know".
    - Always include company and product information in context, except when you say you don't know.
    - Except when answering “don't know”, always include the source of the data you referenced at the end of your response, which is passed in the file_name in context
    - If you include multiple companies and products, please compare them.
    - If there are multiple companies, products, and file_names referenced, please list all companies, products, and file_names.
    - If the user's question is about cancer in general, not cancer insurance, you can respond with general medical knowledge without referencing the articles in Context.
    Context:
      {context_json}
    Question:
      {query}
    """
    print(prompt)
    answer = query_openai(prompt)
    return answer


In [101]:
query = "암이란게 뭐야?"
answer = query_rag(query)
print(answer)

{'size': 10, 'query': {'script_score': {'query': {'match_all': {}}, 'script': {'source': "cosineSimilarity(params.query_vector, 'embedding') + 1.0", 'params': {'query_vector': [-0.02807677909731865, 0.019484875723719597, -0.04334921017289162, 0.0033240269403904676, 0.04583585262298584, -0.04876786097884178, -0.030878888443112373, -0.0038088292349129915, 0.016255954280495644, -0.055262818932533264, 0.014734277501702309, 0.048730745911598206, -0.020858095958828926, -0.04364612326025963, 0.011106379330158234, -0.017007512971758842, -0.03598207235336304, 0.029487112537026405, 0.00282530696131289, -0.054186511784791946, -0.01871475949883461, 0.003333305474370718, 0.00714909378439188, 0.038116127252578735, 0.046503905206918716, -0.05845462903380394, -0.0006460164440795779, -0.0599391907453537, -0.0007405412616208196, -0.06921770423650742, 0.04119659587740898, -0.0252375528216362, 0.09939142316579819, -0.0073021892458200455, -0.045167796313762665, 0.015829142183065414, 0.046726588159799576, -

In [102]:
query = "암의 종류는 뭐가 있지?"
answer = query_rag(query)
print(answer)

{'size': 10, 'query': {'script_score': {'query': {'match_all': {}}, 'script': {'source': "cosineSimilarity(params.query_vector, 'embedding') + 1.0", 'params': {'query_vector': [-0.07235744595527649, 0.00746120372787118, -0.019914088770747185, -0.02616026997566223, 0.05407508462667465, -0.039582543075084686, -0.031353723257780075, -0.011799316853284836, 0.032037995755672455, -0.024896997958421707, -0.012202862650156021, 0.015843544155359268, -0.011808089911937714, 0.021703725680708885, 0.0020254033152014017, -0.009439453482627869, -0.028441179543733597, 0.039021085947752, 0.018177088350057602, -0.047442905604839325, 0.010413226671516895, -0.018896453082561493, -0.0029037725180387497, -0.009421908296644688, 0.03986326977610588, -0.05860181152820587, -0.025581270456314087, -0.028230633586645126, -0.05698763206601143, -0.029774634167551994, 0.007255044765770435, -0.022212542593479156, 0.07242763042449951, 0.025511087849736214, -0.04993435740470886, 0.008891157805919647, 0.05418035760521889

In [103]:
query = "암보험이 보장하는 암은 뭐가 있지?"
answer = query_rag(query)
print(answer)

{'size': 10, 'query': {'script_score': {'query': {'match_all': {}}, 'script': {'source': "cosineSimilarity(params.query_vector, 'embedding') + 1.0", 'params': {'query_vector': [0.005401743110269308, 0.016946446150541306, -0.03659376502037048, 0.052230387926101685, -0.0029242518357932568, -0.03702021762728691, -0.05803827941417694, -0.02930344082415104, 0.016255997121334076, 2.9152088245609775e-05, 0.016438763588666916, 0.02676502801477909, -0.0262573454529047, -0.027029022574424744, 0.02120082639157772, -0.015697546303272247, -0.04414808005094528, 0.041792433708906174, 0.023353401571512222, -0.06214035302400589, -0.008661065250635147, -0.030745260417461395, 0.00038742530159652233, -0.0028480994515120983, 0.029648665338754654, -0.04167059063911438, -0.016763679683208466, -0.07046634703874588, -0.008376763202250004, -0.035476863384246826, 0.04073645547032356, -0.031984005123376846, 0.08061999827623367, -0.016225537285208702, -0.054342348128557205, 0.019373169168829918, 0.0389087982475757

In [104]:
query = "암을 정의하는 조항에 대해서 설명해줘"
answer = query_rag(query)
print(answer)

{'size': 10, 'query': {'script_score': {'query': {'match_all': {}}, 'script': {'source': "cosineSimilarity(params.query_vector, 'embedding') + 1.0", 'params': {'query_vector': [-0.038286592811346054, -0.009702000766992569, -0.00797947682440281, 0.02687137760221958, 0.004883588757365942, -0.03171306848526001, -0.06402203440666199, -0.029366709291934967, 0.013528797775506973, -0.01228113193064928, 0.029832256957888603, 0.07229015231132507, -0.023575304076075554, -0.03614507615566254, 0.046331245452165604, -0.015242011286318302, -0.036107830703258514, 0.04945972189307213, 0.0686030164361, -0.004478563088923693, 0.02759763039648533, 0.004373814910650253, 0.03210412710905075, 0.05027908459305763, 0.007043727207928896, -0.0902044028043747, 0.03258829563856125, -0.017253175377845764, -0.00814241822808981, -0.047225095331668854, 0.006382650230079889, -0.03579125925898552, 0.06361234933137894, -0.04547463729977608, -0.029199112206697464, 0.030148828402161598, 0.03398493677377701, -0.02180622518

In [None]:
PUT /insurance_documents
{
  "mappings": {
    "properties": {
      "embedding": {
        "type": "dense_vector",
        "dims": 1536
      }
    }
  }
}



위 설정은 OpenAI의 `text-embedding-ada-002` 모델이 생성하는 1536차원 벡터를 저장할 수 있도록 합니다.

Similar code found with 1 license type



### 주요 변경 사항:
1. **Google Gemini API 제거**:
   - `query_gemini` 함수와 관련된 설정을 제거했습니다.

2. **OpenAI API 추가**:
   - `openai` 라이브러리를 사용하여 OpenAI API를 호출하도록 `query_openai` 함수를 추가했습니다.
   - OpenAI API 키를 설정하는 부분을 추가했습니다.

3. **RAG 시스템 수정**:
   - Google Gemini 호출 부분을 OpenAI 호출로 대체했습니다.

`your-openai-api-key` 부분에 실제 OpenAI API 키를 입력하세요.