In [1]:
import os
from dotenv import load_dotenv
from datetime import datetime

from langchain_community.document_loaders import TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_upstage import UpstageDocumentParseLoader # 위키독스 따라가면 안됨.

load_dotenv()

True

In [2]:
def prossed_document(file_path):
    loader = UpstageDocumentParseLoader(file_path=file_path, output_format="html", split="element", ocr="force", coordinates=False,)
    docs = loader.load()
    metadata = {"source": file_path, "date": datetime.now().isoformat()}
    if metadata:
        for doc in docs:
            if hasattr(doc, "metadata"):  # 문서에 metadata 속성이 있는지 확인
                doc.metadata.update(metadata)  # 메타데이터 업데이트
            else:
                doc.metadata = metadata  # 새로운 메타데이터 추가

    return docs

def create_db(DB_PATH, documents, embedding=OpenAIEmbeddings(model="text-embedding-3-small"), name="db"):
    db = Chroma.from_documents(
    documents=documents, persist_directory=DB_PATH, embedding=embedding, collection_name=name
    )
    return db 

def add_data(db, documents):
    db.add_documents(documents)

def dorp_data(db, ids):
    db.delete(ids=ids)

def select_data(db, query=None):
    try:
        if query or query != []:
            return db.get(where=query)
        elif query != []:
            return db.get()
    except Exception as e:
        print(e)

In [3]:
DB_PATH = "./chroma_db"
name = "my_db"
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
file_path = "data/Aligning Instruction Tuning with Pre-training.pdf"

In [5]:
documents = prossed_document(file_path)

documents

ValueError: HTTP error: {"error":{"message":"API key suspended due to insufficient credit. Register your payment method at https://console.upstage.ai/billing to continue.","type":"invalid_request_error","param":"","code":"api_key_is_not_allowed"}}

In [36]:
persist_db = create_db(DB_PATH=DB_PATH, documents=documents, embedding=embedding, name=name)
db = Chroma(persist_directory=DB_PATH, embedding_function=embedding, collection_name=name,)

In [6]:
type(db)

langchain_chroma.vectorstores.Chroma

### 추가로 사용할 수 있는 연산자
- `$and`: 모든 조건이 만족되어야 함
- `$or`: 하나라도 조건이 만족되면 됨
- `$gt`, `$lt`, `$gte`, `$lte`: 숫자 비교
- `$eq`, `$ne`: 같음/같지 않음

In [39]:
ddb = select_data(db, {'$and': [{'page': 6}, {'id': 1}]})['ids']

ddb

[]

In [44]:
dd = dorp_data(db, select_data(db, {'$and': [{'page': 6}, {'id': 1}]})['ids'])

dd

ValueError: Expected IDs to be a non-empty list, got 0 IDs in delete.

In [8]:
file_path = "과제\20250122과제\data\Parallel Multi-objective Metaheuristics for Smart Communications in Vehicular Networks.pdf"
documents = prossed_document(file_path)

add_data(persist_db, documents)

In [24]:
select_data(db)

{'ids': ['e6d51af1-2629-413e-ac90-4c7019dd0afe',
  'fdbd0627-b33e-4711-bbc3-a4ec465a5454',
  'c09fc517-d4f1-47bf-952b-9529b91a2693',
  '4a866414-535b-4d5e-90a1-9b7f9d082325',
  'f8e10b64-f1bf-40f7-99c6-51f10c92e064',
  '42a81464-68f3-44a4-b44e-671589ae4499',
  'ee4c7fa5-0b88-49c9-a0c7-df786dc7b080',
  '0f5f417e-6bd2-4872-ab92-0a4b3c8351fe',
  'c4b3cde4-31bc-4b1d-8cb3-b8e4af718c1d',
  '5435caf8-a48a-437e-95ac-edd23af51786',
  'fa336dbe-26d5-4671-8eda-f0b74c86718c',
  '2dd9a63f-d6f4-436a-9b4f-6b06774954a5',
  'c14a3fca-d2a6-4b2a-aaa7-cb3eebb0dd3a',
  '4e54ca26-f0e5-466e-b3c7-98909c333f52',
  'd93e6be6-3588-43e9-a8b9-3edd8284eba3',
  '11143cd0-e6ed-4696-b7c3-431d7cf49b02',
  '71bccbf9-bec7-40c2-9a57-8249ab4cb303',
  'bb9dcb6f-474a-4e4f-8571-ac8c17e15856',
  '13e9f8da-7144-425e-8ebc-cd759f346ed7',
  '9ddb9ac2-d262-424d-953b-29577d5f18e7',
  '80a0c1ab-4c70-4e3b-8167-3ae3e954123c',
  '51e69502-40eb-463f-8866-f8316ca83967',
  '4844b18d-3c51-4ca6-be3a-427ea112ac02',
  '11ae45c9-978e-40f7-aa7e-