In [1]:
"""
EU AI Act 문서를 ChromaDB에 임베딩하는 스크립트

사용법:
1. eu_ai_act.pdf 파일을 현재 디렉토리에 배치
2. python embed_eu_ai_act.py 실행
"""

import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_chroma import Chroma
import chromadb

# Load environment variables (from a .env file, via python-dotenv)
load_dotenv()

# Configuration
PDF_PATH = "data/EU.pdf"  # path of the PDF to embed
CHROMA_PATH = "./chroma/ethics"  # Chroma persistence directory
COLLECTION_NAME = "EU_ai_act"  # collection that receives the chunks

print("=" * 70)
print("🇪🇺 EU AI Act 임베딩 시작")
print("=" * 70)

# Step 1: make sure the PDF exists before any expensive work starts
if not os.path.exists(PDF_PATH):
    print(f"❌ PDF 파일을 찾을 수 없습니다: {PDF_PATH}")
    # Point the user at the path that is actually checked.
    print(f"💡 PDF 파일을 다음 경로에 배치해주세요: {PDF_PATH}")
    # `exit` is a convenience injected by the `site` module (and replaced by
    # IPython inside notebooks); raising SystemExit works everywhere.
    raise SystemExit(1)

print(f"✅ PDF 파일 확인: {PDF_PATH}")

# Step 2: load the PDF (the count printed below is reported as pages)
print("\n[1/4] PDF 로딩 중...")
try:
    # Only the loader calls are guarded so an unrelated error is not
    # reported as a PDF loading failure.
    loader = PyPDFLoader(PDF_PATH)
    documents = loader.load()
except Exception as e:
    print(f"❌ PDF 로드 실패: {e}")
    # SystemExit instead of the site/IPython `exit` helper, which is not
    # guaranteed to exist and behaves differently inside a notebook kernel.
    raise SystemExit(1)
else:
    print(f"✅ PDF 로드 완료: {len(documents)} 페이지")

# Step 3: split the loaded documents into overlapping chunks
print("\n[2/4] 텍스트 청킹 중...")
try:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    chunks = text_splitter.split_documents(documents)
except Exception as e:
    print(f"❌ 청킹 실패: {e}")
    # SystemExit instead of the site/IPython `exit` helper.
    raise SystemExit(1)

# An empty result would otherwise surface as "list index out of range" from
# the sample print below, which hides the real problem.
if not chunks:
    print("❌ 청킹 실패: 생성된 청크가 없습니다 (PDF에서 텍스트를 추출하지 못했을 수 있습니다)")
    raise SystemExit(1)

print(f"✅ 청킹 완료: {len(chunks)} 청크")

# Show the beginning of the first chunk as a quick sanity check
print("\n📄 샘플 청크:")
print(f"   {chunks[0].page_content[:200]}...")

# Step 4: tag every chunk with its index and a fixed source label
print("\n[3/4] 메타데이터 추가 중...")
for position, piece in enumerate(chunks):
    meta = piece.metadata
    meta.update({"chunk_id": position, "source": "EU AI Act"})
    # When a page number is present, expose it under "article" as "Page N"
    if "page" in meta:
        meta["article"] = "Page {}".format(meta["page"])

print("✅ 메타데이터 추가 완료")

# Step 5: rebuild the Chroma collection from scratch and embed every chunk
print("\n[4/4] ChromaDB 임베딩 시작...")
print("⏳ 예상 소요 시간: 10-20분 (로컬 실행)")
print("💰 비용: 무료 (BGE-M3 로컬 모델)")

# Refuse to continue with nothing to embed: otherwise the existing collection
# would be deleted and the run would then crash on `vectorstore` being None.
if not chunks:
    print("\n❌ 임베딩 실패: 임베딩할 청크가 없습니다")
    raise SystemExit(1)

try:
    # Drop the previous collection, if the persistence directory exists
    if os.path.exists(CHROMA_PATH):
        print("🗑️ 기존 컬렉션 삭제 중...")
        client = chromadb.PersistentClient(path=CHROMA_PATH)
        try:
            client.delete_collection(name=COLLECTION_NAME)
        except Exception:
            # Best effort: a failed delete is treated as "nothing to delete".
            # `Exception` (not a bare except) keeps Ctrl-C and SystemExit
            # working while this runs.
            print("ℹ️ 기존 컬렉션 없음")
        else:
            print("✅ 기존 컬렉션 삭제 완료")

    # Initialise the BGE-M3 embedding model
    print("\n🤖 BGE-M3 모델 로딩 중...")
    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-m3",
        model_kwargs={'device': 'cpu'},  # switch to 'cuda' when a GPU is available
        encode_kwargs={'normalize_embeddings': True}
    )
    print("✅ BGE-M3 모델 로드 완료")

    print(f"\n🚀 임베딩 시작 ({len(chunks)}개 청크)...")

    # Process the chunks in batches of 100
    batch_size = 100
    total_batches = (len(chunks) + batch_size - 1) // batch_size

    vectorstore = None

    for batch_num, start in enumerate(range(0, len(chunks), batch_size), 1):
        batch = chunks[start:start + batch_size]

        print(f"   📦 배치 {batch_num}/{total_batches} 처리 중... ({len(batch)}개)")

        if vectorstore is None:
            # First batch: creates the collection
            vectorstore = Chroma.from_documents(
                documents=batch,
                embedding=embeddings,
                persist_directory=CHROMA_PATH,
                collection_name=COLLECTION_NAME
            )
        else:
            # Later batches: append to the same collection
            vectorstore.add_documents(batch)

        print(f"   ✅ 배치 {batch_num} 완료")

    # Confirm how many records ended up in the collection
    print("\n💾 ChromaDB 저장 중...")
    count = vectorstore._collection.count()
    print(f"✅ 저장 완료: {count}개 문서")

    # Smoke-test retrieval with one query
    print("\n🔍 저장 검증 중...")
    test_results = vectorstore.similarity_search("transparency requirements", k=3)
    print(f"✅ 검색 테스트 성공: {len(test_results)}개 결과")

    print("\n" + "=" * 70)
    print("🎉 임베딩 완료!")
    print("=" * 70)
    print("📊 통계:")
    print(f"   - 원본 페이지: {len(documents)}개")
    print(f"   - 생성된 청크: {len(chunks)}개")
    print(f"   - 저장된 문서: {count}개")
    print(f"   - 저장 경로: {CHROMA_PATH}")
    print(f"   - 컬렉션명: {COLLECTION_NAME}")
    print("\n💡 이제 EthicsEvaluationAgent에서 사용할 수 있습니다!")

except Exception as e:
    print(f"\n❌ 임베딩 실패: {e}")
    import traceback
    traceback.print_exc()
    # SystemExit instead of the site/IPython `exit` helper.
    raise SystemExit(1)

🇪🇺 EU AI Act 임베딩 시작
✅ PDF 파일 확인: data/EU.pdf

[1/4] PDF 로딩 중...
✅ PDF 로드 완료: 144 페이지

[2/4] 텍스트 청킹 중...
✅ 청킹 완료: 784 청크

📄 샘플 청크:
   REGUL ATION (EU) 2024/1689 OF THE EUR OPEAN PARLIAMENT AND OF THE COUNCIL
of 13 June 2024
laying down harmonised rules on artificial intelligence and amending Regulations (EC) No 300/2008, 
(EU) No 16...

[3/4] 메타데이터 추가 중...
✅ 메타데이터 추가 완료

[4/4] ChromaDB 임베딩 시작...
⏳ 예상 소요 시간: 10-20분 (로컬 실행)
💰 비용: 무료 (BGE-M3 로컬 모델)

🤖 BGE-M3 모델 로딩 중...


  embeddings = HuggingFaceBgeEmbeddings(


✅ BGE-M3 모델 로드 완료

🚀 임베딩 시작 (784개 청크)...
   📦 배치 1/8 처리 중... (100개)


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


   ✅ 배치 1 완료
   📦 배치 2/8 처리 중... (100개)
   ✅ 배치 2 완료
   📦 배치 3/8 처리 중... (100개)
   ✅ 배치 3 완료
   📦 배치 4/8 처리 중... (100개)
   ✅ 배치 4 완료
   📦 배치 5/8 처리 중... (100개)
   ✅ 배치 5 완료
   📦 배치 6/8 처리 중... (100개)
   ✅ 배치 6 완료
   📦 배치 7/8 처리 중... (100개)
   ✅ 배치 7 완료
   📦 배치 8/8 처리 중... (84개)
   ✅ 배치 8 완료

💾 ChromaDB 저장 중...
✅ 저장 완료: 784개 문서

🔍 저장 검증 중...


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


✅ 검색 테스트 성공: 3개 결과

🎉 임베딩 완료!
📊 통계:
   - 원본 페이지: 144개
   - 생성된 청크: 784개
   - 저장된 문서: 784개
   - 저장 경로: ./chroma/ethics
   - 컬렉션명: EU_ai_act

💡 이제 EthicsEvaluationAgent에서 사용할 수 있습니다!


In [2]:
query = """
According to the EU Artificial Intelligence Act (Regulation (EU) 2024/1689),
what are the specific obligations and compliance requirements for
AI systems used in the medical or healthcare sector?
List the relevant articles, annexes, or sections if available.
"""

In [3]:
# Read the store through the same Chroma wrapper that the embedding cell used
# to write it (langchain_chroma); the langchain_community one is deprecated.
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# --- Same persistence directory / collection as the embedding step ---
PERSIST_DIR = "./chroma/ethics"
COLLECTION  = "EU_ai_act"
EMBED_MODEL = "BAAI/bge-m3"

# Embedding function (bge-m3, normalised vectors). `trust_remote_code` is not
# passed: the embedding cell loads the same model without it, so there is no
# reason to allow execution of repository-supplied code here.
emb_fn = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    encode_kwargs={"normalize_embeddings": True}
)

# Open the persisted Chroma collection
vectordb = Chroma(
    collection_name=COLLECTION,
    embedding_function=emb_fn,
    persist_directory=PERSIST_DIR
)

# --- Query ---
query = """
According to the EU Artificial Intelligence Act (Regulation (EU) 2024/1689),
what are the specific obligations and compliance requirements for
AI systems used in the medical or healthcare sector?
List the relevant articles, annexes, or sections if available.
"""

# Retrieve the 5 most similar chunks
results = vectordb.similarity_search(query, k=5)

for i, r in enumerate(results, 1):
    print(f"\n=== Result {i} ===")
    print(r.page_content[:800])  # long chunks are truncated for display
    print(f"\n(source: {r.metadata.get('source')}, page: {r.metadata.get('page')})")


  emb_fn = HuggingFaceEmbeddings(
  vectordb = Chroma(
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given



=== Result 1 ===
2020/1828 (Artificial Intelligence Act) (OJ L, 2024/1689, 12.7.2024, ELI: http://data.europa.eu/eli/reg/ 
2024/1689/oj).’ .
Article 105
Amendment to Directiv e 2014/90/EU
In Article 8 of Directive 2014/90/EU, the following paragraph is added:
‘5. For Artificial Intelligence syste ms which are safety comp onents within the meaning of Regulation (EU) 2024/1689 of 
the European Parliament and of the Council (*), when carrying out its activities pursuant to paragraph 1 and when adop ting 
technical specif ications and testing standards in accordance with paragraphs 2 and 3, the Commission shall take into 
account the requirements set out in Chap ter III, Section 2, of that Regulation. 
(*) Regulation (EU) 2024/1689 of the European Parliament and of the Council of 13 June 2024 laying down harmo

(source: EU AI Act, page: 118)

=== Result 2 ===
(EU) 2018/858, (EU) 2018/1139 and (EU) 2019/2144 and Directives 2014/90/EU, (EU) 2016/797 and (EU) 
2020/1828 (Artificial Intellige

In [4]:
!pip install -U --no-deps rank-bm25 tqdm



In [7]:
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm

# Location of the persisted store and the collection to open
PERSIST_DIR = "./chroma/ethics"
COLLECTION = "EU_ai_act"
# The embedding model is only required to construct the wrapper;
# this cell does not compute any embeddings with it.
EMBED_MODEL = "BAAI/bge-m3"

emb_fn = HuggingFaceEmbeddings(
    encode_kwargs=dict(normalize_embeddings=True),
    model_kwargs=dict(trust_remote_code=True),
    model_name=EMBED_MODEL,
)

vectordb = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=emb_fn,
    collection_name=COLLECTION,
)

# Fetch every chunk stored in the Chroma collection (in pages)
def fetch_all_chroma_docs(vdb: Chroma, batch_size: int = 1000):
    """Return all records of the wrapped collection as ``Document`` objects.

    The underlying collection (``vdb._collection``) is read page by page with
    ``limit``/``offset`` until ``count()`` records have been seen or a page
    comes back empty.

    Args:
        vdb: Chroma vector store wrapper whose collection is read.
        batch_size: Maximum number of records requested per ``get`` call.

    Returns:
        A list of ``Document`` objects; a missing text becomes ``""`` and
        missing metadata becomes ``{}``.
    """
    collection = vdb._collection
    total = collection.count()
    out_docs = []
    offset = 0
    while offset < total:
        # No `where` argument: an empty filter dict is rejected by some
        # chromadb releases, and leaving it out means "no filter" in all of them.
        # "ids" is not listed in `include`; the response carries them anyway.
        page = collection.get(
            limit=batch_size,
            offset=offset,
            include=["documents", "metadatas"],
        )
        ids = page.get("ids") or []
        if not ids:
            # Fewer records than count() reported; stop instead of looping forever.
            break
        texts = page.get("documents") or []
        metas = page.get("metadatas") or []
        out_docs.extend(
            Document(page_content=text or "", metadata=meta or {})
            for text, meta in zip(texts, metas)
        )
        offset += len(ids)
    return out_docs

# Pull every stored chunk out of the collection as Document objects
docs = fetch_all_chroma_docs(vectordb)
# Last expression of the cell: the notebook displays the number of documents
len(docs)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given


784

In [8]:
from langchain_community.retrievers import BM25Retriever

# Keyword (BM25) retriever over the fetched chunks, returning 8 hits per query
bm25 = BM25Retriever.from_documents(docs, k=8)


In [None]:
# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated and emits a deprecation warning on every call.
bm25_hits = bm25.invoke(query)

for rank, hit in enumerate(bm25_hits, 1):
    meta = hit.metadata or {}
    print(f"\n=== BM25 Result {rank} ===")
    print(hit.page_content[:900])  # long chunks are truncated for display
    print(f"\n[source: {meta.get('source')}, page: {meta.get('page')}]")

  bm25_hits = bm25.get_relevant_documents(query)



=== BM25 Result 1 ===
database. In order to maximise the availability and use of the EU database by the public, the EU database, including 
the information made available through it, should comply with requirements under the Directive (EU) 2019/882.
(132) Certain AI systems intended to interac t with natural persons or to generate cont ent may pose specific risks of 
impersonation or decept ion irrespective of whether they qualify as high-r isk or not. In certain circumstances, the use 
of these systems should theref ore be subject to specif ic transparency obligations without prejudice to the 
requirements and obligations for high-r isk AI systems and subject to target ed exceptions to take into account the 
special need of law enforcement. In particular , natural persons should be notified that they are interacting with an AI 
syste m, unless this is obvious from the point of view of a natural person who is

[source: EU AI Act, page: 32]

=== BM25 Result 2 ===
education should be pr

In [12]:
import os, re, textwrap

def build_context_snippet(docs, max_chars=6000):
    """Concatenate document texts into one context string with a size budget.

    Each document's ``page_content`` is stripped; empty texts are skipped.
    Texts are taken in order until ``max_chars`` characters of content have
    been collected, truncating the last one if needed. The
    ``"\\n\\n---\\n\\n"`` separators between texts do not count toward the budget.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
        max_chars: Maximum number of content characters to keep. Zero or a
            negative value yields an empty string.

    Returns:
        The selected texts joined by ``"\\n\\n---\\n\\n"``.
    """
    remaining = max_chars
    pieces = []
    for doc in docs:
        # Checked before slicing: a negative budget used as a slice bound
        # would cut characters off the END of the text instead of taking none.
        if remaining <= 0:
            break
        text = doc.page_content.strip()
        if not text:
            continue
        piece = text[:remaining]
        pieces.append(piece)
        remaining -= len(piece)
    return "\n\n---\n\n".join(pieces)

# Assemble the retrieved BM25 hits into one bounded context string
context = build_context_snippet(bm25_hits, max_chars=6000)

system_prompt = (
    "You are a compliance analyst of the EU AI Act (Regulation (EU) 2024/1689). "
    "Given the retrieved excerpts, answer precisely: list concrete obligations "
    "and compliance requirements for AI systems in the medical/healthcare sector. "
    "Cite the relevant Articles/Annexes in-line when possible. Be concise and structured.모든 답변작성하고 가장 마지막에 한국어로 번역"
)
user_prompt = f"Question:\n{query.strip()}\n\nRetrieved Excerpts:\n{context}"

answer = None

# 1) Preferred path: summarise the excerpts with an OpenAI chat model
try:
    from langchain_openai import ChatOpenAI
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    answer = llm.invoke(
        [{"role":"system","content":system_prompt},
         {"role":"user","content":user_prompt}]
    ).content
except Exception as e:
    # 2) Fallback: any failure above (missing package, missing key, API error)
    # lands here. Say why, so the heuristic text is not mistaken for LLM output.
    print(f"[fallback] LLM summary unavailable ({type(e).__name__}: {e}); using heuristic summary.")

    # Keep only excerpt lines that mention articles, annexes, chapters,
    # sections or the high-risk / medical keywords.
    cue_pattern = re.compile(
        r"\b(Article|Annex|Chapter|Section|Annex\s*III|high-?risk|medical|healthcare)\b",
        re.I,
    )
    lines = [para.strip() for para in context.splitlines() if cue_pattern.search(para)]
    core = "\n".join(lines[:30])

    # Dedent the static text BEFORE stripping and before appending `core`:
    # once the first line has lost its indentation, or unindented extracted
    # lines are mixed in, dedent finds no common margin and removes nothing.
    template = textwrap.dedent(
        """
        Below is a heuristic summary (no LLM available):

        • High-risk scope: Medical/healthcare AI typically falls under high-risk when used as a safety component or medical device (see Annex III; cross-reference with MDR).
        • Core obligations (Chapter III, Section 2; Articles 9–15 commonly):
          - Risk management system
          - Data governance & data quality
          - Technical documentation
          - Record-keeping & logs
          - Transparency & user information
          - Human oversight
          - Accuracy, robustness, cybersecurity
        • Transparency obligations (Title IV; e.g., Arts. 50–52) may apply depending on use (e.g., automated interaction, deepfake disclosure).
        • Conformity assessment required for high-risk systems before placing on the market; ongoing post-market monitoring.

        Extracted cues:
        """
    ).strip()
    answer = f"{template}\n{core}"

print(answer)


### Obligations and Compliance Requirements for AI Systems in the Medical/Healthcare Sector under the EU AI Act (Regulation (EU) 2024/1689)

1. **Classification as High-Risk AI Systems**:
   - AI systems used for medical purposes, including those for emergency healthcare patient triage, must be classified as high-risk due to their critical impact on life and health (Article 6, Annex III).

2. **Registration Requirement**:
   - High-risk AI systems must be registered at the national level before being placed on the market or put into service (Article 52).

3. **Compliance with Mandatory Requirements**:
   - High-risk AI systems must comply with specific mandatory requirements to ensure they do not pose unacceptable risks to public interests, as recognized by Union law (Article 16).

4. **Transparency Obligations**:
   - Providers of AI systems that interact directly with natural persons must inform users that they are interacting with an AI system, unless this is obvious (Article 50).

