In [4]:
import os
import json
from typing import List, Dict
from dotenv import load_dotenv
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document

# Load environment variables from the .env file
load_dotenv()

# Read the OpenAI API key from the environment and fail fast if it is missing.
# NOTE(review): the key is only validated here; it is not passed explicitly to
# OpenAIEmbeddings below, which presumably reads it from the environment itself.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY not found in .env file")

# Directory paths
BL_DIR = "./bl_sample/"      # BL JSON files read by main()
BOOKING_DIR = "./bkg/"       # booking JSON files read by process_booking_file()
SI_DIR = "./si_draft/"       # generated SI JSON files are written here
INDEX_DIR = "./index_BL/"    # parent directory of the saved FAISS index

# Create the required directories (no error if they already exist)
for directory in [BL_DIR, BOOKING_DIR, SI_DIR, INDEX_DIR]:
    os.makedirs(directory, exist_ok=True)

# Initialize OpenAIEmbeddings (uses the text-embedding-ada-002 model)
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

def load_json_file(file_path: str) -> Dict:
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (callers in this file treat it as a dict).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than the platform's locale encoding,
    # so files with non-ASCII text load the same way on every machine.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def create_or_load_bl_vector_db(bl_data: List[Dict]) -> FAISS:
    """Return the BL FAISS vector store, loading it from disk or building it.

    If a saved index already exists at ``INDEX_DIR/faiss_index`` it is loaded
    and ``bl_data`` is ignored. Otherwise every BL record is serialized with
    ``json.dumps``, embedded, indexed, and the new index is saved to that path.

    NOTE: an existing index is always reused as-is; it is not refreshed when
    the BL files change. Remove the index directory to force a rebuild.

    Args:
        bl_data: BL records to index when no saved index exists.

    Returns:
        The loaded or newly created FAISS vector store.

    Raises:
        ValueError: If no saved index exists and ``bl_data`` is empty.
    """
    index_path = os.path.join(INDEX_DIR, "faiss_index")

    if os.path.exists(index_path):
        print("Loading existing FAISS index...")
        # SECURITY: allow_dangerous_deserialization=True opts in to loading a
        # pickled docstore. Only load index directories this script created
        # itself; never point this at files from an untrusted source.
        return FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

    # Fail with a clear message here instead of letting the vector store
    # error out obscurely when asked to build an index from zero documents.
    if not bl_data:
        raise ValueError("No BL data available to build the FAISS index")

    print("Creating new FAISS index...")
    documents = [Document(page_content=json.dumps(bl), metadata={"id": bl.get("id")}) for bl in bl_data]
    vector_db = FAISS.from_documents(documents, embeddings)
    vector_db.save_local(index_path)
    return vector_db

def find_similar_bl(vector_db: FAISS, booking: Dict, top_k: int = 5) -> List[Dict]:
    """Find the BL records most similar to a booking.

    The booking is serialized to a JSON string and used as the query for a
    similarity search against ``vector_db``; each hit's ``page_content`` is
    decoded from JSON back into a dict.

    Args:
        vector_db: Vector store to search.
        booking: Booking record used as the query.
        top_k: Number of hits to request from the vector store.

    Returns:
        The decoded BL records, in the order the vector store returned them.
    """
    matches = vector_db.similarity_search(json.dumps(booking), k=top_k)
    decoded = []
    for match in matches:
        decoded.append(json.loads(match.page_content))
    return decoded

def generate_si(booking: Dict, similar_bls: List[Dict]) -> Dict:
    """Build an SI (shipping instruction) dict from a booking and similar BLs.

    The SI copies the booking's party, routing and container fields, attaches
    a trimmed summary of every similar BL under ``"similar_bls"``, and fills
    any empty party field (shipper, consignee, notify party) from the first
    similar BL that has a non-empty value for it.

    Args:
        booking: Booking record the SI is generated for.
        similar_bls: BL records considered similar to the booking.

    Returns:
        The SI dict, including a human-readable ``"generated_info"`` summary.
    """
    party_fields = ("shipper", "consignee", "notify_party")
    routing_fields = ("port_of_loading", "port_of_discharge", "vessel", "voyage")
    common_fields = party_fields + routing_fields

    def _summarize(bl: Dict) -> Dict:
        # Keep only the fields of a BL that are relevant to the SI.
        summary = {"bl_number": bl.get("bl_number")}
        for name in common_fields:
            summary[name] = bl.get(name)
        summary["containers"] = bl.get("containers", [])
        return summary

    si = {"booking_id": booking.get("id")}
    for name in common_fields:
        si[name] = booking.get(name)
    si["containers"] = booking.get("containers", [])
    si["similar_bls"] = [_summarize(bl) for bl in similar_bls]

    # Fill in party fields the booking left empty; the first similar BL with a
    # non-empty value wins. Other fields could be completed the same way.
    for name in party_fields:
        if si[name]:
            continue
        for candidate in si["similar_bls"]:
            if candidate[name]:
                si[name] = candidate[name]
                break

    si["generated_info"] = f"Generated SI based on Booking {booking.get('id')} and {len(similar_bls)} similar BLs"
    return si

def process_booking_file(file_name: str, vector_db: FAISS):
    """Process one booking file and write the generated SI next to it.

    Loads ``BOOKING_DIR/file_name``, looks up similar BLs in ``vector_db``,
    generates the SI, and saves it as indented JSON under ``SI_DIR``.

    Args:
        file_name: Name of the booking file inside ``BOOKING_DIR``
            (expected to look like ``booking_*.json``).
        vector_db: Vector store used to find BLs similar to the booking.
    """
    booking_path = os.path.join(BOOKING_DIR, file_name)
    booking = load_json_file(booking_path)

    similar_bls = find_similar_bl(vector_db, booking)
    si = generate_si(booking, similar_bls)

    # Derive the SI file name (booking_*.json -> si_*.json). Only the leading
    # prefix is rewritten: a plain str.replace would also alter any later
    # "booking_" inside the name. Names without the prefix are kept unchanged.
    booking_prefix = "booking_"
    if file_name.startswith(booking_prefix):
        si_file_name = "si_" + file_name[len(booking_prefix):]
    else:
        si_file_name = file_name
    si_path = os.path.join(SI_DIR, si_file_name)

    # Save the SI; the encoding is explicit so the result does not depend on
    # the platform's locale settings.
    with open(si_path, 'w', encoding='utf-8') as file:
        json.dump(si, file, indent=2)

    print(f"Generated and saved SI for Booking {booking.get('id')} as {si_file_name}")

def main():
    """Build or load the BL vector store, then generate an SI per booking.

    Every ``*.json`` file in ``BL_DIR`` is loaded as BL data for the vector
    store; every ``booking_*.json`` file in ``BOOKING_DIR`` is then processed.
    """
    # Load the BL data and create or load the vector DB
    bl_data = []
    for entry in os.listdir(BL_DIR):
        if entry.endswith('.json'):
            bl_data.append(load_json_file(os.path.join(BL_DIR, entry)))
    vector_db = create_or_load_bl_vector_db(bl_data)

    # Process the booking files, skipping anything that is not booking_*.json
    for file_name in os.listdir(BOOKING_DIR):
        if not file_name.startswith("booking_"):
            continue
        if not file_name.endswith(".json"):
            continue
        process_booking_file(file_name, vector_db)

# Run the pipeline only when this code is executed as a script, not on import.
if __name__ == "__main__":
    main()

Creating new FAISS index...
Generated and saved SI for Booking None as si_CHERRY20240911091201.json
Generated and saved SI for Booking None as si_CHERRY20240911091202.json
Generated and saved SI for Booking None as si_CHERRY202409119091200.json
Generated and saved SI for Booking None as si_CHERRY20240911091203.json
Generated and saved SI for Booking None as si_CHERRY20240911091199.json
Generated and saved SI for Booking None as si_CHERRY20240911091204.json
Generated and saved SI for Booking None as si_CHERRY20240911091198.json
