In [15]:
import os, json, mimetypes, argparse
import cv2, numpy as np
from PIL import Image, ImageOps
from google import genai
from google.genai import types
from typing import Dict, Tuple, Union
from copy import deepcopy


# Default length, in pixels, that _resize_short gives the shorter image side.
TARGET_SHORT = 768
# Gemini model name passed to generate_content.
MODEL = "gemini-2.0-flash"

# User prompt (Korean) sent together with the preprocessed image. It asks for
# a JSON-only reply with a 0-100 score and a reason for each of acne, redness
# and melasma_darkspots, where 0 means none and 100 means very severe.
PROMPT_TEXT = (
    "전처리된 얼굴 피부 이미지를 분석하여 JSON만 반환하라.\n"
    "평가 항목: acne(여드름), redness(홍조), melasma_darkspots(잡티).\n"
    "각 항목은 다음 스키마로 제공하라:\n"
    "{acne:{score:number,reason:string}, redness:{score:number,reason:string}, "
    "melasma_darkspots:{score:number,reason:string}}\n"
    "score는 0~100 범위의 실수이며, 0은 없음·매우 양호, 100은 매우 심함을 의미한다."
)


def _load_exif_bgr(path: str):
    """Load an image file as a BGR array with EXIF orientation applied.

    Args:
        path: Path of the image file to read.

    Returns:
        The image as an array in OpenCV's BGR channel order.
    """
    # Image.open keeps the underlying file open until the image is closed, so
    # read and convert inside a context manager to release the handle.
    with Image.open(path) as src:
        rgb = ImageOps.exif_transpose(src).convert("RGB")
    return cv2.cvtColor(np.array(rgb), cv2.COLOR_RGB2BGR)

def _resize_short(bgr, short=TARGET_SHORT):
    """Rescale an image so that its shorter side measures ``short`` pixels.

    The aspect ratio is kept. Area interpolation is used when shrinking and
    cubic interpolation when enlarging. The input is returned untouched when
    its shorter side already has the requested length.
    """
    height, width = bgr.shape[:2]
    shorter = min(height, width)
    if shorter != short:
        factor = short / float(shorter)
        target_size = (int(round(width * factor)), int(round(height * factor)))
        if factor < 1.0:
            method = cv2.INTER_AREA
        else:
            method = cv2.INTER_CUBIC
        return cv2.resize(bgr, target_size, interpolation=method)
    return bgr

def _wb_grayworld(bgr, strength=0.5):
    """Apply a damped gray-world white balance.

    Per-channel gains pull each channel mean toward the overall mean. The
    gains are clipped to [0.8, 1.2] and then blended with the identity gain
    according to ``strength`` (0 leaves the image unchanged).

    Returns:
        A uint8 array with the same shape as the input.
    """
    pixels = bgr.astype(np.float32)
    # The small epsilon keeps the division below safe for an all-zero channel.
    channel_means = pixels.reshape(-1, 3).mean(0) + 1e-6
    gray_level = channel_means.mean()
    raw_gains = np.clip(gray_level / channel_means, 0.8, 1.2)
    blended = (1 - strength) * 1.0 + strength * raw_gains
    np.multiply(pixels, blended, out=pixels)
    return np.clip(pixels, 0, 255).astype(np.uint8)

def _clahe_light(bgr, clip=1.8, tiles=8):
    """Run CLAHE on the L channel of the LAB representation of a BGR image.

    Args:
        bgr: Input image in BGR order.
        clip: CLAHE clip limit.
        tiles: Number of tiles per axis of the CLAHE grid.

    Returns:
        The image converted back to BGR; the a and b channels are untouched.
    """
    lab_img = cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB)
    lightness, chan_a, chan_b = cv2.split(lab_img)
    equalizer = cv2.createCLAHE(clipLimit=clip, tileGridSize=(tiles, tiles))
    enhanced = equalizer.apply(lightness)
    merged = cv2.merge((enhanced, chan_a, chan_b))
    return cv2.cvtColor(merged, cv2.COLOR_LAB2BGR)

def _morph_kernel(bgr, base: int = 768, ksize: int = 5):
    """Build an elliptical structuring element scaled to the image size.

    ``ksize`` is the kernel size for an image whose shorter side is ``base``
    pixels. The size scales with the actual shorter side, is never smaller
    than 3, and is forced to be odd.
    """
    rows, cols = bgr.shape[:2]
    ratio = min(rows, cols) / float(base)
    side = max(3, int(round(ksize * ratio)))
    side |= 1  # bump even sizes up to the next odd number
    return cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (side, side))

def _skin_mask(bgr):
    """Estimate a binary skin mask for a BGR image.

    A pixel counts as skin only when it passes both a YCrCb range test and an
    HSV range test. The mask is cleaned with a morphological open followed by
    a close, and then reduced to its largest connected component.

    Returns:
        A uint8 mask holding 255 for skin pixels and 0 elsewhere.
    """
    luma, cr, cb = cv2.split(cv2.cvtColor(bgr, cv2.COLOR_BGR2YCrCb))
    in_ycrcb = (
        (cr >= 135) & (cr <= 180)
        & (cb >= 85) & (cb <= 135)
        & (luma >= 40) & (luma <= 240)
    )

    hue, sat, val = cv2.split(cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV))
    in_hsv = (hue <= 25) & (sat >= 30) & (sat <= 180) & (val >= 60)

    mask = (in_ycrcb & in_hsv).astype(np.uint8) * 255

    kernel = _morph_kernel(bgr)
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=1)
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=2)

    count, labels, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
    if count <= 1:
        # Only the background label exists, so there is nothing to select.
        return mask

    # Row 0 of stats is the background, hence the slice and the +1 offset.
    biggest = 1 + np.argmax(stats[1:, cv2.CC_STAT_AREA])
    isolated = np.zeros_like(mask)
    isolated[labels == biggest] = 255
    return isolated

def preprocess_with_mask(bgr, bg_gray=220, return_mask: bool = False):
    """Normalise a face image and replace non-skin pixels with flat gray.

    Steps: resize, gray-world white balance, CLAHE, skin-mask estimation,
    then every pixel outside the mask is set to ``bg_gray``.

    Args:
        bgr: Input image in BGR order.
        bg_gray: Gray level written to all three channels of the background.
        return_mask: When true, also return the skin mask.

    Returns:
        The processed image, or ``(image, mask)`` when ``return_mask`` is set.
    """
    balanced = _wb_grayworld(_resize_short(bgr), 0.5)
    enhanced = _clahe_light(balanced, 1.8, 8)
    mask = _skin_mask(enhanced)

    backdrop = np.full_like(enhanced, (bg_gray, bg_gray, bg_gray), np.uint8)
    keep = mask[..., None] == 255
    out = np.where(keep, enhanced, backdrop)

    return (out, mask) if return_mask else out


def analyze_with_gemini(image_path, api_key):
    """Send an image to Gemini and return the parsed JSON skin assessment.

    Args:
        image_path: Path of the image file to upload.
        api_key: Gemini API key.

    Returns:
        The decoded JSON object from the model reply, or an empty dict when
        the request fails, no JSON object is found, or the JSON is invalid.

    Raises:
        RuntimeError: If ``api_key`` is empty.
    """
    if not api_key:
        raise RuntimeError("GEMINI_API_KEY 환경 변수가 설정되어 있지 않습니다. 환경 변수를 먼저 설정해 주세요.")

    client = genai.Client(api_key=api_key)
    guessed_type, _ = mimetypes.guess_type(image_path)
    mime = guessed_type or "image/jpeg"

    try:
        with open(image_path, "rb") as fh:
            payload = fh.read()

        request = types.Content(
            role="user",
            parts=[
                types.Part(text=PROMPT_TEXT),
                types.Part(inline_data=types.Blob(mime_type=mime, data=payload)),
            ],
        )
        config = types.GenerateContentConfig(
            response_mime_type="application/json",
            temperature=0.2,
            system_instruction="너는 피부 분석 전문가다. 반드시 JSON만 반환하라."
        )
        resp = client.models.generate_content(
            model=MODEL,
            contents=[request],
            config=config,
        )

        # Keep only the outermost {...} span in case the reply has extra text.
        raw = (resp.text or "").strip()
        first = raw.find("{")
        last = raw.rfind("}")
        if -1 in (first, last):
            print("[WARN] Gemini 응답에서 JSON 블록을 찾지 못했습니다.")
            print("[DEBUG] raw response:", raw[:300])
            return {}

        candidate = raw[first:last + 1]
        try:
            parsed = json.loads(candidate)
        except Exception as e:
            print(f"[WARN] Gemini 응답 JSON 파싱 실패: {e}")
            print("[DEBUG] json_str:", candidate[:300])
            return {}
        return parsed
    except Exception as e:
        print(f"[ERROR] Gemini 분석 중 예외 발생: {e}")
        return {}


# Survey answer text -> ordinal score (0-3), keyed by question id.
# assess_skin_type reads q1-q5 and q7-q10; q6 is mapped here but is not used
# by any index computed in this file.
MAP: Dict[str, Dict[str, int]] = {
    "q1":  {"없어요":0,"T존 일부(이마 혹은 코)":1,"T존 전체(이마와 코)":2,"얼굴 전체":3},
    "q2":  {"전혀 안 보여요":0,"지금은 없지만 가끔 보여요":1,"부분적으로 붉게 보여요":2,"전체적으로 붉게 보여요":3},
    "q3":  {"없어요":0,"U존 일부(볼 혹은 턱)":1,"U존 전체(볼과 턱)":2,"얼굴 전체":3},
    "q4":  {"전혀 생기지 않아요":0,"표정을 지을 때만 생겨요":1,"표정 짓지 않아도 약간 있어요":2,"표정 짓지 않아도 많이 있어요":3},
    "q5":  {"주름이 없어요":0,"잔주름이에요":1,"깊은 주름이에요":2,"잔주름과 깊은 주름 다 있어요":3},
    "q6":  {"전혀 생기지 않아요":0,"미소 지을 때만 약간 생겨요":1,"미소 지을 때 진하게 생겨요":2,"미소 짓지 않아도 생겨요":3},
    "q7":  {"전혀 안 보여요":0,"거의 안 보여요":1,"약간 눈에 띄어요":2,"곳곳에 많이 보여요":3},
    "q8":  {"주름이 없어요":0,"잔주름이에요":1,"깊은 주름이에요":2,"잔주름과 깊은 주름 다 있어요":3},
    "q9":  {"외출 전보다 윤기가 없어요":0,"외출 전과 변함이 없어요":1,"약간 번들거리고 윤기가 있어요":2,"많이 번들거리고 기름져요":3},
    "q10": {"전혀 안 보여요":0,"가끔 붉어지면 보여요":1,"특정부위에 눈에 띄어요":2,"곳곳에 많이 보여요":3},
}

def _to_score(q, v):
    """Map the answer text ``v`` of question ``q`` to its ordinal score.

    Unrecognised answer text scores 0. A question id that is missing from
    MAP raises KeyError.
    """
    choices = MAP[q]
    return choices.get(v, 0)

def _to_0_3(x):
    """Convert a 0-100 score to the 0-3 scale, rounded to two decimals.

    Args:
        x: Anything ``float()`` accepts. Values that cannot be converted, and
            NaN, are treated as 0.

    Returns:
        A float in [0.0, 3.0]; out-of-range input is clamped to 0-100 first.
    """
    try:
        v = float(x)
    except (TypeError, ValueError, OverflowError):
        v = 0.0
    # NaN compares false against everything, so min/max would let it through
    # as 100 (maximum severity). Treat it as "no score" instead.
    if v != v:
        v = 0.0
    return round(max(0, min(100, v)) / 100 * 3, 2)

def _decide_skin_type(oil: float, dry: float) -> str:
    """Classify the skin type from the oil and dryness indices.

    Returns "지성" for high oil with low dryness, "건성" for high dryness with
    low oil, "복합성" when both are high, and "중성" in every other case. High
    means >= 2 and low means <= 1.
    """
    oily = oil >= 2
    parched = dry >= 2
    if oily and dry <= 1:
        return "지성"
    if parched and oil <= 1:
        return "건성"
    if oily and parched:
        return "복합성"
    return "중성"

def assess_skin_type(**a):
    """Derive skin indices and a skin type from survey answers.

    Keyword arguments map question ids to the chosen answer text. Each answer
    is scored with _to_score, and the scores are combined into five indices.

    Returns:
        A dict with "skin_type" and "indices" (oil, dry, sensitivity,
        wrinkle, pigment).

    Raises:
        KeyError: If a question id is not in MAP, or one of the questions
            used by the formulas below is missing.
    """
    scores = {}
    for question, answer in a.items():
        scores[question] = _to_score(question, answer)

    indices = {
        "oil": round(0.6 * scores["q1"] + 0.4 * scores["q9"], 2),
        "dry": float(scores["q3"]),
        "sensitivity": round(0.7 * scores["q2"] + 0.3 * scores["q10"], 2),
        "wrinkle": round(0.4 * scores["q4"] + 0.6 * ((scores["q5"] + scores["q8"]) / 2), 2),
        "pigment": float(scores["q7"]),
    }

    return {
        "skin_type": _decide_skin_type(indices["oil"], indices["dry"]),
        "indices": indices,
    }

def _safe_score(gemini: dict, key: str) -> float:
    """Read ``gemini[key]["score"]`` on the 0-3 scale; any failure gives 0.0."""
    try:
        entry = gemini.get(key, {})
        raw = entry.get("score", 0)
        return _to_0_3(raw)
    except Exception:
        return 0.0

def assess_with_gemini(survey, gemini):
    """Fuse survey-based indices with the Gemini vision scores.

    Args:
        survey: Either the answers mapping itself or a dict holding it under
            the "survey" key.
        gemini: The vision result; acne, redness and melasma_darkspots scores
            are read from it, and missing ones count as 0.

    Returns:
        A dict with the fused "skin_type", the fused "indices" and the
        untouched vision result under "vision_raw".
    """
    if "survey" in survey:
        answers = survey["survey"]
    else:
        answers = survey

    survey_idx = assess_skin_type(**answers)["indices"]
    vision = {
        name: _safe_score(gemini, name)
        for name in ("acne", "redness", "melasma_darkspots")
    }

    # Redness feeds sensitivity, dark spots feed pigment and acne feeds oil;
    # dryness and wrinkle come from the survey alone.
    fused = dict(survey_idx)
    fused.update(
        sensitivity=round(0.4 * survey_idx["sensitivity"] + 0.6 * vision["redness"], 2),
        pigment=round(0.3 * survey_idx["pigment"] + 0.7 * vision["melasma_darkspots"], 2),
        oil=round(0.7 * survey_idx["oil"] + 0.3 * vision["acne"], 2),
        dry=survey_idx["dry"],
        wrinkle=survey_idx["wrinkle"],
    )

    return {
        "skin_type": _decide_skin_type(fused["oil"], fused["dry"]),
        "indices": fused,
        "vision_raw": gemini
    }


def run_fusion_from_request(image_path: str, survey_dict: dict) -> dict:
    """Run preprocessing, Gemini analysis and survey fusion for one request.

    The preprocessed image is written next to the input as
    ``<name>_Pre<ext>`` and that file is what gets sent to Gemini. The API
    key is read from the GEMINI_API_KEY environment variable.

    Args:
        image_path: Path of the input image.
        survey_dict: Dict holding the answers under the "survey" key.

    Returns:
        The fused assessment produced by assess_with_gemini.

    Raises:
        ValueError: If ``image_path`` is empty or "survey" is missing.
    """
    if not image_path:
        raise ValueError("image_path가 없습니다. 이미지 파일이 반드시 필요합니다.")
    if "survey" not in survey_dict:
        raise ValueError("survey_dict에 'survey' 키가 없습니다. 설문 JSON이 반드시 필요합니다.")

    answers = survey_dict["survey"]
    processed = preprocess_with_mask(_load_exif_bgr(image_path))

    stem, suffix = os.path.splitext(image_path)
    suffix = suffix or ".jpg"
    pre_path = f"{stem}_Pre{suffix}"
    cv2.imwrite(pre_path, processed)

    gemini = analyze_with_gemini(pre_path, os.getenv("GEMINI_API_KEY"))
    return assess_with_gemini({"survey": answers}, gemini)



def run_full_pipeline(input_path: str, survey_json_path: str):
    """Run the whole pipeline from files on disk and print the fused result.

    Args:
        input_path: Path of the input image.
        survey_json_path: Path of a UTF-8 JSON file with the survey answers.

    Returns:
        A dict with the preprocessed image path, the raw Gemini result and
        the fused assessment.
    """
    processed = preprocess_with_mask(_load_exif_bgr(input_path))

    stem, suffix = os.path.splitext(input_path)
    suffix = suffix or ".jpg"
    pre_path = f"{stem}_Pre{suffix}"
    cv2.imwrite(pre_path, processed)
    print(f"[INFO] 전처리 완료: {pre_path}")

    gemini = analyze_with_gemini(pre_path, os.getenv("GEMINI_API_KEY"))

    with open(survey_json_path, "r", encoding="utf-8") as fh:
        survey = json.load(fh)

    fused = assess_with_gemini(survey, gemini)

    print("\n[설문+이미지 융합 결과]")
    print(json.dumps(fused, ensure_ascii=False, indent=2))

    return dict(
        preprocessed_image_path=pre_path,
        gemini=gemini,
        fusion=fused,
    )


# Demo entry point: runs the full pipeline on a sample image and survey file,
# both given as paths relative to the current working directory.
if __name__ == "__main__":
    run_full_pipeline("Lee.jpg", "survey_example.json")

[INFO] 전처리 완료: Lee_Pre.jpg

[설문+이미지 융합 결과]
{
  "skin_type": "중성",
  "indices": {
    "oil": 1.53,
    "dry": 1.0,
    "sensitivity": 1.04,
    "wrinkle": 1.0,
    "pigment": 1.23
  },
  "vision_raw": {
    "acne": {
      "score": 15.0,
      "reason": "There are a few small areas that appear to be acne, but they are not very prominent."
    },
    "redness": {
      "score": 20.0,
      "reason": "Slight redness is observed in some areas, but it's not widespread or intense."
    },
    "melasma_darkspots": {
      "score": 30.0,
      "reason": "Some dark spots are visible, indicating the presence of melasma or hyperpigmentation."
    }
  }
}


In [13]:
import json, os, hashlib, re, math
from typing import List, Dict, Any, Tuple
from elasticsearch import Elasticsearch, helpers

import numpy as np
from sentence_transformers import SentenceTransformer

# torch is optional here: it is only used to choose the device for the
# embedding model, and its absence falls back to CPU.
try:
    import torch
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
except ImportError:
    DEVICE = "cpu"

# Sentence-embedding model, loaded once when this module/cell is executed.
EMB_MODEL_NAME = "jhgan/ko-sroberta-multitask"
emb_model = SentenceTransformer(EMB_MODEL_NAME, device=DEVICE)
# Embedding width reported by the model; reused as the dense_vector "dims" in
# the index mapping and for zero-vector fallbacks.
EMB_DIM = emb_model.get_sentence_embedding_dimension()

INDEX_NAME = "cosmetics_demo"
# Separators for raw ingredient strings: commas, whitespace, slashes,
# semicolons, pipes and several middle-dot/bullet characters.
_SPLIT_RE = re.compile(r"[,\s/;·∙•ㆍ|]+")
CATEGORIES = ["cream", "essence", "skintoner"]
LTR_MODEL_PATH = "ltr_booster.json"


def connect_es() -> Elasticsearch:
    """Create an Elasticsearch client for ES_URL (default: local port 9200)."""
    endpoint = os.environ.get("ES_URL", "http://localhost:9200")
    return Elasticsearch(endpoint, request_timeout=20)


def load_json(path: str) -> List[Dict[str, Any]]:
    """Read a UTF-8 JSON file and return its decoded content."""
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload


def _split_ingredients(ing: str) -> List[str]:
    """Split a raw ingredient string into lower-cased, non-empty tokens.

    Splitting uses _SPLIT_RE, which also breaks on whitespace, so an
    ingredient written as several words yields one token per word.
    """
    if not ing:
        return []
    tokens = []
    for piece in _SPLIT_RE.split(ing):
        cleaned = piece.strip()
        if cleaned:
            tokens.append(cleaned.lower())
    return tokens


def normalize_product(doc: Dict[str, Any], category: str) -> Dict[str, Any]:
    """Convert a raw product record into the flat document that gets indexed.

    Args:
        doc: Raw record. The keys read are productName, mallName, salePrice,
            averageReviewScore, totalReviewCount, ingredientsInfo and reviews.
        category: Category label stored on the document.

    Returns:
        A dict with product_id (MD5 of "name|brand"), productName, brand,
        category, ingredients (token list), review_text (at most 5000
        characters), salePrice, averageReviewScore and totalReviewCount.
    """
    name = doc.get("productName") or ""
    brand = doc.get("mallName") or ""

    sale = doc.get("salePrice")
    if isinstance(sale, str):
        nums = re.findall(r"\d[\d,._]*", sale.replace(" ", ""))
        try:
            sale = float(nums[0].replace(",", "").replace("_", "")) if nums else None
        except ValueError:
            # The pattern can capture text such as "1.2.3" that is not a
            # number; treat the price as unknown rather than failing the load.
            sale = None

    avg_score = doc.get("averageReviewScore")
    review_cnt = doc.get("totalReviewCount")

    # Tolerate a missing or non-mapping ingredientsInfo.
    ing_info = doc.get("ingredientsInfo")
    ing_str = ing_info.get("ingredients") if isinstance(ing_info, dict) else None
    ingredients = _split_ingredients(ing_str) if isinstance(ing_str, str) else []

    # Skip malformed review entries and empty contents; in particular a None
    # content must not end up in the text as the literal string "None".
    contents = []
    for review in doc.get("reviews") or []:
        if not isinstance(review, dict):
            continue
        content = review.get("reviewContent", "")
        text = "" if content is None else str(content)
        if text:
            contents.append(text)
    review_text = " ".join(contents)[:5000]

    # MD5 serves only as a stable document id here, not as a security measure.
    pid = hashlib.md5(f"{name}|{brand}".encode("utf-8")).hexdigest()
    return {
        "product_id": pid,
        "productName": name,
        "brand": brand,
        "category": category,
        "ingredients": ingredients,
        "review_text": review_text,
        "salePrice": sale,
        "averageReviewScore": avg_score,
        "totalReviewCount": review_cnt,
    }


def load_and_normalize_all() -> List[Dict[str, Any]]:
    """Load and normalise every product file found in the working directory.

    Looks for cream.json, essence.json, essense.json (alternate spelling,
    also mapped to the "essence" category) and skintoner.json. Missing files
    are skipped. A file that fails part-way is reported with a warning, and
    the documents normalised before the failure are kept.
    """
    cwd = os.getcwd()
    sources = (
        ("cream.json", "cream"),
        ("essence.json", "essence"),
        ("essense.json", "essence"),
        ("skintoner.json", "skintoner"),
    )
    collected: List[Dict[str, Any]] = []
    for filename, cat in sources:
        path = os.path.join(cwd, filename)
        if not os.path.exists(path):
            continue
        try:
            for doc in load_json(path):
                collected.append(normalize_product(doc, cat))
        except Exception as e:
            print(f"[WARN] Failed to load {path}: {e}")
    return collected


def encode_texts_batch(texts: List[str], batch_size: int = 64) -> np.ndarray:
    """Embed a list of texts with the module-level sentence model.

    Returns:
        A float32 array with one normalised embedding per text; an empty
        ``(0, EMB_DIM)`` array when ``texts`` is empty.
    """
    if texts:
        embedded = emb_model.encode(
            texts,
            normalize_embeddings=True,
            batch_size=batch_size,
            show_progress_bar=False,
        )
        return np.asarray(embedded, dtype=np.float32)
    return np.zeros((0, EMB_DIM), dtype=np.float32)


def add_review_vectors_batch(docs: List[Dict[str, Any]], batch_size: int = 64) -> List[Dict[str, Any]]:
    """Attach a "review_vector" embedding of review_text to each document.

    The documents are modified in place and the same list is returned. A
    missing or empty review_text is embedded as the empty string.
    """
    review_texts = []
    for doc in docs:
        review_texts.append(doc.get("review_text", "") or "")

    if review_texts:
        vectors = encode_texts_batch(review_texts, batch_size=batch_size)
        for doc, vector in zip(docs, vectors):
            doc["review_vector"] = vector.tolist()
    return docs


def ensure_index_minimal(es: Elasticsearch):
    """Create INDEX_NAME with the product mapping unless it already exists."""
    if es.indices.exists(index=INDEX_NAME):
        return

    # Indexed dense vector so that review embeddings can be compared by cosine.
    vector_field = {
        "type": "dense_vector",
        "dims": EMB_DIM,
        "index": True,
        "similarity": "cosine",
    }
    properties = {
        "product_id": {"type": "keyword"},
        "productName": {"type": "text"},
        "brand": {"type": "keyword"},
        "category": {"type": "keyword"},
        "ingredients": {"type": "text"},
        "review_text": {"type": "text"},
        "salePrice": {"type": "float"},
        "averageReviewScore": {"type": "float"},
        "totalReviewCount": {"type": "integer"},
        "review_vector": vector_field,
    }
    mapping = {"mappings": {"properties": properties}}

    es.indices.create(index=INDEX_NAME, body=mapping)
    print(f"[INFO] index created: {INDEX_NAME}")


def bulk_index(es: Elasticsearch, docs: List[Dict[str, Any]]):
    """Bulk-index the documents into INDEX_NAME, using product_id as the id.

    The call waits for a refresh, so the documents are searchable when it
    returns.
    """
    def _actions():
        # Generated lazily so the whole action list is never held in memory.
        for doc in docs:
            yield {
                "_op_type": "index",
                "_index": INDEX_NAME,
                "_id": doc["product_id"],
                "_source": doc,
            }

    helpers.bulk(es, _actions(), refresh="wait_for")
    print(f"[INFO] bulk indexed {len(docs)} docs into {INDEX_NAME}")


# Ingredient keywords (English and Korean) treated as beneficial for each skin
# concern. build_query boosts products matching them, and _make_pos_neg_vocab
# turns them into the "positive" vocabulary for feature extraction.
POS_ING = {
    "pigment": {
        "niacinamide", "나이아신아마이드",
        "vitamin c", "비타민c",
        "ascorbic", "아스코빅",
        "arbutin", "아르부틴",
        "tranexamic", "트라넥삼산",
        "licorice", "감초",
        "kojic", "코직",
    },
    "sensitivity": {
        "panthenol", "판테놀",
        "madecassoside", "마데카소사이드",
        "centella", "병풀",
        "cica", "시카",
        "allantoin", "알란토인",
        "beta-glucan", "베타글루칸",
        "bisabolol", "비사볼올",
        "aloe", "알로에",
        "ceramide", "세라마이드",
    },
    "dry": {
        "hyaluronic", "히알루론산",
        "glycerin", "글리세린",
        "squalane", "스쿠알란",
        "ceramide", "세라마이드",
        "cholesterol", "콜레스테롤",
        "urea", "요소",
    },
    "acne": {
        "salicylic", "살리실산",
        "bha", "바하",
        "azelaic", "아젤라익",
        "zinc", "아연",
    },
}

# Ingredient keywords treated as undesirable for a concern; only used for the
# "negative" vocabulary in _make_pos_neg_vocab.
# NOTE(review): _split_ingredients also splits on whitespace and
# _ingredients_hits compares whole tokens, so multi-word entries such as
# "essential oil" or "coconut oil" (and "vitamin c" above) look unable to
# match a single token - confirm whether that is intended.
NEG_ING = {
    "sensitivity": {
        "fragrance", "향료", "향",
        "parfum", "퍼퓸",
        "alcohol", "알코올", "에탄올",
        "essential oil", "에센셜 오일",
        "tea tree oil", "티트리 오일",
    },
    "acne": {
        "coconut oil", "코코넛 오일",
        "isopropyl myristate", "아이소프로필 미리스테이트",
        "lanolin", "라놀린",
    },
}


def fusion_to_query_text(fusion: Dict[str, Any]) -> str:
    """Describe a fusion result as Korean sentences for embedding search.

    One sentence is added for the skin type (when present) and for each index
    that reaches 2.0; an oil index of at most 1.0 adds a "not oily" sentence.
    A fixed preference sentence is always appended. Sentences are joined with
    single spaces.
    """
    indices = fusion.get("indices", {})
    skin = fusion.get("skin_type", "")

    oil = float(indices.get("oil", 0.0))
    dry = float(indices.get("dry", 0.0))
    sens = float(indices.get("sensitivity", 0.0))
    wrinkle = float(indices.get("wrinkle", 0.0))
    pigment = float(indices.get("pigment", 0.0))

    # (include?, sentence) pairs in output order; the two oil conditions can
    # never both hold.
    candidates = [
        (bool(skin), f"피부 타입은 {skin} 피부입니다."),
        (oil >= 2.0, "피지가 많고 번들거림이 고민입니다."),
        (oil <= 1.0, "유분은 많지 않습니다."),
        (dry >= 2.0, "건조하고 당김이 느껴집니다."),
        (sens >= 2.0, "민감성과 자극, 붉은기 고민이 큽니다."),
        (pigment >= 2.0, "기미, 잡티, 색소침착 개선을 원합니다."),
        (wrinkle >= 2.0, "주름과 탄력 저하도 고민입니다."),
        (True, "자극이 적고 트러블이 덜 나며 피부 진정과 개선에 도움이 된다는 후기가 많은 제품을 선호합니다."),
    ]
    return " ".join(sentence for include, sentence in candidates if include)


def encode_query_text(text: str) -> List[float]:
    """Embed a query string; empty text yields an all-zero vector of EMB_DIM."""
    if text:
        return emb_model.encode(text, normalize_embeddings=True).tolist()
    return [0.0] * EMB_DIM


def encode_query_from_fusion(fusion: Dict[str, Any]) -> List[float]:
    """Embed the query text derived from a fusion result."""
    return encode_query_text(fusion_to_query_text(fusion))


def _ing_filter(ings: List[str]) -> Dict[str, Any]:
    """Build a bool query that matches when any ingredient phrase is present."""
    clauses = []
    for ing in ings:
        clauses.append({"match_phrase": {"ingredients": ing}})
    return {"bool": {"should": clauses, "minimum_should_match": 1}}


def build_query(
    fusion: Dict[str, Any],
    category: str,
    top_k: int,
    query_vec: List[float],
) -> Dict[str, Any]:
    """Build the Elasticsearch search body for one product category.

    Documents are restricted to ``category`` and scored by a script that
    combines cosine similarity against ``review_vector`` with small bonuses
    for the average review score and the review count. When any skin concern
    is active (index >= 2.0, or the acne flag), products containing one of
    the matching beneficial ingredients get an extra weight of 3.0.

    Args:
        fusion: Fusion result holding "indices" and, optionally, "flags".
        category: Category keyword to filter on.
        top_k: Number of hits to request.
        query_vec: Query embedding passed to the scoring script.

    Returns:
        The request body as a dict.
    """
    idx = fusion.get("indices", {})
    oil = float(idx.get("oil", 0.0))
    dry = float(idx.get("dry", 0.0))
    sens = float(idx.get("sensitivity", 0.0))
    pigment = float(idx.get("pigment", 0.0))
    has_acne_flag = bool(fusion.get("flags", {}).get("acne"))

    wanted = set()
    if pigment >= 2.0:
        wanted.update(POS_ING["pigment"])
    if sens >= 2.0:
        wanted.update(POS_ING["sensitivity"])
    if dry >= 2.0:
        wanted.update(POS_ING["dry"])
    if oil >= 2.0 or has_acne_flag:
        wanted.update(POS_ING["acne"])

    # Painless script: cosine similarity rescaled to [0, 1], plus up to 0.5
    # for the review score and up to 0.5 for the (capped) review count.
    script_source = """
                    double sim = cosineSimilarity(params.qvec, 'review_vector');
                    sim = (sim + 1.0) / 2.0;

                    double bonus = 0.0;
                    if (!doc['averageReviewScore'].empty) {
                        double r = Math.min(Math.max(doc['averageReviewScore'].value, 0.0), 5.0);
                        bonus += (r / 5.0) * 0.5;
                    }
                    if (!doc['totalReviewCount'].empty) {
                        double n = Math.min(doc['totalReviewCount'].value, 5000);
                        bonus += (n / 5000.0) * 0.5;
                    }

                    return 1.0 + 5.0 * sim + bonus;
                """

    functions = [
        {
            "script_score": {
                "script": {
                    "source": script_source,
                    "params": {"qvec": query_vec},
                }
            }
        }
    ]
    if wanted:
        functions.append({"filter": _ing_filter(list(wanted)), "weight": 3.0})

    returned_fields = [
        "product_id", "productName", "brand", "category",
        "ingredients", "salePrice", "averageReviewScore",
        "totalReviewCount"
    ]
    return {
        "size": top_k,
        "query": {
            "function_score": {
                "query": {"term": {"category": category}},
                "score_mode": "sum",
                "boost_mode": "sum",
                "functions": functions,
            }
        },
        "_source": {"includes": returned_fields},
    }


def search_candidates_per_category(es: Elasticsearch, fusion: Dict[str, Any], per_cat: int = 30):
    """Fetch up to ``per_cat`` candidates for every category in CATEGORIES.

    Returns:
        A dict mapping each category to its hits in Elasticsearch order. Each
        hit carries "rank_es" (1-based), "score_es" and the returned source
        fields.
    """
    query_vec = encode_query_from_fusion(fusion)
    results: Dict[str, List[Dict[str, Any]]] = {}
    for cat in CATEGORIES:
        resp = es.search(index=INDEX_NAME, body=build_query(fusion, cat, per_cat, query_vec))
        rows = []
        for rank, hit in enumerate(resp.get("hits", {}).get("hits", []), start=1):
            row = {"rank_es": rank, "score_es": hit["_score"]}
            row.update(hit["_source"])
            rows.append(row)
        results[cat] = rows
    return results


def _make_pos_neg_vocab(fusion: Dict[str, Any]) -> Tuple[set, set]:
    """Collect beneficial and undesirable ingredient keywords for a user.

    A concern is active when its index is at least 2.0; the acne concern is
    also active when the "acne" flag is set. Only the sensitivity and acne
    concerns contribute negative keywords.

    Returns:
        ``(positive_keywords, negative_keywords)`` as two sets.
    """
    idx = fusion.get("indices", {})
    oil = float(idx.get("oil", 0.0))
    dry = float(idx.get("dry", 0.0))
    sens = float(idx.get("sensitivity", 0.0))
    pigment = float(idx.get("pigment", 0.0))
    acne_flagged = bool(fusion.get("flags", {}).get("acne"))

    good, bad = set(), set()
    if pigment >= 2.0:
        good.update(POS_ING["pigment"])
    if sens >= 2.0:
        good.update(POS_ING["sensitivity"])
        bad.update(NEG_ING["sensitivity"])
    if dry >= 2.0:
        good.update(POS_ING["dry"])
    if oil >= 2.0 or acne_flagged:
        good.update(POS_ING["acne"])
        bad.update(NEG_ING["acne"])
    return good, bad


def _ingredients_hits(ingredients, vocab):
    """Count the ingredient entries that are members of ``vocab``.

    Matching is exact membership, duplicates count each time, and a missing
    ingredient list counts as empty.
    """
    count = 0
    for item in ingredients or []:
        if item in vocab:
            count += 1
    return count


def _safe_log1p(x):
    """Return log(1 + x); falsy, non-numeric or out-of-domain input gives 0.0."""
    try:
        if not x:
            return 0.0
        return math.log1p(float(x))
    except Exception:
        return 0.0


def featurize(results_by_cat: Dict[str, List[Dict[str, Any]]], fusion: Dict[str, Any]):
    """Turn per-category candidates into a feature matrix for ranking.

    Each row holds eight features: positive hits, negative hits, their
    difference, positive-hit ratio, average review score, log1p(review
    count), log1p(price) and the Elasticsearch score.

    Returns:
        ``(X, group)``: the feature rows in CATEGORIES order and the number
        of rows contributed by each category.
    """
    pos_vocab, neg_vocab = _make_pos_neg_vocab(fusion)
    features, group_sizes = [], []

    for cat in CATEGORIES:
        rows = results_by_cat.get(cat, [])
        group_sizes.append(len(rows))

        for row in rows:
            ingredients = row.get("ingredients") or []
            n_pos = _ingredients_hits(ingredients, pos_vocab)
            n_neg = _ingredients_hits(ingredients, neg_vocab)
            if ingredients:
                share = n_pos / max(1, len(ingredients))
            else:
                share = 0.0

            features.append([
                n_pos,
                n_neg,
                n_pos - n_neg,
                share,
                row.get("averageReviewScore") or 0.0,
                _safe_log1p(row.get("totalReviewCount")),
                _safe_log1p(row.get("salePrice")),
                row.get("score_es", 0.0),
            ])

    return features, group_sizes


def train_booster_offline(es: Elasticsearch, fusion_json: Dict[str, Any], model_path: str = LTR_MODEL_PATH):
    """Train a pairwise XGBoost ranker on the current candidates and save it.

    The labels come from each candidate's position in the Elasticsearch
    result list (first hit gets the highest label), not from external
    relevance judgements. Each category forms one ranking group.

    Args:
        es: Elasticsearch client used to fetch the candidates.
        fusion_json: Fusion result that drives the query and the features.
        model_path: Destination file for the trained model.

    Raises:
        RuntimeError: If no candidates were returned.
    """
    import xgboost as xgb

    es_results = search_candidates_per_category(es, fusion_json, per_cat=30)
    X, group = featurize(es_results, fusion_json)
    if not X:
        raise RuntimeError("LTR 학습을 위한 후보 데이터가 없습니다.")

    # Labels follow the same category order as the rows produced by featurize.
    labels: List[float] = []
    for cat in CATEGORIES:
        n_rows = len(es_results.get(cat, []))
        labels.extend(float(n_rows - pos) for pos in range(n_rows))

    dtrain = xgb.DMatrix(
        np.asarray(X, dtype=np.float32),
        label=np.asarray(labels, dtype=np.float32),
    )
    dtrain.set_group(group)

    settings = {
        "objective": "rank:pairwise",
        "eta": 0.1,
        "max_depth": 5,
        "eval_metric": "ndcg",
    }
    booster = xgb.train(settings, dtrain, num_boost_round=80)
    booster.save_model(model_path)
    print(f"[INFO] Booster 모델 학습 및 저장 완료: {model_path}")


# Script entry point: create the index if needed, index whatever product files
# are found in the working directory, then train and save the ranking model.
if __name__ == "__main__":
    es = connect_es()
    ensure_index_minimal(es)

    docs = load_and_normalize_all()
    if docs:
        print(f"[INFO] 로드한 문서 수: {len(docs)}")
        docs = add_review_vectors_batch(docs, batch_size=64)
        bulk_index(es, docs)

    # Hard-coded sample profile that drives the training query. Of the flags,
    # only "acne" is read by build_query and _make_pos_neg_vocab.
    fusion_json = {
        "skin_type": "복합성",
        "indices": {
            "oil": 2.2,
            "dry": 1.0,
            "sensitivity": 2.4,
            "wrinkle": 1.2,
            "pigment": 2.8,
        },
        "flags": {"sensitive": True, "aging": False, "pigment": True, "acne": False},
    }

    train_booster_offline(es, fusion_json, LTR_MODEL_PATH)

[INFO] 로드한 문서 수: 243
[INFO] bulk indexed 243 docs into cosmetics_demo
[INFO] Booster 모델 학습 및 저장 완료: ltr_booster.json


In [17]:
import os, math
from typing import List, Dict, Any, Tuple
from elasticsearch import Elasticsearch

import numpy as np
from sentence_transformers import SentenceTransformer

# torch is optional here: it is only used to choose the device for the
# embedding model, and its absence falls back to CPU.
try:
    import torch
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
except ImportError:
    DEVICE = "cpu"

# Sentence-embedding model, loaded once when this module/cell is executed.
EMB_MODEL_NAME = "jhgan/ko-sroberta-multitask"
emb_model = SentenceTransformer(EMB_MODEL_NAME, device=DEVICE)
# Embedding width reported by the model; sizes the zero vector returned by
# encode_query_text for empty input.
EMB_DIM = emb_model.get_sentence_embedding_dimension()

INDEX_NAME = "cosmetics_demo"
CATEGORIES = ["cream", "essence", "skintoner"]
LTR_MODEL_PATH = "ltr_booster.json"


def connect_es() -> Elasticsearch:
    """Return an Elasticsearch client for ES_URL, defaulting to localhost:9200."""
    target = os.environ.get("ES_URL", "http://localhost:9200")
    client = Elasticsearch(target, request_timeout=20)
    return client


# Ingredient keywords (English and Korean) treated as beneficial for each skin
# concern. build_query boosts products matching them, and _make_pos_neg_vocab
# turns them into the "positive" vocabulary for feature extraction.
POS_ING = {
    "pigment": {
        "niacinamide", "나이아신아마이드",
        "vitamin c", "비타민c",
        "ascorbic", "아스코빅",
        "arbutin", "아르부틴",
        "tranexamic", "트라넥삼산",
        "licorice", "감초",
        "kojic", "코직",
    },
    "sensitivity": {
        "panthenol", "판테놀",
        "madecassoside", "마데카소사이드",
        "centella", "병풀",
        "cica", "시카",
        "allantoin", "알란토인",
        "beta-glucan", "베타글루칸",
        "bisabolol", "비사볼올",
        "aloe", "알로에",
        "ceramide", "세라마이드",
    },
    "dry": {
        "hyaluronic", "히알루론산",
        "glycerin", "글리세린",
        "squalane", "스쿠알란",
        "ceramide", "세라마이드",
        "cholesterol", "콜레스테롤",
        "urea", "요소",
    },
    "acne": {
        "salicylic", "살리실산",
        "bha", "바하",
        "azelaic", "아젤라익",
        "zinc", "아연",
    },
}

# Ingredient keywords treated as undesirable for a concern; only used for the
# "negative" vocabulary in _make_pos_neg_vocab.
# NOTE(review): _ingredients_hits compares whole list entries, so multi-word
# keywords only match if the indexed ingredient entries keep their spaces -
# verify against how the ingredients were tokenised at indexing time.
NEG_ING = {
    "sensitivity": {
        "fragrance", "향료", "향",
        "parfum", "퍼퓸",
        "alcohol", "알코올", "에탄올",
        "essential oil", "에센셜 오일",
        "tea tree oil", "티트리 오일",
    },
    "acne": {
        "coconut oil", "코코넛 오일",
        "isopropyl myristate", "아이소프로필 미리스테이트",
        "lanolin", "라놀린",
    },
}


def encode_query_text(text: str) -> List[float]:
    """Return the normalised embedding of ``text`` as a list of floats.

    Empty text skips the model and returns EMB_DIM zeros.
    """
    if text:
        embedding = emb_model.encode(text, normalize_embeddings=True)
        return embedding.tolist()
    return [0.0] * EMB_DIM


def fusion_to_query_text(fusion: Dict[str, Any]) -> str:
    """Render a fusion result as a Korean free-text query.

    The text names the skin type (when given), adds one sentence per concern
    whose index is at least 2.0, states that the skin is not oily when the
    oil index is at most 1.0, and always ends with a fixed preference
    sentence. Sentences are separated by single spaces.
    """
    indices = fusion.get("indices", {})
    skin = fusion.get("skin_type", "")

    oil = float(indices.get("oil", 0.0))
    dry = float(indices.get("dry", 0.0))
    sens = float(indices.get("sensitivity", 0.0))
    wrinkle = float(indices.get("wrinkle", 0.0))
    pigment = float(indices.get("pigment", 0.0))

    # (include?, sentence) pairs in output order; the two oil conditions are
    # mutually exclusive.
    rules = (
        (bool(skin), f"피부 타입은 {skin} 피부입니다."),
        (oil >= 2.0, "피지가 많고 번들거림이 고민입니다."),
        (oil <= 1.0, "유분은 많지 않습니다."),
        (dry >= 2.0, "건조하고 당김이 느껴집니다."),
        (sens >= 2.0, "민감성과 자극, 붉은기 고민이 큽니다."),
        (pigment >= 2.0, "기미, 잡티, 색소침착 개선을 원합니다."),
        (wrinkle >= 2.0, "주름과 탄력 저하도 고민입니다."),
        (True, "자극이 적고 트러블이 덜 나며 피부 진정과 개선에 도움이 된다는 후기가 많은 제품을 선호합니다."),
    )
    selected = [sentence for include, sentence in rules if include]
    return " ".join(selected)


def encode_query_from_fusion(fusion: Dict[str, Any]) -> List[float]:
    """Build the query text for a fusion result and return its embedding."""
    query_text = fusion_to_query_text(fusion)
    embedding = encode_query_text(query_text)
    return embedding


def _ing_filter(ings: List[str]) -> Dict[str, Any]:
    """Return a bool/should filter with one match_phrase clause per ingredient.

    At least one clause has to match.
    """
    should = list(map(lambda ing: {"match_phrase": {"ingredients": ing}}, ings))
    bool_query = {"should": should, "minimum_should_match": 1}
    return {"bool": bool_query}


def build_query(
    fusion: Dict[str, Any],
    category: str,
    top_k: int,
    query_vec: List[float],
) -> Dict[str, Any]:
    """Assemble the function_score search body for one category.

    The base query is a term filter on ``category``. A scoring script adds
    cosine similarity against ``review_vector`` plus bonuses for the average
    review score and review count. If a skin concern is active (index >= 2.0,
    or the acne flag), an ingredient filter with weight 3.0 is appended.
    Function scores and the base score are combined by summation.

    Args:
        fusion: Fusion result holding "indices" and, optionally, "flags".
        category: Category keyword to filter on.
        top_k: Number of hits to request.
        query_vec: Query embedding passed to the scoring script.

    Returns:
        The request body as a dict.
    """
    idx = fusion.get("indices", {})
    oil = float(idx.get("oil", 0.0))
    dry = float(idx.get("dry", 0.0))
    sens = float(idx.get("sensitivity", 0.0))
    pigment = float(idx.get("pigment", 0.0))
    has_acne_flag = bool(fusion.get("flags", {}).get("acne"))

    boosted = set()
    if pigment >= 2.0:
        boosted.update(POS_ING["pigment"])
    if sens >= 2.0:
        boosted.update(POS_ING["sensitivity"])
    if dry >= 2.0:
        boosted.update(POS_ING["dry"])
    if oil >= 2.0 or has_acne_flag:
        boosted.update(POS_ING["acne"])

    # Painless script: cosine similarity rescaled to [0, 1], plus up to 0.5
    # for the review score and up to 0.5 for the (capped) review count.
    painless = """
                    double sim = cosineSimilarity(params.qvec, 'review_vector');
                    sim = (sim + 1.0) / 2.0;

                    double bonus = 0.0;
                    if (!doc['averageReviewScore'].empty) {
                        double r = Math.min(Math.max(doc['averageReviewScore'].value, 0.0), 5.0);
                        bonus += (r / 5.0) * 0.5;
                    }
                    if (!doc['totalReviewCount'].empty) {
                        double n = Math.min(doc['totalReviewCount'].value, 5000);
                        bonus += (n / 5000.0) * 0.5;
                    }

                    return 1.0 + 5.0 * sim + bonus;
                """

    similarity_fn = {
        "script_score": {
            "script": {
                "source": painless,
                "params": {"qvec": query_vec},
            }
        }
    }
    functions = [similarity_fn]
    if boosted:
        functions.append({"filter": _ing_filter(list(boosted)), "weight": 3.0})

    function_score = {
        "query": {"term": {"category": category}},
        "score_mode": "sum",
        "boost_mode": "sum",
        "functions": functions,
    }
    source_filter = {
        "includes": [
            "product_id", "productName", "brand", "category",
            "ingredients", "salePrice", "averageReviewScore",
            "totalReviewCount"
        ]
    }
    return {
        "size": top_k,
        "query": {"function_score": function_score},
        "_source": source_filter,
    }


def search_candidates_per_category(es: Elasticsearch, fusion: Dict[str, Any], per_cat: int = 30):
    """Query Elasticsearch once per category and collect the hits.

    The query embedding is computed once and reused for every category.

    Returns:
        A dict mapping each category to a list of rows; every row holds
        "rank_es" (1-based), "score_es" and the returned source fields.
    """
    query_vec = encode_query_from_fusion(fusion)
    by_category: Dict[str, List[Dict[str, Any]]] = {}
    for cat in CATEGORIES:
        request = build_query(fusion, cat, per_cat, query_vec)
        response = es.search(index=INDEX_NAME, body=request)
        hits = response.get("hits", {}).get("hits", [])
        rows = []
        for position, hit in enumerate(hits):
            row = {"rank_es": position + 1, "score_es": hit["_score"]}
            row.update(hit["_source"])
            rows.append(row)
        by_category[cat] = rows
    return by_category


def _make_pos_neg_vocab(fusion: Dict[str, Any]) -> Tuple[set, set]:
    """Build the positive and negative ingredient vocabularies for a user.

    Concerns whose index is at least 2.0 contribute their POS_ING keywords;
    the acne concern is also triggered by the "acne" flag. Sensitivity and
    acne additionally contribute their NEG_ING keywords.

    Returns:
        ``(positive_keywords, negative_keywords)`` as two sets.
    """
    idx = fusion.get("indices", {})
    oil = float(idx.get("oil", 0.0))
    dry = float(idx.get("dry", 0.0))
    sens = float(idx.get("sensitivity", 0.0))
    pigment = float(idx.get("pigment", 0.0))
    acne_flagged = bool(fusion.get("flags", {}).get("acne"))

    helpful = set()
    harmful = set()
    if pigment >= 2.0:
        helpful.update(POS_ING["pigment"])
    if sens >= 2.0:
        helpful.update(POS_ING["sensitivity"])
        harmful.update(NEG_ING["sensitivity"])
    if dry >= 2.0:
        helpful.update(POS_ING["dry"])
    if oil >= 2.0 or acne_flagged:
        helpful.update(POS_ING["acne"])
        harmful.update(NEG_ING["acne"])
    return helpful, harmful


def _ingredients_hits(ingredients, vocab):
    """Return how many entries of ``ingredients`` are members of ``vocab``.

    ``None`` is treated as an empty list; duplicates are counted each time.
    """
    entries = ingredients or []
    return len([entry for entry in entries if entry in vocab])


def _safe_log1p(x):
    """Compute log(1 + x), returning 0.0 for falsy or unusable input."""
    try:
        if x:
            return math.log1p(float(x))
        return 0.0
    except Exception:
        return 0.0


def featurize(results_by_cat: Dict[str, List[Dict[str, Any]]], fusion: Dict[str, Any]):
    """Build ranking features for every candidate, in CATEGORIES order.

    Each row holds eight features: positive hits, negative hits, their
    difference, positive-hit ratio, average review score, log1p(review
    count), log1p(price) and the Elasticsearch score.

    Returns:
        ``(X, group, ids, cats)``: feature rows, rows per category, and the
        product id and category of every row.

    Raises:
        KeyError: If a candidate has no "product_id".
    """
    pos_vocab, neg_vocab = _make_pos_neg_vocab(fusion)
    features, group_sizes, product_ids, row_cats = [], [], [], []

    for cat in CATEGORIES:
        rows = results_by_cat.get(cat, [])
        group_sizes.append(len(rows))

        for row in rows:
            ingredients = row.get("ingredients") or []
            n_pos = _ingredients_hits(ingredients, pos_vocab)
            n_neg = _ingredients_hits(ingredients, neg_vocab)
            if ingredients:
                share = n_pos / max(1, len(ingredients))
            else:
                share = 0.0

            features.append([
                n_pos,
                n_neg,
                n_pos - n_neg,
                share,
                row.get("averageReviewScore") or 0.0,
                _safe_log1p(row.get("totalReviewCount")),
                _safe_log1p(row.get("salePrice")),
                row.get("score_es", 0.0),
            ])
            product_ids.append(row["product_id"])
            row_cats.append(cat)

    return features, group_sizes, product_ids, row_cats


def load_booster(model_path: str = LTR_MODEL_PATH):
    """Create an ``xgboost.Booster`` and load the model stored at ``model_path``.

    xgboost is imported lazily so it is only required when a model is
    actually loaded.
    """
    from xgboost import Booster

    model = Booster()
    model.load_model(model_path)
    return model


def rerank_with_booster(results_by_cat, fusion, booster):
    """Score every candidate with ``booster`` and re-sort each category.

    The feature matrix comes from ``featurize``, which walks ``CATEGORIES``
    in order; predictions are therefore mapped back to rows by walking the
    categories in the same order with a running offset.

    Returns:
        A dict keyed by every entry of ``CATEGORIES``. Each value is a list
        of shallow copies of the input rows, extended with ``"score_ltr"``
        (the prediction as a float) and ``"rank_ltr"`` (1-based position),
        sorted by ``"score_ltr"`` in descending order. When there are no
        candidates at all, every category maps to an empty list.
    """
    import xgboost as xgb

    feature_rows, _, _, _ = featurize(results_by_cat, fusion)
    if not feature_rows:
        return {c: [] for c in CATEGORIES}

    matrix = xgb.DMatrix(np.asarray(feature_rows, dtype=np.float32))
    scores = booster.predict(matrix)

    out = {c: [] for c in CATEGORIES}
    offset = 0
    for cat in CATEGORIES:
        candidates = results_by_cat.get(cat, [])
        bucket = out[cat]
        # `pos` is the absolute row index into the flat prediction array.
        for pos, original in enumerate(candidates, start=offset):
            scored = dict(original)  # copy so the caller's rows stay untouched
            scored["score_ltr"] = float(scores[pos])
            bucket.append(scored)
        bucket.sort(key=lambda item: item["score_ltr"], reverse=True)
        for rank, item in enumerate(bucket, 1):
            item["rank_ltr"] = rank
        offset += len(candidates)
    return out


def top_k_overall(reranked_by_cat, k=3):
    """Return the ``k`` best rows across all categories.

    Every row is copied and tagged with a ``"category"`` key naming the
    category it came from (overwriting any existing value). Rows are ordered
    by ``"score_ltr"`` descending; rows without that key are treated as 0.0.
    """
    pooled = []
    for cat, rows in reranked_by_cat.items():
        for row in rows:
            tagged = dict(row)
            tagged["category"] = cat
            pooled.append(tagged)
    ranked = sorted(pooled, key=lambda item: item.get("score_ltr", 0.0), reverse=True)
    return ranked[:k]


def top_k_per_category(reranked_by_cat, k=3):
    """Keep only the first ``k`` rows of every category.

    The rows are taken in their existing order (no re-sorting is done here)
    and the category keys are preserved as-is.
    """
    trimmed = {}
    for cat, rows in reranked_by_cat.items():
        trimmed[cat] = rows[:k]
    return trimmed


def recommend_from_fusion_online(es: Elasticsearch, booster, fusion_json: Dict[str, Any], topk: int = 3):
    """Run the retrieve-then-rerank pipeline and return the overall top items.

    Fetches 30 candidates per category via ``search_candidates_per_category``,
    scores them with ``rerank_with_booster``, pools all categories together
    and returns the ``topk`` rows with the highest ``"score_ltr"`` (rows
    lacking that key count as 0.0).
    """
    candidates = search_candidates_per_category(es, fusion_json, per_cat=30)
    by_category = rerank_with_booster(candidates, fusion_json, booster)

    pooled = [doc for docs in by_category.values() for doc in docs]
    ranked = sorted(pooled, key=lambda doc: doc.get("score_ltr", 0.0), reverse=True)
    return ranked[:topk]


# Module-level singletons: both are created once, when this module/cell is
# executed, and are then shared by recommend_for_request() and the __main__
# demo below.
# NOTE(review): connect_es() and load_booster() run at import time, so
# importing this module presumably requires the search backend and the model
# file to be available - confirm this is intended.
es_global = connect_es()
booster_global = load_booster(LTR_MODEL_PATH)


def recommend_for_request(fusion_json: Dict[str, Any], topk: int = 3):
    """Recommend the ``topk`` products for one fusion result.

    Convenience entry point that delegates to
    ``recommend_from_fusion_online`` using the module-level ``es_global``
    and ``booster_global`` objects.
    """
    return recommend_from_fusion_online(
        es=es_global,
        booster=booster_global,
        fusion_json=fusion_json,
        topk=topk,
    )


# Demo / smoke test: retrieve candidates for a hard-coded fusion result, print
# them, rerank them with the LTR booster and print the top 3 of each category.
if __name__ == "__main__":
    # Sample fusion output used as the query profile.
    fusion_json = {
        "skin_type": "복합성",  # Korean for "combination" skin
        "indices": {
            "oil": 2.2,
            "dry": 1.0,
            "sensitivity": 2.4,
            "wrinkle": 1.2,
            "pigment": 2.8,
        },
        "flags": {"sensitive": True, "aging": False, "pigment": True, "acne": False},
    }

    # Reuse the module-level client and model instead of creating new ones.
    es = es_global
    booster = booster_global

    # Stage 1: candidate retrieval, 15 candidates requested per category.
    es_results = search_candidates_per_category(es, fusion_json, per_cat=15)

    # Print every retrieved candidate with its "score_es" value; product names
    # are cut to 80 characters to keep each line short.
    for cat in CATEGORIES:
        print(f"\n--- {cat.upper()} (ES candidates) ---")
        for r in es_results.get(cat, []):
            print(
                f"- ({r['score_es']:.4f}) {r.get('brand','')} | "
                f"{r.get('productName','')[:80]} | {r.get('salePrice')}"
            )

    # Stage 2: rerank the candidates with the LTR model.
    reranked = rerank_with_booster(es_results, fusion_json, booster)

    # Show the three best rows of each category with both scores side by side.
    print("\n===  CATEGORY TOP-3 (after LTR) ===")
    top3_by_cat = top_k_per_category(reranked, 3)
    for cat, rows in top3_by_cat.items():
        print(f"\n[{cat.upper()}]")
        for r in rows:
            print(
                f"- (LTR:{r['score_ltr']:.4f} | ES:{r['score_es']:.4f}) "
                f"{r.get('brand','')} | {r.get('productName','')[:80]} | {r.get('salePrice')}"
            )


--- CREAM (ES candidates) ---
- (10.2172) 더마팩토리 온라인스토어 | [비건인증] 더마팩토리 EDLP 어성초 71% 수분크림(대용량) 120g /수분진정 속건조개선 민감지성피부 촘촘보습 2배용량 수분만땅 | 8000
- (10.1608) 더마팩토리 온라인스토어 | [비건인증] 더마팩토리 레티날 300ppm 크림 30ml /비타민 A 레티날 레티놀 리프팅 주름개선 크림 | 9000
- (10.0970) 더마팩토리 온라인스토어 | [비건인증] 더마팩토리 EDLP 어성초 71% 수분크림 60g /속건조개선 민감지성피부 열감감소 수분가득 촉촉크림 | 5200
- (10.0834) 달바 공식스토어 | 달바 비건 더블 크림 단지형 70g | 78000
- (10.0327) 더마팩토리 온라인스토어 | [비건인증] 더마팩토리 세라판테놀 8% 인텐시브 크림 (세라마이드 10,000ppm) 50ml /고보습크림 보습영양 탄력크림 속보습 장벽케어크림 | 9000
- (10.0315) 더마팩토리 온라인스토어 | [비건인증] 더마팩토리 레티날 1000ppm 크림 30ml /비타민A 레티날 레티놀 모공탄력 깐달걀피부 리프팅 미백 주름개선 레틴A 탄력세럼 포 | 15000
- (10.0277) 마몽드 | [싱글] 마몽드 포어 슈링커 바쿠치올 크림 60ml x 1개 | 38000
- (9.9825) 더마팩토리 온라인스토어 | [비건인증] 더마팩토리 레티노이드 4000ppm 크림 30ml/비타민A 판테놀 리프팅 미백 주름개선 탄력크림 레티놀 3000ppm 레티날 100 | 20000
- (9.9604) 더마팩토리 온라인스토어 | [비건인증] 더마팩토리 더블화이트닝크림 40ml (20ml+20ml) / 반반크림 미백크림 미백케어 잡티케어 글루타치온 투명한피부 톤케어 브라이 | 12000
- (9.9197) 한율 | [본사직영] 한율 어린쑥 수분진정크림 55ml /한율 1등템 | 38000
- (9.9161) 달바 공식스토어 | 달바 비건 더블 크림 튜브형 6