In [1]:
! pip install torch_geometric
! pip install carte_ai
! pip install fasttext

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0
Collecting carte_ai
  Downloading carte_ai-0.0.26-py3-none-any.whl.metadata (6.2 kB)
Collecting torcheval (from carte_ai)
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Collecting fasttext (from carte_ai)
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h 

In [12]:
# Mount Google Drive at /content/drive; the project paths used by the later cells live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
from __future__ import annotations

import argparse
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Mapping, Optional, Tuple

import numpy as np
import pandas as pd
import torch
from huggingface_hub import hf_hub_download
from torch_geometric.loader import DataLoader

# ============================================================
# (1) CARTE import
# ============================================================
from carte_ai import Table2GraphTransformer
from carte_ai.src.carte_model import CARTE_Base
from carte_ai.configs.directory import config_directory


# ============================================================
# (0) 실행 환경 판별
# ============================================================
def is_ipython_env() -> bool:
    """
    Report whether the code is running inside an IPython shell (Jupyter/Colab).

    Returns:
        True when IPython can be imported and ``get_ipython()`` returns a
        shell object; False otherwise, including when the import or the
        lookup raises.
    """
    try:
        from IPython import get_ipython  # type: ignore

        shell = get_ipython()
    except Exception:
        # Any failure (IPython missing, broken install, ...) counts as "not IPython".
        return False
    return shell is not None


# ============================================================
# (2) 설정(스키마/실행옵션) - 한 곳에서 관리
# ============================================================
@dataclass(frozen=True)
class CatalogSchema:
    """
    Column layout of the input movie catalog.

    Multi-valued attributes are not joined into a single string; every value
    keeps its own numbered "slot" column (for example ``actor_1`` .. ``actor_3``).

    Further slots (``produced_by_company_2``, ``produced_in_country_2``,
    ``spoken_language_2``, ``genre_2``, ``genre_3``) are not part of the
    default; append them to ``text_cols_slot`` to enable them.
    """

    # Identifier column of a catalog row.
    id_col: str = "movieId"
    # Numeric feature columns.
    num_cols: Tuple[str, ...] = ("release_year",)
    # Text feature columns, one column per slot.
    text_cols_slot: Tuple[str, ...] = (
        "produced_by_company_1",  # companies
        "produced_in_country_1",  # countries
        "spoken_language_1",  # languages
        "actor_1",  # actors
        "actor_2",
        "actor_3",
        "director_1",  # director / writer
        "writer_1",
        "genre_1",  # genres
    )

    @property
    def required_cols(self) -> Tuple[str, ...]:
        """Every column the catalog must provide: id first, then numeric, then text slots."""
        return (self.id_col,) + tuple(self.num_cols) + tuple(self.text_cols_slot)


@dataclass(frozen=True)
class RunConfig:
    """
    Immutable bundle of the parameters for one embedding-extraction run.

    Instances are built by ``main`` (from CLI arguments) and ``run_in_notebook``
    and consumed by ``run_pipeline``.
    """

    # Catalog parquet that run_pipeline reads.
    input_path: Path
    # Destination parquet for the (id, embedding) table.
    out_path: Path
    # Checkpoint file handed to load_carte_base.
    pretrained_model_path: Path
    # Number of graphlets per DataLoader batch.
    batch_size: int = 256
    # Torch device string the model and batches are moved to.
    device: str = "cpu"
    num_layers: int = 1  # Caution: actual message passing = num_layers + 1 (readout)
    # When True, progress and diagnostic lines are printed.
    verbose: bool = True


# ============================================================
# (3) 문자열 정규화 (논문 정합: 결측은 None 유지)
# ============================================================
def normalize_text(v: Any) -> Optional[str]:
    """
    Normalize one cell value into a clean string, keeping "missing" as None.

    Rules:
        - None, NaN (any float width), ``pd.NA`` and ``pd.NaT`` -> None
        - a string that is empty after trimming -> None
        - anything else -> ``str(v)`` with surrounding whitespace removed and
          every internal whitespace run collapsed to a single space

    Args:
        v: Raw cell value of any type.

    Returns:
        The normalized string, or None when the value is missing/blank.
    """
    if v is None:
        return None

    # pd.isna recognises every scalar missing marker (float NaN of any width,
    # pd.NA, pd.NaT). A plain float check would let those through and turn
    # them into the literal strings "nan" / "<NA>" / "NaT".
    # For list-likes pd.isna returns an array; only a scalar verdict counts.
    is_missing = pd.isna(v)
    if isinstance(is_missing, (bool, np.bool_)) and is_missing:
        return None

    s = re.sub(r"\s+", " ", str(v).strip())
    return s if s else None


def build_model_input_table(
    df_raw: pd.DataFrame,
    *,
    schema: CatalogSchema,
    verbose: bool,
) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    Turn the raw catalog into the feature table X and the matching id array.

    Args:
        df_raw: Raw catalog; must contain every column in ``schema.required_cols``.
        schema: Column layout (id, numeric and text-slot columns).
        verbose: When True, print the missing ratio of each text slot.

    Returns:
        ``(X, movie_ids)``: X holds the numeric columns followed by the text
        slot columns (the id column is excluded); movie_ids holds the integer
        ids of the same rows in the same order.

    Raises:
        ValueError: If a required column is absent from ``df_raw``.

    Notes:
        Rows without an id are dropped. Missing numeric values stay NaN and
        missing text values stay None instead of being imputed.
    """
    required = list(schema.required_cols)
    absent = [name for name in required if name not in df_raw.columns]
    if absent:
        raise ValueError(f"[SchemaError] missing columns: {absent}")

    # Work on a copy limited to the schema columns, without id-less rows.
    table = df_raw.loc[:, required].copy()
    table = table.dropna(subset=[schema.id_col]).reset_index(drop=True)
    movie_ids = table[schema.id_col].astype(int).to_numpy()

    # Numeric columns: unparsable entries become NaN.
    for num_col in schema.num_cols:
        table[num_col] = pd.to_numeric(table[num_col], errors="coerce")

    # Text slots: normalized strings, None for missing values.
    for text_col in schema.text_cols_slot:
        table[text_col] = table[text_col].apply(normalize_text).astype("object")
        if not verbose:
            continue
        na_ratio = float(table[text_col].isna().mean())
        print(f"[TextSlot] {text_col} na_ratio={na_ratio:.3f}")

    feature_cols = [*schema.num_cols, *schema.text_cols_slot]
    X = table.loc[:, feature_cols].copy()
    return X, movie_ids


# ============================================================
# (4) 체크포인트 로딩 유틸
# ============================================================
def extract_state_dict(ckpt_obj: Any) -> Dict[str, torch.Tensor]:
    """
    Pull a state dict out of a loaded checkpoint object.

    Lookup order:
        1. An object exposing a callable ``state_dict`` (e.g. a module).
        2. A mapping that nests the weights under one of ``state_dict``,
           ``model_state_dict``, ``model`` or ``net`` (first match wins).
        3. A mapping that looks like a state dict itself: at least
           ``max(1, len // 3)`` of its values are tensors.

    Returns:
        A new plain ``dict`` holding the state entries.

    Raises:
        ValueError: If none of the above applies.
    """
    getter = getattr(ckpt_obj, "state_dict", None)
    if callable(getter):
        return dict(getter())

    if isinstance(ckpt_obj, Mapping):
        wrapper_keys = ("state_dict", "model_state_dict", "model", "net")
        nested = next(
            (
                ckpt_obj[name]
                for name in wrapper_keys
                if name in ckpt_obj and isinstance(ckpt_obj[name], Mapping)
            ),
            None,
        )
        if nested is not None:
            return dict(nested)

        # Heuristic: treat the mapping as the state dict when enough values are tensors.
        n_tensors = sum(1 for value in ckpt_obj.values() if isinstance(value, torch.Tensor))
        threshold = max(1, len(ckpt_obj) // 3)
        if n_tensors >= threshold:
            return dict(ckpt_obj)

    raise ValueError("checkpoint에서 state_dict를 찾지 못했습니다.")


def strip_common_prefixes(state: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
    """
    Return a copy of ``state`` whose keys no longer start with a wrapper prefix.

    The prefixes ``model.``, ``module.`` and ``ft_base.`` are removed
    repeatedly from the front of each key until none of them matches, so
    stacked wrappers such as ``module.model.x`` collapse to ``x``.
    Values are passed through untouched; if two keys collapse to the same
    name, the later entry wins.
    """
    prefixes = ("model.", "module.", "ft_base.")

    def _bare(key: str) -> str:
        # Peel one matching prefix at a time until the key is clean.
        while True:
            hit = next((p for p in prefixes if key.startswith(p)), None)
            if hit is None:
                return key
            key = key[len(hit):]

    return {_bare(name): tensor for name, tensor in state.items()}


def infer_int_from_weight(state: Dict[str, torch.Tensor], key: str, axis: int) -> int:
    """
    Read one dimension of a 2-D weight stored in a state dict.

    Args:
        state: State dict to look in.
        key: Name of the weight entry.
        axis: Which axis of the weight's shape to return.

    Returns:
        ``shape[axis]`` of the weight as a Python int.

    Raises:
        ValueError: If ``key`` is absent, not a tensor, or not 2-dimensional.
    """
    weight = state.get(key)
    is_matrix = isinstance(weight, torch.Tensor) and weight.ndim == 2
    if not is_matrix:
        raise ValueError(f"state_dict에서 {key}를 찾지 못했습니다.")
    return int(weight.shape[axis])


def infer_model_hparams_from_state(state: Dict[str, torch.Tensor]) -> Dict[str, int]:
    """
    Derive model dimensions from the weight shapes found in a state dict.

    Inference rules:
        - ``initial_x.0.weight`` has shape ``(hidden_dim, input_dim_x)``.
        - ``layers.0.linear_net_x.0.weight`` has ``ff_dim`` as its first
          dimension; when that entry is missing or not a 2-D tensor,
          ``ff_dim`` falls back to ``hidden_dim``.

    Returns:
        Dict with the keys ``input_dim_x``, ``hidden_dim`` and ``ff_dim``.

    Raises:
        ValueError: Propagated from ``infer_int_from_weight`` when
            ``initial_x.0.weight`` is unusable.
    """
    x_key = "initial_x.0.weight"
    hidden_dim = infer_int_from_weight(state, x_key, axis=0)
    input_dim_x = infer_int_from_weight(state, x_key, axis=1)

    # Feed-forward width is optional in the checkpoint.
    ff_weight = state.get("layers.0.linear_net_x.0.weight")
    if isinstance(ff_weight, torch.Tensor) and ff_weight.ndim == 2:
        ff_dim = int(ff_weight.shape[0])
    else:
        ff_dim = hidden_dim

    return dict(input_dim_x=input_dim_x, hidden_dim=hidden_dim, ff_dim=ff_dim)


def infer_ckpt_num_layers(state: Dict[str, torch.Tensor]) -> int:
    """
    Estimate how many ``layers.N`` blocks a state dict contains (for logging).

    Every key containing a ``layers.<N>.`` segment (at the start of the key or
    after a dot) contributes its index N.

    Returns:
        Highest index found plus one, or 0 when no key matches.
    """
    layer_pat = re.compile(r"(?:^|\.)layers\.(\d+)\.")
    hits = (layer_pat.search(name) for name in state.keys())
    indices = [int(hit.group(1)) for hit in hits if hit]
    if not indices:
        return 0
    return max(indices) + 1


def build_device(device: Optional[str]) -> str:
    """
    Resolve the device string to run on.

    Args:
        device: Requested device; None or a blank string means "choose automatically".

    Returns:
        The trimmed request when one was given; otherwise ``"cuda"`` if CUDA
        is available, else ``"cpu"``.
    """
    requested = "" if device is None else str(device).strip()
    if requested:
        return requested
    return "cuda" if torch.cuda.is_available() else "cpu"


def load_carte_base(
    *,
    pretrained_model_path: Path,
    device: str,
    num_layers: int,
    verbose: bool,
) -> CARTE_Base:
    """
    Build a CARTE_Base model, load pretrained weights into it, and verify
    that the weights really ended up in the model.

    Steps:
        A) Load the checkpoint on CPU, extract its state dict and strip
           wrapper prefixes from the keys.
        B) Infer input/hidden/feed-forward dimensions from the weight shapes.
        C) Instantiate CARTE_Base and load the state dict with strict=False.
        D) Abort if any missing key starts with ``initial_x.``,
           ``initial_e.`` or ``read_out_block.``.
        E) Compare checkpoint tensors with the model's tensors (allclose).
        F) Log a summary, move the model to ``device`` and switch to eval mode.

    Args:
        pretrained_model_path: Checkpoint file to load.
        device: Torch device string the model is moved to at the end.
        num_layers: ``num_layers`` argument passed to CARTE_Base.
        verbose: When True, print checkpoint/model/verification details.

    Returns:
        The loaded model, on ``device`` and in eval mode.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
        RuntimeError: If a critical parameter is missing after loading, or if
            the injection check fails.
        ValueError: Propagated from the state-dict helpers.

    Notes:
        Per the original author's note, CARTE_Base carries one additional
        read_out_block, so the effective number of message-passing steps is
        treated as ``num_layers + 1`` — not verifiable from this file.
    """

    # ============================================================
    # (A) Load the checkpoint and inspect what it is
    # ============================================================
    ckpt_path = Path(pretrained_model_path)
    if not ckpt_path.exists():
        raise FileNotFoundError(f"[CKPT] file not found: {ckpt_path}")

    # NOTE(review): depending on the installed torch version's `weights_only`
    # default, torch.load may unpickle arbitrary objects — only load
    # checkpoints from a trusted source.
    ckpt_obj = torch.load(str(ckpt_path), map_location="cpu")

    if verbose:
        print(f"[CKPT] path: {ckpt_path}")
        print(f"[CKPT] size: {ckpt_path.stat().st_size / (1024**2):.2f} MB")
        print(f"[CKPT] type: {type(ckpt_obj)}")
        if isinstance(ckpt_obj, dict):
            print(f"[CKPT] top-level keys(sample): {list(ckpt_obj.keys())[:30]}")

    raw_state = extract_state_dict(ckpt_obj)
    state = strip_common_prefixes(raw_state)

    # - Weak hint that this is a pretraining checkpoint: pretrain_classifier.* parameters are present
    if verbose:
        has_pretrain_head = any(k.startswith("pretrain_classifier.") for k in state.keys())
        print(f"[CKPT] contains pretrain_classifier.* = {has_pretrain_head}")

    # ============================================================
    # (B) Infer model hyperparameters from the checkpoint
    # ============================================================
    h = infer_model_hparams_from_state(state)
    input_dim_x = h["input_dim_x"]
    hidden_dim = h["hidden_dim"]
    ff_dim = h["ff_dim"]

    # - Edge inputs are assumed to share the node input dimension
    #   (presumably the 300-dim fastText vectors from Table2GraphTransformer — TODO confirm)
    input_dim_e = input_dim_x

    # Only used for the log line in (F).
    ckpt_layers = infer_ckpt_num_layers(state)

    # - Pick a head count that divides hidden_dim: 12 by default, else fall back to 1
    num_heads = 12
    if hidden_dim % num_heads != 0:
        if verbose:
            print(f"[Warn] hidden_dim={hidden_dim} 이 num_heads=12로 나누어떨어지지 않음 → num_heads=1로 변경")
        num_heads = 1

    # ============================================================
    # (C) Build the model and load the state dict
    # ============================================================
    model = CARTE_Base(
        input_dim_x=input_dim_x,
        input_dim_e=input_dim_e,
        hidden_dim=hidden_dim,
        num_layers=int(num_layers),
        ff_dim=int(ff_dim),
        num_heads=int(num_heads),
        concat=True,
        dropout=0.0,
    )

    # strict=False: key mismatches are reported in `incompat` instead of raising.
    incompat = model.load_state_dict(state, strict=False)

    # ============================================================
    # (D) Safety check: stop immediately if a core parameter is missing
    # ============================================================
    must_prefix = ("initial_x.", "initial_e.", "read_out_block.")
    critical_missing = [k for k in incompat.missing_keys if k.startswith(must_prefix)]
    if critical_missing:
        raise RuntimeError(
            "[LoadError] 핵심 파라미터 로드 실패 → 프리트레인 사용 불가 상태\n"
            f"  - sample: {critical_missing[:30]}"
        )

    # ============================================================
    # (E) Verify that the pretrained weights were actually injected
    # ============================================================
    def _verify_pretrained_injected(
        *,
        model: CARTE_Base,
        ckpt_state: Dict[str, torch.Tensor],
        verbose: bool,
    ) -> bool:
        """
        Check that checkpoint tensors are present in the model's parameters.

        Compares a handful of keys shared by the checkpoint and the model with
        ``torch.allclose``; a fixed list of core keys is preferred, otherwise
        the first five shared keys are used.

        Returns:
            True when at least one compared key matches, False when there are
            no shared keys or none of the compared keys match.
        """
        model_state = model.state_dict()

        # - Keys present in both the checkpoint and the model
        common_keys = [k for k in ckpt_state.keys() if k in model_state]
        if verbose:
            print(f"[Verify] common_keys={len(common_keys):,} / model_keys={len(model_state):,} / ckpt_keys={len(ckpt_state):,}")

        if not common_keys:
            if verbose:
                print("[Verify] 공통 키가 없습니다. → 구조 불일치/로딩 실패 가능성 큼")
            return False

        # - Core keys to check first (fall back to some of common_keys when none is shared)
        priority_keys = [
            "initial_x.0.weight",
            "initial_e.0.weight",
            "read_out_block.g_attn.lin_query.weight",
            "read_out_block.g_attn.lin_key.weight",
            "read_out_block.g_attn.lin_value.weight",
        ]
        check_keys = [k for k in priority_keys if k in common_keys]
        if not check_keys:
            check_keys = common_keys[:5]

        matched = 0
        for k in check_keys:
            a = model_state[k].detach().cpu()
            b = ckpt_state[k].detach().cpu()
            same = torch.allclose(a, b)
            matched += int(same)
            if verbose:
                print(f"[Verify] {k} allclose={same} shape={tuple(a.shape)}")

        # NOTE(review): a single matching key is enough to pass; not every checked key has to match.
        injected = matched > 0
        if verbose:
            print(f"[Verify] injected={injected} (matched {matched}/{len(check_keys)})")
        return injected

    injected = _verify_pretrained_injected(model=model, ckpt_state=state, verbose=verbose)
    if not injected:
        raise RuntimeError(
            "[VerifyError] 체크포인트가 모델에 실제로 주입되지 않았습니다.\n"
            "- prefix 제거/모델 구조/체크포인트 파일을 확인하세요."
        )

    # ============================================================
    # (F) Logging + move to the target device
    # ============================================================
    if verbose:
        print(f"[Model] input_dim_x={input_dim_x} input_dim_e={input_dim_e} hidden_dim={hidden_dim} ff_dim={ff_dim} heads={num_heads}")
        print(f"[Model] num_layers={num_layers} (ckpt_layers≈{ckpt_layers}) | 실제 MP≈{num_layers + 1}(readout 포함)")
        print(f"[LoadState] missing_keys={len(incompat.missing_keys)} unexpected_keys={len(incompat.unexpected_keys)}")
        print("  - missing sample:", incompat.missing_keys[:20])
        print("  - unexpected sample:", incompat.unexpected_keys[:20])
        print("[OK] pretrained weights injected ✅")

    model = model.to(torch.device(device))
    model.eval()
    return model



# ============================================================
# (5) Graphlet 생성 (fastText 다운로드 포함)
# ============================================================
def build_graphlets_from_table(
    X: pd.DataFrame,
    *,
    verbose: bool,
) -> List[Any]:
    """
    Convert the feature table (numeric + text slot columns) into graphlets.

    The fastText binary ``cc.en.300.bin`` is fetched from the Hugging Face
    repo ``hi-paris/fastText`` and handed to ``Table2GraphTransformer``,
    which is then fitted on ``X`` and applied to it in one step.

    Args:
        X: Feature table to transform.
        verbose: When True, print the graphlet count and, on a best-effort
            basis, the tensor shapes of the first graphlet.

    Returns:
        The result of ``fit_transform(X)``.
    """
    ft_model_file = hf_hub_download(
        repo_id="hi-paris/fastText",
        filename="cc.en.300.bin",
    )
    transformer = Table2GraphTransformer(
        lm_model="fasttext",
        fasttext_model_path=ft_model_file,
    )

    # fit_transform: the transformer's internal state is fitted to X itself.
    graphs = transformer.fit_transform(X)

    if not verbose:
        return graphs

    print(f"[Graphlets] count={len(graphs):,}")

    # Shape preview of the first graphlet; purely informational, so failures are ignored.
    try:
        first = graphs[0]
        node_shape = tuple(first.x.shape)
        edge_shape = tuple(first.edge_attr.shape)
        print(f"[Graphlets] sample0 x={node_shape} edge_attr={edge_shape}")
    except Exception:
        pass

    return graphs


# ============================================================
# (6) Head embedding 추출
# ============================================================
def get_head_indices_from_batch(batch: Any) -> torch.Tensor:
    """
    Return, for each graph in a batch, the index of its head node.

    Resolution order:
        1. ``batch.head_idx`` when present: returned as-is if it is a tensor,
           otherwise converted to a tensor on ``batch.x``'s device.
        2. ``batch.ptr[:-1]`` otherwise, which assumes the first node of every
           graph is its head node.

    NOTE(review): a tensor ``head_idx`` is returned unchanged; confirm it holds
    batch-level indices rather than per-graph local ones.

    Raises:
        RuntimeError: If the batch has neither ``head_idx`` nor ``ptr``.
    """
    absent = object()
    head_idx = getattr(batch, "head_idx", absent)
    if head_idx is not absent:
        if not torch.is_tensor(head_idx):
            head_idx = torch.as_tensor(head_idx, device=batch.x.device)
        return head_idx

    if not hasattr(batch, "ptr"):
        raise RuntimeError("Batch에 head_idx/ptr이 없습니다. (PyG DataLoader/그래프 생성 확인 필요)")

    return batch.ptr[:-1]


@torch.no_grad()
def extract_head_embeddings(
    graphs: List[Any],
    *,
    model: CARTE_Base,
    batch_size: int,
    device: str,
    verbose: bool,
) -> np.ndarray:
    """
    Run the model over all graphlets and collect each one's head-node embedding.

    Args:
        graphs: Graphlets to embed; batched in the given order (no shuffling).
        model: Model called as ``model(x, edge_index, edge_attr)``.
        batch_size: Number of graphlets per batch.
        device: Torch device string every batch is moved to.
        verbose: When True, print the embedding shape of the first batch.

    Returns:
        The head embeddings of all batches stacked row-wise, in input order.

    Raises:
        ValueError: If ``graphs`` is empty or the result contains NaN/Inf.
    """
    if not graphs:
        raise ValueError("graphs가 비어있습니다.")

    loader = DataLoader(graphs, batch_size=int(batch_size), shuffle=False)

    chunks: List[np.ndarray] = []
    for batch_no, batch in enumerate(loader):
        batch = batch.to(torch.device(device))

        # Per-node outputs for the whole batch; head rows are selected below.
        node_out = model(batch.x, batch.edge_index, batch.edge_attr)
        heads = get_head_indices_from_batch(batch)
        head_emb = node_out[heads].detach().cpu().numpy()
        chunks.append(head_emb)

        if verbose and batch_no == 0:
            print(f"[Embed] batch1 head_emb shape={head_emb.shape}")

    emb = np.vstack(chunks)

    # Refuse to hand back embeddings that contain NaN/Inf.
    if not np.isfinite(emb).all():
        raise ValueError("[EmbedError] embedding에 NaN/Inf가 포함되어 있습니다.")

    return emb


# ============================================================
# (7) 파이프라인
# ============================================================
def run_pipeline(cfg: RunConfig, *, schema: CatalogSchema) -> Path:
    """
    End-to-end run: catalog parquet -> graphlets -> embeddings -> output parquet.

    Args:
        cfg: Paths and runtime options for this run.
        schema: Column layout of the input catalog.

    Returns:
        ``cfg.out_path``, after the embedding table has been written there.

    Raises:
        FileNotFoundError: If ``cfg.input_path`` does not exist.
    """
    src, dst = cfg.input_path, cfg.out_path
    if not src.exists():
        raise FileNotFoundError(f"input_path not found: {src}")

    dst.parent.mkdir(parents=True, exist_ok=True)

    if cfg.verbose:
        for label, value in (("input", src), ("output", dst), ("ckpt", cfg.pretrained_model_path)):
            print(f"[IO] {label}={value}")

    # 1) Read the catalog and build the model input table.
    catalog = pd.read_parquet(src)
    X, movie_ids = build_model_input_table(catalog, schema=schema, verbose=cfg.verbose)

    # 2) Table rows -> graphlets.
    graphs = build_graphlets_from_table(X, verbose=cfg.verbose)

    # 3) Pretrained model.
    model = load_carte_base(
        pretrained_model_path=cfg.pretrained_model_path,
        device=cfg.device,
        num_layers=int(cfg.num_layers),
        verbose=cfg.verbose,
    )

    # 4) One head embedding per graphlet.
    emb = extract_head_embeddings(
        graphs,
        model=model,
        batch_size=int(cfg.batch_size),
        device=cfg.device,
        verbose=cfg.verbose,
    )

    # 5) Persist as (id, embedding-as-list-of-float32) rows.
    embedding_rows = emb.astype(np.float32).tolist()
    out_df = pd.DataFrame({schema.id_col: movie_ids, "embedding": embedding_rows})
    out_df.to_parquet(dst, index=False)

    if cfg.verbose:
        print(f"[OK] saved: {cfg.out_path} rows={len(out_df):,} dim={emb.shape[1]} device={cfg.device} num_layers={cfg.num_layers}")

    return cfg.out_path


# ============================================================
# (8) CLI / Notebook helper
# ============================================================
def build_parser() -> argparse.ArgumentParser:
    """
    Create the command-line parser for the embedding pipeline.

    Options:
        --input_path (required), --out_path (required),
        --pretrained_model_path (default ""), --batch_size (default 256),
        --device (default ""), --num_layers (default 1), --verbose (flag).
    """
    parser = argparse.ArgumentParser(add_help=True)

    required_paths = (
        ("--input_path", "movie_catalog_flat.parquet 경로"),
        ("--out_path", "movie_embeddings.parquet 저장 경로"),
    )
    for flag, description in required_paths:
        parser.add_argument(flag, type=str, required=True, help=description)

    parser.add_argument(
        "--pretrained_model_path",
        type=str,
        default="",
        help="비우면 config_directory['pretrained_model'] 사용",
    )
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--device", type=str, default="")
    parser.add_argument(
        "--num_layers",
        type=int,
        default=1,
        help="CARTE_Base 내부 layers 개수 (주의: 실제 MP≈num_layers+1(readout 포함)). oversmoothing이면 0도 테스트 권장",
    )
    parser.add_argument("--verbose", action="store_true")
    return parser


def resolve_pretrained_path(user_path: str) -> Path:
    """
    Decide which pretrained checkpoint path to use.

    Args:
        user_path: Explicit path; blank means "use the packaged default".

    Returns:
        The trimmed ``user_path`` when it is non-blank, otherwise the
        ``pretrained_model`` entry of ``config_directory``.

    Raises:
        ValueError: If both the explicit path and the default are blank.
    """
    explicit = user_path.strip()
    if explicit:
        return Path(explicit)

    fallback = str(config_directory.get("pretrained_model", "")).strip()
    if fallback:
        return Path(fallback)

    raise ValueError("pretrained_model_path가 비어있습니다. (config_directory['pretrained_model'] 확인 필요)")


def main(argv: Optional[List[str]] = None) -> None:
    """
    CLI entry point: parse arguments, assemble a RunConfig and run the pipeline.

    Args:
        argv: Argument list to parse; None lets argparse read ``sys.argv``.
    """
    ns = build_parser().parse_args(argv)

    run_pipeline(
        RunConfig(
            input_path=Path(ns.input_path),
            out_path=Path(ns.out_path),
            pretrained_model_path=resolve_pretrained_path(ns.pretrained_model_path),
            batch_size=int(ns.batch_size),
            device=build_device(ns.device),
            num_layers=int(ns.num_layers),
            verbose=bool(ns.verbose),
        ),
        schema=CatalogSchema(),
    )


def run_in_notebook(
    *,
    project_root: str = "/content/drive/MyDrive/대학원/논문/CARTE",
    input_rel: str = "data/processed/movie_catalog_flat.parquet",
    out_rel: str = "data/processed/movie_embeddings.parquet",
    batch_size: int = 256,
    device: Optional[str] = None,
    num_layers: int = 1,
    verbose: bool = True,
) -> Path:
    """
    Convenience wrapper for running the pipeline from a notebook cell.

    Args:
        project_root: Base directory that ``input_rel`` and ``out_rel`` are joined onto.
        input_rel: Catalog parquet path relative to ``project_root``.
        out_rel: Output parquet path relative to ``project_root``.
        batch_size: Graphlets per batch.
        device: Device string; None selects one automatically.
        num_layers: ``num_layers`` forwarded to the model loader.
        verbose: Whether to print progress.

    Returns:
        The path of the written embedding parquet.

    Notes:
        The checkpoint is always the packaged default
        (``resolve_pretrained_path("")``).
    """
    base_dir = Path(project_root)

    notebook_cfg = RunConfig(
        input_path=base_dir / input_rel,
        out_path=base_dir / out_rel,
        pretrained_model_path=resolve_pretrained_path(""),
        batch_size=int(batch_size),
        device=build_device(device),
        num_layers=int(num_layers),
        verbose=bool(verbose),
    )
    return run_pipeline(notebook_cfg, schema=CatalogSchema())


# CLI entry point: only when executed as a script outside IPython, so that
# argparse never tries to parse the notebook kernel's own argv.
if __name__ == "__main__" and not is_ipython_env():
    main()


# ============================================================
# (9) Notebook example (direct call)
# ============================================================
# Guarded by the IPython check: without it, running this file as a plain
# script would execute main() above and then run the pipeline a second time
# against the hard-coded Colab Drive path below.
if is_ipython_env():
    out_path = run_in_notebook(
        project_root="/content/drive/MyDrive/대학원/논문/CARTE",
        input_rel="data/processed/movie_catalog_flat.parquet",
        out_rel="data/processed/movie_embeddings.parquet",
        batch_size=256,
        device=None,
        num_layers=0,
        verbose=True,
    )
    print("saved:", out_path)


[IO] input=/content/drive/MyDrive/대학원/논문/CARTE/data/processed/movie_catalog_flat.parquet
[IO] output=/content/drive/MyDrive/대학원/논문/CARTE/data/processed/movie_embeddings.parquet
[IO] ckpt=/usr/local/lib/python3.12/dist-packages/carte_ai/data/etc/kg_pretrained.pt
[TextSlot] produced_by_company_1 na_ratio=0.124
[TextSlot] produced_in_country_1 na_ratio=0.054
[TextSlot] spoken_language_1 na_ratio=0.036
[TextSlot] actor_1 na_ratio=0.037
[TextSlot] actor_2 na_ratio=0.070
[TextSlot] actor_3 na_ratio=0.089
[TextSlot] director_1 na_ratio=0.006
[TextSlot] writer_1 na_ratio=0.119
[TextSlot] genre_1 na_ratio=0.009


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[Graphlets] count=86,272
[Graphlets] sample0 x=(11, 300) edge_attr=(20, 300)
[CKPT] path: /usr/local/lib/python3.12/dist-packages/carte_ai/data/etc/kg_pretrained.pt
[CKPT] size: 38.35 MB
[CKPT] type: <class 'dict'>
[CKPT] top-level keys(sample): ['ft_base.initial_x.0.weight', 'ft_base.initial_x.0.bias', 'ft_base.initial_x.2.weight', 'ft_base.initial_x.2.bias', 'ft_base.initial_e.0.weight', 'ft_base.initial_e.0.bias', 'ft_base.initial_e.2.weight', 'ft_base.initial_e.2.bias', 'ft_base.layers.0.g_attn.lin_query.weight', 'ft_base.layers.0.g_attn.lin_key.weight', 'ft_base.layers.0.g_attn.lin_value.weight', 'ft_base.layers.0.g_attn.lin_edge.weight', 'ft_base.layers.0.g_attn.lin_edge.bias', 'ft_base.layers.0.linear_net_x.0.weight', 'ft_base.layers.0.linear_net_x.0.bias', 'ft_base.layers.0.linear_net_x.3.weight', 'ft_base.layers.0.linear_net_x.3.bias', 'ft_base.layers.0.norm1_x.weight', 'ft_base.layers.0.norm1_x.bias', 'ft_base.layers.0.norm2_x.weight', 'ft_base.layers.0.norm2_x.bias', 'ft_bas

In [16]:
from __future__ import annotations

from pathlib import Path
from typing import Optional, Tuple

import numpy as np
import pandas as pd

# ============================================================
# 설정 (경로/컬럼명)
# ============================================================
# Project root on the mounted Google Drive; the two data paths below are relative to it.
PROJECT_ROOT = Path("/content/drive/MyDrive/대학원/논문/CARTE")
# Embedding table to diagnose.
EMB_PATH = PROJECT_ROOT / "data/processed/movie_embeddings.parquet"
CATALOG_PATH = PROJECT_ROOT / "data/processed/movie_catalog_flat.parquet"  # set to None when no catalog is available

# Column names expected in the embedding parquet.
ID_COL = "movieId"
EMB_COL = "embedding"

# ============================================================
# 로더: parquet -> (movie_ids, E)
# ============================================================
def load_embedding_matrix(
    emb_path: Path,
    *,
    id_col: str = ID_COL,
    emb_col: str = EMB_COL,
    expected_dim: Optional[int] = 300,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Load the embedding parquet and return ids plus a dense float32 matrix.

    Args:
        emb_path: Parquet file with an id column and a list-valued embedding column.
        id_col: Name of the id column.
        emb_col: Name of the embedding column.
        expected_dim: Required embedding width; None disables the check.

    Returns:
        ``(movie_ids, E)``: int64 ids of shape (N,) and float32 embeddings of
        shape (N, D), row-aligned.

    Raises:
        ValueError: On missing columns, non-numeric ids, an empty table, a
            first embedding that is not 1-D, an unexpected width, or any row
            whose shape differs from the first row's.
    """
    df = pd.read_parquet(emb_path)
    if any(col not in df.columns for col in (id_col, emb_col)):
        raise ValueError(f"[SchemaError] 필요 컬럼 없음: {id_col}, {emb_col} | got={list(df.columns)}")

    # Ids must all be numeric; report a sample of the offending raw values otherwise.
    ids_numeric = pd.to_numeric(df[id_col], errors="coerce")
    invalid = ids_numeric.isna()
    if invalid.any():
        bad = df.loc[invalid, id_col].head(10).tolist()
        raise ValueError(f"[DataError] movieId 숫자 변환 실패. sample={bad}")
    movie_ids = ids_numeric.astype(np.int64).to_numpy()

    rows = df[emb_col].tolist()
    if not rows:
        raise ValueError("[DataError] embedding 테이블이 비었습니다.")

    # The first row fixes the embedding width every other row must match.
    first = np.asarray(rows[0], dtype=np.float32)
    if first.ndim != 1:
        raise ValueError(f"[DataError] embedding[0]가 1D가 아님: {first.shape}")

    dim = int(first.shape[0])
    if expected_dim is not None and dim != expected_dim:
        raise ValueError(f"[DimError] expected_dim={expected_dim}, got_dim={dim}")

    E = np.empty((len(rows), dim), dtype=np.float32)
    for i, raw in enumerate(rows):
        vec = np.asarray(raw, dtype=np.float32)
        if vec.shape != (dim,):
            raise ValueError(f"[DimError] row={i} dim mismatch. expected={dim}, got={vec.shape}")
        E[i] = vec

    return movie_ids, E


# ============================================================
# 진단: "골고루 분포" 체크 (수치로)
# ============================================================
def run_embedding_diagnostics(
    *,
    emb_path: Path,
    catalog_path: Optional[Path] = None,
    expected_dim: int = 300,
    row_sample_size: int = 20000,
    pair_sample_size: int = 200000,
    seed: int = 7,
) -> None:
    """
    Print numeric diagnostics on whether the embeddings are well spread out
    or show signs of collapse, concentration or anisotropy.

    Checks, in order:
        1) finite-value ratio, norm quantiles, id uniqueness, and
           (on a row sample) uniqueness of rounded vectors
        2) per-dimension standard deviation (share of "dead" dimensions)
        3) anisotropy: length of the mean unit vector and cosine to it
        4) cosine similarity of randomly sampled pairs
        5) PCA explained-variance ratios and effective rank via SVD
        6) optionally, the titles of the first three ids from the catalog

    Args:
        emb_path: Embedding parquet to analyse.
        catalog_path: Optional catalog parquet used only for the title lookup.
        expected_dim: Embedding width passed to ``load_embedding_matrix``.
        row_sample_size: Upper bound on rows sampled for checks 1) to 5).
        pair_sample_size: Upper bound on the number of random pairs.
        seed: Seed of the random generator used for all sampling.

    Returns:
        None; all results are printed.
    """
    # One generator drives every random draw below, so the order of draws matters.
    rng = np.random.default_rng(seed)

    # ----------------------------
    # (1) load
    # ----------------------------
    movie_ids, E = load_embedding_matrix(emb_path, expected_dim=expected_dim)
    n, d = E.shape
    print(f"\n========== Embedding Diagnostics ==========")
    print(f"[Path] {emb_path}")
    print(f"[Shape] N={n:,} D={d}")

    # ----------------------------
    # (2) basic sanity
    # ----------------------------
    finite_ratio = float(np.isfinite(E).mean())
    print(f"\n[Finite] ratio={finite_ratio:.8f}")
    if finite_ratio < 1.0:
        bad_cnt = int((~np.isfinite(E)).sum())
        print(f"  - WARNING: non-finite count={bad_cnt}")

    norms = np.linalg.norm(E, axis=1)
    print("\n[Norms]")
    print(f"  - min={float(norms.min()):.6f}")
    print(f"  - p25={float(np.quantile(norms, 0.25)):.6f}")
    print(f"  - p50={float(np.median(norms)):.6f}")
    print(f"  - p75={float(np.quantile(norms, 0.75)):.6f}")
    print(f"  - max={float(norms.max()):.6f}")
    print(f"  - zero_norm_ratio={float((norms == 0).mean()):.8f}")

    uniq_id_ratio = len(np.unique(movie_ids)) / len(movie_ids)
    print(f"\n[movieId unique] ratio={uniq_id_ratio:.8f}")
    if uniq_id_ratio < 1.0:
        print("  - WARNING: duplicated movieId 존재")

    # - Duplicate embeddings (sample based); E_s is reused by the later checks
    sample_n = min(n, row_sample_size)
    sample_idx = rng.choice(n, size=sample_n, replace=False)
    E_s = E[sample_idx]

    #   - Direct float comparison is fragile -> round to 4 decimals, then hash the row bytes
    E_round = np.round(E_s, 4)
    hashes = np.fromiter((hash(row.tobytes()) for row in E_round), dtype=np.int64, count=sample_n)
    uniq_vec_ratio = len(np.unique(hashes)) / len(hashes)
    print(f"\n[Embedding uniqueness] (sample={sample_n:,}) unique_ratio={uniq_vec_ratio:.6f}")
    if uniq_vec_ratio < 0.98:
        print("  - WARNING: 동일/유사 벡터가 많이 존재할 수 있음(붕괴 의심)")

    # ----------------------------
    # (3) per-dimension spread ("dead" dimensions)
    # ----------------------------
    # - Standard deviation of each dimension over the sample
    dim_std = E_s.std(axis=0)
    dim_std_min = float(dim_std.min())
    dim_std_med = float(np.median(dim_std))
    dim_std_max = float(dim_std.max())
    dead_dim_ratio = float((dim_std < 1e-4).mean())  # share of dimensions that barely vary
    lowvar_dim_ratio = float((dim_std < (0.1 * dim_std_med + 1e-12)).mean())  # share far below the median std

    print("\n[Per-dimension std] (sample 기반)")
    print(f"  - std_min={dim_std_min:.6e}")
    print(f"  - std_med={dim_std_med:.6e}")
    print(f"  - std_max={dim_std_max:.6e}")
    print(f"  - dead_dim_ratio(std<1e-4)={dead_dim_ratio:.6f}")
    print(f"  - lowvar_dim_ratio(std<0.1*median)={lowvar_dim_ratio:.6f}")
    if dead_dim_ratio > 0.05:
        print("  - WARNING: '죽은 차원'이 5% 초과 → 임베딩 붕괴/전처리 문제 가능")

    # ----------------------------
    # (4) anisotropy (concentration in one direction)
    # ----------------------------
    # - L2-normalize rows so dot products are cosines (epsilon guards zero-norm rows)
    E_s_norm = E_s / (np.linalg.norm(E_s, axis=1, keepdims=True) + 1e-12)

    # (a) Length of the mean unit vector: the closer to 0, the more isotropic
    mean_dir = E_s_norm.mean(axis=0)
    mean_dir_norm = float(np.linalg.norm(mean_dir))
    print("\n[Anisotropy] (sample 기반)")
    print(f"  - ||mean(normalized)|| = {mean_dir_norm:.6f}  (작을수록 골고루)")

    # (b) Distribution of the cosine between each vector and the mean direction
    cos_to_mean = E_s_norm @ (mean_dir / (np.linalg.norm(mean_dir) + 1e-12))
    print("  - cos(x, mean_dir) stats")
    print(f"    * mean={float(cos_to_mean.mean()):.6f}, std={float(cos_to_mean.std()):.6f}")
    print(f"    * p5={float(np.quantile(cos_to_mean, 0.05)):.6f}, p50={float(np.median(cos_to_mean)):.6f}, p95={float(np.quantile(cos_to_mean, 0.95)):.6f}")
    if mean_dir_norm > 0.10:
        print("  - WARNING: 평균 방향 쏠림이 큰 편(이방성) → 임베딩이 한쪽으로 몰릴 수 있음")

    # ----------------------------
    # (5) cosine similarity of random pairs (spread vs. clumped)
    # ----------------------------
    # - Draw random index pairs from the sample and look at their cosine distribution
    pair_m = min(pair_sample_size, sample_n * 20)  # cap the number of pairs
    a_idx = rng.integers(0, sample_n, size=pair_m)
    b_idx = rng.integers(0, sample_n, size=pair_m)

    # - Avoid pairing a row with itself where possible (shift the partner by one)
    same = a_idx == b_idx
    if same.any():
        b_idx[same] = (b_idx[same] + 1) % sample_n

    pair_cos = np.sum(E_s_norm[a_idx] * E_s_norm[b_idx], axis=1)

    print("\n[Pairwise cosine similarity] (random pairs)")
    print(f"  - pairs={pair_m:,}")
    print(f"  - mean={float(pair_cos.mean()):.6f}, std={float(pair_cos.std()):.6f}")
    print(f"  - p1={float(np.quantile(pair_cos, 0.01)):.6f}")
    print(f"  - p5={float(np.quantile(pair_cos, 0.05)):.6f}")
    print(f"  - p50={float(np.median(pair_cos)):.6f}")
    print(f"  - p95={float(np.quantile(pair_cos, 0.95)):.6f}")
    print(f"  - p99={float(np.quantile(pair_cos, 0.99)):.6f}")

    # - All pairs looking alike (high mean AND low spread) hints at clumping
    if float(pair_cos.mean()) > 0.20 and float(pair_cos.std()) < 0.05:
        print("  - WARNING: 랜덤 쌍 코사인이 전반적으로 높고 분산이 작음 → 임베딩이 뭉친(붕괴) 가능성")

    # ----------------------------
    # (6) PCA explained variance (sample): heavy load on a few axes means concentration
    # ----------------------------
    # - PCA through numpy SVD on the centered sample
    X = E_s - E_s.mean(axis=0, keepdims=True)
    # - SVD cost grows with the sample, so keep sample_n moderate
    # - s holds the singular values (descending)
    try:
        _, s, _ = np.linalg.svd(X, full_matrices=False)
        var = (s**2) / (sample_n - 1)
        var_ratio = var / (var.sum() + 1e-12)

        top1 = float(var_ratio[0])
        top5 = float(var_ratio[:5].sum())
        top10 = float(var_ratio[:10].sum())
        # - effective rank = exp(entropy of the variance ratios)
        p = var_ratio / (var_ratio.sum() + 1e-12)
        eff_rank = float(np.exp(-(p * np.log(p + 1e-12)).sum()))

        print("\n[PCA variance ratio] (sample 기반)")
        print(f"  - EVR@1  = {top1:.4f}")
        print(f"  - EVR@5  = {top5:.4f}")
        print(f"  - EVR@10 = {top10:.4f}")
        print(f"  - effective_rank ≈ {eff_rank:.2f} (클수록 골고루)")

        if top1 > 0.20:
            print("  - WARNING: 1개 주성분이 20% 초과 → 특정 축 쏠림 가능")
        if top10 > 0.70:
            print("  - WARNING: 상위 10개 주성분이 70% 초과 → 저차원으로 붕괴 가능")
    except np.linalg.LinAlgError:
        print("\n[PCA] WARNING: SVD 실패(수치 문제). 샘플 크기 줄이거나 non-finite 여부 확인 필요")

    # ----------------------------
    # (7) (optional) print the titles of the first 3 ids (to eyeball the id match)
    # ----------------------------
    if catalog_path is not None and Path(catalog_path).exists():
        df_cat = pd.read_parquet(catalog_path)
        if ID_COL in df_cat.columns and "title" in df_cat.columns:
            cat_map = df_cat.set_index(ID_COL)["title"]
            print("\n[Sample titles]")
            for mid in movie_ids[:3]:
                title = cat_map.get(int(mid), None)
                print(f"  - movieId={int(mid)} | title={title}")
        else:
            print("\n[Catalog] title 컬럼이 없어서 제목 출력은 생략합니다.")

    print("\n========== Done ==========\n")


# ============================================================
# 실행
# ============================================================
run_embedding_diagnostics(
    emb_path=EMB_PATH,
    catalog_path=CATALOG_PATH,   # pass None to skip the sample-title lookup
    expected_dim=300,
    row_sample_size=20000,       # larger is slower; 5,000-20,000 recommended
    pair_sample_size=200000,     # larger is slower; 50,000-200,000 recommended
    seed=7,
)



[Path] /content/drive/MyDrive/대학원/논문/CARTE/data/processed/movie_embeddings.parquet
[Shape] N=86,272 D=300

[Finite] ratio=1.00000000

[Norms]
  - min=11.210655
  - p25=13.306205
  - p50=13.443941
  - p75=13.573166
  - max=14.516237
  - zero_norm_ratio=0.00000000

[movieId unique] ratio=1.00000000

[Embedding uniqueness] (sample=20,000) unique_ratio=0.999100

[Per-dimension std] (sample 기반)
  - std_min=8.914248e-03
  - std_med=1.409741e-01
  - std_max=4.092232e-01
  - dead_dim_ratio(std<1e-4)=0.000000
  - lowvar_dim_ratio(std<0.1*median)=0.003333

[Anisotropy] (sample 기반)
  - ||mean(normalized)|| = 0.974635  (작을수록 골고루)
  - cos(x, mean_dir) stats
    * mean=0.974634, std=0.045045
    * p5=0.930868, p50=0.983698, p95=0.997269

[Pairwise cosine similarity] (random pairs)
  - pairs=200,000
  - mean=0.949908, std=0.071041
  - p1=0.734901
  - p5=0.849358
  - p50=0.969802
  - p95=0.995319
  - p99=0.997838

[PCA variance ratio] (sample 기반)
  - EVR@1  = 0.6907
  - EVR@5  = 0.9738
  - EVR@10 = 0