# CARTE + BERT 하이브리드 임베딩

기존 CARTE 파이프라인에 BERT 텍스트 임베딩을 추가하는 버전.

**하이브리드 방식:**
- `tagline`, `overview` → BERT (sentence-transformers)
- 나머지 피처 (actor, director, genre 등) → fastText (기존 방식)

**구현 방식:**
1. BERT로 tagline+overview 임베딩 (768차원)
2. Linear projection으로 300차원 축소
3. graphlet에 BERT 임베딩 노드 추가
4. CARTE 모델로 최종 임베딩 생성

In [None]:
# 의존성 설치 (필요시)
# !pip install sentence-transformers torch_geometric carte_ai

In [1]:
from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Mapping, Optional, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from tqdm.auto import tqdm

# CARTE
from carte_ai import Table2GraphTransformer
from carte_ai.src.carte_model import CARTE_Base
from carte_ai.configs.directory import config_directory

# 프로젝트 설정
import sys
sys.path.insert(0, str(Path.cwd()))
from config import PROCESSED, PROJECT_ROOT

print(f"Project root: {PROJECT_ROOT}")
print(f"Output path: {PROCESSED.MOVIE_EMBEDDINGS_BERT_PARQUET}")

  from .autonotebook import tqdm as notebook_tqdm


Project root: /Users/jisoo/projects/thesis/carte_test
Output path: /Users/jisoo/projects/thesis/carte_test/data/processed/movie_embeddings_bert.parquet


In [2]:
# ============================================================
# 설정
# ============================================================
@dataclass(frozen=True)
class CatalogSchema:
    """입력 catalog 스키마 정의"""
    id_col: str = "movieId"
    
    # BERT로 임베딩할 텍스트 컬럼
    bert_text_cols: Tuple[str, ...] = ("tagline", "overview")
    
    # fastText로 임베딩할 컬럼 (기존 방식 유지)
    num_cols: Tuple[str, ...] = ("release_year",)
    text_cols_slot: Tuple[str, ...] = (
        "produced_by_company_1",
        "produced_in_country_1",
        "spoken_language_1",
        "actor_1", "actor_2", "actor_3",
        "director_1", "writer_1",
        "genre_1",
    )

    @property
    def required_cols(self) -> Tuple[str, ...]:
        return (self.id_col, *self.bert_text_cols, *self.num_cols, *self.text_cols_slot)


@dataclass
class RunConfig:
    """실행 파라미터"""
    input_path: Path
    out_path: Path
    pretrained_model_path: Path
    batch_size: int = 256
    device: str = "cpu"
    num_layers: int = 0
    verbose: bool = True
    
    # BERT 설정
    bert_model_name: str = "sentence-transformers/all-MiniLM-L6-v2"  # 384차원
    bert_batch_size: int = 64

In [3]:
# ============================================================
# BERT 임베딩 생성
# ============================================================
def combine_text_fields(row: pd.Series, text_cols: Tuple[str, ...]) -> str:
    """여러 텍스트 컬럼을 하나의 문자열로 결합"""
    parts = []
    for col in text_cols:
        val = row.get(col)
        if pd.notna(val) and str(val).strip():
            parts.append(str(val).strip())
    return " ".join(parts) if parts else ""


def compute_bert_embeddings(
    df: pd.DataFrame,
    text_cols: Tuple[str, ...],
    model_name: str,
    batch_size: int = 64,
    device: str = "cpu",
    verbose: bool = True,
) -> np.ndarray:
    """
    BERT(sentence-transformers)로 텍스트 임베딩 생성
    
    Returns:
        (N, bert_dim) 배열
    """
    # 텍스트 결합
    texts = df.apply(lambda row: combine_text_fields(row, text_cols), axis=1).tolist()
    
    # 빈 텍스트 처리
    texts = [t if t else "[empty]" for t in texts]
    
    if verbose:
        non_empty = sum(1 for t in texts if t != "[empty]")
        print(f"[BERT] Non-empty texts: {non_empty:,} / {len(texts):,}")
        print(f"[BERT] Sample text: {texts[0][:200]}...")
    
    # 모델 로드
    model = SentenceTransformer(model_name, device=device)
    
    if verbose:
        print(f"[BERT] Model: {model_name}")
        print(f"[BERT] Embedding dim: {model.get_sentence_embedding_dimension()}")
    
    # 임베딩 생성
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=verbose,
        convert_to_numpy=True,
    )
    
    if verbose:
        print(f"[BERT] Output shape: {embeddings.shape}")
    
    return embeddings.astype(np.float32)

In [4]:
# ============================================================
# 차원 축소 (BERT dim → 300)
# ============================================================
class LinearProjection(nn.Module):
    """BERT 임베딩을 CARTE 입력 차원(300)으로 projection"""
    def __init__(self, input_dim: int, output_dim: int = 300):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.LayerNorm(output_dim),
        )
        # Xavier 초기화
        nn.init.xavier_uniform_(self.proj[0].weight)
        nn.init.zeros_(self.proj[0].bias)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.proj(x)


def project_bert_to_300(
    bert_emb: np.ndarray,
    device: str = "cpu",
) -> np.ndarray:
    """
    BERT 임베딩을 300차원으로 projection
    학습 없이 고정 projection 사용
    """
    input_dim = bert_emb.shape[1]
    
    # 고정 시드로 재현 가능한 projection
    torch.manual_seed(42)
    proj = LinearProjection(input_dim, 300).to(device)
    proj.eval()
    
    with torch.no_grad():
        x = torch.from_numpy(bert_emb).to(device)
        out = proj(x)
        return out.cpu().numpy().astype(np.float32)

In [5]:
# ============================================================
# Graphlet에 BERT 노드 추가
# ============================================================
def add_bert_node_to_graphlet(
    graph: Data,
    bert_vec: np.ndarray,
) -> Data:
    """
    기존 graphlet에 BERT 임베딩 노드를 추가
    
    - 새 노드를 head node(인덱스 0)와 연결
    - edge_attr는 BERT 벡터 자체를 사용
    """
    old_x = graph.x  # (num_nodes, 300)
    old_edge_index = graph.edge_index  # (2, num_edges)
    old_edge_attr = graph.edge_attr  # (num_edges, 300)
    
    num_old_nodes = old_x.shape[0]
    new_node_idx = num_old_nodes
    
    # 새 노드 피처 추가 (BERT 벡터)
    bert_node = torch.from_numpy(bert_vec).unsqueeze(0)  # (1, 300)
    new_x = torch.cat([old_x, bert_node], dim=0)  # (num_nodes+1, 300)
    
    # 새 엣지 추가: head(0) <-> bert_node (양방향)
    new_edges = torch.tensor([[0, new_node_idx], [new_node_idx, 0]], dtype=torch.long)
    new_edge_index = torch.cat([old_edge_index, new_edges], dim=1)
    
    # 새 엣지 속성: BERT 벡터 사용
    new_edge_attrs = torch.from_numpy(np.stack([bert_vec, bert_vec]))  # (2, 300)
    new_edge_attr = torch.cat([old_edge_attr, new_edge_attrs], dim=0)
    
    # 새 Data 객체 생성
    return Data(
        x=new_x,
        edge_index=new_edge_index,
        edge_attr=new_edge_attr,
    )

In [6]:
# ============================================================
# 기존 유틸리티 함수들 (apply_carte_movie_embeddings.ipynb에서 가져옴)
# ============================================================
def normalize_text(v: Any) -> Optional[str]:
    """문자열 정규화"""
    if v is None:
        return None
    if isinstance(v, float) and np.isnan(v):
        return None
    s = str(v)
    s = re.sub(r"\s+", " ", s.strip())
    return s if s else None


def extract_state_dict(ckpt_obj: Any) -> Dict[str, torch.Tensor]:
    """체크포인트에서 state_dict 추출"""
    if hasattr(ckpt_obj, "state_dict") and callable(getattr(ckpt_obj, "state_dict")):
        return dict(ckpt_obj.state_dict())
    if isinstance(ckpt_obj, Mapping):
        for key in ("state_dict", "model_state_dict", "model", "net"):
            if key in ckpt_obj and isinstance(ckpt_obj[key], Mapping):
                return dict(ckpt_obj[key])
        tensor_cnt = sum(isinstance(v, torch.Tensor) for v in ckpt_obj.values())
        if tensor_cnt >= max(1, len(ckpt_obj) // 3):
            return dict(ckpt_obj)
    raise ValueError("checkpoint에서 state_dict를 찾지 못했습니다.")


def strip_common_prefixes(state: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
    """공통 prefix 제거"""
    prefixes = ("model.", "module.", "ft_base.")
    out: Dict[str, torch.Tensor] = {}
    for k, v in state.items():
        nk = k
        changed = True
        while changed:
            changed = False
            for p in prefixes:
                if nk.startswith(p):
                    nk = nk[len(p):]
                    changed = True
        out[nk] = v
    return out


def build_device(device: Optional[str]) -> str:
    """device 자동 선택"""
    if device is None or str(device).strip() == "":
        return "cuda" if torch.cuda.is_available() else "cpu"
    return str(device).strip()


def resolve_pretrained_path(user_path: str = "") -> Path:
    """pretrained 경로 결정"""
    if user_path.strip():
        return Path(user_path.strip())
    default_path = str(config_directory.get("pretrained_model", "")).strip()
    if not default_path:
        raise ValueError("pretrained_model_path가 비어있습니다.")
    return Path(default_path)

In [7]:
# ============================================================
# 모델 로딩
# ============================================================
def load_carte_base(
    *,
    pretrained_model_path: Path,
    device: str,
    num_layers: int,
    verbose: bool,
) -> CARTE_Base:
    """pretrained CARTE_Base 로드"""
    ckpt_path = Path(pretrained_model_path)
    if not ckpt_path.exists():
        raise FileNotFoundError(f"[CKPT] file not found: {ckpt_path}")

    ckpt_obj = torch.load(str(ckpt_path), map_location="cpu")
    raw_state = extract_state_dict(ckpt_obj)
    state = strip_common_prefixes(raw_state)

    # 하이퍼파라미터 추론
    w = state.get("initial_x.0.weight")
    hidden_dim = int(w.shape[0])
    input_dim_x = int(w.shape[1])
    input_dim_e = input_dim_x
    ff_dim = hidden_dim
    num_heads = 12 if hidden_dim % 12 == 0 else 1

    model = CARTE_Base(
        input_dim_x=input_dim_x,
        input_dim_e=input_dim_e,
        hidden_dim=hidden_dim,
        num_layers=int(num_layers),
        ff_dim=int(ff_dim),
        num_heads=int(num_heads),
        concat=True,
        dropout=0.0,
    )

    model.load_state_dict(state, strict=False)

    if verbose:
        print(f"[Model] hidden_dim={hidden_dim} num_layers={num_layers}")
        print(f"[Model] Loaded from: {ckpt_path}")

    model = model.to(torch.device(device))
    model.eval()
    return model

In [8]:
# ============================================================
# Head embedding 추출
# ============================================================
@torch.no_grad()
def extract_head_embeddings(
    graphs: List[Data],
    *,
    model: CARTE_Base,
    batch_size: int,
    device: str,
    verbose: bool,
) -> np.ndarray:
    """각 graphlet의 head node embedding 추출"""
    if not graphs:
        raise ValueError("graphs가 비어있습니다.")

    loader = DataLoader(graphs, batch_size=int(batch_size), shuffle=False)

    outs: List[np.ndarray] = []
    for step, batch in enumerate(tqdm(loader, disable=not verbose, desc="Extracting embeddings")):
        batch = batch.to(torch.device(device))
        x_out = model(batch.x, batch.edge_index, batch.edge_attr)
        
        # head indices from ptr
        head_idx = batch.ptr[:-1] if hasattr(batch, "ptr") else torch.arange(len(batch))
        head_emb = x_out[head_idx].detach().cpu().numpy()
        outs.append(head_emb)

    emb = np.vstack(outs)
    
    if verbose:
        print(f"[Embeddings] shape: {emb.shape}")
    
    return emb

In [9]:
# ============================================================
# 메인 파이프라인
# ============================================================
def run_hybrid_pipeline(cfg: RunConfig, schema: CatalogSchema) -> Path:
    """
    BERT + CARTE 하이브리드 파이프라인
    
    1. 카탈로그 로드
    2. BERT로 tagline+overview 임베딩
    3. Table2GraphTransformer로 기본 graphlet 생성
    4. 각 graphlet에 BERT 노드 추가
    5. CARTE 모델로 최종 임베딩 생성
    """
    if not cfg.input_path.exists():
        raise FileNotFoundError(f"input_path not found: {cfg.input_path}")

    cfg.out_path.parent.mkdir(parents=True, exist_ok=True)

    if cfg.verbose:
        print(f"[IO] input: {cfg.input_path}")
        print(f"[IO] output: {cfg.out_path}")
        print(f"[Config] BERT model: {cfg.bert_model_name}")
        print(f"[Config] device: {cfg.device}")

    # 1) 카탈로그 로드
    df_raw = pd.read_parquet(cfg.input_path)
    
    # 필수 컬럼 확인
    missing = [c for c in schema.required_cols if c not in df_raw.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")
    
    df = df_raw.dropna(subset=[schema.id_col]).reset_index(drop=True)
    movie_ids = df[schema.id_col].astype(int).to_numpy()
    
    if cfg.verbose:
        print(f"[Data] rows: {len(df):,}")

    # 2) BERT 임베딩 생성
    print("\n=== Step 1: BERT Embedding ===")
    bert_emb = compute_bert_embeddings(
        df,
        schema.bert_text_cols,
        model_name=cfg.bert_model_name,
        batch_size=cfg.bert_batch_size,
        device=cfg.device,
        verbose=cfg.verbose,
    )
    
    # 3) BERT 임베딩을 300차원으로 projection
    print("\n=== Step 2: Dimension Projection ===")
    bert_300 = project_bert_to_300(bert_emb, device=cfg.device)
    if cfg.verbose:
        print(f"[Projection] {bert_emb.shape} -> {bert_300.shape}")

    # 4) fastText 기반 graphlet 생성 (기존 방식)
    print("\n=== Step 3: Building Graphlets (fastText) ===")
    
    # 기존 피처 테이블 준비
    X_fasttext = df.loc[:, [*schema.num_cols, *schema.text_cols_slot]].copy()
    
    # numeric: NaN 유지
    for c in schema.num_cols:
        X_fasttext[c] = pd.to_numeric(X_fasttext[c], errors="coerce")
    
    # text: 정규화
    for c in schema.text_cols_slot:
        X_fasttext[c] = X_fasttext[c].apply(normalize_text).astype("object")
    
    # fastText 모델 다운로드 및 graphlet 생성
    fasttext_bin_path = hf_hub_download(
        repo_id="hi-paris/fastText",
        filename="cc.en.300.bin",
    )
    
    preprocessor = Table2GraphTransformer(
        lm_model="fasttext",
        fasttext_model_path=fasttext_bin_path,
    )
    
    graphs = preprocessor.fit_transform(X_fasttext)
    
    if cfg.verbose:
        print(f"[Graphlets] count: {len(graphs):,}")
        g0 = graphs[0]
        print(f"[Graphlets] sample0: x={tuple(g0.x.shape)}, edge_attr={tuple(g0.edge_attr.shape)}")

    # 5) 각 graphlet에 BERT 노드 추가
    print("\n=== Step 4: Adding BERT Nodes to Graphlets ===")
    graphs_with_bert = []
    for i, (g, bv) in enumerate(tqdm(zip(graphs, bert_300), total=len(graphs), desc="Adding BERT nodes", disable=not cfg.verbose)):
        g_new = add_bert_node_to_graphlet(g, bv)
        graphs_with_bert.append(g_new)
    
    if cfg.verbose:
        g0_new = graphs_with_bert[0]
        print(f"[Graphlets+BERT] sample0: x={tuple(g0_new.x.shape)}, edge_attr={tuple(g0_new.edge_attr.shape)}")

    # 6) CARTE 모델 로드
    print("\n=== Step 5: Loading CARTE Model ===")
    model = load_carte_base(
        pretrained_model_path=cfg.pretrained_model_path,
        device=cfg.device,
        num_layers=cfg.num_layers,
        verbose=cfg.verbose,
    )

    # 7) 임베딩 추출
    print("\n=== Step 6: Extracting Embeddings ===")
    emb = extract_head_embeddings(
        graphs_with_bert,
        model=model,
        batch_size=cfg.batch_size,
        device=cfg.device,
        verbose=cfg.verbose,
    )

    # 8) 저장
    out_df = pd.DataFrame({
        schema.id_col: movie_ids,
        "embedding": [e.astype(np.float32).tolist() for e in emb],
    })
    out_df.to_parquet(cfg.out_path, index=False)

    if cfg.verbose:
        print(f"\n[OK] Saved: {cfg.out_path}")
        print(f"[OK] rows={len(out_df):,}, dim={emb.shape[1]}")

    return cfg.out_path

In [10]:
# ============================================================
# 실행
# ============================================================
schema = CatalogSchema()

cfg = RunConfig(
    input_path=PROCESSED.MOVIE_CATALOG_PARQUET,
    out_path=PROCESSED.MOVIE_EMBEDDINGS_BERT_PARQUET,
    pretrained_model_path=resolve_pretrained_path(),
    batch_size=256,
    device=build_device(None),  # auto-detect
    num_layers=0,
    verbose=True,
    bert_model_name="sentence-transformers/all-MiniLM-L6-v2",  # 빠른 모델 (384차원)
    bert_batch_size=64,
)

print(f"Device: {cfg.device}")
print(f"BERT model: {cfg.bert_model_name}")
print()

Device: cpu
BERT model: sentence-transformers/all-MiniLM-L6-v2



In [11]:
# 파이프라인 실행
out_path = run_hybrid_pipeline(cfg, schema)
print(f"\nOutput saved to: {out_path}")

[IO] input: /Users/jisoo/projects/thesis/carte_test/data/processed/movie_catalog_flat.parquet
[IO] output: /Users/jisoo/projects/thesis/carte_test/data/processed/movie_embeddings_bert.parquet
[Config] BERT model: sentence-transformers/all-MiniLM-L6-v2
[Config] device: cpu
[Data] rows: 86,272

=== Step 1: BERT Embedding ===
[BERT] Non-empty texts: 85,995 / 86,272
[BERT] Sample text: ... look closer Lester Burnham, a depressed suburban father in a mid-life crisis, decides to turn his hectic life around after developing an infatuation with his daughter's attractive friend....
[BERT] Model: sentence-transformers/all-MiniLM-L6-v2
[BERT] Embedding dim: 384


Batches: 100%|██████████| 1348/1348 [07:39<00:00,  2.93it/s]


[BERT] Output shape: (86272, 384)

=== Step 2: Dimension Projection ===
[Projection] (86272, 384) -> (86272, 300)

=== Step 3: Building Graphlets (fastText) ===
[Graphlets] count: 86,272
[Graphlets] sample0: x=(11, 300), edge_attr=(20, 300)

=== Step 4: Adding BERT Nodes to Graphlets ===


Adding BERT nodes: 100%|██████████| 86272/86272 [00:06<00:00, 12921.46it/s]


[Graphlets+BERT] sample0: x=(12, 300), edge_attr=(22, 300)

=== Step 5: Loading CARTE Model ===
[Model] hidden_dim=300 num_layers=0
[Model] Loaded from: /Users/jisoo/projects/thesis/carte_test/.venv/lib/python3.11/site-packages/carte_ai/data/etc/kg_pretrained.pt

=== Step 6: Extracting Embeddings ===


Extracting embeddings: 100%|██████████| 337/337 [00:14<00:00, 23.84it/s]


[Embeddings] shape: (86272, 300)

[OK] Saved: /Users/jisoo/projects/thesis/carte_test/data/processed/movie_embeddings_bert.parquet
[OK] rows=86,272, dim=300

Output saved to: /Users/jisoo/projects/thesis/carte_test/data/processed/movie_embeddings_bert.parquet


In [13]:
# ============================================================
# 결과 확인 (간단 진단)
# ============================================================
df_result = pd.read_parquet(out_path)
emb_list = df_result["embedding"].tolist()
E = np.array(emb_list, dtype=np.float32)

print(f"Shape: {E.shape}")
print(f"Norm range: [{E.min():.4f}, {E.max():.4f}]")

# 정규화 후 랜덤 쌍 코사인 유사도
E_norm = E / (np.linalg.norm(E, axis=1, keepdims=True) + 1e-12)

rng = np.random.default_rng(42)
n = min(10000, len(E))
idx_a = rng.integers(0, len(E), size=n)
idx_b = rng.integers(0, len(E), size=n)
cos_sim = np.sum(E_norm[idx_a] * E_norm[idx_b], axis=1)

print(f"\nRandom pair cosine similarity:")
print(f"  mean: {cos_sim.mean():.4f}")
print(f"  std:  {cos_sim.std():.4f}")
print(f"  p5:   {np.percentile(cos_sim, 5):.4f}")
print(f"  p50:  {np.percentile(cos_sim, 50):.4f}")
print(f"  p95:  {np.percentile(cos_sim, 95):.4f}")

Shape: (86272, 300)
Norm range: [nan, nan]

Random pair cosine similarity:
  mean: nan
  std:  nan
  p5:   nan
  p50:  nan
  p95:  nan
