# Knowledge Graph + BERT 기반 영화 임베딩 생성

기존 KG GNN에 BERT 텍스트 임베딩(tagline, overview)을 추가한 버전입니다.

**하이브리드 방식:**
- 영화 노드: BERT(tagline + overview) 임베딩 사용
- 엔티티 노드: fastText 임베딩 (기존 방식 유지)
- 그래프 구조: 영화-엔티티 Knowledge Graph

**vs kg_gnn (기존):**
- kg_gnn: 영화 노드를 release_year로만 초기화
- kg_gnn_bert: 영화 노드를 BERT(tagline+overview)로 초기화

In [None]:
import sys
sys.path.insert(0, "/Users/jisoo/projects/thesis/carte_test")

from config import PROCESSED

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import HeteroConv, SAGEConv
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import fasttext
import matplotlib.pyplot as plt

print(f"PyTorch: {torch.__version__}")

## 1. 데이터 로드

In [None]:
# Load the movie catalog and the ratings table from the parquet paths in PROCESSED.
catalog = pd.read_parquet(PROCESSED.MOVIE_CATALOG_PARQUET)
ratings = pd.read_parquet(PROCESSED.RATINGS_PARQUET)

# Keep only catalog rows whose movieId occurs in ratings; reset_index(drop=True)
# renumbers the surviving rows from 0.
movie_ids_in_ratings = ratings['movieId'].unique()
catalog_filtered = catalog[catalog['movieId'].isin(movie_ids_in_ratings)].reset_index(drop=True)

print(f"영화 수: {len(catalog_filtered):,}")
catalog_filtered.head()

## 2. BERT 임베딩 생성 (tagline + overview)

In [None]:
def combine_text_fields(row, text_cols):
    """Join the non-empty text fields of one row into a single string.

    Args:
        row: Mapping-like record exposing ``.get(column)`` (a dict or a
            pandas Series row both work).
        text_cols: Column names to read, in the order they should appear.

    Returns:
        The stripped field values separated by single spaces. Missing,
        null and whitespace-only fields are skipped; an empty string is
        returned when nothing usable remains.
    """
    pieces = []
    for name in text_cols:
        raw = row.get(name)
        if pd.isna(raw):
            continue
        cleaned = str(raw).strip()
        if cleaned:
            pieces.append(cleaned)
    # Joining an empty list already yields "".
    return " ".join(pieces)


def compute_bert_embeddings(df, text_cols, model_name, batch_size=64, device="cpu"):
    """Encode each row's combined text columns with a sentence-transformers model.

    Args:
        df: DataFrame holding the text columns.
        text_cols: Column names joined, in order, into one string per row.
        model_name: Model identifier passed to ``SentenceTransformer``.
        batch_size: Batch size forwarded to ``model.encode``.
        device: Device string forwarded to ``SentenceTransformer``.

    Returns:
        The array produced by ``model.encode`` for the row texts, cast to
        ``float32``.
    """
    placeholder = "[empty]"

    # One string per row; rows without usable text get the placeholder so
    # the encoder is never handed an empty string.
    combined = df.apply(lambda row: combine_text_fields(row, text_cols), axis=1).tolist()
    texts = [text or placeholder for text in combined]

    non_empty = len([text for text in texts if text != placeholder])
    print(f"[BERT] Non-empty texts: {non_empty:,} / {len(texts):,}")
    print(f"[BERT] Sample text: {texts[0][:200]}...")

    encoder = SentenceTransformer(model_name, device=device)
    print(f"[BERT] Model: {model_name}")
    print(f"[BERT] Embedding dim: {encoder.get_sentence_embedding_dimension()}")

    vectors = encoder.encode(
        texts,
        show_progress_bar=True,
        convert_to_numpy=True,
        batch_size=batch_size,
    )
    print(f"[BERT] Output shape: {vectors.shape}")

    return vectors.astype(np.float32)

In [None]:
# Generate BERT embeddings for the filtered catalog.
BERT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dimensional output
TEXT_COLS = ["tagline", "overview"]

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}\n")

bert_embeddings = compute_bert_embeddings(
    catalog_filtered,
    TEXT_COLS,
    model_name=BERT_MODEL,
    batch_size=64,
    device=device
)

In [None]:
# BERT 임베딩을 300차원으로 projection
class LinearProjection(nn.Module):
    """Linear layer followed by LayerNorm, used to resize BERT embeddings.

    The linear weight is Xavier-uniform initialised and its bias is zeroed.

    Args:
        input_dim: Width of the incoming embeddings.
        output_dim: Width of the projected embeddings (300 by default).
    """

    def __init__(self, input_dim, output_dim=300):
        super().__init__()
        linear = nn.Linear(input_dim, output_dim)
        nn.init.xavier_uniform_(linear.weight)
        nn.init.zeros_(linear.bias)
        # Kept as a two-element Sequential so parameter names stay
        # ``proj.0.*`` (linear) and ``proj.1.*`` (layer norm).
        self.proj = nn.Sequential(linear, nn.LayerNorm(output_dim))

    def forward(self, x):
        """Return the projected and layer-normalised version of ``x``."""
        projected = self.proj(x)
        return projected


def project_bert_to_300(bert_emb, device="cpu"):
    """Map BERT embeddings to 300 dimensions with an untrained projection.

    The projection is a freshly initialised ``LinearProjection`` that is never
    trained here; the global torch seed is set to 42 first, which fixes its
    initial weights.

    Args:
        bert_emb: 2-D NumPy array of embeddings (rows are items).
        device: Torch device on which the projection is evaluated.

    Returns:
        A ``float32`` NumPy array with 300 columns and one row per input row.
    """
    torch.manual_seed(42)
    projector = LinearProjection(bert_emb.shape[1], 300).to(device)
    projector.eval()

    inputs = torch.from_numpy(bert_emb).to(device)
    with torch.no_grad():
        projected = projector(inputs)

    return projected.cpu().numpy().astype(np.float32)


# Project the BERT embeddings to 300 dimensions, the width the GNN is built with (in_dim=300).
bert_300 = project_bert_to_300(bert_embeddings, device=device)
print(f"Projected shape: {bert_300.shape}")

## 3. Knowledge Graph 구축

In [None]:
# Entity node types, each mapped to the catalog columns that supply its values.
# Only 'actor' reads several columns; every other type reads a single column.
ENTITY_TYPES = {
    'actor': ['actor_1', 'actor_2', 'actor_3'],
    'director': ['director_1'],
    'writer': ['writer_1'],
    'genre': ['genre_1'],
    'company': ['produced_by_company_1'],
    'country': ['produced_in_country_1'],
    'language': ['spoken_language_1'],
}

def build_knowledge_graph(df, movie_bert_emb):
    """Build a heterogeneous movie-entity knowledge graph with BERT movie features.

    Movie nodes are indexed by row position in ``df``. Each entity type in
    ``ENTITY_TYPES`` gets one node per distinct non-null value found in its
    columns, indexed in sorted order. Every ``movie -> entity`` edge type is
    stored together with a reversed ``entity -> movie`` edge type.

    Args:
        df: Catalog with a ``movieId`` column and any subset of the columns
            listed in ``ENTITY_TYPES`` (columns that are absent are ignored).
        movie_bert_emb: NumPy array of movie features, one row per row of
            ``df`` in the same order.

    Returns:
        Tuple ``(data, movie_to_idx, entity_to_idx)``: the ``HeteroData``
        graph, the movieId -> node index mapping, and a per-entity-type
        mapping of entity value -> node index.

    Raises:
        ValueError: If ``movie_bert_emb`` does not have exactly one row per
            row of ``df``, or if ``df`` contains duplicate ``movieId`` values
            (either would silently misalign features, nodes and edges).
    """
    # Movie id mapping; row position doubles as the movie node index.
    movie_ids = df['movieId'].tolist()
    if len(movie_bert_emb) != len(movie_ids):
        raise ValueError(
            f"movie_bert_emb has {len(movie_bert_emb)} rows but df has {len(movie_ids)} rows"
        )
    movie_to_idx = {mid: i for i, mid in enumerate(movie_ids)}
    if len(movie_to_idx) != len(movie_ids):
        raise ValueError("df contains duplicate movieId values; each movie must appear once")

    # Resolve once which configured columns actually exist, instead of
    # re-checking df.columns for every row.
    present_cols = {
        etype: [col for col in cols if col in df.columns]
        for etype, cols in ENTITY_TYPES.items()
    }

    # Collect the distinct values of every entity type and index them in sorted order.
    entity_to_idx = {}
    for etype, cols in present_cols.items():
        unique_vals = set()
        for col in cols:
            unique_vals.update(df[col].dropna().unique())
        entity_to_idx[etype] = {v: i for i, v in enumerate(sorted(unique_vals))}

    # Build the edge lists. Columns are read as plain lists so the loop avoids
    # the per-row Series construction of DataFrame.iterrows(); edges are still
    # appended row by row, then column by column, as before.
    edges = {etype: ([], []) for etype in ENTITY_TYPES}
    col_values = {col: df[col].tolist() for cols in present_cols.values() for col in cols}

    for movie_idx in tqdm(range(len(movie_ids)), total=len(movie_ids), desc="Building edges"):
        for etype, cols in present_cols.items():
            mapping = entity_to_idx[etype]
            src, dst = edges[etype]
            for col in cols:
                entity_val = col_values[col][movie_idx]
                if pd.isna(entity_val):
                    continue
                entity_idx = mapping.get(entity_val)
                if entity_idx is not None:
                    src.append(movie_idx)
                    dst.append(entity_idx)

    # Assemble the HeteroData object.
    data = HeteroData()

    # Movie nodes carry the supplied BERT-based features.
    data['movie'].x = torch.from_numpy(movie_bert_emb)
    data['movie'].num_nodes = len(movie_ids)

    # Entity nodes only get their counts here; features are attached by the caller.
    for etype, mapping in entity_to_idx.items():
        data[etype].num_nodes = len(mapping)

    # Register each non-empty edge type in both directions.
    for etype, (src, dst) in edges.items():
        if src:
            edge_index = torch.tensor([src, dst], dtype=torch.long)
            data['movie', f'has_{etype}', etype].edge_index = edge_index
            data[etype, f'has_{etype}_rev', 'movie'].edge_index = edge_index.flip(0)

    return data, movie_to_idx, entity_to_idx

In [None]:
# Build the knowledge graph, using the projected BERT vectors as movie node features.
kg_data, movie_to_idx, entity_to_idx = build_knowledge_graph(catalog_filtered, bert_300)

# Report node counts per type.
print("\n[Knowledge Graph 통계]")
print(f"  영화 노드: {kg_data['movie'].num_nodes:,} (BERT 초기화)")
for etype in entity_to_idx:
    print(f"  {etype} 노드: {kg_data[etype].num_nodes:,}")

# Sum the edge counts over every edge type in the graph (forward and reverse types are both counted).
total_edges = sum(kg_data[et].edge_index.size(1) for et in kg_data.edge_types)
print(f"\n  총 엣지 수: {total_edges:,}")
print(f"  엣지 타입: {len(kg_data.edge_types)}")

## 4. 엔티티 노드 초기화 (fastText)

In [None]:
# Fetch cc.en.300.bin from the hi-paris/fastText Hugging Face repo and load it with fasttext.
print("fastText 모델 로딩...")
ft_path = hf_hub_download(repo_id="hi-paris/fastText", filename="cc.en.300.bin")
ft_model = fasttext.load_model(ft_path)
print("완료!")

In [None]:
def get_entity_embeddings(mapping, ft_model, dim=300):
    """Embed entity names with fastText, one row per entity index.

    Args:
        mapping: Dict of entity value -> row index; the indices are expected
            to cover ``0 .. len(mapping) - 1``.
        ft_model: Object exposing ``get_sentence_vector(text)`` that returns a
            vector of length ``dim`` (a loaded fastText model).
        dim: Width of the vectors returned by ``ft_model``.

    Returns:
        A ``float32`` torch tensor of shape ``(len(mapping), dim)`` whose row
        ``i`` is the sentence vector of the entity mapped to index ``i``.
    """
    emb = np.zeros((len(mapping), dim), dtype=np.float32)

    for entity_name, idx in mapping.items():
        # fastText's get_sentence_vector raises ValueError on text containing
        # a newline, so line breaks are replaced by spaces first.
        text = str(entity_name).replace("\n", " ")
        emb[idx] = ft_model.get_sentence_vector(text)

    return torch.from_numpy(emb)

# Attach fastText-based initial features to the nodes of every entity type.
print("엔티티 임베딩 생성...")
for etype, mapping in entity_to_idx.items():
    kg_data[etype].x = get_entity_embeddings(mapping, ft_model)
    print(f"  {etype}: {kg_data[etype].x.shape}")

# Movie features were already set when the graph was built.
print(f"  movie: {kg_data['movie'].x.shape} (BERT)")

## 5. Heterogeneous GNN 모델

In [None]:
class HeteroGNN(nn.Module):
    """Stack of ``HeteroConv`` layers, each holding one ``SAGEConv`` per edge type.

    Args:
        in_dim: Feature width fed to the first layer.
        hidden_dim: Feature width between layers.
        out_dim: Feature width produced by the last layer.
        edge_types: Edge types for which a ``SAGEConv`` is created in every layer.
        num_layers: Number of ``HeteroConv`` layers.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, edge_types, num_layers=2):
        super().__init__()
        self.num_layers = num_layers

        layers = []
        for depth in range(num_layers):
            # The first layer consumes in_dim and the last one emits out_dim;
            # everything in between works at hidden_dim.
            fan_in = hidden_dim if depth > 0 else in_dim
            fan_out = out_dim if depth == num_layers - 1 else hidden_dim
            per_edge = {edge_type: SAGEConv(fan_in, fan_out) for edge_type in edge_types}
            layers.append(HeteroConv(per_edge, aggr='mean'))
        self.convs = nn.ModuleList(layers)

    def forward(self, x_dict, edge_index_dict):
        """Run every layer in turn, applying ReLU after all layers but the last."""
        for depth, conv in enumerate(self.convs):
            x_dict = conv(x_dict, edge_index_dict)
            if depth < self.num_layers - 1:
                x_dict = {node_type: feats.relu() for node_type, feats in x_dict.items()}
        return x_dict

In [None]:
# Create the model on CUDA when available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

# Two-layer GNN that keeps every feature width at 300.
model = HeteroGNN(
    in_dim=300,
    hidden_dim=300,
    out_dim=300,
    edge_types=kg_data.edge_types,
    num_layers=2
).to(device)

print(f"\n모델 파라미터: {sum(p.numel() for p in model.parameters()):,}")

## 6. 학습 (Link Prediction)

In [None]:
def train_kg_embeddings(model, data, epochs=100, lr=0.01):
    """Train node embeddings with an unsupervised link-prediction objective.

    Each epoch runs one full-graph forward pass. For every ``movie -> entity``
    edge, the dot product of its endpoint embeddings is compared with the dot
    product against a uniformly sampled destination node of the same type,
    using a BPR loss. Per-edge-type losses are summed.

    Args:
        model: Module called as ``model(data.x_dict, data.edge_index_dict)``
            and returning a dict of node type -> embedding tensor.
        data: Heterogeneous graph exposing ``edge_types``, ``x_dict``,
            ``edge_index_dict`` and per-type ``edge_index`` / ``num_nodes`` / ``x``.
        epochs: Number of optimisation steps.
        lr: Adam learning rate.

    Returns:
        List with the total loss of every epoch.

    Raises:
        ValueError: If the graph has no non-empty edge type whose source is
            ``'movie'``; there would be nothing to optimise.
    """
    # Edge types to train on: movie -> entity, and only those that hold edges.
    # Checked up front so an empty graph fails with a clear message instead of
    # calling .backward() on the integer 0.
    train_edge_types = [
        et for et in data.edge_types
        if et[0] == 'movie' and data[et].edge_index.size(1) > 0
    ]
    if not train_edge_types:
        raise ValueError("data has no non-empty ('movie', *, *) edge types to train on")

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    losses = []
    pbar = tqdm(range(epochs), desc="Training")

    for _epoch in pbar:
        optimizer.zero_grad()

        # Forward pass over the whole graph.
        out_dict = model(data.x_dict, data.edge_index_dict)

        total_loss = 0
        for edge_type in train_edge_types:
            src_type, _rel, dst_type = edge_type
            edge_index = data[edge_type].edge_index

            # Positive samples: the observed edges.
            src_emb = out_dict[src_type][edge_index[0]]
            dst_emb = out_dict[dst_type][edge_index[1]]
            pos_score = (src_emb * dst_emb).sum(dim=1)

            # Negative samples: one random destination node per observed edge.
            neg_dst_idx = torch.randint(
                0,
                data[dst_type].num_nodes,
                (edge_index.size(1),),
                device=data[dst_type].x.device,
            )
            neg_dst_emb = out_dict[dst_type][neg_dst_idx]
            neg_score = (src_emb * neg_dst_emb).sum(dim=1)

            # BPR loss: push the positive score above the negative one.
            total_loss = total_loss + -F.logsigmoid(pos_score - neg_score).mean()

        total_loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the history and the progress bar.
        loss_value = total_loss.item()
        losses.append(loss_value)
        pbar.set_postfix({'loss': f'{loss_value:.2f}'})

    return losses

In [None]:
# Move the graph to the same device as the model.
kg_data = kg_data.to(device)

# Run training and keep the per-epoch loss history.
losses = train_kg_embeddings(model, kg_data, epochs=100, lr=0.01)

In [None]:
# Plot the training loss per epoch.
plt.figure(figsize=(10, 4))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Knowledge Graph + BERT Embedding Training Loss')
plt.grid(True, alpha=0.3)
plt.show()

# Loss of the last epoch.
print(f"최종 Loss: {losses[-1]:.4f}")

## 7. 임베딩 추출 및 저장

In [None]:
# Extract the final movie embeddings: one forward pass in eval mode without gradients,
# then copy the 'movie' output to a NumPy array on the CPU.
model.eval()
with torch.no_grad():
    out_dict = model(kg_data.x_dict, kg_data.edge_index_dict)
    movie_embeddings = out_dict['movie'].cpu().numpy()

print(f"영화 임베딩 shape: {movie_embeddings.shape}")

In [None]:
# Recover the movieId of every node index by inverting movie_to_idx.
idx_to_movie = {v: k for k, v in movie_to_idx.items()}
movie_ids = np.array([idx_to_movie[i] for i in range(len(movie_to_idx))])

# Save as parquet under <PROCESSED.DIR>/ablation_embeddings.
# NOTE(review): mkdir is called without parents=True, so PROCESSED.DIR must already exist.
EMBEDDINGS_DIR = PROCESSED.DIR / "ablation_embeddings"
EMBEDDINGS_DIR.mkdir(exist_ok=True)

# One row per movie; each embedding vector is stored as a Python list.
df_emb = pd.DataFrame({
    'movieId': movie_ids,
    'embedding': [e.tolist() for e in movie_embeddings]
})

save_path = EMBEDDINGS_DIR / "emb_kg_gnn_bert.parquet"
df_emb.to_parquet(save_path, index=False)

# Report the output path, the row count and the file size in MB.
print(f"저장 완료: {save_path}")
print(f"  영화 수: {len(df_emb):,}")
print(f"  파일 크기: {save_path.stat().st_size / 1024 / 1024:.1f} MB")

## 8. 임베딩 품질 확인

In [None]:
def compute_embedding_stats(emb, n_pairs=50000, seed=42):
    """Compute simple quality statistics for a set of embeddings.

    Rows whose L2 norm is at most ``1e-10`` are dropped and the rest are
    L2-normalised. Two kinds of statistics are then reported:

    * ``anisotropy``: norm of the mean normalised vector.
    * ``pair_cos_*``: mean, standard deviation and 5th / 95th percentiles of
      the cosine similarity between randomly sampled pairs of distinct rows.

    Args:
        emb: 2-D NumPy array, one embedding per row.
        n_pairs: Number of random pairs to sample.
        seed: Seed of the NumPy random generator used for sampling.

    Returns:
        Dict with keys ``anisotropy``, ``pair_cos_mean``, ``pair_cos_std``,
        ``pair_cos_p5`` and ``pair_cos_p95``. A statistic that is undefined
        for the input is reported as NaN: all of them when no row has a
        usable norm, and the pair statistics when fewer than two rows remain
        or ``n_pairs`` is not positive.
    """
    norms = np.linalg.norm(emb, axis=1)
    valid = norms > 1e-10
    emb_norm = emb[valid] / norms[valid, None]
    n = len(emb_norm)

    nan = float("nan")

    # Anisotropy needs at least one vector to average.
    anisotropy = float(np.linalg.norm(emb_norm.mean(axis=0))) if n > 0 else nan

    if n < 2 or n_pairs <= 0:
        # Without two distinct rows there is no pair to compare.
        pair_mean = pair_std = pair_p5 = pair_p95 = nan
    else:
        rng = np.random.default_rng(seed)
        idx_a = rng.integers(0, n, size=n_pairs)
        idx_b = rng.integers(0, n, size=n_pairs)
        # Shift collisions to the next row so no row is paired with itself.
        same = idx_a == idx_b
        idx_b[same] = (idx_b[same] + 1) % n
        pair_cos = np.sum(emb_norm[idx_a] * emb_norm[idx_b], axis=1)

        pair_mean = float(np.mean(pair_cos))
        pair_std = float(np.std(pair_cos))
        pair_p5 = float(np.percentile(pair_cos, 5))
        pair_p95 = float(np.percentile(pair_cos, 95))

    return {
        'anisotropy': anisotropy,
        'pair_cos_mean': pair_mean,
        'pair_cos_std': pair_std,
        'pair_cos_p5': pair_p5,
        'pair_cos_p95': pair_p95,
    }

# Compute and print the quality statistics of the learned movie embeddings.
stats = compute_embedding_stats(movie_embeddings)

print("[KG GNN + BERT 임베딩 품질]")
print(f"  Anisotropy:      {stats['anisotropy']:.4f}")
print(f"  Pair Cos Mean:   {stats['pair_cos_mean']:.4f}")
print(f"  Pair Cos Std:    {stats['pair_cos_std']:.4f}")
print(f"  Pair Cos p5:     {stats['pair_cos_p5']:.4f}")
print(f"  Pair Cos p95:    {stats['pair_cos_p95']:.4f}")

In [None]:
# Compare against the other version.
# NOTE: the 'KG GNN' column holds hard-coded approximate strings; only the
# 'KG GNN+BERT' column is computed in this notebook (from `stats`).
print("\n[버전별 비교]")
print("="*65)
print(f"{'지표':<20} {'KG GNN':>15} {'KG GNN+BERT':>15}")
print("-"*65)
print(f"{'Anisotropy':<20} {'~0.70':>15} {stats['anisotropy']:>15.4f}")
print(f"{'Pair Cos Mean':<20} {'~0.49':>15} {stats['pair_cos_mean']:>15.4f}")
print(f"{'Pair Cos p95':<20} {'~0.94':>15} {stats['pair_cos_p95']:>15.4f}")
print("="*65)
print("\n→ BERT 텍스트 정보 추가로 영화 콘텐츠 기반 유사도 반영")