In [1]:
# =========================
# [Cell 1] Settings / paths
# =========================
from pathlib import Path

# Artifact folder created in step 01; metapath results are stored there as well.
BASE_DIR = Path("/root/heco")
ART_DIR = BASE_DIR.joinpath("artifacts")
OUT_DIR = ART_DIR

# Optional parameters
# - topK: per-paper cap on the number of metapath neighbors. None means no limit.
TOPK_PAP = None      # e.g. 100
TOPK_PCP = None      # e.g. 100

# Whether a paper is excluded from its own neighbor list.
EXCLUDE_SELF = True

# Fixed seed for the random sampling applied when a neighbor list exceeds the cap.
RNG_SEED = 42

print(f"ART_DIR: {ART_DIR}")
print(f"OUT_DIR: {OUT_DIR}")


ART_DIR: /root/heco/artifacts
OUT_DIR: /root/heco/artifacts


In [2]:
# =========================
# [Cell 2] 라이브러리 로드 & 유틸
# =========================
import json
import numpy as np
import pandas as pd

def load_npz(path):
    """Load the payload array from an ``.npz`` archive.

    The archive is expected to store its data under the key ``"E"`` or
    ``"X"``; ``"E"`` is preferred when both are present.

    Raises:
        ValueError: if the archive contains neither key.
    """
    with np.load(path) as archive:
        for key in ("E", "X"):
            if key in archive:
                return archive[key]
        raise ValueError(f"Unsupported npz keys in {path}: {list(archive.keys())}")

def build_adj(num_nodes: int, edges: np.ndarray, bidirectional: bool=False):
    """
    Build a simple CSR adjacency list from an edge array.

    Parameters
    ----------
    num_nodes : int
        Number of source-side nodes; the returned ``indptr`` has
        ``num_nodes + 1`` entries.
    edges : array-like, shape [N, 2]
        Integer (src, dst) pairs. Only the first two columns are read.
    bidirectional : bool
        When True, the reversed pair (dst, src) is added for every edge.

    Returns
    -------
    (indptr, indices) : tuple of int64 arrays
        Neighbors of node ``u`` are ``indices[indptr[u]:indptr[u+1]]``.
        Within a row, neighbors keep their input order (stable sort); with
        ``bidirectional=True`` forward edges precede the reversed ones.

    Raises
    ------
    ValueError
        If ``edges`` is not 2-D with at least two columns, or a source id
        falls outside ``[0, num_nodes)``.
    TypeError
        If ``edges`` is non-empty and does not hold integers.
    """
    edges = np.asarray(edges)
    if edges.size == 0:
        # np.empty((0, 2)) defaults to float64, which np.bincount rejects;
        # an empty graph needs no sorting at all.
        return np.zeros(num_nodes + 1, dtype=np.int64), np.empty(0, dtype=np.int64)
    if edges.ndim != 2 or edges.shape[1] < 2:
        raise ValueError(f"edges must have shape [N, 2], got {edges.shape}")
    if not np.issubdtype(edges.dtype, np.integer):
        raise TypeError(f"edges must hold integer node ids, got dtype {edges.dtype}")

    src = edges[:, 0].astype(np.int64, copy=False)
    dst = edges[:, 1].astype(np.int64, copy=False)
    if bidirectional:
        # Swap both arrays in one step so that `dst` is extended with the
        # ORIGINAL `src`, keeping both arrays at length 2N.
        src, dst = np.concatenate([src, dst]), np.concatenate([dst, src])

    if src.min() < 0 or src.max() >= num_nodes:
        raise ValueError(
            f"source ids must lie in [0, {num_nodes}), got range "
            f"[{src.min()}, {src.max()}]"
        )

    # Stable sort groups edges by source while preserving input order per row.
    order = np.argsort(src, kind="mergesort")
    counts = np.bincount(src, minlength=num_nodes)
    indptr = np.zeros(num_nodes + 1, dtype=np.int64)
    indptr[1:] = np.cumsum(counts)
    indices = dst[order]
    return indptr, indices

def neighbors_from_adj(indptr, indices, u: int) -> np.ndarray:
    """Return the CSR row of node ``u``, i.e. ``indices[indptr[u]:indptr[u+1]]``."""
    row_start = indptr[u]
    row_stop = indptr[u + 1]
    return indices[row_start:row_stop]

def list2csr(lists, sort_unique=True):
    """Pack per-row neighbor collections into CSR ``(indptr, indices)`` arrays.

    Args:
        lists: iterable with one collection of integer ids per row.
        sort_unique: when True each row is de-duplicated and sorted ascending;
            when False rows are stored exactly as given.

    Returns:
        ``(indptr, indices)`` as int64 arrays, where row ``i`` occupies
        ``indices[indptr[i]:indptr[i+1]]``. With no rows, ``indptr`` is
        ``[0]`` and ``indices`` is empty.
    """
    rows = []
    for row in lists:
        values = sorted(set(row)) if sort_unique else list(row)
        rows.append(np.array(values, dtype=np.int64))

    # Row offsets are the running total of the row lengths, starting at 0.
    indptr = np.zeros(len(rows) + 1, dtype=np.int64)
    if rows:
        indptr[1:] = np.cumsum([len(r) for r in rows])
        indices = np.concatenate(rows, axis=0)
    else:
        indices = np.array([], dtype=np.int64)
    return indptr, indices

def maybe_topk(arr: np.ndarray, k: int|None, rng: np.random.Generator):
    if k is None or arr.size <= k:
        return arr
    # 무작위 샘플링 (원하면 차후 점수기반으로 변경 가능)
    sel = rng.choice(arr, size=k, replace=False)
    return np.sort(sel)

# Shared generator handed to maybe_topk; seeded with RNG_SEED from the config cell.
rng = np.random.default_rng(seed=RNG_SEED)


In [3]:
# =========================
# [Cell 3] Load artifacts
# =========================
# Node counts are read from meta.json.
with (ART_DIR / "meta.json").open("r") as f:
    meta = json.load(f)

num_papers, num_authors, num_concepts = (
    meta[key] for key in ("num_papers", "num_authors", "num_concepts")
)

# Per-relation edge arrays (saved by step 01).
E_AP = load_npz(ART_DIR / "edges_AP.npz")  # A->P
E_PA = load_npz(ART_DIR / "edges_PA.npz")  # P->A
E_PC = load_npz(ART_DIR / "edges_PC.npz")  # P->C
E_CP = load_npz(ART_DIR / "edges_CP.npz")  # C->P

print(f"num_papers  : {num_papers}")
print(f"num_authors : {num_authors}")
print(f"num_concepts: {num_concepts}")
print(" ".join(
    f"{tag}: {arr.shape}"
    for tag, arr in (("E_AP", E_AP), ("E_PA", E_PA), ("E_PC", E_PC), ("E_CP", E_CP))
))


num_papers  : 5000
num_authors : 32161
num_concepts: 6901
E_AP: (78212, 2) E_PA: (78212, 2) E_PC: (164493, 2) E_CP: (164493, 2)


In [4]:
# =========================
# [Cell 4] Build single-hop adjacency lists
# =========================
# Paper <-> author (directed edges only; no reverse edges are added here)
PA_indptr, PA_indices = build_adj(num_papers, E_PA, bidirectional=False)     # paper -> authors
AP_indptr, AP_indices = build_adj(num_authors, E_AP, bidirectional=False)    # author -> papers

# Paper <-> concept
PC_indptr, PC_indices = build_adj(num_papers, E_PC, bidirectional=False)     # paper -> concepts
CP_indptr, CP_indices = build_adj(num_concepts, E_CP, bidirectional=False)   # concept -> papers

# Report the CSR array shapes of every relation.
for tag, ptr, idx in (
    ("PA", PA_indptr, PA_indices),
    ("AP", AP_indptr, AP_indices),
    ("PC", PC_indptr, PC_indices),
    ("CP", CP_indptr, CP_indices),
):
    print(f"{tag}_indptr:", ptr.shape, f"{tag}_indices:", idx.shape)


PA_indptr: (5001,) PA_indices: (78212,)
AP_indptr: (32162,) AP_indices: (78212,)
PC_indptr: (5001,) PC_indices: (164493,)
CP_indptr: (6902,) CP_indices: (164493,)


In [5]:
# =========================
# [Cell 5] Build P-A-P metapath neighbors
# =========================
# Cell-local generator: re-running this cell alone reproduces the same top-K
# sample instead of continuing wherever the shared `rng` stream was left.
# Seeded like `rng`, so a fresh Run-All yields the same P-A-P sample as before.
pap_rng = np.random.default_rng(RNG_SEED)

PAP_lists = []
for p in range(num_papers):
    # 1-hop: paper -> authors
    a_neighbors = neighbors_from_adj(PA_indptr, PA_indices, p)
    if a_neighbors.size == 0:
        # np.concatenate rejects an empty sequence; no authors means no neighbors.
        PAP_lists.append([])
        continue
    # 2-hop: authors -> papers. Concatenate the CSR slices directly rather
    # than extending a Python list one element at a time.
    two_hop_papers = np.concatenate(
        [neighbors_from_adj(AP_indptr, AP_indices, a) for a in a_neighbors]
    ).astype(np.int64, copy=False)
    if two_hop_papers.size == 0:
        PAP_lists.append([])
        continue
    # Drop the paper itself
    if EXCLUDE_SELF:
        two_hop_papers = two_hop_papers[two_hop_papers != p]
    # De-duplicate (np.unique also sorts), then apply the optional top-K cap
    two_hop_papers = np.unique(two_hop_papers)
    two_hop_papers = maybe_topk(two_hop_papers, TOPK_PAP, pap_rng)
    PAP_lists.append(two_hop_papers.tolist())

# Rows are already unique and sorted, so list2csr stores them as given.
PAP_indptr, PAP_indices = list2csr(PAP_lists, sort_unique=False)
print("PAP CSR -> indptr:", PAP_indptr.shape, "indices:", PAP_indices.shape)
print("PAP 예시:", PAP_indices[PAP_indptr[0]:PAP_indptr[1]][:10] if num_papers>0 else [])


PAP CSR -> indptr: (5001,) indices: (95466,)
PAP 예시: [ 112  708 1069 1246 1315 1558 1854 1875 2182 2640]


In [6]:
# =========================
# [Cell 6] Build P-C-P metapath neighbors
# =========================
# NOTE(review): the recorded run with TOPK_PCP=None produced 24,995,000 pairs
# for 5,000 papers, i.e. every paper linked to every other paper. Confirm
# whether that density is intended or whether TOPK_PCP should be set.

# Cell-local generator: the top-K sample no longer depends on how many draws
# an earlier cell consumed from the shared `rng`, and re-running this cell
# alone reproduces the same sample.
pcp_rng = np.random.default_rng(RNG_SEED)

PCP_lists = []
for p in range(num_papers):
    # 1-hop: paper -> concepts
    c_neighbors = neighbors_from_adj(PC_indptr, PC_indices, p)
    if c_neighbors.size == 0:
        # np.concatenate rejects an empty sequence; no concepts means no neighbors.
        PCP_lists.append([])
        continue
    # 2-hop: concepts -> papers. Concatenate the CSR slices directly rather
    # than extending a Python list one element at a time.
    two_hop_papers = np.concatenate(
        [neighbors_from_adj(CP_indptr, CP_indices, c) for c in c_neighbors]
    ).astype(np.int64, copy=False)
    if two_hop_papers.size == 0:
        PCP_lists.append([])
        continue
    # Drop the paper itself
    if EXCLUDE_SELF:
        two_hop_papers = two_hop_papers[two_hop_papers != p]
    # De-duplicate (np.unique also sorts), then apply the optional top-K cap
    two_hop_papers = np.unique(two_hop_papers)
    two_hop_papers = maybe_topk(two_hop_papers, TOPK_PCP, pcp_rng)
    PCP_lists.append(two_hop_papers.tolist())

# Rows are already unique and sorted, so list2csr stores them as given.
PCP_indptr, PCP_indices = list2csr(PCP_lists, sort_unique=False)
print("PCP CSR -> indptr:", PCP_indptr.shape, "indices:", PCP_indices.shape)
print("PCP 예시:", PCP_indices[PCP_indptr[0]:PCP_indptr[1]][:10] if num_papers>0 else [])


PCP CSR -> indptr: (5001,) indices: (24995000,)
PCP 예시: [ 1  2  3  4  5  6  7  8  9 10]


In [7]:
# =========================
# [Cell 7] Save (CSR format + summary)
# =========================
pap_path = OUT_DIR / "metapath_PAP.npz"
pcp_path = OUT_DIR / "metapath_PCP.npz"
summary_path = OUT_DIR / "metapath_summary.json"

# CSR arrays go into compressed npz archives under the keys indptr / indices.
np.savez_compressed(pap_path, indptr=PAP_indptr, indices=PAP_indices)
np.savez_compressed(pcp_path, indptr=PCP_indptr, indices=PCP_indices)

# Brief per-metapath statistics; max(1, ...) guards the average against
# a division by zero when there are no papers.
summary = {
    tag: {
        "num_papers": int(num_papers),
        "total_pairs": int(idx.size),
        "avg_neighbors_per_paper": float(idx.size / max(1, num_papers)),
        "topK": cap,
    }
    for tag, idx, cap in (
        ("PAP", PAP_indices, TOPK_PAP),
        ("PCP", PCP_indices, TOPK_PCP),
    )
}
summary["exclude_self"] = EXCLUDE_SELF

with summary_path.open("w") as f:
    json.dump(summary, f, indent=2)

print("✅ Saved:", pap_path, pcp_path, summary_path)
print(json.dumps(summary, indent=2))


✅ Saved: /root/heco/artifacts/metapath_PAP.npz /root/heco/artifacts/metapath_PCP.npz /root/heco/artifacts/metapath_summary.json
{
  "PAP": {
    "num_papers": 5000,
    "total_pairs": 95466,
    "avg_neighbors_per_paper": 19.0932,
    "topK": null
  },
  "PCP": {
    "num_papers": 5000,
    "total_pairs": 24995000,
    "avg_neighbors_per_paper": 4999.0,
    "topK": null
  },
  "exclude_self": true
}


In [8]:
# =========================
# [Cell 8] (Optional) Preview original IDs via the mapping file
# =========================
# The "paper_id" column is read as a plain list and indexed by paper index below.
paper_id_column = pd.read_csv(ART_DIR / "map_paper_id.csv")["paper_id"]
map_paper = paper_id_column.tolist()

def peek_neighbors(indptr, indices, u: int, k: int=10):
    """Return the first ``k`` entries of CSR row ``u`` (fewer if the row is shorter)."""
    row_start, row_stop = indptr[u], indptr[u + 1]
    row = indices[row_start:row_stop]
    return row[:k]

# For each metapath, show paper 0's first neighbors by index, and the
# original IDs of the first five of them.
previews = [
    ("PAP", PAP_indptr, PAP_indices),
    ("PCP", PCP_indptr, PCP_indices),
]
for name, indptr, indices in previews:
    if len(map_paper) == 0:
        # Nothing can be mapped back to original IDs without the mapping.
        break
    u = 0
    ids = peek_neighbors(indptr, indices, u, k=10)
    paper_id = map_paper[u] if u < len(map_paper) else 'NA'
    print(f"[{name}] paper_idx={u}, paper_id={paper_id}")
    print(" -> neighbor paper_idx:", ids)
    print(" -> neighbor paper_id :", [map_paper[i] for i in ids[:5]])

[PAP] paper_idx=0, paper_id=https://openalex.org/W3010906965
 -> neighbor paper_idx: [ 112  708 1069 1246 1315 1558 1854 1875 2182 2640]
 -> neighbor paper_id : ['https://openalex.org/W3020097213', 'https://openalex.org/W4381982883', 'https://openalex.org/W3005448673', 'https://openalex.org/W2976876706', 'https://openalex.org/W3022781679']
[PCP] paper_idx=0, paper_id=https://openalex.org/W3010906965
 -> neighbor paper_idx: [ 1  2  3  4  5  6  7  8  9 10]
 -> neighbor paper_id : ['https://openalex.org/W4213446860', 'https://openalex.org/W2987460522', 'https://openalex.org/W3133928066', 'https://openalex.org/W4382464460', 'https://openalex.org/W2973049920']
