In [1]:
# Cell 1 — environment / path setup (config)
# === CONFIG ===
from pathlib import Path

# Base path (specific to the author's environment)
BASE_DIR = Path("/root/heco")
DATA_DIR = BASE_DIR / "data"          # folder holding the input CSV files
OUT_DIR  = BASE_DIR / "artifacts"     # folder where the preprocessing outputs are written

# Options
AUTHOR_ONE_HOT   = True               # True: one-hot author features; False: random initial vectors (replacing them with a learnable embedding at training time is recommended)
AUTHOR_FEAT_DIM  = 64                 # only used when AUTHOR_ONE_HOT=False
CONCEPT_TOPK     = None               # no reduction for now; set to a value such as 10 later and re-run

# File names (only the names need to change here; the rest of the pipeline stays the same)
FILE_NODES_PAPERS   = "nodes_papers_v2.csv"
FILE_NODES_AUTHORS  = "nodes_authors_v2.csv"
FILE_NODES_CONCEPTS = "nodes_concepts_v2.csv"

# NOTE(review): the Cell 3 preview of the two pairs_*.csv files shows the columns
# positive / negative / label rather than the id columns named below, and the
# Cell 4 column check fails on edges_wrote — confirm these are the intended edge files.
FILE_EDGES_WROTE      = "pairs_wrote.csv"         # expected by later cells: author_id -> paper_id
FILE_EDGES_HAS_TOPIC  = "pairs_has_topic.csv"     # expected by later cells: paper_id  -> concept_id
# The two files below are not used by the meta-paths of this run (P–A–P, P–C–P), so they need not be read
# FILE_EDGES_CITES      = "edges_cites.csv"       # src_paper_id -> dst_paper_id
# FILE_EDGES_WORKED     = "edges_worked_with.csv" # src_author_id -> dst_author_id

FILE_EMB_PAPERS   = "embedding_vectors.csv"          # paper_id + (abstract+summary ⨁ venue) embedding
FILE_EMB_CONCEPTS = "nodes_concepts_with_emb.csv"    # concept_id + concept_name embedding

# Create the output directory (parents included; no error if it already exists)
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("BASE_DIR:", BASE_DIR)
print("DATA_DIR:", DATA_DIR)
print("OUT_DIR :", OUT_DIR)


BASE_DIR: /root/heco
DATA_DIR: /root/heco/data
OUT_DIR : /root/heco/artifacts


In [2]:
# Cell 2 — 라이브러리 임포트 & 유틸
import json
import numpy as np
import pandas as pd
from pathlib import Path

pd.set_option("display.max_columns", 200)

def l2_normalize(X: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    """Divide every row of ``X`` by its Euclidean norm plus ``eps``.

    Args:
        X: 2-D array; the norm is taken along axis 1 (one norm per row).
        eps: Small constant added to each norm so all-zero rows do not
            divide by zero (such rows stay all-zero).

    Returns:
        Array of the same shape as ``X`` with rescaled rows.
    """
    row_norms = np.linalg.norm(X, axis=1, keepdims=True)
    return X / (row_norms + eps)

def build_id_map(df: pd.DataFrame, id_col: str):
    """Map the values of ``id_col`` to contiguous indices 0..N-1.

    IDs are compared as strings and de-duplicated in first-appearance order,
    so index ``i`` always refers to ``uniq[i]``.

    Args:
        df: Frame that contains the ID column.
        id_col: Name of the ID column.

    Returns:
        ``(id2idx, uniq)`` where ``uniq`` is the list of distinct string IDs
        and ``id2idx`` maps each of them to its position in ``uniq``.
    """
    ids = df[id_col].astype(str).tolist()
    # dict.fromkeys keeps first-appearance order and avoids calling pd.unique
    # on a plain list, which recent pandas versions deprecate.
    uniq = list(dict.fromkeys(ids))
    id2idx = {id_: i for i, id_ in enumerate(uniq)}
    return id2idx, uniq

def save_npz(path: Path, **arrays):
    """Write the given keyword arrays to ``path`` as a compressed ``.npz`` archive.

    Args:
        path: Destination file; converted to ``str`` before it is handed to NumPy.
        **arrays: Arrays to store; each keyword becomes a key in the archive.
    """
    np.savez_compressed(str(path), **arrays)


In [3]:
# 3 Load the CSV files (required files only)
# Node tables
nodes_papers   = pd.read_csv(DATA_DIR / FILE_NODES_PAPERS)
nodes_authors  = pd.read_csv(DATA_DIR / FILE_NODES_AUTHORS)
nodes_concepts = pd.read_csv(DATA_DIR / FILE_NODES_CONCEPTS)

# Edge tables (only the relations needed by this run's meta-paths);
# later cells expect author_id/paper_id and paper_id/concept_id columns.
edges_wrote     = pd.read_csv(DATA_DIR / FILE_EDGES_WROTE)
edges_has_topic = pd.read_csv(DATA_DIR / FILE_EDGES_HAS_TOPIC)

# Embedding tables: an id column followed by numeric dimensions
emb_papers   = pd.read_csv(DATA_DIR / FILE_EMB_PAPERS)
emb_concepts = pd.read_csv(DATA_DIR / FILE_EMB_CONCEPTS)

# (label, frame) pairs in reporting order
loaded_frames = [
    ("nodes_papers   :", nodes_papers),
    ("nodes_authors  :", nodes_authors),
    ("nodes_concepts :", nodes_concepts),
    ("edges_wrote    :", edges_wrote),
    ("edges_has_topic:", edges_has_topic),
    ("emb_papers     :", emb_papers),
    ("emb_concepts   :", emb_concepts),
]

# Shapes first ...
for frame_label, frame in loaded_frames:
    print(frame_label, frame.shape)

# ... then a three-row preview of every table
for _, frame in loaded_frames:
    display(frame.head(3))

nodes_papers   : (5000, 10)
nodes_authors  : (32161, 5)
nodes_concepts : (6901, 3)
edges_wrote    : (81516, 3)
edges_has_topic: (164476, 3)
emb_papers     : (5000, 833)
emb_concepts   : (6901, 771)


Unnamed: 0,paper_id,title,year,type,cited_by_count,updated_date,venue_name,venue_type,venue_issn_l,abstract_len
0,https://openalex.org/W3010906965,Extension and evaluation of the D4 London-disp...,2020,article,321,2025-08-27T07:13:42.282366,Physical Chemistry Chemical Physics,journal,1463-9076,1380
1,https://openalex.org/W4213446860,Robust Aggregation for Federated Learning,2022,article,342,2025-08-26T16:37:21.303548,IEEE Transactions on Signal Processing,journal,1053-587X,1217
2,https://openalex.org/W2987460522,Learning the Model Update for Siamese Trackers,2019,article,379,2025-08-25T04:44:16.401159,2021 IEEE/CVF International Conference on Comp...,conference,,1340


Unnamed: 0,author_id,author_name,institution_ids,institution_names,h_index
0,https://openalex.org/A5026718416,Eike Caldeweyher,https://openalex.org/I135140700,University of Bonn,0
1,https://openalex.org/A5058947613,Jan‐Michael Mewes,https://openalex.org/I135140700,University of Bonn,0
2,https://openalex.org/A5004260600,Sebastian Ehlert,https://openalex.org/I135140700,University of Bonn,0


Unnamed: 0,concept_id,concept_name,level
0,https://openalex.org/C2778029271,Extension (predicate logic),2
1,https://openalex.org/C177562468,Dispersion (optics),2
2,https://openalex.org/C121864883,Statistical physics,1


Unnamed: 0,positive,negative,label
0,"yilan liao#v1,https://openalex.org/W3012769470#v2",,1
1,"hong-kyung kim#v1,https://openalex.org/W291288...",,1
2,"xiuming zhu#v1,https://openalex.org/W432257696...",,1


Unnamed: 0,positive,negative,label
0,"https://openalex.org/W3009569863#v1,Economics#v2",,1
1,,"https://openalex.org/W3163443091,Image segment...",0
2,"https://openalex.org/W4206433468#v1,Artificial...",,1


Unnamed: 0,paper_id,venue_0,venue_1,venue_2,venue_3,venue_4,venue_5,venue_6,venue_7,venue_8,venue_9,venue_10,venue_11,venue_12,venue_13,venue_14,venue_15,venue_16,venue_17,venue_18,venue_19,venue_20,venue_21,venue_22,venue_23,venue_24,venue_25,venue_26,venue_27,venue_28,venue_29,venue_30,venue_31,venue_32,venue_33,venue_34,venue_35,venue_36,venue_37,venue_38,venue_39,venue_40,venue_41,venue_42,venue_43,venue_44,venue_45,venue_46,venue_47,venue_48,venue_49,venue_50,venue_51,venue_52,venue_53,venue_54,venue_55,venue_56,venue_57,venue_58,venue_59,venue_60,venue_61,venue_62,venue_63,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,emb_10,emb_11,emb_12,emb_13,emb_14,emb_15,emb_16,emb_17,emb_18,emb_19,emb_20,emb_21,emb_22,emb_23,emb_24,emb_25,emb_26,emb_27,emb_28,emb_29,emb_30,emb_31,emb_32,emb_33,emb_34,...,emb_668,emb_669,emb_670,emb_671,emb_672,emb_673,emb_674,emb_675,emb_676,emb_677,emb_678,emb_679,emb_680,emb_681,emb_682,emb_683,emb_684,emb_685,emb_686,emb_687,emb_688,emb_689,emb_690,emb_691,emb_692,emb_693,emb_694,emb_695,emb_696,emb_697,emb_698,emb_699,emb_700,emb_701,emb_702,emb_703,emb_704,emb_705,emb_706,emb_707,emb_708,emb_709,emb_710,emb_711,emb_712,emb_713,emb_714,emb_715,emb_716,emb_717,emb_718,emb_719,emb_720,emb_721,emb_722,emb_723,emb_724,emb_725,emb_726,emb_727,emb_728,emb_729,emb_730,emb_731,emb_732,emb_733,emb_734,emb_735,emb_736,emb_737,emb_738,emb_739,emb_740,emb_741,emb_742,emb_743,emb_744,emb_745,emb_746,emb_747,emb_748,emb_749,emb_750,emb_751,emb_752,emb_753,emb_754,emb_755,emb_756,emb_757,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,https://openalex.org/W3010906965,-0.146592,-0.078469,-0.076921,0.009507,0.059249,-0.073674,-0.228234,0.123331,-0.069029,0.100069,0.10883,-0.056039,-0.087381,-0.129448,0.049582,0.007552,-0.15996,-0.124539,-0.077412,0.033873,0.125445,-0.107395,0.050148,0.213733,-0.034194,-0.009922,0.051205,0.004229,-0.054491,0.333212,0.209806,-0.040066,0.125521,0.041689,-0.141834,0.034798,0.082925,-0.023073,0.119328,0.008586,0.034024,0.019098,-0.065593,0.136396,0.020165,-0.011923,0.097804,0.013896,0.041991,-0.079225,0.224306,0.018513,0.199383,-0.005084,0.047467,-0.043615,0.074769,0.535012,0.062836,-0.000713,-0.115023,-0.190925,0.040443,-0.034571,-0.582592,0.263358,0.440137,-0.412921,-0.482905,-0.090752,0.446444,0.009842,-0.366867,0.496498,-0.296139,0.225334,-0.061591,0.394058,-0.473862,0.175903,-0.610744,0.336022,-0.174641,-0.682957,-0.496658,-0.539111,-0.076606,0.157044,0.020413,0.075222,0.163193,0.013038,0.244376,-0.064388,-0.12153,1.337384,-0.504482,0.220244,-0.502865,...,-0.360521,-0.028415,0.004916,0.012882,-0.405791,0.180155,0.038331,0.678726,0.237785,0.332397,0.018792,0.433905,-0.039891,0.567521,0.689571,0.464273,-0.808724,0.058226,-0.310339,-0.895865,-0.231841,-0.129023,-0.137292,-0.53187,-0.35807,0.103458,-0.176535,0.596481,0.277322,1.025165,-0.095032,-0.733213,0.11002,0.334892,0.092103,0.152381,-0.123803,-0.049886,1.761499,0.258073,0.730836,0.187227,0.655851,-0.056454,0.437093,0.90669,0.293135,-0.220827,-0.248015,0.101685,-0.092503,0.330237,-0.318867,-0.148335,0.70047,-0.012824,0.010543,-0.11496,-0.073828,-0.030424,0.522342,0.470699,-0.708847,-0.609352,0.127169,0.513359,-0.330598,-0.157815,-0.599275,-0.594745,0.856284,0.108569,0.467315,0.099838,0.03533,0.147814,0.429688,0.15191,-0.781383,0.097765,0.081411,-0.567652,0.04641,-0.539389,-0.784444,0.319165,0.002029,-0.268097,1.4e-05,0.38216,-0.036262,-0.349808,0.812181,0.675634,0.01962,-0.205038,0.010425,0.40756,0.706132,0.210988
1,https://openalex.org/W4213446860,0.130415,0.136322,-0.114638,-0.069871,0.110752,0.156296,-0.133835,0.314146,-0.12412,0.16399,-0.052889,-0.199897,0.081373,0.061438,-0.023219,-0.085104,-0.027338,-0.198187,-0.118291,-0.040492,0.031069,-0.028271,-0.1207,0.053938,-0.226477,-0.028621,-0.118601,0.08215,-0.149146,0.109974,-0.02217,0.086736,-0.047876,0.068005,-0.05658,-0.021082,0.137798,0.092643,0.039657,-0.0757,0.047293,0.044029,-0.046555,0.091166,0.138031,-0.038744,0.084482,0.04908,0.222591,0.15,0.045466,-0.197254,0.136166,0.143006,-0.148835,-0.044689,0.048264,0.364043,0.256322,0.03647,0.147047,0.098083,-0.007116,0.011959,-0.123783,0.680218,-0.036361,-0.15033,-0.674824,0.195345,0.617835,0.130329,0.475901,0.3459,0.548305,0.021867,0.725633,0.395911,-0.206182,0.090589,-0.366823,-0.098408,-0.444786,0.058656,-0.440826,-0.601035,-0.037771,0.121078,-0.450184,-0.344738,0.812127,-0.335481,-0.06159,-0.478894,-0.227703,1.415296,-1.268729,0.644792,-0.57936,...,-0.408454,0.199239,0.1056,-0.408173,-0.254632,0.213434,0.454673,0.998316,0.16732,0.476571,-0.102199,0.848963,0.127163,0.600435,0.899037,0.315694,-0.188532,-0.14537,-0.14922,-0.383198,0.18568,-0.173125,-0.528447,-0.127801,-0.828486,-0.130131,0.221866,0.683809,0.097403,0.95281,-0.751564,-1.01701,-0.1163,-0.603571,0.413903,-0.002923,0.646766,0.093604,2.241423,-0.014569,0.559,-0.067126,0.38725,-0.42921,0.567496,1.07338,0.652508,-0.277855,0.278926,0.702742,-0.064584,-0.096621,-0.010183,0.116758,0.764447,-0.343146,0.523314,-1.4264,-1.218999,0.192017,0.472866,0.561558,-0.66989,-0.996774,-0.284584,0.035527,0.047107,0.145278,-0.370579,-0.639474,0.598999,0.586743,0.299866,-0.591699,0.458324,-0.662196,-0.136615,-0.091301,-1.202629,-0.178883,-0.076097,-0.568752,-0.0623,-0.645571,-0.582718,-0.114558,-0.323657,0.051492,0.140723,0.167967,0.121183,-0.679482,0.531348,0.487317,0.12319,0.138495,0.337093,0.166208,-0.045906,0.306707
2,https://openalex.org/W2987460522,0.090419,0.206654,-0.11304,0.113296,0.207166,0.052175,-0.043197,0.225825,0.091186,-0.048596,-0.145693,-0.06045,-0.068374,-0.034347,-0.114702,0.200264,0.010839,-0.018739,-0.186334,-0.170487,0.056967,0.083007,-0.112848,-0.079045,-0.093103,-0.089461,0.120389,0.164097,-0.070994,-0.032398,0.175854,0.088566,0.068501,0.083774,-0.06029,0.199242,0.162819,0.284741,-0.114126,-0.007944,-0.069013,0.006997,0.081857,0.093806,-0.156301,-0.175982,-0.069332,-0.005216,0.040225,0.087927,-0.054475,-0.039331,0.176366,0.004469,0.057063,-0.109014,0.179944,0.19643,0.314135,-0.110228,-0.078598,-0.084029,-0.077128,-0.128951,-0.499891,0.5769,0.451898,-0.202028,-0.435531,-0.004617,0.10781,0.498915,0.362835,0.372059,0.342445,0.336781,0.118485,-0.504843,-0.637806,0.582049,0.184425,0.465451,-0.430099,-0.144719,-0.541466,-0.444491,-0.13313,-0.204335,-0.585095,0.006238,0.566402,-0.601164,0.27305,0.376648,-0.032486,1.153972,-0.836062,0.688879,-1.056444,...,0.035588,-0.095534,-0.043195,-0.183741,-0.013113,0.134386,0.515255,0.385536,-0.428174,1.01174,0.063931,0.639054,0.132522,0.107992,1.282355,0.576862,-0.840469,0.13997,-0.207497,-0.532999,0.058916,-0.169834,-0.512333,-0.154233,-0.691588,-0.175419,-0.203855,0.152952,0.274907,0.829215,-0.563315,-0.953397,-0.770437,-0.090563,0.05255,0.40384,-0.497186,0.369684,2.109699,0.10576,0.753766,-0.219801,-0.046316,-0.062892,0.409733,1.181794,0.292201,-0.619278,0.26528,0.768857,0.274083,0.192131,-0.458114,-0.407222,0.763829,-0.619563,-0.048174,-0.683772,-0.898156,-0.319267,0.563721,0.577299,-0.294965,-0.400768,0.280829,-0.271162,-0.179732,-0.074592,-0.101056,-0.979904,0.956128,0.6069,0.231927,0.244188,0.101517,-0.335717,0.659851,-0.319703,-0.936627,-0.087006,-0.14713,-0.870502,-0.067034,-0.140604,-0.815986,-0.156679,-0.394305,-0.019225,-0.556345,-0.098885,0.149175,0.099558,0.247046,0.1967,-0.084869,-0.27182,-0.090214,0.452711,0.14373,0.221805


Unnamed: 0,concept_id,concept_name,level,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15,d16,d17,d18,d19,d20,d21,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31,d32,d33,d34,d35,d36,d37,d38,d39,d40,d41,d42,d43,d44,d45,d46,d47,d48,d49,d50,d51,d52,d53,d54,d55,d56,d57,d58,d59,d60,d61,d62,d63,d64,d65,d66,d67,d68,d69,d70,d71,d72,d73,d74,d75,d76,d77,d78,d79,d80,d81,d82,d83,d84,d85,d86,d87,d88,d89,d90,d91,d92,d93,d94,d95,d96,...,d668,d669,d670,d671,d672,d673,d674,d675,d676,d677,d678,d679,d680,d681,d682,d683,d684,d685,d686,d687,d688,d689,d690,d691,d692,d693,d694,d695,d696,d697,d698,d699,d700,d701,d702,d703,d704,d705,d706,d707,d708,d709,d710,d711,d712,d713,d714,d715,d716,d717,d718,d719,d720,d721,d722,d723,d724,d725,d726,d727,d728,d729,d730,d731,d732,d733,d734,d735,d736,d737,d738,d739,d740,d741,d742,d743,d744,d745,d746,d747,d748,d749,d750,d751,d752,d753,d754,d755,d756,d757,d758,d759,d760,d761,d762,d763,d764,d765,d766,d767
0,https://openalex.org/C2778029271,Extension (predicate logic),2,0.037495,0.050475,-0.016452,-0.000956,0.009516,-0.010964,-0.074798,-0.011995,-0.008065,-0.000592,0.017847,0.021183,-0.032723,-0.007876,0.052146,-0.062935,0.060312,0.020426,0.065311,0.003642,-0.009703,0.008588,-0.017711,-0.045629,-0.005931,-0.001524,0.018407,0.021029,-0.011953,0.024747,-0.041404,0.01393,-0.037228,-0.058849,1.221328e-06,-0.031792,-0.083301,-0.025902,0.018091,-0.022512,-0.058839,0.071486,-0.003858,0.006362,-0.074395,-0.011442,0.02968,-0.023632,-0.005992,0.066657,-0.013369,-0.026562,-0.042338,-0.019549,0.025056,-0.008977,0.0549,0.004142,0.08172,-0.015741,0.035894,-0.012691,-0.019635,-0.006242,0.027426,-0.025931,0.05735,0.020713,0.002347,-0.054045,0.064692,0.00173,-0.039406,0.068253,0.009675,-0.019526,-2e-06,0.017368,0.037184,0.009459,0.017799,0.052088,0.011938,-0.001931,-0.076219,0.120794,0.001173,-0.022312,-0.068648,0.015302,-0.056776,-0.005903,0.033325,0.022331,0.05678,0.023195,-0.041398,...,-0.018225,-0.008718,0.031784,0.057482,0.057546,0.033343,-0.005313,-0.038291,-0.040555,0.03722,0.044022,0.032272,-0.025439,-0.025894,-0.038684,-0.035468,-0.015806,-0.0655,-0.003259,0.018089,2.016221e-07,0.03211,-0.047883,0.018559,0.100514,-0.056962,-0.066944,0.007278,-0.006013,0.018717,0.00594,0.069906,0.034088,-0.055432,-0.017972,0.061027,0.007918,0.041153,-0.06095,-0.013813,0.029852,0.061203,0.053303,0.063099,-0.013749,-0.017184,-0.047227,0.027741,0.019904,0.02284,0.057905,0.017807,0.027479,-0.00126,0.025825,-0.006924,0.003101,0.041385,0.037388,-0.058601,0.055136,0.003941,-0.026183,-0.000618,0.006946,0.066688,-0.000881,-0.017314,0.006238,-0.073892,0.015868,-0.029115,-0.007136,-0.007558,-0.010584,0.056945,-0.026774,0.002354,-0.02926,-0.017274,-0.006024,0.019166,-0.058358,0.011899,0.030128,-0.00703,-0.00842,-0.012937,1.39664e-34,0.055951,0.020628,0.016438,-0.047995,0.012918,-0.056663,-0.065802,0.029837,0.054031,-0.062732,-0.020809
1,https://openalex.org/C177562468,Dispersion (optics),2,0.028208,-0.070382,-0.018014,-0.021376,0.022096,-0.02061,0.022844,-0.019067,-0.036861,0.004658,0.033919,-0.005828,0.022461,-0.04881,-0.097347,-0.070565,-0.032649,0.027525,-0.010971,-0.009238,0.029667,-0.02917,9.1e-05,0.003269,0.024651,-0.019386,-0.030802,-0.010312,0.003429,0.031291,0.047552,-0.021783,-0.018757,-0.066421,1.116813e-06,-0.004583,0.045631,0.03199,0.021677,0.047661,-0.013042,0.03268,0.037084,0.021516,-0.033201,-0.007994,-0.039527,-0.06029,-0.035641,-0.000147,-0.00157,-0.018304,-0.056266,0.02231,0.030875,-0.066115,-0.004858,0.013389,-0.013325,0.121868,-0.030525,0.016323,0.002767,0.050331,0.076251,-0.037616,0.040926,0.003402,-0.044911,0.061574,0.060345,-0.022614,5.2e-05,0.017232,0.003343,0.047165,0.027882,0.012993,0.054513,-0.099355,-0.034082,-0.013793,-0.003586,0.025539,-0.015774,0.021319,0.020638,0.034095,-0.038474,-0.026938,0.018142,-0.039296,-0.004529,0.035035,-0.007319,0.01306,-0.007824,...,0.071618,0.025581,0.008352,-0.014565,-0.016144,0.016315,-0.000823,-0.039979,0.020717,0.005097,-0.051927,0.016763,-0.030644,0.002821,-0.020094,-0.032854,-0.009833,-0.010756,-0.03931,0.02037,1.740627e-07,0.012647,0.023605,0.034068,0.041043,0.004932,0.00263,0.000463,-6.4e-05,-0.002227,-0.041908,-0.019062,0.049562,-0.047353,-0.003542,0.052895,0.018456,0.077348,-0.049736,0.026664,0.046148,-0.03649,-0.047466,0.02814,0.000395,-0.061261,0.053512,0.007318,0.040258,0.006247,0.056526,-0.037432,-0.064152,0.00849,0.04295,0.007522,-0.025658,0.081768,-0.004288,0.011341,-0.01141,0.053527,-0.028507,0.019253,-0.036319,0.009465,0.045335,0.017002,0.007269,-0.025972,-0.031485,-0.015776,-0.034245,-0.006369,-0.049884,-0.026163,-0.027577,0.00765,-0.055532,-0.072555,0.069616,0.04358,-0.041329,0.003403,-0.058859,0.093067,0.003518,0.026327,2.653128e-35,0.005168,0.026465,-0.064741,0.040598,-0.027054,-0.032502,-0.10422,0.010706,0.011875,0.015705,0.041105
2,https://openalex.org/C121864883,Statistical physics,1,-0.045072,-0.034201,-0.023163,0.040222,-0.029626,-0.013524,-0.01094,-0.026023,-0.034533,0.012317,0.012301,0.003369,0.003709,-0.005283,0.000103,-0.062444,-0.010066,-0.011771,-0.016187,-0.065677,-0.021779,-0.04283,0.038466,-0.005476,0.022482,0.012995,0.026874,-0.012382,-0.023622,-0.009086,-0.014026,0.017614,-0.029913,-0.038892,9.525809e-07,-0.043977,0.018672,0.073071,-0.006642,0.025927,0.043371,0.013175,-0.045687,-0.048566,0.006408,0.05626,0.046039,-0.049396,-0.057855,-0.038396,-0.015997,0.005672,0.015991,-0.033009,0.05792,-0.034603,-0.008007,0.033238,0.034,0.023153,-0.070411,0.036478,0.031399,0.026715,0.026666,0.03698,-0.000283,0.126218,-0.028596,0.078102,0.025474,0.009503,-0.062257,0.018444,-0.038834,0.039192,-0.046033,-0.022311,0.022673,-0.062485,0.000248,0.019629,0.008068,0.001016,-0.002526,0.052774,0.023314,0.010055,0.026993,0.026283,-0.007126,-0.00627,-0.081989,-0.012737,-0.015991,0.007938,0.030416,...,0.062349,-0.043535,0.0132,0.014133,-0.017548,-0.033616,-0.002338,0.030735,0.021013,0.048718,0.004747,-0.029119,0.031094,-0.054342,-0.034878,-0.079326,0.009341,-0.004188,-0.036945,-0.015866,1.740412e-07,0.011438,0.017973,-0.053921,-0.002082,0.041839,-0.058401,-0.055954,-0.013407,0.006144,-0.084388,0.01166,0.010039,0.007775,0.025948,-0.003217,-0.032948,0.031331,0.030701,0.026126,-0.005381,-0.070286,0.022792,0.039087,0.040754,0.010958,-0.0197,0.006634,-0.025336,0.048419,0.012492,-0.025526,-0.022747,0.02664,0.03126,-0.002887,0.027973,0.073329,0.008002,0.054794,0.015104,-0.015335,0.013588,0.03677,0.016624,0.020213,0.004466,-0.019145,0.002988,-0.077319,-0.005929,0.049573,0.014039,0.082503,-0.012788,-0.04142,-0.020014,0.017436,0.020002,0.001954,0.046016,0.020891,-0.028749,0.002819,-0.035157,0.060829,-0.002586,-0.004695,6.353777e-35,0.006138,0.044154,0.002109,-0.026868,-0.01204,0.019437,-0.034416,-0.01105,0.018255,-0.012181,0.001292


In [4]:
# 4 Column validation
# Columns every loaded table must provide for the later cells to work.
required_cols = {
    "nodes_papers":   ("paper_id",),
    "nodes_authors":  ("author_id",),
    "nodes_concepts": ("concept_id",),
    "edges_wrote":    ("author_id","paper_id"),
    "edges_has_topic":("paper_id","concept_id"),
    "emb_papers":     ("paper_id",),
    "emb_concepts":   ("concept_id",),
}

dfs = {
    "nodes_papers": nodes_papers,
    "nodes_authors": nodes_authors,
    "nodes_concepts": nodes_concepts,
    "edges_wrote": edges_wrote,
    "edges_has_topic": edges_has_topic,
    "emb_papers": emb_papers,
    "emb_concepts": emb_concepts,
}

# Check every table before failing, so that one run reports all schema
# problems together with the columns that are actually present.
schema_errors = []
for name, reqs in required_cols.items():
    df = dfs[name]
    miss = [c for c in reqs if c not in df.columns]
    if miss:
        # Embedding tables are very wide, so only the first columns are listed.
        available = list(df.columns)[:10]
        schema_errors.append(
            f"[{name}] missing columns: {miss} (first available columns: {available})"
        )

# Raised explicitly (not via `assert`) so the check cannot be optimised away;
# the exception type stays AssertionError as before.
if schema_errors:
    raise AssertionError("\n".join(schema_errors))
print("✅ column check passed.")


AssertionError: [edges_wrote] missing columns: ['author_id', 'paper_id']

In [None]:
# 5 Build the ID maps (original id -> contiguous index) for each node type

paper_id2idx, paper_ids     = build_id_map(nodes_papers, "paper_id")
author_id2idx, author_ids   = build_id_map(nodes_authors, "author_id")
concept_id2idx, concept_ids = build_id_map(nodes_concepts, "concept_id")

# Report how many distinct ids each node type has
for count_template, id_list in (
    ("papers : {:,}", paper_ids),
    ("authors: {:,}", author_ids),
    ("concepts: {:,}", concept_ids),
):
    print(count_template.format(len(id_list)))

In [None]:
# === CELL 6 (FIXED): build the feature matrices (Paper / Concept / Author) ===
import pandas as pd
import numpy as np


def _aligned_embedding_matrix(emb_df, id_col, ordered_ids, meta_cols=()):
    """Return ``(X, emb_cols)``: float32 embeddings with one row per id in ``ordered_ids``.

    Embedding columns are taken from the embedding table itself: every numeric
    (non-boolean) column except ``id_col`` and the names in ``meta_cols``
    (compared case-insensitively). Rows are aligned by string id; ids without
    an embedding row, and NaN entries, become zeros.

    Raises:
        ValueError: if no embedding column is found or ``id_col`` holds duplicates.
    """
    excluded = {str(m).lower() for m in meta_cols}
    emb_cols = [
        c for c in emb_df.columns
        if c != id_col
        and pd.api.types.is_numeric_dtype(emb_df[c])
        and not pd.api.types.is_bool_dtype(emb_df[c])
        and str(c).lower() not in excluded
    ]
    if len(emb_cols) == 0:
        raise ValueError(
            f"No numeric embedding columns found for '{id_col}' (after excluding meta columns)."
        )

    table = emb_df[emb_cols].copy()
    # Align on string ids so int/str dtype differences between tables do not matter.
    table.index = pd.Index(emb_df[id_col].astype(str))
    if table.index.has_duplicates:
        examples = table.index[table.index.duplicated()].unique()[:5].tolist()
        raise ValueError(f"Duplicate '{id_col}' values in the embedding table, e.g. {examples}")

    ordered_ids = [str(i) for i in ordered_ids]
    n_missing = len(set(ordered_ids) - set(table.index))
    if n_missing > 0:
        print(f"[WARN] {n_missing} {id_col}s have no embedding row; filling zeros.")

    X = table.reindex(ordered_ids).fillna(0.0).to_numpy(dtype=np.float32)
    return X, emb_cols


# ----------------
# Paper: rows follow paper_ids (the order used by paper_id2idx and the edge indices)
# ----------------
X_paper, paper_emb_cols = _aligned_embedding_matrix(emb_papers, "paper_id", paper_ids)
X_paper = l2_normalize(X_paper)
assert X_paper.shape[0] == len(paper_ids), "paper feature rows must match the paper id map"
print("X_paper:", X_paper.shape, "| paper embedding cols:", len(paper_emb_cols))

# ----------------
# Concept: rows follow concept_ids; numeric meta columns (level/score/count) are excluded
# ----------------
X_concept, candidate_cols = _aligned_embedding_matrix(
    emb_concepts, "concept_id", concept_ids, meta_cols=("level", "score", "count")
)
X_concept = l2_normalize(X_concept)
assert X_concept.shape[0] == len(concept_ids), "concept feature rows must match the concept id map"
print("X_concept:", X_concept.shape, "| concept embedding cols:", len(candidate_cols))

# ----------------
# Author: one-hot or random initial vectors
# ----------------
if AUTHOR_ONE_HOT:
    # NOTE: np.eye is dense (n x n float32). With the 32,161 authors reported by the
    # Cell 3 output this is roughly 4.1 GB in memory; set AUTHOR_ONE_HOT=False if that
    # is too large for the machine.
    X_author = np.eye(len(author_ids), dtype=np.float32)
else:
    rng = np.random.default_rng(42)  # fixed seed for reproducible initial vectors
    X_author = rng.standard_normal((len(author_ids), AUTHOR_FEAT_DIM), dtype=np.float32)

print("X_author:", X_author.shape)


In [None]:
# 7 Clean the edge tables
# Keep only "wrote" edges whose author and paper both exist in the node id maps
# (edges that reference an unknown author are dropped by design).
wrote_author_known = edges_wrote["author_id"].astype(str).isin(author_id2idx)
wrote_paper_known = edges_wrote["paper_id"].astype(str).isin(paper_id2idx)
mask_wrote = wrote_author_known & wrote_paper_known
edges_wrote_clean = edges_wrote.loc[mask_wrote].copy()

# Optional per-paper concept top-k (disabled while CONCEPT_TOPK is None)
if CONCEPT_TOPK is None:
    edges_has_topic_clean = edges_has_topic.copy()
else:
    if "score" in edges_has_topic.columns:
        # Highest score first inside each paper, so head() keeps the best concepts
        topic_candidates = edges_has_topic.sort_values(
            ["paper_id","score"], ascending=[True,False]
        )
    else:
        # No score column: fall back to the first rows of each paper in file order
        topic_candidates = edges_has_topic
    per_paper = topic_candidates.groupby("paper_id", as_index=False)
    edges_has_topic_clean = per_paper.head(int(CONCEPT_TOPK)).reset_index(drop=True)

print("edges_wrote (before/after):", len(edges_wrote), "→", len(edges_wrote_clean))
print("edges_has_topic (before/after):", len(edges_has_topic), "→", len(edges_has_topic_clean))

In [None]:
# 8 Build the edge index arrays for each relation
def build_edges(edge_df, src_col, dst_col, src_map, dst_map):
    """Translate an edge table into an ``(n_edges, 2)`` int64 array of node indices.

    Args:
        edge_df: Frame with one edge per row.
        src_col: Column holding the source ids.
        dst_col: Column holding the destination ids.
        src_map: Mapping from string source id to index.
        dst_map: Mapping from string destination id to index.

    Returns:
        Array whose rows are ``[src_index, dst_index]``. Edges with an endpoint
        missing from its map are dropped; the shape is ``(0, 2)`` if none remain.
    """
    # Look the ids up as strings: the maps are keyed by str, so mapping the raw
    # column would turn non-string ids (e.g. ints) into NaN and then into
    # garbage integers after the int64 cast.
    src_idx = edge_df[src_col].astype(str).map(src_map)
    dst_idx = edge_df[dst_col].astype(str).map(dst_map)
    # Keep only edges whose two endpoints were both found
    keep = src_idx.notna() & dst_idx.notna()
    E = np.stack([src_idx[keep].to_numpy(), dst_idx[keep].to_numpy()], axis=1).astype(np.int64)
    return E

# Author->Paper edges, plus the same edges with the two columns swapped (Paper->Author)
E_AP = build_edges(edges_wrote_clean, "author_id", "paper_id", author_id2idx, paper_id2idx)
E_PA = np.ascontiguousarray(E_AP[:, ::-1])

# Paper->Concept edges, plus the swapped Concept->Paper view
E_PC = build_edges(edges_has_topic_clean, "paper_id", "concept_id", paper_id2idx, concept_id2idx)
E_CP = np.ascontiguousarray(E_PC[:, ::-1])

print("E_AP:", E_AP.shape, "E_PA:", E_PA.shape, "E_PC:", E_PC.shape, "E_CP:", E_CP.shape)

In [None]:
# 9 Save the artifacts
# Feature matrices: one archive per node type, array stored under the key "X"
for feature_file, feature_matrix in (
    ("features_papers.npz", X_paper),
    ("features_authors.npz", X_author),
    ("features_concepts.npz", X_concept),
):
    save_npz(OUT_DIR / feature_file, X=feature_matrix)

# Edge indices: one archive per relation direction, array stored under the key "E"
for edge_file, edge_index in (
    ("edges_AP.npz", E_AP),
    ("edges_PA.npz", E_PA),
    ("edges_PC.npz", E_PC),
    ("edges_CP.npz", E_CP),
):
    save_npz(OUT_DIR / edge_file, E=edge_index)

# ID maps (row position = index, value = original id)
for id_column, map_file, ordered_ids in (
    ("paper_id", "map_paper_id.csv", paper_ids),
    ("author_id", "map_author_id.csv", author_ids),
    ("concept_id", "map_concept_id.csv", concept_ids),
):
    pd.DataFrame({id_column: ordered_ids}).to_csv(OUT_DIR / map_file, index=False)

# Meta information describing this preprocessing run
meta = {
    "num_papers":   int(len(paper_ids)),
    "num_authors":  int(len(author_ids)),
    "num_concepts": int(len(concept_ids)),
    "paper_feat_dim":   int(X_paper.shape[1]),
    "author_feat_dim":  int(X_author.shape[1]),
    "concept_feat_dim": int(X_concept.shape[1]),
    "author_one_hot": bool(AUTHOR_ONE_HOT),
    "concept_topk":  int(CONCEPT_TOPK) if CONCEPT_TOPK is not None else None,
    "used_relations": ["A->P","P->A","P->C","C->P"],   # relations used by this run's meta-paths
    "metapaths_targeted": ["P-A-P","P-C-P"]
}
with open(OUT_DIR / "meta.json", "w") as f:
    json.dump(meta, f, indent=2)

print("✅ Saved to:", OUT_DIR)

In [None]:
# Sanity check: print the first few ids of each node type in index order
print("paper_id[0:5]  -> idx:", paper_ids[:5])
print("author_id[0:5] -> idx:", author_ids[:5])
print("concept_id[0:5]-> idx:", concept_ids[:5])

# Edge samples, read back from the saved archives
for name in ("AP", "PA", "PC", "CP"):
    # np.load on an .npz returns an archive object that holds an open file;
    # the context manager closes it once the array has been read.
    with np.load(OUT_DIR / f"edges_{name}.npz") as archive:
        E = archive["E"]
    print(name, ":", E.shape, "sample:", E[:3] if len(E)>0 else E)