In [1]:
import pandas as pd
import os
import torch
from sentence_transformers import SentenceTransformer

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-mpnet-base-v2


In [2]:
df = pd.read_csv('/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/nodes_concepts (1).csv', sep = ',')

print(df['concept_name'])

0             Extension (predicate logic)
1                     Dispersion (optics)
2                     Statistical physics
3                             Mathematics
4                                 Physics
                      ...                
6896                                 Club
6897                        Ordered logit
6898                          Tree kernel
6899                        Kernel method
6900    Kernel embedding of distributions
Name: concept_name, Length: 6901, dtype: object


In [3]:
texts = df["concept_name"].astype(str).tolist()
emb = model.encode(
    texts,
    batch_size=256,
    normalize_embeddings=True,
    show_progress_bar=True
)

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

In [4]:
torch.Tensor(emb).size()

torch.Size([6901, 768])

In [5]:
papers = pd.read_csv('/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/nodes_papers (2).csv')

len(papers['venue_name'].unique())

1338

In [6]:
venue = papers['venue_name'].astype(str).to_list()
emb = model.encode(
    venue,
    batch_size=256,
    normalize_embeddings=True,
    show_progress_bar=True
)

emb_tensor = torch.Tensor(emb)

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [7]:
iccv_idx = []
aaai_idx = []
natcom_idx = []
for i, elements in enumerate(papers['venue_name'].fillna('')):
    if 'ICCV' in elements:
        iccv_idx.append(i)
    elif 'AAAI' in elements:
        aaai_idx.append(i)
    elif 'Nature Communications' in elements:
        natcom_idx.append(i)
print(len(iccv_idx), len(aaai_idx), len(natcom_idx))

149 100 164


In [8]:
iccv_arr = emb_tensor[iccv_idx[0]]
aaai_arr = emb_tensor[aaai_idx[0]]
natcom_arr = emb_tensor[natcom_idx[0]]

print(iccv_arr @ aaai_arr, iccv_arr @ natcom_arr)

tensor(0.3977) tensor(0.1780)


In [9]:
sim = emb_tensor @ emb_tensor.T   # emb_tensor는 이미 L2-normalize 상태라고 가정

N = sim.size(0)

# 2. 대각선 제외 (self-similarity 제거)
mask = torch.eye(N, dtype=torch.bool, device=sim.device)
sim_masked = sim.masked_fill(mask, -float("inf"))  # top 구할 때 제외
sim_masked_min = sim.masked_fill(mask, float("inf"))  # bottom 구할 때 제외

# 3. Top-10 pairs
topk_vals, topk_idx = torch.topk(sim_masked.view(-1), 10)
top_pairs = [(int(i // N), int(i % N), float(v)) for i, v in zip(topk_idx, topk_vals)]

# 4. Bottom-10 pairs
bottomk_vals, bottomk_idx = torch.topk(-sim_masked_min.view(-1), 10)
bottom_pairs = [(int(i // N), int(i % N), float(v)) for i, v in zip(bottomk_idx, -bottomk_vals)]

# 결과 예시 출력
print("Top-10 most similar pairs (excluding self):")
for i, j, v in top_pairs:
    print(f"{i} - {j}: {v:.4f}")

print("\nBottom-10 least similar pairs (excluding self):")
for i, j, v in bottom_pairs:
    print(f"{i} - {j}: {v:.4f}")

Top-10 most similar pairs (excluding self):
85 - 45: 1.0000
45 - 1078: 1.0000
85 - 969: 1.0000
85 - 276: 1.0000
45 - 276: 1.0000
45 - 3120: 1.0000
85 - 1078: 1.0000
45 - 4227: 1.0000
45 - 969: 1.0000
45 - 85: 1.0000

Bottom-10 least similar pairs (excluding self):
2020 - 4696: -0.2051
1413 - 4696: -0.2051
4696 - 1413: -0.2051
4696 - 2020: -0.2051
2621 - 3986: -0.1873
3986 - 2621: -0.1873
2020 - 4236: -0.1822
4236 - 2020: -0.1822
4236 - 1413: -0.1822
1413 - 4236: -0.1822


In [16]:
import numpy as np

D = emb.shape[1]
emb_df = pd.DataFrame(emb, columns=[f"d{i}" for i in range(D)], dtype=np.float32)
df_out = pd.concat([df.reset_index(drop=True), emb_df], axis=1)

# 5) 저장 (CSV/Parquet)
df_out.to_csv("/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/nodes_concepts_with_emb.csv", index=False)

print("저장 완료:", df_out.shape)  # (N, 원래컬럼+임베딩차원)

저장 완료: (6901, 771)


In [38]:
data = np.load('/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/a_feat.npz')

data

NpzFile '/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/a_feat.npz' with keys: row, col, format, shape, data

In [None]:
import scipy.sparse as sp

# sparse matrix로 로드
p_feat = sp.load_npz('/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/a_feat.npz')   # (7167, 1902) csr_matrix
print(p_feat.shape)                  # (7167, 1902)


# 3번째 논문 feature vector
vec3 = p_feat.getrow(3).toarray().flatten() # (1902,)



(7167, 1902)


numpy.ndarray

In [57]:
import pickletools

with open("/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/edges.pkl", "rb") as f:
    pickletools.dis(f)

    0: \x80 PROTO      4
    2: \x95 FRAME      221
   11: }    EMPTY_DICT
   12: \x94 MEMOIZE    (as 0)
   13: (    MARK
   14: \x8c     SHORT_BINUNICODE 'p-a'
   19: \x94     MEMOIZE    (as 1)
   20: \x8c     SHORT_BINUNICODE 'scipy.sparse.csr'
   38: \x94     MEMOIZE    (as 2)
   39: \x8c     SHORT_BINUNICODE 'csr_matrix'
   51: \x94     MEMOIZE    (as 3)
   52: \x93     STACK_GLOBAL
   53: \x94     MEMOIZE    (as 4)
   54: )        EMPTY_TUPLE
   55: \x81     NEWOBJ
   56: \x94     MEMOIZE    (as 5)
   57: }        EMPTY_DICT
   58: \x94     MEMOIZE    (as 6)
   59: (        MARK
   60: \x8c         SHORT_BINUNICODE '_shape'
   68: \x94         MEMOIZE    (as 7)
   69: M            BININT2    18405
   72: M            BININT2    18405
   75: \x86         TUPLE2
   76: \x94         MEMOIZE    (as 8)
   77: \x8c         SHORT_BINUNICODE 'maxprint'
   87: \x94         MEMOIZE    (as 9)
   88: K            BININT1    50
   90: \x8c         SHORT_BINUNICODE 'indices'
   99: \x94        

In [58]:
import numpy as np
import pickle

with open("/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/edges.pkl", "rb", "rb") as f:
    arr = pickle.load(f)  # np.ndarray
print(arr.shape, arr.dtype)

TypeError: 'str' object cannot be interpreted as an integer

In [71]:
df = pd.read_pickle("/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/node_features.pkl")
print(df)

<Compressed Sparse Row sparse matrix of dtype 'int32'
	with 103722 stored elements and shape (18405, 334)>
  Coords	Values
  (0, 10)	1
  (0, 88)	1
  (0, 133)	1
  (0, 197)	1
  (1, 15)	1
  (1, 23)	1
  (1, 81)	1
  (1, 222)	1
  (1, 242)	1
  (2, 23)	1
  (2, 80)	1
  (2, 82)	1
  (2, 131)	1
  (2, 132)	1
  (2, 135)	1
  (2, 162)	1
  (2, 222)	1
  (2, 253)	1
  (2, 281)	1
  (3, 94)	1
  (3, 135)	1
  (3, 242)	1
  (4, 64)	1
  (4, 110)	1
  (4, 159)	1
  :	:
  (18404, 15)	1
  (18404, 23)	1
  (18404, 35)	1
  (18404, 36)	1
  (18404, 49)	1
  (18404, 57)	1
  (18404, 94)	1
  (18404, 107)	1
  (18404, 124)	1
  (18404, 151)	1
  (18404, 170)	1
  (18404, 173)	1
  (18404, 174)	1
  (18404, 184)	1
  (18404, 188)	1
  (18404, 195)	1
  (18404, 220)	1
  (18404, 221)	1
  (18404, 226)	1
  (18404, 241)	1
  (18404, 249)	1
  (18404, 253)	1
  (18404, 272)	1
  (18404, 304)	1
  (18404, 329)	1


In [None]:
df = pd.read_pickle("/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/meta_data.pkl")
print(df)

{'t_info': {'a': {'ind': range(0, 4057), 'cnt': 4057}, 'p': {'ind': range(4057, 18385), 'cnt': 14328}, 'c': {'ind': range(18385, 18405), 'cnt': 20}}, 'rel2id': {'p-a': 0, 'a-p': 1, 'p-c': 2, 'c-p': 3}, 'id2rel': {0: 'p-a', 1: 'a-p', 2: 'p-c', 3: 'c-p'}, 'node2lid': {'a0': 0, 'a1': 1, 'a2': 2, 'a3': 3, 'a4': 4, 'a5': 5, 'a6': 6, 'a7': 7, 'a8': 8, 'a9': 9, 'a10': 10, 'a11': 11, 'a12': 12, 'a13': 13, 'a14': 14, 'a15': 15, 'a16': 16, 'a17': 17, 'a18': 18, 'a19': 19, 'a20': 20, 'a21': 21, 'a22': 22, 'a23': 23, 'a24': 24, 'a25': 25, 'a26': 26, 'a27': 27, 'a28': 28, 'a29': 29, 'a30': 30, 'a31': 31, 'a32': 32, 'a33': 33, 'a34': 34, 'a35': 35, 'a36': 36, 'a37': 37, 'a38': 38, 'a39': 39, 'a40': 40, 'a41': 41, 'a42': 42, 'a43': 43, 'a44': 44, 'a45': 45, 'a46': 46, 'a47': 47, 'a48': 48, 'a49': 49, 'a50': 50, 'a51': 51, 'a52': 52, 'a53': 53, 'a54': 54, 'a55': 55, 'a56': 56, 'a57': 57, 'a58': 58, 'a59': 59, 'a60': 60, 'a61': 61, 'a62': 62, 'a63': 63, 'a64': 64, 'a65': 65, 'a66': 66, 'a67': 67, 'a68'

In [74]:
df = pd.read_pickle("/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/labels.pkl")
print(df)

[[   0    1]
 [   1    3]
 [   2    0]
 ...
 [4054    3]
 [4055    3]
 [4056    2]]


In [76]:
API_KEY = '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'

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# OpenAI 클라이언트 초기화
client = OpenAI(api_key=API_KEY)

# CSV 불러오기
df = pd.read_csv("/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/nodes_papers (2).csv")

# venue_name 컬럼 확인
print(df.columns)

# 임베딩 생성 함수
def get_embedding(text, model="text-embedding-3-large", dim=64):
    if not isinstance(text, str) or text.strip() == "":
        return [0.0] * dim   # venue_name이 비어있을 경우 zero vector
    resp = client.embeddings.create(
        model=model,
        input=text,
        dimensions=dim
    )
    return resp.data[0].embedding

tqdm.pandas(desc="Embedding venue_name")
df["venue_embedding"] = df["venue_name"].progress_apply(lambda x: get_embedding(x))


# paper_id와 embedding만 저장
out_df = df[["paper_id", "venue_embedding"]]

# CSV로 저장

print("저장 완료: nodes_papers_with_venue_embedding.csv")

Index(['paper_id', 'title', 'year', 'type', 'cited_by_count', 'updated_date',
       'venue_name', 'venue_type', 'venue_issn_l', 'abstract_len'],
      dtype='object')


Embedding venue_name:   0%|          | 0/5000 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding venue_name:   0%|          | 2/5000 [00:00<18:14,  4.57it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding venue_name:   0%|          | 3/5000 [00:00<22:41,  3.67it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding venue_name:   0%|          | 4/5000 [00:01<26:12,  3.18it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding venue_name:   0%|          | 5/5000 [00:01<28:26,  2.93it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding venue_name:   0%|          | 6/5000 [00:01<30:52,  2.70it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding venue_name:   0%|          | 7/5000 [00:02<27:58,  2.97it/s]INFO:htt

KeyboardInterrupt: 

In [1]:
import pandas as pd
import ast

# 입력/출력 파일 경로
in_file = "/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/nodes_papers_with_venue_embedding.csv"
out_file = "/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/venue_embed.csv"

# CSV 불러오기
df = pd.read_csv(in_file)

# venue_embedding 컬럼을 파싱해서 리스트로 변환
def parse_embedding(x):
    if isinstance(x, str):
        return ast.literal_eval(x)  # 문자열을 리스트로 변환
    return x

df["venue_embedding"] = df["venue_embedding"].apply(parse_embedding)

# 64차원 벡터를 64개의 컬럼으로 확장
embed_df = pd.DataFrame(df["venue_embedding"].tolist(),
                        columns=[f"venue_{i}" for i in range(64)])

# paper_id와 합치기
out_df = pd.concat([df[["paper_id"]], embed_df], axis=1)

# 결과 저장
out_df.to_csv(out_file, index=False)

print(f"저장 완료: {out_file}")

저장 완료: /Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/venue_embed.csv


In [3]:
import pandas as pd

# 입력 파일 경로
file1 = "/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/venue_embed.csv"
file2 = "/Users/jaehwayang/DSL/Projects/Modeling/graph_workspace/data/cs_abstract_emb.csv"
out_file = "embedding_vectors.csv"

# CSV 불러오기
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

# paper_id 기준으로 merge (inner join: 공통 id만)
merged = pd.merge(df1, df2, on="paper_id", how="inner")

# 저장
merged.to_csv(out_file, index=False)

print(f"병합 완료: {out_file}")

병합 완료: embedding_vectors.csv


In [7]:
df = merged.drop('title', axis = 1)
df.to_csv(out_file, index=False)