
# TNIC 유사도 네트워크 평가 + 시각화 노트북

이 노트북은
1) TF-IDF / BERT / LLM 등으로 생성된 edge(jsonl)를 불러오고  
2) 재무지표 엑셀과 merge 한 뒤 간단 평가 지표를 계산하고  
3) 마지막에 네트워크 시각화를 확인할 수 있도록 구성되어 있습니다.


## 0. 환경 준비

In [None]:

import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Let wide tables print without column truncation or early line wrapping.
for _option, _value in {"display.max_columns": 200, "display.width": 200}.items():
    pd.set_option(_option, _value)

# (선택) Colab에서 Drive를 쓰는 경우에만 실행하세요.
# from google.colab import drive
# drive.mount('/content/drive')


## 1. 입력 경로 설정

In [None]:
# Financial-metrics workbook (minimum columns: PeerName, year, K, W).
METRICS_XLSX = "evaluation/evaluation_metrics_1128.xlsx"

# Edge file for each similarity method.
METHOD_FILES = dict(
    TFIDF="output/v2/tfidf/TFIDF_2024.jsonl",
    BERT="output/v2/tfidf/BERT_2024_thr0975.jsonl",
    LLM="output/v2/tfidf/OPENAI_2024_thr084.jsonl",
)

# (Optional) Set a year to analyse only that year; None keeps every year.
YEAR_FILTER = None  # e.g. 2024

# Echo the configuration so the cell output records which inputs were used.
print(f"METRICS_XLSX: {METRICS_XLSX}")
print(f"METHOD_FILES: {METHOD_FILES}")


## 2. 데이터 로드

In [None]:

def read_edges_jsonl(path: "str | os.PathLike[str]") -> pd.DataFrame:
    """Load an edge list stored as JSON Lines into a DataFrame.

    Args:
        path: Location of the ``.jsonl`` file, given as a string or a
            path-like object.

    Returns:
        A DataFrame with one row per JSON record in the file.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing file.
    """
    # Normalise to Path so a plain string is always treated as a file location
    # (never as literal JSON text) and a wrong path fails with a clear message.
    path = Path(path)
    if not path.is_file():
        raise FileNotFoundError(f"Edge file not found: {path}")
    return pd.read_json(path, lines=True)

# Read each method's edge file and report its size and leading columns.
dfs = {}
for method, edge_path in METHOD_FILES.items():
    edges = read_edges_jsonl(edge_path)
    dfs[method] = edges
    print(method, "rows:", len(edges), "cols(head):", list(edges.columns[:15]))

# Financial metrics table that is later joined onto both endpoints of each edge.
df_T = pd.read_excel(METRICS_XLSX)
print("metrics rows:", len(df_T), "cols:", list(df_T.columns))
df_T.head()


## 3. 재무지표 merge (firm_i / firm_j 각각 붙이기)

In [None]:

def merge_with_metrics(df_edges: pd.DataFrame, df_metrics: pd.DataFrame) -> pd.DataFrame:
    """Attach the K and W metrics of both endpoint firms to every edge.

    Args:
        df_edges: Edge table containing ``firm_i_name``, ``firm_j_name`` and
            ``year`` columns.
        df_metrics: Metrics table containing ``PeerName``, ``year``, ``K`` and
            ``W`` columns.

    Returns:
        A new DataFrame with the columns of ``df_edges`` followed by ``K_i``,
        ``W_i``, ``K_j`` and ``W_j``. Endpoints without a matching
        ``(PeerName, year)`` row get NaN in their metric columns.
    """
    key_cols = ["PeerName", "year"]
    # Keep one metrics row per (firm, year): repeated keys on the right-hand
    # side of a left merge would otherwise duplicate edges and distort every
    # statistic computed downstream. The first occurrence wins.
    base = df_metrics[key_cols + ["K", "W"]].drop_duplicates(subset=key_cols, keep="first")

    out = df_edges
    for side in ("i", "j"):
        name_col = f"firm_{side}_name"
        side_metrics = base.rename(
            columns={"PeerName": name_col, "K": f"K_{side}", "W": f"W_{side}"}
        )
        out = out.merge(side_metrics, how="left", on=[name_col, "year"])
    return out

# Apply the optional year filter, then join the metrics onto each method's edges.
dfs_m = {}
for method, edges in dfs.items():
    selected = edges.copy()
    filter_by_year = YEAR_FILTER is not None and "year" in selected.columns
    if filter_by_year:
        selected = selected.loc[selected["year"] == YEAR_FILTER].copy()
    merged = merge_with_metrics(selected, df_T)
    dfs_m[method] = merged
    print(method, "after merge rows:", len(merged))

dfs_m["LLM"].head()



## 4. 평가 지표 계산 (기본 버전)

노트북 단독 사용을 위해 범용 지표를 제공합니다.

- abs_delta_K = |K_i - K_j|
- abs_delta_W = |W_i - W_j|
- missing_rate_K / missing_rate_W = merge 후 두 기업 중 한쪽이라도 K(또는 W)가 결측이라 abs_delta를 계산하지 못한 edge의 비율
- similarity 분포 요약 (mean / median / p90 / p95)


In [None]:

def add_eval_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with pairwise metric gaps and self-edges removed.

    Args:
        df: Edge table, normally already merged with the ``K_i``, ``K_j``,
            ``W_i`` and ``W_j`` columns. Any of those that are absent are
            created as all-NaN columns.

    Returns:
        A new DataFrame with ``abs_delta_K`` (= ``|K_i - K_j|``) and
        ``abs_delta_W`` (= ``|W_i - W_j|``) added, and rows whose two endpoints
        are identical dropped. The input is not modified.
    """
    out = df.copy()
    for col in ("K_i", "K_j", "W_i", "W_j"):
        if col not in out.columns:
            out[col] = np.nan

    out["abs_delta_K"] = (out["K_i"] - out["K_j"]).abs()
    out["abs_delta_W"] = (out["W_i"] - out["W_j"]).abs()

    # Drop self-edges, if any. Prefer the firm id columns and fall back to the
    # firm name columns, the same fallback the network cells use for node keys.
    for col_i, col_j in (("firm_i", "firm_j"), ("firm_i_name", "firm_j_name")):
        if col_i in out.columns and col_j in out.columns:
            out = out[out[col_i] != out[col_j]].copy()
            break
    return out

# Add the evaluation columns to every merged edge table, keyed by method name.
dfs_e = {}
for method_name, merged_edges in dfs_m.items():
    dfs_e[method_name] = add_eval_cols(merged_edges)

def summarize_method(df: pd.DataFrame, method_name: str) -> dict:
    """Build a one-row summary of an evaluated edge table.

    Args:
        df: Edge table carrying ``abs_delta_K`` / ``abs_delta_W`` and,
            optionally, ``similarity``.
        method_name: Label stored under the ``"method"`` key.

    Returns:
        A dict with the row count, the share of rows whose ``abs_delta_K`` /
        ``abs_delta_W`` is NaN, and the mean, median, 90th and 95th percentile
        of ``similarity``. Statistics that cannot be computed (no rows, or no
        ``similarity`` column) are NaN.
    """
    n_rows = len(df)
    if "similarity" in df.columns:
        scores = df["similarity"]
    else:
        scores = pd.Series(dtype=float)
    has_scores = len(scores) > 0

    def missing_share(column: str) -> float:
        # The column is only touched when the table has rows.
        if not n_rows:
            return np.nan
        return float(df[column].isna().mean())

    def score_stat(compute) -> float:
        if not has_scores:
            return np.nan
        return float(compute(scores))

    stats = {"method": method_name, "rows": n_rows}
    stats["missing_rate_K"] = missing_share("abs_delta_K")
    stats["missing_rate_W"] = missing_share("abs_delta_W")
    stats["sim_mean"] = score_stat(lambda s: s.mean())
    stats["sim_median"] = score_stat(lambda s: s.median())
    stats["sim_p90"] = score_stat(lambda s: s.quantile(0.90))
    stats["sim_p95"] = score_stat(lambda s: s.quantile(0.95))
    return stats

# One summary row per method, shown as a table.
summary_rows = [summarize_method(df, name) for name, df in dfs_e.items()]
summary = pd.DataFrame(summary_rows)
summary


### 4.1 Top-N edge만 평가 (similarity 상위 N개)

In [None]:

TOP_N = 20000  # adjust to the size of the data

def topn_eval(df: pd.DataFrame, n: int) -> dict:
    """Summarise the metric gaps of the ``n`` most similar edges.

    Args:
        df: Edge table with ``similarity``, ``abs_delta_K`` and
            ``abs_delta_W`` columns.
        n: Number of highest-similarity edges to evaluate.

    Returns:
        A dict with ``top_n`` and the mean / median of ``abs_delta_K`` and
        ``abs_delta_W`` over the selected edges. The same keys are returned,
        filled with NaN, when ``df`` is empty or has no ``similarity`` column,
        so every method yields the same table schema.
    """
    metric_keys = (
        "mean_abs_delta_K",
        "mean_abs_delta_W",
        "median_abs_delta_K",
        "median_abs_delta_W",
    )
    if "similarity" not in df.columns or len(df) == 0:
        return {"top_n": n, **dict.fromkeys(metric_keys, np.nan)}

    # A stable sort makes the selection deterministic when similarities tie at
    # the cut-off: tied rows keep their original order.
    top = df.sort_values("similarity", ascending=False, kind="stable").head(n)
    return {
        "top_n": n,
        "mean_abs_delta_K": float(top["abs_delta_K"].mean()),
        "mean_abs_delta_W": float(top["abs_delta_W"].mean()),
        "median_abs_delta_K": float(top["abs_delta_K"].median()),
        "median_abs_delta_W": float(top["abs_delta_W"].median()),
    }

# One top-N row per method: the method label followed by the top-N statistics.
topn_table = [
    {"method": name, **topn_eval(df, TOP_N)}
    for name, df in dfs_e.items()
]

pd.DataFrame(topn_table)


### 4.2 similarity vs |K_i-K_j| / |W_i-W_j| 상관(참고용)

In [None]:

def corr_safe(df: pd.DataFrame, x: str, y: str) -> float:
    """Correlate columns ``x`` and ``y`` over rows where both are present.

    Args:
        df: Table containing both columns.
        x: Name of the first column.
        y: Name of the second column.

    Returns:
        The correlation from ``Series.corr`` as a float, or NaN when fewer
        than three complete rows remain after dropping missing values.
    """
    complete = df.loc[:, [x, y]].dropna()
    if len(complete) >= 3:
        return float(complete[x].corr(complete[y]))
    return np.nan

# Correlation between similarity and each metric gap, one row per method.
corr_rows = [
    {
        "method": name,
        **{
            f"corr(sim, {gap_col})": corr_safe(df, "similarity", gap_col)
            for gap_col in ("abs_delta_K", "abs_delta_W")
        },
    }
    for name, df in dfs_e.items()
]

pd.DataFrame(corr_rows)


### 4.3 간단 플롯(분포/산점도)

In [None]:

METHOD_TO_PLOT = "LLM"  # "TFIDF" / "BERT" / "LLM"
SAMPLE_N = 30000
dfp = dfs_e[METHOD_TO_PLOT].copy()

# Histogram of the similarity scores of the selected method.
plt.figure(figsize=(6, 4))
dfp["similarity"].dropna().hist(bins=50)
plt.title(f"Similarity distribution: {METHOD_TO_PLOT}")
plt.xlabel("similarity")
plt.ylabel("count")
plt.show()

# Complete rows only, down-sampled with a fixed seed to at most SAMPLE_N points.
d2 = dfp[["similarity", "abs_delta_K", "abs_delta_W"]].dropna()
if len(d2) > SAMPLE_N:
    d2 = d2.sample(SAMPLE_N, random_state=42)

# One scatter plot per metric: similarity against the absolute gap.
for metric in ("K", "W"):
    plt.figure(figsize=(6, 4))
    plt.scatter(d2["similarity"], d2[f"abs_delta_{metric}"], s=3, alpha=0.3)
    plt.title(f"Similarity vs |{metric}_i-{metric}_j| (sample): {METHOD_TO_PLOT}")
    plt.xlabel("similarity")
    plt.ylabel(f"|{metric}_i - {metric}_j|")
    plt.show()


## 5. (시각화) 한글 폰트 설정

In [None]:

import matplotlib.font_manager as fm

# (선택) Colab/Ubuntu 기준 폰트 설치가 필요하면 아래를 실행
# !apt-get update -y
# !apt-get install -y fonts-nanum

font_candidates = [
    "/usr/share/fonts/truetype/nanum/NanumGothic.ttf",
    "/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf",
]

# First candidate that exists on disk, or None when neither is installed.
font_path = next(
    (candidate for candidate in font_candidates if os.path.exists(candidate)),
    None,
)

if not font_path:
    font_name = None
    print("Nanum font not found. Proceeding with default matplotlib font.")
else:
    # Register the font file with matplotlib and make it the default family.
    fm.fontManager.addfont(font_path)
    font_name = fm.FontProperties(fname=font_path).get_name()
    plt.rcParams.update({"font.family": font_name, "axes.unicode_minus": False})
    print("Using font:", font_name, "(", font_path, ")")



## 6. (시각화) 중심 기업 네트워크

- CENTER_NAME이 None이면, SIM_THRESHOLD를 적용한 뒤 남은 edge에 가장 많이 등장하는 기업을 자동으로 중심으로 잡습니다.
- SIM_THRESHOLD(edge로 인정할 최소 similarity)와 TOP_K_PER_NODE(노드별로 남길 similarity 상위 edge 수)로 sparsify 강도를 조절합니다.


In [None]:

import networkx as nx

METHOD_VIZ = "LLM"   # "TFIDF" / "BERT" / "LLM"
YEAR_VIZ = YEAR_FILTER  # None means no year filtering
CENTER_NAME = None   # e.g. "동성화인텍"
SIM_THRESHOLD = 0.88
TOP_K_PER_NODE = 5

df = dfs_e[METHOD_VIZ].copy()
if YEAR_VIZ is not None and "year" in df.columns:
    df = df[df["year"] == YEAR_VIZ].copy()

# Keep only edges at or above the similarity threshold.
df = df[df["similarity"] >= SIM_THRESHOLD].copy()
print("edges after threshold:", len(df))

if len(df) == 0:
    raise ValueError("No edges left after threshold. Lower SIM_THRESHOLD.")

# Default center: the firm that appears most often among the remaining edges.
if CENTER_NAME is None:
    counts = pd.concat([df["firm_i_name"], df["firm_j_name"]]).value_counts()
    CENTER_NAME = counts.index[0]
print("CENTER_NAME =", CENTER_NAME)

mask_center = (df["firm_i_name"] == CENTER_NAME) | (df["firm_j_name"] == CENTER_NAME)
df_center = df[mask_center].copy()

# A center with no surviving edges would otherwise yield a silent empty plot.
if df_center.empty:
    raise ValueError(
        f"CENTER_NAME {CENTER_NAME!r} has no edges at SIM_THRESHOLD={SIM_THRESHOLD}. "
        "Choose another firm or lower SIM_THRESHOLD."
    )

# Direct neighbours of the center (every other firm on one of its edges).
neighbors = pd.unique(df_center[["firm_i_name", "firm_j_name"]].values.ravel())
neighbors = [n for n in neighbors if n != CENTER_NAME]

# NOTE(review): `|` keeps any edge with at least one endpoint in the ego set, so
# firms two hops from the center can appear; use `&` if a strict ego network
# (both endpoints in the set) is intended - confirm.
ego_names = [CENTER_NAME] + neighbors
mask_sub = df["firm_i_name"].isin(ego_names) | df["firm_j_name"].isin(ego_names)
df_sub = df[mask_sub].copy()

df_sorted = df_sub.sort_values("similarity", ascending=False)

# Group by firm id when available, otherwise by firm name.
grp_i = "firm_i" if "firm_i" in df_sorted.columns else "firm_i_name"
grp_j = "firm_j" if "firm_j" in df_sorted.columns else "firm_j_name"

# Sparsify: keep the TOP_K_PER_NODE most similar edges of each firm, once in its
# role as source and once as target, then merge the two selections.
keep_i = df_sorted.groupby(grp_i).head(TOP_K_PER_NODE)
keep_j = df_sorted.groupby(grp_j).head(TOP_K_PER_NODE)
df_sub = pd.concat([keep_i, keep_j]).drop_duplicates()

print("subgraph edges:", len(df_sub))
df_sub.head()


In [None]:

# Use firm ids as node keys when available; otherwise fall back to firm names.
src = "firm_i" if "firm_i" in df_sub.columns else "firm_i_name"
tgt = "firm_j" if "firm_j" in df_sub.columns else "firm_j_name"

G = nx.from_pandas_edgelist(
    df_sub,
    source=src,
    target=tgt,
    edge_attr="similarity",
    create_using=nx.Graph(),
)

# Node attributes: display name and sector (W).
endpoint_cols = ((src, "firm_i_name", "W_i"), (tgt, "firm_j_name", "W_j"))
for _, r in df_sub.iterrows():
    for node_col, name_col, sector_col in endpoint_cols:
        node = r.get(node_col)
        if node not in G:
            continue
        # Record only non-missing values so that a later row with a missing
        # name/sector cannot erase a value found on an earlier row.
        for attr, value in (("name", r.get(name_col)), ("sector", r.get(sector_col))):
            if pd.notna(value):
                G.nodes[node][attr] = value

# Fallbacks for nodes that never received a usable value.
for n in G.nodes():
    G.nodes[n].setdefault("sector", "Unknown")
    G.nodes[n].setdefault("name", str(n))

# Sectors can mix numeric W values with the "Unknown" string, which plain
# sorted() cannot compare; order numbers first, then strings.
sectors = sorted(
    {G.nodes[n]["sector"] for n in G.nodes()},
    key=lambda s: (isinstance(s, str), s),
)
cmap = plt.get_cmap("tab20", max(1, len(sectors)))
sector_to_color = {s: cmap(i) for i, s in enumerate(sectors)}
node_colors = [sector_to_color[G.nodes[n]["sector"]] for n in G.nodes()]
# Edge width is proportional to similarity.
edge_widths = [float(G[u][v]["similarity"]) * 5 for u, v in G.edges()]

pos = nx.spring_layout(G, k=0.8, seed=42)

# Draw the center firm larger than every other node.
node_sizes = [900 if G.nodes[n]["name"] == CENTER_NAME else 320 for n in G.nodes()]

plt.figure(figsize=(12, 9))
nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=node_sizes, alpha=0.9)
nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.55)

labels = {n: G.nodes[n]["name"] for n in G.nodes()}
nx.draw_networkx_labels(G, pos, labels=labels, font_size=10)

plt.title(f"[{METHOD_VIZ}] Center similarity network: {CENTER_NAME} (th={SIM_THRESHOLD}, topk={TOP_K_PER_NODE})")
plt.axis("off")
plt.tight_layout()
plt.show()



## 7. (선택) cap/WICS 네트워크

데이터에 `cap`, `WICS` 컬럼(그리고 `firm_i`, `firm_j`, `similarity`)이 있을 때만 사용하세요.  
없으면 이 섹션은 건너뛰시면 됩니다.


In [None]:

import networkx as nx

METHOD_CAP = "BERT"
# NOTE(review): falls back to 2019 when YEAR_FILTER is None, while the configured
# edge files carry 2024 in their names - confirm the intended default year.
YEAR_SEL = YEAR_FILTER if YEAR_FILTER is not None else 2019
SIM_TH = 0.3

dfc = dfs.get(METHOD_CAP, None)
if dfc is None:
    raise ValueError(f"Unknown method: {METHOD_CAP}")

dfc = dfc.copy()
if "year" in dfc.columns:
    dfc = dfc[dfc["year"] == YEAR_SEL].copy()

needed = {"cap", "WICS", "firm_i", "firm_j", "similarity"}
missing = needed - set(dfc.columns)
if not missing:
    dfc = dfc[dfc["similarity"] >= SIM_TH].copy()
    print("edges:", len(dfc))

if missing:
    print("cap/WICS plot skipped. Missing columns:", missing)
elif dfc.empty:
    # Without edges the size normalisation below would call max() on an empty array.
    print(f"cap/WICS plot skipped. No edges for year={YEAR_SEL} with similarity >= {SIM_TH}.")
else:
    # One attribute row per node: the first occurrence of each firm wins.
    # NOTE(review): the row's WICS/cap values are attached to both firm_i and
    # firm_j; confirm these columns really describe both endpoints.
    nodes_i = dfc[["firm_i", "WICS", "cap"]].rename(columns={"firm_i": "node"})
    nodes_j = dfc[["firm_j", "WICS", "cap"]].rename(columns={"firm_j": "node"})
    nodes_df = pd.concat([nodes_i, nodes_j], ignore_index=True).drop_duplicates(subset="node").set_index("node")

    wics_attr = nodes_df["WICS"].to_dict()
    cap_attr = nodes_df["cap"].to_dict()

    G2 = nx.from_pandas_edgelist(dfc, source="firm_i", target="firm_j", edge_attr="similarity", create_using=nx.Graph())
    nx.set_node_attributes(G2, wics_attr, "WICS")
    nx.set_node_attributes(G2, cap_attr, "cap")

    pos2 = nx.spring_layout(G2, k=0.25, weight="similarity", iterations=100, seed=42)

    nodes = list(G2.nodes())
    wics = np.array([G2.nodes[n].get("WICS", np.nan) for n in nodes], dtype=float)
    caps = np.array([G2.nodes[n].get("cap", np.nan) for n in nodes], dtype=float)

    # Missing cap -> smallest observed cap (or 1.0 if none); missing WICS -> 0.
    if np.isnan(caps).any():
        non = caps[~np.isnan(caps)]
        caps[np.isnan(caps)] = np.nanmin(non) if len(non) else 1.0
    if np.isnan(wics).any():
        wics[np.isnan(wics)] = 0.0

    # Node size: log-scaled cap, min-max normalised into the range [60, 1960].
    cap_log = np.log1p(caps)
    denom = (cap_log.max() - cap_log.min()) + 1e-9
    cap_norm = (cap_log - cap_log.min()) / denom
    node_sizes = 60 + cap_norm * 1900

    # plt.get_cmap replaces plt.cm.get_cmap, which newer Matplotlib releases no
    # longer provide.
    cmap = plt.get_cmap("tab20", 20)

    # Colour each edge by the WICS value of its first endpoint, mapping 1..20 onto 0..1.
    edge_colors = []
    for u, v in G2.edges():
        wu = G2.nodes[u].get("WICS", 0)
        try:
            w_norm = (float(wu) - 1) / 19
        except (TypeError, ValueError):
            # Non-numeric WICS value: fall back to the first colour.
            w_norm = 0.0
        edge_colors.append(cmap(w_norm))

    plt.figure(figsize=(14, 14))
    nx.draw_networkx_edges(
        G2, pos2,
        edge_color=edge_colors,
        width=[float(G2[u][v]["similarity"]) * 1.5 for u, v in G2.edges()],
        alpha=0.35,
    )
    nx.draw_networkx_nodes(
        G2, pos2,
        nodelist=nodes,
        node_size=node_sizes,
        node_color=wics,
        cmap=cmap,
        vmin=1, vmax=20,
        linewidths=0.2,
        edgecolors="black",
        alpha=0.95,
    )

    plt.title(f"[{METHOD_CAP}] cap/WICS network (year={YEAR_SEL}, th={SIM_TH})\nnode color=WICS, node size=cap, edge=similarity")
    plt.axis("off")
    plt.tight_layout()
    plt.show()
