In [3]:
import pandas as pd

# ===== Input / output locations =====
# NOTE(review): IN_PATH is an absolute, machine-specific path, so this cell
# only runs where that exact file exists.
IN_PATH = "D:/Download/Academic/pycharmworkspace/aaa/sjk/feature_counts_merged.csv"   # the merged version is the recommended input
OUT_PATH = "./top30_features_overall.csv"

df = pd.read_csv(IN_PATH)

# ===== Overall Top-30: `count` summed per feature over every row of the input =====
# Rank features by their summed count (largest first), keep at most the first
# 30 rows, and renumber the index from 0.
top30 = (
    df.groupby("feature", as_index=False)["count"]
      .sum()
      .sort_values(by="count", ascending=False)
      .iloc[:30]
      .reset_index(drop=True)
)

# utf-8-sig writes a BOM at the start of the CSV.
top30.to_csv(OUT_PATH, encoding="utf-8-sig", index=False)
print("Saved:", OUT_PATH)
print(top30)


Saved: ./top30_features_overall.csv
                   feature  count
0                      age     52
1                      bun     44
2                  lactate     43
3               creatinine     41
4              respiratory     40
5                    heart     40
6                 platelet     38
7              temperature     35
8                      wbc     35
9                   sodium     30
10                     spo     29
11              hemoglobin     28
12                    aptt     27
13  mechanical_ventilation     25
14                     sbp     25
15                    sofa     24
16                     inr     24
17               anion_gap     23
18               potassium     22
19                     map     22
20                      pt     22
21                     gcs     20
22                 glucose     20
23                chloride     18
24                      ph     17
25            urine_output     17
26             bicarbonate     17
27          

In [5]:
import json
import pandas as pd
import re
from collections import defaultdict, Counter

# ========================
# File paths
# ========================
# Graph JSON loaded by this cell; it is expected to hold "nodes" and "edges".
GRAPH_PATH = "./graph_data.json"
# Top-30 feature table loaded by this cell (a CSV with a "feature" column).
TOP30_PATH = "./top30_features_overall.csv"
# CSV written at the end of this cell. Note that this rebinds OUT_PATH.
OUT_PATH = "high_frequency_feature_cheatsheet_engineering_top30.csv"

# ========================
# Helper: feature-name normalisation
# ========================
def canon(s: str) -> str:
    """Return a lowercase snake_case key for a feature name.

    Falsy input (``None``, ``""``, ``0``) yields ``""``. Any other value is
    stringified, trimmed and lowercased. Spaces and every run of characters
    outside ``[a-z0-9_]`` become underscores, consecutive underscores are
    merged into one, and underscores at either end are removed.
    """
    text = (str(s) if s else "").strip().lower().replace(" ", "_")
    # Inner substitution replaces disallowed runs; the outer one merges the
    # underscore runs that this can leave behind.
    collapsed = re.sub(r"_+", "_", re.sub(r"[^a-z0-9_]+", "_", text))
    return collapsed.strip("_")

# ========================
# 1. Load inputs
# ========================
with open(GRAPH_PATH, "r", encoding="utf-8") as graph_file:
    graph = json.load(graph_file)

# The graph JSON must provide both a "nodes" and an "edges" entry.
nodes, edges = graph["nodes"], graph["edges"]

# Attach a normalised key to every Top-30 row so it can be matched against
# the normalised labels of the graph's feature nodes.
top30 = pd.read_csv(TOP30_PATH).assign(
    feature_canon=lambda frame: frame["feature"].apply(canon)
)

# ========================
# 2. Node lookup tables
# ========================
# node id -> group (None when the node carries no "group" key).
node_group = {node["id"]: node.get("group") for node in nodes}

# node id -> label as a stripped string ("" when the node carries no "label" key).
node_label = {node["id"]: str(node.get("label", "")).strip() for node in nodes}

# ========================
# 3. Build graph relations
# ========================
# model id -> ids of the databases linked to it by "db_model" edges
model_to_dbs = defaultdict(set)

# feature id -> model id -> accumulated edge value
feat_model_weight = defaultdict(lambda: defaultdict(float))
# feature id -> ids of the models linked to it by "model_feature" edges
feat_models = defaultdict(set)

for edge in edges:
    kind = edge.get("type")
    src = edge["from"]
    dst = edge["to"]
    # A missing or falsy "value" counts as weight 1.
    # NOTE(review): `or 1` also turns an explicit value of 0 into 1 - confirm that is intended.
    weight = float(edge.get("value", 1) or 1)

    # An edge is only used when both endpoints belong to the groups its type implies.
    if (
        kind == "db_model"
        and node_group.get(src) == "database"
        and node_group.get(dst) == "model"
    ):
        model_to_dbs[dst].add(src)
    elif (
        kind == "model_feature"
        and node_group.get(src) == "model"
        and node_group.get(dst) == "feature"
    ):
        feat_model_weight[dst][src] += weight
        feat_models[dst].add(src)

# ========================
# 4. Feature id -> canonical name
# ========================
# Only nodes whose group is "feature" get an entry; the canonical name is the
# normalised node label.
fid_to_canon = {}
for node_id, group_name in node_group.items():
    if group_name == "feature":
        fid_to_canon[node_id] = canon(node_label[node_id])

# ========================
# 5. Engineering metrics (WUF / MC / DC only)
# ========================
# Every table is keyed by canonical feature name, so feature nodes whose
# labels normalise to the same name are merged.
WUF = Counter()                 # sum of model->feature edge values
MC = defaultdict(set)           # ids of the models that use the feature
DC = defaultdict(set)           # ids of the databases linked to those models
model_weight = defaultdict(lambda: defaultdict(float))   # canonical name -> model id -> weight

for feature_id, model_ids in feat_models.items():
    canon_name = fid_to_canon.get(feature_id)
    if canon_name:
        per_model = feat_model_weight[feature_id]
        WUF[canon_name] += sum(per_model.values())

        for model_id in model_ids:
            MC[canon_name].add(model_id)
            model_weight[canon_name][model_id] += per_model[model_id]

            # Databases reach a feature only through the models that use it.
            for db_id in model_to_dbs.get(model_id, []):
                DC[canon_name].add(db_id)

def top_models(fcan, k=3):
    """Return the labels of the ``k`` heaviest models for a canonical feature.

    Models are ranked by their accumulated weight in ``model_weight[fcan]``,
    largest first. Equal weights are ordered by model label and then by the
    string form of the model id. Without that tie-break the order of tied
    models followed the insertion order of ``model_weight[fcan]``, which is
    filled while iterating a set and so is not guaranteed to be stable
    between interpreter runs.

    Args:
        fcan: Canonical feature name.
        k: Maximum number of labels to return.

    Returns:
        A list of at most ``k`` model labels; empty when ``fcan`` has no
        entry in ``model_weight``.
    """
    ranked = sorted(
        model_weight.get(fcan, {}).items(),
        # Negated weight gives descending order while label/id stay ascending.
        key=lambda item: (-item[1], node_label[item[0]], str(item[0])),
    )
    return [node_label[mid] for mid, _ in ranked[:k]]

def top_dbs(fcan):
    """Return the labels of every database recorded for a canonical feature.

    Despite the name, nothing is truncated: all ids in ``DC[fcan]`` are
    returned. They are ordered by database id, not by label. An unknown
    ``fcan`` gives an empty list.
    """
    db_ids = list(DC.get(fcan, []))
    db_ids.sort()
    return [node_label[db_id] for db_id in db_ids]

# ========================
# 6. Build the cheat sheet (engineering view, compact)
# ========================
# One record per Top-30 feature. A feature with no match in the graph gets
# WUF 0.0, MC 0, DC 0 and empty model/database strings.
rows = [
    {
        "feature": name,
        "WUF": float(WUF.get(name, 0)),
        "MC": len(MC.get(name, set())),
        "DC": len(DC.get(name, set())),
        "top_models": ", ".join(top_models(name)),
        "top_dbs": ", ".join(top_dbs(name)),
    }
    for name in top30["feature_canon"]
]

# Engineering ordering: WUF first, then MC, then DC, all descending.
out = (
    pd.DataFrame(rows)
      .sort_values(["WUF", "MC", "DC"], ascending=False)
      .reset_index(drop=True)
)

# ========================
# 7. Export
# ========================
out.to_csv(OUT_PATH, encoding="utf-8-sig", index=False)
print("Saved:", OUT_PATH)


Saved: high_frequency_feature_cheatsheet_engineering_top30.csv
