In [5]:
import os
import pandas as pd
from pathlib import Path
import networkx as nx

from config.settings import *

STRING_DIR = DATA_DIR / "prior_knowledge_network_data" / "STRING"
TRRUST_DIR = DATA_DIR / "prior_knowledge_network_data" / "TRRUST"
KEGG_DIR = DATA_DIR / "prior_knowledge_network_data" / "KEGG"

string_pkn_file = STRING_DIR / "string_mouse_pkn.csv"
trrust_pkn_file = TRRUST_DIR / "trrust_mouse_pkn.csv"
kegg_pkn_file = KEGG_DIR / "kegg_mouse_pkn.csv"

string_pkn_graphml = STRING_DIR / "string_mouse_pkn.graphml"
trrust_pkn_graphml = TRRUST_DIR / "trrust_mouse_pkn.graphml"
kegg_pkn_graphml = KEGG_DIR / "kegg_mouse_pkn.graphml"

In [2]:
string_pkn_df = pd.read_csv(string_pkn_file)
string_pkn_df.head()

Unnamed: 0,source_id,target_id,string_neighborhood_score,string_fusion_score,string_cooccurence_score,string_coexpression_score,string_experimental_score,string_database_score,string_textmining_score,string_combined_score
0,GNAI3,RGS4,0,0,0,56,594,500,492,0.889
1,GNAI3,CMTM4,0,0,0,0,0,0,163,0.163
2,GNAI3,ARL5A,0,0,0,110,117,0,65,0.201
3,GNAI3,DRD2,0,0,0,0,604,900,301,0.969
4,GNAI3,GRM8,0,0,0,48,228,0,84,0.267


In [3]:
trrust_pkn_df = pd.read_csv(trrust_pkn_file)
trrust_pkn_df.head()

Unnamed: 0,source_id,target_id,trrust_sign,trrust_regulation,trrust_pmids,trrust_support_n
0,AATF,BAK1,0,Unknown,22983126,1
1,AATF,BAX,0,Unknown,22983126,1
2,AATF,BBC3,0,Unknown,22983126,1
3,AATF,CDKN1A,0,Unknown,21317046,1
4,AATF,TPT1,1,Activation,17157788,1


In [4]:
kegg_pkn_df = pd.read_csv(kegg_pkn_file)
kegg_pkn_df.head()

Unnamed: 0,source_id,target_id,kegg_signal,kegg_n_pathways,kegg_pathways
0,CPD:C00031,GCK,0,2,"mmu00500,mmu00524"
1,CPD:C00031,HK1,0,2,"mmu00500,mmu00524"
2,CPD:C00031,HK2,0,2,"mmu00500,mmu00524"
3,CPD:C00031,HK3,0,2,"mmu00500,mmu00524"
4,CPD:C00031,HKDC1,0,2,"mmu00500,mmu00524"


In [8]:
# --- Harmonize key columns ---
# Ensure consistent TF/TG-style naming
trrust_pkn_df.rename(columns={"source": "source_id", "target": "target_id"}, inplace=True)
kegg_pkn_df.rename(columns={"source": "source_id", "target": "target_id"}, inplace=True)
string_pkn_df.rename(columns={"source": "source_id", "target": "target_id"}, inplace=True)

trrust_pkn_df["source_db"] = "TRRUST"
kegg_pkn_df["source_db"] = "KEGG"
string_pkn_df["source_db"] = "STRING"

# Optional: case-normalize
for df in [trrust_pkn_df, kegg_pkn_df, string_pkn_df]:
    df["source_id"] = df["source_id"].str.upper()
    df["target_id"] = df["target_id"].str.upper()

# --- Select canonical columns ---
def select_common_columns(df):
    keep = [c for c in df.columns if c in {"source_id", "target_id", "source_db"} or c.endswith("_sign") or c.endswith("_score") or c in {"signal"}]
    return df[keep]

trrust_pkn_df = select_common_columns(trrust_pkn_df)
kegg_pkn_df   = select_common_columns(kegg_pkn_df)
string_pkn_df = select_common_columns(string_pkn_df)

# --- Merge all sources ---
merged_df = pd.concat([trrust_pkn_df, kegg_pkn_df, string_pkn_df], ignore_index=True)

# Drop perfect duplicates (same source-target pair + identical source_db)
merged_df.drop_duplicates(subset=["source_id", "target_id", "source_db"], inplace=True)

print(f"Unified PKN: {len(merged_df):,} edges across {merged_df['source_db'].nunique()} sources")

# --- Save outputs ---
merged_df.to_csv("merged_pkn.csv", index=False)
G_merged = nx.from_pandas_edgelist(merged_df, source="source_id", target="target_id", edge_attr=True, create_using=nx.DiGraph())
nx.write_graphml(G_merged, "merged_pkn.graphml")

Unified PKN: 12,792,377 edges across 3 sources
