In [46]:
import os
import pandas as pd
from pathlib import Path
import networkx as nx

from config.settings import *

STRING_DIR = DATA_DIR / "prior_knowledge_network_data" / "hg38" / "STRING"
TRRUST_DIR = DATA_DIR / "prior_knowledge_network_data" / "hg38" / "TRRUST"
KEGG_DIR = DATA_DIR / "prior_knowledge_network_data" / "hg38" / "KEGG"

string_pkn_file = STRING_DIR / "string_human_pkn.csv"
trrust_pkn_file = TRRUST_DIR / "trrust_human_pkn.csv"
kegg_pkn_file = KEGG_DIR / "kegg_human_pkn.csv"

tf_tg_data_file = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/data/processed/PBMC/LINGER_PBMC_SC_DATA/tf_tg_data.parquet"

In [None]:
string_pkn_df = pd.read_csv(string_pkn_file).rename(columns={"protein1": "TF", "protein2": "TG"})
trrust_pkn_df = pd.read_csv(trrust_pkn_file).rename(columns={"source_id": "TF", "target_id": "TG"})
kegg_pkn_df = pd.read_csv(kegg_pkn_file).rename(columns={"source_id": "TF", "target_id": "TG"})

string_pkn_df["TF"] = string_pkn_df["TF"].str.upper()
string_pkn_df["TG"] = string_pkn_df["TG"].str.upper()

trrust_pkn_df["TF"] = trrust_pkn_df["TF"].str.upper()
trrust_pkn_df["TG"] = trrust_pkn_df["TG"].str.upper()

kegg_pkn_df["TF"] = kegg_pkn_df["TF"].str.upper()
kegg_pkn_df["TG"] = kegg_pkn_df["TG"].str.upper()

from mygene import MyGeneInfo
mg = MyGeneInfo()

# Convert Ensembl IDs or aliases in your PKN to HGNC symbols
def normalize_genes(gene_list):
    query = mg.querymany(gene_list, scopes=["symbol", "alias", "ensembl.gene"], fields="symbol", species="human")
    mapping = {q["query"]: q.get("symbol", q["query"]) for q in query}
    return [mapping.get(g, g).upper() for g in gene_list]

string_pkn_df["TF"] = normalize_genes(string_pkn_df["TF"])
string_pkn_df["TG"] = normalize_genes(string_pkn_df["TG"])
trrust_pkn_df["TF"] = normalize_genes(trrust_pkn_df["TF"])
trrust_pkn_df["TG"] = normalize_genes(trrust_pkn_df["TG"])
kegg_pkn_df["TF"] = normalize_genes(kegg_pkn_df["TF"])
kegg_pkn_df["TG"] = normalize_genes(kegg_pkn_df["TG"])

tf_tg_data_df = pd.read_parquet(tf_tg_data_file)

In [None]:
from mygene import MyGeneInfo
mg = MyGeneInfo()

# Convert Ensembl IDs or aliases in your PKN to HGNC symbols
def normalize_genes(gene_list):
    query = mg.querymany(gene_list, scopes=["symbol", "alias", "ensembl.gene"], fields="symbol", species="human")
    mapping = {q["query"]: q.get("symbol", q["query"]) for q in query}
    return [mapping.get(g, g).upper() for g in gene_list]

string_pkn_df["TF"] = normalize_genes(string_pkn_df["TF"])
string_pkn_df["TG"] = normalize_genes(string_pkn_df["TG"])
trrust_pkn_df["TF"] = normalize_genes(trrust_pkn_df["TF"])
trrust_pkn_df["TG"] = normalize_genes(trrust_pkn_df["TG"])
kegg_pkn_df["TF"] = normalize_genes(kegg_pkn_df["TF"])
kegg_pkn_df["TG"] = normalize_genes(kegg_pkn_df["TG"])

Input sequence provided is already in string format. No operation performed


In [49]:
print(tf_tg_data_df.head())


      TF          TG  reg_potential  motif_density  mean_tf_expr  \
0   EGR1        USF3            0.0            0.0      0.006046   
1   CTCF   WDFY3-AS1            0.0            0.0      0.049970   
2   CTCF     CA3-AS1            0.0            0.0      0.049970   
3  CEBPB  MIR548A1HG            0.0            0.0      0.052377   
4   SPI1      BCAP29            0.0            0.0      0.060950   

   mean_tg_expr  expr_product  log_reg_pot  motif_present  label  
0      0.001121  6.775073e-06          0.0              0      0  
1      0.000058  2.922863e-06          0.0              0      0  
2      0.000019  9.497285e-07          0.0              0      1  
3      0.000010  5.074078e-07          0.0              0      1  
4      0.000552  3.363038e-05          0.0              0      1  


In [50]:
print("STRING")
print(string_pkn_df.head())
print("\nTRRUST")
print(trrust_pkn_df.head())
print("\nKEGG")
print(kegg_pkn_df.head())

STRING
     TF         TG  string_neighborhood_score  string_fusion_score  \
0  ARF5      ACAP1                          0                    0   
1  ARF5  RAB11FIP3                          0                    0   
2  ARF5     IQSEC1                          0                    0   
3  ARF5      COPB1                          0                    0   
4  ARF5       ARF4                          0                    0   

   string_cooccurence_score  string_coexpression_score  \
0                         0                         47   
1                         0                          0   
2                         0                         49   
3                         0                        138   
4                         0                         49   

   string_experimental_score  string_database_score  string_textmining_score  \
0                         91                      0                      814   
1                        663                      0            

In [51]:
pkn_edges = set(zip(string_pkn_df["TF"], string_pkn_df["TG"])) | set(zip(trrust_pkn_df["TF"], trrust_pkn_df["TG"])) | set(zip(kegg_pkn_df["TF"], kegg_pkn_df["TG"]))

print(f"Number of edges in PKN: {len(pkn_edges)}")

Number of edges in PKN: 377096


In [52]:
print(list(pkn_edges)[:5])

[('SMARCB1', 'MYSM1'), ('TTI1', 'RPTOR'), ('SAMM50', 'TOMM70'), ('INS', 'FGFR2'), ('APBB1IP', 'ENAH')]


In [53]:
print(f"Overlapping TFs: {len(set(tf_tg_data_df['TF']).intersection({t for t,_ in pkn_edges}))}")
print(f"Overlapping TGs: {len(set(tf_tg_data_df['TG']).intersection({t for _,t in pkn_edges}))}")

Overlapping TFs: 10
Overlapping TGs: 12160


In [None]:
from typing import Tuple
def select_pkn_edges_from_df(df: pd.DataFrame, pkn_edges: set[Tuple[str, str]]):
    df['TF'] = df['TF'].str.upper()
    df['TG'] = df['TG'].str.upper()
    
    df['in_pkn'] = df.apply(
        lambda r: int((r['TF'], r['TG']) in pkn_edges or (r['TG'], r['TF']) in pkn_edges),
        axis=1
    )
    
    df = df[df['in_pkn'] == 1]
    
    return df
    
df_in_pkn = select_pkn_edges_from_df(tf_tg_data_df, pkn_edges)

In [55]:
print(f"Number of edges in TF-TG data: {tf_tg_data_df.shape[0]:,}")
print(f"Number of edges in PKN: {len(pkn_edges):,}")
print(f"Number of edges in TF-TG data that are in PKN: {df_in_pkn.shape[0]:,}")

Number of edges in TF-TG data: 155,632
Number of edges in PKN: 377,096
Number of edges in TF-TG data that are in PKN: 421
