In [3]:
import os
import pandas as pd
from pathlib import Path
import networkx as nx

from config.settings import *

STRING_DIR = DATA_DIR / "prior_knowledge_network_data" / "hg38" / "STRING"
TRRUST_DIR = DATA_DIR / "prior_knowledge_network_data" / "hg38" / "TRRUST"
KEGG_DIR = DATA_DIR / "prior_knowledge_network_data" / "hg38" / "KEGG"

string_pkn_file = STRING_DIR / "string_human_pkn.csv"
trrust_pkn_file = TRRUST_DIR / "trrust_human_pkn.csv"
kegg_pkn_file = KEGG_DIR / "kegg_human_pkn.csv"

tf_tg_data_df = pd.read_parquet("data/processed/PBMC/LINGER_PBMC_SC_DATA/tf_tg_data_unlabeled.parquet")


In [4]:
string_pkn_df = pd.read_csv(string_pkn_file).rename(columns={"protein1": "TF", "protein2": "TG"})
trrust_pkn_df = pd.read_csv(trrust_pkn_file).rename(columns={"source_id": "TF", "target_id": "TG"})
kegg_pkn_df = pd.read_csv(kegg_pkn_file).rename(columns={"source_id": "TF", "target_id": "TG"})

string_pkn_df["TF"] = string_pkn_df["TF"].str.upper()
string_pkn_df["TG"] = string_pkn_df["TG"].str.upper()

trrust_pkn_df["TF"] = trrust_pkn_df["TF"].str.upper()
trrust_pkn_df["TG"] = trrust_pkn_df["TG"].str.upper()

kegg_pkn_df["TF"] = kegg_pkn_df["TF"].str.upper()
kegg_pkn_df["TG"] = kegg_pkn_df["TG"].str.upper()

In [5]:
print(tf_tg_data_df.head())


    TF          TG  reg_potential  motif_density  mean_tf_expr  mean_tg_expr  \
0  AHR  AL627309.1            0.0            0.0      0.071921      0.000042   
1  AHR  AL627309.5            0.0            0.0      0.071921      0.000266   
2  AHR  AL627309.4            0.0            0.0      0.071921      0.000027   
3  AHR  AL669831.2            0.0            0.0      0.071921      0.000007   
4  AHR   LINC01409            0.0            0.0      0.071921      0.000327   

   expr_product  log_reg_pot  motif_present  
0  3.018726e-06          0.0              0  
1  1.913705e-05          0.0              0  
2  1.952958e-06          0.0              0  
3  5.151246e-07          0.0              0  
4  2.348392e-05          0.0              0  


In [6]:
print("STRING")
print(string_pkn_df.head())
print("\nTRRUST")
print(trrust_pkn_df.head())
print("\nKEGG")
print(kegg_pkn_df.head())

STRING
     TF         TG  string_neighborhood_score  string_fusion_score  \
0  ARF5      ACAP1                          0                    0   
1  ARF5  RAB11FIP3                          0                    0   
2  ARF5     IQSEC1                          0                    0   
3  ARF5      COPB1                          0                    0   
4  ARF5       ARF4                          0                    0   

   string_cooccurence_score  string_coexpression_score  \
0                         0                         47   
1                         0                          0   
2                         0                         49   
3                         0                        138   
4                         0                         49   

   string_experimental_score  string_database_score  string_textmining_score  \
0                         91                      0                      814   
1                        663                      0            

In [7]:
pkn_edges = set(zip(string_pkn_df["TF"], string_pkn_df["TG"])) | set(zip(trrust_pkn_df["TF"], trrust_pkn_df["TG"])) | set(zip(kegg_pkn_df["TF"], kegg_pkn_df["TG"]))

print(f"Number of edges in PKN: {len(pkn_edges)}")

Number of edges in PKN: 377096


In [8]:
print(list(pkn_edges)[:5])

[('SMAD2', 'E2F4'), ('PIP5KL1', 'PIP5K1B'), ('TRPC5', 'CALML4'), ('RPS29', 'MRPL2'), ('RRAGD', 'MTOR')]


In [9]:
print(f"Overlapping TFs: {len(set(tf_tg_data_df['TF']).intersection({t for t,_ in pkn_edges}))}")
print(f"Overlapping TGs: {len(set(tf_tg_data_df['TG']).intersection({t for _,t in pkn_edges}))}")

Overlapping TFs: 236
Overlapping TGs: 12270


In [12]:
from typing import Tuple
def select_pkn_edges_from_df(df: pd.DataFrame, pkn_edges: set[Tuple[str, str]]):
    df['TF'] = df['TF'].str.upper()
    df['TG'] = df['TG'].str.upper()
    
    df['in_pkn'] = df.apply(
        lambda r: int((r['TF'], r['TG']) in pkn_edges or (r['TG'], r['TF']) in pkn_edges),
        axis=1
    )
    
    in_pkn_df = df[df['in_pkn'] == 1]
    not_in_pkn_df = df[df['in_pkn'] == 0]
    
    return in_pkn_df, not_in_pkn_df
    
in_pkn_df, not_in_pkn_df = select_pkn_edges_from_df(tf_tg_data_df, pkn_edges)

In [13]:
print(f"Number of edges in TF-TG data: {tf_tg_data_df.shape[0]:,}")
print(f"Number of edges in PKN: {len(pkn_edges):,}")
print()
print(f"Number of unique TFs not in PKN: {not_in_pkn_df['TF'].nunique():,}")
print(f"Number of unique TGs not in PKN: {not_in_pkn_df['TG'].nunique():,}")
print()
print(f"Number of edges in TF-TG data that are not in PKN: {not_in_pkn_df.shape[0]:,}")
print(f"Number of edges in TF-TG data that are in PKN: {in_pkn_df.shape[0]:,}")
print()
print(f"Fraction of edges in TF-TG data that are in PKN: {in_pkn_df.shape[0] / tf_tg_data_df.shape[0]:.2f}")

Number of edges in TF-TG data: 5,989,200
Number of edges in PKN: 377,096

Number of unique TFs not in PKN: 240
Number of unique TGs not in PKN: 24,955

Number of edges in TF-TG data that are not in PKN: 5,979,333
Number of edges in TF-TG data that are in PKN: 9,867

Fraction of edges in TF-TG data that are in PKN: 0.00


In [14]:
in_pkn_df.columns

Index(['TF', 'TG', 'reg_potential', 'motif_density', 'mean_tf_expr',
       'mean_tg_expr', 'expr_product', 'log_reg_pot', 'motif_present',
       'in_pkn'],
      dtype='object')

In [18]:
# Sample an equal number of negatives
neg_df = not_in_pkn_df.sample(n=len(in_pkn_df), random_state=42)

balanced_df = pd.concat([in_pkn_df, neg_df], ignore_index=True)
print(f"Balanced fine-tuning set: {len(in_pkn_df)} positives, {len(neg_df)} negatives")


Balanced fine-tuning set: 9867 positives, 9867 negatives


In [19]:
merged_df = balanced_df.merge(string_pkn_df, on=["TF", "TG"], how="left")
merged_df = merged_df.merge(trrust_pkn_df, on=["TF", "TG"], how="left")
merged_df = merged_df.merge(kegg_pkn_df, on=["TF", "TG"], how="left")

merged_df.head()

Unnamed: 0,TF,TG,reg_potential,motif_density,mean_tf_expr,mean_tg_expr,expr_product,log_reg_pot,motif_present,in_pkn,...,string_database_score,string_textmining_score,string_combined_score,trrust_sign,trrust_regulation,trrust_pmids,trrust_support_n,kegg_signal,kegg_n_pathways,kegg_pathways
0,AHR,MFSD2A,0.0,0.0,0.071921,0.000108,8e-06,0.0,0,1,...,,,,0.0,Unknown,21736709.0,1.0,,,
1,AHR,EPHX4,0.0,0.0,0.071921,2.9e-05,2e-06,0.0,0,1,...,,,,,,,,1.0,1.0,hsa05207
2,AHR,GSTM4,0.0,0.0,0.071921,0.000162,1.2e-05,0.0,0,1,...,,,,,,,,1.0,1.0,hsa05207
3,AHR,GSTM2,0.0,0.0,0.071921,9.5e-05,7e-06,0.0,0,1,...,,,,,,,,1.0,1.0,hsa05207
4,AHR,GSTM1,0.0,0.0,0.071921,2.2e-05,2e-06,0.0,0,1,...,,,,,,,,1.0,1.0,hsa05207


In [20]:
print(f"Number of edges in merged dataframe: {merged_df.shape[0]:,}")
print(f"Number of unique TFs in merged dataframe: {merged_df['TF'].nunique():,}")
print(f"Number of unique TGs in merged dataframe: {merged_df['TG'].nunique():,}")
print(f"Number of unique TF-TG pairs in merged dataframe: {merged_df[['TF', 'TG']].drop_duplicates().shape[0]:,}")
print(merged_df.head())


Number of edges in merged dataframe: 19,734
Number of unique TFs in merged dataframe: 240
Number of unique TGs in merged dataframe: 10,319
Number of unique TF-TG pairs in merged dataframe: 19,734
    TF      TG  reg_potential  motif_density  mean_tf_expr  mean_tg_expr  \
0  AHR  MFSD2A            0.0            0.0      0.071921      0.000108   
1  AHR   EPHX4            0.0            0.0      0.071921      0.000029   
2  AHR   GSTM4            0.0            0.0      0.071921      0.000162   
3  AHR   GSTM2            0.0            0.0      0.071921      0.000095   
4  AHR   GSTM1            0.0            0.0      0.071921      0.000022   

   expr_product  log_reg_pot  motif_present  in_pkn  ...  \
0      0.000008          0.0              0       1  ...   
1      0.000002          0.0              0       1  ...   
2      0.000012          0.0              0       1  ...   
3      0.000007          0.0              0       1  ...   
4      0.000002          0.0              0    

In [21]:
merged_df.to_parquet("data/processed/PBMC/LINGER_PBMC_SC_DATA/tf_tg_merged_features.parquet")