In [3]:
import os
import pandas as pd
from pathlib import Path
import networkx as nx

from config.settings import *

STRING_DIR = DATA_DIR / "prior_knowledge_network_data" / "mm10" / "STRING"
TRRUST_DIR = DATA_DIR / "prior_knowledge_network_data" / "mm10" / "TRRUST"
KEGG_DIR = DATA_DIR / "prior_knowledge_network_data" / "mm10" / "KEGG"

string_pkn_file = STRING_DIR / "string_mouse_pkn.csv"
trrust_pkn_file = TRRUST_DIR / "trrust_mouse_pkn.csv"
kegg_pkn_file = KEGG_DIR / "kegg_mouse_pkn.csv"

tf_tg_data_file = SAMPLE_PROCESSED_DATA_DIR / "E7.5_rep1" / "tf_tg_data.parquet"

In [11]:
string_pkn_df = pd.read_csv(string_pkn_file).rename(columns={"protein1": "TF", "protein2": "TG"})
trrust_pkn_df = pd.read_csv(trrust_pkn_file).rename(columns={"source_id": "TF", "target_id": "TG"})
kegg_pkn_df = pd.read_csv(kegg_pkn_file).rename(columns={"source_id": "TF", "target_id": "TG"})

string_pkn_df["TF"] = string_pkn_df["TF"].str.upper()
string_pkn_df["TG"] = string_pkn_df["TG"].str.upper()

trrust_pkn_df["TF"] = trrust_pkn_df["TF"].str.upper()
trrust_pkn_df["TG"] = trrust_pkn_df["TG"].str.upper()

kegg_pkn_df["TF"] = kegg_pkn_df["TF"].str.upper()
kegg_pkn_df["TG"] = kegg_pkn_df["TG"].str.upper()

tf_tg_data_df = pd.read_parquet(tf_tg_data_file)

In [32]:
from mygene import MyGeneInfo
mg = MyGeneInfo()
mapping = mg.querymany(tf_tg_data_df['TF'].unique().tolist() + tf_tg_data_df['TG'].unique().tolist(),
                       scopes='symbol', fields='ensembl.gene', species='mouse')


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
96 input query terms found dup hits:	[('GM17690', 2), ('9630014M24RIK', 2), ('GM15564', 2), ('GM39929', 2), ('ARHGAP26', 2), ('A730011C13
90 input query terms found no hit:	['GM11639', 'OLFR655', '2410131K14RIK', 'CCDC36', 'CCDC173', 'AC149090.1', 'GM45194', 'KIF1BP', '170


In [12]:
print(tf_tg_data_df.head())


      TF       TG  reg_potential  motif_density  mean_tf_expr  mean_tg_expr  \
0  Meis2   Rec114            0.0            0.0      0.200994      0.178481   
1  Smad3  Slc16a2            0.0            0.0      0.204987      0.177647   
2   Mxi1    Cdh18            0.0            0.0      0.203196      0.179075   
3   Jund    Gnai1            0.0            0.0      0.210623      0.178047   
4   Sox9   Kif19a            0.0            0.0      0.203838      0.177712   

   expr_product  log_reg_pot  motif_present  label  
0      0.035874          0.0              0      1  
1      0.036415          0.0              0      1  
2      0.036387          0.0              0      0  
3      0.037501          0.0              0      1  
4      0.036224          0.0              0      0  


In [31]:
print("STRING")
print(string_pkn_df.head())
print("\nTRRUST")
print(trrust_pkn_df.head())
print("\nKEGG")
print(kegg_pkn_df.head())

STRING
      TF     TG  string_neighborhood_score  string_fusion_score  \
0  GNAI3   RGS4                          0                    0   
1  GNAI3   DRD2                          0                    0   
2  GNAI3   GNB4                          0                    0   
3  GNAI3   RGS3                          0                    0   
4  GNAI3  GNAI1                          0                    0   

   string_cooccurence_score  string_coexpression_score  \
0                         0                         56   
1                         0                          0   
2                         0                        151   
3                         0                         90   
4                        47                          0   

   string_experimental_score  string_database_score  string_textmining_score  \
0                        594                    500                      492   
1                        604                    900                      301   
2

In [15]:
pkn_edges = set(zip(string_pkn_df["TF"], string_pkn_df["TG"])) | set(zip(trrust_pkn_df["TF"], trrust_pkn_df["TG"])) | set(zip(kegg_pkn_df["TF"], kegg_pkn_df["TG"]))

print(f"Number of edges in PKN: {len(pkn_edges)}")

Number of edges in PKN: 316665


In [24]:
print(list(pkn_edges)[:5])

[('SMARCB1', 'MYSM1'), ('TTI1', 'RPTOR'), ('PLXNA3', 'SEMA5B'), ('ACSL4', 'MMU00062'), ('APBB1IP', 'ENAH')]


In [36]:
print(f"Overlapping TFs: {len(set(tf_tg_data_df['TF']).intersection({t for t,_ in pkn_edges}))}")
print(f"Overlapping TGs: {len(set(tf_tg_data_df['TG']).intersection({t for _,t in pkn_edges}))}")

Overlapping TFs: 79
Overlapping TGs: 1652


In [25]:
from typing import Tuple
def select_pkn_edges_from_df(df: pd.DataFrame, pkn_edges: set[Tuple[str, str]]):
    df['TF'] = df['TF'].str.upper()
    df['TG'] = df['TG'].str.upper()
    
    df['in_pkn'] = df.apply(lambda row: 1 if (row['TF'], row['TG']) in pkn_edges else 0, axis=1)
    
    df = df[df['in_pkn'] == 1]
    
    return df
    
df_in_pkn = select_pkn_edges_from_df(tf_tg_data_df, pkn_edges)

In [30]:
print(f"Number of edges in TF-TG data: {tf_tg_data_df.shape[0]:,}")
print(f"Number of edges in PKN: {len(pkn_edges):,}")
print(f"Number of edges in TF-TG data that are in PKN: {df_in_pkn.shape[0]:,}")

Number of edges in TF-TG data: 145,920
Number of edges in PKN: 316,665
Number of edges in TF-TG data that are in PKN: 352


In [19]:
def merge_dataset_with_ground_truth(df: pd.DataFrame, ground_truth: pd.DataFrame, show_network_size: bool=False):
    df['TF'] = df['TF'].str.upper()
    df['TG'] = df['TG'].str.upper()
    
    df_filtered = df[
        df['TF'].isin(ground_truth) &
        df['TG'].isin(ground_truth)
    ]
    
    if show_network_size:
        print(f"Number of edges shared between dataset and ground truth: {len(df_filtered)}")
        
    return df_filtered

tf_tg_merged_with_pkn = merge_dataset_with_ground_truth(tf_tg_data_df, pkn_edges, show_network_size=True)


Number of edges shared between dataset and ground truth: 0


In [37]:
peaks = pd.read_csv("data/raw/LINGER_PBMC_SC_DATA/Peaks.txt")
print(peaks.head())

      chr1:9790-10675
0  chr1:180599-181702
1  chr1:191168-192093
2  chr1:267565-268455
3  chr1:270876-271770
4  chr1:273948-274789
