In [None]:
import os
import pandas as pd
from pathlib import Path

import sys
# Project layout: every location below is derived from the project root.
PROJECT_DIR = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER"
SRC_DIR = str(Path(PROJECT_DIR, "src"))
GROUND_TRUTH_DIR = os.path.join(PROJECT_DIR, "data/ground_truth_files")

# Put the project's src/ directory at the front of the import path. The
# membership check keeps re-running this cell from inserting it twice.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

## Combined Ground Truth

In [None]:
# Edges from mESC_beeline_ChIP-seq.csv; Gene1/Gene2 are relabelled as TF/TG.
# Any other column present in the file is carried along unchanged.
beeline_ground_truth = (
    pd.read_csv(os.path.join(GROUND_TRUTH_DIR, "mESC_beeline_ChIP-seq.csv"))
    .rename(columns={"Gene1": "TF", "Gene2": "TG"})
)

# Edges from ORTI_ground_truth_TF_TG.csv, with Gene1/Gene2 relabelled as TF/TG.
# No column subset is taken here, so extra columns (if any) stay on the frame.
orti_ground_truth = (
    pd.read_csv(os.path.join(GROUND_TRUTH_DIR, "ORTI_ground_truth_TF_TG.csv"))
    .rename(columns=dict(Gene1="TF", Gene2="TG"))
)

# RN112 reference network (tab-separated): Source/Target become TF/TG and
# every other column is discarded.
rn112_ground_truth = (
    pd.read_csv(
        "/gpfs/Labs/Uzun/DATA/PROJECTS/2024.SC_MO_TRN_DB.MIRA/REPOSITORY/CURRENT/REFERENCE_NETWORKS/RN112_LOGOF_BEELINE_Mouse_ESC.tsv",
        sep="\t",
    )
    .rename(columns={"Source": "TF", "Target": "TG"})
    .loc[:, ["TF", "TG"]]
)

# RN114 reference network (tab-separated): keep only the Source/Target pair,
# exposed under the TF/TG column names used by the rest of the notebook.
rn114_ground_truth = (
    pd.read_csv(
        "/gpfs/Labs/Uzun/DATA/PROJECTS/2024.SC_MO_TRN_DB.MIRA/REPOSITORY/CURRENT/REFERENCE_NETWORKS/RN114_ChIPX_ESCAPE_Mouse_ESC.tsv",
        sep="\t",
    )
    .rename(columns={"Source": "TF", "Target": "TG"})
    [["TF", "TG"]]
)

# rn115_chipseq.tsv names its edge columns sourceName/targetName; relabel them
# as TF/TG and drop everything else.
rn115_ground_truth = (
    pd.read_csv(os.path.join(GROUND_TRUTH_DIR, "rn115_chipseq.tsv"), sep="\t")
    .rename(columns={"sourceName": "TF", "targetName": "TG"})
    .loc[:, ["TF", "TG"]]
)

# ChIP-Atlas TF -> peak -> target table. For every (source_id, peak_id) pair
# only the row with the smallest "distance" is kept, then the distinct
# source/target pairs are exposed as TF/TG.
# NOTE(review): idxmin picks the numerically smallest value, so this is the
# closest target only if "distance" is non-negative -- confirm against the file.
chip_atlas_ground_truth = (
    pd.read_csv(
        os.path.join(GROUND_TRUTH_DIR, "chip_atlas_tf_peak_tg_dist.csv"),
        header=0,
        index_col=None,
    )
    .pipe(lambda peaks: peaks.loc[peaks.groupby(["source_id", "peak_id"])["distance"].idxmin()])
    [["source_id", "target_id"]]
    .drop_duplicates()
    .rename(columns={"source_id": "TF", "target_id": "TG"})
)

# RN117 reference network (tab-separated): keep the distinct Source/Target
# pairs and expose them under the TF/TG column names.
rn117_ground_truth = (
    pd.read_csv(os.path.join(GROUND_TRUTH_DIR, "RN117.tsv"), sep="\t", header=0, index_col=None)
    [["Source", "Target"]]
    .drop_duplicates()
    .rename(columns={"Source": "TF", "Target": "TG"})
)

# Combined ground truth = ORTI + ChIP-Atlas + RN117.
# beeline_ground_truth, rn112_ground_truth, rn114_ground_truth and
# rn115_ground_truth are not part of the combined set.
#
# Each source is reduced to its TF/TG columns before concatenation so that an
# extra column in any input file cannot leak into the saved CSV (and a source
# missing TF/TG fails loudly here instead of contributing all-NaN edges).
combined_ground_truth = (
    pd.concat([
        source[["TF", "TG"]]
        for source in (
            orti_ground_truth,
            chip_atlas_ground_truth,
            rn117_ground_truth,
        )
    ])
    # Normalise gene-name case first ("SOX2"/"sox2" -> "Sox2") so that edges
    # differing only by case collapse in the single de-duplication below.
    .assign(
        TF=lambda edges: edges["TF"].str.capitalize(),
        TG=lambda edges: edges["TG"].str.capitalize(),
    )
    .drop_duplicates(subset=["TF", "TG"])
)
combined_ground_truth.to_csv(os.path.join(GROUND_TRUTH_DIR, "combined_ground_truth.csv"), index=False, header=True)

In [None]:
# Reference edge set built from beeline_ground_truth (RN111) and RN112.
# Only the TF/TG columns are concatenated: with extra columns on either frame,
# identical edges from the two sources would differ in those columns and
# survive de-duplication, inflating the counts and the merge below.
edge_cols = ["TF", "TG"]
rn111_and_rn112 = (
    pd.concat([beeline_ground_truth[edge_cols], rn112_ground_truth[edge_cols]])
    # Capitalise gene names so the comparison with combined_ground_truth (which
    # is expected to be capitalised the same way) is not case-sensitive.
    .assign(
        TF=lambda edges: edges["TF"].str.capitalize(),
        TG=lambda edges: edges["TG"].str.capitalize(),
    )
    .drop_duplicates(subset=edge_cols)
)

num_rn111_and_rn112_edges = rn111_and_rn112.shape[0]
print(f"Number of RN111 and RN112 Edges: {num_rn111_and_rn112_edges:,}")

# Outer merge with indicator=True adds a "_merge" column saying whether each
# edge came from the left frame only, the right frame only, or both.
rn111_and_rn112_overlap_with_combined = pd.merge(
    rn111_and_rn112,
    combined_ground_truth,
    on=edge_cols,
    how="outer",
    indicator=True,
)
edge_origin = rn111_and_rn112_overlap_with_combined["_merge"]

print(f"Number of edges only in RN111 and RN112: {(edge_origin == 'left_only').sum()}")
print(f"Number of edges only in Combined GT: {(edge_origin == 'right_only').sum()}")
print(f"Number of edges in both ground truths: {(edge_origin == 'both').sum()}")

# Edges of the combined ground truth that are absent from RN111/RN112; rows
# with a missing TF or TG are discarded before saving.
combined_ground_truth_unique_edges = (
    rn111_and_rn112_overlap_with_combined
    .loc[edge_origin == "right_only", edge_cols]
    .dropna()
)
combined_ground_truth_unique_edges.to_csv(os.path.join(GROUND_TRUTH_DIR, "combined_ground_truth_no_rn111_or_rn112_edges.csv"), index=False, header=True)

num_tfs = combined_ground_truth_unique_edges["TF"].nunique()
num_tgs = combined_ground_truth_unique_edges["TG"].nunique()

print(f"Num TFs = {num_tfs:,}")
print(f"Num TGs = {num_tgs:,}")
print(f"Num Edges = {combined_ground_truth_unique_edges.shape[0]:,}")

Unnamed: 0,TF,TG
0,Adnp,Mis18bp1
1,Adnp,Exd1
2,Adnp,Chp1
3,Adnp,Oscar
4,Adnp,Ndufa3
