# Investigating Sliding Window Scores

The sliding window scores should have more overlap with the ground truth datasets than they do. I am going to go step-by-step to re-run the analysis, using the raw non-normalized sliding window scores from the method output file.

In [None]:
!hostnamectl

In [None]:
import os
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
import dev.notebooks.plotting as dev_plots


In [None]:
import importlib
importlib.reload(dev_plots)

## Loading the Raw Sliding Window Scores

In [None]:
output_dir = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output/DS011_mESC/DS011_mESC_sample1/"

# Load sliding window scores, associate with the nearest gene TSS
raw_sliding_window_scores = pd.read_parquet(os.path.join(output_dir, "no_norm_sliding_window_tf_to_peak_score.parquet"), engine="pyarrow")
peaks_gene_distance_file = os.path.join(output_dir, "peaks_near_genes.parquet")

## Mapping Sliding Window Scores to TGs

### Setting the TG as the closest gene TSS

First, we load in the raw sliding window scores from the direct output of the sliding window score calculation method, without normalization or scaling. 

To get TF-TG edges, we load in the dataframe of all peaks within 1 Mb of each gene and find the closest gene for each peak. We use the closest gene to each peak as the TG for each TF-peak edge in the sliding window scores.

In [None]:
def set_tg_as_closest_gene_tss(raw_sliding_window_scores: pd.DataFrame, peaks_gene_distance_file: "str | pd.DataFrame"):
    """Assign a target gene to every TF-peak edge using the best-scoring gene for the peak.

    For each ``peak_id`` the gene with the maximum ``TSS_dist_score`` is chosen
    and attached to the sliding window scores as ``target_id``.

    Args:
        raw_sliding_window_scores: TF-peak sliding window scores; must contain
            a ``peak_id`` column.
        peaks_gene_distance_file: Either a path to a parquet file, or an
            already-loaded DataFrame, holding ``peak_id``, ``target_id`` and
            ``TSS_dist_score`` columns.

    Returns:
        ``raw_sliding_window_scores`` left-joined with ``target_id``. Peaks with
        no entry in the peak-gene table keep their row with a missing
        ``target_id``.
    """
    # Callers in this notebook pass both paths and pre-loaded frames, so accept either
    if isinstance(peaks_gene_distance_file, pd.DataFrame):
        peaks_near_genes_df = peaks_gene_distance_file
    else:
        peaks_near_genes_df = pd.read_parquet(peaks_gene_distance_file, engine="pyarrow")

    # Sorting descending puts the highest TSS distance score first within each peak,
    # so taking the first entry per peak selects the closest gene
    closest_gene_to_peak_df = (
        peaks_near_genes_df
        .sort_values("TSS_dist_score", ascending=False)
        .groupby("peak_id")
        .first()
    )
    closest_gene_to_peak_df = closest_gene_to_peak_df[["target_id"]].reset_index()

    # Set the TG for each TF-peak edge as the closest gene to the peak
    sliding_window_closest_gene_df = pd.merge(
        raw_sliding_window_scores,
        closest_gene_to_peak_df,
        on=["peak_id"],
        how="left",
    )
    return sliding_window_closest_gene_df

In [None]:
sliding_window_closest_gene_df = set_tg_as_closest_gene_tss(raw_sliding_window_scores, peaks_gene_distance_file)
    
sliding_window_tf_tg = sliding_window_closest_gene_df[["source_id", "target_id", "sliding_window_score"]].drop_duplicates()
unique_sliding_window_tf_tg_edges = sliding_window_tf_tg[["source_id", "target_id"]].drop_duplicates()

In [None]:
print(f"Sliding Window scores")
print(f"  - TFs: {sliding_window_tf_tg['source_id'].nunique():,}")
print(f"  - TGs: {sliding_window_tf_tg['target_id'].nunique():,}")
print(f"  - TF-peak-TG Edges: {len(sliding_window_closest_gene_df):,}")
print(f"  - TF-TG-Score Edges: {len(sliding_window_tf_tg):,}")
print(f"  - TF-TG Edges: {len(unique_sliding_window_tf_tg_edges):,}")

### Setting the TG using MIRA peak-TG

In [None]:
mira_peak_to_tg_df = pd.read_parquet("/gpfs/Home/esm5360/MIRA/mira-datasets/ds011_full/ds011_full_mira_peak_to_tg_scores.parquet", engine="pyarrow")

In [None]:
def set_tg_using_mira_peak_tg_edges(raw_sliding_window_scores: pd.DataFrame, mira_df: pd.DataFrame):
    """Attach target genes from a MIRA peak-TG table to the TF-peak scores.

    Both frames are joined on ``peak_id``. TF-peak rows whose peak has no
    target gene in ``mira_df`` are removed from the result.

    Returns:
        DataFrame with ``source_id``, ``peak_id``, ``target_id`` and
        ``sliding_window_score`` columns.
    """
    keep_cols = ["source_id", "peak_id", "target_id", "sliding_window_score"]

    # A left join keeps every TF-peak row; unmatched peaks get a missing target_id
    joined = raw_sliding_window_scores.merge(mira_df, how="left", on=["peak_id"])

    # Keep only the rows that were actually linked to a target gene
    has_target = joined["target_id"].notna()
    return joined.loc[has_target, keep_cols]

In [None]:
sliding_window_mira_df = set_tg_using_mira_peak_tg_edges(raw_sliding_window_scores, mira_peak_to_tg_df)
sliding_window_mira_df

### Setting the TG using Cicero peak-TG

In [None]:
cicero_peak_to_tg_df = pd.read_parquet(os.path.join(output_dir, "cicero_peak_to_tg_scores.parquet"), engine="pyarrow")

In [None]:
def set_tg_using_cicero_peak_tg_edges(raw_sliding_window_scores: pd.DataFrame, cicero_df: pd.DataFrame):
    """Attach target genes from a Cicero peak-TG table to the TF-peak scores.

    The two frames are left-joined on ``peak_id``; rows that end up without a
    ``target_id`` are discarded.

    Returns:
        DataFrame restricted to ``source_id``, ``peak_id``, ``target_id`` and
        ``sliding_window_score``.
    """
    with_cicero_targets = pd.merge(
        raw_sliding_window_scores,
        cicero_df,
        on=["peak_id"],
        how="left",
    )

    # Drop TF-peak edges whose peak Cicero did not link to any gene
    linked_rows = with_cicero_targets[with_cicero_targets["target_id"].notna()]

    return linked_rows[["source_id", "peak_id", "target_id", "sliding_window_score"]]

In [None]:
sliding_window_cicero_df = set_tg_using_cicero_peak_tg_edges(raw_sliding_window_scores, cicero_peak_to_tg_df)
sliding_window_cicero_df

## Loading RN115

Next, we load in the RN115 LOGOF ESCAPE ground truth data and ensure that the TF and TG column names match the sliding window DataFrame columns

In [None]:
# Load the RN115 LOGOF ESCAPE ground truth (tab-separated file)
rn115_ko_ground_truth = pd.read_csv("/gpfs/Labs/Uzun/DATA/PROJECTS/2024.SC_MO_TRN_DB.MIRA/REPOSITORY/CURRENT/REFERENCE_NETWORKS/RN115_LOGOF_ESCAPE_Mouse_ESC.tsv", sep="\t")
# Keep only the TF/TG columns and rename them to the column names used by the sliding window DataFrames
rn115_ko_ground_truth = rn115_ko_ground_truth[["Source", "Target"]].rename(columns={"Source":"source_id", "Target":"target_id"})
# Normalize gene symbol casing (first letter upper-case, rest lower-case);
# presumably this matches the casing of the sliding window gene names - verify against that output
rn115_ko_ground_truth["source_id"] = rn115_ko_ground_truth["source_id"].str.capitalize()
rn115_ko_ground_truth["target_id"] = rn115_ko_ground_truth["target_id"].str.capitalize()

In [None]:
print("RN115 LOGOF Ground Truth")
print(f"  - TFs: {rn115_ko_ground_truth['source_id'].nunique():,}")
print(f"  - TGs: {rn115_ko_ground_truth['target_id'].nunique():,}")
print(f"  - Edges: {len(rn115_ko_ground_truth):,}")

### RN115 TGs per TF

In [None]:
rn115_targets_per_tf = rn115_ko_ground_truth.groupby("source_id").count().sort_values(by="target_id", ascending=False)
print("Number of RN115 TGs per TF")
rn115_targets_per_tf

In [None]:
sliding_window_tf_tg

### Sliding Window TGs per TF

In [None]:
sliding_window_targets_per_tf = sliding_window_tf_tg.groupby("source_id")["target_id"].count().sort_values(ascending=False)
print("Number of Sliding Window TGs per TF")
sliding_window_targets_per_tf

## Overlap between RN115 and Sliding Window Scores

Merge the datasets together. By doing a full outer join and using `indicator=True`, we can separate the edges found in both datasets from the edges found only in the sliding window scores and the edges found only in RN115.

In [None]:
merged_edges = pd.merge(
    sliding_window_tf_tg, 
    rn115_ko_ground_truth, 
    on=["source_id", "target_id"], 
    how="outer",
    indicator=True
    )
merged_edges

In [None]:
edges_in_both = merged_edges[merged_edges["_merge"] == "both"].drop(columns="_merge")
sliding_window_edges_only = merged_edges[merged_edges["_merge"] == "left_only"].drop(columns="_merge")
rn115_edges_only = merged_edges[merged_edges["_merge"] == "right_only"].drop(columns="_merge")

In [None]:
tfs_in_both = edges_in_both["source_id"].drop_duplicates()
tgs_in_both = edges_in_both["target_id"].drop_duplicates()

print("Edges in both RN115 and Sliding Window:")
print(f"  - TFs in both: {len(tfs_in_both):,}")
print(f"  - TGs in both: {len(tgs_in_both):,}")
print(f"  - TF-TG-Score Edges in both: {len(edges_in_both):,}")
print(f"  - TF-TG Edges in both: {len(edges_in_both.drop_duplicates(subset=['source_id', 'target_id'])):,}")

In [None]:
# TFs / TGs that appear in the sliding window edges but nowhere in RN115
tfs_only_in_sliding_window = sliding_window_tf_tg[~sliding_window_tf_tg["source_id"].isin(rn115_ko_ground_truth["source_id"])]["source_id"].drop_duplicates()
tgs_only_in_sliding_window = sliding_window_tf_tg[~sliding_window_tf_tg["target_id"].isin(rn115_ko_ground_truth["target_id"])]["target_id"].drop_duplicates()

print("Only in Sliding Window:")
print(f"  - TFs only in Sliding Window: {len(tfs_only_in_sliding_window):,}")
print(f"  - TGs only in Sliding Window: {len(tgs_only_in_sliding_window):,}")
# These rows come from the de-duplicated (TF, TG, score) table, which no longer has a
# peak column, so they are TF-TG-score combinations rather than TF-peak-TG edges
print(f"  - TF-TG-Score Edges only in Sliding Window: {len(sliding_window_edges_only):,}")
print(f"  - TF-TG Edges only in Sliding Window: {len(sliding_window_edges_only.drop_duplicates(subset=['source_id', 'target_id'])):,}")

In [None]:
# TFs / TGs that appear in RN115 but nowhere in the sliding window edges
tfs_only_in_rn115 = rn115_ko_ground_truth[~rn115_ko_ground_truth["source_id"].isin(sliding_window_tf_tg["source_id"])]["source_id"].drop_duplicates()
tgs_only_in_rn115 = rn115_ko_ground_truth[~rn115_ko_ground_truth["target_id"].isin(sliding_window_tf_tg["target_id"])]["target_id"].drop_duplicates()

print("Only in RN115:")
print(f"  - TFs only in RN115: {len(tfs_only_in_rn115):,}")
print(f"  - TGs only in RN115: {len(tgs_only_in_rn115):,}")
# rn115_edges_only holds the "right_only" rows of the outer merge, i.e. RN115-only edges
print(f"  - Edges only in RN115: {len(rn115_edges_only):,}")

### Sum of sliding window scores per TF-TG edge (strength of association)

#### Both Sliding Window and RN115

In [None]:
edges_in_both_by_tf_tg = (
    edges_in_both.groupby(["source_id", "target_id"])
    .count()
    .sort_values(by="sliding_window_score", ascending=False)
    .reset_index()
    .rename(columns={'sliding_window_score':'Num Scores per TF-TG Edge'})
)
edges_in_both_by_tf_tg

In [None]:
importlib.reload(dev_plots)
dev_plots.plot_score_heatmap_by_tf_tg(
    df=edges_in_both,
    title="Sliding Window Scores in RN115 by TF-TG Edges",
    score_col="sliding_window_score",
    max_tfs=edges_in_both["source_id"].nunique(),
    max_tgs=edges_in_both["target_id"].nunique()
    )

#### Only in Sliding Window

In [None]:
dev_plots.plot_score_heatmap_by_tf_tg(
    df=sliding_window_edges_only,
    title="Sliding Window Scores NOT in RN115 by TF-TG Edges",
    score_col="sliding_window_score",
    max_tfs=sliding_window_edges_only["source_id"].nunique(),
    max_tgs=sliding_window_edges_only["target_id"].nunique()
    )

In [None]:
sliding_window_edges_only_only_by_tf_tg = (
    sliding_window_edges_only.groupby(["source_id", "target_id"])
    .count()
    .sort_values(by="sliding_window_score", ascending=False)
    .reset_index()
    .rename(columns={'sliding_window_score':'Num Scores per TF-TG Edge'})
)
sliding_window_edges_only_only_by_tf_tg

### Number of scores per TF

#### Both Sliding Window and RN115

In [None]:
edges_in_both_by_tf = (
    edges_in_both
    .groupby("source_id")
    .count()
    .sort_values("target_id", ascending=False)
    .rename(columns={'sliding_window_score':'Num Scores per TF'})
    ['Num Scores per TF']
    .reset_index()
    )

In [None]:
importlib.reload(dev_plots)
dev_plots.plot_scores_grouped_by_tf(
    edges_in_both, 
    title="Sliding Window Scores in RN115 - Number of Scores per TF", 
    score_col="sliding_window_score"
    )

#### Only in Sliding Window

In [None]:
sliding_window_edges_only_by_tf = (
    sliding_window_edges_only
    .groupby("source_id")
    .count()
    .sort_values("target_id", ascending=False)
    .rename(columns={'sliding_window_score':'Num Scores per TF'})
    ['Num Scores per TF']
    .reset_index()
    )

In [None]:
importlib.reload(dev_plots)
dev_plots.plot_scores_grouped_by_tf(
    sliding_window_edges_only, 
    title="Sliding Window Scores Not in RN115 - Number of Scores per TF", 
    score_col="sliding_window_score",
    top_tf_limit=911
    )

In [None]:
edges_in_both.hist("sliding_window_score", bins=150, grid=False)

## Score Distributions - Edges in RN115 vs Edges Not in RN115

In [None]:
importlib.reload(dev_plots)
dev_plots.plot_true_false_distribution(
    true_series=edges_in_both["sliding_window_score"], 
    false_series=sliding_window_edges_only["sliding_window_score"],
    xlabel="Sliding Window Score",
    title="Sliding Window Scores",
    balance=True
    
    )

In [None]:
importlib.reload(dev_plots)

df = (
    edges_in_both
    .groupby("source_id")
    .count()
    .sort_values("sliding_window_score", ascending=False)
    .reset_index()
    )
tfs_of_interest = df["source_id"].drop_duplicates().to_list()

num_tfs = 15

dev_plots.plot_score_distribution_by_tf(
    df=edges_in_both, 
    score_col="sliding_window_score", 
    title="Sliding window scores in RN115, colored by TF",
    top_tf_limit=num_tfs,
    tfs_of_interest=tfs_of_interest
    )
dev_plots.plot_score_distribution_by_tf(
    df=sliding_window_edges_only, 
    score_col="sliding_window_score", 
    title="Sliding window scores NOT in RN115, colored by TF",
    top_tf_limit=num_tfs,
    tfs_of_interest = tfs_of_interest
    )

## Comparing Sliding Window Scores to RN111

### Size of RN111

In [None]:
from grn_inference.utils import read_ground_truth
rn111_chipseq_ground_truth = read_ground_truth("/gpfs/Labs/Uzun/DATA/PROJECTS/2024.SC_MO_TRN_DB.MIRA/REPOSITORY/CURRENT/REFERENCE_NETWORKS/RN111_ChIPSeq_BEELINE_Mouse_ESC.tsv").drop(columns=["Relationship"])
rn111_chipseq_ground_truth['source_id'] = rn111_chipseq_ground_truth['source_id'].str.capitalize()
rn111_chipseq_ground_truth['target_id'] = rn111_chipseq_ground_truth['target_id'].str.capitalize()

In [None]:
print("RN111 ChIP-seq Ground Truth")
print(f"  - TFs: {rn111_chipseq_ground_truth['source_id'].nunique():,}")
print(f"  - TGs: {rn111_chipseq_ground_truth['target_id'].nunique():,}")
print(f"  - Edges: {len(rn111_chipseq_ground_truth):,}")

### RN111 TGs per TF

In [None]:
rn111_targets_per_tf = rn111_chipseq_ground_truth.groupby("source_id").count().sort_values(by="target_id", ascending=False)
print("Number of RN111 TGs per TF")
rn111_targets_per_tf

## Overlap between sliding window scores and RN111

In [None]:
merged_edges = pd.merge(
    sliding_window_tf_tg, 
    rn111_chipseq_ground_truth, 
    on=["source_id", "target_id"], 
    how="outer",
    indicator=True
    )

In [None]:
edges_in_sliding_and_rn111 = merged_edges[merged_edges["_merge"] == "both"].drop(columns="_merge")
sliding_window_not_rn111_edges = merged_edges[merged_edges["_merge"] == "left_only"].drop(columns="_merge")
rn111_edges_only = merged_edges[merged_edges["_merge"] == "right_only"].drop(columns="_merge")

In [None]:
tfs_in_both = edges_in_sliding_and_rn111["source_id"].drop_duplicates()
tgs_in_both = edges_in_sliding_and_rn111["target_id"].drop_duplicates()

print("Edges in both RN111 and Sliding Window:")
print(f"  - TFs in both: {len(tfs_in_both):,}")
print(f"  - TGs in both: {len(tgs_in_both):,}")
print(f"  - TF-TG-Score Edges in both: {len(edges_in_sliding_and_rn111):,}")
print(f"  - TF-TG Edges in both: {len(edges_in_sliding_and_rn111.drop_duplicates(subset=['source_id', 'target_id'])):,}")

In [None]:
# TFs / TGs that appear in the sliding window edges but nowhere in RN111
tfs_only_in_sliding_window = sliding_window_tf_tg[~sliding_window_tf_tg["source_id"].isin(rn111_chipseq_ground_truth["source_id"])]["source_id"].drop_duplicates()
tgs_only_in_sliding_window = sliding_window_tf_tg[~sliding_window_tf_tg["target_id"].isin(rn111_chipseq_ground_truth["target_id"])]["target_id"].drop_duplicates()

print("Sliding window but NOT RN111:")
print(f"  - TFs: {len(tfs_only_in_sliding_window):,}")
print(f"  - TGs: {len(tgs_only_in_sliding_window):,}")
# These rows come from the de-duplicated (TF, TG, score) table, which no longer has a
# peak column, so they are TF-TG-score combinations rather than TF-peak-TG edges
print(f"  - TF-TG-Score Edges: {len(sliding_window_not_rn111_edges):,}")
print(f"  - TF-TG Edges: {len(sliding_window_not_rn111_edges.drop_duplicates(subset=['source_id', 'target_id'])):,}")

In [None]:
tfs_only_in_rn111 = rn111_chipseq_ground_truth[~rn111_chipseq_ground_truth["source_id"].isin(sliding_window_tf_tg["source_id"])]["source_id"].drop_duplicates()
tgs_only_in_rn111 = rn111_chipseq_ground_truth[~rn111_chipseq_ground_truth["target_id"].isin(sliding_window_tf_tg["target_id"])]["target_id"].drop_duplicates()

print("Only in RN111:")
print(f"  - TFs only in RN111: {len(tfs_only_in_rn111):,}")
print(f"  - TGs only in RN111: {len(tgs_only_in_rn111):,}")
print(f"  - Edges only in RN111: {len(rn111_edges_only):,}")

### Sum of sliding window scores per TF-TG Edge (strength of association)

In [None]:
dev_plots.plot_score_heatmap_by_tf_tg(
    df=edges_in_sliding_and_rn111,
    title="Sliding Window Scores in RN111 by TF-TG Edges",
    score_col="sliding_window_score",
    max_tfs=edges_in_sliding_and_rn111["source_id"].nunique(),
    max_tgs=edges_in_sliding_and_rn111["target_id"].nunique()
    )

In [None]:
dev_plots.plot_score_heatmap_by_tf_tg(
    df=sliding_window_not_rn111_edges,
    title="Sliding Window Scores NOT in RN111 by TF-TG Edges",
    score_col="sliding_window_score",
    max_tfs=sliding_window_not_rn111_edges["source_id"].nunique(),
    max_tgs=sliding_window_not_rn111_edges["target_id"].nunique()
    )

### Number of Scores per TF

#### Both Sliding Window and RN111

In [None]:
edges_in_sliding_and_rn111_by_tf = (
    edges_in_sliding_and_rn111
    .groupby("source_id")
    .count()
    .sort_values("target_id", ascending=False)
    .rename(columns={'sliding_window_score':'Num Scores per TF'})
    ['Num Scores per TF']
    .reset_index()
    )

In [None]:
importlib.reload(dev_plots)
dev_plots.plot_scores_grouped_by_tf(
    edges_in_sliding_and_rn111, 
    title="Sliding Window Scores in RN111 - Number of Scores per TF", 
    score_col="sliding_window_score",
    top_tf_limit=75
    )

#### Only in Sliding Window

In [None]:
sliding_window_not_rn111_edges_by_tf = (
    sliding_window_not_rn111_edges
    .groupby("source_id")
    .count()
    .sort_values("target_id", ascending=False)
    .rename(columns={'sliding_window_score':'Num Scores per TF'})
    ['Num Scores per TF']
    .reset_index()
    )

In [None]:
importlib.reload(dev_plots)
dev_plots.plot_scores_grouped_by_tf(
    sliding_window_not_rn111_edges, 
    title="Sliding Window Scores NOT in RN111 - Number of Scores per TF", 
    score_col="sliding_window_score",
    top_tf_limit=911
    )

## Score Distributions - Edges in RN111 vs Edges Not in RN111

In [None]:
importlib.reload(dev_plots)
# Compare scores of edges found in RN111 against sliding-window-only edges.
# This section is about RN111, so use the frames from the RN111 merge rather than
# the RN115 frames (edges_in_both / sliding_window_edges_only) from earlier cells.
dev_plots.plot_true_false_distribution(
    true_series=edges_in_sliding_and_rn111["sliding_window_score"],
    false_series=sliding_window_not_rn111_edges["sliding_window_score"],
    xlabel="Sliding Window Score",
    title="Sliding Window Scores",
    balance=True
    )

In [None]:
importlib.reload(dev_plots)

df = (
    edges_in_sliding_and_rn111
    .groupby("source_id")
    .count()
    .sort_values("sliding_window_score", ascending=False)
    .reset_index()
    )
tfs_of_interest = df["source_id"].drop_duplicates().to_list()

num_tfs = 15

dev_plots.plot_score_distribution_by_tf(
    df=edges_in_sliding_and_rn111, 
    score_col="sliding_window_score", 
    title="Sliding window scores in RN111, colored by TF",
    top_tf_limit=num_tfs,
    tfs_of_interest=tfs_of_interest
    )
dev_plots.plot_score_distribution_by_tf(
    df=sliding_window_not_rn111_edges, 
    score_col="sliding_window_score", 
    title="Sliding window scores NOT in RN111, colored by TF",
    top_tf_limit=num_tfs,
    tfs_of_interest = tfs_of_interest
    )

# Comparing Different Methods for Calculating and Evaluating Sliding Window Scores

We have two methods for calculating TF-TG binding scores: 

1. Sum TF binding potential for all peaks targeting the same TG
2. Use individual TF-peak-TG scores

We also have different methods for assigning peaks to TGs
1. Nearest gene TSS to the peak
2. MIRA peak-TG
3. Cicero peak-TG

To evaluate the accuracy of the TF-TG scores, we will use the ChIP-seq ground truth, where the TG for TF-peak edges is assigned using the closest gene's TSS.

We will test the combinations of the calculation methods with the peak-TG assignments to see which are more discriminatory

##  Sliding Window Score Calculation

The sliding window score distributions are different for each TF depending on the TF's binding motif. To determine the probability that each TF will bind to each peak, the probabilities need to be standardized to account for these differences.

### Summing peaks for TF-TG edges

The sliding window score for a TF-TG edge is determined by summing the TF's sliding window scores over all peaks assigned to the TG and dividing by the sum of all of the TF's sliding window scores (the code additionally scales the ratio by $10^6$):

$$\text{Sliding Window Score}_{\text{TF-TG}} = \frac{\sum_{\text{peak} \in \text{TG peaks}} \text{sliding window score}_{\text{TF-peak}}}{\sum_{\text{peak} \in \text{all TF peaks}} \text{sliding window score}_{\text{TF-peak}}}$$

In [None]:
def calculate_summed_tf_tg_score(sliding_window_with_targets: pd.DataFrame):
    """Compute one normalized sliding window score per TF-TG edge.

    The scores of all rows sharing a (``source_id``, ``target_id``) pair are
    summed, divided by the sum of every score belonging to that TF, and
    multiplied by 1e6.

    Args:
        sliding_window_with_targets: Frame with ``source_id``, ``target_id``
            and ``sliding_window_score`` columns.

    Returns:
        DataFrame with ``source_id``, ``target_id`` and the normalized
        ``sliding_window_score``.
    """
    scores_by_tf = sliding_window_with_targets.groupby("source_id")["sliding_window_score"]
    scores_by_edge = sliding_window_with_targets.groupby(["source_id", "target_id"])["sliding_window_score"]

    # Denominator: total score per TF across all of its rows
    tf_totals = scores_by_tf.sum().rename("total_tf_score").reset_index()

    # Numerator: total score per TF-TG edge across the peaks assigned to the TG
    edge_totals = scores_by_edge.sum().rename("tf_to_tg_peak_scores_summed").reset_index()

    # Bring each edge's TF total alongside its summed edge score
    combined = edge_totals.merge(tf_totals, on="source_id", how="left")

    edge_fraction = combined["tf_to_tg_peak_scores_summed"] / combined["total_tf_score"]
    combined["sliding_window_score"] = edge_fraction * 1e6

    return combined[["source_id", "target_id", "sliding_window_score"]]

### Individual peaks for TF-TG edges

We also want to evaluate the performance of using each TF-peak-TG score rather than aggregating. For this, we use the following formula:

$$\text{Sliding Window Score}_{\text{TF-peak-TG}} = \frac{\text{sliding window score}_{\text{TF-peak-TG}}}{\sum \text{Total sliding window scores}_{\text{TF}}}$$

In [None]:
def calculate_tf_peak_tg_score(sliding_window_with_targets: pd.DataFrame):
    """Normalize each individual TF-peak-TG score by the TF's total score.

    Every row's ``sliding_window_score`` is divided by the sum of all scores
    for the same ``source_id`` and multiplied by 1e6. No aggregation across
    peaks is performed.

    Args:
        sliding_window_with_targets: Frame with ``source_id``, ``peak_id``,
            ``target_id`` and ``sliding_window_score`` columns.

    Returns:
        DataFrame with ``source_id``, ``peak_id``, ``target_id`` and the
        normalized ``sliding_window_score``.
    """
    # Total score per TF, used as the normalization denominator
    tf_totals = (
        sliding_window_with_targets
        .groupby("source_id")["sliding_window_score"]
        .sum()
        .rename("total_tf_score")
        .reset_index()
    )

    # Keep the raw score under a different name so the normalized value can
    # take over the "sliding_window_score" column
    raw_scores = sliding_window_with_targets.rename(
        columns={"sliding_window_score": "tf_peak_tg_score"}
    )
    with_totals = raw_scores.merge(tf_totals, on="source_id", how="left")

    normalized = with_totals.assign(
        sliding_window_score=lambda d: (d["tf_peak_tg_score"] / d["total_tf_score"]) * 1e6
    )

    return normalized[["source_id", "peak_id", "target_id", "sliding_window_score"]]

## Loading the RN111 and RN115 Ground Truth Datasets

### Loading RN111

In [None]:
from grn_inference.utils import read_ground_truth
rn111_chipseq_ground_truth = read_ground_truth("/gpfs/Labs/Uzun/DATA/PROJECTS/2024.SC_MO_TRN_DB.MIRA/REPOSITORY/CURRENT/REFERENCE_NETWORKS/RN111_ChIPSeq_BEELINE_Mouse_ESC.tsv").drop(columns=["Relationship"])
rn111_chipseq_ground_truth['source_id'] = rn111_chipseq_ground_truth['source_id'].str.capitalize()
rn111_chipseq_ground_truth['target_id'] = rn111_chipseq_ground_truth['target_id'].str.capitalize()

### Loading RN115

In [None]:
rn115_ko_ground_truth = pd.read_csv("/gpfs/Labs/Uzun/DATA/PROJECTS/2024.SC_MO_TRN_DB.MIRA/REPOSITORY/CURRENT/REFERENCE_NETWORKS/RN115_LOGOF_ESCAPE_Mouse_ESC.tsv", sep="\t")
rn115_ko_ground_truth = rn115_ko_ground_truth[["Source", "Target"]].rename(columns={"Source":"source_id", "Target":"target_id"})
rn115_ko_ground_truth["source_id"] = rn115_ko_ground_truth["source_id"].str.capitalize()
rn115_ko_ground_truth["target_id"] = rn115_ko_ground_truth["target_id"].str.capitalize()

### Merging TF-TG dataframes with the ground truth

In [None]:
def merge_dataset_with_ground_truth(df: pd.DataFrame, ground_truth: pd.DataFrame):
    """Label each scored TF-TG row by whether the edge exists in the ground truth.

    Gene names in ``df`` are capitalized (first letter upper-case, rest
    lower-case) before matching. Both frames are first restricted to the TFs
    and TGs they have in common, so that an edge is only labeled ``False`` when
    the ground truth could have contained it.

    Args:
        df: Scored edges with ``source_id`` and ``target_id`` columns. The
            caller's frame is not modified.
        ground_truth: Reference edges with ``source_id`` and ``target_id``
            columns, expected to already use the same capitalization.

    Returns:
        The filtered rows of ``df`` plus a boolean ``label`` column that is
        ``True`` when the (``source_id``, ``target_id``) pair is present in
        ``ground_truth``.
    """
    key_cols = ["source_id", "target_id"]

    # assign() returns a new frame, so the capitalization does not leak back into
    # the caller's DataFrame (which is reused for several ground truths)
    df = df.assign(
        source_id=df["source_id"].str.capitalize(),
        target_id=df["target_id"].str.capitalize(),
    )

    shared_sources = set(df["source_id"]) & set(ground_truth["source_id"])
    shared_targets = set(df["target_id"]) & set(ground_truth["target_id"])

    df_filtered = df[
        df["source_id"].isin(shared_sources) &
        df["target_id"].isin(shared_targets)
    ]

    gt_filtered = ground_truth[
        ground_truth["source_id"].isin(shared_sources) &
        ground_truth["target_id"].isin(shared_targets)
    ]

    # Left join: only scored edges are kept. An outer join would also add
    # ground-truth-only edges, which have no score and would be labeled False
    # even though they are true edges.
    df_merged = pd.merge(df_filtered, gt_filtered, on=key_cols, how="left", indicator=True)

    df_merged["label"] = df_merged["_merge"] == "both"

    return df_merged.drop(columns=["_merge"])

## mESC Filtered L2 E7.5 Rep1

We first need to read in the sliding window scores, peak to gene distances, MIRA peak-TG scores, and the Cicero peak-TG scores for the mESC filtered L2 E7.5 Rep1 dataset.

### Reading in the peak-TG score files

In [None]:
# Inputs for the mESC filtered L2 E7.5 rep1 dataset
mesc_output_dir = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output/mESC/filtered_L2_E7.5_rep1"
mesc_sliding_window_df = pd.read_parquet(os.path.join(mesc_output_dir, "no_norm_sliding_window_tf_to_peak_score.parquet"), engine="pyarrow")

# Keep this as a path: set_tg_as_closest_gene_tss() reads the parquet file itself,
# so handing it an already-loaded DataFrame would fail inside pd.read_parquet
mesc_peak_to_gene_distance_file = os.path.join(mesc_output_dir, "tmp/peaks_near_genes.parquet")
mesc_mira_df = pd.read_parquet("/gpfs/Home/esm5360/MIRA/mira-datasets/mESC_filtered_L2_E7.5_rep1/mESC_E7.5_rep1_mira_peak_to_tg_scores_nonzero.parquet", engine="pyarrow")
mesc_cicero_peak_to_tg_df = pd.read_parquet(os.path.join(mesc_output_dir, "cicero_peak_to_tg_scores.parquet"), engine="pyarrow")

### Setting the TGs for the TF-peak edges using closest TSS, MIRA, or Cicero

In [None]:
mesc_gene_tss_sliding_window_df = set_tg_as_closest_gene_tss(mesc_sliding_window_df, mesc_peak_to_gene_distance_file)
mesc_mira_sliding_window_df = set_tg_using_mira_peak_tg_edges(mesc_sliding_window_df, mesc_mira_df)
mesc_cicero_sliding_window_df = set_tg_using_cicero_peak_tg_edges(mesc_sliding_window_df, mesc_cicero_peak_to_tg_df)

## DS011

Next, I will evaluate the combinations for DS011. We will load in the sliding window scores and the different peak to TG files we will use to set the TGs for the TF-peak edges.

### Reading in the peak-TG score files

In [None]:
# Inputs for the DS011 dataset
ds011_output_dir = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output/DS011_mESC/DS011_mESC_sample1/"
ds011_sliding_window_df = pd.read_parquet(os.path.join(ds011_output_dir, "no_norm_sliding_window_tf_to_peak_score.parquet"), engine="pyarrow")

# Path only (no engine= argument): the file is read later by set_tg_as_closest_gene_tss().
# The previous `..., engine="pyarrow"` tail on this assignment was a SyntaxError.
ds011_peak_to_gene_distance_file = os.path.join(ds011_output_dir, "peaks_near_genes.parquet")
ds011_mira_df = pd.read_parquet("/gpfs/Home/esm5360/MIRA/mira-datasets/ds011_full/ds011_full_mira_peak_to_tg_scores.parquet", engine="pyarrow")
ds011_cicero_peak_to_tg_df = pd.read_parquet(os.path.join(ds011_output_dir, "cicero_peak_to_tg_scores.parquet"), engine="pyarrow")

In [None]:
ds011_peak_to_gene_distance_file = os.path.join(ds011_output_dir, "peaks_near_genes.parquet")

#### Score Distributions

In [None]:
def plot_scores_distribution(df: pd.DataFrame, title: str):
    """Plot the sliding window score distributions of labeled-true vs labeled-false rows.

    Args:
        df: Frame with a ``label`` column and a ``sliding_window_score`` column.
        title: Title passed through to the plot.

    Raises:
        AssertionError: If ``df`` has no ``label`` column.
    """
    assert "label" in df.columns, f"label column does not exist, columns: {df.columns}"

    # Split the scores by ground-truth label
    is_true_edge = df["label"] == True
    is_false_edge = df["label"] == False
    scores_for_true_edges = df.loc[is_true_edge, "sliding_window_score"]
    scores_for_false_edges = df.loc[is_false_edge, "sliding_window_score"]

    dev_plots.plot_true_false_distribution(
        true_series=scores_for_true_edges,
        false_series=scores_for_false_edges,
        xlabel="Sliding Window Score",
        title=title,
        balance=True,
        log=False,
    )

### Setting the TGs for the TF-peak edges using closest TSS, MIRA, or Cicero

In [None]:
ds011_gene_tss_sliding_window_df = set_tg_as_closest_gene_tss(ds011_sliding_window_df, ds011_peak_to_gene_distance_file)
ds011_mira_sliding_window_df = set_tg_using_mira_peak_tg_edges(ds011_sliding_window_df, ds011_mira_df)
ds011_cicero_sliding_window_df = set_tg_using_cicero_peak_tg_edges(ds011_sliding_window_df, ds011_cicero_peak_to_tg_df)

In [None]:
def compare_scoring_method_combinations(
    df_with_tgs: pd.DataFrame,
    rn111: pd.DataFrame,
    rn115: pd.DataFrame,
    tg_method: str = "nearest gene TSS",
    dataset_name: str = "DS011",
):
    """Plot score distributions and AUROC for every scoring method / ground truth pair.

    The sliding window scores are scored two ways (summed TF-TG and individual
    TF-peak-TG). Each result is merged with each ground truth, and for every
    merged frame a true/false score distribution and an AUROC curve are plotted.
    Plot order is: Summed/RN111, Summed/RN115, Individual/RN111, Individual/RN115.

    Args:
        df_with_tgs: Sliding window scores with TGs already assigned to the
            TF-peak edges.
        rn111: RN111 ground truth, passed to ``merge_dataset_with_ground_truth``.
        rn115: RN115 ground truth, passed to ``merge_dataset_with_ground_truth``.
        tg_method: Name of the TG-assignment method, used only in plot titles.
            Defaults to "nearest gene TSS", which was previously hard-coded, so
            MIRA and Cicero runs were mislabeled; pass e.g. "MIRA" or "Cicero".
        dataset_name: Dataset name used only in plot titles. Defaults to "DS011".
    """
    # Score the edges both ways up front; dict order fixes the plotting order.
    scored_dfs = {
        "Summed": calculate_summed_tf_tg_score(df_with_tgs),
        "Individual": calculate_tf_peak_tg_score(df_with_tgs),
    }
    ground_truths = {"RN111": rn111, "RN115": rn115}

    for scoring_name, scored_df in scored_dfs.items():
        for ground_truth_name, ground_truth_df in ground_truths.items():
            # plot_scores_distribution requires a "label" column, so the merge
            # is presumably what adds it (merge helper is defined elsewhere).
            labeled_df = merge_dataset_with_ground_truth(scored_df, ground_truth_df)

            title = f"{dataset_name} Sliding Window Scores - {scoring_name}, {ground_truth_name}, {tg_method}"
            plot_scores_distribution(labeled_df, title=title)
            dev_plots.plot_auroc(labeled_df, score_col="sliding_window_score", title=title)

In [None]:
# Run the full comparison once per TG-assignment method: closest gene TSS, MIRA, then Cicero.
# NOTE(review): the plot titles are produced inside compare_scoring_method_combinations, so as
# called here all three runs are titled "... nearest gene TSS", including the MIRA and Cicero ones.
for _ds011_df_with_tgs in (
    ds011_gene_tss_sliding_window_df,
    ds011_mira_sliding_window_df,
    ds011_cicero_sliding_window_df,
):
    compare_scoring_method_combinations(_ds011_df_with_tgs, rn111_chipseq_ground_truth, rn115_ko_ground_truth)

### TF-TG scores: summing across peaks for each TG

In [None]:
# Summed TF-TG scores (per the section heading: summed across peaks for each TG), computed once
# per TG-assignment method: closest gene TSS, MIRA, Cicero.
ds011_gene_tss_tf_tg_sum_df = calculate_summed_tf_tg_score(ds011_gene_tss_sliding_window_df)
ds011_mira_tf_tg_sum_df = calculate_summed_tf_tg_score(ds011_mira_sliding_window_df)
ds011_cicero_tf_tg_sum_df = calculate_summed_tf_tg_score(ds011_cicero_sliding_window_df)

### TF-TG scores: Individual TF-peak-TG scores

In [None]:
# Individual TF-peak-TG scores, computed once per TG-assignment method: closest gene TSS, MIRA,
# Cicero.
ds011_gene_tss_tf_peak_tg_df = calculate_tf_peak_tg_score(ds011_gene_tss_sliding_window_df)
ds011_mira_tf_peak_tg_df = calculate_tf_peak_tg_score(ds011_mira_sliding_window_df)
ds011_cicero_tf_peak_tg_df = calculate_tf_peak_tg_score(ds011_cicero_sliding_window_df)

### Comparing summed TF-TG scores against RN111

In [None]:
# Merge each summed TF-TG score frame with the RN111 ground truth. The merged frames are the ones
# passed to plot_scores_distribution, which requires a "label" column.
ds011_gene_tss_tf_tg_sum_rn111_df = merge_dataset_with_ground_truth(ds011_gene_tss_tf_tg_sum_df, rn111_chipseq_ground_truth)
ds011_mira_tf_tg_sum_rn111_df = merge_dataset_with_ground_truth(ds011_mira_tf_tg_sum_df, rn111_chipseq_ground_truth)
ds011_cicero_tf_tg_sum_rn111_df = merge_dataset_with_ground_truth(ds011_cicero_tf_tg_sum_df, rn111_chipseq_ground_truth)

In [None]:
# True/false score distributions for the summed TF-TG scores against RN111, one plot per
# TG-assignment method.
for _labeled_df, _plot_title in (
    (ds011_gene_tss_tf_tg_sum_rn111_df, "DS011 Sliding Window Scores - Summed, RN111, nearest gene TSS"),
    (ds011_mira_tf_tg_sum_rn111_df, "DS011 Sliding Window Scores - Summed, RN111, MIRA"),
    (ds011_cicero_tf_tg_sum_rn111_df, "DS011 Sliding Window Scores - Summed, RN111, Cicero"),
):
    plot_scores_distribution(_labeled_df, title=_plot_title)


#### AUROC Scores

In [None]:
# AUROC curves for the summed TF-TG scores against RN111, one plot per TG-assignment method.
for _labeled_df, _plot_title in (
    (ds011_gene_tss_tf_tg_sum_rn111_df, "DS011 Sliding Window Scores - Summed, RN111, nearest gene TSS"),
    (ds011_mira_tf_tg_sum_rn111_df, "DS011 Sliding Window Scores - Summed, RN111, MIRA"),
    (ds011_cicero_tf_tg_sum_rn111_df, "DS011 Sliding Window Scores - Summed, RN111, Cicero"),
):
    dev_plots.plot_auroc(_labeled_df, score_col="sliding_window_score", title=_plot_title)

### Comparing summed TF-TG scores against RN115

In [None]:
# Merge each summed TF-TG score frame with the RN115 ground truth. The merged frames are the ones
# passed to plot_scores_distribution, which requires a "label" column.
ds011_gene_tss_tf_tg_sum_rn115_df = merge_dataset_with_ground_truth(ds011_gene_tss_tf_tg_sum_df, rn115_ko_ground_truth)
ds011_mira_tf_tg_sum_rn115_df = merge_dataset_with_ground_truth(ds011_mira_tf_tg_sum_df, rn115_ko_ground_truth)
ds011_cicero_tf_tg_sum_rn115_df = merge_dataset_with_ground_truth(ds011_cicero_tf_tg_sum_df, rn115_ko_ground_truth)

#### Score Distributions

In [None]:
# True/false score distributions for the summed TF-TG scores against RN115, one plot per
# TG-assignment method.
for _labeled_df, _plot_title in (
    (ds011_gene_tss_tf_tg_sum_rn115_df, "DS011 Sliding Window Scores - Summed, RN115, nearest gene TSS"),
    (ds011_mira_tf_tg_sum_rn115_df, "DS011 Sliding Window Scores - Summed, RN115, MIRA"),
    (ds011_cicero_tf_tg_sum_rn115_df, "DS011 Sliding Window Scores - Summed, RN115, Cicero"),
):
    plot_scores_distribution(_labeled_df, title=_plot_title)

#### AUROC Scores

In [None]:
# AUROC curves for the summed TF-TG scores against RN115, one plot per TG-assignment method.
for _labeled_df, _plot_title in (
    (ds011_gene_tss_tf_tg_sum_rn115_df, "DS011 Sliding Window Scores - Summed, RN115, nearest gene TSS"),
    (ds011_mira_tf_tg_sum_rn115_df, "DS011 Sliding Window Scores - Summed, RN115, MIRA"),
    (ds011_cicero_tf_tg_sum_rn115_df, "DS011 Sliding Window Scores - Summed, RN115, Cicero"),
):
    dev_plots.plot_auroc(_labeled_df, score_col="sliding_window_score", title=_plot_title)

---

### Comparing individual TF-peak-TG scores against RN111

In [None]:
# Merge each individual TF-peak-TG score frame with the RN111 ground truth. The merged frames are
# the ones passed to plot_scores_distribution, which requires a "label" column.
ds011_gene_tss_tf_peak_tg_rn111_df = merge_dataset_with_ground_truth(ds011_gene_tss_tf_peak_tg_df, rn111_chipseq_ground_truth)
ds011_mira_tf_peak_tg_rn111_df = merge_dataset_with_ground_truth(ds011_mira_tf_peak_tg_df, rn111_chipseq_ground_truth)
ds011_cicero_tf_peak_tg_rn111_df = merge_dataset_with_ground_truth(ds011_cicero_tf_peak_tg_df, rn111_chipseq_ground_truth)

#### Score Distributions

In [None]:
# True/false score distributions for the individual TF-peak-TG scores against RN111, one plot per
# TG-assignment method (closest gene TSS, MIRA, Cicero), drawn with empty titles.
# NOTE(review): the titled variants are kept commented out below; presumably the titles were
# blanked for figure export -- confirm before restoring them.
# plot_scores_distribution(ds011_gene_tss_tf_peak_tg_rn111_df, title="DS011 Sliding Window Scores - Individual, RN111, nearest gene TSS")
# plot_scores_distribution(ds011_mira_tf_peak_tg_rn111_df, title="DS011 Sliding Window Scores - Individual, RN111, MIRA")
# plot_scores_distribution(ds011_cicero_tf_peak_tg_rn111_df, title="DS011 Sliding Window Scores - Individual, RN111, Cicero")
plot_scores_distribution(ds011_gene_tss_tf_peak_tg_rn111_df, title="")
plot_scores_distribution(ds011_mira_tf_peak_tg_rn111_df, title="")
plot_scores_distribution(ds011_cicero_tf_peak_tg_rn111_df, title="")

#### AUROC Scores

In [None]:
# AUROC curves for the individual TF-peak-TG scores against RN111, one plot per TG-assignment
# method. These must use the *_tf_peak_tg_rn111_df frames: the cell previously re-plotted the
# summed *_tf_tg_sum_rn111_df frames under "Individual" titles.
dev_plots.plot_auroc(ds011_gene_tss_tf_peak_tg_rn111_df, score_col="sliding_window_score", title="DS011 Sliding Window Scores - Individual, RN111, nearest gene TSS")
dev_plots.plot_auroc(ds011_mira_tf_peak_tg_rn111_df, score_col="sliding_window_score", title="DS011 Sliding Window Scores - Individual, RN111, MIRA")
dev_plots.plot_auroc(ds011_cicero_tf_peak_tg_rn111_df, score_col="sliding_window_score", title="DS011 Sliding Window Scores - Individual, RN111, Cicero")


### Comparing individual TF-peak-TG scores against RN115

In [None]:
# Merge each individual TF-peak-TG score frame with the RN115 ground truth. The merged frames are
# the ones passed to plot_scores_distribution, which requires a "label" column.
ds011_gene_tss_tf_peak_tg_rn115_df = merge_dataset_with_ground_truth(ds011_gene_tss_tf_peak_tg_df, rn115_ko_ground_truth)
ds011_mira_tf_peak_tg_rn115_df = merge_dataset_with_ground_truth(ds011_mira_tf_peak_tg_df, rn115_ko_ground_truth)
ds011_cicero_tf_peak_tg_rn115_df = merge_dataset_with_ground_truth(ds011_cicero_tf_peak_tg_df, rn115_ko_ground_truth)

#### Score Distributions

In [None]:
# True/false score distributions for the individual TF-peak-TG scores against RN115, one plot per
# TG-assignment method.
for _labeled_df, _plot_title in (
    (ds011_gene_tss_tf_peak_tg_rn115_df, "DS011 Sliding Window Scores - Individual, RN115, nearest gene TSS"),
    (ds011_mira_tf_peak_tg_rn115_df, "DS011 Sliding Window Scores - Individual, RN115, MIRA"),
    (ds011_cicero_tf_peak_tg_rn115_df, "DS011 Sliding Window Scores - Individual, RN115, Cicero"),
):
    plot_scores_distribution(_labeled_df, title=_plot_title)

#### AUROC Scores

In [None]:
# AUROC curves for the individual TF-peak-TG scores against RN115, one plot per TG-assignment
# method. These must use the *_tf_peak_tg_rn115_df frames: the cell previously re-plotted the
# summed *_tf_tg_sum_rn115_df frames under "Individual" titles.
dev_plots.plot_auroc(ds011_gene_tss_tf_peak_tg_rn115_df, score_col="sliding_window_score", title="DS011 Sliding Window Scores - Individual, RN115, nearest gene TSS")
dev_plots.plot_auroc(ds011_mira_tf_peak_tg_rn115_df, score_col="sliding_window_score", title="DS011 Sliding Window Scores - Individual, RN115, MIRA")
dev_plots.plot_auroc(ds011_cicero_tf_peak_tg_rn115_df, score_col="sliding_window_score", title="DS011 Sliding Window Scores - Individual, RN115, Cicero")