# Investigating Sliding Window Scores

The sliding window scores should have more overlap with the ground truth datasets than they do. I am going to go step-by-step to re-run the analysis, using the raw non-normalized sliding window scores from the method output file.

In [None]:
# Shell escape: print the host's identity via `hostnamectl` so the machine this notebook ran on is recorded in the output.
!hostnamectl

In [None]:
import os
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
import dev.notebooks.plotting as dev_plots
# Directory holding the pipeline outputs for DS011 mESC sample 1; the parquet inputs loaded later are joined onto this path.
# NOTE(review): absolute cluster path — the notebook only runs where /gpfs is mounted.
output_dir = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output/DS011_mESC/DS011_mESC_sample1/"

In [None]:
import importlib
# Re-import the `dev.notebooks.plotting` module so edits made to it are picked up without restarting the kernel.
importlib.reload(dev_plots)

## Loading Sliding Window scores and adding closest gene as the target

First, we load in the raw sliding window scores from the direct output of the sliding window score calculation method, without normalization or scaling. 

To get TF-TG edges, we load in the dataframe of all peaks within 1 Mb of each gene and find the closest gene for each peak. We use the closest gene to each peak as the TG for each TF-peak edge in the sliding window scores.

In [None]:
# Load the raw (non-normalized) sliding window TF-to-peak scores.
raw_sliding_window_scores = pd.read_parquet(
    os.path.join(output_dir, "no_norm_sliding_window_tf_to_peak_score.parquet"),
    engine="pyarrow",
)

# Read in the peaks to TG data and pick the closest gene for each peak (maximum TSS distance score).
# Sorting first means `groupby(...).first()` returns the top-scoring gene row for every peak.
peaks_near_genes_df = pd.read_parquet(os.path.join(output_dir, "peaks_near_genes.parquet"), engine="pyarrow")
closest_gene_to_peak_df = (
    peaks_near_genes_df
    .sort_values("TSS_dist_score", ascending=False)
    .groupby("peak_id")
    .first()
)
closest_gene_to_peak_df = closest_gene_to_peak_df[["target_id"]].reset_index()

# Set the TG for each TF-peak edge as the closest gene to the peak.
# The left join keeps every TF-peak score, so peaks that have no entry in
# `peaks_near_genes_df` end up with a missing `target_id` in this frame.
sliding_window_with_targets = pd.merge(
    raw_sliding_window_scores,
    closest_gene_to_peak_df,
    on=["peak_id"],
    how="left",
)

# A TF-peak score without a target gene is not a TF-TG edge. Drop those rows so they are
# not counted as edges and do not end up in the "sliding window only" set after the
# merge with the ground truth.
sliding_window_tf_tg = (
    sliding_window_with_targets[["source_id", "target_id", "sliding_window_score"]]
    .dropna(subset=["target_id"])
    .drop_duplicates()
)
unique_sliding_window_tf_tg_edges = sliding_window_tf_tg[["source_id", "target_id"]].drop_duplicates()

In [None]:
# Report the size of the sliding window network at each level of aggregation.
print(
    "Sliding Window scores",
    f"  - TFs: {sliding_window_tf_tg['source_id'].nunique():,}",
    f"  - TGs: {sliding_window_tf_tg['target_id'].nunique():,}",
    f"  - TF-peak-TG Edges: {len(sliding_window_with_targets):,}",
    f"  - TF-TG-Score Edges: {len(sliding_window_tf_tg):,}",
    f"  - TF-TG Edges: {len(unique_sliding_window_tf_tg_edges):,}",
    sep="\n",
)

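Next, we load the RN115 LOGOF ESCAPE ground truth data and rename its TF and TG columns to match the sliding window DataFrame columns (`source_id`, `target_id`). The gene names are also capitalized (first letter upper-case, the rest lower-case).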

In [None]:
# Load the RN115 LOGOF ESCAPE ground truth and reduce it to a unique TF-TG edge list.
#
# `str.capitalize()` upper-cases the first character and lower-cases the rest, so any
# symbol with internal capitals is rewritten as well.
# NOTE(review): this assumes the sliding window gene IDs follow the same convention —
# confirm, since a naming mismatch would silently reduce the overlap.
rn115_ko_ground_truth = (
    pd.read_csv(
        "/gpfs/Labs/Uzun/DATA/PROJECTS/2024.SC_MO_TRN_DB.MIRA/REPOSITORY/CURRENT/REFERENCE_NETWORKS/RN115_LOGOF_ESCAPE_Mouse_ESC.tsv",
        sep="\t",
    )
    [["Source", "Target"]]
    .rename(columns={"Source": "source_id", "Target": "target_id"})
    .assign(
        source_id=lambda edges: edges["source_id"].str.capitalize(),
        target_id=lambda edges: edges["target_id"].str.capitalize(),
    )
    # Duplicate edges (including ones that only collapse after case normalization) would
    # multiply every matching score row in the later merge, so keep each TF-TG pair once.
    .drop_duplicates()
)

In [None]:
# Report how many TFs, TGs and edges the ground truth network contains.
print(
    "RN115 LOGOF Ground Truth",
    f"  - TFs: {rn115_ko_ground_truth['source_id'].nunique():,}",
    f"  - TGs: {rn115_ko_ground_truth['target_id'].nunique():,}",
    f"  - Edges: {len(rn115_ko_ground_truth):,}",
    sep="\n",
)

### Number of targets per TF in RN115

In [None]:
# Count the ground-truth rows for each TF and list the most connected TFs first.
# The result keeps `source_id` as the index and the row count in the `target_id` column.
rn115_targets_per_tf = (
    rn115_ko_ground_truth
    .groupby("source_id")
    .count()
    .sort_values(by="target_id", ascending=False)
)
print("Number of RN115 TGs per TF")
rn115_targets_per_tf

### Number of targets per TF in the sliding window scores

In [None]:
# Display the TF-TG-score table (one row per unique source_id / target_id / sliding_window_score combination).
sliding_window_tf_tg

In [None]:
# Number of distinct target genes per TF, most connected TFs first.
# `sliding_window_tf_tg` holds one row per unique TF-TG-score combination, so a TF-TG pair
# supported by several peaks appears several times. Counting rows would report the number
# of scores; `nunique()` reports the number of targets, which is what the label states.
sliding_window_targets_per_tf = (
    sliding_window_tf_tg
    .groupby("source_id")["target_id"]
    .nunique()
    .sort_values(ascending=False)
)
print("Number of Sliding Window TGs per TF")
sliding_window_targets_per_tf

## Overlap between RN115 and DS011 Sliding Window Scores

Merge the datasets together. By doing a full outer join with `indicator=True`, we can separate the edges found in both datasets from the edges found only in the sliding window scores and the edges found only in RN115.

In [None]:
# Full outer join on the TF-TG pair. `indicator=True` adds a `_merge` column that records
# whether each row came from the sliding window scores ("left_only"), from RN115
# ("right_only"), or from both.
edge_key_columns = ["source_id", "target_id"]
merged_edges = sliding_window_tf_tg.merge(
    rn115_ko_ground_truth,
    on=edge_key_columns,
    how="outer",
    indicator=True,
)
merged_edges

In [None]:
# Split the merged table by where each row came from, then drop the helper column.
merge_origin = merged_edges["_merge"]
edges_in_both = merged_edges.loc[merge_origin == "both"].drop(columns="_merge")
sliding_window_edges_only = merged_edges.loc[merge_origin == "left_only"].drop(columns="_merge")
rn115_edges_only = merged_edges.loc[merge_origin == "right_only"].drop(columns="_merge")

In [None]:
# Unique TFs and TGs among the edges that are present in both datasets.
tfs_in_both = edges_in_both.loc[:, "source_id"].drop_duplicates()
tgs_in_both = edges_in_both.loc[:, "target_id"].drop_duplicates()

# A TF-TG pair can carry several scores, so report both the row count and the pair count.
print(
    "Edges in both RN115 and Sliding Window:",
    f"  - TFs in both: {len(tfs_in_both):,}",
    f"  - TGs in both: {len(tgs_in_both):,}",
    f"  - TF-TG-Score Edges in both: {len(edges_in_both):,}",
    f"  - TF-TG Edges in both: {len(edges_in_both[['source_id', 'target_id']].drop_duplicates()):,}",
    sep="\n",
)

In [None]:
# TFs / TGs that appear in the sliding window scores but nowhere in RN115
# (membership is checked per gene, independent of which edges they belong to).
tf_missing_from_rn115 = ~sliding_window_tf_tg["source_id"].isin(rn115_ko_ground_truth["source_id"])
tg_missing_from_rn115 = ~sliding_window_tf_tg["target_id"].isin(rn115_ko_ground_truth["target_id"])
tfs_only_in_sliding_window = sliding_window_tf_tg.loc[tf_missing_from_rn115, "source_id"].drop_duplicates()
tgs_only_in_sliding_window = sliding_window_tf_tg.loc[tg_missing_from_rn115, "target_id"].drop_duplicates()

print("Only in Sliding Window:")
print(f"  - TFs only in Sliding Window: {len(tfs_only_in_sliding_window):,}")
print(f"  - TGs only in Sliding Window: {len(tgs_only_in_sliding_window):,}")
# `sliding_window_edges_only` has no peak column: each row is a TF-TG-score combination,
# so it is labelled the same way as the matching count for the edges found in both datasets.
print(f"  - TF-TG-Score Edges only in Sliding Window: {len(sliding_window_edges_only):,}")
print(f"  - TF-TG Edges only in Sliding Window: {len(sliding_window_edges_only.drop_duplicates(subset=['source_id', 'target_id'])):,}")

In [None]:
# TFs / TGs that appear in RN115 but nowhere in the sliding window scores
# (membership is checked per gene, independent of which edges they belong to).
tf_missing_from_sliding_window = ~rn115_ko_ground_truth["source_id"].isin(sliding_window_tf_tg["source_id"])
tg_missing_from_sliding_window = ~rn115_ko_ground_truth["target_id"].isin(sliding_window_tf_tg["target_id"])
tfs_only_in_rn115 = rn115_ko_ground_truth.loc[tf_missing_from_sliding_window, "source_id"].drop_duplicates()
tgs_only_in_rn115 = rn115_ko_ground_truth.loc[tg_missing_from_sliding_window, "target_id"].drop_duplicates()

print("Only in RN115:")
print(f"  - TFs only in RN115: {len(tfs_only_in_rn115):,}")
print(f"  - TGs only in RN115: {len(tgs_only_in_rn115):,}")
# This count comes from `rn115_edges_only`, so it is labelled as RN115-only.
print(f"  - Edges only in RN115: {len(rn115_edges_only):,}")

In [None]:
# Compare the score distribution of edges supported by RN115 ("true") with the scores of
# edges found only in the sliding window output ("false").
# NOTE(review): `balance=True` presumably equalizes the two group sizes before plotting —
# confirm against `dev_plots.plot_true_false_distribution`.
scores_in_ground_truth = edges_in_both["sliding_window_score"]
scores_not_in_ground_truth = sliding_window_edges_only["sliding_window_score"]
dev_plots.plot_true_false_distribution(
    true_series=scores_in_ground_truth,
    false_series=scores_not_in_ground_truth,
    xlabel="Sliding Window Score",
    title="Sliding Window Scores",
    balance=True,
)

### Number of scores per TF-TG Edge (number of peaks)

#### Both Sliding Window and RN115

In [None]:
# For every TF-TG pair found in both datasets, count how many distinct scores it carries.
score_counts_in_both = (
    edges_in_both
    .groupby(["source_id", "target_id"])
    .count()
    .rename(columns={'sliding_window_score':'Num Scores per TF-TG Edge'})
)
# Largest counts first; `reset_index` turns the TF / TG group keys back into columns.
edges_in_both_by_tf_tg = (
    score_counts_in_both
    .sort_values(by="Num Scores per TF-TG Edge", ascending=False)
    .reset_index()
)
edges_in_both_by_tf_tg

#### Only in Sliding Window

In [None]:
# For every TF-TG pair found only in the sliding window output, count how many distinct scores it carries.
score_counts_sliding_window_only = (
    sliding_window_edges_only
    .groupby(["source_id", "target_id"])
    .count()
    .rename(columns={'sliding_window_score':'Num Scores per TF-TG Edge'})
)
# Largest counts first; `reset_index` turns the TF / TG group keys back into columns.
# The variable name (with its doubled "only") is kept as-is in case other cells refer to it.
sliding_window_edges_only_only_by_tf_tg = (
    score_counts_sliding_window_only
    .sort_values(by="Num Scores per TF-TG Edge", ascending=False)
    .reset_index()
)
sliding_window_edges_only_only_by_tf_tg

### Number of scores per TF

#### Both Sliding Window and RN115

In [None]:
# Number of score rows per TF among the edges found in both datasets.
# Rows are ordered by the per-TF `target_id` count; the score count is the column kept.
per_tf_counts_in_both = (
    edges_in_both
    .groupby("source_id")
    .count()
    .sort_values("target_id", ascending=False)
)
edges_in_both_by_tf = (
    per_tf_counts_in_both["sliding_window_score"]
    .rename("Num Scores per TF")
    .reset_index()
)
edges_in_both_by_tf

#### Only in Sliding Window

In [None]:
# Number of score rows per TF among the edges found only in the sliding window output.
# The table is ordered by the same quantity it reports. Ordering by the `target_id` count
# instead can leave the displayed column unsorted, because `count()` skips missing values
# and rows without a target gene make the two counts differ.
sliding_window_edges_only_by_tf = (
    sliding_window_edges_only
    .groupby("source_id")["sliding_window_score"]
    .count()
    .sort_values(ascending=False)
    .rename("Num Scores per TF")
    .reset_index()
)
sliding_window_edges_only_by_tf

#### Only in RN115

In [None]:
# Number of RN115-only edges per TF, most connected TFs first.
per_tf_counts_rn115_only = (
    rn115_edges_only
    .groupby("source_id")
    .count()
    .sort_values("target_id", ascending=False)
)
# Keep only the target count, give it a readable name, and restore `source_id` as a column.
rn115_edges_only_by_tf = (
    per_tf_counts_rn115_only[["target_id"]]
    .rename(columns={'target_id':'Number of TGs per TF'})
    .reset_index()
)
rn115_edges_only_by_tf

In [None]:
# Plot the per-TF counts for edges found in both datasets.
# The plotted column is the number of score rows per TF (a TF-TG pair can contribute
# several), so the title names scores rather than targets.
dev_plots.plot_scores_grouped_by_tf(
    edges_in_both_by_tf,
    title="Sliding Window Scores in RN115 - Number of Scores per TF",
    height_col="Num Scores per TF",
)

In [None]:
# Histogram of the sliding window scores for edges that are also present in RN115.
edges_in_both.hist(
    column="sliding_window_score",
    bins=150,
    grid=False,
)

In [None]:
# Show the edges found in both datasets ordered from highest to lowest sliding window score.
# (The bare `edges_in_both.sort_values` only displayed the bound method without sorting anything.)
edges_in_both.sort_values("sliding_window_score", ascending=False)

In [None]:
# Pick up any edits to the plotting helpers before drawing.
importlib.reload(dev_plots)

# Every TF that has at least one edge in both datasets, in order of first appearance.
tfs_of_interest = edges_in_both["source_id"].unique().tolist()

# Plot the sliding window score distribution for those TFs.
dev_plots.plot_score_distribution_by_tf(
    edges_in_both,
    "sliding_window_score",
    tfs_of_interest,
    limit_x=False,
)