# Exploring Score DataFrame Sizes

As we combine the output DataFrames from multiple scoring methods, it is useful to know the sizes of each dataframe and how much overlap they have with one another.

In [None]:
import os
import pandas as pd
import numpy as np
from grn_inference.normalization import minmax_normalize_pandas

# Root of the project on the shared filesystem; every other path hangs off it.
project_dir = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER"

# The input and output trees use the same dataset/sample sub-path.
sample_subpath = "DS011_mESC/DS011_mESC_sample1"
input_dir, output_dir = (
    os.path.join(project_dir, f"{stage}/{sample_subpath}")
    for stage in ("input", "output")
)

## Size of the Input Dataset

In [None]:
# Load the processed scATAC-seq and scRNA-seq tables for this sample.
atac_path = os.path.join(input_dir, "DS011_mESC_ATAC_processed.parquet")
rna_path = os.path.join(input_dir, "DS011_mESC_RNA_processed.parquet")

atac_processed_df = pd.read_parquet(atac_path, engine="pyarrow")
rna_processed_df = pd.read_parquet(rna_path, engine="pyarrow")

# shape is (rows, columns). One column is subtracted from the cell count because
# each table carries an identifier column (`peak_id` / `gene_id`) next to the cells.
n_peaks, n_atac_columns = atac_processed_df.shape
n_genes, n_rna_columns = rna_processed_df.shape

print("scATAC-seq Dataset:")
print(f"  - Cells: {n_atac_columns - 1}")
print(f"  - Peaks: {n_peaks}")

print("\nscRNA-seq Dataset:")
print(f"  - Cells: {n_rna_columns - 1}")
print(f"  - Genes: {n_genes}")

---

## Peak to TG

The peak to TG regulatory potential methods we use are:

1) Cicero
2) TSS Distance
3) MIRA
4) TG Expression / Peak Accessibility correlation

First, let's look at the size of each method's output.

In [None]:
def peak_to_tg_score_summary(score_file_name, method_name) -> pd.DataFrame:
    """Load one peak-to-TG score table and print a short size summary.

    Parameters
    ----------
    score_file_name : str
        Name of a parquet file located in ``output_dir``.
    method_name : str
        Label printed as the header of the summary.

    Returns
    -------
    pd.DataFrame
        The loaded score table, returned unmodified.
    """
    score_df = pd.read_parquet(os.path.join(output_dir, score_file_name))

    print(f"{method_name}:")
    print(f"  - Edges: {len(score_df):,}")
    # The final line carries an extra newline so consecutive summaries are separated.
    for label, column, tail in (
        ("Unique Peaks", "peak_id", ""),
        ("Unique TGs", "target_id", "\n"),
    ):
        print(f"  - {label}: {score_df[column].nunique():,}{tail}")

    return score_df
    

# Parquet file (inside output_dir) that holds each peak-to-TG method's scores.
method_files = {
    "Cicero": "cicero_peak_to_tg_scores.parquet",
    "TSS Distance Score": "tss_distance_score.parquet",
    "MIRA": "ds011_peak_to_gene_lite_rp_score_chrom_diff.parquet",
    "Correlation": "peak_to_gene_correlation.parquet"
}

# Load every table (printing its summary) keyed by method name.
method_dfs = dict(
    (label, peak_to_tg_score_summary(file_name, label))
    for label, file_name in method_files.items()
)

# Per-method sets of peak and target gene identifiers, used for overlap checks.
method_peaks = dict(
    (label, set(scores["peak_id"])) for label, scores in method_dfs.items()
)
method_targets = dict(
    (label, set(scores["target_id"])) for label, scores in method_dfs.items()
)

The TSS distance score DataFrame is much larger than the others. Let's look at how many of the peak to TG edges are shared between scoring methods.

In [None]:
def get_shared_elements(method_dict, label):
    """Print pairwise overlap sizes and return everything seen in two or more methods.

    Parameters
    ----------
    method_dict : dict
        Maps a method name to the set of identifiers that method produced.
    label : str
        Singular noun used in the printed header (an "s" is appended).

    Returns
    -------
    set
        Union of all pairwise intersections, i.e. every identifier present in
        at least two of the methods.
    """
    in_multiple_methods = set()
    print(f"\nShared {label}s:")

    # Take each method in insertion order and compare it with every later one,
    # so each unordered pair is reported exactly once.
    remaining = list(method_dict.keys())
    while remaining:
        first = remaining.pop(0)
        for second in remaining:
            overlap = method_dict[first] & method_dict[second]
            print(f"  - {first} vs {second}: {len(overlap):,}")
            in_multiple_methods |= overlap

    return in_multiple_methods

# Peaks and target genes that appear in at least two of the scoring methods.
shared_peaks, shared_targets = (
    get_shared_elements(elements_by_method, noun)
    for elements_by_method, noun in (
        (method_peaks, "Peak"),
        (method_targets, "Target Gene"),
    )
)

print(f"\nPeaks in more than one method: {len(shared_peaks):,}")
print(f"Target genes in more than one method: {len(shared_targets):,}\n")


There is not a ton of overlap, but this reduces the size of the DataFrame. Now, let's subset each score DataFrame to only use edges whose peak and target gene are each found in more than one scoring method.

In [None]:
def subset_df_to_peaks_and_targets_in_more_than_one_df(df, method_name, shared_peaks, shared_targets) -> pd.DataFrame:
    """Keep rows whose peak and target gene are both in the shared sets, then summarize.

    Parameters
    ----------
    df : pd.DataFrame
        Score table with ``peak_id`` and ``target_id`` columns.
    method_name : str
        Label printed as the header of the summary.
    shared_peaks, shared_targets : collection
        Identifiers a row's ``peak_id`` / ``target_id`` must belong to.

    Returns
    -------
    pd.DataFrame
        The filtered rows, with any row containing a missing value dropped.
    """
    peak_is_shared = df["peak_id"].isin(shared_peaks)
    target_is_shared = df["target_id"].isin(shared_targets)

    # Both conditions must hold; rows with a NaN in any column are discarded too.
    kept = df.loc[peak_is_shared & target_is_shared].dropna()

    print(f"{method_name}:")
    print(f"  - Edges: {len(kept):,}")
    for label, column, tail in (
        ("Unique Peaks", "peak_id", ""),
        ("Unique TGs", "target_id", "\n"),
    ):
        print(f"  - {label}: {kept[column].nunique():,}{tail}")

    return kept

# Restrict every method's table to rows whose peak and target gene are both shared
# with at least one other method (a summary is printed per method).
method_dfs_subset = dict(
    (
        label,
        subset_df_to_peaks_and_targets_in_more_than_one_df(
            scores, label, shared_peaks, shared_targets
        ),
    )
    for label, scores in method_dfs.items()
)

Once we have only edges found in more than one score DataFrame, we can merge the scoring methods together into a single peak to TG DataFrame.

In [None]:
# Every merge is keyed on the (peak, target gene) edge.
edge_keys = ["peak_id", "target_id"]

# Cicero and MIRA are combined with an outer join (an edge needs only one of
# them); Correlation is then required via an inner join.
merge_scores_no_tss = (
    method_dfs_subset["Cicero"]
    .merge(method_dfs_subset["MIRA"], on=edge_keys, how="outer")
    .merge(method_dfs_subset["Correlation"], on=edge_keys, how="inner")
)

# The TSS distance score is also required for every retained edge.
peak_to_tg = merge_scores_no_tss.merge(
    method_dfs_subset["TSS Distance Score"],
    on=edge_keys,
    how="inner",
)
peak_to_tg

Let's look at how sparse the dataset is

In [None]:
# Number of (peak, target gene) rows in the merged table.
total_edges = peak_to_tg[["peak_id", "target_id"]].shape[0]

# Display label -> column name for every score column in the merged table.
score_cols = {
    "Cicero Score": "cicero_score",
    "TSS Distance Score": "TSS_dist_score",
    "LITE Score": "LITE_score",
    "NITE Score": "NITE_score",
    "MIRA Chromatin Differential Score": "chromatin_differential",
    "Correlation Score": "correlation"
}

print(f"Total Edges: {total_edges:,}")
for label in score_cols:
    # Non-null entries in this score column and their share of all edges.
    score_count = peak_to_tg[score_cols[label]].notna().sum()
    fraction = score_count / total_edges
    print(f"  - {label}: {score_count:,} ({fraction:.2%} of total scores)")


The merged dataset isn't too sparse if we outer merge the method score DataFrames, but use an inner merge for the correlation and TSS distance scores. The number of peak to TG edges has dropped dramatically, which may or may not be a good thing. We can add a column called `"support_count"` which counts the number of scores (methods) with non-NaN values in each row.

In [None]:
# Count how many score columns are non-null for each edge.
# The dict view is materialised as a list so pandas receives an explicit column list.
score_col_names = list(score_cols.values())
score_support_counts = peak_to_tg[score_col_names].notna().sum(axis=1)

# Report how many edges meet each minimum-support threshold, from 1 up to and
# including the total number of score columns (the previous loop stopped one short).
# NOTE(review): the count is per score column, not per method - LITE, NITE and
# chromatin_differential appear to all come from the MIRA file.
for min_support in range(1, len(score_col_names) + 1):
    multi_supported_edges = peak_to_tg[score_support_counts >= min_support]

    print(f"Edges supported by ≥{min_support} methods: {len(multi_supported_edges):,} out of {len(peak_to_tg):,}")

peak_to_tg["support_count"] = score_support_counts

Now that we have a DataFrame containing all of the peak to TG scores for at least 2 methods, we can add in the **minmax normalized mean peak accessibility and mean TG expression** for each edge. We use an inner join here to only keep rows that have genes and peaks in the expression and accessibility data.

In [None]:
# Mean accessibility of each peak across all cells, passed through the project's
# min-max normalization helper.
peak_means = (
    atac_processed_df.set_index("peak_id")
    .mean(axis=1)
    .rename("mean_peak_expr")
    .to_frame()
)
mean_peak_expr_norm = minmax_normalize_pandas(peak_means, ["mean_peak_expr"]).reset_index()

# Mean expression of each gene across all cells, normalized the same way.
gene_means = (
    rna_processed_df.set_index("gene_id")
    .mean(axis=1)
    .rename("mean_gene_expr")
    .to_frame()
)
mean_gene_expr_norm = minmax_normalize_pandas(gene_means, ["mean_gene_expr"]).reset_index()

# Inner joins keep only edges whose peak and target gene exist in the input data.
peak_to_tg_mean_acc = peak_to_tg.merge(mean_peak_expr_norm, on="peak_id", how="inner")
full_peak_to_tg = (
    peak_to_tg_mean_acc
    .merge(mean_gene_expr_norm, left_on="target_id", right_on="gene_id", how="inner")
    .drop(columns="gene_id")
    .rename(columns={"mean_gene_expr":"mean_tg_expr"})
)
full_peak_to_tg

---

## TF to Peak

Now that we have the peak to TG scores together, we need to combine the TF to peak binding potential scores. We have two methods that calculate TF to peak scores:

1) Homer
2) Sliding Window

Let's start by loading these score DataFrames in and take a look at their sizes.

In [None]:
def tf_to_peak_score_summary(score_file_name, method_name) -> pd.DataFrame:
    """Load one TF-to-peak score table and print a short size summary.

    Parameters
    ----------
    score_file_name : str
        Name of a parquet file located in ``output_dir``.
    method_name : str
        Label printed as the header of the summary.

    Returns
    -------
    pd.DataFrame
        The loaded score table, returned unmodified.
    """
    score_df = pd.read_parquet(os.path.join(output_dir, score_file_name))

    print(f"{method_name}:")
    print(f"  - Edges: {len(score_df):,}")
    # The final line carries an extra newline so consecutive summaries are separated.
    for label, column, tail in (
        ("Unique TFs", "source_id", ""),
        ("Unique Peaks", "peak_id", "\n"),
    ):
        print(f"  - {label}: {score_df[column].nunique():,}{tail}")

    return score_df

# Parquet file (inside output_dir) that holds each TF-to-peak method's scores.
# NOTE: `method_files`, `method_dfs` and `method_peaks` are rebound here, so the
# peak-to-TG versions created earlier are no longer reachable under these names.
method_files = {
    "Homer": "homer_tf_to_peak.parquet",
    "Sliding Window": "sliding_window_tf_to_peak_score.parquet"
}

# Load every table (printing its summary) keyed by method name.
method_dfs = dict(
    (label, tf_to_peak_score_summary(file_name, label))
    for label, file_name in method_files.items()
)

# Per-method sets of TF and peak identifiers.
method_TFs = dict(
    (label, set(scores["source_id"])) for label, scores in method_dfs.items()
)
method_peaks = dict(
    (label, set(scores["peak_id"])) for label, scores in method_dfs.items()
)

We will merge only on shared peaks and TFs between the two due to the large number of edges. We will also merge in the mean gene expression for the TFs.

In [None]:
sliding_window_df = method_dfs["Sliding Window"]
homer_df = method_dfs["Homer"]

# Keep only (peak, TF) pairs scored by both methods, then attach the normalized
# mean expression of the TF; TFs missing from the expression table are dropped.
tf_to_peak_df = (
    sliding_window_df
    .merge(homer_df, on=["peak_id", "source_id"], how="inner")
    .merge(mean_gene_expr_norm, left_on="source_id", right_on="gene_id", how="inner")
    .drop(columns="gene_id")
    .rename(columns={"mean_gene_expr":"mean_tf_expr"})
)

In [None]:
# Inspect the merged TF-to-peak table (inner join of Sliding Window and Homer scores plus mean TF expression).
tf_to_peak_df

In [None]:
# Attach peak-to-TG information to every TF-to-peak row via the shared peak.
# The right join keeps all TF-to-peak rows; those that found no target gene
# (or lack any of the three identifiers) are then removed.
tf_to_tg_df = (
    full_peak_to_tg
    .merge(tf_to_peak_df, on="peak_id", how="right")
    .dropna(subset=["source_id", "peak_id", "target_id"])
)
tf_to_tg_df

In [None]:
# Show the available column names; the next cell selects and orders a subset of them.
tf_to_tg_df.columns

In [None]:
# Final column order: identifiers, mean expression/accessibility, peak-to-TG
# scores, TF-to-peak scores, and the support count.
ordered_columns = [
    'source_id', 'peak_id', 'target_id',
    'mean_tf_expr', 'mean_peak_expr', 'mean_tg_expr',
    'cicero_score', 'TSS_dist_score', 'correlation',
    'LITE_score', 'NITE_score', 'chromatin_differential',
    'sliding_window_score', 'homer_binding_score', 'support_count',
]
tf_to_tg_df = tf_to_tg_df[ordered_columns]
tf_to_tg_df

In [None]:
# Only NITE_score is passed to the normalization helper here; the other score columns are left as they are.
# NOTE(review): re-running this cell normalizes an already-normalized column - presumably harmless for min-max scaling, confirm.
tf_to_tg_df = minmax_normalize_pandas(tf_to_tg_df, ["NITE_score"])

In [None]:
# Preview the first rows; a bare last expression uses the notebook's rich table
# display instead of the truncated plain-text output of print().
tf_to_tg_df.head()

In [None]:
# Melt the dataframe to get it in the same format as combine_dataframes.py:
# one row per (TF, peak, TG, score type).
# NOTE: this rebinds `score_cols` from the earlier label->column dict to a list.
score_cols = [
    'mean_tf_expr', 'mean_peak_expr', 'mean_tg_expr',
    'cicero_score', 'TSS_dist_score', 'correlation',
    'LITE_score', 'NITE_score', 'chromatin_differential',
    'sliding_window_score', 'homer_binding_score', 'support_count',
]

melted_df = pd.melt(
    tf_to_tg_df,
    id_vars=["source_id", "peak_id", "target_id"],
    value_vars=score_cols,
    var_name="score_type",
    value_name="score_value",
)

In [None]:
# Create the destination folder first: to_parquet does not create missing
# parent directories and would fail on a fresh output tree.
inferred_grn_dir = os.path.join(output_dir, "inferred_grns")
os.makedirs(inferred_grn_dir, exist_ok=True)

melted_df.to_parquet(os.path.join(inferred_grn_dir, "inferred_score_df.parquet"), engine="pyarrow", compression="snappy")

In [None]:
# Distribution of the chromatin differential score across the merged TF-TG table.
tf_to_tg_df["chromatin_differential"].hist(bins=50, grid=False)

In [None]:
# `method_dfs` was rebound to the TF-to-peak tables (Homer / Sliding Window)
# earlier in the notebook, so `method_dfs["MIRA"]` raises KeyError on a clean
# Restart & Run All. Read the unfiltered MIRA table straight from its file instead.
mira_df = pd.read_parquet(
    os.path.join(output_dir, "ds011_peak_to_gene_lite_rp_score_chrom_diff.parquet")
)

# Log-scaled counts make the sparse tails of the distribution visible.
mira_df["chromatin_differential"].hist(bins=50, grid=False, log=True)

In [None]:
from grn_inference.normalization import clip_and_normalize_log1p_pandas, minmax_normalize_pandas

def add_string_db_scores(
    inferred_net_df,
    string_dir="/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/data/string_database/mm10",
):
    """Attach STRING interaction scores to an inferred TF-TG network.

    Parameters
    ----------
    inferred_net_df : pd.DataFrame
        Network with string-valued ``source_id`` and ``target_id`` columns.
        It is not modified; a copy is used internally.
    string_dir : str, optional
        Directory containing ``protein_info.txt`` (tab separated) and
        ``protein_links_detailed.txt`` (space separated). Defaults to the
        project's mm10 STRING directory.

    Returns
    -------
    pd.DataFrame
        A copy of ``inferred_net_df`` with upper-cased, stripped
        ``source_id`` / ``target_id`` values and three added columns:
        ``string_experimental_score``, ``string_textmining_score`` and
        ``string_combined_score`` (passed through ``minmax_normalize_pandas``).
        Edges with no matching STRING link get NaN in those columns.
    """
    # Work on a copy so the caller's DataFrame keeps its original identifiers.
    inferred_net_df = inferred_net_df.copy()

    # Load STRING protein info and links
    print("  - Reading STRING protein info")
    protein_info_df = pd.read_csv(f"{string_dir}/protein_info.txt", sep="\t")

    print("  - Reading STRING protein links detailed")
    protein_links_df = pd.read_csv(f"{string_dir}/protein_links_detailed.txt", sep=" ")

    print("  - Mapping STRING protein IDs to preferred names")
    id_to_name = protein_info_df.set_index("#string_protein_id")["preferred_name"].to_dict()

    # Translate STRING ids to names, then put both sides of the comparison into
    # the same canonical form (stripped, upper case).
    for column in ("protein1", "protein2"):
        protein_links_df[column] = (
            protein_links_df[column].map(id_to_name).str.strip().str.upper()
        )
    for column in ("source_id", "target_id"):
        inferred_net_df[column] = inferred_net_df[column].str.strip().str.upper()

    # Bi-directional match: accept a STRING link in either orientation.
    tf_tg_pairs = set(zip(inferred_net_df["source_id"], inferred_net_df["target_id"]))
    tf_tg_pairs |= {(tg, tf) for tf, tg in tf_tg_pairs}  # include reverse

    print(f"  - Filtering STRING links to match {len(tf_tg_pairs):,} TF–TG pairs")
    # Build the (protein1, protein2) tuples with zip rather than a row-wise
    # DataFrame.apply, which is very slow on the full STRING link table.
    link_pairs = pd.Series(
        list(zip(protein_links_df["protein1"], protein_links_df["protein2"])),
        index=protein_links_df.index,
        dtype=object,
    )
    filtered_links_df = protein_links_df[link_pairs.isin(tf_tg_pairs)]

    # Rename the score columns and keep only what is merged back.
    filtered_links_df = filtered_links_df.rename(columns={
        "experimental": "string_experimental_score",
        "textmining": "string_textmining_score",
        "combined_score": "string_combined_score"
    })[["protein1", "protein2", "string_experimental_score", "string_textmining_score", "string_combined_score"]]

    filtered_links_df = minmax_normalize_pandas(
        df=filtered_links_df,
        score_cols=["string_experimental_score", "string_textmining_score", "string_combined_score"],
    )

    print("  - Merging normalized STRING scores into inferred network")
    # NOTE(review): the join is on (source_id, target_id) == (protein1, protein2)
    # only, so a link present in STRING solely as (target, source) is kept by the
    # filter above but not attached here - confirm the link file lists both
    # orientations.
    merged_dd = inferred_net_df.merge(
        filtered_links_df,
        left_on=["source_id", "target_id"],
        right_on=["protein1", "protein2"],
        how="left"
    ).drop(columns=["protein1", "protein2"])

    print("  Done!")
    return merged_dd

In [None]:
def read_inferred_network(inferred_network_file: str) -> pd.DataFrame:
    """Load a melted inferred network and pivot it to wide format.

    Parameters
    ----------
    inferred_network_file : str
        Path to a parquet file containing a "melted" network with columns
        ``source_id``, ``peak_id``, ``target_id``, ``score_type`` and
        ``score_value``.

    Returns
    -------
    pandas.DataFrame
        Network in wide format: ``source_id``, ``peak_id`` and ``target_id``
        are ordinary columns, followed by one column per score type.
        ``source_id`` and ``target_id`` are upper-cased.

    Notes
    -----
    Everything is done in memory with pandas. Repeated
    ``(source_id, peak_id, target_id, score_type)`` rows are averaged before
    pivoting.
    """
    print(f"Loading melted sparse network from: {inferred_network_file}")
    melted_df = pd.read_parquet(inferred_network_file, engine="pyarrow")

    # Standardize IDs
    melted_df["source_id"] = melted_df["source_id"].str.upper()
    melted_df["target_id"] = melted_df["target_id"].str.upper()

    edge_keys = ["source_id", "peak_id", "target_id"]

    # Upper-casing can make previously distinct ids collide, so duplicates are
    # collapsed to their mean score first.
    averaged_df = (
        melted_df
        .groupby(edge_keys + ["score_type"])["score_value"]
        .mean()
        .reset_index()
    )

    # Each (edge, score_type) is now unique, so "first" simply picks that value.
    wide_df = averaged_df.pivot_table(
        index=edge_keys,
        columns="score_type",
        values="score_value",
        aggfunc="first"
    ).reset_index()
    return wide_df

In [None]:
# Reload the melted score file written earlier and pivot it back to one row per (source_id, peak_id, target_id).
inferred_df = read_inferred_network(os.path.join(output_dir, "inferred_grns", "inferred_score_df.parquet"))

In [None]:
# Attach the STRING experimental, text-mining and combined scores to each TF-TG edge.
inferred_df_string = add_string_db_scores(inferred_df)

In [None]:
# Inspect the wide network with the STRING score columns added.
inferred_df_string

In [None]:
# Melt the dataframe to get it in the same format as combine_dataframes.py:
# one row per (TF, peak, TG, score type), now including the STRING scores.
# NOTE(review): unlike the earlier melt, `support_count` is not in this list - confirm that is intended.
score_cols = [
    'mean_tf_expr', 'mean_peak_expr', 'mean_tg_expr',
    'cicero_score', 'TSS_dist_score', 'correlation',
    'LITE_score', 'NITE_score', 'chromatin_differential',
    'sliding_window_score', 'homer_binding_score', 'string_experimental_score',
    'string_textmining_score', 'string_combined_score',
]

melted_df = pd.melt(
    inferred_df_string,
    id_vars=["source_id", "peak_id", "target_id"],
    value_vars=score_cols,
    var_name="score_type",
    value_name="score_value",
)

In [None]:
# NOTE(review): this saves the wide `inferred_df_string` table; the `melted_df` built in the
# previous cell is not written anywhere. Confirm which format downstream code expects.
inferred_df_string.to_parquet(os.path.join(output_dir, "inferred_grns", "inferred_score_df_full.parquet"), engine="pyarrow", compression="snappy")