# Exploring Score DataFrame Sizes

As we combine the output DataFrames from multiple scoring methods, it is useful to know the sizes of each dataframe and how much overlap they have with one another.

In [None]:
import os
import pandas as pd
import numpy as np
from grn_inference.normalization import minmax_normalize_pandas

# Absolute path to the project root; the input and output folders below are built from it.
project_dir = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER"
# Per-sample folders for DS011_mESC_sample1: raw/processed inputs and pipeline outputs.
input_dir = os.path.join(project_dir, "input/DS011_mESC/DS011_mESC_sample1")
output_dir = os.path.join(project_dir, "output/DS011_mESC/DS011_mESC_sample1")

## Size of the Input Dataset

In [None]:
# Load the processed scATAC-seq and scRNA-seq matrices for this sample.
atac_processed_path = os.path.join(input_dir, "DS011_mESC_ATAC_processed.parquet")
atac_processed_df = pd.read_parquet(atac_processed_path, engine="pyarrow")

rna_processed_path = os.path.join(input_dir, "DS011_mESC_RNA_processed.parquet")
rna_processed_df = pd.read_parquet(rna_processed_path, engine="pyarrow")

# Rows are peaks / genes. One column is subtracted from the column count when
# reporting cells (later cells call set_index("peak_id") / set_index("gene_id"),
# so one column holds the feature identifier).
n_peaks, n_atac_columns = atac_processed_df.shape
n_genes, n_rna_columns = rna_processed_df.shape

print("scATAC-seq Dataset:")
print(f"  - Cells: {n_atac_columns - 1}")
print(f"  - Peaks: {n_peaks}")

print("\nscRNA-seq Dataset:")
print(f"  - Cells: {n_rna_columns - 1}")
print(f"  - Genes: {n_genes}")

---

## Peak to TG

The peak to TG regulatory potential methods we use are:

1) Cicero
2) TSS Distance
3) MIRA
4) TG Expression / Peak Accessibility correlation

First, let's look at the size of each method's output.

In [None]:
def peak_to_tg_score_summary(score_file_name, method_name) -> pd.DataFrame:
    """Load one peak-to-TG score file and print a short size summary.

    Parameters
    ----------
    score_file_name : str
        Name of a parquet file located inside the notebook-level ``output_dir``.
    method_name : str
        Label printed as the heading of the summary.

    Returns
    -------
    pd.DataFrame
        The score table exactly as read from disk. It must contain
        ``peak_id`` and ``target_id`` columns.
    """
    score_path = os.path.join(output_dir, score_file_name)
    score_df = pd.read_parquet(score_path)

    # Heading first, then one line per statistic; the final line is followed
    # by a blank line so consecutive summaries are visually separated.
    print(f"{method_name}:")
    print(f"  - Edges: {score_df.shape[0]:,}")
    print(f"  - Unique Peaks: {score_df['peak_id'].nunique():,}")
    print(f"  - Unique TGs: {score_df['target_id'].nunique():,}", end="\n\n")

    return score_df
    

# Display name of each peak-to-TG scoring method -> parquet file name inside output_dir.
method_files = {
    "Cicero": "cicero_peak_to_tg_scores.parquet",
    "TSS Distance Score": "tss_distance_score.parquet",
    "MIRA": "ds011_peak_to_gene_lite_rp_score_chrom_diff.parquet",
    "Correlation": "peak_to_gene_correlation.parquet"
}

# Load every score file (printing its size summary as a side effect), keyed by method name.
method_dfs = {
    name: peak_to_tg_score_summary(path, name)
    for name, path in method_files.items()
}

# Per-method sets of distinct peak IDs and target gene IDs, used for the overlap analysis.
method_peaks = {name: set(df["peak_id"]) for name, df in method_dfs.items()}
method_targets = {name: set(df["target_id"]) for name, df in method_dfs.items()}

The TSS distance score DataFrame is much larger than the others. Let's look at how many of the peak to TG edges are shared between scoring methods.

In [None]:
def get_shared_elements(method_dict, label):
    """Report pairwise overlaps between methods and collect the shared items.

    Parameters
    ----------
    method_dict : dict
        Maps a method name to the set of elements (e.g. peak IDs) it contains.
    label : str
        Singular name of the element type; used in the printed heading as
        ``"Shared {label}s:"``.

    Returns
    -------
    set
        Every element that occurs in at least two of the methods, i.e. the
        union of all pairwise intersections.
    """
    names = list(method_dict.keys())
    in_multiple_methods = set()

    print(f"\nShared {label}s:")
    # Visit each unordered pair exactly once, in insertion order of the dict.
    for first_idx in range(len(names)):
        first = names[first_idx]
        for second_idx in range(first_idx + 1, len(names)):
            second = names[second_idx]
            overlap = method_dict[first] & method_dict[second]
            print(f"  - {first} vs {second}: {len(overlap):,}")
            in_multiple_methods |= overlap

    return in_multiple_methods

# Compute the peaks and target genes that occur in at least two scoring methods
# (each call also prints the pairwise overlap sizes).
shared_peaks = get_shared_elements(method_peaks, "Peak")
shared_targets = get_shared_elements(method_targets, "Target Gene")

print(f"\nPeaks in more than one method: {len(shared_peaks):,}")
print(f"Target genes in more than one method: {len(shared_targets):,}\n")


There is not a ton of overlap, but this reduces the size of the DataFrame. Now, let's subset each score DataFrame to only keep rows whose peak and target gene each appear in more than one scoring method.

In [None]:
def subset_df_to_peaks_and_targets_in_more_than_one_df(df, method_name, shared_peaks, shared_targets) -> pd.DataFrame:
    """Keep rows whose peak and target gene are both in the shared sets.

    Parameters
    ----------
    df : pd.DataFrame
        Score table with ``peak_id`` and ``target_id`` columns.
    method_name : str
        Label printed as the heading of the size summary.
    shared_peaks, shared_targets : set
        Peak IDs / target gene IDs to retain.

    Returns
    -------
    pd.DataFrame
        The filtered rows, with any row containing a missing value in any
        column removed as well.
    """
    peak_is_shared = df["peak_id"].isin(shared_peaks)
    target_is_shared = df["target_id"].isin(shared_targets)

    # The two membership tests are independent: a row is kept when its peak is
    # shared AND its target is shared, then rows with any NaN are dropped.
    kept = df[peak_is_shared & target_is_shared].dropna()

    print(f"{method_name}:")
    print(f"  - Edges: {kept.shape[0]:,}")
    print(f"  - Unique Peaks: {kept['peak_id'].nunique():,}")
    print(f"  - Unique TGs: {kept['target_id'].nunique():,}", end="\n\n")

    return kept

# Filter each method's DataFrame to rows whose peak is in shared_peaks AND whose
# target gene is in shared_targets. The two checks are independent, so a kept
# (peak, target) pair is not guaranteed to occur as a pair in another method.
method_dfs_subset = {
    name: subset_df_to_peaks_and_targets_in_more_than_one_df(df, name, shared_peaks, shared_targets)
    for name, df in method_dfs.items()
}

Once we have only edges found in more than one score DataFrame, we can merge the scoring methods together into a single peak to TG DataFrame.

In [None]:
# Cicero and MIRA are outer-merged (an edge is kept if either method scored it),
# then inner-merged with the correlation scores (edge must have a correlation row).
merge_scores_no_tss = pd.merge(
    pd.merge(method_dfs_subset["Cicero"], method_dfs_subset["MIRA"], on=["peak_id", "target_id"], how="outer"),
    method_dfs_subset["Correlation"], on=["peak_id", "target_id"], how="inner"
)

# Inner merge with the TSS distance scores: only edges that also have a TSS
# distance row survive.
peak_to_tg = pd.merge(
    merge_scores_no_tss, 
    method_dfs_subset["TSS Distance Score"], 
    on=["peak_id", "target_id"], 
    how="inner"
    )
peak_to_tg

Let's look at how sparse the dataset is

In [None]:
# Number of rows (peak-TG edges) in the merged table.
total_edges = len(peak_to_tg[["peak_id", "target_id"]])
# Human-readable label -> column name in peak_to_tg for every score column.
score_cols = {
    "Cicero Score": "cicero_score",
    "TSS Distance Score": "TSS_dist_score",
    "LITE Score": "LITE_score",
    "NITE Score": "NITE_score",
    "MIRA Chromatin Differential Score": "chromatin_differential",
    "Correlation Score": "correlation"
}

# For each score column, report how many edges have a non-missing value and
# what fraction of all edges that represents.
print(f"Total Edges: {total_edges:,}")
for label, col in score_cols.items():
    score_count = peak_to_tg[col].notna().sum()
    print(f"  - {label}: {score_count:,} ({score_count / total_edges:.2%} of total scores)")


The merged dataset isn't too sparse if we outer merge the method score DataFrames, but use an inner merge for the correlation and TSS distance scores. The number of peak to TG edges has dropped dramatically, which may or may not be a good thing. We can add a column called `"support_count"` which counts the number of methods with non-NaN values in each row.

In [None]:
# Count how many score columns are non-null for each edge. The dict view is
# converted to a list so pandas receives an unambiguous list of column labels.
score_support_counts = peak_to_tg[list(score_cols.values())].notna().sum(axis=1)

# Report how many edges have at least i non-null scores, for every threshold
# from 1 up to and including the total number of score columns. Note that the
# count is over score columns (MIRA contributes several), although the printed
# text says "methods".
for i in range(1, len(score_cols) + 1):
    multi_supported_edges = peak_to_tg[score_support_counts >= i]

    print(f"Edges supported by ≥{i} methods: {len(multi_supported_edges):,} out of {len(peak_to_tg):,}")

# Store the per-edge count alongside the scores.
peak_to_tg["support_count"] = score_support_counts

Now that we have a DataFrame containing all of the peak to TG scores for at least 2 methods, we can add in the **minmax normalized mean peak accessibility and mean TG expression** for each edge. We use an inner join here to only keep rows that have genes and peaks in the expression and accessibility data.

In [None]:
# Row-wise mean of the ATAC matrix (one value per peak_id), passed through the
# project's minmax_normalize_pandas helper for the "mean_peak_expr" column.
mean_peak_expr_norm = minmax_normalize_pandas(
    atac_processed_df.set_index("peak_id").mean(axis=1).rename("mean_peak_expr").to_frame(),
    ["mean_peak_expr"]
).reset_index()

# Same for the RNA matrix: one mean value per gene_id, then normalized.
mean_gene_expr_norm = minmax_normalize_pandas(
    rna_processed_df.set_index("gene_id").mean(axis=1).rename("mean_gene_expr").to_frame(),
    ["mean_gene_expr"]
).reset_index()

# Inner joins: edges whose peak is missing from the ATAC data or whose target
# gene is missing from the RNA data are dropped.
peak_to_tg_mean_acc = pd.merge(peak_to_tg, mean_peak_expr_norm, on="peak_id", how="inner")
full_peak_to_tg = pd.merge(
    peak_to_tg_mean_acc, mean_gene_expr_norm, left_on="target_id", 
    right_on="gene_id", how="inner"
    ).drop(columns="gene_id").rename(columns={"mean_gene_expr":"mean_tg_expr"})
full_peak_to_tg

---

## TF to Peak

Now that we have the peak to TG scores together, we need to combine the TF to peak binding potential scores. We have two methods that calculate TF to peak scores:

1) Homer
2) Sliding Window

Let's start by loading these score DataFrames in and take a look at their sizes.

In [None]:
def tf_to_peak_score_summary(score_file_name, method_name) -> pd.DataFrame:
    """Load one TF-to-peak score file and print a short size summary.

    Parameters
    ----------
    score_file_name : str
        Name of a parquet file located inside the notebook-level ``output_dir``.
    method_name : str
        Label printed as the heading of the summary.

    Returns
    -------
    pd.DataFrame
        The score table exactly as read from disk. It must contain
        ``source_id`` (TF) and ``peak_id`` columns.
    """
    score_path = os.path.join(output_dir, score_file_name)
    score_df = pd.read_parquet(score_path)

    # Heading first, then one line per statistic; the final line is followed
    # by a blank line so consecutive summaries are visually separated.
    print(f"{method_name}:")
    print(f"  - Edges: {score_df.shape[0]:,}")
    print(f"  - Unique TFs: {score_df['source_id'].nunique():,}")
    print(f"  - Unique Peaks: {score_df['peak_id'].nunique():,}", end="\n\n")

    return score_df

# NOTE: this cell rebinds method_files, method_dfs and method_peaks, replacing
# the peak-to-TG versions defined earlier in the notebook. Re-running earlier
# cells after this one will see the TF-to-peak values unless they are re-created.

# Display name of each TF-to-peak scoring method -> parquet file name inside output_dir.
method_files = {
    "Homer": "homer_tf_to_peak.parquet",
    "Sliding Window": "sliding_window_tf_to_peak_score.parquet"
}

# Load every score file (printing its size summary as a side effect), keyed by method name.
method_dfs = {
    name: tf_to_peak_score_summary(path, name)
    for name, path in method_files.items()
}

# Per-method sets of distinct TF IDs and peak IDs.
method_TFs = {name: set(df["source_id"]) for name, df in method_dfs.items()}
method_peaks = {name: set(df["peak_id"]) for name, df in method_dfs.items()}

We will merge only on shared peaks and TFs between the two due to the large number of edges. We will also merge in the mean gene expression for the TFs.

In [None]:
sliding_window_df = method_dfs["Sliding Window"]
homer_df = method_dfs["Homer"]

# Inner merge keeps only (peak, TF) pairs scored by BOTH methods; the second
# inner merge attaches the normalized mean expression of the TF and drops TFs
# that are absent from the RNA data.
tf_to_peak_df = pd.merge(
    pd.merge(sliding_window_df, homer_df, on=["peak_id", "source_id"], how="inner"),
    mean_gene_expr_norm, left_on="source_id", right_on="gene_id", how="inner"
    ).drop(columns="gene_id").rename(columns={"mean_gene_expr":"mean_tf_expr"})

In [None]:
tf_to_peak_df

In [None]:
# Join TF-to-peak edges with peak-to-TG edges through the shared peak_id.
# The right join keeps every TF-peak row; rows whose peak has no peak-to-TG
# edge get a missing target_id and are removed by the dropna that follows.
tf_to_tg_df = pd.merge(full_peak_to_tg, tf_to_peak_df, on="peak_id", how="right").dropna(subset=["source_id", "peak_id", "target_id"])
tf_to_tg_df

In [None]:
tf_to_tg_df.columns

In [None]:
# Reorder and restrict the columns: edge identifiers first, then mean
# expression/accessibility features, then the individual method scores.
tf_to_tg_df = tf_to_tg_df[[
    'source_id', 'peak_id', 'target_id', 
    'mean_tf_expr', 'mean_peak_expr', 'mean_tg_expr',
    'cicero_score', 'TSS_dist_score', 'correlation',
    'LITE_score', 'NITE_score', 'chromatin_differential', 
    'sliding_window_score', 'homer_binding_score', 'support_count'
    ]]
tf_to_tg_df

In [None]:
tf_to_tg_df = minmax_normalize_pandas(tf_to_tg_df, ["NITE_score"])

In [None]:
# Leave the frame as the cell's last expression so Jupyter renders it as a rich
# table instead of the truncated plain-text form produced by print().
tf_to_tg_df.head()

In [None]:
# Melt the dataframe to get it in the same format as combine_dataframes.py
# NOTE: score_cols is rebound here from the earlier label->column dict to a
# plain list of column names.
score_cols = [
    'mean_tf_expr', 'mean_peak_expr', 'mean_tg_expr',
    'cicero_score', 'TSS_dist_score', 'correlation',
    'LITE_score', 'NITE_score', 'chromatin_differential', 
    'sliding_window_score', 'homer_binding_score', 'support_count'
    ]

# Wide -> long: one row per (source_id, peak_id, target_id, score_type) with the
# value in "score_value".
melted_df = tf_to_tg_df.melt(
    id_vars=["source_id", "peak_id", "target_id"],
    value_vars=score_cols,
    var_name="score_type",
    value_name="score_value"
)

In [None]:
# Create the destination folder if it does not exist yet; to_parquet does not
# create missing parent directories.
inferred_grn_dir = os.path.join(output_dir, "inferred_grns")
os.makedirs(inferred_grn_dir, exist_ok=True)
melted_df.to_parquet(os.path.join(inferred_grn_dir, "inferred_score_df.parquet"), engine="pyarrow", compression="snappy")

## Adding STRING Scores to the Combined Score DataFrame

In [None]:
from grn_inference.normalization import clip_and_normalize_log1p_pandas, minmax_normalize_pandas

def add_string_db_scores(
    inferred_net_df,
    string_dir="/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/data/string_database/mm10",
):
    """Attach STRING interaction scores to an inferred TF-TG network.

    Parameters
    ----------
    inferred_net_df : pd.DataFrame
        Network with string-valued ``source_id`` and ``target_id`` columns.
        The caller's DataFrame is left unmodified.
    string_dir : str, optional
        Directory containing ``protein_info.txt`` (tab separated) and
        ``protein_links_detailed.txt`` (space separated). Defaults to the
        project's mm10 STRING folder.

    Returns
    -------
    pd.DataFrame
        A copy of ``inferred_net_df`` with ``source_id`` / ``target_id``
        stripped and upper-cased, plus the columns
        ``string_experimental_score``, ``string_textmining_score`` and
        ``string_combined_score``. Edges without a STRING link get NaN in
        those columns (left merge).
    """
    # Work on a copy so normalising the IDs does not alter the caller's frame.
    inferred_net_df = inferred_net_df.copy()

    # Load STRING protein info and links
    print("  - Reading STRING protein info")
    protein_info_df = pd.read_csv(f"{string_dir}/protein_info.txt", sep="\t")

    print("  - Reading STRING protein links detailed")
    protein_links_df = pd.read_csv(f"{string_dir}/protein_links_detailed.txt", sep=" ")

    print("  - Mapping STRING protein IDs to preferred names")
    id_to_name = protein_info_df.set_index("#string_protein_id")["preferred_name"].to_dict()

    # Translate STRING IDs to gene names, then bring both tables to the same
    # whitespace-stripped, upper-case form so names can be compared directly.
    for protein_col in ("protein1", "protein2"):
        protein_links_df[protein_col] = (
            protein_links_df[protein_col].map(id_to_name).str.strip().str.upper()
        )
    for id_col in ("source_id", "target_id"):
        inferred_net_df[id_col] = inferred_net_df[id_col].str.strip().str.upper()

    # Bi-directional match: a link is retained if it appears in either
    # orientation in the network.
    tf_tg_pairs = set(zip(inferred_net_df["source_id"], inferred_net_df["target_id"]))
    tf_tg_pairs |= set((tg, tf) for tf, tg in tf_tg_pairs)  # include reverse

    print(f"  - Filtering STRING links to match {len(tf_tg_pairs):,} TF–TG pairs")
    # Build the (protein1, protein2) tuples with zip rather than a row-wise
    # DataFrame.apply, which is far slower on a large links table.
    link_pairs = pd.Series(
        list(zip(protein_links_df["protein1"], protein_links_df["protein2"])),
        index=protein_links_df.index,
        dtype=object,
    )
    filtered_links_df = protein_links_df[link_pairs.isin(tf_tg_pairs)]

    # Rename columns and normalize
    string_score_cols = [
        "string_experimental_score",
        "string_textmining_score",
        "string_combined_score",
    ]
    filtered_links_df = filtered_links_df.rename(columns={
        "experimental": "string_experimental_score",
        "textmining": "string_textmining_score",
        "combined_score": "string_combined_score"
    })[["protein1", "protein2"] + string_score_cols]

    filtered_links_df = minmax_normalize_pandas(
        df=filtered_links_df,
        score_cols=string_score_cols,
    )

    print("  - Merging normalized STRING scores into inferred network")
    # The merge matches (source_id, target_id) against (protein1, protein2) in
    # that orientation only; reversed links kept by the filter above take part
    # in the normalization but only join where STRING lists that orientation.
    merged_dd = inferred_net_df.merge(
        filtered_links_df,
        left_on=["source_id", "target_id"],
        right_on=["protein1", "protein2"],
        how="left"
    ).drop(columns=["protein1", "protein2"])

    print("  Done!")
    return merged_dd

In [None]:
def read_inferred_network(inferred_network_file: str) -> pd.DataFrame:
    """Load a melted inferred network and pivot it to wide format.

    Parameters
    ----------
    inferred_network_file : str
        Path to a parquet file containing a "melted" network with columns
        ``source_id``, ``peak_id``, ``target_id``, ``score_type`` and
        ``score_value``.

    Returns
    -------
    pd.DataFrame
        Network in wide format: ``source_id``, ``peak_id`` and ``target_id``
        are ordinary columns (the index is reset) and every distinct
        ``score_type`` becomes its own column. ``source_id`` and ``target_id``
        are upper-cased.

    Notes
    -----
    Duplicate ``(source_id, peak_id, target_id, score_type)`` rows are
    collapsed to the mean of their ``score_value`` before pivoting.
    """
    print(f"Loading melted sparse network from: {inferred_network_file}")
    melted_df = pd.read_parquet(inferred_network_file, engine="pyarrow")

    # Standardize IDs
    melted_df["source_id"] = melted_df["source_id"].str.upper()
    melted_df["target_id"] = melted_df["target_id"].str.upper()

    edge_cols = ["source_id", "peak_id", "target_id"]

    # Aggregate scores: one mean value per edge and score type.
    mean_scores_df = (
        melted_df
        .groupby(edge_cols + ["score_type"])["score_value"]
        .mean()
        .reset_index()
    )

    # After the aggregation each (edge, score_type) occurs once, so "first"
    # simply picks that single value while reshaping long -> wide.
    wide_df = mean_scores_df.pivot_table(
        index=edge_cols,
        columns="score_type",
        values="score_value",
        aggfunc="first"
    ).reset_index()
    return wide_df

In [None]:
inferred_df = read_inferred_network(os.path.join(output_dir, "inferred_grns", "inferred_score_df.parquet"))

In [None]:
inferred_df_string = add_string_db_scores(inferred_df)

In [None]:
inferred_df_string

In [None]:
# Melt the dataframe to get it in the same format as combine_dataframes.py
# Columns to carry into the long-format output, now including the STRING scores.
# NOTE(review): 'support_count' was part of the earlier melt but is not listed
# here, so melt() drops it from this output — confirm that this is intentional.
score_cols = [
    'mean_tf_expr', 'mean_peak_expr', 'mean_tg_expr',
    'cicero_score', 'TSS_dist_score', 'correlation',
    'LITE_score', 'NITE_score', 'chromatin_differential', 
    'sliding_window_score', 'homer_binding_score', 'string_experimental_score',
    'string_textmining_score', 'string_combined_score'
    ]

# Wide -> long: one row per (source_id, peak_id, target_id, score_type).
# This rebinds melted_df, replacing the earlier melted frame.
melted_df = inferred_df_string.melt(
    id_vars=["source_id", "peak_id", "target_id"],
    value_vars=score_cols,
    var_name="score_type",
    value_name="score_value"
)

In [None]:
melted_df.to_parquet(os.path.join(output_dir, "inferred_grns", "inferred_score_df_full.parquet"), engine="pyarrow", compression="snappy")

### Investigating if the sliding window scores here are different than the score file

**Combined sliding window scores:**

In [None]:
combined_sliding_window_scores = inferred_df_string[["source_id", "peak_id", "sliding_window_score"]]
combined_sliding_window_scores

**Scores from the sliding window score file:**

In [None]:
# Read the score file through output_dir (as the rest of the notebook does)
# instead of a path relative to the current working directory, so the cell
# works regardless of where the kernel was started.
norm_sliding_window_scores_full = pd.read_parquet(
    os.path.join(output_dir, "sliding_window_tf_to_peak_score.parquet"),
    engine="pyarrow"
)
# The combined frame has upper-cased TF names, so match that before merging.
norm_sliding_window_scores_full["source_id"] = norm_sliding_window_scores_full["source_id"].str.upper()

# Both frames carry a "sliding_window_score" column, so the merge suffixes them
# with _x (combined frame) and _y (score file); rename them to explicit labels.
labeled_norm_sliding_window_scores_df = pd.merge(
    combined_sliding_window_scores,
    norm_sliding_window_scores_full,
    on=["source_id", "peak_id"],
    how="inner"
    )
labeled_norm_sliding_window_scores_df = labeled_norm_sliding_window_scores_df.rename(columns={
    "sliding_window_score_x":"combined_sliding_window_score",
    "sliding_window_score_y":"original_sliding_window_score"
})

In [None]:
labeled_norm_sliding_window_scores_df

In [None]:
# Vectorized element-wise comparison of the two score columns (exact float
# inequality; a missing value on either side counts as "different").
rows_diff_scores = (
    labeled_norm_sliding_window_scores_df["combined_sliding_window_score"]
    != labeled_norm_sliding_window_scores_df["original_sliding_window_score"]
)
diff_scores = int(rows_diff_scores.sum())
matching_scores = len(rows_diff_scores) - diff_scores
print(f"{matching_scores} / {matching_scores + diff_scores} scores are the same between the combined df and the sliding window score df")

**Labeled score file from the XGBoost code:**

In [None]:
# Read through output_dir (as the rest of the notebook does) rather than a path
# relative to the current working directory.
combined_score_df = pd.read_parquet(os.path.join(output_dir, "labeled_inferred_grn.parquet"), engine="pyarrow")
# Keep only the TF-peak identifiers, the sliding window score and the True/False label.
inferred_edges = combined_score_df[["source_id", "peak_id", "sliding_window_score", "label"]]
inferred_edges

In [None]:
# Attach the combined/original sliding window scores to the labeled edges.
# Only inferred_edges has a column literally named "sliding_window_score" here,
# so no merge suffix is added and it can be renamed directly.
xgboost_model_scores = pd.merge(
    inferred_edges, 
    labeled_norm_sliding_window_scores_df,
    on=["source_id", "peak_id"],
    how="inner"
    ).rename(columns={"sliding_window_score":"xgboost_sliding_window_score"})
xgboost_model_scores 

In [None]:
# Vectorized element-wise comparison of the score-file values against the
# values in the XGBoost labeled dataset (exact float inequality; a missing
# value on either side counts as "different").
rows_diff_scores = (
    xgboost_model_scores["original_sliding_window_score"]
    != xgboost_model_scores["xgboost_sliding_window_score"]
)
diff_scores = int(rows_diff_scores.sum())
matching_scores = len(rows_diff_scores) - diff_scores
# The message names the two frames actually compared in this cell.
print(f"{matching_scores} / {matching_scores + diff_scores} scores are the same between the XGBoost labeled df and the sliding window score df")

In [None]:
import matplotlib.pyplot as plt
def plot_true_false_feature_histogram(
    df,
    feature_col,
    limit_x = True
):
    """Overlay histograms of one feature for label == 0 and label == 1 rows.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``label`` column (0 / 1) and ``feature_col``.
    feature_col : str
        Column to plot; also used as the title and x-axis label.
    limit_x : bool, optional
        When True, the x-axis is fixed to the range [0, 1].

    Notes
    -----
    Each group is binned independently into 50 bins over its own value range,
    and missing values are dropped before plotting.
    """
    fig = plt.figure(figsize=(5, 4))
    ax = fig.gca()

    # (label value, legend text, fill colour, edge colour); the False group is
    # drawn first so the True group is layered on top of it.
    groups = (
        (0, "False", '#1682b1', "#032b5f"),
        (1, "True", "#cb5f17", "#b13301"),
    )
    for label_value, legend_text, face_colour, edge_colour in groups:
        group_values = df[df["label"] == label_value][feature_col].dropna()
        ax.hist(
            group_values,
            bins=50, alpha=0.7,
            color=face_colour, edgecolor=edge_colour,
            label=legend_text,
        )

    ax.set_title(feature_col, fontsize=14)
    ax.set_xlabel(feature_col, fontsize=14)
    ax.set_ylabel("Frequency", fontsize=14)
    if limit_x:
        ax.set_xlim(0, 1)

    # Figure-level legend centred below the axes; tight_layout reserves the
    # bottom 5% of the figure for it.
    fig.legend(
        loc="lower center",
        ncol=2,
        fontsize=14,
        bbox_to_anchor=(0.5, -0.02)
    )
    fig.tight_layout(rect=[0, 0.05, 1, 1])
    plt.show()

Okay, so all of the scores are the same between the XGBoost labeled dataset, the combined score dataset, and the original score file dataset. Perhaps the issue is due to balancing?

### Testing if balancing the True / False scores is causing the difference somehow

In [None]:
def balance_dataset(df, random_state=42):
    """Down-sample the larger class so True and False rows are equally frequent.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``label`` column with 1 for True and 0 for False edges.
    random_state : int or None, optional
        Seed passed to ``DataFrame.sample`` for both classes so the balanced
        subset is reproducible across runs. Pass ``None`` for a different
        random draw on every call.

    Returns
    -------
    pd.DataFrame
        The sampled True rows followed by the sampled False rows; each class
        has ``min(n_true, n_false)`` rows. Rows with any other label value
        are not included.
    """
    true_rows = df[df["label"] == 1]
    false_rows = df[df["label"] == 0]

    print("Before Balancing:")
    print(f"  - Number of True values: {len(true_rows)}")
    print(f"  - Number of False values: {len(false_rows)}")

    min_rows = min(len(true_rows), len(false_rows))
    print(f"\nSubsampling down to {min_rows} rows")

    # Seeded sampling: without a fixed seed every run of the notebook would
    # plot a different subset, making the distributions hard to compare.
    true_rows_sampled = true_rows.sample(min_rows, random_state=random_state)
    false_rows_sampled = false_rows.sample(min_rows, random_state=random_state)

    balanced_df = pd.concat([true_rows_sampled, false_rows_sampled])

    balanced_true_rows = balanced_df[balanced_df["label"] == 1]
    balanced_false_rows = balanced_df[balanced_df["label"] == 0]

    print("\nAfter Balancing:")
    print(f"  - Number of True values: {len(balanced_true_rows)}")
    print(f"  - Number of False values: {len(balanced_false_rows)}")

    return balanced_df

In [None]:
xgboost_model_scores_balanced = balance_dataset(xgboost_model_scores)

In [None]:
plot_true_false_feature_histogram(xgboost_model_scores_balanced, "original_sliding_window_score", limit_x=True)

In [None]:
plot_true_false_feature_histogram(xgboost_model_scores_balanced, "xgboost_sliding_window_score", limit_x=True)

So, now the XGBoost scores are the same as the other ones. I think that it might be due to changing the sliding window score method to only use peak-gene pairs from `peaks_near_genes.parquet`. Let's look into this.

In [None]:
# Read through output_dir (as the rest of the notebook does) rather than a path
# relative to the current working directory.
old_xgboost_scores_labeled = pd.read_parquet(
    os.path.join(output_dir, "previous_labeled_sliding_window_score_df.parquet"),
    engine="pyarrow"
)
# Only the TF-peak identifiers and the previous sliding window score are needed.
old_xgboost_scores_labeled = old_xgboost_scores_labeled[["source_id", "peak_id", "sliding_window_score"]]

In [None]:
# Pair each balanced, labeled edge with its score from the previous labeled
# dataset. drop_duplicates keeps the first row for every (TF, peak) pair.
new_xgboost_scores = xgboost_model_scores_balanced[["source_id", "peak_id", "xgboost_sliding_window_score", "label"]]
old_xgboost_model_merged = pd.merge(
    new_xgboost_scores, 
    old_xgboost_scores_labeled,
    on=["source_id", "peak_id"],
    how="inner"
    ).rename(columns={"sliding_window_score":"old_xgboost_sliding_window_score"}).drop_duplicates(subset=["source_id", "peak_id"])
# Fix the column order: identifiers, new score, old score, label.
old_xgboost_model_merged = old_xgboost_model_merged[["source_id", "peak_id", "xgboost_sliding_window_score", "old_xgboost_sliding_window_score", "label"]]
old_xgboost_model_merged

Some of the old scores have a sliding window score of 0, which means that there were no motifs found. I'm interested in score differences for edges that had scores in both datasets.

In [None]:
# Keep only edges with a strictly positive score in BOTH the old and the new
# dataset. The explicit copy makes the result an independent frame, so the
# column assignment in a later cell does not trigger SettingWithCopyWarning.
has_both_scores = (
    (old_xgboost_model_merged["old_xgboost_sliding_window_score"] > 0)
    & (old_xgboost_model_merged["xgboost_sliding_window_score"] > 0)
)
old_xgboost_model_merged = old_xgboost_model_merged[has_both_scores].copy()

Let's look at the differences in sliding window scores between the old sliding window scores and the new sliding window scores. I didn't change anything dramatically in the sliding window scoring method, so it should have the same scores.

In [None]:
old_xgboost_model_merged["score_difference"] = old_xgboost_model_merged["xgboost_sliding_window_score"] - old_xgboost_model_merged["old_xgboost_sliding_window_score"]
old_xgboost_model_merged["score_difference"]

Negative values mean that the old XGBoost scores were greater, positive values mean that the new XGBoost scores are greater

In [None]:
old_xgboost_model_merged["score_difference"].hist(bins=150, grid=False)

I saved a previous run of the DS011 dataset, so let's compare our scores to those scores.

In [None]:
# Build the path from project_dir instead of repeating the absolute project
# root, so the notebook has a single place to change the project location.
old_run_scores_df = pd.read_parquet(
    os.path.join(
        project_dir,
        "output/DS011_mESC/DS011_mESC_sample1_old",
        "sliding_window_tf_to_peak_score.parquet",
    ),
    engine="pyarrow"
)
old_run_scores_df = old_run_scores_df.reset_index(drop=True)
# Upper-case TF names to match the identifiers used in the merged frames.
old_run_scores_df["source_id"] = old_run_scores_df["source_id"].str.upper()

In [None]:
old_run_scores_df

We need to merge the edges from the old run to the edges in the new run

In [None]:
# Attach the sliding window score from the saved old run to every edge that
# survived the previous filtering. drop_duplicates keeps the first row for
# every (TF, peak) pair.
old_run_merged = pd.merge(
    old_xgboost_model_merged,
    old_run_scores_df,
    on=["source_id", "peak_id"],
    how="inner"
).rename(columns={"sliding_window_score":"old_run_sliding_window_score"}).drop_duplicates(subset=["source_id", "peak_id"])
# Keep identifiers, the three score variants and the label (drops score_difference).
old_run_merged = old_run_merged[["source_id", "peak_id", "xgboost_sliding_window_score", "old_xgboost_sliding_window_score", "old_run_sliding_window_score", "label"]]
old_run_merged


Now let's see what the differences are between the old run and our most recent scores.

In [None]:
old_run_merged["new_score_vs_old_run_score_difference"] = old_run_merged["xgboost_sliding_window_score"] - old_run_merged["old_run_sliding_window_score"]
old_run_merged["new_score_vs_old_run_score_difference"]

In [None]:
fig = plt.figure(figsize=(5,4))
plt.hist(x=old_run_merged["new_score_vs_old_run_score_difference"], bins=25)
plt.xlim((-1,1))

In [None]:
old_run_merged["old_xgb_vs_old_run_score_difference"] = old_run_merged["old_xgboost_sliding_window_score"] - old_run_merged["old_run_sliding_window_score"]
old_run_merged["old_xgb_vs_old_run_score_difference"]

In [None]:
fig = plt.figure(figsize=(5,4))
plt.hist(x=old_run_merged["old_xgb_vs_old_run_score_difference"], bins=50)
plt.xlim((-1,1))

In [None]:
# Same file as the hard-coded absolute path, built from output_dir so the
# project location is defined in one place only.
full_dataset_scores = pd.read_parquet(
    os.path.join(output_dir, "sliding_window_tf_to_peak_score.parquet"),
    engine="pyarrow"
)

In [None]:
full_dataset_scores

In [None]:
full_dataset_scores["source_id"] = full_dataset_scores["source_id"].str.upper()

In [None]:
# Inner merge of the labeled edges with the full sliding window score file.
# Both inputs have a "sliding_window_score" column, so the result contains
# "sliding_window_score_x" (from inferred_edges) and "sliding_window_score_y"
# (from full_dataset_scores).
full_dataset_merged = pd.merge(
    inferred_edges,
    full_dataset_scores,
    on=["source_id", "peak_id"],
    how="inner"
)

In [None]:
balanced_full_dataset = balance_dataset(full_dataset_merged)

In [None]:
plot_true_false_feature_histogram(balanced_full_dataset, "sliding_window_score_y")

Let's try using the edges from the sliding window scores that caused the different distribution.

In [None]:
# Restrict the full score file to the (TF, peak) edges present in
# old_xgboost_model_merged, keeping that frame's columns alongside the
# score file's "sliding_window_score".
full_dataset_merged_w_diff_dist = pd.merge(
    old_xgboost_model_merged,
    full_dataset_scores,
    on=["source_id", "peak_id"],
    how="inner"
)
full_dataset_merged_w_diff_dist

In [None]:
full_dataset_merged_w_diff_dist = full_dataset_merged_w_diff_dist.rename(columns={"sliding_window_score":"full_dataset_sliding_window_score"})

In [None]:
full_dataset_merged_w_diff_dist_balanced = balance_dataset(full_dataset_merged_w_diff_dist)

In [None]:
plot_true_false_feature_histogram(full_dataset_merged_w_diff_dist_balanced, "full_dataset_sliding_window_score")