### Investigating the bimodal distribution of True scores

We now have the True and False sliding window scores between the 5th and 95th percentiles. The False scores form a single broad distribution centered around 0.6, while the True scores are bimodal, with a lower mode around 0.375–0.425 and a higher mode around 0.8–0.9. Next, we look at the lower mode to see what is causing the high number of True values around 0.4.

In [None]:
!hostnamectl

In [None]:
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

# Load the labeled inferred GRN table, then keep only the ID columns, the
# sliding window score, and the label for the analysis below.
labeled_grn_path = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output/DS011_mESC/DS011_mESC_sample1/labeled_inferred_grn.parquet"
score_columns = [
    "source_id",
    "peak_id",
    "target_id",
    "sliding_window_score",
    "label",
]

inferred_df = pd.read_parquet(labeled_grn_path, engine="pyarrow")
sliding_window_score_df = inferred_df.loc[:, score_columns]

# Bare expression so the notebook renders the selected table.
sliding_window_score_df

In [None]:
def balance_dataset(df, random_state=None):
    """Subsample ``df`` so that it holds equally many True and False rows.

    Rows with ``label == 1`` and rows with ``label == 0`` are each randomly
    sampled (without replacement) down to the size of the smaller class, and
    the two samples are concatenated (True rows first). Rows whose label is
    neither 1 nor 0 are dropped. Class counts before and after balancing are
    printed.

    Args:
        df: DataFrame with a ``"label"`` column containing 1 (True) and
            0 (False).
        random_state: Optional seed (or random generator) forwarded to
            ``DataFrame.sample`` for both classes. Pass a fixed value to get
            the same balanced subset on every run; the default ``None`` keeps
            the original non-deterministic sampling.

    Returns:
        A new DataFrame with ``min(n_true, n_false)`` rows of each class,
        keeping the original index values.
    """
    true_rows = df[df["label"] == 1]
    false_rows = df[df["label"] == 0]

    print("Before Balancing:")
    print(f"  - Number of True values: {len(true_rows)}")
    print(f"  - Number of False values: {len(false_rows)}")

    min_rows = min(len(true_rows), len(false_rows))
    print(f"\nSubsampling down to {min_rows} rows")

    true_rows_sampled = true_rows.sample(n=min_rows, random_state=random_state)
    false_rows_sampled = false_rows.sample(n=min_rows, random_state=random_state)

    balanced_df = pd.concat([true_rows_sampled, false_rows_sampled])

    # The concatenated frame contains exactly the two samples, so their
    # lengths are the post-balancing class counts; no need to filter again.
    print("\nAfter Balancing:")
    print(f"  - Number of True values: {len(true_rows_sampled)}")
    print(f"  - Number of False values: {len(false_rows_sampled)}")

    return balanced_df

In [None]:
def plot_true_false_feature_histogram(
    df,
    feature_col,
    limit_x = True
):
    """Plot overlaid histograms of one feature for True and False rows.

    Rows are split on ``df["label"]`` (1 = True, 0 = False), NaN values in
    ``feature_col`` are dropped, and both classes are drawn as 50-bin
    histograms on the same axes. The figure is shown with ``plt.show()``;
    nothing is returned.

    Args:
        df: DataFrame with a ``"label"`` column and the column ``feature_col``.
        feature_col: Name of the column to plot; also used as the title and
            x-axis label.
        limit_x: If True, restrict the x-axis to the interval [0, 1].
    """
    false_values = df.loc[df["label"] == 0, feature_col].dropna()
    true_values = df.loc[df["label"] == 1, feature_col].dropna()

    # Bin both classes over one shared value range. Letting each hist() call
    # derive its own range gives the two classes different bin edges and
    # widths, which makes the overlaid bar heights incomparable.
    all_values = pd.concat([false_values, true_values])
    if all_values.empty:
        shared_range = None  # nothing to plot; fall back to hist's default
    else:
        shared_range = (all_values.min(), all_values.max())

    fig, ax = plt.subplots(figsize=(5, 4))

    ax.hist(
        false_values,
        bins=50, range=shared_range, alpha=0.7,
        color='#1682b1', edgecolor="#032b5f",
        label="False",
    )
    ax.hist(
        true_values,
        bins=50, range=shared_range, alpha=0.7,
        color="#cb5f17", edgecolor="#b13301",
        label="True",
    )

    ax.set_title(feature_col, fontsize=14)
    ax.set_xlabel(feature_col, fontsize=14)
    ax.set_ylabel("Frequency", fontsize=14)
    if limit_x:
        ax.set_xlim(0, 1)

    # Figure-level legend placed below the axes; tight_layout reserves the
    # bottom 5% of the figure so the legend does not overlap the x-axis label.
    fig.legend(
        loc="lower center",
        ncol=2,
        fontsize=14,
        bbox_to_anchor=(0.5, -0.02)
    )
    fig.tight_layout(rect=[0, 0.05, 1, 1])
    plt.show()