In [None]:
import dask.dataframe as dd
from dask.distributed import Client
import xgboost as xgb
import pandas as pd
import os
import logging
import argparse

def parse_args() -> argparse.Namespace:
    """
    Build the command-line interface and parse the process arguments.

    Four string options are declared and all of them are mandatory:
    ``--output_dir``, ``--model``, ``--target`` and ``--save_name``.

    Returns:
        argparse.Namespace: Parsed values exposed as ``output_dir``, ``model``,
        ``target`` and ``save_name``.
    """
    # (flag, help text) pairs; every option is a required string
    required_options = (
        ("--output_dir", "Directory to save predictions"),
        ("--model", "Path to trained XGBoost .json Booster model"),
        ("--target", "Path to .parquet file for inference"),
        ("--save_name", "Filename for output"),
    )

    cli = argparse.ArgumentParser(description="Apply Dask-trained XGBoost model to a new network")
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)
    return cli.parse_args()

def read_inferred_network(inferred_network_file: str) -> dd.DataFrame:
    """
    Read a melted (long-format) inferred network from Parquet and return it in wide format.

    ``source_id`` and ``target_id`` are upper-cased, ``score_value`` is averaged per
    (source_id, peak_id, target_id, score_type), and the result is pivoted so that each
    row is one (source_id, peak_id, target_id) triple with one column per score_type.

    The aggregated table is materialised in pandas for the pivot, so it has to fit in
    memory; the wide table is handed back as a single-partition Dask DataFrame.
    """
    logging.info(f"Loading melted sparse network from: {inferred_network_file}")
    long_ddf = dd.read_parquet(inferred_network_file, engine="pyarrow")

    # Normalise identifier case before grouping
    long_ddf = long_ddf.assign(
        source_id=long_ddf["source_id"].str.upper(),
        target_id=long_ddf["target_id"].str.upper(),
    )

    # Mean score per (source, peak, target, score_type), pulled into pandas
    key_columns = ["source_id", "peak_id", "target_id"]
    mean_scores = (
        long_ddf
        .groupby(key_columns + ["score_type"])["score_value"]
        .mean()
        .reset_index()
        .compute()
    )

    # Long -> wide: one column per score_type
    wide_df = mean_scores.pivot_table(
        index=key_columns,
        columns="score_type",
        values="score_value",
        aggfunc="first",
    ).reset_index()

    return dd.from_pandas(wide_df, npartitions=1)

def main():
    """
    Command-line entry point: score a new inferred network with a trained XGBoost Booster.

    Steps:
        1. Parse the CLI options (``--model``, ``--target``, ``--output_dir``, ``--save_name``).
        2. Load the Booster and the wide-format inferred network.
        3. Predict through the XGBoost Dask interface on a local Dask client.
        4. Attach the predictions to the (source_id, peak_id, target_id) rows, drop
           duplicate rows and write a tab-separated file to ``output_dir/save_name``.

    Raises:
        ValueError: If the loaded Booster carries no feature names.
        KeyError: If the inferred network lacks one or more of the Booster's features.
    """
    args = parse_args()

    model_path = args.model
    target_path = args.target
    output_dir = args.output_dir
    save_name = args.save_name

    logging.info("Loading XGBoost Booster")
    booster = xgb.Booster()
    booster.load_model(model_path)

    logging.info("Reading inferred network")
    inferred_dd = read_inferred_network(target_path)

    # Fail early with an explicit message instead of an opaque indexing error
    feature_names = booster.feature_names
    if not feature_names:
        raise ValueError(f"Model {model_path} does not store feature names; cannot select input columns")
    missing_features = [name for name in feature_names if name not in inferred_dd.columns]
    if missing_features:
        raise KeyError(f"Inferred network is missing model features: {missing_features}")

    X_dd = inferred_dd[feature_names]

    logging.info("Converting to DaskDMatrix")
    # The context manager shuts the client down even if prediction fails
    with Client() as client:
        dtest = xgb.dask.DaskDMatrix(data=X_dd, feature_names=feature_names, client=client)

        logging.info("Running distributed prediction")
        y_pred = xgb.dask.predict(client=client, model=booster, data=dtest)

        # Convert to pandas (merging Dask DataFrame + Dask array); both are materialised
        # while the client is still alive
        logging.info("Joining predictions back to source-target pairs")
        result_df = inferred_dd[["source_id", "peak_id", "target_id"]].compute()
        result_df["score"] = y_pred.compute()

    result_df = result_df.drop_duplicates()

    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, save_name)
    logging.info(f"Saving to {output_path}")
    result_df.to_csv(output_path, sep="\t", index=False)
    logging.info("Done!")

# Script entry point: configure INFO-level logging with a message-only format, then run main().
# NOTE(review): inside a notebook kernel `__name__` is presumably "__main__" too, so executing
# this cell would call main(), whose parse_args() requires four CLI options — confirm this
# cell is only meant to be run as a standalone script.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    main()

In [None]:
import dask.dataframe as dd
from dask.distributed import Client
import xgboost as xgb
import pandas as pd
import os
import logging
import argparse

def read_inferred_network(inferred_network_file: str) -> dd.DataFrame:
    """
    Load a long-format inferred network from Parquet and reshape it to wide format.

    ``source_id`` and ``target_id`` are upper-cased, ``score_value`` is mean-aggregated per
    (source_id, peak_id, target_id, score_type), and the table is pivoted so that every
    row is a (source_id, peak_id, target_id) triple and every score_type is a column.

    The pivot runs in pandas on the fully computed aggregate, and the result is returned
    as a Dask DataFrame with a single partition.
    """
    logging.info(f"Loading melted sparse network from: {inferred_network_file}")
    long_ddf = dd.read_parquet(inferred_network_file, engine="pyarrow")

    # Standardise identifier case
    long_ddf = long_ddf.assign(
        source_id=long_ddf["source_id"].str.upper(),
        target_id=long_ddf["target_id"].str.upper(),
    )

    # Average the scores, then bring the aggregate into pandas
    row_keys = ["source_id", "peak_id", "target_id"]
    averaged = (
        long_ddf
        .groupby(row_keys + ["score_type"])["score_value"]
        .mean()
        .reset_index()
        .compute()
    )

    # One column per score_type
    wide_table = averaged.pivot_table(
        index=row_keys,
        columns="score_type",
        values="score_value",
        aggfunc="first",
    ).reset_index()

    return dd.from_pandas(wide_table, npartitions=1)

# Hard-coded inputs for the interactive run; they play the same roles as the CLI options
# declared in parse_args:
#   model_path  - XGBoost Booster .json file passed to booster.load_model
#   target_path - Parquet file passed to read_inferred_network
#   output_dir / save_name - output location and file name
# NOTE(review): these are absolute cluster paths, so the notebook only runs where they exist.
model_path = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output/DS011_mESC/DS011_mESC_sample1_old/trained_models/xgb_DS011_mESC_sample1_old_model.json"
target_path = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output/mESC/filtered_L2_E7.5_rep2_old/inferred_grns/inferred_score_df.parquet"
output_dir = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output/DS011_mESC/DS011_mESC_sample1_old/model_predictions"
save_name = "testing_aggregation_predictions.tsv"


In [None]:
# Create an empty Booster and populate it from the model file at model_path
logging.info("Loading XGBoost Booster")
booster = xgb.Booster()
booster.load_model(model_path)

In [None]:
# Wide-format network: one row per (source_id, peak_id, target_id), one column per score_type
logging.info("Reading inferred network")
inferred_dd = read_inferred_network(target_path)

In [None]:
# Feature names stored on the loaded Booster; used below to select the model's input columns
feature_names = booster.feature_names

In [None]:
feature_names

In [None]:
# Keep only the columns listed in the Booster's feature names
X_dd = inferred_dd[feature_names]

In [None]:
# Start a Dask client with default settings and wrap the feature columns in a DaskDMatrix.
# NOTE(review): the client is not closed in any of the cells visible here.
logging.info("Converting to DaskDMatrix")
client = Client()
dtest = xgb.dask.DaskDMatrix(data=X_dd, feature_names=feature_names, client=client)

In [None]:
# Predict through the XGBoost Dask interface; the result is materialised later via y_pred.compute()
logging.info("Running distributed prediction")
y_pred = xgb.dask.predict(client=client, model=booster, data=dtest)

In [None]:
y_pred

In [None]:
# Materialise the row keys in pandas, attach the computed predictions as "score",
# then remove duplicate rows
logging.info("Joining predictions back to source-target pairs")
result_df = (
    inferred_dd[["source_id", "peak_id", "target_id"]]
    .compute()
    .assign(score=y_pred.compute())
    .drop_duplicates()
)

In [None]:
result_df

In [None]:
# Group the peak-level predictions by (source_id, target_id), keeping the peak_id column
tf_tg_edge_groups = result_df.groupby(["source_id", "target_id"])["peak_id"]

In [None]:
# Despite the variable name, this is a count (not a mean): the number of non-null
# peak_id entries for each (source_id, target_id) group
tf_tg_edge_mean_peak_scores = tf_tg_edge_groups.count()



In [None]:
tf_tg_edge_mean_peak_scores

In [None]:
# Group the peak-level predictions by (source_id, target_id), keeping the score column
tf_tg_edge_scores = result_df.groupby(["source_id", "target_id"])["score"]

In [None]:
# Mean prediction score per (source_id, target_id) pair, as a Series indexed by the pair
tf_tg_mean_scores = tf_tg_edge_scores.mean()

In [None]:
tf_tg_mean_scores

In [None]:

def read_ground_truth(ground_truth_file):
    """
    Read a tab-separated ground-truth network into a pandas DataFrame.

    The file is read with a header row and quoting disabled; malformed lines are skipped.
    The ``Source`` and ``Target`` columns are renamed to ``source_id`` and ``target_id``.

    Args:
        ground_truth_file: Path (or file-like object) of the TSV file.

    Returns:
        pandas.DataFrame: The table with ``source_id`` and ``target_id`` columns.

    Raises:
        ValueError: If, after renaming, ``source_id`` or ``target_id`` is absent.
    """
    # Imported here so the function does not depend on another cell having run `import csv`
    import csv

    ground_truth = pd.read_csv(ground_truth_file, sep='\t', quoting=csv.QUOTE_NONE, on_bad_lines='skip', header=0)
    ground_truth = ground_truth.rename(columns={"Source": "source_id", "Target": "target_id"})

    # rename() silently ignores absent columns, so check the result explicitly
    missing_columns = {"source_id", "target_id"} - set(ground_truth.columns)
    if missing_columns:
        raise ValueError(
            f"Ground truth file is missing required columns: {sorted(missing_columns)}"
        )
    return ground_truth

def label_edges_with_ground_truth(inferred_network_dd, ground_truth_df):
    """
    Add a 0/1 ``label`` column marking which edges appear in the ground truth.

    An edge is labelled 1 when its upper-cased (source_id, target_id) pair is present
    among the upper-cased (source_id, target_id) pairs of ``ground_truth_df``.

    Args:
        inferred_network_dd: Dask or pandas DataFrame with ``source_id`` and ``target_id``
            columns. It is not modified.
        ground_truth_df: pandas DataFrame with ``source_id`` and ``target_id`` columns.

    Returns:
        A new frame of the same kind as the input, with ``source_id``/``target_id``
        upper-cased and an int64 ``label`` column added.
    """
    import numpy as np

    ground_truth_pairs = set(zip(
        ground_truth_df["source_id"].str.upper(),
        ground_truth_df["target_id"].str.upper()
    ))

    def label_partition(df):
        # Work on a copy so neither the caller's frame nor a Dask partition is mutated
        df = df.copy()
        df["source_id"] = df["source_id"].str.upper()
        df["target_id"] = df["target_id"].str.upper()
        df["label"] = np.array(
            [pair in ground_truth_pairs for pair in zip(df["source_id"], df["target_id"])],
            dtype=np.int64,
        )
        return df

    # Dask collections are labelled partition by partition; anything else is treated
    # as an in-memory pandas frame and labelled directly
    if hasattr(inferred_network_dd, "map_partitions"):
        return inferred_network_dd.map_partitions(
            label_partition,
            meta=inferred_network_dd._meta.assign(label=np.int64(0))
        )
    return label_partition(inferred_network_dd)

# Tab-separated reference network that read_ground_truth is called with below
ground_truth_file = "/gpfs/Labs/Uzun/DATA/PROJECTS/2024.SC_MO_TRN_DB.MIRA/REPOSITORY/CURRENT/REFERENCE_NETWORKS/RN111_ChIPSeq_BEELINE_Mouse_ESC.tsv"



In [None]:
import csv

In [None]:
# Load the reference network with columns renamed to source_id / target_id
ground_truth_df = read_ground_truth(ground_truth_file)

In [None]:
tf_tg_mean_scores.reset_index()

In [None]:
def label_edges_with_ground_truth(inferred_network_dd, ground_truth_df):
    """
    Label each edge with 1 if it occurs in the ground truth and 0 otherwise.

    Matching is done on upper-cased (source_id, target_id) pairs. The function accepts
    either a pandas DataFrame (labelled directly) or a Dask DataFrame (labelled per
    partition); the input frame itself is left unchanged.

    Args:
        inferred_network_dd: pandas or Dask DataFrame with ``source_id`` and
            ``target_id`` columns.
        ground_truth_df: pandas DataFrame with ``source_id`` and ``target_id`` columns.

    Returns:
        A new frame of the same kind as the input with upper-cased ``source_id`` /
        ``target_id`` and an added int64 ``label`` column.
    """
    import numpy as np

    ground_truth_pairs = set(zip(
        ground_truth_df["source_id"].str.upper(),
        ground_truth_df["target_id"].str.upper()
    ))

    def label_partition(df):
        # Copy first so the caller's data is never modified in place
        df = df.copy()
        df["source_id"] = df["source_id"].str.upper()
        df["target_id"] = df["target_id"].str.upper()
        tf_tg_tuples = zip(df["source_id"], df["target_id"])
        df["label"] = np.array(
            [pair in ground_truth_pairs for pair in tf_tg_tuples],
            dtype=np.int64,
        )
        return df

    # DataFrame.apply would hand label_partition individual columns, not whole frames,
    # so pandas input is labelled directly and Dask input goes through map_partitions
    if hasattr(inferred_network_dd, "map_partitions"):
        return inferred_network_dd.map_partitions(
            label_partition,
            meta=inferred_network_dd._meta.assign(label=np.int64(0))
        )
    return label_partition(inferred_network_dd)

In [None]:
# Per-edge mean score as a Series indexed by (source_id, target_id)
tf_tg_mean_scores = tf_tg_edge_scores.mean()

In [None]:
# Build the flat (source_id, target_id, score) table from the grouped scores rather than
# from tf_tg_mean_scores itself, so re-running this cell does not stack an extra "index" column
tf_tg_mean_scores = tf_tg_edge_scores.mean().reset_index()

In [None]:
tf_tg_mean_scores

In [None]:
# Upper-cased (source_id, target_id) pairs of the reference network, for O(1) membership tests
ground_truth_pairs = {
    (source, target)
    for source, target in zip(
        ground_truth_df["source_id"].str.upper(),
        ground_truth_df["target_id"].str.upper(),
    )
}

In [None]:
# Upper-case both identifier columns so they compare equal to the ground-truth pairs
tf_tg_mean_scores[["source_id", "target_id"]] = (
    tf_tg_mean_scores[["source_id", "target_id"]].apply(lambda ids: ids.str.upper())
)

In [None]:
# 1 when the (source_id, target_id) pair is in the ground truth, 0 otherwise
tf_tg_mean_scores["label"] = [
    int(edge in ground_truth_pairs)
    for edge in zip(tf_tg_mean_scores["source_id"], tf_tg_mean_scores["target_id"])
]

In [None]:
tf_tg_mean_scores

In [None]:
import random

In [None]:
# Split edges by their integer ground-truth label
true_edges = tf_tg_mean_scores.loc[tf_tg_mean_scores["label"] == 1]
false_edges = tf_tg_mean_scores.loc[tf_tg_mean_scores["label"] == 0]

In [None]:
from typing import Union

In [None]:
def convert_true_false_to_string(value: Union[int, str]):
    """
    Convert a 0/1 ground-truth label to the string "False"/"True".

    Strings are returned unchanged, so the conversion can be applied repeatedly.
    Any integral value is accepted (Python ``int``, ``bool``, NumPy integer scalars).

    Args:
        value: Integer label (0 or 1) or an already-converted string.

    Returns:
        str: "False" for 0, "True" for 1, or ``value`` itself when it is a string.

    Raises:
        ValueError: If ``value`` is an integer other than 0 or 1.
        TypeError: If ``value`` is neither an integer nor a string.
    """
    # Local import: numbers.Integral also matches NumPy integer scalars, which an
    # exact `type(value) == int` comparison rejects
    import numbers

    if isinstance(value, str):
        return value
    if isinstance(value, numbers.Integral):
        if value == 0:
            return "False"
        if value == 1:
            return "True"
        raise ValueError(f"Value {value} in the ground truth label columns is not 0 or 1")
    # Report the offending type, as the message promises
    raise TypeError("Value must be either of type int or str, got %s" % type(value).__name__)
        

# Replace the label column with its "True"/"False" string form (strings pass through unchanged)
tf_tg_mean_scores["label"] = tf_tg_mean_scores["label"].apply(
    convert_true_false_to_string
)
# One score column per label value, rows keyed by the existing index
tf_tg_pivoted: pd.DataFrame = tf_tg_mean_scores.pivot(
    columns="label",
    values="score",
)

In [None]:
# Number of edges in each string-labelled class
n_true = int(tf_tg_mean_scores["label"].eq("True").sum())
n_false = int(tf_tg_mean_scores["label"].eq("False").sum())


In [None]:
n_false

In [None]:
n_true

In [None]:
# Down-sample both classes to the size of the smaller one so the two groups are equal-sized.
# A fixed random_state makes the draw repeatable across kernel restarts.
min_group_count = min(n_false, n_true)

tf_tg_true = tf_tg_mean_scores[tf_tg_mean_scores["label"] == "True"].sample(min_group_count, random_state=42)
tf_tg_false = tf_tg_mean_scores[tf_tg_mean_scores["label"] == "False"].sample(min_group_count, random_state=42)

tf_tg_balanced = pd.concat([tf_tg_true, tf_tg_false])

In [None]:
len(tf_tg_balanced[tf_tg_balanced["label"] == "False"])

In [None]:
len(tf_tg_balanced[tf_tg_balanced["label"] == "True"])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# tf_tg_balanced holds the per-edge mean of the peak-level prediction scores in its
# "score" column, so the figure is labelled as a score distribution (not a peak count)
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(data=tf_tg_balanced, x="score", hue="label", bins=50, element="step", stat="count", ax=ax)
ax.set_title("Distribution of mean prediction scores for TF-TG edges", fontsize=16)
ax.set_xlabel("Mean score across peaks", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
plt.show()

In [None]:
tf_tg_balanced

In [None]:
tf_tg_edge_scores

In [None]:
tf_tg_edge_scores.count()

In [None]:
tf_tg_edge_scores.count().reset_index()

In [None]:
# Number of scored rows per (source_id, target_id) edge; the count lands in the "score" column
tf_tg_edge_count = tf_tg_edge_scores.count().reset_index()
# String label: "True" when the edge is in the ground truth, "False" otherwise
tf_tg_edge_count["label"] = [
    "True" if edge in ground_truth_pairs else "False"
    for edge in zip(tf_tg_edge_count["source_id"], tf_tg_edge_count["target_id"])
]

In [None]:
# Class sizes in the per-edge count table
n_true = len(tf_tg_edge_count[tf_tg_edge_count["label"] == "True"])
n_false = len(tf_tg_edge_count[tf_tg_edge_count["label"] == "False"])

min_group_count = min(n_false, n_true)

# Down-sample both classes to the smaller size; the fixed random_state keeps the
# selection reproducible between runs
tf_tg_true = tf_tg_edge_count[tf_tg_edge_count["label"] == "True"].sample(min_group_count, random_state=42)
tf_tg_false = tf_tg_edge_count[tf_tg_edge_count["label"] == "False"].sample(min_group_count, random_state=42)

tf_tg_count_balanced = pd.concat([tf_tg_true, tf_tg_false])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the balanced per-edge table, coloured by ground-truth label
fig, ax = plt.subplots(figsize=(8,8))
sns.histplot(
    data=tf_tg_count_balanced,
    x="score",
    hue="label",
    bins=50,
    element="step",
    stat="count",
    ax=ax,
)
ax.set_title("Distribution of peak counts for TF-TG edges", fontsize=16)
ax.set_xlabel("Number of peaks", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Same histogram as the linear version, but with a logarithmic data axis
fig, ax = plt.subplots(figsize=(8,8))
sns.histplot(
    data=tf_tg_count_balanced,
    x="score",
    hue="label",
    log_scale=True,
    bins=50,
    element="step",
    stat="count",
    ax=ax,
)
ax.set_title("Distribution of peak counts for TF-TG edges", fontsize=16)
ax.set_xlabel("Number of peaks", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
plt.show()

In [None]:
# Eagerly load the same Parquet file that read_inferred_network was given, this time
# straight into pandas (no upper-casing of the identifier columns is applied here)
inferred_df = pd.read_parquet(target_path, engine="pyarrow")

In [None]:
inferred_df

In [None]:
# Mean score_value per (source_id, peak_id, target_id, score_type), as a flat pandas table
grouped_ddf = inferred_df.groupby(
    ["source_id", "peak_id", "target_id", "score_type"]
)["score_value"].mean().reset_index()


def pivot_partition(df):
    """Reshape a long score table into one row per (source_id, peak_id, target_id) with a column per score_type."""
    wide = df.pivot_table(
        values="score_value",
        index=["source_id", "peak_id", "target_id"],
        columns="score_type",
        aggfunc="first",
    )
    return wide.reset_index()


inferred_df_full = pivot_partition(grouped_ddf)

In [None]:
def aggregate_tf_tg_score(peak_scores, percentile=75):
    """
    Summarise a set of scores by one of their percentiles.

    Args:
        peak_scores: Array-like of numeric scores.
        percentile: Percentile to compute, in [0, 100]. Defaults to 75, the value that
            was previously hard-coded.

    Returns:
        The requested percentile of ``peak_scores`` as computed by ``numpy.percentile``.
    """
    # Local import: the notebook never imports numpy at module level
    import numpy as np

    return np.percentile(peak_scores, percentile)

def softmax_weighted_average(peak_scores, lambda_=5.0):
    """
    Average the scores with softmax weights, so larger scores count more.

    Each score ``s`` gets weight ``exp(lambda_ * s)``; the result is
    ``sum(w * s) / sum(w)``. ``lambda_ = 0`` gives the plain mean.

    Args:
        peak_scores: Array-like of numeric scores.
        lambda_: Sharpness of the weighting.

    Returns:
        The weighted average as a float, or NaN when ``peak_scores`` is empty.
    """
    # Local import: the notebook never imports numpy at module level
    import numpy as np

    scores = np.asarray(peak_scores, dtype=float)
    if scores.size == 0:
        return float("nan")

    # Subtracting the largest exponent leaves the weight ratios unchanged but keeps
    # exp() from overflowing for large lambda_ * score values
    exponents = lambda_ * scores
    weights = np.exp(exponents - exponents.max())
    return np.sum(weights * scores) / np.sum(weights)

In [None]:
# Preview the first rows of the peak-level predictions using the notebook's rich table display
result_df.head()

In [None]:
# numpy is used by the lambdas below but is not imported at module level elsewhere in the notebook
import numpy as np

# Score cutoff for tf_tg_frac_high: the fraction of an edge's peak scores above this value
threshold = 0.5

# Three per-edge summaries of the peak-level scores for every (source_id, target_id) pair
agg_df = result_df.groupby(["source_id", "target_id"])["score"].agg(
    tf_tg_score_75pct = lambda x: np.percentile(x, 75),
    tf_tg_score_softmax = lambda x: softmax_weighted_average(x, lambda_=5.0),
    tf_tg_frac_high = lambda x: np.mean(x > threshold)
).reset_index()

In [None]:
agg_df

In [None]:
# String label: "True" when the (source_id, target_id) pair is in the ground truth
agg_df["label"] = [
    "True" if edge in ground_truth_pairs else "False"
    for edge in zip(agg_df["source_id"], agg_df["target_id"])
]

In [None]:
# Class sizes in the aggregated per-edge table
n_true = len(agg_df[agg_df["label"] == "True"])
n_false = len(agg_df[agg_df["label"] == "False"])

min_group_count = min(n_false, n_true)

# Down-sample both classes to the smaller size; the fixed random_state keeps the
# selection reproducible between runs
tf_tg_true = agg_df[agg_df["label"] == "True"].sample(min_group_count, random_state=42)
tf_tg_false = agg_df[agg_df["label"] == "False"].sample(min_group_count, random_state=42)

# NOTE(review): this rebinds tf_tg_count_balanced, which earlier cells fill from the
# per-edge count table — confirm the reuse of the name is intended
tf_tg_count_balanced = pd.concat([tf_tg_true, tf_tg_false])