Import the dependencies and define the functions that load the ground truth and the melted inferred-network score DataFrame, pivoting the latter into a dense matrix. A label column is then added to the inferred network: 1 if the edge is in the ground truth dataset, otherwise 0.

In [7]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split
from dask.distributed import Client
import xgboost as xgb
import argparse
import logging
import os
import csv

logging.basicConfig(level=logging.INFO, format='%(message)s')

def read_inferred_network(inferred_network_file: str) -> dd.DataFrame:
    """
    Read a long-format ("melted") score table from Parquet and reshape it to wide format.

    The input is accessed through the columns ``source_id``, ``target_id``,
    ``score_type`` and ``score_value``. Processing steps:

    1. Upper-case ``source_id`` and ``target_id``.
    2. Mean-aggregate ``score_value`` per (source_id, target_id, score_type) in Dask.
    3. Materialise the aggregated table in pandas and pivot it so that every
       ``score_type`` becomes its own column, one row per (source_id, target_id).

    Args:
        inferred_network_file: Path of the Parquet dataset to read.

    Returns:
        A single-partition Dask DataFrame with ``source_id``, ``target_id`` and
        one column per score type.
    """
    logging.info(f"Loading melted sparse network from: {inferred_network_file}")

    edge_keys = ["source_id", "target_id"]

    long_scores = dd.read_parquet(inferred_network_file, engine="pyarrow")

    # Normalise the casing of both ID columns before grouping on them
    for id_col in edge_keys:
        long_scores[id_col] = long_scores[id_col].str.upper()

    # One mean score per (source, target, score type), still lazy at this point
    mean_scores = long_scores.groupby(edge_keys + ["score_type"])["score_value"].mean()

    # The whole aggregated table is pulled into memory here: the pivot is done in pandas
    mean_scores_pd = mean_scores.reset_index().compute()

    # Each (source, target, score type) is already unique after the groupby, so
    # the "mean" below only reshapes the values and does not combine any
    wide_scores = mean_scores_pd.pivot_table(
        index=edge_keys,
        columns="score_type",
        values="score_value",
        aggfunc="mean",
    )
    wide_scores = wide_scores.reset_index()

    return dd.from_pandas(wide_scores, npartitions=1)

def read_ground_truth(ground_truth_file):
    """
    Load the ground-truth edge list from a tab-separated file.

    The first line is used as the header, quote characters are treated as
    ordinary text (``csv.QUOTE_NONE``) and malformed lines are skipped. The
    ``Source`` and ``Target`` columns are renamed to ``source_id`` and
    ``target_id``; any other columns are returned unchanged.

    Args:
        ground_truth_file: Path of the TSV file to read.

    Returns:
        A pandas DataFrame holding the ground-truth edges.
    """
    logging.info("Reading in the ground truth")

    column_map = {"Source": "source_id", "Target": "target_id"}
    raw_edges = pd.read_csv(
        ground_truth_file,
        sep='\t',
        header=0,
        quoting=csv.QUOTE_NONE,
        on_bad_lines='skip',
    )
    return raw_edges.rename(columns=column_map)

def label_edges_with_ground_truth(inferred_network_dd, ground_truth_df):
    """
    Add a binary ``label`` column marking which inferred edges are in the ground truth.

    The ground-truth ``source_id`` / ``target_id`` values are upper-cased and
    collected into a set of (source, target) pairs. Each row of the inferred
    network gets ``label = 1`` when its (source_id, target_id) pair is in that
    set and ``0`` otherwise. The inferred network IDs are compared as-is, so
    they must already be upper-cased by the caller for matches to be found.

    Args:
        inferred_network_dd: Dask DataFrame with ``source_id`` and ``target_id`` columns.
        ground_truth_df: pandas DataFrame with string ``source_id`` and ``target_id`` columns.

    Returns:
        A Dask DataFrame with the same columns as ``inferred_network_dd`` plus
        an int64 ``label`` column. The input frame is not modified.
    """
    logging.info("Creating ground truth set")
    ground_truth_pairs = set(zip(
        ground_truth_df["source_id"].str.upper(),
        ground_truth_df["target_id"].str.upper(),
    ))

    logging.info("Adding labels to inferred network")

    def label_partition(df):
        # Build the labels as an explicit int64 array: a plain Python list
        # would give an empty partition a non-integer column, contradicting
        # the int64 dtype declared in `meta` below.
        edge_pairs = zip(df["source_id"], df["target_id"])
        labels = np.fromiter(
            (pair in ground_truth_pairs for pair in edge_pairs),
            dtype=np.int64,
            count=len(df),
        )
        # assign() returns a new frame, so the incoming partition is left untouched
        return df.assign(label=labels)

    return inferred_network_dd.map_partitions(
        label_partition,
        meta=inferred_network_dd._meta.assign(label=np.int64(0)),
    )


In [10]:
# Input locations
ground_truth_file: str = "/gpfs/Labs/Uzun/DATA/PROJECTS/2024.SC_MO_TRN_DB.MIRA/REPOSITORY/CURRENT/REFERENCE_NETWORKS/RN111_ChIPSeq_BEELINE_Mouse_ESC.tsv"
inferred_network_file: str = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output/mESC/filtered_L2_E7.5_rep1/inferred_grns/inferred_score_df.parquet"

# Output locations and run settings
trained_model_dir: str = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output/mESC/filtered_L2_E7.5_rep1/trained_models"
fig_dir: str = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/figures/mm10/filtered_L2_E7.5_rep1"
model_save_name: str = "mESC_filtered_L2_E7.5_rep1_xgb_model"
num_cpu: int = 4

# Load the wide-format inferred network first, then the ground-truth edge list
inferred_network_dd = read_inferred_network(inferred_network_file=inferred_network_file)
ground_truth_df = read_ground_truth(ground_truth_file=ground_truth_file)

# Attach the 0/1 ground-truth label to every inferred edge
inferred_network_dd = label_edges_with_ground_truth(
    inferred_network_dd=inferred_network_dd,
    ground_truth_df=ground_truth_df,
)

Extract the feature columns to use for training

In [9]:
# Columns that identify an edge or hold the target label; these are excluded
# from the feature set. Names that are not present in the DataFrame are simply
# never matched by the filter below.
# NOTE(review): "peak_id" does not appear among the pivot index columns produced
# by read_inferred_network — presumably kept for inputs that carry it; confirm.
drop_cols = ["source_id", "peak_id", "target_id", "label"]
# Every remaining column is treated as a model feature, in the DataFrame's column order
feature_names = [col for col in inferred_network_dd.columns if col not in drop_cols]

# Only keep columns needed for modeling: the features plus the label column
logging.info(f"Keeping {len(feature_names)} feature columns + labels")
# persist() asks Dask to compute this subset and keep it around for the steps that follow
model_dd = inferred_network_dd[feature_names + ["label"]].persist()

# shape[0] is lazy on a Dask DataFrame; compute() resolves it to the actual row count for the log message
logging.info(f"Splitting {model_dd.shape[0].compute():,} rows into train/test with stratification")