In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os

# Score names that are read from the combined inferred score table instead of a
# per-feature parquet file. create_score_path_dict maps every one of these names to
# "inferred_grns/inferred_score_df.parquet", and the loader functions below branch on
# membership in this list.
COMBINED_SCORE_DF_FEATURES = [
    "mean_TF_expression", 
    "mean_peak_accessibility", 
    "mean_TG_expression",
    "string_combined_score",
    "string_experimental_score",
    "string_textmining_score"
    ]

# Project-wide input/output roots (absolute paths).
main_input_dir = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/input"
main_output_dir = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output"

# Per-sample directories for DS011_mESC sample 1 (not referenced by the cells visible here).
ds011_input_dir = os.path.join(main_input_dir, "DS011_mESC/DS011_mESC_sample1")
ds011_output_dir = os.path.join(main_output_dir, "DS011_mESC/DS011_mESC_sample1")

# Per-sample directories for mESC filtered_L2_E7.5_rep2; mesc_output_dir is the one
# passed to create_score_path_dict further down.
mesc_input_dir = os.path.join(main_input_dir, "mESC/filtered_L2_E7.5_rep2")
mesc_output_dir = os.path.join(main_output_dir, "mESC/filtered_L2_E7.5_rep2")

def create_score_path_dict(
    selected_features: list[str], 
    output_dir: str
    ) -> dict[str, str]:
    """
    Creates a dictionary of file paths to each score file for a given set of selected features.

    Arguments:
        selected_features (list[str]): List of selected feature score names. Names this
            function does not know about are left out of the result and reported with a
            ``UserWarning`` so that a typo does not silently drop a feature.
        output_dir (str): Output directory for the sample

    Returns:
        selected_feature_path_dict (dict[str, str]): A dictionary containing the recognized
        selected feature names (in the order given) along with the path to the data file
        for that feature

    Raises:
        AssertionError: If the path for a selected feature is neither an existing file
        nor an existing directory.
    """
    import warnings

    # Several features share one file, so build each shared path once.
    inferred_score_path = os.path.join(output_dir, "inferred_grns/inferred_score_df.parquet")
    peak_to_gene_path = os.path.join(output_dir, "peak_to_gene_correlation.parquet")

    feature_score_file_path_dict = {
        'mean_TF_expression' : inferred_score_path,
        'mean_peak_accessibility' : inferred_score_path,
        'mean_TG_expression' : inferred_score_path,
        'cicero_score' : os.path.join(output_dir, "cicero_peak_to_tg_scores.parquet"),
        'TSS_dist_score' : peak_to_gene_path,
        'correlation' : peak_to_gene_path,
        'homer_binding_score' : os.path.join(output_dir, "homer_tf_to_peak.parquet"),
        'sliding_window_score' : os.path.join(output_dir, "sliding_window_tf_to_peak_score.parquet"),
        'string_combined_score' : inferred_score_path,
        'string_experimental_score' : inferred_score_path,
        'string_textmining_score' : inferred_score_path
    }

    # Surface unrecognized names instead of dropping them without a trace.
    unknown_features = [
        feature_name for feature_name in selected_features
        if feature_name not in feature_score_file_path_dict
    ]
    if unknown_features:
        warnings.warn(
            f'Ignoring unrecognized feature names: {unknown_features}',
            stacklevel=2,
        )

    selected_feature_path_dict = {
        feature_name: feature_score_file_path_dict[feature_name]
        for feature_name in selected_features
        if feature_name in feature_score_file_path_dict
    }

    # A parquet dataset may be a single file or a directory of part files.
    for feature_name, path in selected_feature_path_dict.items():
        assert os.path.isfile(path) or os.path.isdir(path), f'Error: {path} is not a file or directory'

    return selected_feature_path_dict

def check_for_features_in_combined_score_df(feature_score_dict: dict[str,str]):
    """
    Collect the feature names that come from the combined score DataFrame.

    Walks the keys of ``feature_score_dict`` in order, prints a line for each key that
    is listed in ``COMBINED_SCORE_DF_FEATURES``, and returns those keys as a list.

    Arguments:
        feature_score_dict (dict[str, str]): Feature name -> score file path.

    Returns:
        list[str]: Keys of ``feature_score_dict`` found in ``COMBINED_SCORE_DF_FEATURES``.
    """
    found_in_combined = []
    for name in feature_score_dict:
        # Guard clause: anything outside the combined table is not reported here.
        if name not in COMBINED_SCORE_DF_FEATURES:
            continue
        print(f'  - {name} in feature score path')
        found_in_combined.append(name)

    return found_in_combined

def load_melted_inferred_grn_ddf(inferred_net_path: str, feature_scores: list[str]):
    """
    Load the long-format inferred score table and reshape it to one column per score.

    Steps:
      1. Read the parquet at ``inferred_net_path`` with dask (pyarrow engine).
      2. Keep only rows whose ``score_type`` is in ``feature_scores``.
      3. Average ``score_value`` per (source_id, peak_id, target_id, score_type).
      4. Materialize the result with ``.compute()`` and pivot so that each
         ``score_type`` becomes its own column.

    Arguments:
        inferred_net_path (str): Path to the parquet holding the long-format scores.
        feature_scores (list[str]): ``score_type`` values to keep.

    Returns:
        The pivoted table with ``source_id``, ``peak_id`` and ``target_id`` as regular
        columns plus one column per retained ``score_type``.
    """
    edge_keys = ["source_id", "peak_id", "target_id"]

    long_scores = dd.read_parquet(inferred_net_path, engine="pyarrow")
    is_requested = long_scores["score_type"].isin(feature_scores)
    long_scores = long_scores[is_requested]

    # Collapse repeated (edge, score_type) rows to their mean before leaving dask.
    per_edge_mean = long_scores.groupby(edge_keys + ["score_type"])["score_value"].mean()
    per_edge_mean_pdf = per_edge_mean.reset_index().compute()

    # After the mean there is one value per cell, so "first" just picks it up.
    pivoted = per_edge_mean_pdf.pivot_table(
        index=edge_keys,
        columns="score_type",
        values="score_value",
        aggfunc="first"
    )

    return pivoted.reset_index()

def load_individual_score_dataframes(score_path_dict):
    """
    Read every feature that has its own parquet file into a pandas DataFrame.

    Features listed in ``COMBINED_SCORE_DF_FEATURES`` are skipped here. Each remaining
    file is read with the pyarrow engine and its index is reset (old index dropped).

    Arguments:
        score_path_dict: Mapping of feature name -> parquet path.

    Returns:
        dict: Feature name -> loaded DataFrame, for the features that were not skipped.
    """
    loaded_frames = {}
    # Lazy filter, so the print/read sequence stays interleaved per feature.
    standalone_items = (
        (name, parquet_path)
        for name, parquet_path in score_path_dict.items()
        if name not in COMBINED_SCORE_DF_FEATURES
    )
    for name, parquet_path in standalone_items:
        print(f'  - Loading {name} DataFrame')
        frame = pd.read_parquet(parquet_path, engine="pyarrow")
        loaded_frames[name] = frame.reset_index(drop=True)

    return loaded_frames

In [45]:
# Feature scores to load for this run. Every name must be a key known to
# create_score_path_dict.
selected_features = [
    'mean_TF_expression',
    'mean_peak_accessibility',
    'mean_TG_expression',
    'cicero_score',
    'TSS_dist_score', 
    'correlation',
    'homer_binding_score', 
    'sliding_window_score', 
    'string_combined_score', 
    'string_experimental_score', 
    'string_textmining_score'
    ]

# Resolve (and existence-check) the score file for each selected feature of the mESC sample.
mesc_score_paths: dict = create_score_path_dict(selected_features, mesc_output_dir)

# Features with their own parquet file are loaded one DataFrame per feature.
print("\nLoading and combining individual feature scores")
feature_score_dataframes: dict = load_individual_score_dataframes(mesc_score_paths)

# The remaining features are the ones listed in COMBINED_SCORE_DF_FEATURES.
print('\nChecking for scores originating from the combined feature score DataFrame')
combined_df_features: list[str] = check_for_features_in_combined_score_df(mesc_score_paths)

if len(combined_df_features) > 0:
    print('\nLoading combined feature score dataframe')
    # All combined features map to the same file in create_score_path_dict, so the
    # path of the first one is used for the single read.
    inferred_df_path = mesc_score_paths[combined_df_features[0]] # get path for the first feature name in combined_df_feature
    inferred_df = load_melted_inferred_grn_ddf(inferred_df_path, combined_df_features)
    print('\tDone!')
    
    # Store one (source_id, peak_id, target_id, <feature>) frame per combined feature
    # alongside the individually loaded ones.
    # NOTE(review): a feature with no rows in the file would have no column in
    # inferred_df after the pivot, and the selection below would raise KeyError.
    print("\nSplitting off combined scores into individual dataframes")
    for feature_name in combined_df_features:
        feature_df = inferred_df[["source_id", "peak_id", "target_id", feature_name]]
        feature_score_dataframes[feature_name] = feature_df
    
else:
    print('  - No scores from the combined feature score DataFrame')




Loading and combining individual feature scores
  - Loading cicero_score DataFrame
  - Loading TSS_dist_score DataFrame
  - Loading correlation DataFrame
  - Loading homer_binding_score DataFrame
  - Loading sliding_window_score DataFrame
Checking for scores originating from the combined feature score DataFrame
  - mean_TF_expression in feature score path
  - mean_peak_accessibility in feature score path
  - mean_TG_expression in feature score path
  - string_combined_score in feature score path
  - string_experimental_score in feature score path
  - string_textmining_score in feature score path

Loading combined feature score dataframe
	Done!

Splitting off combined scores into individual dataframes
cicero_score
                peak_id target_id  cicero_score
0  chr1:9747331-9756094     Tcf24      0.774205
1  chr1:9767609-9800330     Tcf24      0.795169
2  chr1:9816059-9816659     Tcf24      0.563231
3  chr1:9830375-9830975     Tcf24      0.481638
4  chr1:9847961-9859633     Tcf24   

In [50]:
# Union of the peak -> target gene pairs found in the Cicero scores and in the
# peak-to-gene correlation scores (outer merge on both key columns).
cicero_peak_to_tg = feature_score_dataframes["cicero_score"].loc[:, ["peak_id", "target_id"]]
corr_peak_to_tg = feature_score_dataframes["correlation"].loc[:, ["peak_id", "target_id"]]
peak_to_tg_edges = cicero_peak_to_tg.merge(
    corr_peak_to_tg,
    how="outer",
    on=["peak_id", "target_id"],
)

In [51]:
# Display the combined peak -> target gene edge table.
peak_to_tg_edges

Unnamed: 0,peak_id,target_id
0,chr1:9747331-9756094,Tcf24
1,chr1:9767609-9800330,Tcf24
2,chr1:9816059-9816659,Tcf24
3,chr1:9830375-9830975,Tcf24
4,chr1:9847961-9859633,Tcf24
...,...,...
2039887,chr11:22763395-22763995,Wdpcp
2039888,chr4:148592054-148592654,Zfp982
2039889,chr1:165285923-165286523,Atp1b1
2039890,chr15:38335981-38336581,Grhl2


In [52]:
# Union of the TF -> peak pairs found in the sliding-window scores and in the
# Homer binding scores (outer merge on both key columns).
sliding_window_tf_to_peak = feature_score_dataframes["sliding_window_score"].loc[:, ["source_id", "peak_id"]]
homer_tf_to_peak = feature_score_dataframes["homer_binding_score"].loc[:, ["source_id", "peak_id"]]
tf_to_peak_edges = sliding_window_tf_to_peak.merge(
    homer_tf_to_peak,
    how="outer",
    on=["source_id", "peak_id"],
)

In [53]:
# Display the combined TF -> peak edge table.
tf_to_peak_edges

Unnamed: 0,source_id,peak_id
0,Ahctf1,chr17:3054703-3055303
1,Ahctf1,chr17:3102194-3102794
2,Ahctf1,chr17:3162106-3162706
3,Ahctf1,chr17:3375042-3375642
4,Ahctf1,chr17:3616249-3616849
...,...,...
18438861,Fosl2,chr13:3746587-3747187
18438862,Fosl2,chr7:133012640-133013240
18438863,Fosl2,chr2:170160109-170160709
18438864,Fosl2,chr2:165329517-165330117


In [54]:
# Chain TF -> peak and peak -> TG edges through their shared peak. Only peaks present
# in both tables survive the inner join, and each peak contributes one row per
# (TF, TG) combination, so the result is much larger than either input.
tf_peak_tg_edges = tf_to_peak_edges.merge(peak_to_tg_edges, how="inner", on="peak_id")

In [55]:
# Display the joined TF -> peak -> TG edge table.
tf_peak_tg_edges

Unnamed: 0,source_id,peak_id,target_id
0,Ahctf1,chr17:3054703-3055303,Gm5679
1,Ahctf1,chr17:3054703-3055303,Gm49797
2,Ahctf1,chr17:3054703-3055303,Gm49221
3,Ahctf1,chr17:3054703-3055303,Scaf8
4,Ahctf1,chr17:3054703-3055303,Tiam2
...,...,...,...
187610268,Smad4,chr2:69374194-69374794,Ssb
187610269,Smad4,chr2:69374194-69374794,Mettl5
187610270,Smad4,chr2:69374194-69374794,Spc25
187610271,Smad4,chr2:69374194-69374794,Ubr3


Aggregating the TF-TG edges by the number of peaks between them

In [70]:
# Count the rows of tf_peak_tg_edges for every (TF, TG) pair.
# NOTE(review): this equals the number of connecting peaks only if each
# (source_id, peak_id, target_id) row is unique -- confirm upstream.
tf_tg_edges = (
    tf_peak_tg_edges
    .groupby(["source_id", "target_id"])
    .size()
    .reset_index(name="edge_count")
)

: 

In [2]:
import csv
import pandas as pd
def read_ground_truth(ground_truth_file):
    """
    Read a tab-separated ground truth network into a DataFrame.

    The first line is taken as the header, quote characters are treated as ordinary
    text, and malformed lines are skipped. The ``Source`` and ``Target`` columns are
    renamed to ``source_id`` and ``target_id``; other columns are left as they are.

    Arguments:
        ground_truth_file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        pd.DataFrame: The ground truth table with the renamed columns.
    """
    column_renames = {"Source": "source_id", "Target": "target_id"}
    raw_table = pd.read_csv(
        ground_truth_file,
        sep='\t',
        header=0,
        quoting=csv.QUOTE_NONE,
        on_bad_lines='skip',
    )
    return raw_table.rename(columns=column_renames)


# Reference network used as ground truth; read as a tab-separated file whose
# Source/Target columns become source_id/target_id (see read_ground_truth).
# NOTE(review): the file name suggests ChIP-seq derived mouse ESC edges -- confirm.
ground_truth_file = "/gpfs/Labs/Uzun/DATA/PROJECTS/2024.SC_MO_TRN_DB.MIRA/REPOSITORY/CURRENT/REFERENCE_NETWORKS/RN111_ChIPSeq_BEELINE_Mouse_ESC.tsv"
ground_truth_df = read_ground_truth(ground_truth_file)



In [4]:
import dask.dataframe as dd
# Lazily open the saved edge table as a dask DataFrame (despite the `_df` suffix,
# nothing is loaded into memory here).
# NOTE(review): no visible cell writes tmp/tf_peak_tg_edges.parquet; presumably it was
# saved from `tf_peak_tg_edges` above. Add the write step so a fresh
# Restart-and-Run-All can reproduce this file -- TODO confirm.
inferred_network_df = dd.read_parquet("/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/src/testing_scripts/tmp/tf_peak_tg_edges.parquet", engine="pyarrow")


In [None]:
def label_edges_with_ground_truth(inferred_network_dd, ground_truth_df):
    """
    Add a binary ``label`` column marking which inferred edges are in the ground truth.

    An edge is labeled 1 when its (source_id, target_id) pair appears in
    ``ground_truth_df`` and 0 otherwise. The comparison is case-insensitive: both the
    ground truth pairs and the inferred pairs are upper-cased before matching.
    Previously only the ground truth side was upper-cased, so mixed-case gene symbols
    in the inferred network (e.g. "Sox2") could never match and were labeled 0.
    The ``source_id`` / ``target_id`` values in the returned table keep their
    original case.

    Arguments:
        inferred_network_dd: Dask DataFrame with string ``source_id`` and ``target_id``
            columns. Labeling is applied partition by partition via ``map_partitions``.
        ground_truth_df: pandas DataFrame with string ``source_id`` and ``target_id``
            columns.

    Returns:
        The result of ``map_partitions``: the input table with an added int64
        ``label`` column (still lazy; call ``.compute()`` to materialize).
    """
    import numpy as np

    ground_truth_pairs = set(zip(
        ground_truth_df["source_id"].str.upper(),
        ground_truth_df["target_id"].str.upper()
    ))

    def label_partition(df):
        # Upper-case only for the lookup so it matches how ground_truth_pairs was built.
        tf_tg_pairs = zip(df["source_id"].str.upper(), df["target_id"].str.upper())
        labels = np.fromiter(
            (pair in ground_truth_pairs for pair in tf_tg_pairs),
            dtype=np.int64,
            count=len(df),
        )
        # assign() returns a new frame, so the input partition is never mutated.
        return df.assign(label=labels)

    # Explicit meta so dask does not have to infer the output schema.
    inferred_network_dd = inferred_network_dd.map_partitions(
        label_partition,
        meta=inferred_network_dd._meta.assign(label=np.int64(0))
    )

    return inferred_network_dd

# Attach the ground truth label to every row of the inferred edge table. The result is
# still a lazy dask object; nothing is computed until .compute() is called.
tf_tg_edges_in_gt = label_edges_with_ground_truth(inferred_network_df, ground_truth_df)

In [None]:
# Run the labeling and collect the full result into memory.
# NOTE(review): if the parquet holds the ~187M-row tf_peak_tg_edges table shown
# earlier, this may not fit in RAM -- confirm before running.
tf_tg_edges_in_gt_df = tf_tg_edges_in_gt.compute()

In [None]:
# Display the labeled edge table.
tf_tg_edges_in_gt_df