## Test Homer Nearest TSS from ChIP-Atlas Peaks vs RN111 BEELINE ChIP-seq Ground Truth

We need to load the Homer TF-peak dataset, map each peak to its nearest gene TSS, and then compare the resulting TF-target gene pairs against the RN111 ChIP-seq ground truth.

In [None]:
import pandas as pd
import pybedtools
import csv

In [None]:
# Absolute input locations used by the rest of the notebook.
# mm10 TSS annotation (BED) that peaks are matched against.
tss_reference_file = (
    "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/data/genome_annotation/mm10/mm10_TSS.bed"
)
# Homer TF-to-peak table (parquet).
homer_results = (
    "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output/chipseq_homer/homer_tf_to_peak.parquet"
)

def read_ground_truth(ground_truth_file):
    """Load a tab-separated ground-truth network into a DataFrame.

    The first line of the file is used as the header. Quote characters are
    treated as ordinary text, and lines that pandas cannot parse are skipped
    rather than raising.

    Parameters
    ----------
    ground_truth_file : str or path-like
        Location of the tab-separated ground-truth file.

    Returns
    -------
    pandas.DataFrame
        The parsed table with the ``Source`` and ``Target`` columns renamed
        to ``source_id`` and ``target_id``; any other columns are kept as-is.
    """
    column_aliases = {"Source": "source_id", "Target": "target_id"}
    raw_table = pd.read_csv(
        ground_truth_file,
        sep='\t',
        header=0,
        quoting=csv.QUOTE_NONE,
        on_bad_lines='skip',
    )
    return raw_table.rename(columns=column_aliases)

We need to convert the peaks to BED format so they can be loaded with pybedtools.

In [None]:
# Read the Homer TF-to-peak table, then replace whatever index came with it
# by a fresh 0..n-1 index.
homer_df = pd.read_parquet(homer_results, engine="pyarrow")
homer_df = homer_df.reset_index(drop=True)
homer_df

In [None]:
# Break each "chrom:start-end" peak identifier into its coordinate parts.
# The helper columns are written onto homer_df itself.
chrom_and_range = homer_df["peak_id"].str.split(":", expand=True)
homer_df[["chrom", "start_end"]] = chrom_and_range
homer_df[["start", "end"]] = homer_df["start_end"].str.split("-", expand=True)

# BED positions must be numeric, so cast the split-out strings to integers.
for coord in ("start", "end"):
    homer_df[coord] = homer_df[coord].astype(int)

# Keep the columns in BED order (chrom, start, end, then the TF and its score)
# and serialise them as header-less, tab-separated text.
bed_columns = ["chrom", "start", "end", "source_id", "homer_binding_score"]
homer_bed_df = homer_df[bed_columns]
bed_string = homer_bed_df.to_csv(sep="\t", header=False, index=False)
homer_bed_df.head()

In [None]:
# The TSS reference is opened directly from its BED file on disk.
nearest_tss_bed = pybedtools.BedTool(tss_reference_file)
# The peaks are loaded from the in-memory BED text built from the Homer table.
homer_bed = pybedtools.BedTool(bed_string, from_string=True)

In [None]:
# Sort both interval sets before looking up the closest features.
nearest_tss_bed_sorted = nearest_tss_bed.sort()
homer_bed_sorted = homer_bed.sort()

# For every peak, find the closest TSS record; d=True requests the distance
# between the two as an extra trailing column.
homer_closest_tss = homer_bed_sorted.closest(
    nearest_tss_bed_sorted,
    d=True,
)

In [None]:
# Column layout of the `closest` output: the five BED columns written for each
# peak, the columns of the matched TSS record, and the trailing distance.
# The peak chromosome is named explicitly: with one name fewer than fields,
# pandas silently uses the first field as the index and the labels only line
# up by accident.
closest_columns = [
    "peak_chr", "peak_start", "peak_end", "source_id", "homer_binding_score",
    "tss_chr", "tss_start", "tss_end", "target_id", "strand", "strand2",
    "distance",
]

# Fail loudly instead of mislabelling columns if the TSS reference does not
# have the number of fields this layout assumes.
closest_field_count = homer_closest_tss.field_count()
if closest_field_count != len(closest_columns):
    raise ValueError(
        f"bedtools closest output has {closest_field_count} fields, "
        f"but {len(closest_columns)} column names were provided; "
        "check the column layout of the TSS reference file."
    )

raw_homer_closest_tss_df = homer_closest_tss.to_dataframe(names=closest_columns)
raw_homer_closest_tss_df.head()

In [None]:
# RN111 reference network (tab-separated), loaded with read_ground_truth so the
# Source/Target columns come back as source_id/target_id.
ground_truth_file = (
    "/gpfs/Labs/Uzun/DATA/PROJECTS/2024.SC_MO_TRN_DB.MIRA/REPOSITORY/CURRENT/REFERENCE_NETWORKS/RN111_ChIPSeq_BEELINE_Mouse_ESC.tsv"
)
ground_truth_df = read_ground_truth(ground_truth_file)

In [None]:
# Upper-case the TF and target gene identifiers (written back onto the raw table).
# NOTE(review): the ground-truth IDs are not upper-cased in the visible cells;
# confirm RN111 already uses upper-case symbols.
for gene_column in ("source_id", "target_id"):
    raw_homer_closest_tss_df[gene_column] = raw_homer_closest_tss_df[gene_column].str.upper()

# Keep only the TF, its nearest-TSS gene, the Homer score and the peak-to-TSS distance.
edge_columns = ["source_id", "target_id", "homer_binding_score", "distance"]
homer_closest_tss_df = raw_homer_closest_tss_df[edge_columns]
homer_closest_tss_df

In [None]:
# TFs and target genes that occur in both the Homer predictions and the ground truth.
valid_sources = set(homer_closest_tss_df["source_id"]).intersection(set(ground_truth_df["source_id"]))
valid_targets = set(homer_closest_tss_df["target_id"]).intersection(set(ground_truth_df["target_id"]))

# Keep a Homer row only when both its TF and its target are shared.
homer_is_shared = (
    homer_closest_tss_df["source_id"].isin(valid_sources)
    & homer_closest_tss_df["target_id"].isin(valid_targets)
)
homer_shared_genes = homer_closest_tss_df[homer_is_shared]

# Apply the same restriction to the ground-truth edges.
ground_truth_is_shared = (
    ground_truth_df["source_id"].isin(valid_sources)
    & ground_truth_df["target_id"].isin(valid_targets)
)
ground_truth_shared_genes = ground_truth_df[ground_truth_is_shared]

In [None]:
# Outer-join predictions and ground truth on the TF/target pair; the `_merge`
# indicator column records which side(s) each pair came from.
pair_keys = ["source_id", "target_id"]
merged_df = homer_shared_genes.merge(
    ground_truth_shared_genes,
    on=pair_keys,
    how="outer",
    indicator=True,
)
merged_df = merged_df.convert_dtypes(convert_floating=True)
# NOTE(review): this keeps only the first row of every TF/target pair, so when
# several peaks link the same TF to the same gene only the first peak's score
# and distance survive - confirm that is intended.
merged_df = merged_df.drop_duplicates(subset=pair_keys)
merged_df

In [None]:
# Partition the merged pairs by the side of the outer join they came from.
merge_side = merged_df["_merge"]
in_both = merged_df[merge_side == "both"]
only_in_homer = merged_df[merge_side == "left_only"]
only_in_ground_truth = merged_df[merge_side == "right_only"]

In [None]:
def clean_data(df):
    """Average the Homer binding score per TF/target pair.

    Only the ``source_id``, ``target_id`` and ``homer_binding_score`` columns
    are used; any other columns (for example ``distance``, ``Relationship``
    or ``_merge``) are ignored and do not need to be present. The input frame
    is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least ``source_id``, ``target_id`` and
        ``homer_binding_score`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (``source_id``, ``target_id``) pair with the mean score in
        ``homer_binding_score_mean``, sorted from highest to lowest mean.

    Raises
    ------
    KeyError
        If one of the three required columns is missing.
    """
    pair_columns = ['source_id', 'target_id']
    # Select just what the aggregation needs instead of dropping a hard-coded
    # list of columns that may not exist in every input.
    scores = df[pair_columns + ['homer_binding_score']]
    mean_scores = (
        scores
        .groupby(pair_columns)
        .agg(homer_binding_score_mean=('homer_binding_score', 'mean'))
        .reset_index()
    )
    # Highest mean score first.
    return mean_scores.sort_values(['homer_binding_score_mean'], ascending=[False])

# Per-pair mean score for predictions that are also present in the ground
# truth, with the score column relabelled for plotting.
in_both_clean = clean_data(in_both)
in_both_clean = in_both_clean.rename(
    columns={'homer_binding_score_mean': 'Correct TF Binding Prediction'}
)
in_both_clean

In [None]:
# Per-pair mean score for predictions that are absent from the ground truth,
# with the score column relabelled for plotting.
only_in_homer_clean = clean_data(only_in_homer)
only_in_homer_clean = only_in_homer_clean.rename(
    columns={'homer_binding_score_mean': 'TF Predictions Not in Ground Truth'}
)
only_in_homer_clean

In [None]:
# Put both groups of predictions in one table: an outer join on the TF/target
# pair, with one score column per group.
pred_df = in_both_clean.merge(
    only_in_homer_clean,
    on=["source_id", "target_id"],
    how="outer",
)
pred_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Create the figure and axes explicitly and hand the axes to seaborn.
# (`plt.Figure()` only builds a detached Figure object; seaborn would draw on
# a different, implicitly created figure and `fig` would stay empty.)
fig, a = plt.subplots()
sns.boxplot(data=pred_df, showfliers=False, ax=a)
a.set_ylabel("Homer Binding Score")
a.set_title("Homer Binding Scores for ChIP-seq TF Binding Predictions");

In [None]:
# (column in pred_df, bar colour, legend label) for each group of predictions.
hist_specs = [
    ("Correct TF Binding Prediction", '#4195df', "Correct TF Binding Predictions"),
    ("TF Predictions Not in Ground Truth", '#dc8634', "TF Predictions Not in Ground Truth"),
]

# The outer merge that builds pred_df leaves each score column empty for rows
# contributed only by the other table, so drop the missing values and hand
# matplotlib plain floats.
hist_scores = {
    column: pred_df[column].dropna().astype(float)
    for column, _, _ in hist_specs
}

# Use one common value range so both histograms share identical bin edges and
# their bar heights can be compared directly.
non_empty_scores = [scores for scores in hist_scores.values() if len(scores) > 0]
if non_empty_scores:
    shared_range = (
        min(scores.min() for scores in non_empty_scores),
        max(scores.max() for scores in non_empty_scores),
    )
else:
    shared_range = None

plt.figure(figsize=(10,8))
for column, color, label in hist_specs:
    plt.hist(
        hist_scores[column],
        bins=150,
        range=shared_range,
        alpha=0.7,
        color=color,
        label=label
    )
plt.title("Homer Binding Scores for ChIP-seq TF Binding Predictions", fontsize=16)
plt.xlabel("Homer Binding Score", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Place the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0., fontsize=14);
