## Testing if Ground Truth Peaks are Mapped to Nearest Gene

For some of the ground truth files, I am not sure whether the peaks underlying the TF-TG edges are mapped to their nearest gene.

In [None]:
import pandas as pd
from grn_inference.utils import find_genes_near_peaks, format_peaks
import pybedtools


# --- ChIP-Atlas peaks -------------------------------------------------------
print("Reading and formatting ChIP-Atlas peaks")
chipatlas_peaks = pd.read_csv(
    "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/output/chipseq_homer/chipatlas_peaks.csv",
    header=0,
)
# format_peaks is a project helper; presumably it expands each peak ID into
# chromosome/start/end columns -- TODO confirm against grn_inference.utils.
chipatlas_bed_df = format_peaks(chipatlas_peaks["peak_id"])
chipatlas_bed_df = chipatlas_bed_df.rename(columns={"chromosome": "chrom"})
chipatlas_bed_df = chipatlas_bed_df.drop_duplicates()
print(chipatlas_bed_df.head())

# --- TSS annotation ---------------------------------------------------------
print("Reading and formatting TSS bed file")
tss_bed_df = pd.read_parquet(
    "output/DS011_mESC/DS011_mESC_sample1/tmp/ensembl.parquet"
).rename(columns={"chr": "chrom", "gene_id": "name"})
print(tss_bed_df.head())

In [None]:
def _to_bed_frame(frame, columns):
    """Return a copy of ``frame`` limited to ``columns`` with usable intervals.

    ``start`` and ``end`` are coerced to nullable integers (unparseable values
    become missing), rows with a missing coordinate are dropped, and only rows
    where ``end`` is strictly greater than ``start`` are kept.
    """
    bed = frame[columns].copy()
    for coord in ("start", "end"):
        bed[coord] = pd.to_numeric(bed[coord], errors="coerce").astype("Int64")
    bed = bed.dropna(subset=["start", "end"])
    return bed[bed["end"] > bed["start"]]


# Peaks keep only the three coordinate columns; the TSS table additionally
# keeps the gene identifier in the BED "name" column.
required_cols = ["chrom", "start", "end"]
chipatlas_bed_df = _to_bed_frame(chipatlas_bed_df, required_cols)
tss_bed_df = _to_bed_frame(tss_bed_df, required_cols + ["name"])

# Write headerless, tab-separated files and wrap them as BedTool objects.
for bed_frame, bed_path in (
    (chipatlas_bed_df, "chipatlas_tmp.bed"),
    (tss_bed_df, "tss_tmp.bed"),
):
    bed_frame.to_csv(bed_path, sep="\t", header=False, index=False)

chipatlas_bed = pybedtools.BedTool("chipatlas_tmp.bed")
tss_bed = pybedtools.BedTool("tss_tmp.bed")

In [None]:
# Pair each peak with every TSS record found within 1 Mb upstream or
# downstream of it. The window size is given as an integer: pybedtools turns
# keyword values into command-line strings, and a float such as 1e6 would be
# passed to `bedtools window -w` as "1000000.0" rather than a whole number.
peak_tss_overlap = chipatlas_bed.window(tss_bed, w=1_000_000)

In [None]:
# Quick look at the raw window output, loaded without explicit column names.
peak_tss_overlap_df = peak_tss_overlap.to_dataframe()
peak_tss_overlap_df.head(n=5)

In [None]:
# Column names and dtypes for the window output, in file order.
# Each output row is the peak record followed by the TSS record. The peak BED
# written earlier in this notebook has three columns (chrom, start, end) and
# the TSS BED has four (chrom, start, end, name), so there are seven fields
# and no peak_id field. Declaring an eighth "peak_id" column would shift every
# gene column by one and put the gene names under an integer dtype.
# NOTE(review): if a peak_id is needed downstream, keep it as a 4th column of
# the peak BED file and add "peak_id": str after "peak_end" here.
dtype_dict = {
    "peak_chr": str,
    "peak_start": int,
    "peak_end": int,
    "gene_chr": str,
    "gene_start": int,
    "gene_end": int,
    "gene_id": str,
}

# Convert the BedTool result to a DataFrame for further processing.
# The names are taken from dtype_dict so the two can never disagree.
peak_tss_overlap_df = peak_tss_overlap.to_dataframe(
    names=list(dtype_dict),
    dtype=dtype_dict,
    low_memory=False,  # parse the file in one pass instead of in chunks
).rename(columns={"gene_id": "target_id"}).dropna()

In [None]:
# Display the peak-to-TSS table built above as the cell output.
peak_tss_overlap_df