In [1]:
import os
import subprocess
import pandas as pd

In [2]:
class HomerEnricher:
    """
    Runs HOMER's ``findMotifsGenome.pl`` on regions of interest against a background set.

    Both region sets are DataFrames that must provide ``chrm``, ``start`` and
    ``end`` columns. They are written as temporary 6-column, tab-separated BED
    files inside ``homer_outdir`` and handed to HOMER together with the genome
    and the motif file.

    Files written inside ``homer_outdir``:
        tmp_roi_homer.bed         BED file built from ``roi_df``
        tmp_background_homer.bed  BED file built from ``background_df``
        motif_odds.tsv            captured stdout of the HOMER command
        homer_scan.log            captured stderr of the HOMER command
    """
    def __init__(self, roi_df, background_df, genome_filepath, homer_pwm_motifs, homer_outdir, threads=32):
        """
        Stores the inputs and derives the paths of all files this object writes.

        Parameters
        ----------
        roi_df : pandas.DataFrame
            Regions of interest; needs ``chrm``, ``start`` and ``end`` columns.
        background_df : pandas.DataFrame
            Background regions; same column requirements as ``roi_df``.
        genome_filepath : str
            Passed to ``findMotifsGenome.pl`` as its genome argument.
        homer_pwm_motifs : str
            Motif file passed to ``findMotifsGenome.pl`` via ``-mknown``.
        homer_outdir : str
            Directory that receives the temporary BED files, the log and the output.
        threads : int, default 32
            Value passed to ``findMotifsGenome.pl`` via ``-p``.
        """
        self.genome = genome_filepath
        self.homer_pwms = homer_pwm_motifs
        self.roi = roi_df
        self.background = background_df
        self.homer_outdir = homer_outdir
        self.threads = threads

        self.homer_outfile = self._get_homer_outfile()
        self.homer_roi = self._get_roi_homer()
        self.homer_background = self._get_background_homer()
        self.logfile = self._get_logfile()

    def _get_homer_outfile(self):
        """Path of the file that receives the stdout of the HOMER command."""
        return os.path.join(self.homer_outdir, "motif_odds.tsv")

    def _get_roi_homer(self):
        """Path of the temporary BED file written for the regions of interest."""
        return os.path.join(self.homer_outdir, "tmp_roi_homer.bed")

    def _get_background_homer(self):
        """Path of the temporary BED file written for the background regions."""
        return os.path.join(self.homer_outdir, "tmp_background_homer.bed")

    def _get_logfile(self):
        """Path of the file that receives the stderr of the HOMER command."""
        return os.path.join(self.homer_outdir, "homer_scan.log")

    def _process_homer(self, df_row):
        """
        Builds the three extra BED columns for a single row.

        Returns a Series keyed "3", "4", "5" holding the peak name
        ``{chrm}_{start}_{end}``, a constant 0 and the strand placeholder ".".
        Kept for callers that work row by row; the bulk conversion in
        ``_create_homer_compatible_bed`` builds the same columns vectorised.
        """
        chrname = df_row.chrm
        start = df_row.start
        end = df_row.end
        peak_name = f"{chrname}_{start}_{end}"
        irrelevant_col = 0
        strand = "."
        return pd.Series({"3": peak_name, "4": irrelevant_col, "5": strand})

    def _create_homer_compatible_bed(self, df, save_file):
        """
        Converts an roi file with chromosomal coordinates to a homer compatible one

        Writes one tab-separated line per row of ``df`` to ``save_file`` without
        header or index: chrm, start, end, ``{chrm}_{start}_{end}``, 0, ".".
        An empty ``df`` produces an empty file.
        """
        # Drop the incoming index: rows are matched by position, so a frame with
        # a non-unique index can no longer multiply rows (an index-on-index
        # merge would emit every combination of the duplicated labels).
        bed = df.loc[:, ["chrm", "start", "end"]].reset_index(drop=True)
        peak_names = (
            bed["chrm"].astype(str) + "_" + bed["start"].astype(str) + "_" + bed["end"].astype(str)
        )
        bed = bed.assign(**{"3": peak_names, "4": 0, "5": "."})
        bed.to_csv(save_file, index=False, header=False, sep="\t")
        return

    def _homer_enrich(self):
        """
        Runs ``findMotifsGenome.pl`` and returns the ``subprocess.CompletedProcess``.

        stdout is redirected to ``self.homer_outfile`` and stderr to
        ``self.logfile``. A non-zero exit status does not raise; it triggers a
        ``RuntimeWarning`` pointing at the log file so a failed run is not silent.
        """
        cmd = [
            "findMotifsGenome.pl", self.homer_roi, self.genome, self.homer_outdir,
            "-bg", self.homer_background, "-mknown", self.homer_pwms, "-p", str(self.threads),
            "-size", "given",
            ]
        with open(self.logfile, "w") as lf, open(self.homer_outfile, "w") as of:
            results = subprocess.run(cmd, stdout=of, stderr=lf)
        if results.returncode != 0:
            # local import: the module-level import cell does not provide `warnings`
            import warnings
            warnings.warn(
                f"findMotifsGenome.pl exited with status {results.returncode}; see {self.logfile}",
                RuntimeWarning,
            )
        return results

    def run_enrichment(self):
        """
        Creates the output directory, writes both BED files and runs HOMER.

        Returns the ``subprocess.CompletedProcess`` of the HOMER call so callers
        can inspect ``returncode``.
        """
        # making sure there is a directory on path
        os.makedirs(self.homer_outdir, exist_ok=True)
        # creating homer compatible tmp roi file
        self._create_homer_compatible_bed(self.roi, self.homer_roi)
        # creating homer compatible tmp bg file
        self._create_homer_compatible_bed(self.background, self.homer_background)
        # running homer
        return self._homer_enrich()



In [3]:
# Libraries to process; each one has its own sub-directory under `data_dir`.
libs = ["CC", "ATF2", "CTCF", "FOXA1", "LEF1", "SCRT1", "TCF7L2", "16P12_1"]
data_dir = "/data7/deepro/starrseq/4_ml_classification_fragment_category/data"
genome_fasta = "/data5/deepro/genomes/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
homer_motifs = "/data7/deepro/starrseq/4_ml_classification_fragment_category/data/HOCOMOCOv11_core_HUMAN_mono_homer_format_0.001.motif"

for lib in libs:
    print(lib)
    compare_file = os.path.join(data_dir, lib, "cov_conf_compare.csv.gz")
    compare_df = pd.read_csv(compare_file)

    # Rows with ytarget == 1 in coverage decile 9 where the resnet prediction
    # exceeds 0.9 while the linear prediction stays below 0.9.
    is_hcov_hconf = (
        (compare_df.ytarget == 1)
        & (compare_df.cov_decile == 9)
        & (compare_df.ypred_resnet > 0.9)
        & (compare_df.ypred_linear < 0.9)
    )
    # Selected rows are the regions of interest; every other row is background.
    hcov_hconf_df = compare_df.loc[is_hcov_hconf]
    other_df = compare_df.loc[~is_hcov_hconf]

    homer_outdir = os.path.join(data_dir, lib, "compare_homer")
    he = HomerEnricher(hcov_hconf_df, other_df, genome_fasta, homer_motifs, homer_outdir)
    he.run_enrichment()



ATF2
CTCF
FOXA1
LEF1
SCRT1
TCF7L2
16P12_1
