In [1]:
import os
import pandas as pd

In [2]:
# Root folder of the enrichment results, relative to this notebook.
results_dir = "../data/"

# Library names; each one is used as a sub-directory of `results_dir`.
libs = [
    "CC",
    "ATF2",
    "CTCF",
    "FOXA1",
    "LEF1",
    "SCRT1",
    "TCF7L2",
    "16P12_1",
]

# Comparison types; each one is used as a sub-directory below the method folder.
c_types = [
    "peaks_vs_notpeaks",
    "active_vs_inactive",
    "responsive_vs_nonresponsive",
    "induced_vs_repressed",
    "repressed_vs_induced",
]

# Motif-enrichment tools whose result files are read.
methods = ["homer", "meme"]


In [3]:
def get_homer_results(resdir, lib, ct, alpha=0.01):
    """Load one HOMER known-motif table and keep its significant rows.

    Reads ``<resdir>/<lib>/homer/<ct>/knownResults.txt`` as a tab-separated
    table, shortens every ``Motif Name`` to the text before its first ``/``
    and keeps the rows whose ``q-value (Benjamini)`` is strictly below
    ``alpha``.

    Parameters
    ----------
    resdir : str
        Root results directory.
    lib : str
        Library sub-directory name.
    ct : str
        Comparison-type sub-directory name.
    alpha : float, default 0.01
        Exclusive upper bound on ``q-value (Benjamini)``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows in file order, with every column of the file and
        the row labels assigned when the file was read.

    Raises
    ------
    FileNotFoundError
        If the results file does not exist.
    """
    filename = os.path.join(resdir, lib, "homer", ct, "knownResults.txt")
    df = pd.read_csv(filename, sep="\t")
    # Only the first "/"-separated field is needed, so split once and take
    # element 0 instead of expanding every field into its own column (which
    # also leaves no column 0 to select when the table has no rows).
    df["Motif Name"] = df["Motif Name"].str.split("/", n=1).str[0]
    return df.loc[df["q-value (Benjamini)"] < alpha]

def get_meme_results(resdir, lib, ct, alpha=0.01):
    """Load one AME results table and keep its significant rows.

    Reads ``<resdir>/<lib>/meme/<ct>/ame.tsv`` as a tab-separated table,
    shortens every ``motif_ID`` to the text before its first ``.`` and keeps
    the rows whose ``adj_p-value`` is strictly below ``alpha``.

    Parameters
    ----------
    resdir : str
        Root results directory.
    lib : str
        Library sub-directory name.
    ct : str
        Comparison-type sub-directory name.
    alpha : float, default 0.01
        Exclusive upper bound on ``adj_p-value``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows in file order, with every column of the file and
        the row labels assigned when the file was read.

    Raises
    ------
    FileNotFoundError
        If the results file does not exist.
    """
    filename = os.path.join(resdir, lib, "meme", ct, "ame.tsv")
    df = pd.read_csv(filename, sep="\t")
    # Only the first "."-separated field is needed, so split once and take
    # element 0 instead of expanding every field into its own column (which
    # also leaves no column 0 to select when the table has no rows).
    # A one-character pattern is matched literally, so "." is a plain dot.
    df["motif_ID"] = df["motif_ID"].str.split(".", n=1).str[0]
    return df.loc[df["adj_p-value"] < alpha]

def get_top_motifs(resdir, libs, cts, method, N=30):
    """Build a side-by-side table of the first ``N`` significant motif names.

    For every (library, comparison) pair, in ``libs``-major order, the
    matching loader is called, its first ``N`` rows are taken and the motif
    name column is kept. The result has one column per pair, named
    ``"<lib>_<ct>"``; row ``i`` of a column is the ``i``-th row the loader
    returned for that pair. Columns with fewer than ``N`` rows are padded
    with NaN.

    Parameters
    ----------
    resdir : str
        Root results directory, forwarded to the loader.
    libs : list of str
        Library names.
    cts : list of str
        Comparison types.
    method : {"homer", "meme"}
        Selects the loader and the name column
        (``"Motif Name"`` or ``"motif_ID"``).
    N : int, default 30
        Maximum number of rows taken per pair.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If ``method`` is not ``"homer"`` or ``"meme"``.
    FileNotFoundError
        Propagated from the loader when a pair has no results file.
    """
    method_func_dict = {"homer": get_homer_results, "meme": get_meme_results}
    method_colname_dict = {"homer": "Motif Name", "meme": "motif_ID"}
    load_results = method_func_dict[method]
    name_col = method_colname_dict[method]

    # Labels and data are collected in the same loop so they cannot drift
    # out of step with each other.
    labels = []
    name_columns = []
    for lib in libs:
        for ct in cts:
            top_rows = load_results(resdir, lib, ct).iloc[:N]
            # The loaders return filtered rows that keep their file row
            # labels. Renumbering 0..k-1 makes the axis=1 concat line up
            # rank i with rank i, whatever labels survived the filter.
            name_columns.append(top_rows[name_col].reset_index(drop=True))
            labels.append(f"{lib}_{ct}")

    df = pd.concat(name_columns, axis=1)
    df.columns = labels
    return df

def create_motif_table(resdir, libs, cts, method, N=30):
    """Stack the first ``N`` significant motifs of each pair into one table.

    For every (library, comparison) pair, in ``libs``-major order, the
    matching loader is called, its first ``N`` rows are taken and five
    columns are kept: motif name, p-value, adjusted p-value, and the target
    and background percentages (the column names depend on ``method``). The
    per-pair tables are concatenated row-wise.

    No column records which library or comparison a row came from, so pass a
    single library and comparison when the source has to stay identifiable.

    Parameters
    ----------
    resdir : str
        Root results directory, forwarded to the loader.
    libs : list of str
        Library names.
    cts : list of str
        Comparison types.
    method : {"homer", "meme"}
        Selects the loader and the set of columns kept.
    N : int, default 30
        Maximum number of rows taken per pair.

    Returns
    -------
    pandas.DataFrame
        Rows numbered ``0..len-1``.

    Raises
    ------
    KeyError
        If ``method`` is unknown or an expected column is missing.
    FileNotFoundError
        Propagated from the loader when a pair has no results file.
    """
    method_func_dict = {"homer": get_homer_results, "meme": get_meme_results}
    method_colname_dict = {
        "homer": ["Motif Name", "P-value", "q-value (Benjamini)", "% of Target Sequences with Motif", "% of Background Sequences with Motif"],
        "meme": ["motif_ID", "p-value", "adj_p-value", "%TP", "%FP"]}
    load_results = method_func_dict[method]
    kept_columns = method_colname_dict[method]

    tables = [
        load_results(resdir, lib, ct).iloc[:N][kept_columns]
        for lib in libs
        for ct in cts
    ]
    # Without ignore_index every stacked table would reuse the same row
    # labels, so label-based lookups on the result would hit several rows.
    return pd.concat(tables, axis=0, ignore_index=True)


In [5]:
# Write one workbook per comparison type, with one sheet per (library, method).
tables_dir = os.path.join(results_dir, "tables")
# The writer does not create missing parent folders.
os.makedirs(tables_dir, exist_ok=True)

# "CC" is exported for these two comparisons only; every other library is
# exported for everything except "active_vs_inactive".
cc_comparisons = ("peaks_vs_notpeaks", "active_vs_inactive")

for c_type in c_types:
    # Collect the sheets first so that no workbook is opened when there is
    # nothing to put in it (a workbook without sheets cannot be saved by
    # every Excel engine).
    sheets = {}
    for lib in libs:
        if lib == "CC":
            wanted = c_type in cc_comparisons
        else:
            wanted = c_type != "active_vs_inactive"
        if not wanted:
            continue
        for method in methods:
            try:
                sheets[f"{lib}_{method}"] = create_motif_table(results_dir, [lib], [c_type], method, N=100)
            except FileNotFoundError:
                # Best effort: a (library, method) pair without a results
                # file is left out of the workbook.
                continue

    if not sheets:
        continue

    save_file = os.path.join(tables_dir, f"{c_type}_mea.xlsx")
    with pd.ExcelWriter(save_file) as writer:
        for sheet_name, table in sheets.items():
            table.to_excel(writer, sheet_name=sheet_name, index=False)

In [4]:
# The comparison is picked by name, not by position in `c_types`: that list
# has been edited since this cell last ran, and index 2 no longer points at
# the "induced_vs_repressed" comparison shown in the output below.
# "CC" is left out by name for the same reason.
get_top_motifs(
    results_dir,
    [lib for lib in libs if lib != "CC"],
    ["induced_vs_repressed"],
    "meme",
    N=30,
)

Unnamed: 0,ATF2_induced_vs_repressed,CTCF_induced_vs_repressed,FOXA1_induced_vs_repressed,LEF1_induced_vs_repressed,SCRT1_induced_vs_repressed,TCF7L2_induced_vs_repressed,16P12_1_induced_vs_repressed
0,TBX3_HUMAN,SP2_HUMAN,SP3_HUMAN,FOSL2_HUMAN,P53_HUMAN,CEBPG_HUMAN,P53_HUMAN
1,TBX21_HUMAN,SP3_HUMAN,SP2_HUMAN,FOS_HUMAN,P73_HUMAN,ATF4_HUMAN,P73_HUMAN
2,SUH_HUMAN,ZIC1_HUMAN,KLF6_HUMAN,JUNB_HUMAN,P63_HUMAN,JUN_HUMAN,P63_HUMAN
3,KLF5_HUMAN,AP2B_HUMAN,AP2B_HUMAN,FOSB_HUMAN,ZBT48_HUMAN,NF2L1_HUMAN,CDX1_HUMAN
4,KLF1_HUMAN,KLF3_HUMAN,KLF3_HUMAN,JUN_HUMAN,ZFX_HUMAN,BATF_HUMAN,SRY_HUMAN
5,SALL4_HUMAN,KLF6_HUMAN,SALL4_HUMAN,JUND_HUMAN,ZSC31_HUMAN,FOSL1_HUMAN,HXA9_HUMAN
6,EGR2_HUMAN,SALL4_HUMAN,ZIC1_HUMAN,FOSL1_HUMAN,ZN667_HUMAN,JUND_HUMAN,PRDM6_HUMAN
7,KLF9_HUMAN,KLF1_HUMAN,SRBP2_HUMAN,E2F4_HUMAN,COT1_HUMAN,FOSL2_HUMAN,HXB13_HUMAN
8,KLF6_HUMAN,KLF12_HUMAN,KLF5_HUMAN,NF2L1_HUMAN,PAX6_HUMAN,SRY_HUMAN,PIT1_HUMAN
9,ITF2_HUMAN,KLF5_HUMAN,SP4_HUMAN,ZBT14_HUMAN,E2F4_HUMAN,CDX1_HUMAN,ANDR_HUMAN


In [9]:
# "active_vs_inactive" is exported for "CC" only and is absent from the
# output below, so it is excluded here. Presumably no such HOMER results
# exist for 16P12_1 and the call would fail on the missing file - TODO confirm.
# The library is named explicitly so that reordering `libs` cannot change it.
get_top_motifs(
    results_dir,
    ["16P12_1"],
    [ct for ct in c_types if ct != "active_vs_inactive"],
    "homer",
    N=30,
)

Unnamed: 0,16P12_1_peaks_vs_notpeaks,16P12_1_responsive_vs_nonresponsive,16P12_1_induced_vs_repressed,16P12_1_repressed_vs_induced
0,p73(p53),p53(p53),p53(p53),ZFX(Zf)
1,p53(p53),p53(p53),p53(p53),ERF8(AP2EREBP)
2,p53(p53),p73(p53),p73(p53),Zfp281(Zf)
3,p63(p53),p63(p53),p63(p53),Zic3(Zf)
4,p53(p53),ZFX(Zf),p53(p53),ERF9(AP2EREBP)
5,Chop(bZIP),ZNF711(Zf),Nrf2(bZIP),ZNF711(Zf)
6,AARE(HLH),p53(p53),NF-E2(bZIP),AARE(HLH)
7,Atf4(bZIP),Tcfcp2l1(CP2),Bach2(bZIP),RKD2(RWPRK)
8,NFIL3(bZIP),Chop(bZIP),Fos(bZIP),Rap210(AP2EREBP)
9,YY1(Zf),AARE(HLH),c-Myc(bHLH),AT1G12630(AP2EREBP)
