In [27]:
import os
import pandas as pd
from functools import reduce

In [12]:
results_dir = "../data/"
libs = ["ATF2", "CTCF", "FOXA1", "LEF1", "SCRT1", "TCF7L2"]
c_types = ["direct_binding_responsive_vs_nonresponsive", "indirect_binding_responsive_vs_nonresponsive"]
methods = ["homer", "meme"]

In [13]:
def get_homer_results(resdir, lib, ct):
    filename = os.path.join(resdir, lib, "homer", ct, "knownResults.txt")
    df = pd.read_csv(filename, sep="\t")
    df["Motif Name"] = df["Motif Name"].str.split("/", expand=True)[0]
    df = df.loc[df["q-value (Benjamini)"]<0.01]
    return df

def get_meme_results(resdir, lib, ct):
    filename = os.path.join(resdir, lib, "meme", ct, "ame.tsv")
    df = pd.read_csv(filename, sep="\t")
    df["motif_ID"] = df["motif_ID"].str.split(".", expand=True)[0]
    df = df.loc[df["adj_p-value"]<0.01]
    return df

def get_top_motifs(resdir, libs, cts, method, N=30):
    method_func_dict = {"homer": get_homer_results, "meme": get_meme_results}
    method_colname_dict = {"homer": "Motif Name", "meme": "motif_ID"}
    dfs = [method_func_dict[method](resdir, lib, ct).iloc[:N] for lib in libs for ct in cts]
    dfs = [df[method_colname_dict[method]] for df in dfs]
    df = pd.concat(dfs, axis=1)
    df.columns = [f"{l}_{ct}" for l in libs for ct in cts]
    return df

In [26]:
homer_motifs = {lib: set(get_homer_results(results_dir, lib, c_types[1])["Motif Name"].values) for lib in libs}

In [37]:
def reduce_intersection(a, b):
    return a.intersection(b)

def reduce_union(a,b):
    return a.union(b)

def get_unique(motif_dict, lib, other_libs):
    all_sets = [motif_dict[ol] for ol in other_libs]
    union_sets = reduce(reduce_union, all_sets)
    return motif_dict[lib].difference(union_sets)

def get_common_and_unique_motifs(motif_dict, libs):
    unique_dict = {lib: get_unique(motif_dict, lib, [l for l in libs if l!=lib]) for lib in libs}
    common_to_all = reduce(reduce_union, [motif_dict[lib] for lib in libs])
    return unique_dict, common_to_all

In [38]:
homer_unique, homer_common = get_common_and_unique_motifs(homer_motifs, libs)

In [39]:
homer_common

{'AARE(HLH)',
 'ABF1(bZIP)',
 'ABF2(bZIP)',
 'ABI5(bZIP)',
 'ABR1(AP2EREBP)',
 'AGL13(MADS)',
 'AGL15(MADS)',
 'AGL16(MADS)',
 'AGL25(MADS)',
 'AGL6(MADS)',
 'AGL95(ND)',
 'ANAC042(NAC)',
 'ANAC046(NAC)',
 'ANAC047(NAC)',
 'ANAC079(NAC)',
 'ANAC087(NAC)',
 'ANAC094(NAC)',
 'AP-1(bZIP)',
 'AP-2gamma(AP2)',
 'ARE(NR)',
 'AREB3(bZIP)',
 'ARF16(ARF)',
 'ARF2(ARF)',
 'AS2(LOBAS2)',
 'ASL18(LOBAS2)',
 'AT1G04880(ARID)',
 'AT1G10720(BSD)',
 'AT1G12630(AP2EREBP)',
 'AT1G20910(ARID)',
 'AT1G23810(Orphan)',
 'AT1G24250(Orphan)',
 'AT1G28160(AP2EREBP)',
 'AT1G44830(AP2EREBP)',
 'AT1G47655(C2C2dof)',
 'AT1G49560(G2like)',
 'AT1G69570(C2C2dof)',
 'AT1G71450(AP2EREBP)',
 'AT1G72740(MYBrelated)',
 'AT1G76880(Trihelix)',
 'AT2G15740(C2H2)',
 'AT2G20110(CPP)',
 'AT2G28920(ND)',
 'AT3G10030(Trihelix)',
 'AT3G10113(MYBrelated)',
 'AT3G10580(MYBrelated)',
 'AT3G16280(AP2EREBP)',
 'AT3G42860(zfGRF)',
 'AT3G57600(AP2EREBP)',
 'AT3G58630(Trihelix)',
 'AT3G60490(AP2EREBP)',
 'AT4G00250(GeBP)',
 'AT4G12670(MYB

In [40]:
homer_unique

{'ATF2': {'AT1G20910(ARID)',
  'AT3G42860(zfGRF)',
  'Arnt:Ahr(bHLH)',
  'Ascl2(bHLH)',
  'E2A(bHLH)',
  'E2A(bHLH),near_PU.1',
  'Egr1(Zf)',
  'Egr2(Zf)',
  'Eomes(T-box)',
  'Foxh1(Forkhead)',
  'GATA:SCL(Zf,bHLH)',
  'HLH-1(bHLH)',
  'IRF:BATF(IRF:bZIP)',
  'MYB10(MYB)',
  'MYB121(MYB)',
  'MYB40(MYB)',
  'MYB63(MYB)',
  'NFkB-p65(RHD)',
  'Nanog(Homeobox)',
  'OCT:OCT(POU,Homeobox,IR1)',
  'PAX6(Paired,Homeobox)',
  'SCL(bHLH)',
  'Slug(Zf)',
  'Tbox:Smad(T-box,MAD)',
  'WRKY11(WRKY)',
  'WRKY15(WRKY)',
  'WRKY20(WRKY)',
  'WRKY29(WRKY)',
  'WRKY3(WRKY)',
  'WRKY30(WRKY)',
  'WRKY31(WRKY)',
  'WRKY42(WRKY)',
  'WRKY47(WRKY)',
  'WRKY6(WRKY)',
  'WRKY65(WRKY)',
  'WRKY71(WRKY)',
  'WRKY75(WRKY)',
  'WRKY8(WRKY)',
  'ZBTB12(Zf)',
  'ZNF189(Zf)'},
 'CTCF': {'AGL13(MADS)',
  'AGL95(ND)',
  'AT1G23810(Orphan)',
  'AT2G20110(CPP)',
  'AT5G66940(C2C2dof)',
  'At3g12730(G2like)',
  'DREF',
  'FAR1(FAR1)',
  'HSFA1E(HSF)',
  'HSFA6A(HSF)',
  'HSFB4(HSF)',
  'HuR(?)',
  'KAN2(G2like)',
  'MY