# Validate motifs: specificity, enrichment, position (FIMO/MAST)

In [2]:

import os, shutil, subprocess, sys
from pathlib import Path
from Bio import SeqIO

# Reuse the same project layout as 0_setup.ipynb
# Project root: the parent folder when the kernel was started inside
# "notebooks/", otherwise the current working directory itself.
_cwd = Path.cwd()
if _cwd.name == 'notebooks':
    PROJ = _cwd.resolve().parent
else:
    PROJ = _cwd

# Top-level data / results folders and the result sub-folders.
DATA, OUT = PROJ / "data", PROJ / "results"
CLADES, MOTIFS, REPORTS, TREES = (
    OUT / sub for sub in ("clades", "motifs", "reports", "trees")
)

# Create every folder up front so later cells can write without checking.
for folder in (DATA, OUT, CLADES, MOTIFS, REPORTS, TREES):
    folder.mkdir(parents=True, exist_ok=True)

# Inputs expected (some may be generated in earlier notebooks)
IN_MSA = DATA / "query.algn.fa"                    # gapped MSA
IN_MSA_TRIMMED = DATA / "query.algn.trimmed.fa"
IN_TREE = DATA / "ASC-tree.newick"                 # provided tree (optional)
IN_TARGETS = DATA / "ASC_targets.fasta"            # ungapped full-length sequences

print("DATA:", DATA)
print("OUT:", OUT)


DATA: /Users/gorkemdurmaz/Desktop/asc_project_10/data
OUT: /Users/gorkemdurmaz/Desktop/asc_project_10/results


## Goals
- Convert discovered motifs to PWMs/HMMs as needed.
- **Scan all sequences** (FIMO/MAST or hmmsearch) to assess specificity/enrichment.
- Summarize positional distribution (e.g., distance from bHLH center).

In [None]:
# 4.1 Run FIMO for each clade motifs against all proteins (full-length)
import os, shutil, subprocess
from pathlib import Path

# Assumes from your setup:
# IN_TARGETS = DATA / "ASC_targets.fasta"   # full-length proteins
# MOTIFS     = OUT  / "motifs"              # contains <clade>/bg.meme and <clade>/meme_out/

# Stop immediately when the FIMO executable is not on PATH.
fimo_path = shutil.which("fimo")
if not fimo_path:
    raise SystemExit("ERROR: 'fimo' not found. Activate your MEME suite environment.")

# Optional global fallback background: the first candidate that exists, else None.
GLOBAL_BG_CANDIDATES = [MOTIFS / "bg.global.meme", MOTIFS / "bg.meme"]
GLOBAL_BFILE = None
for candidate in GLOBAL_BG_CANDIDATES:
    if candidate.exists():
        GLOBAL_BFILE = candidate
        break

# Keep only sub-folders of MOTIFS that contain meme_out/meme.txt
# (written by MEME with `-oc <clade>/meme_out`).
clade_dirs = [
    entry
    for entry in sorted(MOTIFS.iterdir())
    if entry.is_dir() and (entry / "meme_out" / "meme.txt").exists()
]

print("Found motif sets:", [entry.name for entry in clade_dirs])

# Delete FIMO output left over from previous runs so every rerun starts clean.
for stale in (entry / "fimo_all" for entry in clade_dirs):
    if stale.exists():
        shutil.rmtree(stale)
print("Old FIMO result folders removed.")

#  Run FIMO per clade 
for clade in clade_dirs:
    meme_txt = clade / "meme_out" / "meme.txt"
    outdir   = clade / "fimo_all"          # per-clade FIMO output
    bfile_local = clade / "bg.meme"        # per-clade background built earlier

    outdir.mkdir(parents=True, exist_ok=True)

    # Choose background: prefer a non-empty local file, fall back to the global
    # one, else run without --bgfile. The label is recorded together with the
    # choice: comparing `use_bg == GLOBAL_BFILE` afterwards reports "global"
    # when both are None (no background at all).
    if bfile_local.exists() and bfile_local.stat().st_size > 0:
        use_bg, bg_label = bfile_local, "local"
    elif GLOBAL_BFILE is not None:
        use_bg, bg_label = GLOBAL_BFILE, "global"
    else:
        use_bg, bg_label = None, "none"

    cmd = [
        "fimo",
        "--oc", str(outdir),                  # overwrite the output folder
        "--thresh", "1e-5",                   # p-value cutoff applied by FIMO itself
        "--max-stored-scores", "20000000",
        "--verbosity", "1",
    ]
    if use_bg is not None:
        cmd += ["--bgfile", str(use_bg)]
    # Positional arguments: motif file first, then the sequences to scan.
    cmd += [str(meme_txt), str(IN_TARGETS)]

    print(f"[FIMO] {clade.name} (bg={bg_label}) → {outdir}")
    # check=True: a failing FIMO run raises CalledProcessError and stops the cell.
    subprocess.run(cmd, check=True)

print("FIMO scans complete.")


Found motif sets: ['ASCa_TrueSpiders_A', 'ASCa_TrueSpiders_B', 'ASCa_TrueSpiders_C', 'ASCa_TrueSpiders_D', 'ASCa_TrueSpiders_E', 'ASCb', 'ASCc', 'ASH', 'Chelicerate_ASCa_A', 'ase']
Old FIMO result folders removed.
[FIMO] ASCa_TrueSpiders_A (bg=local) → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCa_TrueSpiders_A/fimo_all
[FIMO] ASCa_TrueSpiders_B (bg=local) → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCa_TrueSpiders_B/fimo_all
[FIMO] ASCa_TrueSpiders_C (bg=local) → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCa_TrueSpiders_C/fimo_all
[FIMO] ASCa_TrueSpiders_D (bg=local) → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCa_TrueSpiders_D/fimo_all
[FIMO] ASCa_TrueSpiders_E (bg=local) → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCa_TrueSpiders_E/fimo_all
[FIMO] ASCb (bg=local) → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCb/fimo_all
[FIMO] ASCc (bg=local) → /Users/gorkemdurmaz/Desktop/asc_project

In [None]:
# 4.2_final — FIMO -> clades/families -> drop bHLH overlaps -> entropy flags -> tidy summaries
import re, math, collections
import numpy as np, pandas as pd
from pathlib import Path
from Bio import SeqIO

# ---------- Config ----------
# Per-site p-value cutoff applied to the FIMO table.
# NOTE: cell 4.1 already runs FIMO with `--thresh 1e-5`, so this looser cutoff
# only removes rows if that setting is relaxed.
P_THRESH = 1e-4
# q-value (FDR) cutoff; rows with a missing q-value are treated as 1.0 below.
Q_THRESH = 0.05
# Residues added on each side of a bHLH alignment span before testing overlap.
BHLH_PAD = 8
# Folder that receives all summary tables written by this cell.
SUMDIR = MOTIFS / "_summaries"
SUMDIR.mkdir(parents=True, exist_ok=True)

# ---------- Utils 
_ISOFORM_SUFFIX_RE = re.compile(r"\.(t?\d+)$")
_ID_JUNK_RE = re.compile(r"[^A-Za-z0-9._-]+")

def norm_id(x):
    """Normalise a FASTA/FIMO sequence identifier so IDs from different files match.

    Steps: keep the first whitespace-delimited token, keep the part after the
    last ``|``, drop a trailing ``.<digits>`` or ``.t<digits>`` suffix, then
    remove every character outside ``[A-Za-z0-9._-]``.

    Parameters
    ----------
    x : str or None
        Raw identifier (may include a description after whitespace).

    Returns
    -------
    str
        The normalised identifier; ``""`` when ``x`` is None, empty or only
        whitespace (the one-line lambda raised IndexError in those cases).
    """
    tokens = (x or "").split()
    if not tokens:
        return ""
    core = tokens[0].split("|")[-1]
    return _ID_JUNK_RE.sub("", _ISOFORM_SUFFIX_RE.sub("", core))
def shannon_entropy(s):
    """Shannon entropy, in bits per symbol, of the symbol frequencies in ``s``.

    Falsy input (for example an empty string or None) yields 0.0.
    """
    if not s:
        return 0.0
    total = len(s)
    acc = 0
    # Accumulate p * log2(p) over the distinct symbols, then flip the sign.
    for occurrences in collections.Counter(s).values():
        frac = occurrences / total
        acc += frac * math.log2(frac)
    return -acc

# ---------- Inputs & clade map 
# Normalised IDs of every target protein; only these may appear in a clade.
targets_universe = sorted({norm_id(r.id) for r in SeqIO.parse(IN_TARGETS, "fasta")})
_targets_set = set(targets_universe)   # set copy for O(1) membership tests

# Prefer results/clades/*.fa* for the mapping (clade name = file stem).
EFFECTIVE_CLADE_MAP = {}
if CLADES.exists():
    for fa in sorted(CLADES.glob("*.fa*")):
        ids = sorted({norm_id(r.id) for r in SeqIO.parse(fa, "fasta")} & _targets_set)
        if ids:
            EFFECTIVE_CLADE_MAP[fa.stem] = ids

# Fall back to an in-memory CLADE_MAP when the folder produced nothing.
# The setup cell always creates CLADES, so chaining this as `elif` on
# `CLADES.exists()` would make the fallback unreachable.
if not EFFECTIVE_CLADE_MAP and "CLADE_MAP" in globals():
    EFFECTIVE_CLADE_MAP = {
        cl: sorted({norm_id(i) for i in ids} & _targets_set)
        for cl, ids in CLADE_MAP.items() if ids
    }

# Reverse lookup sequence -> clade. If a sequence is listed in several clades,
# the clade iterated last wins.
seq2clade = {sid: cl for cl, ids in EFFECTIVE_CLADE_MAP.items() for sid in ids}

# ---------- Collect FIMO hits from each clade’s meme_out 
# Clade folders that have both MEME motifs and a finished FIMO scan.
clade_dirs = [
    cdir
    for cdir in sorted(MOTIFS.iterdir())
    if cdir.is_dir()
    and (cdir / "meme_out" / "meme.txt").exists()
    and (cdir / "fimo_all" / "fimo.tsv").exists()
]

# One table per clade, each tagged with the clade whose motifs produced the hits.
rows = [
    pd.read_csv(cdir / "fimo_all" / "fimo.tsv", sep="\t", comment="#")
      .assign(source_clade=cdir.name)
    for cdir in clade_dirs
]
hits = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()
if hits.empty:
    raise SystemExit("No FIMO hits found. Did 4.1 run?")

# Clean: unparsable numbers become NaN / <NA> instead of raising.
for col in ("p-value", "q-value"):
    hits[col] = pd.to_numeric(hits[col], errors="coerce")
for col in ("start", "stop"):
    hits[col] = pd.to_numeric(hits[col], errors="coerce").astype("Int64")

# Filter: a p-value is required; a missing q-value counts as 1.0 (fails the cutoff).
hits = hits.dropna(subset=["p-value"])
hits = hits[hits["p-value"] <= P_THRESH]
hits = hits[hits["q-value"].fillna(1.0) <= Q_THRESH].copy()

# Attach the normalised sequence ID and the clade that sequence belongs to.
hits["seq_norm"] = hits["sequence_name"].map(norm_id)
hits["target_clade"] = hits["seq_norm"].map(seq2clade).fillna("Unassigned")

# Collapse to the best (lowest p-value) site per (source clade, motif, sequence).
dedup_key = ["source_clade", "motif_id", "seq_norm"]
best = (
    hits.sort_values(dedup_key + ["p-value"])
        .drop_duplicates(dedup_key, keep="first")
        .copy()
)

# ------- Remove ONLY bHLH overlaps (keep whole-protein signals) -------
# bHLH domain coordinates per sequence; the columns read below are
# seq_id, ali_from, ali_to and anchor_pos.
anch = pd.read_csv(OUT/"bHLH_anchors.tsv", sep="\t")
anch["seq_norm"] = anch["seq_id"].map(norm_id)

# Per-sequence lookup: padded domain intervals plus anchor positions.
# Built by iterating over the groups rather than groupby(...).apply(...),
# which is the line echoed in the warning recorded in this cell's output.
by_seq = {
    seq: {
        "ints": [(int(s) - BHLH_PAD, int(e) + BHLH_PAD)
                 for s, e in zip(grp["ali_from"], grp["ali_to"])],
        "anchors": [int(a) for a in grp["anchor_pos"]],
    }
    for seq, grp in anch.groupby("seq_norm")
}
def overlaps_bhlh(seq, s, e):
    """Return True when the closed range [s, e] intersects any padded bHLH
    interval stored for ``seq`` in ``by_seq``; False when ``seq`` has none."""
    padded = by_seq.get(seq, {}).get("ints", [])
    # Two closed ranges intersect exactly when each starts no later than the other ends.
    return any(xs <= e and s <= xe for xs, xe in padded)
# Integer midpoint of every hit (floor division keeps the nullable Int64 dtype).
hit_midpoints = (best["start"] + best["stop"]) // 2
best["hit_mid"] = hit_midpoints.astype("Int64")

def _row_overlaps_bhlh(row):
    """Apply overlaps_bhlh to one row of ``best``."""
    return overlaps_bhlh(row["seq_norm"], int(row["start"]), int(row["stop"]))

best["overlap_bHLH"] = best.apply(_row_overlaps_bhlh, axis=1)
# Overlap-only rule: a hit counts as bHLH-like exactly when it overlaps the domain.
best["is_bHLH_like"] = best["overlap_bHLH"]

# ---------- Families (ASCa / ASCb / ASCc) 
# Clade names that actually occur in the mapping.
present = set(EFFECTIVE_CLADE_MAP)
ASCb_names = present & {"ASCb"}
ASCc_names = present & {"ASCc"}

def clade_to_family(cl):
    """Map a clade label to its family.

    Clades in ASCb_names / ASCc_names map to "ASCb" / "ASCc"; None or
    "Unassigned" map to "Unassigned"; every other clade is reported as "ASCa".
    """
    for family, members in (("ASCb", ASCb_names), ("ASCc", ASCc_names)):
        if cl in members:
            return family
    return "Unassigned" if cl in (None, "Unassigned") else "ASCa"

best["target_family"] = best["target_clade"].map(clade_to_family)

# ---------- Entropy flags + soft filter (optional but tiny) 
# Entropy of each matched site, binned into three tiers.
best["match_entropy"] = best["matched_sequence"].map(shannon_entropy)
best["entropy_flag"] = pd.cut(
    best["match_entropy"],
    bins=[-1, 1.6, 2.2, 10],
    labels=["low", "mid", "high"],
)

# Distinct sequences hit per (source clade, motif, target family).
fam_spread = (
    best.groupby(["source_clade", "motif_id", "target_family"])["seq_norm"]
        .nunique()
        .reset_index(name="n_seq_hits")
)
fam_wide = fam_spread.pivot_table(
    index=["source_clade", "motif_id"],
    columns="target_family",
    values="n_seq_hits",
    fill_value=0,
)
# Make sure all three main families exist as columns, even without hits.
main_families = ["ASCa", "ASCb", "ASCc"]
for fam in [f for f in main_families if f not in fam_wide.columns]:
    fam_wide[fam] = 0
fam_wide["families_hit"] = fam_wide[main_families].gt(0).sum(axis=1)

best = best.merge(
    fam_wide[["families_hit"]].reset_index(),
    on=["source_clade", "motif_id"],
    how="left",
)

# Soft filter: drop a hit only when it is low-entropy AND its motif hits two or
# more families; low-entropy hits of family-specific motifs are kept.
is_low_entropy = best["entropy_flag"] == "low"
is_multi_family = best["families_hit"] >= 2
best_soft = best[~(is_low_entropy & is_multi_family)].copy()

# ---------- Generic summarizer (clade or family) ----------
def summarize(df, level, tag):
    """Write hit, presence and enrichment tables for one subset of motif hits.

    Parameters
    ----------
    df : pandas.DataFrame
        Hit table with at least ``source_clade``, ``motif_id``, ``seq_norm``
        and the grouping column selected by ``level``.
    level : str
        ``"clade"`` groups by ``target_clade``; any other value groups by
        ``target_family``.
    tag : str
        Name of the subset; first component of every output file name.

    Returns
    -------
    pandas.DataFrame
        The specificity/enrichment table, sorted as written to disk.

    Notes
    -----
    Three files are written to ``SUMDIR``:
    ``<tag>.<LEVEL>.fimo_hits.filtered_perseq.tsv``,
    ``<tag>.<LEVEL>.motif_specificity_enrichment.tsv`` and
    ``<tag>.<LEVEL>.motif_presence_matrix.tsv``.
    The names are built by string formatting. ``Path.with_suffix`` would replace
    the ``.<LEVEL>`` part, so the clade and family runs would share file names
    and the later run would overwrite the earlier one.
    """
    key = "target_clade" if level == "clade" else "target_family"
    prefix = f"{tag}.{level.upper()}"

    def _out_path(kind):
        # One output path per table kind, keeping the level in the name.
        return SUMDIR / f"{prefix}.{kind}.tsv"

    df2 = df.copy()
    df2["motif_col"] = df2["source_clade"] + "|" + df2["motif_id"]

    # Presence matrix: sequences x motifs (1 = hit), plus the grouping label.
    pres = (df2.assign(val=1).pivot_table(index="seq_norm", columns="motif_col", values="val",
                                          aggfunc="max", fill_value=0).sort_index())
    pres = pres.join(df2[["seq_norm", key]].drop_duplicates().set_index("seq_norm"), how="left")

    # Distinct sequences hit per motif and group, and each group's share of the motif's hits.
    counts = (df2.groupby(["source_clade", "motif_id", key], as_index=False)
                 .agg(n_seq_hits=("seq_norm", "nunique")))
    totals = counts.groupby(["source_clade", "motif_id"])["n_seq_hits"].transform("sum")
    counts["share_of_hits"] = counts["n_seq_hits"] / totals

    # Denominators: number of mapped sequences per group ("Unassigned" has none).
    if level == "clade":
        sizes = {cl: len(ids) for cl, ids in EFFECTIVE_CLADE_MAP.items()}
    else:
        sizes = {
            "ASCb": sum(len(EFFECTIVE_CLADE_MAP.get(c, [])) for c in ASCb_names),
            "ASCc": sum(len(EFFECTIVE_CLADE_MAP.get(c, [])) for c in ASCc_names),
            "ASCa": sum(len(v) for k, v in EFFECTIVE_CLADE_MAP.items() if k not in ASCb_names | ASCc_names),
        }
    mapped_total = sum(sizes.values())

    # Hit rate = hits / group size; NaN for groups with no (or zero) size.
    # Vectorised so that an empty `counts` table does not break the assignment.
    denom = pd.Series([sizes.get(g, 0) for g in counts[key]], index=counts.index, dtype="float64")
    counts["hit_rate"] = counts["n_seq_hits"] / denom.where(denom > 0)

    # Background rate per motif over all mapped groups.
    mapped = counts[counts[key].isin(sizes.keys())]
    base = (mapped.groupby(["source_clade", "motif_id"])["n_seq_hits"].sum()
                  .rename("global_n_hits")).reset_index()
    base["global_rate"] = base["global_n_hits"] / max(mapped_total, 1)

    spec = counts.merge(base, on=["source_clade", "motif_id"], how="left")
    spec["enrichment"] = spec["hit_rate"] / spec["global_rate"]
    spec["log2_enrichment"] = spec["enrichment"].apply(lambda x: np.log2(x) if (pd.notna(x) and x > 0) else np.nan)
    spec = spec.sort_values(["source_clade", "motif_id", key])

    # Write the three tables.
    df2.to_csv(_out_path("fimo_hits.filtered_perseq"), sep="\t", index=False)
    spec.to_csv(_out_path("motif_specificity_enrichment"), sep="\t", index=False)
    pres.to_csv(_out_path("motif_presence_matrix"), sep="\t")
    print(f"[{tag}.{level}] rows={len(df2)} mapped_total={mapped_total}")
    return spec

#  Write ALL / NON_BHLH / NON_BHLH+SOFT at clade & family levels 
# Three views of the best-hit table: everything, hits outside the bHLH domain,
# and hits outside the bHLH domain that also passed the soft entropy filter.
ALL = best
NON_BHLH = best[~best["is_bHLH_like"]].copy()
NON_BHLH_SOFT = best_soft[~best_soft["is_bHLH_like"]].copy()

subsets = {"ALL": ALL, "NON_BHLH": NON_BHLH, "NON_BHLH_SOFT": NON_BHLH_SOFT}
for tag, df in subsets.items():
    # Clade level first, then family level, for each subset.
    for level in ("clade", "family"):
        summarize(df, level, tag)

# Quick peek
preview = NON_BHLH_SOFT.sort_values(["source_clade", "motif_id", "target_clade"])
display(preview.head(20))
n_flagged = int(best["is_bHLH_like"].sum())
print("Flagged bHLH-overlaps:", n_flagged, "/", len(best))
print("Entropy tiers:", best["entropy_flag"].value_counts(dropna=False).to_dict())


[ALL.clade] rows=2123 mapped_total=66
[ALL.family] rows=2123 mapped_total=66
[NON_BHLH.clade] rows=200 mapped_total=66
[NON_BHLH.family] rows=200 mapped_total=66
[NON_BHLH_SOFT.clade] rows=200 mapped_total=66
[NON_BHLH_SOFT.family] rows=200 mapped_total=66


  by_seq = anch.groupby("seq_norm").apply(


Unnamed: 0,motif_id,motif_alt_id,sequence_name,start,stop,strand,score,p-value,q-value,matched_sequence,source_clade,seq_norm,target_clade,hit_mid,overlap_bHLH,is_bHLH_like,target_family,match_entropy,entropy_flag,families_hit
0,EYPFDGSTEMIMPYG,MEME-4,Abru_g14616.t1,149,163,+,47.2449,1.01e-15,3.82e-12,EYMFEDSTEIILPYA,ASCa_TrueSpiders_A,Abru_g14616,ASCa_TrueSpiders_A,156,False,False,ASCa,3.323231,high,1
1,EYPFDGSTEMIMPYG,MEME-4,Afer_g4188.t2,156,170,+,48.5,4.13e-16,1.95e-12,EYPFEGSPEMIMSYV,ASCa_TrueSpiders_A,Afer_g4188,ASCa_TrueSpiders_A,163,False,False,ASCa,3.056565,high,1
2,EYPFDGSTEMIMPYG,MEME-4,Hgra_g11716.t1,133,147,+,49.2041,2.23e-16,1.41e-12,EYGYDGSTEFVMPYG,ASCa_TrueSpiders_A,Hgra_g11716,ASCa_TrueSpiders_A,140,False,False,ASCa,3.139572,high,1
3,EYPFDGSTEMIMPYG,MEME-4,Hgra_g11718.t1,133,147,+,49.2041,2.23e-16,1.41e-12,EYGYDGSTEFVMPYG,ASCa_TrueSpiders_A,Hgra_g11718,ASCa_TrueSpiders_A,140,False,False,ASCa,3.139572,high,1
4,EYPFDGSTEMIMPYG,MEME-4,Ptep_aug3.g5172,127,141,+,50.4898,5.94e-17,1.12e-12,EYMFDASTEMIIPYG,ASCa_TrueSpiders_A,Ptep_aug3.g5172,ASCa_TrueSpiders_A,134,False,False,ASCa,3.373557,high,1
5,EYPFDGSTEMIMPYG,MEME-4,Ssce_g718.t1,74,88,+,46.6837,1.42e-15,4.49e-12,EYPFEGSPDMIMSYV,ASCa_TrueSpiders_A,Ssce_g718,ASCa_TrueSpiders_A,81,False,False,ASCa,3.240224,high,1
91,GKRLSKVETLRSAIDYIRQLRQIL,MEME-1,Ssce_g718.t1,17,40,+,98.1798,1.17e-30,3.2900000000000004e-27,GKRLSKVETLRSAIDYIRQLRQML,ASCa_TrueSpiders_A,Ssce_g718,ASCa_TrueSpiders_A,28,False,False,ASCa,3.584963,high,3
96,MEMFPHQDYPPQNS,MEME-8,Abru_g14616.t1,23,36,+,45.2024,1.98e-15,3.66e-11,TEMFSHQDYVPQNS,ASCa_TrueSpiders_A,Abru_g14616,ASCa_TrueSpiders_A,29,False,False,ASCa,3.521641,high,1
97,MEMFPHQDYPPQNS,MEME-8,Afer_g4188.t2,30,43,+,42.6786,2e-14,1.23e-10,MTMYPHQDFAMSLS,ASCa_TrueSpiders_A,Afer_g4188,ASCa_TrueSpiders_A,36,False,False,ASCa,3.324863,high,1
98,MEMFPHQDYPPQNS,MEME-8,Hgra_g11716.t1,3,16,+,41.131,5.85e-14,2.17e-10,MFSPPQQDYPSFSS,ASCa_TrueSpiders_A,Hgra_g11716,ASCa_TrueSpiders_A,9,False,False,ASCa,2.610577,high,1


Flagged bHLH-overlaps: 1923 / 2123
Entropy tiers: {'high': 2122, 'mid': 1, 'low': 0}


In [18]:
# --- 4.2_shortlist: identify family-specific motifs (ASCa / ASCb / ASCc) ---

def family_shortlist(spec_df: pd.DataFrame, min_hits=3):
    """Rank motifs by how specific their hits are to ASCa, ASCb or ASCc.

    Parameters
    ----------
    spec_df : pandas.DataFrame
        Table with the columns ``source_clade``, ``motif_id``,
        ``target_family`` and ``n_seq_hits``. Rows whose family is not one of
        the three main families are ignored.
    min_hits : int, default 3
        Minimum number of hits in a family for a motif to count as specific to it.

    Returns
    -------
    pandas.DataFrame
        One row per (source_clade, motif_id) with the per-family hit counts,
        ``<family>_specific`` flags, ``dominant_family`` and ``dominant_hits``,
        sorted with ASCa-specific motifs first, then ASCb, then ASCc, then by
        ``dominant_hits`` descending. Empty, with the same columns, when
        ``spec_df`` has no rows for the main families.
    """
    fam_cols = ["ASCa", "ASCb", "ASCc"]
    flag_cols = [f"{fam}_specific" for fam in fam_cols]

    # Only keep the main families.
    main = spec_df[spec_df["target_family"].isin(fam_cols)]
    if main.empty:
        return pd.DataFrame(columns=["source_clade", "motif_id", *fam_cols, *flag_cols,
                                     "dominant_family", "dominant_hits"])

    pivot = (main.pivot_table(index=["source_clade", "motif_id"],
                              columns="target_family", values="n_seq_hits",
                              aggfunc="sum", fill_value=0)
                 # A family without any hit has no column after the pivot;
                 # add it as zeros so the flag logic below cannot raise KeyError.
                 .reindex(columns=fam_cols, fill_value=0)
                 .reset_index())

    # Specific = enough hits in this family and none in the other two.
    for fam, flag in zip(fam_cols, flag_cols):
        others = [c for c in fam_cols if c != fam]
        pivot[flag] = (pivot[fam] >= min_hits) & (pivot[others] == 0).all(axis=1)

    # Dominant family by hit count.
    pivot["dominant_family"] = pivot[fam_cols].idxmax(axis=1)
    pivot["dominant_hits"] = pivot[fam_cols].max(axis=1)

    # Rank with ASCa first, then ASCb, ASCc, then by number of hits.
    return pivot.sort_values(flag_cols + ["dominant_hits"], ascending=False)

# Example usage:
# `spec_nb_fam` is not created by any cell shown above, so after
# Restart Kernel -> Run All this cell would stop with a NameError.
# Rebuild the family-level hit counts when the variable is missing.
if "spec_nb_fam" not in globals():
    # NOTE(review): assumes the shortlist is meant to use the NON_BHLH subset
    # (the name suggests "spec, non-bHLH, family") — confirm.
    spec_nb_fam = (NON_BHLH.groupby(["source_clade", "motif_id", "target_family"], as_index=False)
                           .agg(n_seq_hits=("seq_norm", "nunique")))

shortlist = family_shortlist(spec_nb_fam, min_hits=3)
out_csv = SUMDIR / "family_shortlist.csv"
shortlist.to_csv(out_csv, index=False)
print("Saved:", out_csv)
display(shortlist.head(20))


Saved: /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/_summaries/family_shortlist.csv


target_family,source_clade,motif_id,ASCa,ASCb,ASCc,ASCa_specific,ASCb_specific,ASCc_specific,dominant_family,dominant_hits
18,ASCa_TrueSpiders_E,EASSPYDALHGDEEEELMDFASWF,8,0,0,True,False,False,ASCa,8
0,ASCa_TrueSpiders_A,EYPFDGSTEMIMPYG,6,0,0,True,False,False,ASCa,6
3,ASCa_TrueSpiders_A,PQGYRCDFGCPCNEG,6,0,0,True,False,False,ASCa,6
21,ASCa_TrueSpiders_E,QRIAPKLPHH,6,0,0,True,False,False,ASCa,6
2,ASCa_TrueSpiders_A,MEMFPHQDYPPQNS,5,0,0,True,False,False,ASCa,5
7,ASCa_TrueSpiders_C,HRFMASHEDARRLLYL,5,0,0,True,False,False,ASCa,5
9,ASCa_TrueSpiders_C,QHTQDDQLMDIGLWFS,5,0,0,True,False,False,ASCa,5
10,ASCa_TrueSpiders_C,SGPAGGSGDLSPASSHPSDCSLV,5,0,0,True,False,False,ASCa,5
11,ASCa_TrueSpiders_C,YGQDDCSSVASSEEI,5,0,0,True,False,False,ASCa,5
12,ASCa_TrueSpiders_D,DENDDFFBIMDWTTL,5,0,0,True,False,False,ASCa,5
