# De‑novo motif discovery per clade (MEME/GLAM2)

In [1]:

import os, shutil, subprocess, sys
from pathlib import Path
from Bio import SeqIO

# Project layout shared with 0_setup.ipynb: when the kernel runs inside
# "notebooks/", the project root is its parent; otherwise the working directory.
_cwd = Path.cwd()
PROJ = _cwd.resolve().parent if _cwd.name == 'notebooks' else _cwd
DATA = PROJ / "data"
OUT = PROJ / "results"
CLADES, MOTIFS, REPORTS, TREES = (
    OUT / sub for sub in ("clades", "motifs", "reports", "trees")
)

# Make sure every folder used below exists
for folder in (DATA, OUT, CLADES, MOTIFS, REPORTS, TREES):
    folder.mkdir(parents=True, exist_ok=True)

# Expected inputs (some may be produced by earlier notebooks)
IN_MSA = DATA / "query.algn.fa"                  # Chetan's gapped MSA
IN_MSA_TRIMMED = DATA / "query.algn.trimmed.fa"
IN_TREE = DATA / "ASC-tree.newick"               # provided tree (optional)
IN_TARGETS = DATA / "ASC_targets.fasta"          # ungapped full-length sequences

print("DATA:", DATA)
print("OUT:", OUT)


DATA: /Users/gorkemdurmaz/Desktop/asc_project_10/data
OUT: /Users/gorkemdurmaz/Desktop/asc_project_10/results


## Goals
- Run **MEME** (or GLAM2) per clade FASTA.
- Discover up to `NMOTIFS` motifs per clade (currently 7; the motif width range is configurable).
- Outputs: MEME HTML/TXT + motif PWMs in `results/motifs/<clade>/meme_out/`

In [None]:
import pandas as pd
from Bio import SeqIO
from collections import defaultdict

# Anchor table written by the previous notebook
ANCHORS_TSV = OUT.joinpath("bHLH_anchors.tsv")
# Number of residues (aa) added on each side of a domain span before masking
MASK_MARGIN = 12
# Folder receiving the masked copies of the clade FASTAs
MASKED_CLADES = OUT.joinpath("clades_masked")
MASKED_CLADES.mkdir(parents=True, exist_ok=True)

# Domain table; a sequence may be listed on several rows
anch = pd.read_csv(ANCHORS_TSV, sep="\t")
# we’ll mask the env span expanded by margin; fall back to ali_from/ali_to if env_* missing
def _row_windows(r):
    a = int(r.get("env_from", r["ali_from"]))
    b = int(r.get("env_to",   r["ali_to"]))
    return max(1, a - MASK_MARGIN), b + MASK_MARGIN

# seq_id -> list of (start, end) windows to mask, 1-based inclusive
winmap = defaultdict(list)
for _idx, row in anch.iterrows():
    seq_len = int(row["seq_len"])
    start, end = _row_windows(row)
    # keep the window inside [1, seq_len]
    window = (max(1, start), min(seq_len, end))
    winmap[row["seq_id"]].append(window)

def mask_seq(rec, wins):
    """Replace the residues inside each window with 'X' and return the record.

    Parameters
    ----------
    rec : record-like object with a ``seq`` attribute; it is modified in place.
    wins : iterable of ``(start, end)`` pairs, 1-based and inclusive.

    Returns
    -------
    The same ``rec`` object, with ``rec.seq`` rebuilt using its original type.

    Windows are clamped to the actual sequence, so a window that extends past
    the last residue (or starts before the first) masks only the overlapping
    part instead of raising IndexError or wrapping around via a negative index.
    """
    residues = list(str(rec.seq))
    n_res = len(residues)
    for start, end in wins:
        lo = max(0, start - 1)   # 1-based inclusive start -> 0-based slice start
        hi = min(n_res, end)     # inclusive end is already the exclusive slice stop
        if lo < hi:
            residues[lo:hi] = 'X' * (hi - lo)
    rec.seq = type(rec.seq)("".join(residues))
    return rec

# Write one masked copy per clade FASTA into MASKED_CLADES, keeping the file name
masked_fastas = []
for clade_fa in sorted(Path(CLADES).glob("*.fa")):
    masked_path = MASKED_CLADES / clade_fa.name
    with open(masked_path, "w") as handle:
        for record in SeqIO.parse(clade_fa, "fasta"):
            windows = winmap.get(record.id)
            # records without any anchor window are copied through unchanged
            if windows is not None:
                record = mask_seq(record, windows)
            SeqIO.write(record, handle, "fasta")
    masked_fastas.append(masked_path)

print("Masked FASTAs:", [p.name for p in masked_fastas[:8]], "…")


Masked FASTAs: ['ASCa_TrueSpiders_A.fa', 'ASCa_TrueSpiders_B.fa', 'ASCa_TrueSpiders_C.fa', 'ASCa_TrueSpiders_D.fa', 'ASCa_TrueSpiders_E.fa', 'ASCa_TrueSpiders_F.fa', 'ASCb.fa', 'ASCc.fa'] …


In [None]:
# Run this cell only once.
# To re-run it, first clear the previous results: rm -rf results/motifs/*
##########################################################
##########################################################

import os, shutil, subprocess, sys
from pathlib import Path
from Bio import SeqIO

# IN_TARGETS = DATA/"ASC_targets.fasta"
# CLADES = OUT/"clades"
# MOTIFS = OUT/"motifs"

# ---- Config ----
NMOTIFS = 7            # how many motifs per clade
WMIN, WMAX = 6, 24     # motif width range (protein) — widen a bit for ASH16-like
MODEL = "zoops"        # {oops|zoops|anr}; zoops is a safe default
EVT = 1e-5             # E-value threshold to stop searching early
THREADS = max(1, (os.cpu_count() or 2) // 2)
MAXSIZE = 1_000_000    # upper bound on total residues, forwarded to MEME via -maxsize
MIN_SEQ_FOR_LOCAL_BG = 4  # build per-clade bg only if clade has >= 4 seqs

# --- sanity: MEME present? ---
if not shutil.which("meme"):
    sys.exit("ERROR: 'meme' not found on PATH. Activate your MEME suite environment first.")

# --- Optional global background (fallback) ---
# Built from the full target set; used when a per-clade background is unavailable.
GLOBAL_BFILE = MOTIFS / "bg.global.meme"
if shutil.which("fasta-get-markov") and Path(IN_TARGETS).exists():
    GLOBAL_BFILE.parent.mkdir(parents=True, exist_ok=True)
    _bg_proc = subprocess.run(
        ["fasta-get-markov", str(IN_TARGETS), str(GLOBAL_BFILE)], check=False
    )
    # A failed run can leave a missing or empty file behind; drop the fallback
    # in that case so it is never passed to MEME as -bfile.
    if (_bg_proc.returncode != 0
            or not GLOBAL_BFILE.exists()
            or GLOBAL_BFILE.stat().st_size == 0):
        print(f"WARNING: fasta-get-markov failed (exit {_bg_proc.returncode}); "
              "no global background will be used.")
        GLOBAL_BFILE = None
else:
    GLOBAL_BFILE = None

# --- collect clade FASTAs ---
# The unmasked clade FASTAs are used on purpose: the masked copies (bHLH region
# replaced by 'X') exist to avoid the bHLH motifs, but the X letters cause problems.
clade_fastas = sorted(Path(CLADES).glob("*.fa"))
print("Clade FASTAs:", [p.name for p in clade_fastas])

def count_residues(fa):
    """Return ``(number of records, summed sequence length)`` for a FASTA file."""
    lengths = [len(record.seq) for record in SeqIO.parse(fa, "fasta")]
    return len(lengths), sum(lengths)

# Run MEME once per clade FASTA, writing into results/motifs/<clade>/meme_out.
_have_markov = bool(shutil.which("fasta-get-markov"))  # loop-invariant PATH lookup
failed_clades = []                                     # clades whose MEME run exited non-zero

for fa in clade_fastas:
    nseq, nres = count_residues(fa)
    outdir = Path(MOTIFS) / fa.stem
    meme_run_dir = outdir / "meme_out"   # MEME will write here

    # Remove stale MEME results first, so a clade that is skipped or fails
    # below never keeps output from an earlier run.
    if meme_run_dir.exists():
        shutil.rmtree(meme_run_dir)

    # Guardrails — checked before any directory is created or subprocess is run
    if nseq < 2:
        print(f"Skip {fa.name}: only {nseq} sequence(s). Need ≥2 for motif discovery.")
        continue
    if nres < WMAX:
        print(f"Skip {fa.name}: total residues ({nres}) < max width ({WMAX}).")
        continue
    if nres > MAXSIZE:
        print(f"Skip {fa.name}: total residues ({nres}) > -maxsize ({MAXSIZE}).")
        continue

    # Ensure base folder for this clade exists
    outdir.mkdir(parents=True, exist_ok=True)

    # ----- background model: per-clade if possible, else the global fallback -----
    use_bfile = None
    bfile_label = "none"
    if _have_markov and nseq >= MIN_SEQ_FOR_LOCAL_BG:
        bfile_local = outdir / "bg.meme"
        try:
            subprocess.run(["fasta-get-markov", str(fa), str(bfile_local)], check=True)
        except subprocess.CalledProcessError:
            pass  # handled by the fallback just below
        else:
            if bfile_local.exists() and bfile_local.stat().st_size > 0:
                use_bfile = bfile_local
                bfile_label = "local"

    if use_bfile is None and GLOBAL_BFILE and GLOBAL_BFILE.exists():
        use_bfile = GLOBAL_BFILE
        bfile_label = "global"

    cmd = [
        "meme", str(fa),
        "-protein",
        "-mod", MODEL,
        "-nmotifs", str(NMOTIFS),
        "-minw", str(WMIN), "-maxw", str(WMAX),
        "-evt", str(EVT),
        "-p", str(THREADS),
        "-maxsize", str(MAXSIZE),
        "-oc", str(meme_run_dir),     # -oc lets MEME overwrite/create the directory
        "-nostatus",
        "-seed", "42",
    ]
    if use_bfile:
        cmd += ["-bfile", str(use_bfile)]

    print(f"[MEME] {fa.name}: nseq={nseq}, residues={nres}, "
          f"bfile={bfile_label} "
          f"→ {meme_run_dir}")
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        failed_clades.append(fa.name)
        print(f"MEME failed for {fa.name} (exit {e.returncode}). Command was:\n{' '.join(cmd)}")

# Surface failures in one place instead of leaving them buried in the log
if failed_clades:
    print(f"MEME failed for {len(failed_clades)} clade(s): {', '.join(failed_clades)}")
print("Done!")


[KThe given ghostscript "/Users/gorkemdurmaz/miniconda3/envs/asc/bin/gs" is not an executable file.


91 94 486 221.6 20169
Clade FASTAs: ['ASCa_TrueSpiders_A.fa', 'ASCa_TrueSpiders_B.fa', 'ASCa_TrueSpiders_C.fa', 'ASCa_TrueSpiders_D.fa', 'ASCa_TrueSpiders_E.fa', 'ASCa_TrueSpiders_F.fa', 'ASCb.fa', 'ASCc.fa', 'ASH.fa', 'Chelicerate_ASCa_A.fa', 'ase.fa']
6 94 176 148.8 893
[MEME] ASCa_TrueSpiders_A.fa: nseq=6, residues=893, bfile=local → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCa_TrueSpiders_A/meme_out


[KThe given ghostscript "/Users/gorkemdurmaz/miniconda3/envs/asc/bin/gs" is not an executable file.


4 151 194 172.0 688
[MEME] ASCa_TrueSpiders_B.fa: nseq=4, residues=688, bfile=local → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCa_TrueSpiders_B/meme_out


[KThe given ghostscript "/Users/gorkemdurmaz/miniconda3/envs/asc/bin/gs" is not an executable file.


5 199 233 215.0 1075
[MEME] ASCa_TrueSpiders_C.fa: nseq=5, residues=1075, bfile=local → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCa_TrueSpiders_C/meme_out


[KThe given ghostscript "/Users/gorkemdurmaz/miniconda3/envs/asc/bin/gs" is not an executable file.


5 221 244 227.6 1138
[MEME] ASCa_TrueSpiders_D.fa: nseq=5, residues=1138, bfile=local → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCa_TrueSpiders_D/meme_out


[KThe given ghostscript "/Users/gorkemdurmaz/miniconda3/envs/asc/bin/gs" is not an executable file.


5 204 222 213.2 1066
[MEME] ASCa_TrueSpiders_E.fa: nseq=5, residues=1066, bfile=local → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCa_TrueSpiders_E/meme_out




Skip ASCa_TrueSpiders_F.fa: only 1 sequence(s). Need ≥2 for motif discovery.
14 128 280 205.9 2883
[MEME] ASCb.fa: nseq=14, residues=2883, bfile=local → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCb/meme_out


[Kcessed: 100.0%The given ghostscript "/Users/gorkemdurmaz/miniconda3/envs/asc/bin/gs" is not an executable file.
[Kcessed: 100.0%

15 127 286 227.1 3406
[MEME] ASCc.fa: nseq=15, residues=3406, bfile=local → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASCc/meme_out


The given ghostscript "/Users/gorkemdurmaz/miniconda3/envs/asc/bin/gs" is not an executable file.
[KThe given ghostscript "/Users/gorkemdurmaz/miniconda3/envs/asc/bin/gs" is not an executable file.


5 201 354 282.0 1410
[MEME] ASH.fa: nseq=5, residues=1410, bfile=local → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ASH/meme_out




[MEME] Chelicerate_ASCa_A.fa: nseq=3, residues=695, bfile=global → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/Chelicerate_ASCa_A/meme_out


The given ghostscript "/Users/gorkemdurmaz/miniconda3/envs/asc/bin/gs" is not an executable file.


[MEME] ase.fa: nseq=3, residues=1092, bfile=global → /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/ase/meme_out


The given ghostscript "/Users/gorkemdurmaz/miniconda3/envs/asc/bin/gs" is not an executable file.


Done!
