# 5 · Motif post-processing & domain candidacy (ASC project)

This notebook complements your FIMO pipeline with:
- **TOMTOM clustering** of MEME motifs across clades
- **Distance-to-bHLH** analysis per motif/cluster
- **Cluster-level** specificity & enrichment (family/clade)
- **Tree** (presence matrices, binary datasets)
- **fimo plots** instead of tomtom clustering we used fimo results because it was more coherent



**Assumptions**:
- You already ran MEME (motifs per clade)
- You ran FIMO (full-length scan, producing `_summaries`)
- You have anchors in `results/bHLH_anchors.tsv`

In [None]:

import os, re, json, math, shutil, subprocess, collections
from pathlib import Path
import numpy as np, pandas as pd
from Bio import SeqIO

# --- Project paths  ---
PROJ = Path.cwd().resolve().parent if Path.cwd().name == 'notebooks' else Path.cwd()
DATA = PROJ / "data"
OUT  = PROJ / "results"
CLADES = OUT / "clades"
MOTIFS = OUT / "motifs"
REPORTS = OUT / "reports"
SUMDIR = MOTIFS / "_summaries"

for d in (OUT, CLADES, MOTIFS, REPORTS, SUMDIR):
    d.mkdir(parents=True, exist_ok=True)

IN_TARGETS = DATA / "ASC_targets.fasta"
BHLH_TSV   = OUT / "bHLH_anchors.tsv"

norm_id = lambda x: re.sub(r"[^A-Za-z0-9._-]+","", re.sub(r"\.(t?\d+)$","", (x or "").split()[0].split("|")[-1]))
def nearest(vals, x):
    if not vals: return (None, None)
    arr = np.array(vals, dtype=int)
    idx = int(np.argmin(np.abs(arr - x)))
    return (int(arr[idx]), int(arr[idx] - x))

print("Project:", PROJ)
print("Targets exist:", IN_TARGETS.exists(), "Anchors exist:", BHLH_TSV.exists())


Project: /Users/gorkemdurmaz/Desktop/asc_project_10
Targets exist: True Anchors exist: True


In [None]:
# --- AUTO-DETECT which summary tag to use 
# Prefer the strictest set (NON_BHLH_SOFT), else NON_BHLH, else ALL
CANDIDATE_TAGS = ["NON_BHLH_SOFT", "NON_BHLH", "ALL"]
avail = {tag: (SUMDIR / f"{tag}.fimo_hits.filtered_perseq.tsv").exists() for tag in CANDIDATE_TAGS}
BASE_TAG = next((t for t in CANDIDATE_TAGS if avail.get(t)), None)
if BASE_TAG is None:
    raise SystemExit("No *_fimo_hits.filtered_perseq.tsv found in _summaries/.")
fam_hits_path = SUMDIR / f"{BASE_TAG}.fimo_hits.filtered_perseq.tsv"

print(f"[Using] {fam_hits_path.name}")

# --- Load and compute distance-to-bHLH 
anchors = pd.read_csv(BHLH_TSV, sep="\t")
anchors["seq_norm"] = anchors["seq_id"].map(norm_id)
by_seq_anchors = anchors.groupby("seq_norm")["anchor_pos"].apply(lambda s: sorted(set(map(int, s)))).to_dict()

hits = pd.read_csv(fam_hits_path, sep="\t")
# Expect these columns from your 4.2: sequence_name, source_clade, motif_id, start, stop, matched_sequence,
# plus target_clade, target_family if the 4.2 saved them .
hits["seq_norm"]   = hits["sequence_name"].map(norm_id)
hits["hit_mid"]    = ((hits["start"] + hits["stop"]) // 2).astype("Int64")

def nearest(vals, x):
    if not vals: return (None, None)
    arr = np.array(vals, dtype=int)
    i = int(np.argmin(np.abs(arr - x)))
    return int(arr[i]), int(arr[i] - x)

nearest_anchor, delta = [], []
for _, r in hits.iterrows():
    mids = by_seq_anchors.get(r["seq_norm"], [])
    if pd.isna(r["hit_mid"]) or not mids:
        nearest_anchor.append(None); delta.append(None)
    else:
        na, d = nearest(mids, int(r["hit_mid"]))
        nearest_anchor.append(na); delta.append(d)

hits["nearest_bHLH_anchor"] = nearest_anchor
hits["dist_from_bHLH"]      = delta
print("Rows with distances:", hits["dist_from_bHLH"].notna().sum(), "/", len(hits))


[Using] NON_BHLH_SOFT.fimo_hits.filtered_perseq.tsv
Rows with distances: 185 / 200


In [5]:
# Map each (source_clade|motif_id) to a cluster ID (TOMTOM may be absent — then each motif is its own cluster)
hits["global_motif_id"] = hits["source_clade"] + "|" + hits["motif_id"].astype(str)
def motif_to_cluster(mid): 
    # If you ran TOMTOM and built cluster_id_of, it will map; otherwise each motif is its own cluster.
    return cluster_id_of.get(mid, f"SINGLE::{mid}")
hits["motif_cluster"] = hits["global_motif_id"].map(motif_to_cluster)

def cluster_specificity_from_hits(df: pd.DataFrame, level: str):
    """
    Compute specificity using the currently loaded hits table (no need for pre-saved FAMILY/CLADE files).
    - level: 'family' or 'clade'
    """
    if level == "family":
        key = "target_family"
        if key not in df.columns:
            raise SystemExit("target_family column not found in hits. Re-run your 4.2 to include family mapping, or map it here.")
    else:
        key = "target_clade"
        if key not in df.columns:
            raise SystemExit("target_clade column not found in hits. Re-run your 4.2 to include clade mapping, or map it here.")

    # Counts of unique sequences hit per bucket
    counts = (df.groupby(["motif_cluster", key])
                .agg(n_seq_hits=("seq_norm", "nunique"))
                .reset_index())

    # Denominators: number of sequences per bucket present in this hits table
    # (If you want denominators = all sequences per bucket, compute from CLADE FASTAs instead.)
    denom = df[["seq_norm", key]].drop_duplicates()[key].value_counts().to_dict()
    total_mapped = sum(denom.values()) if denom else np.nan

    counts["hit_rate"] = counts.apply(lambda r: r.n_seq_hits / denom.get(r[key], np.nan), axis=1)
    base = counts.groupby("motif_cluster")["n_seq_hits"].sum().rename("global_n_hits").reset_index()
    base["global_rate"] = base["global_n_hits"] / total_mapped if total_mapped else np.nan
    spec = counts.merge(base, on="motif_cluster", how="left")
    spec["enrichment"] = spec["hit_rate"] / spec["global_rate"]
    spec["log2_enrichment"] = spec["enrichment"].apply(lambda x: np.log2(x) if (pd.notna(x) and x > 0) else np.nan)
    return spec

spec_fam = cluster_specificity_from_hits(hits, "family")  # requires 'target_family' in hits
spec_cla = cluster_specificity_from_hits(hits, "clade")   # requires 'target_clade'  in hits

spec_fam.to_csv(SUMDIR / "CLUSTERS.FAMILY.specificity.tsv", sep="\t", index=False)
spec_cla.to_csv(SUMDIR / "CLUSTERS.CLADE.specificity.tsv",  sep="\t", index=False)
print("Specificity tables saved:", 
      (SUMDIR / "CLUSTERS.FAMILY.specificity.tsv").name, 
      (SUMDIR / "CLUSTERS.CLADE.specificity.tsv").name)


Specificity tables saved: CLUSTERS.FAMILY.specificity.tsv CLUSTERS.CLADE.specificity.tsv


## A) TOMTOM clustering of motifs across clades
We cluster all MEME motifs (from every clade) using TOMTOM all-vs-all to group redundant motifs.

In [6]:

meme_paths = sorted([p for p in MOTIFS.glob("*/meme_out/meme.txt") if p.is_file()])
print("Found MEME files:", len(meme_paths))

if shutil.which("tomtom") is None:
    print("WARNING: 'tomtom' not found on PATH. Skipping TOMTOM clustering section.")
    TOMTOM_AVAILABLE = False
else:
    TOMTOM_AVAILABLE = True

COMBINED_MEME = MOTIFS / "_combined_all_memes.txt"
if TOMTOM_AVAILABLE:
    with open(COMBINED_MEME, "w") as out:
        for mp in meme_paths:
            out.write(open(mp).read())
    print("Wrote:", COMBINED_MEME)

TOMDIR = MOTIFS / "_tomtom_all"
if TOMTOM_AVAILABLE:
    if TOMDIR.exists(): shutil.rmtree(TOMDIR)
    TOMDIR.mkdir(parents=True, exist_ok=True)
    cmd = ["tomtom", "-oc", str(TOMDIR), "-thresh", "0.5", "-min-overlap", "5", str(COMBINED_MEME), str(COMBINED_MEME)]
    print("Running:", " ".join(cmd))
    subprocess.run(cmd, check=False)
    print("TOMTOM done ->", TOMDIR)

cluster_id_of = {}
members = collections.defaultdict(set)

try:
    import networkx as nx
except ImportError:
    nx = None

if TOMTOM_AVAILABLE and nx and (TOMDIR / "tomtom.tsv").exists():
    tt = pd.read_csv(TOMDIR / "tomtom.tsv", sep="\t", comment="#")
    tt = tt[tt.get("q-value", tt.get("p-value", 1)).fillna(1) <= 0.1]
    G = nx.Graph()
    for _, r in tt.iterrows():
        G.add_edge(str(r["Query_ID"]), str(r["Target_ID"]))
    for i, comp in enumerate(nx.connected_components(G), 1):
        cid = f"MTFCL_{i:04d}"
        for m in comp:
            cluster_id_of[m] = cid
            members[cid].add(m)
    print("TOMTOM clusters:", len(members))
else:
    print("TOMTOM skipped — each motif treated as own cluster.")


Found MEME files: 10
Wrote: /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/_combined_all_memes.txt
Running: tomtom -oc /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/_tomtom_all -thresh 0.5 -min-overlap 5 /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/_combined_all_memes.txt /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/_combined_all_memes.txt


Provide at least 50 motifs for accurate p-value computation.
The output directory '/Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/_tomtom_all' already exists.
Its contents will be overwritten.
Processing query 1 out of 8 
# Computing q-values.
#   Cannot estimate pi_0 accurately from fewer than 100 p-values.
#   Total p-values = 8. Using pi_zero = 1.0.
Processing query 2 out of 8 
# Computing q-values.
#   Cannot estimate pi_0 accurately from fewer than 100 p-values.
#   Total p-values = 8. Using pi_zero = 1.0.
Processing query 3 out of 8 
# Computing q-values.
#   Cannot estimate pi_0 accurately from fewer than 100 p-values.
#   Total p-values = 8. Using pi_zero = 1.0.
Processing query 4 out of 8 
# Computing q-values.
#   Cannot estimate pi_0 accurately from fewer than 100 p-values.
#   Total p-values = 8. Using pi_zero = 1.0.
Processing query 5 out of 8 
# Computing q-values.
#   Cannot estimate pi_0 accurately from fewer than 100 p-values.
#   Total p-values = 8. Using p

TOMTOM done -> /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/_tomtom_all
TOMTOM skipped — each motif treated as own cluster.


## B) Load FIMO results and compute distance-to-bHLH
Attach nearest bHLH anchor and compute `dist_from_bHLH` for each hit.

In [None]:
# --- Robust distance-to-bHLH computation (works with  _summaries filenames) 
import re, numpy as np, pandas as pd

# Helpers
norm_id = lambda x: re.sub(r"[^A-Za-z0-9._-]+","", re.sub(r"\.(t?\d+)$","", (str(x) or "").split()[0].split("|")[-1]))
def nearest(vals, x):
    """Return (nearest_value, delta = nearest_value - x) or (None, None) if vals empty."""
    if not vals:
        return (None, None)
    arr = np.array(vals, dtype=int)
    i = int(np.argmin(np.abs(arr - x)))
    return int(arr[i]), int(arr[i] - x)

# 1) Load anchors
if not BHLH_TSV.exists():
    raise SystemExit(f"Missing anchors file: {BHLH_TSV}")

anchors = pd.read_csv(BHLH_TSV, sep="\t")
required_anchor_cols = {"seq_id", "anchor_pos", "ali_from", "ali_to"}
missing_anchor = required_anchor_cols - set(anchors.columns)
if missing_anchor:
    raise SystemExit(f"Anchors file missing required columns: {missing_anchor}")

anchors["seq_norm"] = anchors["seq_id"].map(norm_id)
# Use unique, sorted anchor positions per sequence
by_seq_anchors = (anchors
                  .assign(anchor_pos=pd.to_numeric(anchors["anchor_pos"], errors="coerce"))
                  .dropna(subset=["anchor_pos"])
                  .groupby("seq_norm")["anchor_pos"]
                  .apply(lambda s: sorted(set(int(v) for v in s)))
                  .to_dict())

# 2) Auto-detect which hits table to use (prefer strictest set)
CANDIDATE_TAGS = ["NON_BHLH_SOFT", "NON_BHLH", "ALL"]
existing = [tag for tag in CANDIDATE_TAGS if (SUMDIR / f"{tag}.fimo_hits.filtered_perseq.tsv").exists()]
if not existing:
    raise SystemExit(f"No *_fimo_hits.filtered_perseq.tsv found in {SUMDIR}")
BASE_TAG = existing[0]  # pick first available in priority order
fam_hits_path = SUMDIR / f"{BASE_TAG}.fimo_hits.filtered_perseq.tsv"
print(f"[Using FIMO hits] {fam_hits_path.name}")

# 3) Load hits & sanity checks
hits = pd.read_csv(fam_hits_path, sep="\t")
# Expect these columns from your 4.2 outputs (some may use slightly different names)
needed = {"sequence_name", "source_clade", "motif_id", "start", "stop", "matched_sequence"}
missing = needed - set(hits.columns)
if missing:
    # Try common alternates (rare)
    rename_map = {}
    if "motif_ID" in missing and "motif_ID" in hits.columns: rename_map["motif_ID"] = "motif_id"
    if rename_map:
        hits = hits.rename(columns=rename_map)
        missing = needed - set(hits.columns)
if missing:
    raise SystemExit(f"Hits table missing required columns: {missing}")

# 4) Normalize IDs & coerce numeric
hits["seq_norm"] = hits["sequence_name"].map(norm_id)
for c in ("start", "stop"):
    hits[c] = pd.to_numeric(hits[c], errors="coerce").astype("Int64")

# 5) Compute hit midpoints safely
hits["hit_mid"] = ((hits["start"].astype("float") + hits["stop"].astype("float")) / 2.0).round().astype("Int64")

# 6) Compute nearest bHLH anchor & distance
nearest_anchor, delta = [], []
for _, r in hits.iterrows():
    mids = by_seq_anchors.get(r["seq_norm"], [])
    hm = r["hit_mid"]
    if pd.isna(hm) or not mids:
        nearest_anchor.append(None); delta.append(None)
    else:
        na, d = nearest(mids, int(hm))
        nearest_anchor.append(na); delta.append(d)

hits["nearest_bHLH_anchor"] = pd.array(nearest_anchor, dtype="Int64")
hits["dist_from_bHLH"]      = pd.array(delta, dtype="Int64")

# 7) Report
n_with = hits["dist_from_bHLH"].notna().sum()
print(f"Rows with distances: {n_with} / {len(hits)}  (tag={BASE_TAG})")

# (optional) keep only rows with valid distances
# hits = hits[hits["dist_from_bHLH"].notna()].copy()


[Using FIMO hits] NON_BHLH_SOFT.fimo_hits.filtered_perseq.tsv
Rows with distances: 185 / 200  (tag=NON_BHLH_SOFT)


## C) Map motifs to clusters and recompute family/clade specificity.

In [9]:

hits["global_motif_id"] = hits["source_clade"] + "|" + hits["motif_id"].astype(str)
def motif_to_cluster(mid): return cluster_id_of.get(mid, f"SINGLE::{mid}")
hits["motif_cluster"] = hits["global_motif_id"].map(motif_to_cluster)

def cluster_specificity(df, level):
    key = "target_family" if level == "family" else "target_clade"
    counts = df.groupby(["motif_cluster", key]).agg(n_seq_hits=("seq_norm", "nunique")).reset_index()
    sizes = {}
    src = SUMDIR / f"ALL.{level.upper()}.fimo_hits.filtered_perseq.tsv"
    if src.exists():
        amap = pd.read_csv(src, sep="\t")[["seq_norm", key]].drop_duplicates()
        sizes = amap[key].value_counts().to_dict()
    total = sum(sizes.values()) if sizes else np.nan
    counts["hit_rate"] = counts.apply(lambda r: r.n_seq_hits / sizes.get(r[key], np.nan), axis=1)
    base = counts.groupby("motif_cluster")["n_seq_hits"].sum().rename("global_n_hits").reset_index()
    base["global_rate"] = base["global_n_hits"] / total if total else np.nan
    spec = counts.merge(base, on="motif_cluster", how="left")
    spec["enrichment"] = spec["hit_rate"] / spec["global_rate"]
    spec["log2_enrichment"] = spec["enrichment"].apply(lambda x: np.log2(x) if pd.notna(x) and x > 0 else np.nan)
    return spec

spec_fam = cluster_specificity(hits, "family")
spec_cla = cluster_specificity(hits, "clade")
spec_fam.to_csv(SUMDIR / "CLUSTERS.FAMILY.specificity.tsv", sep="\t", index=False)
spec_cla.to_csv(SUMDIR / "CLUSTERS.CLADE.specificity.tsv", sep="\t", index=False)
print("Specificity tables saved.")


Specificity tables saved.


## D) Shortlist cluster-level candidates and compute positional stats.

In [10]:

def family_cluster_shortlist(spec_fam, min_hits=3, min_log2_enrich=1.0):
    fams = ["ASCa","ASCb","ASCc"]
    pivot = (spec_fam[spec_fam["target_family"].isin(fams)]
             .pivot_table(index="motif_cluster", columns="target_family", values="n_seq_hits", fill_value=0)
             .reset_index())
    for f in fams:
        if f not in pivot.columns: pivot[f] = 0
    pivot["dominant_family"] = pivot[fams].idxmax(axis=1)
    pivot["dominant_hits"] = pivot[fams].max(axis=1)
    enr = spec_fam.groupby(["motif_cluster","target_family"])["log2_enrichment"].max().unstack(fill_value=np.nan)
    pivot = pivot.merge(enr, left_on="motif_cluster", right_index=True, how="left")
    pivot["dominant_log2_enrichment"] = pivot.apply(lambda r: r.get(r["dominant_family"], np.nan), axis=1)
    keep = (pivot["dominant_hits"] >= min_hits) & (pivot["dominant_log2_enrichment"] >= min_log2_enrich)
    return pivot[keep].sort_values("dominant_hits", ascending=False)

cluster_shortlist = family_cluster_shortlist(spec_fam)
cluster_shortlist.to_csv(SUMDIR / "CLUSTERS.family_shortlist.tsv", sep="\t", index=False)
print("Shortlist written.")

pos_stats = (hits.groupby("motif_cluster")["dist_from_bHLH"]
             .agg(n="count", median="median", iqr=lambda s: np.nanpercentile(s,75)-np.nanpercentile(s,25) if len(s)>0 else np.nan)
             .reset_index())
pos_stats.to_csv(SUMDIR / "CLUSTERS.position_stats.tsv", sep="\t", index=False)
print("Position stats written.")


Shortlist written.
Position stats written.


  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(
  return _nanquantile_unchecked(


## E) Tree visualization.

In [None]:
# Robust ETE3 plotting of motif presence next to your ASC tree — CIRCULAR (start at top)

from ete3 import Tree, TreeStyle, faces, TextFace
import pandas as pd, numpy as np, re, math
from pathlib import Path

TREE   = Path("data/ASC-tree.newick")
MATRIX = Path("results/motifs/_summaries/NON_BHLH_SOFT.motif_presence_matrix.fixed_ids.tsv")
OUTPNG = Path("results/ASC_tree_with_motifs_circular.png")
OUTSVG = Path("results/ASC_tree_with_motifs_circular.svg")   # vector for paper

# ---------- Load ----------
tree = Tree(str(TREE), format=1)
mat  = pd.read_csv(MATRIX, sep="\t", index_col=0)

# ---------- Normalize names so matrix rows match tree leaves ----------
leaf_names = [n.name for n in tree.get_leaves()]

def norm_id(x):
    s = str(x)
    s = re.sub(r"\s.*$", "", s)         # strip after first whitespace
    s = re.sub(r"\|.*$", "", s)         # strip after first pipe
    s = re.sub(r"\.(t|iso)?\d+$", "", s)  # drop trailing .t1 / .iso2 / .1
    return s

leaf_set = set(leaf_names)
row_map, hits = {}, 0
for rid in mat.index:
    candidates = [rid, norm_id(rid)]
    candidates += [c + ".t1" for c in candidates]
    m = next((c for c in candidates if c in leaf_set), None)
    if m is not None:
        row_map[rid] = m
        hits += 1
print(f"[Match] {hits} / {len(mat)} matrix rows matched to tree tips")

mat2 = mat.loc[[r for r in mat.index if r in row_map]].copy()
mat2.index = [row_map[r] for r in mat2.index]
mat2 = mat2.reindex(leaf_names).fillna(0).astype(int)

# ---------- Choose motif columns ----------
TOP_N = 20
col_counts   = mat2.sum(axis=0).sort_values(ascending=False)
cols_to_show = list(col_counts.head(TOP_N).index)
if len(cols_to_show) == 0:
    raise SystemExit("No motif columns to show (all-zero matrix after matching?).")
mat_show = mat2[cols_to_show]

# --- Prune tips with no motifs in the shown columns ---
rows_with_any = mat_show.sum(axis=1) > 0
keep_leaves = set(mat_show.index[rows_with_any])
if len(keep_leaves) == 0:
    raise SystemExit("All tips have zero motifs in the selected columns; nothing to plot.")
tree.prune(keep_leaves, preserve_branch_length=True)

# Reindex matrix to the *new* leaf order and drop now-empty motif columns
leaf_names = [n.name for n in tree.get_leaves()]
mat_show = mat_show.loc[leaf_names]
nonempty_cols = mat_show.sum(axis=0) > 0
mat_show = mat_show.loc[:, nonempty_cols]

# ---------- Colors (define AFTER pruning so it matches final columns) ----------
def distinct_colors(n):
    out = []
    for i in range(n):
        h = i / max(1, n); s, l = 0.60, 0.55
        def hue2rgb(p, q, t):
            if t < 0: t += 1
            if t > 1: t -= 1
            if t < 1/6: return p + (q - p) * 6 * t
            if t < 1/2: return q
            if t < 2/3: return p + (q - p) * (2/3 - t) * 6
            return p
        q = l + s - l * s; p = 2 * l - q
        r = hue2rgb(p, q, h + 1/3); g = hue2rgb(p, q, h); b = hue2rgb(p, q, h - 1/3)
        out.append('#%02x%02x%02x' % (int(r*255), int(g*255), int(b*255)))
    return out

motif_colors = dict(zip(list(mat_show.columns), distinct_colors(mat_show.shape[1])))

# ---------- Layout with small boxes per present motif ----------
BOX_W, BOX_H = 8, 8

def layout(node):
    if node.is_leaf():
        row = mat_show.loc[node.name]
        faces.add_face_to_node(TextFace(node.name, fsize=12), node, column=-1, position="aligned")
        col_idx = 0
        for motif, present in row.items():
            if present == 1:
                box = faces.RectFace(width=BOX_W, height=BOX_H, fgcolor="black", bgcolor=motif_colors[motif])
            else:
                box = faces.RectFace(width=BOX_W, height=BOX_H, fgcolor=None, bgcolor=None)
            faces.add_face_to_node(box, node, column=col_idx, position="aligned")
            col_idx += 1

# ---------- Tree style (CIRCULAR) ----------
ts = TreeStyle()
ts.mode = "c"                 # circular
ts.arc_start = -90            # <<< start at TOP: x=0, y>0
ts.arc_span  = 360            # full circle
ts.min_leaf_separation = 8
ts.show_leaf_name = False     # we render our own (smaller) names in layout()
ts.show_scale = False
ts.layout_fn = layout
ts.margin_left = ts.margin_right = ts.margin_top = ts.margin_bottom = 15

# Optional: cleaner look (no node circles)
for n in tree.traverse():
    n.img_style["size"] = 0

# ---------- Legend ----------
try:
    ts.legend._faces = []
except Exception:
    pass
for motif, color in motif_colors.items():
    ts.legend.add_face(faces.RectFace(width=14, height=14, fgcolor="black", bgcolor=color), column=0)
    ts.legend.add_face(TextFace("  " + motif, fsize=13), column=1)

# ---------- Render ----------
OUTPNG.parent.mkdir(parents=True, exist_ok=True)
tree.render(str(OUTPNG), tree_style=ts, w=3000, units="px", dpi=300)
tree.render(str(OUTSVG), tree_style=ts, w=2000, units="px")
print("Wrote:", OUTPNG)
print("Wrote:", OUTSVG)


[Match] 76 / 76 matrix rows matched to tree tips
Wrote: results/ASC_tree_with_motifs_circular.png
Wrote: results/ASC_tree_with_motifs_circular.svg


In [20]:
# --- Relaxed candidate extraction ---
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from pathlib import Path

PROJ = Path.cwd().resolve().parent if Path.cwd().name == 'notebooks' else Path.cwd()
REPORT = PROJ / "results" / "reports"
f = REPORT / "ASC_motif_clusters.all.tsv"
df = pd.read_csv(f, sep="\t")

# Handle missing data
df["dominant_log2_enrichment"] = pd.to_numeric(df["dominant_log2_enrichment"], errors="coerce").fillna(0)
df["dominant_hits"] = pd.to_numeric(df["dominant_hits"], errors="coerce").fillna(0)
df["pos_IQR"] = pd.to_numeric(df["pos_IQR"], errors="coerce").fillna(999)
df["is_family_specific"] = df["is_family_specific"].astype(bool)

# Define three thresholds
criteria = {
    "strict":  (df["dominant_hits"]>=3) & (df["dominant_log2_enrichment"]>=1.0) & (df["pos_IQR"]<=50) & (df["is_family_specific"]),
    "medium":  (df["dominant_hits"]>=2) & (df["dominant_log2_enrichment"]>=0.5) & (df["pos_IQR"]<=100),
    "loose":   (df["dominant_hits"]>=1) & (df["dominant_log2_enrichment"]>=0) & (df["pos_IQR"]<=200),
}

# Save filtered lists
for name, cond in criteria.items():
    out = REPORT / f"ASC_motif_clusters.{name}_candidates.tsv"
    df[cond].sort_values(["dominant_log2_enrichment","dominant_hits"], ascending=[False,False]).to_csv(out, sep="\t", index=False)
    print(f"{name:7s}: {cond.sum():2d} motifs  → {out.name}")

# Also produce a simple ranked top-20 view
top = df.sort_values(["dominant_log2_enrichment","dominant_hits"], ascending=[False,False]).head(20)
print("\nTop-ranked motifs by enrichment & hits:\n")
display(top[["motif_cluster","dominant_family","dominant_hits","dominant_log2_enrichment","pos_IQR"]])


strict :  0 motifs  → ASC_motif_clusters.strict_candidates.tsv
medium :  0 motifs  → ASC_motif_clusters.medium_candidates.tsv
loose  : 27 motifs  → ASC_motif_clusters.loose_candidates.tsv

Top-ranked motifs by enrichment & hits:



Unnamed: 0,motif_cluster,dominant_family,dominant_hits,dominant_log2_enrichment,pos_IQR
24,SINGLE::ASCb|KKDPFSPRCKIPLPTPFSPYEY,ASCb,14,0.0,2.75
32,SINGLE::ASCc|VNKENELHQRW,ASCc,13,0.0,26.0
28,SINGLE::ASCc|GCDDNYNPYLPFYDDYGGAL,ASCc,9,0.0,23.0
18,SINGLE::ASCa_TrueSpiders_E|EASSPYDALHGDEEEELMD...,ASCa,8,0.0,35.25
25,SINGLE::ASCb|KMPEYTMLTPLPLTPDVMSHDFEQ,ASCb,8,0.0,15.75
0,SINGLE::ASCa_TrueSpiders_A|EYPFDGSTEMIMPYG,ASCa,6,0.0,1.0
3,SINGLE::ASCa_TrueSpiders_A|PQGYRCDFGCPCNEG,ASCa,6,0.0,0.0
21,SINGLE::ASCa_TrueSpiders_E|QRIAPKLPHH,ASCa,6,0.0,4.0
22,SINGLE::ASCb|CELVPHYSFVNMVPP,ASCb,6,0.0,10.75
23,SINGLE::ASCb|EEEEDDDDDDDGAGLSGCSNNS,ASCb,6,0.0,45.25


# PLOT

In [None]:
# PLOT WITH CLUSTER NAMES
# === ASC domain architecture: correct MEME-# → CONSENSUS bridge, strict Tomtom RBH, robust anchors, top-K hits ===
import pandas as pd, numpy as np, re
from pathlib import Path
import matplotlib.pyplot as plt
from collections import defaultdict, deque

# -------- knobs --------
CLUSTER_MODE   = "strict"   # "strict" (RBH clusters on consensus) or "none" (color per PWM)
Q_MAX          = 0.05       # Tomtom q-value cutoff (if present)
P_MAX          = 1e-4       # Tomtom p-value cutoff (fallback if q missing)
MIN_OVERLAP    = 8          # Tomtom min overlap
TOP_K          = 3          # keep up to K non-overlapping sites per sequence
NONOVERLAP_PAD = 6          # minimal gap (aa) between kept sites
# -----------------------

# Project layout
PROJ = Path.cwd().resolve().parent if Path.cwd().name == 'notebooks' else Path.cwd()
DATA = PROJ / "data"
OUT  = PROJ / "results"
MOTIFS = OUT / "motifs"
SUM   = MOTIFS / "_summaries"
REPORT= OUT / "reports"
REPORT.mkdir(parents=True, exist_ok=True)

# Inputs
HITS_PATH = next((p for p in [
    SUM / "NON_BHLH_SOFT.fimo_hits.filtered_perseq.tsv",
    SUM / "NON_BHLH.fimo_hits.filtered_perseq.tsv",
    SUM / "ALL.fimo_hits.filtered_perseq.tsv",
] if p.exists()), None)
if HITS_PATH is None:
    raise SystemExit("No *fimo_hits.filtered_perseq.tsv found under _summaries/")

TOMTOM_TSV = MOTIFS / "_tomtom_all" / "tomtom.tsv"
if not TOMTOM_TSV.exists():
    alt = Path("tomtom.tsv")
    if alt.exists(): TOMTOM_TSV = alt
if not TOMTOM_TSV.exists():
    raise SystemExit("tomtom.tsv not found.")

COMBINED_MEME = MOTIFS / "_combined_all_memes.txt"
ANCHORS_PATH  = OUT / "bHLH_anchors.tsv"
TARGETS_FASTA = DATA / "ASC_targets.fasta"

print("[hits]   ", HITS_PATH)
print("[tomtom] ", TOMTOM_TSV)
print("[meme]   ", COMBINED_MEME, "(exists:", COMBINED_MEME.exists(), ")")
print("[anchors]", ANCHORS_PATH)

# -------- helpers --------
def norm_id(x):
    return re.sub(r"[^A-Za-z0-9._-]+","", re.sub(r"\.(t?\d+)$","", (str(x) or "").split()[0].split("|")[-1]))

def distinct_colors(n):
    cols=[]
    for i in range(max(1,n)):
        h=i/max(1,n); s=0.60; l=0.55
        def hue2rgb(p,q,t):
            if t<0: t+=1
            if t>1: t-=1
            if t<1/6: return p+(q-p)*6*t
            if t<1/2: return q
            if t<2/3: return p+(q-p)*(2/3-t)*6
            return p
        q=l+s-l*s; p=2*l-q
        r=hue2rgb(p,q,h+1/3); g=hue2rgb(p,q,h); b=hue2rgb(p,q,h-1/3)
        cols.append('#%02x%02x%02x'% (int(r*255),int(g*255),int(b*255)))
    return cols

# -------- load tables --------
hits = pd.read_csv(HITS_PATH, sep="\t")
tt   = pd.read_csv(TOMTOM_TSV, sep="\t", comment="#")
anchors = pd.read_csv(ANCHORS_PATH, sep="\t")

for c in ("motif_alt_id","sequence_name"):
    hits[c] = hits[c].astype(str).str.strip()
for c in ("Query_ID","Target_ID"):
    tt[c] = tt[c].astype(str).str.strip()

# -------- build MEME-# → CONSENSUS from combined MEME --------
# Your file looks like: "MOTIF <CONSENSUS>   MEME-#   width=..."
# So we must extract MEME-# from token3 and CONSENSUS from token2.
def build_bridge_from_combined(path: Path):
    memeid_to_cons = {}
    with open(path, "r", encoding="utf-8", errors="replace") as fh:
        for ln in fh:
            if not ln.startswith("MOTIF "): 
                continue
            # split only first 3 tokens: MOTIF, <CONSENSUS>, <REST>
            parts = ln.strip().split(maxsplit=2)
            if len(parts) < 3: 
                continue
            consensus = parts[1]               # e.g., GKRLSKVETLRSAIDYIRQLRQIL
            rest     = parts[2]               # e.g., "MEME-1\twidth = 24 ..."
            m = re.search(r'(MEME-\d+)', rest)
            if m:
                memeid_to_cons[m.group(1)] = consensus
    return pd.DataFrame(
        [(k,v) for k,v in memeid_to_cons.items()],
        columns=["meme_id","consensus"]
    )

bridge = build_bridge_from_combined(COMBINED_MEME)
if bridge.empty:
    raise SystemExit("Failed to extract MEME-# → CONSENSUS from _combined_all_memes.txt. Check the file format.")

print("[bridge] MEME ids in hits:", hits["motif_alt_id"].nunique(),
      "  in bridge:", bridge["meme_id"].nunique(),
      "  overlap:", len(set(hits["motif_alt_id"]) & set(bridge["meme_id"])))

# Map hits to consensus
hits = hits.merge(bridge, left_on="motif_alt_id", right_on="meme_id", how="left")
if hits["consensus"].isna().any():
    miss = hits[hits["consensus"].isna()]["motif_alt_id"].unique()[:10]
    print("WARNING: some MEME ids missing in bridge (showing up to 10):", miss)

# Per-PWM label if needed
if {"source_clade","motif_id"}.issubset(hits.columns):
    hits["global_motif_id"] = hits["source_clade"].astype(str) + "|" + hits["motif_id"].astype(str)
else:
    hits["global_motif_id"] = hits["motif_alt_id"].astype(str)

# -------- cluster consensuses via strict Tomtom RBH (or disable) --------
def build_clusters_strict(hits_df: pd.DataFrame, tt_df: pd.DataFrame):
    # thresholds
    if "q-value" in tt_df.columns:
        ttf = tt_df[tt_df["q-value"].fillna(1.0) <= Q_MAX].copy()
    else:
        ttf = tt_df[tt_df["p-value"].fillna(1.0) <= P_MAX].copy()
    if "Overlap" in ttf.columns:
        ttf = ttf[ttf["Overlap"].fillna(0) >= MIN_OVERLAP].copy()
    # drop self
    ttf = ttf[ttf["Query_ID"] != ttf["Target_ID"]].copy()
    # RBH on p (fallback q)
    if "p-value" in ttf.columns:
        ttf["rank_q"] = ttf.groupby("Query_ID")["p-value"].rank(method="first")
        ttf["rank_t"] = ttf.groupby("Target_ID")["p-value"].rank(method="first")
    else:
        ttf["rank_q"] = ttf.groupby("Query_ID")["q-value"].rank(method="first")
        ttf["rank_t"] = ttf.groupby("Target_ID")["q-value"].rank(method="first")
    rbh = ttf[(ttf["rank_q"]==1) & (ttf["rank_t"]==1)][["Query_ID","Target_ID"]]

    # adjacency
    adj = defaultdict(set)
    for _, r in rbh.iterrows():
        q, t = str(r["Query_ID"]), str(r["Target_ID"])
        adj[q].add(t); adj[t].add(q)
    # ensure isolated consensuses from hits are present
    for cons in hits_df["consensus"].dropna().astype(str).unique():
        _ = adj[cons]

    # BFS components
    visited=set(); cons2cluster={}
    cid=0
    for node in list(adj.keys()):
        if node in visited: continue
        cid += 1
        lab = f"MTFCL_{cid:04d}"
        dq = deque([node]); visited.add(node); cons2cluster[node]=lab
        while dq:
            u=dq.popleft()
            for v in adj[u]:
                if v not in visited:
                    visited.add(v); cons2cluster[v]=lab; dq.append(v)
    return cons2cluster

if CLUSTER_MODE == "none":
    hits["plot_label"] = hits["global_motif_id"].astype(str)
else:
    cons2cluster = build_clusters_strict(hits, tt)
    hits["plot_label"] = hits["consensus"].astype(str).map(cons2cluster)
    # singletons (consensus without RBH partners)
    mask_na = hits["plot_label"].isna() & hits["consensus"].notna()
    hits.loc[mask_na, "plot_label"] = hits.loc[mask_na, "consensus"].apply(lambda s: f"SINGLE::{s}")

print("Unique plot labels:", hits["plot_label"].nunique())

# -------- robust anchors join --------
for c in ("start","stop"):
    hits[c] = pd.to_numeric(hits[c], errors="coerce")
anchors["seq_norm_norm"] = anchors["seq_id"].map(norm_id)

# 1) normed join
hits["seq_norm"] = hits["sequence_name"].map(norm_id)
joined = hits.merge(
    anchors.rename(columns={"seq_norm_norm":"seq_norm"})[
        ["seq_norm","ali_from","ali_to","anchor_pos"]
    ],
    on="seq_norm", how="left"
)

# 2) raw id join if needed
need = joined["anchor_pos"].isna()
if need.any():
    j2 = hits.loc[need].merge(
        anchors[["seq_id","ali_from","ali_to","anchor_pos"]],
        left_on="sequence_name", right_on="seq_id", how="left"
    )
    joined.loc[need, ["ali_from","ali_to","anchor_pos"]] = j2[["ali_from","ali_to","anchor_pos"]].values

# 3) try adding ".t1"
need = joined["anchor_pos"].isna()
if need.any():
    tmp = hits.loc[need].copy()
    tmp["sequence_name_t1"] = tmp["sequence_name"].astype(str) + ".t1"
    j3 = tmp.merge(
        anchors.rename(columns={"seq_id":"seq_id_raw"})[["seq_id_raw","ali_from","ali_to","anchor_pos"]],
        left_on="sequence_name_t1", right_on="seq_id_raw", how="left"
    )
    idx = joined.index[need]
    joined.loc[idx, ["ali_from","ali_to","anchor_pos"]] = j3[["ali_from","ali_to","anchor_pos"]].values

print("Anchors matched:", joined["anchor_pos"].notna().sum(), "/", len(joined))
if joined["anchor_pos"].notna().sum() == 0 and joined["ali_from"].notna().any() and joined["ali_to"].notna().any():
    joined["anchor_pos"] = (joined["ali_from"] + joined["ali_to"]) / 2.0
    print("Using fallback anchor midpoint from ali span.")

# keep only rows with usable anchor
joined = joined.dropna(subset=["anchor_pos"]).copy()
print("Rows with usable anchors:", len(joined))

# -------- relative coords + pick top-K non-overlapping per sequence --------
for c in ("start","stop","ali_from","ali_to","anchor_pos"):
    joined[c] = pd.to_numeric(joined[c], errors="coerce")
joined["rel_start"] = joined["start"] - joined["anchor_pos"]
joined["rel_end"]   = joined["stop"]  - joined["anchor_pos"]

if "target_family" not in joined.columns:
    joined["target_family"] = "Unknown"

def pick_nonoverlapping(group):
    group = group.sort_values("p-value") if "p-value" in group.columns else group
    chosen=[]; keep=[]
    for _, r in group.iterrows():
        s,e = float(r["start"]), float(r["stop"])
        ok=True
        for cs,ce in chosen:
            if not (e < cs - NONOVERLAP_PAD or s > ce + NONOVERLAP_PAD):
                ok=False; break
        if ok:
            chosen.append((s,e)); keep.append(r)
        if len(keep) >= TOP_K: break
    return pd.DataFrame(keep)

picked = (joined.groupby("sequence_name", group_keys=False)
          .apply(pick_nonoverlapping)
          .reset_index(drop=True))

plotcols = ["sequence_name","plot_label","rel_start","rel_end","start","stop",
            "ali_from","ali_to","anchor_pos","target_family","motif_id","motif_alt_id","consensus","p-value"]
plotdf = picked[plotcols].copy()
plot_path = REPORT / "ASC_architecture.plotdata.tsv"
plotdf.to_csv(plot_path, sep="\t", index=False)
print("Wrote plotdata:", plot_path, " rows:", len(plotdf),
      " seqs:", plotdf["sequence_name"].nunique(), " labels:", plotdf["plot_label"].nunique())

# -------- plotting --------
def plot_arch(df, title, outpng):
    if df.empty:
        print(f"[Skip] {title}: no rows"); return
    labels = sorted(df["plot_label"].astype(str).unique())
    COLOR = dict(zip(labels, distinct_colors(len(labels))))

    fam_map = (df.groupby("sequence_name")["target_family"]
               .agg(lambda s: s.value_counts().idxmax() if s.notna().any() else "Unknown"))
    def seq_key(s): return (fam_map.get(s,"Unknown"), s)
    seqs = sorted(df["sequence_name"].unique(), key=seq_key)

    H = max(3.0, 0.18*len(seqs)); W = 12
    fig, ax = plt.subplots(figsize=(W,H), dpi=200)
    ymap = {s:i for i,s in enumerate(seqs)}

    # PF00010 span and baseline
    for s in seqs:
        y = ymap[s]
        sub = df[df["sequence_name"]==s].iloc[0]
        if pd.notna(sub.get("ali_from", np.nan)) and pd.notna(sub.get("ali_to", np.nan)) and pd.notna(sub.get("anchor_pos", np.nan)):
            ali_s = sub["ali_from"] - sub["anchor_pos"]
            ali_e = sub["ali_to"]   - sub["anchor_pos"]
        else:
            ali_s, ali_e = -15, 15
        ax.add_patch(plt.Rectangle((ali_s, y-0.35), max(4.0, ali_e-ali_s), 0.7,
                                   color="#9e9e9e", ec="#424242", lw=0.5, zorder=2))
        ax.plot([ali_s-200, ali_e+200], [y, y], color="#bdbdbd", lw=1.0, zorder=1)

    # motif rectangles
    for _, r in df.iterrows():
        y = ymap[r["sequence_name"]]
        x0, x1 = float(r["rel_start"]), float(r["rel_end"])
        lab = str(r["plot_label"])
        ax.add_patch(plt.Rectangle((x0, y-0.25), max(6.0, x1-x0), 0.5,
                                   color=COLOR[lab], ec="black", lw=0.3, zorder=3))

    all_x = np.concatenate([df["rel_start"].values, df["rel_end"].values])
    finite = all_x[np.isfinite(all_x)]
    q = np.quantile(np.abs(finite), 0.95) if finite.size else 200
    ax.set_xlim(-max(200,q), max(200,q))
    ax.set_ylim(-1, len(seqs)+0.5)
    ax.set_title(title)
    ax.set_xlabel("Position (aa) relative to bHLH anchor (0)")
    ax.set_yticks(range(len(seqs)))
    ax.set_yticklabels(seqs, fontsize=6)
    ax.axvline(0, color="#616161", lw=1.0, ls="--", zorder=1)

    # legend (top 20 labels)
    cov = df.groupby("plot_label")["sequence_name"].nunique().sort_values(ascending=False)
    top = [str(k) for k in cov.head(20).index]
    handles, labels = [], []
    for lab in top:
        patch = plt.Rectangle((0,0),1,1, color=COLOR[lab], ec="black", lw=0.3)
        handles.append(patch); labels.append(lab)
    if handles:
        ax.legend(handles, labels, title=("Clusters (strict RBH)" if CLUSTER_MODE!="none" else "Motifs"),
                  bbox_to_anchor=(1.02,1), loc="upper left", frameon=False, fontsize=7)

    plt.tight_layout()
    outp = REPORT / outpng
    fig.savefig(outp, bbox_inches="tight", dpi=200)
    plt.close(fig)
    print("Saved:", outp)

# Render
plot_arch(plotdf, f"ASC domain architecture (mode={CLUSTER_MODE}, top{TOP_K}/seq)", "ASC_domain_architecture.combined.png")
for fam in sorted(plotdf["target_family"].dropna().unique()):
    plot_arch(plotdf[plotdf["target_family"]==fam], f"ASC domain architecture — {fam} (mode={CLUSTER_MODE})", f"ASC_domain_architecture.{fam}.png")


In [1]:
# === Add interpretable 'family' and 'clade' labels to the plotting TSV ===
# Maps from NON_BHLH_SOFT.fimo_hits.filtered_perseq.tsv (preferred) or fallback to ALL...
# Key used: motif_key = f"{motif_id}|{motif_alt_id}"
import pandas as pd
from pathlib import Path

# Paths
PROJ = Path.cwd().resolve().parent if Path.cwd().name == 'notebooks' else Path.cwd()
OUT   = PROJ / "results"
MOTIFS= OUT / "motifs"
SUM   = MOTIFS / "_summaries"
REPORT= OUT / "reports"

PLOTDATA = REPORT / "ASC_architecture.plotdata.tsv"

# Prefer NON_BHLH_SOFT; fall back if not present
HITS_PATH = next((p for p in [
    SUM / "NON_BHLH_SOFT.fimo_hits.filtered_perseq.tsv",
    SUM / "NON_BHLH.fimo_hits.filtered_perseq.tsv",
    SUM / "ALL.fimo_hits.filtered_perseq.tsv",
] if p.exists()), None)

if not PLOTDATA.exists():
    raise SystemExit(f"Missing plotdata: {PLOTDATA}")
if HITS_PATH is None or not HITS_PATH.exists():
    raise SystemExit("Could not find a *_fimo_hits.filtered_perseq.tsv under _summaries/")

print("[Using plotdata] ", PLOTDATA)
print("[Using hits map]", HITS_PATH)

# Load
plotdf = pd.read_csv(PLOTDATA, sep="\t")
hits   = pd.read_csv(HITS_PATH, sep="\t")

# --- Build motif key in both frames (robust to missing columns) ---
def make_key(df):
    # use motif_id + motif_alt_id if both exist; else fall back to available
    m_id  = df["motif_id"].astype(str) if "motif_id" in df.columns else pd.Series(["NA"]*len(df))
    m_alt = df["motif_alt_id"].astype(str) if "motif_alt_id" in df.columns else (
            df["consensus"].astype(str) if "consensus" in df.columns else pd.Series(["NA"]*len(df)))
    return m_id.str.strip() + "|" + m_alt.str.strip()

plotdf["motif_key"] = make_key(plotdf)
hits["motif_key"]   = make_key(hits)

# --- Majority labels per motif (from hits) ---
def majority(s, default="Unknown"):
    s = s.dropna()
    return s.mode().iat[0] if len(s) else default

fam_by_key   = hits.groupby("motif_key")["target_family"].apply(majority).rename("family")
clade_by_key = hits.groupby("motif_key")["target_clade"].apply(majority).rename("clade_raw") if "target_clade" in hits.columns else pd.Series(dtype=object)

labels = fam_by_key.to_frame().join(clade_by_key, how="left")
# Resolve clade: if Unassigned or missing, use family
labels["clade"] = labels.apply(lambda r: r["clade_raw"] if pd.notna(r["clade_raw"]) and r["clade_raw"]!="Unassigned" else r["family"], axis=1)
labels = labels.drop(columns=["clade_raw"], errors="ignore").reset_index()

# --- Merge labels into plotdf ---
plotdf_lbl = plotdf.merge(labels, on="motif_key", how="left")

# Fallbacks if anything missing
plotdf_lbl["family"] = plotdf_lbl["family"].fillna("Unknown")
plotdf_lbl["clade"]  = plotdf_lbl["clade"].fillna(plotdf_lbl["family"])

# Optional: human-friendly domain label to show in legends
# e.g., "ASCb_related", "ASCc_related", or "ASCa_<clade>_related" if clade differs from family
def make_domain_label(row):
    fam = str(row["family"])
    cl  = str(row["clade"])
    if cl == fam or cl == "Unknown":
        return f"{fam}_related"
    else:
        return f"{fam}_{cl}_related"
plotdf_lbl["domain_label"] = plotdf_lbl.apply(make_domain_label, axis=1)

# Save augmented plot TSV
OUT_TSV = REPORT / "ASC_architecture.plotdata.with_family.tsv"
plotdf_lbl.to_csv(OUT_TSV, sep="\t", index=False)
print("Wrote:", OUT_TSV, " rows:", len(plotdf_lbl))

# Also write a small legend mapping from plot_label → domain_label (and counts)
legend = (plotdf_lbl.groupby(["plot_label","domain_label"])
          .agg(n_sites=("sequence_name","count"),
               n_seqs=("sequence_name","nunique"))
          .reset_index()
          .sort_values(["n_seqs","n_sites"], ascending=[False,False]))
LEGEND_TSV = REPORT / "ASC_architecture.plotlabel_to_domainlabel.tsv"
legend.to_csv(LEGEND_TSV, sep="\t", index=False)
print("Legend:", LEGEND_TSV)

# Quick peek
print("\nPreview:")
display(plotdf_lbl[["sequence_name","plot_label","motif_id","motif_alt_id","family","clade","domain_label"]].head(10))


[Using plotdata]  /Users/gorkemdurmaz/Desktop/asc_project_10/results/reports/ASC_architecture.plotdata.tsv
[Using hits map] /Users/gorkemdurmaz/Desktop/asc_project_10/results/motifs/_summaries/NON_BHLH_SOFT.fimo_hits.filtered_perseq.tsv
Wrote: /Users/gorkemdurmaz/Desktop/asc_project_10/results/reports/ASC_architecture.plotdata.with_family.tsv  rows: 134
Legend: /Users/gorkemdurmaz/Desktop/asc_project_10/results/reports/ASC_architecture.plotlabel_to_domainlabel.tsv

Preview:


Unnamed: 0,sequence_name,plot_label,motif_id,motif_alt_id,family,clade,domain_label
0,Abru_g13702.t1,MTFCL_0007,GCDDNYNPYLPFYDDYGGAL,MEME-5,ASCc,ASCc,ASCc_related
1,Abru_g13702.t1,MTFCL_0004,MVQEMSQFSTH,MEME-8,ASCc,ASCc,ASCc_related
2,Abru_g13702.t1,MTFCL_0002,VNKENELHQRW,MEME-4,ASCc,ASCc,ASCc_related
3,Abru_g14616.t1,MTFCL_0005,PQGYRCDFGCPCNEG,MEME-3,ASCa,ASCa_TrueSpiders_A,ASCa_ASCa_TrueSpiders_A_related
4,Abru_g14616.t1,MTFCL_0004,MEMFPHQDYPPQNS,MEME-8,ASCa,ASCa_TrueSpiders_A,ASCa_ASCa_TrueSpiders_A_related
5,Abru_g14798.t1,MTFCL_0007,PSYCTTPSPVMMLMSESRSPPMFZ,MEME-5,ASCa,ASCa_TrueSpiders_D,ASCa_ASCa_TrueSpiders_D_related
6,Abru_g14798.t1,MTFCL_0008,PPTHKLLLPLDRVGV,MEME-6,ASCa,ASCa_TrueSpiders_D,ASCa_ASCa_TrueSpiders_D_related
7,Abru_g14799.t1,MTFCL_0005,SGPAGGSGDLSPASSHPSDCSLV,MEME-3,ASCa,ASCa_TrueSpiders_C,ASCa_ASCa_TrueSpiders_C_related
8,Abru_g14799.t1,MTFCL_0002,QHTQDDQLMDIGLWFS,MEME-4,ASCa,ASCa,ASCa_related
9,Abru_g14799.t1,MTFCL_0007,YGQDDCSSVASSEEI,MEME-5,ASCa,ASCa_TrueSpiders_C,ASCa_ASCa_TrueSpiders_C_related


In [None]:
# The LAST one and with FIMO no tomtom

import pandas as pd
import numpy as np
import re
from pathlib import Path
import matplotlib.pyplot as plt

PROJ = Path.cwd().resolve().parent if Path.cwd().name == 'notebooks' else Path.cwd()
OUT = PROJ / "results"
FIG = OUT / "figures_new"
REPORT = OUT / "reports"
REPORT.mkdir(parents=True, exist_ok=True)

# === Load FIMO hits data ===
FIMO_FILE = PROJ / "results/motifs/_summaries/NON_BHLH_SOFT.fimo_hits.filtered_perseq.tsv"
df = pd.read_csv(FIMO_FILE, sep="\t")

print(f"Loaded {len(df)} rows from FIMO file")
print(f"Columns: {df.columns.tolist()}")

# === Load bHLH anchor positions from old dataset ===
PLOT_WFAM_UPDATED = REPORT / "ASC_architecture.plotdata.with_family.updated.tsv"
PLOT_WFAM = REPORT / "ASC_architecture.plotdata.with_family.tsv"
PLOT_BASE = REPORT / "ASC_architecture.plotdata.tsv"

if PLOT_WFAM_UPDATED.exists():
    anchor_df = pd.read_csv(PLOT_WFAM_UPDATED, sep="\t")
elif PLOT_WFAM.exists():
    anchor_df = pd.read_csv(PLOT_WFAM, sep="\t")
else:
    anchor_df = pd.read_csv(PLOT_BASE, sep="\t")

# Extract unique anchor positions per sequence
anchor_info = anchor_df[["sequence_name", "anchor_pos", "ali_from", "ali_to"]].drop_duplicates("sequence_name")
print(f"Loaded anchor positions for {len(anchor_info)} sequences")

# Merge anchor info with FIMO data
df = df.merge(anchor_info, on="sequence_name", how="left")

# === Calculate relative positions using anchor_pos ===
df["rel_start"] = df["start"] - df["anchor_pos"]
df["rel_end"] = df["stop"] - df["anchor_pos"]

# Check for sequences without anchor info
missing_anchor = df["anchor_pos"].isna().sum()
if missing_anchor > 0:
    print(f"Warning: {missing_anchor} rows missing anchor_pos (will be excluded from plots)")
    df = df[df["anchor_pos"].notna()].copy()

# === Use motif_col as plot_label (includes family/clade + motif identity) ===
if "motif_col" in df.columns:
    df["plot_label"] = df["motif_col"].astype(str)
    print("Using motif_col for family-aware motif labels")
else:
    # Fallback to old method if motif_col doesn't exist
    df["plot_label"] = df["motif_id"].astype(str) + "|" + df["motif_alt_id"].astype(str)
    print("Warning: motif_col not found, using motif_id|motif_alt_id instead")

# === Extract family and clade information ===
# Use families_hit instead of target_family to avoid unassigned proteins
if "families_hit" in df.columns:
    df["family_for_plot"] = df["families_hit"].astype(str)
    print("Using families_hit column for family assignment")
elif "target_family" in df.columns:
    df["family_for_plot"] = df["target_family"].astype(str)
    print("Warning: families_hit not found, using target_family instead")
else:
    df["family_for_plot"] = "Unknown"

if "target_clade" not in df.columns:
    df["target_clade"] = "Unknown"

# === Build consensus sequence (use matched_sequence) ===
if "matched_sequence" not in df.columns:
    df["consensus"] = df["motif_id"]
else:
    df["consensus"] = df["matched_sequence"]

# === Helpers ===
AA = set(list("ACDEFGHIKLMNPQRSTVWY"))

def short_consensus(s, k=6):
    s = str(s or "")
    s2 = "".join(ch for ch in s if ch in AA)
    return s2[:k] if s2 else s[:k]

def distinct_colors(n):
    cols = []
    for i in range(max(1, n)):
        h = i / max(1, n)
        s = 0.60
        l = 0.55
        def hue2rgb(p, q, t):
            if t < 0: t += 1
            if t > 1: t -= 1
            if t < 1/6: return p + (q - p) * 6 * t
            if t < 1/2: return q
            if t < 2/3: return p + (q - p) * (2/3 - t) * 6
            return p
        q = l + s - l * s
        p = 2 * l - q
        r = hue2rgb(p, q, h + 1/3)
        g = hue2rgb(p, q, h)
        b = hue2rgb(p, q, h - 1/3)
        cols.append('#%02x%02x%02x' % (int(r*255), int(g*255), int(b*255)))
    return cols

def majority_safe(s):
    s = s.dropna()
    s = s[s != "Unknown"]
    if s.empty:
        return "Unknown"
    m = s.mode()
    return m.iloc[0] if len(m) == 1 else "Ambiguous"

# === Position & width features ===
df["site_mid"] = (df["rel_start"] + df["rel_end"]) / 2.0
df["site_width"] = (df["stop"] - df["start"] + 1).clip(lower=1)

# === One-vote-per-sequence deduplication ===
dedup = df.drop_duplicates(["plot_label", "sequence_name"])

# === Build label table ===
pos_w = (dedup.groupby("plot_label")
         .agg(med_pos=("site_mid", "median"),
              med_w=("site_width", "median"),
              n_seqs=("sequence_name", "nunique"))
         .reset_index())

rep_cons = (df.groupby("plot_label")["consensus"]
            .apply(majority_safe)
            .rename("rep_consensus"))

dom_fam = (dedup.groupby("plot_label")["family_for_plot"]
           .apply(majority_safe)
           .rename("dominant_family"))

dom_cla = (dedup.groupby("plot_label")["target_clade"]
           .apply(majority_safe)
           .rename("dominant_clade"))

label_tbl = (pos_w.merge(rep_cons.reset_index(), on="plot_label", how="left")
             .merge(dom_fam.reset_index(), on="plot_label", how="left")
             .merge(dom_cla.reset_index(), on="plot_label", how="left"))

# === Position bin ===
def pos_bin(x):
    if pd.isna(x): return "posNA"
    if x < -150: return "N-dist"
    if x < -50: return "N-prox"
    if x <= 50: return "bHLH-prox"
    if x <= 150: return "C-prox"
    return "C-dist"

label_tbl["pos_bin"] = label_tbl["med_pos"].apply(pos_bin)
label_tbl["rep6"] = label_tbl["rep_consensus"].apply(short_consensus)
label_tbl["med_w"] = label_tbl["med_w"].round(0).astype("Int64")

# Fill NaN for display
if "dominant_family" not in label_tbl.columns:
    label_tbl["dominant_family"] = "Unknown"
if "dominant_clade" not in label_tbl.columns:
    label_tbl["dominant_clade"] = "Unknown"

label_tbl["clade_display"] = label_tbl["dominant_clade"].fillna(label_tbl["dominant_family"])

# === Build nice labels with clade info and motif sequence ===
def extract_family_clade(motif_col_str):
    """Extract family and clade from motif_col like 'ASCa_TrueSpiders_A|MOTIF'"""
    parts = str(motif_col_str).split("|")[0].split("_")
    family = parts[0] if parts else "Unknown"
    clade = "_".join(parts[1:]) if len(parts) > 1 else None
    return family, clade

label_tbl["motif_col_family"], label_tbl["motif_col_clade"] = zip(*label_tbl["plot_label"].apply(extract_family_clade))

# Get first 6 AA from motif_id for each plot_label
motif_id_map = df.groupby("plot_label")["motif_id"].first().to_dict()
label_tbl["motif_seq"] = label_tbl["plot_label"].map(motif_id_map).apply(short_consensus)

# Build nice label with clade if specific, otherwise just family
def build_nice_label(row):
    family = row["motif_col_family"]
    clade = row["motif_col_clade"]
    motif_seq = row["motif_seq"]
    med_w = row["med_w"]
    pos_bin = row["pos_bin"]
    
    # Include clade if it's clade-specific (not just family name)
    if clade and clade not in ["", "Unknown"]:
        prefix = f"{family}_{clade}"
    else:
        prefix = family
    
    return f"{prefix} • {motif_seq} (w{med_w}, {pos_bin})"

label_tbl["nice_label"] = label_tbl.apply(build_nice_label, axis=1)

# Site counts
site_counts = (df.groupby("plot_label")["sequence_name"]
               .size()
               .rename("n_sites")
               .reset_index())

label_tbl = label_tbl.merge(site_counts, on="plot_label", how="left")
label_tbl["n_sites"] = label_tbl["n_sites"].fillna(0).astype("Int64")

# === Save mapping ===
MAP_TSV = REPORT / "ASC_plotlabel_to_nicelabel_fimo.tsv"
label_tbl[["plot_label", "nice_label", "rep_consensus", "med_w", "med_pos",
           "pos_bin", "dominant_family", "dominant_clade", "n_seqs", "n_sites"]].to_csv(
    MAP_TSV, sep="\t", index=False)
print(f"Wrote mapping: {MAP_TSV} (n clusters: {len(label_tbl)})")

# === Family color palette ===
FAM_COLOR = {
    "ASCa": "#2e7d32",     # green
    "ASCb": "#1565c0",     # blue
    "ASCc": "#c62828",     # red
    "ASH": "#ef6c00",      # orange
    "Ase": "#00897b",      # teal
    "ase": "#00897b",      # teal
    "Unassigned": "#9e9e9e",
    "Unknown": "#9e9e9e",
    "1": "#2e7d32",        # families_hit=1 often ASCa
    "2": "#1565c0",        # families_hit=2 often ASCb
    "3": "#c62828",        # families_hit=3 often ASCc
}

def family_color(f):
    """Get color for a family, checking multiple possible values"""
    f_str = str(f)
    # First check exact match
    if f_str in FAM_COLOR:
        return FAM_COLOR[f_str]
    # Check if it starts with a known family name
    for fam_key in ["ASCa", "ASCb", "ASCc", "ASH", "ase", "Ase"]:
        if f_str.startswith(fam_key):
            return FAM_COLOR[fam_key]
    return FAM_COLOR["Unknown"]

def compute_x_extent(df_in):
    """Return symmetric x-limits based on farthest observed motif position"""
    if {"rel_start", "rel_end"}.issubset(df_in.columns):
        all_x = np.concatenate([df_in["rel_start"].values, df_in["rel_end"].values])
        finite = all_x[np.isfinite(all_x)]
    else:
        finite = np.array([])
    if finite.size:
        max_abs = float(np.nanmax(np.abs(finite)))
    else:
        max_abs = 200.0
    pad = max(50.0, 0.05 * max_abs)
    lim = max_abs + pad
    return -lim, lim

# === Plotter ===
def plot_arch_with_nice_legend(df_in, title, outpng, color_by="plot_label"):
    if df_in.empty:
        print(f"[Skip] {title}: no rows")
        return

    # Color key for motif rectangles
    if color_by == "plot_label":
        labels = sorted(df_in["plot_label"].astype(str).unique())
        COLOR = dict(zip(labels, distinct_colors(len(labels))))
        legend_items = labels
        legend_label_map = dict(zip(label_tbl["plot_label"].astype(str),
                                   label_tbl["nice_label"].astype(str)))
        def label_to_color(row): return COLOR.get(str(row["plot_label"]), "#cccccc")
        def label_to_legend_text(k): return legend_label_map.get(k, k)
        legend_title = "Motif clusters"
    elif color_by == "target_clade":
        clades = sorted(df_in["target_clade"].astype(str).fillna("Unassigned").unique())
        COLOR = dict(zip(clades, distinct_colors(len(clades))))
        legend_items = clades
        def label_to_color(row): return COLOR.get(str(row.get("target_clade", "Unassigned")), "#cccccc")
        def label_to_legend_text(k): return k
        legend_title = "Subclades"
    else:
        labels = sorted(df_in["plot_label"].astype(str).unique())
        COLOR = dict(zip(labels, distinct_colors(len(labels))))
        legend_items = labels
        legend_label_map = dict(zip(label_tbl["plot_label"].astype(str),
                                   label_tbl["nice_label"].astype(str)))
        def label_to_color(row): return COLOR.get(str(row["plot_label"]), "#cccccc")
        def label_to_legend_text(k): return legend_label_map.get(k, k)
        legend_title = "Motif clusters"

    # Per-sequence family - use source_clade and map to ASCa/ASCb/ASCc
    # ASCb → ASCb, ASCc → ASCc, everything else (ASH, ase, ASCa_*) → ASCa
    if "source_clade" in df_in.columns:
        def map_to_family(clade):
            clade_str = str(clade)
            if clade_str == "ASCb":
                return "ASCb"
            elif clade_str == "ASCc":
                return "ASCc"
            else:
                return "ASCa"  # ASH, ase, ASCa_TrueSpiders_*, etc.
        
        fam_map = df_in.groupby("sequence_name")["source_clade"].first().apply(map_to_family).to_dict()
    else:
        # Fallback
        fam_map = {}
    
    def seq_key(s): return (fam_map.get(s, "ASCa"), s)

    seqs = sorted(df_in["sequence_name"].unique(), key=seq_key)

    # Dynamic x-range
    x_left, x_right = compute_x_extent(df_in)

    H = max(3.0, 0.18 * len(seqs))
    W = 12
    fig, ax = plt.subplots(figsize=(W, H), dpi=200)
    ymap = {s: i for i, s in enumerate(seqs)}

    # Draw baselines and bHLH boxes with true lengths
    for s in seqs:
        y = ymap[s]
        # Simple baseline
        ax.plot([x_left, x_right], [y, y], color="#bdbdbd", lw=1.0, zorder=1)
        
        # Get actual bHLH span for this sequence from anchor_df
        seq_anchor = df_in[df_in["sequence_name"] == s].iloc[0]
        if pd.notna(seq_anchor.get("ali_from")) and pd.notna(seq_anchor.get("ali_to")) and pd.notna(seq_anchor.get("anchor_pos")):
            ali_s = seq_anchor["ali_from"] - seq_anchor["anchor_pos"]
            ali_e = seq_anchor["ali_to"] - seq_anchor["anchor_pos"]
            bhlh_width = ali_e - ali_s
        else:
            # Fallback to default
            ali_s, ali_e = -15, 15
            bhlh_width = 30
        
        # bHLH box with true width
        ax.add_patch(plt.Rectangle((ali_s, y - 0.35), bhlh_width, 0.7,
                                   color="#9e9e9e", ec="#424242", lw=0.5, zorder=2))

    # Draw motif rectangles with true widths and transparency
    for _, r in df_in.iterrows():
        y = ymap[r["sequence_name"]]
        x0, x1 = float(r.get("rel_start", np.nan)), float(r.get("rel_end", np.nan))
        if np.isnan(x0) or np.isnan(x1):
            continue
        # Use actual width (no minimum)
        motif_width = x1 - x0
        ax.add_patch(plt.Rectangle((x0, y - 0.25), motif_width, 0.5,
                                   color=label_to_color(r), ec="black", lw=0.3, 
                                   alpha=0.7, zorder=3))  # alpha=0.7 for transparency

    # Axis setup
    ax.set_xlim(x_left, x_right)
    ax.set_ylim(-1, len(seqs) + 0.5)
    ax.set_title(title)
    ax.set_xlabel("Position (aa) relative to reference")
    ax.axvline(0, color="#616161", lw=1.0, ls="--", zorder=1)

    # Colored y-axis labels by family
    ytick_positions = range(len(seqs))
    ytick_labels = []
    ytick_colors = []
    for s in seqs:
        fam = fam_map.get(s, "Unknown")
        color = family_color(fam)
        ytick_labels.append(s)
        ytick_colors.append(color)
    ax.set_yticks(ytick_positions)
    for ticklabel, color in zip(ax.set_yticklabels(ytick_labels, fontsize=6), ytick_colors):
        ticklabel.set_color(color)

    # Legend
    if color_by == "plot_label":
        cov = df_in.groupby("plot_label")["sequence_name"].nunique().sort_values(ascending=False)
        show_keys = [str(k) for k in cov.head(20).index]
    else:
        show_keys = legend_items

    handles, leg_labels = [], []
    for k in show_keys:
        patch = plt.Rectangle((0, 0), 1, 1, color=COLOR.get(k, "#cccccc"), ec="black", lw=0.3)
        handles.append(patch)
        leg_labels.append(label_to_legend_text(k))
    if handles:
        ax.legend(handles, leg_labels, title=legend_title,
                  bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False, fontsize=7)

    plt.tight_layout()

    outp = FIG / outpng
    fig.savefig(outp, bbox_inches="tight", dpi=200)
    plt.close(fig)
    print(f"Saved: {outp}")

# === Generate plots ===
FIG.mkdir(parents=True, exist_ok=True)

# 1) Combined plot
plot_arch_with_nice_legend(df, "ASC domain architecture (FIMO-based)",
                          "ASC_domain_architecture_fimo.combined.png")

# 2) Per-family plots
if "family_for_plot" in df.columns:
    for fam in sorted(df["family_for_plot"].dropna().unique()):
        if fam == "Unknown":
            continue
        sub = df[df["family_for_plot"] == fam]
        plot_arch_with_nice_legend(sub, f"ASC domain architecture — {fam}",
                                  f"ASC_domain_architecture_fimo.{fam}.png")

        # 3) ASCa extras: per-subclade + clade-colored overview
        if str(fam) == "ASCa" and "target_clade" in sub.columns:
            clades = [c for c in sorted(sub["target_clade"].dropna().unique())
                     if c != "Unassigned"]
            for cl in clades:
                subc = sub[sub["target_clade"] == cl]
                safe_cl = re.sub('[^A-Za-z0-9_.-]+', '_', str(cl))
                plot_arch_with_nice_legend(subc,
                                          f"ASC domain architecture — ASCa • subclade {cl}",
                                          f"ASC_domain_architecture_fimo.ASCa_subclade_{safe_cl}.png")
            
            # Clade-colored overview
            plot_arch_with_nice_legend(sub,
                                      "ASC domain architecture — ASCa (colored by subclade)",
                                      "ASC_domain_architecture_fimo.ASCa_by_subclade.png",
                                      color_by="target_clade")

# === Extra ASCa plots with merged subclades ===
print("\n=== Generating merged subclade plots ===")

# Safety: require these columns
need_cols = {"target_clade", "sequence_name", "rel_start", "rel_end", "plot_label"}
missing = need_cols - set(df.columns)
if missing:
    print(f"[Skip] Missing columns for merged plots: {missing}")
else:
    # Mapping: merge selected subclades into new combined labels
    MERGE_MAP = {
        "ASCa_TrueSpiders_C": "Chelicerate ASCa",
        "ASCa_TrueSpiders_D": "Chelicerate ASCa",
        "ASCa_TrueSpiders_E": "ASCa3",
        "ASCa_TrueSpiders_F": "ASCa3",
    }

    # Filter to ASCa using source_clade (which includes all ASCa subclades, ASH, ase)
    df_asca = df[df["source_clade"].str.startswith("ASCa", na=False) | 
                  df["source_clade"].isin(["ASH", "ase"])].copy()
    
    if df_asca.empty:
        print("[Skip] No ASCa rows found.")
    else:
        # Create a merged clade column (default to original)
        df_asca["target_clade_merged"] = df_asca["target_clade"].astype(str).apply(
            lambda x: MERGE_MAP.get(x, x)
        )

        # ---------- Plot 1: Chelicerate ASCa (C + D merged) ----------
        chelic_keep = {"ASCa_TrueSpiders_C", "ASCa_TrueSpiders_D"}
        sub_chelic = df_asca[df_asca["target_clade"].isin(chelic_keep)].copy()
        if sub_chelic.empty:
            print("[Skip] No rows for Chelicerate ASCa (C/D).")
        else:
            # Swap in merged label as the plotting clade
            sub_chelic["target_clade"] = sub_chelic["target_clade_merged"]
            plot_arch_with_nice_legend(
                sub_chelic,
                "ASC domain architecture — ASCa (Chelicerate ASCa: C+D merged)",
                "ASC_domain_architecture_fimo.ASCa_Chelicerate_merged.png",
                color_by="plot_label"  # motifs colored by motif cluster
            )
            print("Created: ASCa_Chelicerate_merged.png")

        # ---------- Plot 2: ASCa3 (E + F merged) ----------
        asca3_keep = {"ASCa_TrueSpiders_E", "ASCa_TrueSpiders_F"}
        sub_asca3 = df_asca[df_asca["target_clade"].isin(asca3_keep)].copy()
        if sub_asca3.empty:
            print("[Skip] No rows for ASCa3 (E/F).")
        else:
            sub_asca3["target_clade"] = sub_asca3["target_clade_merged"]
            plot_arch_with_nice_legend(
                sub_asca3,
                "ASC domain architecture — ASCa (ASCa3: E+F merged)",
                "ASC_domain_architecture_fimo.ASCa_ASCa3_merged.png",
                color_by="plot_label"  # motifs colored by motif cluster
            )
            print("Created: ASCa_ASCa3_merged.png")

print("\n=== Done! All plots generated. ===")

Loaded 200 rows from FIMO file
Columns: ['motif_id', 'motif_alt_id', 'sequence_name', 'start', 'stop', 'strand', 'score', 'p-value', 'q-value', 'matched_sequence', 'source_clade', 'seq_norm', 'target_clade', 'hit_mid', 'overlap_bHLH', 'is_bHLH_like', 'target_family', 'match_entropy', 'entropy_flag', 'families_hit', 'motif_col']
Loaded anchor positions for 74 sequences
Using motif_col for family-aware motif labels
Using families_hit column for family assignment
Wrote mapping: /Users/gorkemdurmaz/Desktop/asc_project_10/results/reports/ASC_plotlabel_to_nicelabel_fimo.tsv (n clusters: 27)
Saved: /Users/gorkemdurmaz/Desktop/asc_project_10/results/figures_new/ASC_domain_architecture_fimo.combined.png
Saved: /Users/gorkemdurmaz/Desktop/asc_project_10/results/figures_new/ASC_domain_architecture_fimo.1.png
Saved: /Users/gorkemdurmaz/Desktop/asc_project_10/results/figures_new/ASC_domain_architecture_fimo.2.png
Saved: /Users/gorkemdurmaz/Desktop/asc_project_10/results/figures_new/ASC_domain_archi