# Define clades from tree & export per-clade FASTA

In [None]:

import os, shutil, subprocess, sys
from pathlib import Path
from Bio import SeqIO

# Project layout, mirroring 0_setup.ipynb: when the working directory is named
# 'notebooks' the project root is its parent, otherwise it is the working
# directory itself.
_here = Path.cwd()
PROJ = _here.resolve().parent if _here.name == 'notebooks' else _here
DATA, OUT = PROJ / "data", PROJ / "results"
CLADES, MOTIFS, REPORTS, TREES = (
    OUT / sub for sub in ("clades", "motifs", "reports", "trees")
)

# Create every directory up front; existing ones are left untouched.
for folder in (DATA, OUT, CLADES, MOTIFS, REPORTS, TREES):
    folder.mkdir(parents=True, exist_ok=True)

# Expected inputs (some may be produced by earlier notebooks)
IN_MSA = DATA.joinpath("query.algn.fa")                  # Chetan's gapped MSA
IN_MSA_TRIMMED = DATA.joinpath("query.algn.trimmed.fa")
IN_TREE = DATA.joinpath("ASC-tree.newick")               # Provided tree - not that important at the moment
IN_TARGETS = DATA.joinpath("ASC_targets.fasta")          # Ungapped full-length sequences

for label, location in (("DATA:", DATA), ("OUT:", OUT)):
    print(label, location)


DATA: /Users/gorkemdurmaz/Desktop/asc_project_10/data
OUT: /Users/gorkemdurmaz/Desktop/asc_project_10/results


## Goals
- Load ML tree and tip labels.
- Define clades by reference tips or by manual selection.
- Export **per-clade FASTA** for downstream motif discovery.

In [None]:

import json
from io import StringIO

# Select the tree file. Candidates are tried in order, so the provided Newick
# (IN_TREE) wins over TREES / "ASC_fulllength.treefile" when both exist.
candidates = [IN_TREE, TREES / "ASC_fulllength.treefile"]
tree_path = None
for candidate in candidates:
    if candidate and candidate.exists():
        tree_path = candidate
        break

# Stop here if neither candidate is on disk.
if tree_path is None:
    raise FileNotFoundError(
        f"No tree found. Looked for: {candidates[0]} and {candidates[1]}"
    )
print(f"Using tree: {tree_path}")

# Parse tip (leaf) labels from the selected tree.
# ete3 is used when it can be imported and can read the file; otherwise a
# regex-based fallback extracts the leaf names directly from the Newick text.
tips = []
try:
    from ete3 import Tree
    # ete3 Newick flavours: format=0 reads values on internal nodes as branch
    # support, format=1 reads them as internal-node names. Try 0 first and
    # retry with 1 if the file does not parse that way.
    try:
        t = Tree(str(tree_path), format=0)
    except Exception:
        t = Tree(str(tree_path), format=1)
    tips = [n.name for n in t.iter_leaves() if n.name]
except Exception as e:
    # Deliberately broad: covers both a missing ete3 install and a parse
    # failure, since either way the fallback below is the best effort left.
    print(f"ete3 not available or failed ({e}); using simple Newick fallback.")
    import re
    newick = tree_path.read_text().strip()
    # 1) drop bracketed comments / NHX-style annotations: [...]
    s = re.sub(r"\[[^\]]*\]", "", newick)
    # 2) drop whatever label directly follows a ')': that text belongs to an
    #    internal node (support value or name), not to a leaf. Without this
    #    step support values such as "85" would be counted as tips.
    s = re.sub(r"\)[^(),:;\s]*", ")", s)
    # 3) drop branch lengths (":0.1234", ":1e-05", ...)
    s = re.sub(r":[-+0-9.eE]+", "", s)
    # 4) what remains between structural characters are the leaf labels
    #    (quoted labels containing punctuation are not handled here)
    tips = [tok for tok in re.split(r"[(),;\s]+", s) if tok]

print(f"n_tips={len(tips)}; first 10: {tips[:10]}")

# Sanity check: show the start of the raw Newick text. Files larger than
# 200 bytes are cut to their first 200 characters and marked with " ...".
raw_newick = tree_path.read_text()
if tree_path.stat().st_size > 200:
    print(raw_newick[:200] + " ...")
else:
    print(raw_newick)

# Clade map: hand-curated by visually grouping the labels in the tree.
# Keys are clade names; each one becomes the stem of a per-clade FASTA file
# (<clade>.fa) when the map is exported. Values are lists of sequence IDs that
# are looked up by FASTA record id at export time, so every entry must match a
# record id exactly.
# NOTE(review): the lists are maintained by hand and are not cross-checked
# against the parsed `tips` here - confirm after any change to the tree.
CLADE_MAP = {
    "ASH": [
        "Afun_g169.t1", "Dsuz_g12008.t1", "Dmel_lethal_of_scute_NP_476623.1",
        "Dsuz_g12010.t1", "Dmel_scute_NP_476803.1", "Dsuz_g12012.t1",
        "Dmel_achaete_NP_476824.1", "Agla_g17200.t1", "Tcas_NP_001034537.1", "Rchl_g10048.t1", "Amel_g4745.t2", "Mpha_g7485.t1"
    ],

    "ase": [
        "Dsuz_g12006.t1", "Dmel_asense_NP_476694.1", "Agla_g20912.t1",
        "Tcas_NP_001034533.1", "Bmor_g3242.t1", "Amel_g4746.t1", "Mpha_g7487.t1"
    ],

    "ASCa_TrueSpiders_F": [
        "Ogib_g6232.t1", "Hgra_g12400.t1", "Ltri_g10597.t1", "Texc_g12844.t1"
    ],

    "ASCa_TrueSpiders_E": [
        "Osin_g12133.t1", "Afer_g4276.t1", "Ppse_g1119.t1", "Ssce_g2428.t1",
        "Mspe_g5305.t1", "Lele_g12310.t1", "Ptep_aug3.g27206", "Plun_g16587.t1",
        "Tmon_g15032.t1", "Abru_g14800.t1", "Tant_g13463.t1", "Hgra_g12232.t1",
        "Ogib_g6305.t1", "Texc_g13020.t1", "Ltri_g10490.t1"
    ],

    "ASCa_TrueSpiders_D": [
        "Osin_g12129.t1", "Ppse_g1117.t1", "Afer_g4271.t2", "Ssce_g2430.t1",
        "Mspe_g5309.t1", "Tmon_g15029.t1", "Ltri_g10487.t1", "Texc_g13018.t1",
        "Ogib_g6308.t2", "Hgra_g12229.t2", "Tant_g13465.t1", "Abru_g14798.t1",
        "Lele_g12303.t1", "Ptep_aug3.g5047", "Plun_g16592.t1"
    ],

    "Chelicerate_ASCa_A": [
        "Agen_DN34338", "Ppha_c103348", "Abru_g15101.t2", "Ogib_g6231.t1",
        "Osin_g11379.t1", "Lele_g4376.t1", "Ptep_aug3.g15676", "Plun_g15253.t1"
    ],

    "ASCa_TrueSpiders_C": [
        "Texc_g13019.t1", "Hgra_g12230.t1", "Ogib_g6307.t1", "Ltri_g10489.t1",
        "Abru_g14799.t1", "Tant_g13464.t1", "Lele_g12309.t1", "Tmon_g15030.t1",
        "Ptep_aug3.g27205", "Plun_g16588.t1", "Osin_g12131.t1", "Afer_g4273.t1",
        "Ppse_g1118.t1", "Ssce_g2429.t1", "Mspe_g5306.t1"
    ],

    "ASCa_TrueSpiders_B": [
        "Osin_g854.t1", "Mspe_g3937.t1", "Afer_g6687.t1", "Lele_g370.t1",
        "Ptep_aug3.g7988", "Plun_g17032.t1", "Tant_g11058.t1", "Abru_g19762.t1",
        "Texc_g12208.t1", "Hgra_g18054.t1"
    ],

    "ASCa_TrueSpiders_A": [
        "Tmon_g14730.t1", "Ppse_g9292.t1", "Ssce_g718.t1", "Hgra_g11718.t1",
        "Hgra_g11716.t1", "Texc_g13132.t1", "Ltri_g10189.t1", "Ogib_g6616.t1",
        "Afer_g4188.t2", "Abru_g14616.t1", "Tant_g13562.t1", "Osin_g12279.t1",
        "Lele_g12082.t1", "Ptep_aug3.g5172", "Plun_g16424.t1"
    ],

    "ASCb": [
        "Dsil_g2757.t1", "Ppha_c39179", "Hgra_g13806.t1", "Ogib_g875.t1",
        "Texc_g1715.t1", "Ltri_g6024.t1", "Abru_g3556.t2", "Tmon_g1300.t1",
        "Tant_g6122.t1", "Mspe_g17677.t1", "Ssce_g14979.t1", "Lele_g13365.t1",
        "Osin_g12189.t1", "Cdip_g7827.t1", "Cscu_CSCU021373", "Cscu_CSCU022895",
        "Lpol_g7117.t1", "Lpol_g10822.t1", "Cscu_ASCb_CSCU021418", "Hari_g511.t1",
        "Lvar_g11016.t1", "Myan_g16342.t1", "Dsil_g2895.t1", "Afer_g14943.t1",
        "Ppse_g10914.t1", "Mspe_g17153.t1", "Ssce_g14571.t1", "Tmon_g313.t1",
        "Ltri_g5694.t1", "Texc_g2632.t1", "Ogib_g33.t1", "Hgra_g12831.t1",
        "Abru_g3965.t1", "Tant_g14463.t1", "Lele_g13335.t1", "Ptep_aug3.g955",
        "Plun_g5997.t1"
    ],

    "ASCc": [
        "Agen_DN60670", "Plun_g18073.t1", "Ptep_aug3.g20076", "Ltri_g694.t1",
        "Abru_g18241.t1", "Lele_g13758.t1", "Ssce_g7592.t1", "Ppse_g5899.t1",
        "Afer_g7508.t1", "Dsil_g9056.t1", "Ppha_c99819", "Llun_g201.t2",
        "Cdip_g8855.t2", "Ccar_g9585.t1", "Agla_g29010.t1", "Scar_g1896.t2",
        "Tcas_XP_008191716.1", "Sgra_g15859.t1", "Cscu_CSCU009487",
        "Gmar_c54934_g1_i1", "Lpol_g7166.t1", "Lpol_g10882.t1",
        "Lvar_g11169.t2", "Sacu_g1847.t1", "Ekan_c10052_g1_i1",
        "Agen_DN62514", "Myan_g5102.t1", "Dsil_g8160.t1", "Ppse_g5520.t1",
        "Osin_g10889.t1", "Lele_g11972.t1", "Plun_g16239.t1", "Ptep_aug3.g13882",
        "Tant_g11243.t1", "Abru_g13702.t1", "Tmon_g10748.t1", "Texc_g9525.t1",
        "Ogib_g13102.t1", "Hgra_g7453.t1"
    ]
}



# Export per-clade FASTA
# Index the ungapped target sequences by record id, then write one
# <clade>.fa file per CLADE_MAP entry, keeping the order of IDs in the map.
seqs = {rec.id: rec for rec in SeqIO.parse(IN_TARGETS, "fasta")}
for clade, ids in CLADE_MAP.items():
    # Split the requested IDs into those present in the FASTA and those not,
    # so that unmatched IDs are reported instead of being dropped silently.
    records = [seqs[sid] for sid in ids if sid in seqs]
    missing = [sid for sid in ids if sid not in seqs]

    outfa = CLADES / f"{clade}.fa"
    with outfa.open("w") as oh:
        # SeqIO.write returns the number of records it actually wrote
        n_written = SeqIO.write(records, oh, "fasta")

    # Report what is in the file, not how many IDs were requested.
    print("Wrote", outfa, "n=", n_written)
    if missing:
        print(
            f"  WARNING: {len(missing)} of {len(ids)} IDs for clade '{clade}' "
            f"not found in {IN_TARGETS}: {missing}"
        )


Using tree: /Users/gorkemdurmaz/Desktop/asc_project_10/data/ASC-tree.newick
n_tips=249; first 10: ['Mspe_g17321.t1', 'Afer_g11488.t2', 'Fcan_g12456.t1', 'Rchl_g10047.t1', 'Mpha_g7487.t1', 'Amel_g4746.t1', 'Pame_g3385.t1', 'Afun_g171.t1', 'Dmel_asense_NP_476694.1', 'Dsuz_g12006.t1']
(Mspe_g17321.t1:0.028249,(Afer_g11488.t2:0.122871,((((((((((Fcan_g12456.t1:0.777519,(((((Rchl_g10047.t1:0.075671,(Mpha_g7487.t1:0.075767,Amel_g4746.t1:0.053062)85:0.128239)98:0.883043,Pame_g3385.t1:0. ...
Wrote /Users/gorkemdurmaz/Desktop/asc_project_10/results/clades/ASH.fa n= 12
Wrote /Users/gorkemdurmaz/Desktop/asc_project_10/results/clades/ase.fa n= 7
Wrote /Users/gorkemdurmaz/Desktop/asc_project_10/results/clades/ASCa_TrueSpiders_F.fa n= 4
Wrote /Users/gorkemdurmaz/Desktop/asc_project_10/results/clades/ASCa_TrueSpiders_E.fa n= 15
Wrote /Users/gorkemdurmaz/Desktop/asc_project_10/results/clades/ASCa_TrueSpiders_D.fa n= 15
Wrote /Users/gorkemdurmaz/Desktop/asc_project_10/results/clades/Chelicerate_ASCa_A.f