In [None]:
# =========================
# 0) Setup & configuration
# =========================
from pathlib import Path
import subprocess, sys, re, textwrap, os
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# ---- INPUTS  ----
FIMO_PERSEQ = Path("results/motifs/_summaries/ALL.fimo_hits.filtered_perseq.tsv")
TARGET_FASTA = Path("data/ASC_targets.fasta")   # all target proteins used in FIMO
# bHLH coordinates table: one row per protein with columns: id, start, end (1-based, inclusive)
# If you don't have this exact file, point to your HMMER domtblout-derived table (same columns)
BHLH_COORDS  = Path("results/bHLH_anchors.tsv")

# Optional mapping file: two columns [id, clade] where clade in {"insect","spider"}
# If not provided, a regex-based classifier below will try to infer from IDs.
TAXON_MAP = Path("data/id_to_clade.tsv")  # set to None if you don't have it
if not TAXON_MAP.exists():
    TAXON_MAP = None

# ---- OUTPUTS ----
OUTDIR = Path("results/weblogos_bHLH_by_lineage")
OUTDIR.mkdir(parents=True, exist_ok=True)

# MAFFT required for alignment-based logos 
MAFFT = "mafft"   # ensure it's on PATH (eg, conda install mafft)


In [None]:

# 1) Load FIMO per-seq list

# Expecting a column that holds sequence/protein IDs. Common names: 'sequence_name', 'seq_id', or 'target'
# We will try a few and fail helpfully if none present.
fimo = pd.read_csv(FIMO_PERSEQ, sep="\t")
cand_cols = [c for c in fimo.columns if c.lower() in {"sequence_name", "seq_name", "target", "seq_id", "sequence"}]
if not cand_cols:
    raise ValueError(f"Couldn't find an ID column in {FIMO_PERSEQ}. Columns: {list(fimo.columns)}")
id_col = cand_cols[0]
ids_from_fimo = set(fimo[id_col].astype(str).unique())
len(ids_from_fimo), list(sorted(list(ids_from_fimo))[:5])[:5]


(91,
 ['Abru_g13702.t1',
  'Abru_g14616.t1',
  'Abru_g14798.t1',
  'Abru_g14799.t1',
  'Abru_g14800.t1'])

In [None]:

# 2) Load bHLH coords & restrict to IDs

coords = pd.read_csv(BHLH_COORDS, sep="\t")

# helper to pick the first existing column from a preference list
def pick_col(df_cols_lower_to_orig, *candidates):
    for c in candidates:
        if c in df_cols_lower_to_orig:
            return df_cols_lower_to_orig[c]
    return None

# map lowercase -> original
lower2orig = {c.lower(): c for c in coords.columns}

col_id = pick_col(lower2orig, "id", "seq_id", "protein", "name")
if col_id is None:
    raise ValueError(f"No ID column found in {BHLH_COORDS}. Have: {list(coords.columns)}")

# prefer env_* (domain envelope) then ali_* (aligned core) then generic start/end
col_start = pick_col(lower2orig, "env_from", "ali_from", "start", "query_start")
col_end   = pick_col(lower2orig, "env_to",   "ali_to",   "end",   "query_end")
if col_start is None or col_end is None:
    raise ValueError(
        f"Couldn't find start/end-like columns in {BHLH_COORDS}. "
        f"Looked for one of env_from/ali_from/start and env_to/ali_to/end. "
        f"Have: {list(coords.columns)}"
    )

coords = coords[[col_id, col_start, col_end]].rename(
    columns={col_id: "id", col_start: "start", col_end: "end"}
)
coords["id"] = coords["id"].astype(str)
coords["start"] = coords["start"].astype(int)
coords["end"]   = coords["end"].astype(int)

# Keep only proteins that appear in ALL.fimo_hits.filtered_perseq.tsv
coords = coords[coords["id"].isin(ids_from_fimo)].drop_duplicates("id")
print("bHLH coord rows kept:", len(coords))
coords.head()


bHLH coord rows kept: 89


Unnamed: 0,id,start,end
0,Abru_g13702.t1,117,168
1,Abru_g14616.t1,58,110
2,Abru_g14798.t1,82,152
3,Abru_g14799.t1,91,143
4,Abru_g14800.t1,110,162


In [None]:

# 3) Load sequences and slice bHLH subsequences

# FASTA IDs must match the 'id' values above (after whatever normalization you already use).
seq_index = {}
for rec in SeqIO.parse(str(TARGET_FASTA), "fasta"):
    seq_index[rec.id] = rec.seq

# sanity & slicing
def safe_slice(seq, start1, end1):
    # coords are 1-based inclusive; convert to 0-based slice
    s = max(1, int(start1))
    e = int(end1)
    if s > e: s, e = e, s
    s0 = s-1
    # clamp
    s0 = max(0, min(s0, len(seq)))
    e0 = max(0, min(e, len(seq)))
    return seq[s0:e0]

missing = [pid for pid in coords["id"] if pid not in seq_index]
if missing:
    print(f"WARNING: {len(missing)} IDs have coords but not found in FASTA. (Showing up to 10): {missing[:10]}")

records = []
for row in coords.itertuples(index=False):
    pid, s, e = row.id, row.start, row.end
    if pid not in seq_index: 
        continue
    frag = safe_slice(seq_index[pid], s, e)
    if len(frag) < 20:
        # too short to be a meaningful bHLH; skip
        continue
    records.append(SeqRecord(Seq(str(frag)), id=pid, description=f"bHLH:{s}-{e}"))

print("bHLH subsequences:", len(records))


bHLH subsequences: 89


In [None]:

# 4) Assign clades (spider vs insect) for each protein

import re
from collections import defaultdict

# Determined from ASC_targets.fasta
PREFIX_TO_CLADE = {
    # spiders / chelicerates
    "Ptep": "spider",   # Parasteatoda tepidariorum
    "Isca": "spider",   # Ixodes scapularis (tick; chelicerate)
    "Abru": "spider",   # Argiope bruennichi
    "Afer": "spider",   # (assumed) Alopecosa ferox / spider lineage
    "Lpol": "spider",   # (assumed) Latrodectus/Loxosceles lineage
    "Ppha": "spider",   # Pholcus phalangioides
    "Dsil": "spider",   # (assumed) Dysdera/Dolomedes silvestris
    "Hgra": "spider",   # (assumed) Habronattus/Hasarius gramineus
    "Ssce": "spider",   # (assumed) Stegodyphus lineage

    # insects
    "Dmel": "insect",   # Drosophila melanogaster
    "Amel": "insect",   # Apis mellifera
    "Tcas": "insect",   # Tribolium castaneum
    "Gmar": "insect",   # (assumed) Gryllus/Gerris marginatus
    "Cdip": "insect",   # (assumed) insect lineage
}

def id_prefix(pid: str) -> str:
    # take token before first underscore, e.g. "Abru" from "Abru_g14616.t1"
    return pid.split("_", 1)[0]

def guess_clade_from_prefix(pid: str) -> str:
    pref = id_prefix(pid)
    return PREFIX_TO_CLADE.get(pref, "unknown")

id2clade = {r.id: guess_clade_from_prefix(r.id) for r in records}

# partition
spider_recs = [r for r in records if id2clade.get(r.id) == "spider"]
insect_recs = [r for r in records if id2clade.get(r.id) == "insect"]
unknown_recs= [r for r in records if id2clade.get(r.id) == "unknown"]
print(f"Insects: {len(insect_recs)}  |  Spiders: {len(spider_recs)}  |  Unknown: {len(unknown_recs)}")

# optional: write unknowns to a file for manual fix (should be empty with current FASTA)
if unknown_recs:
    with open(OUTDIR/"unknown_prefix_ids.txt","w") as fh:
        for r in unknown_recs:
            fh.write(r.id + "\n")


Insects: 15  |  Spiders: 74  |  Unknown: 0


In [None]:
#  RE-RUN SAFE BLOCK: split -> write FASTA -> MAFFT -> logos 
from pathlib import Path
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import subprocess, sys
import pandas as pd

# 0) Ensure OUTDIR exists
OUTDIR = Path("results/weblogos_bHLH_by_lineage")
OUTDIR.mkdir(parents=True, exist_ok=True)

# 1) Prefix-based clade mapping for your IDs
PREFIX_TO_CLADE = {
    # spiders / chelicerates
    "Ptep": "spider", "Isca": "spider", "Abru": "spider", "Afer": "spider",
    "Lpol": "spider", "Ppha": "spider", "Dsil": "spider", "Hgra": "spider", "Ssce": "spider",
    # insects
    "Dmel": "insect", "Amel": "insect", "Tcas": "insect", "Gmar": "insect", "Cdip": "insect",
}
def _prefix(pid: str) -> str: return pid.split("_", 1)[0]
def _clade(pid: str) -> str: return PREFIX_TO_CLADE.get(_prefix(pid), "unknown")

# 2) Expect 'records' already holds sliced bHLH SeqRecords (from the coords + FASTA step)
assert 'records' in globals() and len(records) > 0, "Missing 'records' (bHLH subsequences). Run the slicing cell first."

spider_recs = [r for r in records if _clade(r.id) == "spider"]
insect_recs = [r for r in records if _clade(r.id) == "insect"]
unknown_recs= [r for r in records if _clade(r.id) == "unknown"]

print(f"Insects: {len(insect_recs)} | Spiders: {len(spider_recs)} | Unknown: {len(unknown_recs)}")
if unknown_recs:
    with open(OUTDIR/"unknown_prefix_ids.txt","w") as fh:
        for r in unknown_recs: fh.write(r.id + "\n")
    print("Wrote unknown prefixes to", OUTDIR/"unknown_prefix_ids.txt")

# 3) Define and write raw FASTAs (this is where fa_spiders/fa_insects are created)
fa_spiders = OUTDIR / "bHLH_spiders.raw.faa"
fa_insects = OUTDIR / "bHLH_insects.raw.faa"
if len(spider_recs) >= 1: SeqIO.write(spider_recs, str(fa_spiders), "fasta")
if len(insect_recs) >= 1: SeqIO.write(insect_recs, str(fa_insects), "fasta")

# 4) MAFFT wrapper (define if not already)
try:
    run_mafft
except NameError:
    MAFFT = "mafft"
    def run_mafft(inp: Path, outp: Path, add_args=("--auto",)):
        if not inp.exists() or sum(1 for _ in SeqIO.parse(str(inp), "fasta")) < 2:
            print(f"Skip MAFFT: not enough sequences in {inp}")
            return False
        cmd = [MAFFT, *add_args, str(inp)]
        print("Running:", " ".join(cmd))
        res = subprocess.run(cmd, capture_output=True, text=True)
        if res.returncode != 0:
            print("MAFFT error:\n", res.stderr[:2000])
            return False
        outp.write_text(res.stdout)
        return True

# 5) Align
aln_spiders = OUTDIR / "bHLH_spiders.aln.faa"
aln_insects = OUTDIR / "bHLH_insects.aln.faa"
ok_sp = run_mafft(fa_spiders, aln_spiders) if len(spider_recs) >= 2 else False
ok_in = run_mafft(fa_insects, aln_insects) if len(insect_recs) >= 2 else False
print("Aligned:", {"spider": ok_sp, "insect": ok_in})

# 6) Logos (define helper if missing)
try:
    alignment_to_logo
except NameError:
    def _pip(pkg):
        try: __import__(pkg)
        except ImportError: subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
    _pip("logomaker"); _pip("matplotlib")

    import matplotlib.pyplot as plt
    import logomaker as lm

    def alignment_to_logo(aln_faa: Path, out_png: Path, out_svg: Path):
        seqs = [str(rec.seq) for rec in SeqIO.parse(str(aln_faa), "fasta")]
        if len(seqs) < 2:
            print(f"Not enough seqs to make a logo: {aln_faa}")
            return
        L = len(seqs[0])
        if not all(len(s)==L for s in seqs):
            raise ValueError(f"Alignment {aln_faa} has variable lengths; check MAFFT.")
        alphabet = list("ACDEFGHIKLMNPQRSTVWY")
        cols = []
        for i in range(L):
            d = {aa:0 for aa in alphabet}
            for s in seqs:
                if s[i] in d: d[s[i]] += 1
            cols.append(d)
        df = pd.DataFrame(cols)
        fig = plt.figure(figsize=(14, 3.2))
        ax = fig.add_subplot(111)
        lm.Logo(df, ax=ax, shade_below=.5, fade_below=.5, vpad=0.05, width=.98)
        ax.set_xlabel("Alignment position"); ax.set_ylabel("Information")
        ax.set_title(aln_faa.name.replace(".aln.faa","").replace("_"," ").upper())
        plt.tight_layout()
        fig.savefig(out_png, dpi=300); fig.savefig(out_svg); plt.close(fig)
        print("Saved:", out_png, "|", out_svg)

# 7) Build logos if alignments exist
if ok_sp:
    alignment_to_logo(aln_spiders, OUTDIR/"bHLH_spiders.logo.png", OUTDIR/"bHLH_spiders.logo.svg")
if ok_in:
    alignment_to_logo(aln_insects, OUTDIR/"bHLH_insects.logo.png", OUTDIR/"bHLH_insects.logo.svg")


Insects: 15 | Spiders: 74 | Unknown: 0
Running: mafft --auto results/weblogos_bHLH_by_lineage/bHLH_spiders.raw.faa
Running: mafft --auto results/weblogos_bHLH_by_lineage/bHLH_insects.raw.faa
Aligned: {'spider': True, 'insect': True}
Saved: results/weblogos_bHLH_by_lineage/bHLH_spiders.logo.png | results/weblogos_bHLH_by_lineage/bHLH_spiders.logo.svg
Saved: results/weblogos_bHLH_by_lineage/bHLH_insects.logo.png | results/weblogos_bHLH_by_lineage/bHLH_insects.logo.svg


In [None]:
#  COMBINED LOGO: spiders + insects together 
from pathlib import Path
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

assert 'spider_recs' in globals() and 'insect_recs' in globals(), "Run the clade-split cell first."

OUTDIR = Path("results/weblogos_bHLH_by_lineage")
OUTDIR.mkdir(parents=True, exist_ok=True)

# 1) Combine and write raw FASTA
all_recs = spider_recs + insect_recs
print(f"Total (spiders+insects): {len(all_recs)}")
fa_all = OUTDIR / "bHLH_all.raw.faa"
SeqIO.write(all_recs, str(fa_all), "fasta")

# 2) Align (needs >=2 sequences)
aln_all = OUTDIR / "bHLH_all.aln.faa"
ok_all = False
if len(all_recs) >= 2:
    ok_all = run_mafft(fa_all, aln_all)
else:
    print("Skip MAFFT: need at least 2 sequences for a combined logo.")

# 3) Build logo
if ok_all:
    alignment_to_logo(aln_all, OUTDIR/"bHLH_all.logo.png", OUTDIR/"bHLH_all.logo.svg")


Total (spiders+insects): 89
Running: mafft --auto results/weblogos_bHLH_by_lineage/bHLH_all.raw.faa
Saved: results/weblogos_bHLH_by_lineage/bHLH_all.logo.png | results/weblogos_bHLH_by_lineage/bHLH_all.logo.svg


In [None]:

# 7) Provenance snapshot

with open(OUTDIR/"README.txt","w") as fh:
    fh.write(textwrap.dedent(f"""
    WebLogo of bHLH regions by lineage
    ----------------------------------
    Inputs:
      FIMO per-seq: {FIMO_PERSEQ}
      Targets FASTA: {TARGET_FASTA}
      bHLH coords:  {BHLH_COORDS}
      Taxon map:    {TAXON_MAP if TAXON_MAP else "regex-based fallback"}

    Outputs:
      Raw FASTA:    {fa_spiders.name}, {fa_insects.name}
      Alignments:   {aln_spiders.name if ok_sp else "NA"}, {aln_insects.name if ok_in else "NA"}
      Logos:        bHLH_spiders.logo.(png|svg), bHLH_insects.logo.(png|svg)

    Notes:
      - Proteins considered were those present in ALL.fimo_hits.filtered_perseq.tsv.
      - Only the bHLH domain (start..end from coords table) was extracted.
      - Logos are built from MAFFT alignments.
    """).strip()+"\n")
print("Done.")


In [None]:
# 
# Generate WebLogos for bHLH domains by Gene Family (ASCa, ASCb, ASCc) and Species

# This code creates 9 logos:
# - ASCa: combined, insects-only, spiders-only
# - ASCb: combined, insects-only, spiders-only  
# - ASCc: combined, insects-only, spiders-only

from pathlib import Path
import subprocess
import sys
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq



# Output directory
OUTDIR = Path("weblogos_by_family_and_species")
OUTDIR.mkdir(parents=True, exist_ok=True)

# MAFFT for alignment
MAFFT = "mafft"

# Species classification based on prefix

PREFIX_TO_SPECIES = {
    # Spiders/Chelicerates
    "Ptep": "spider", "Isca": "spider", "Abru": "spider", "Afer": "spider",
    "Lpol": "spider", "Ppha": "spider", "Dsil": "spider", "Hgra": "spider", "Ssce": "spider", "Gmar": "spider", "Ppse": "spider", "Cdip": "spider",
    # Insects
    "Dmel": "insect", "Amel": "insect", "Tcas": "insect", "Cdip": "insect", "Dsuz": "insect", "Bmor": "insect", "Agla": "insect", "Afun": "insect", "Mpha": "insect", "Fcan": "insect"
}


def get_prefix(seq_id: str) -> str:
    """Extract prefix from sequence ID (e.g., 'Abru_g13702.t1' -> 'Abru')"""
    return seq_id.split("_", 1)[0]

def get_species_group(seq_id: str) -> str:
    """Classify sequence as 'insect', 'spider', or 'unknown' based on prefix"""
    prefix = get_prefix(seq_id)
    return PREFIX_TO_SPECIES.get(prefix, "unknown")

def get_family(target_family: str) -> str:
    """
    Classify into ASCa, ASCb, or ASCc based on target_family column.
    - Returns the target_family value directly if it's ASCa, ASCb, or ASCc
    - Otherwise returns 'other'
    """
    if pd.isna(target_family):
        return "other"
    target_family = str(target_family).strip()
    if target_family in ['ASCa', 'ASCb', 'ASCc']:
        return target_family
    else:
        return "other"

# ===========================
# 1) LOAD DATA
# ===========================
print("Loading data...")

# Load FIMO per-sequence results
fimo = pd.read_csv(FIMO_PERSEQ, sep="\t")
print(f"Loaded {len(fimo)} FIMO hits")

# Find the sequence ID column
cand_cols = [c for c in fimo.columns if c.lower() in {"sequence_name", "seq_name", "target", "seq_id", "sequence"}]
if not cand_cols:
    raise ValueError(f"Couldn't find an ID column in {FIMO_PERSEQ}. Columns: {list(fimo.columns)}")
seq_id_col = cand_cols[0]

# Add family and species classifications
fimo['family'] = fimo['target_family'].apply(get_family)
fimo['species_group'] = fimo[seq_id_col].apply(get_species_group)

# Get unique sequences per family
print("\nSequence counts by family:")
for family in ['ASCa', 'ASCb', 'ASCc']:
    count = fimo[fimo['family'] == family][seq_id_col].nunique()
    print(f"  {family}: {count} sequences")

# Load bHLH coordinates
coords = pd.read_csv(BHLH_COORDS, sep="\t")

# Standardize column names
lower2orig = {c.lower(): c for c in coords.columns}
def pick_col(df_cols_lower_to_orig, *candidates):
    for c in candidates:
        if c in df_cols_lower_to_orig:
            return df_cols_lower_to_orig[c]
    return None

col_id = pick_col(lower2orig, "id", "seq_id", "protein", "name")
col_start = pick_col(lower2orig, "env_from", "ali_from", "start", "query_start")
col_end = pick_col(lower2orig, "env_to", "ali_to", "end", "query_end")

if not all([col_id, col_start, col_end]):
    raise ValueError(f"Couldn't find required columns in {BHLH_COORDS}")

coords = coords[[col_id, col_start, col_end]].rename(
    columns={col_id: "seq_id", col_start: "start", col_end: "end"}
)
coords["seq_id"] = coords["seq_id"].astype(str)
coords["start"] = coords["start"].astype(int)
coords["end"] = coords["end"].astype(int)

print(f"\nLoaded {len(coords)} bHLH domain coordinates")

# Merge FIMO data with coordinates to get family info for each sequence
# Keep only one row per sequence (in case there are duplicates)
fimo_unique = fimo.groupby(seq_id_col).first().reset_index()
coords_with_family = coords.merge(
    fimo_unique[[seq_id_col, 'family', 'species_group']], 
    left_on='seq_id', 
    right_on=seq_id_col,
    how='left'
)
coords_with_family = coords_with_family.dropna(subset=['family', 'species_group'])

print(f"\nAfter merging: {len(coords_with_family)} sequences with family and species info")

# Load target sequences
print("\nLoading target FASTA...")
seq_dict = {rec.id: rec for rec in SeqIO.parse(str(TARGET_FASTA), "fasta")}
print(f"Loaded {len(seq_dict)} target sequences")

# ===========================
# 2) EXTRACT bHLH SUBSEQUENCES
# ===========================
print("\nExtracting bHLH subsequences...")

all_bhlh_records = []
for _, row in coords_with_family.iterrows():
    seq_id = row['seq_id']
    if seq_id not in seq_dict:
        continue
    
    full_seq = seq_dict[seq_id]
    start = row['start'] - 1  # Convert to 0-based
    end = row['end']  # End is inclusive in 1-based, exclusive in Python slicing
    
    bhlh_seq = str(full_seq.seq[start:end])
    
    # Create new record with family and species in description
    new_rec = SeqRecord(
        Seq(bhlh_seq),
        id=seq_id,
        description=f"family={row['family']} species={row['species_group']}"
    )
    all_bhlh_records.append((new_rec, row['family'], row['species_group']))

print(f"Extracted {len(all_bhlh_records)} bHLH subsequences")

# ===========================
# 3) ORGANIZE BY FAMILY AND SPECIES
# ===========================
print("\nOrganizing sequences by family and species...")

# Dictionary to hold sequences for each combination
sequence_groups = {
    'ASCa_combined': [],
    'ASCa_insects': [],
    'ASCa_spiders': [],
    'ASCb_combined': [],
    'ASCb_insects': [],
    'ASCb_spiders': [],
    'ASCc_combined': [],
    'ASCc_insects': [],
    'ASCc_spiders': [],
}

for rec, family, species_group in all_bhlh_records:
    if family == 'ASCa':
        sequence_groups['ASCa_combined'].append(rec)
        if species_group == 'insect':
            sequence_groups['ASCa_insects'].append(rec)
        elif species_group == 'spider':
            sequence_groups['ASCa_spiders'].append(rec)
    elif family == 'ASCb':
        sequence_groups['ASCb_combined'].append(rec)
        if species_group == 'insect':
            sequence_groups['ASCb_insects'].append(rec)
        elif species_group == 'spider':
            sequence_groups['ASCb_spiders'].append(rec)
    elif family == 'ASCc':
        sequence_groups['ASCc_combined'].append(rec)
        if species_group == 'insect':
            sequence_groups['ASCc_insects'].append(rec)
        elif species_group == 'spider':
            sequence_groups['ASCc_spiders'].append(rec)

# Print summary
print("\nSequence counts by group:")
for group_name, seqs in sequence_groups.items():
    print(f"  {group_name}: {len(seqs)} sequences")

# ===========================
# 4) ALIGNMENT FUNCTION
# ===========================
def run_mafft(input_fasta: Path, output_fasta: Path) -> bool:
    """Run MAFFT alignment on input FASTA, save to output FASTA"""
    if not input_fasta.exists():
        print(f"  Input file not found: {input_fasta}")
        return False
    
    n_seqs = sum(1 for _ in SeqIO.parse(str(input_fasta), "fasta"))
    if n_seqs < 2:
        print(f"  Skip MAFFT: only {n_seqs} sequence(s) in {input_fasta.name}")
        return False
    
    cmd = [MAFFT, "--auto", str(input_fasta)]
    print(f"  Running MAFFT on {input_fasta.name} ({n_seqs} sequences)...")
    
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"  MAFFT error: {result.stderr[:500]}")
        return False
    
    output_fasta.write_text(result.stdout)
    return True

# ===========================
# 5) LOGO GENERATION FUNCTION
# ===========================
def alignment_to_logo(aln_fasta: Path, out_png: Path, out_svg: Path, title: str):
    """Generate sequence logo from aligned FASTA"""
    try:
        import matplotlib.pyplot as plt
        import logomaker as lm
    except ImportError:
        print("Installing logomaker and matplotlib...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "logomaker", "matplotlib"])
        import matplotlib.pyplot as plt
        import logomaker as lm
    
    # Read sequences
    seqs = [str(rec.seq) for rec in SeqIO.parse(str(aln_fasta), "fasta")]
    if len(seqs) < 2:
        print(f"  Not enough sequences for logo: {aln_fasta.name}")
        return
    
    # Check alignment length consistency
    L = len(seqs[0])
    if not all(len(s) == L for s in seqs):
        print(f"  Warning: Variable length sequences in {aln_fasta.name}")
        return
    
    # Count amino acids at each position
    alphabet = list("ACDEFGHIKLMNPQRSTVWY")
    position_counts = []
    for i in range(L):
        counts = {aa: 0 for aa in alphabet}
        for seq in seqs:
            if seq[i] in counts:
                counts[seq[i]] += 1
        position_counts.append(counts)
    
    # Create DataFrame for logomaker
    df = pd.DataFrame(position_counts)
    
    # Generate logo
    fig = plt.figure(figsize=(16, 3.5))
    ax = fig.add_subplot(111)
    lm.Logo(df, ax=ax, shade_below=0.5, fade_below=0.5, vpad=0.05, width=0.98)
    ax.set_xlabel("Alignment Position", fontsize=12)
    ax.set_ylabel("Information (bits)", fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    plt.tight_layout()
    
    # Save
    fig.savefig(out_png, dpi=300, bbox_inches='tight')
    fig.savefig(out_svg, bbox_inches='tight')
    plt.close(fig)
    
    print(f"  ✓ Saved: {out_png.name} and {out_svg.name}")

# ===========================
# 6) GENERATE ALL LOGOS
# ===========================
print("\n" + "="*70)
print("GENERATING WEBLOGOS")
print("="*70)

for group_name, records in sequence_groups.items():
    if len(records) < 2:
        print(f"\n{group_name}: Skipping (only {len(records)} sequence(s))")
        continue
    
    print(f"\n{group_name}: {len(records)} sequences")
    
    # Write raw FASTA
    raw_fasta = OUTDIR / f"{group_name}_raw.faa"
    SeqIO.write(records, str(raw_fasta), "fasta")
    print(f"  Wrote raw FASTA: {raw_fasta.name}")
    
    # Align with MAFFT
    aln_fasta = OUTDIR / f"{group_name}_aligned.faa"
    success = run_mafft(raw_fasta, aln_fasta)
    
    if not success:
        print(f"  Skipping logo generation for {group_name}")
        continue
    
    # Generate logo
    out_png = OUTDIR / f"{group_name}_logo.png"
    out_svg = OUTDIR / f"{group_name}_logo.svg"
    
    # Create nice title
    family = group_name.split('_')[0]
    species = group_name.split('_')[1]
    if species == 'combined':
        title = f"{family} bHLH Domain - All Species (n={len(records)})"
    elif species == 'insects':
        title = f"{family} bHLH Domain - Insects Only (n={len(records)})"
    elif species == 'spiders':
        title = f"{family} bHLH Domain - Spiders Only (n={len(records)})"
    else:
        title = f"{family} bHLH Domain - {species} (n={len(records)})"
    
    alignment_to_logo(aln_fasta, out_png, out_svg, title)

# ===========================
# 7) SUMMARY
# ===========================
print("\n" + "="*70)
print("COMPLETED!")
print("="*70)
print(f"\nAll outputs saved to: {OUTDIR.absolute()}")
print("\nGenerated logos:")
for group_name, records in sequence_groups.items():
    if len(records) >= 2:
        print(f"  ✓ {group_name}: {len(records)} sequences")
    else:
        print(f"  ✗ {group_name}: Only {len(records)} sequence(s) - skipped")

print("\nFiles generated:")
print("  - *_raw.faa: Raw extracted bHLH sequences")
print("  - *_aligned.faa: MAFFT-aligned sequences")
print("  - *_logo.png: WebLogo in PNG format (300 dpi)")
print("  - *_logo.svg: WebLogo in SVG format (vector)")

Loading data...
Loaded 2123 FIMO hits

Sequence counts by family:
  ASCa: 37 sequences
  ASCb: 14 sequences
  ASCc: 15 sequences

Loaded 89 bHLH domain coordinates

After merging: 89 sequences with family and species info

Loading target FASTA...
Loaded 91 target sequences

Extracting bHLH subsequences...
Extracted 89 bHLH subsequences

Organizing sequences by family and species...

Sequence counts by group:
  ASCa_combined: 36 sequences
  ASCa_insects: 8 sequences
  ASCa_spiders: 28 sequences
  ASCb_combined: 14 sequences
  ASCb_insects: 1 sequences
  ASCb_spiders: 13 sequences
  ASCc_combined: 15 sequences
  ASCc_insects: 2 sequences
  ASCc_spiders: 13 sequences

GENERATING WEBLOGOS

ASCa_combined: 36 sequences
  Wrote raw FASTA: ASCa_combined_raw.faa
  Running MAFFT on ASCa_combined_raw.faa (36 sequences)...
  ✓ Saved: ASCa_combined_logo.png and ASCa_combined_logo.svg

ASCa_insects: 8 sequences
  Wrote raw FASTA: ASCa_insects_raw.faa
  Running MAFFT on ASCa_insects_raw.faa (8 sequen

In [12]:
from pathlib import Path
from Bio import SeqIO
import numpy as np
from collections import Counter

WD = Path("weblogos_by_family_and_species")

print("="*70)
print("bHLH DIAGNOSTIC ANALYSIS")
print("="*70)

for fam in ["ASCa", "ASCb", "ASCc"]:
    for sp in ["combined", "insects", "spiders"]:
        g = f"{fam}_{sp}"
        raw = WD / f"{g}_raw.faa"
        aln = WD / f"{g}_aligned.faa"
        
        if not raw.exists():
            continue
            
        seqs = list(SeqIO.parse(str(raw), "fasta"))
        lens = [len(s.seq) for s in seqs]
        
        print(f"\n{g}:")
        print(f"  Sequences: {len(seqs)}")
        print(f"  Length range: {min(lens)}-{max(lens)} aa")
        print(f"  Mean ± SD: {np.mean(lens):.1f} ± {np.std(lens):.1f} aa")
        
        if aln.exists():
            aseqs = [str(r.seq) for r in SeqIO.parse(str(aln), "fasta")]
            L = len(aseqs[0])
            gaps = sum(s.count('-') for s in aseqs)
            gap_pct = 100 * gaps / (len(aseqs) * L)
            print(f"  Alignment: {L} positions, {gap_pct:.1f}% gaps")
            
            # Calculate entropy
            entropies = []
            for i in range(L):
                col = [s[i] for s in aseqs if s[i] != '-']
                if col:
                    counts = Counter(col)
                    total = sum(counts.values())
                    ent = -sum((c/total) * np.log2(c/total) for c in counts.values())
                    entropies.append(ent)
            
            print(f"  Mean entropy: {np.mean(entropies):.2f} bits")
            conserved = sum(1 for e in entropies if e < 0.5)
            print(f"  Conserved positions (<0.5 bits): {conserved}/{L} ({100*conserved/L:.1f}%)")
            
            print(f"  Sample alignment (first 3 seqs, 60 positions):")
            for r in list(SeqIO.parse(str(aln), "fasta"))[:3]:
                print(f"    {r.id[:16]:16s}: {str(r.seq)[:60]}")
        
        # Flag problems
        if max(lens) - min(lens) > 30:
            print(f"  ⚠️  High length variation ({max(lens)-min(lens)} aa)")

bHLH DIAGNOSTIC ANALYSIS

ASCa_combined:
  Sequences: 36
  Length range: 50-75 aa
  Mean ± SD: 58.1 ± 7.6 aa
  Alignment: 75 positions, 22.5% gaps
  Mean entropy: 1.17 bits
  Conserved positions (<0.5 bits): 25/75 (33.3%)
  Sample alignment (first 3 seqs, 60 positions):
    Abru_g14616.t1  : -VARRNERERKRVRLVNMGFAKLRQYIP---------------------TTGRPGKRLSK
    Abru_g14798.t1  : SVARRNARERKRVCLVNMGFANLRDHIPPHLTVQAGPPSKSRSG----NNSAASNKKLSK
    Abru_g14799.t1  : AVSRRNARERKRVRLVNLGFSTLRERVP----------------------PGAKNKKLSK

ASCa_insects:
  Sequences: 8
  Length range: 59-69 aa
  Mean ± SD: 64.1 ± 2.8 aa
  Alignment: 70 positions, 8.4% gaps
  Mean entropy: 0.62 bits
  Conserved positions (<0.5 bits): 38/70 (54.3%)
  Sample alignment (first 3 seqs, 60 positions):
    Amel_g4745.t2   : -VARRNARERNRVKQVNNGFATLRQHIPQSVAQALGGSTAGTHGGSRAGSKKLSKVETLR
    Amel_g4746.t1   : AVARRNARERNRVKQVNNGFATLRQHIPSHIAAGYGD---------RG--KKLSKVETLR
    Dmel_achaete_NP_: -VIRRNARERNRVKQVNNGFSQLRQHIPAAVIADLSNGRR---GIGPGA

In [6]:

# ================================================================================
# Generate publication-ready multi-panel figures for ASC family bHLH domains
# ================================================================================
# Creates 3 figures:
#   Figure 1: ASCa (spiders top, insects bottom)
#   Figure 2: ASCb (spiders only)
#   Figure 3: ASCc (spiders top, insects bottom)

from pathlib import Path
import pandas as pd
from Bio import SeqIO
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import logomaker as lm

# Configuration
WEBLOGOS_DIR = Path("weblogos_by_family_and_species")
OUTPUT_DIR = Path("publication_figures")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def alignment_to_counts_matrix(aln_fasta):
    """Convert aligned FASTA to counts matrix for logomaker"""
    seqs = [str(rec.seq) for rec in SeqIO.parse(str(aln_fasta), "fasta")]
    if len(seqs) < 1:
        return None
    
    L = len(seqs[0])
    alphabet = list("ACDEFGHIKLMNPQRSTVWY")
    
    position_counts = []
    for i in range(L):
        counts = {aa: 0 for aa in alphabet}
        for seq in seqs:
            if seq[i] in counts:
                counts[seq[i]] += 1
        position_counts.append(counts)
    
    return pd.DataFrame(position_counts)

# ================================================================================
# FIGURE 1: ASCa (Spiders + Insects)
# ================================================================================
print("Generating Figure 1: ASCa...")

fig1 = plt.figure(figsize=(16, 6))
gs1 = gridspec.GridSpec(2, 1, figure=fig1, height_ratios=[1, 1], hspace=0.7)

# ASCa Spiders (top)
ax1_top = fig1.add_subplot(gs1[0])
aln_asca_spiders = WEBLOGOS_DIR / "ASCa_spiders_aligned.faa"
df_asca_spiders = alignment_to_counts_matrix(aln_asca_spiders)
n_asca_spiders = sum(1 for _ in SeqIO.parse(str(WEBLOGOS_DIR / "ASCa_spiders_raw.faa"), "fasta"))

lm.Logo(df_asca_spiders, ax=ax1_top, shade_below=0.5, fade_below=0.5, 
        vpad=0.05, width=0.98, font_name='Arial')
ax1_top.set_xlabel("Alignment Position", fontsize=13, fontweight='bold')
ax1_top.set_ylabel("Information (bits)", fontsize=13, fontweight='bold')
ax1_top.set_title(f"ASCa - Spiders (n={n_asca_spiders})", 
                  fontsize=15, fontweight='bold', pad=10)
ax1_top.tick_params(labelsize=11)

# ASCa Insects (bottom)
ax1_bottom = fig1.add_subplot(gs1[1])
aln_asca_insects = WEBLOGOS_DIR / "ASCa_insects_aligned.faa"
df_asca_insects = alignment_to_counts_matrix(aln_asca_insects)
n_asca_insects = sum(1 for _ in SeqIO.parse(str(WEBLOGOS_DIR / "ASCa_insects_raw.faa"), "fasta"))

lm.Logo(df_asca_insects, ax=ax1_bottom, shade_below=0.5, fade_below=0.5, 
        vpad=0.05, width=0.98, font_name='Arial')
ax1_bottom.set_xlabel("Alignment Position", fontsize=13, fontweight='bold')
ax1_bottom.set_ylabel("Information (bits)", fontsize=13, fontweight='bold')
ax1_bottom.set_title(f"ASCa - Insects (n={n_asca_insects})", 
                     fontsize=15, fontweight='bold', pad=10)
ax1_bottom.tick_params(labelsize=11)

plt.tight_layout()
fig1.savefig(OUTPUT_DIR / "Figure1_ASCa_bHLH.png", dpi=300, bbox_inches='tight')
fig1.savefig(OUTPUT_DIR / "Figure1_ASCa_bHLH.svg", bbox_inches='tight')
plt.close(fig1)
print(f"  ✓ Saved: Figure1_ASCa_bHLH.png/.svg")

# ================================================================================
# FIGURE 2: ASCb (Spiders only)
# ================================================================================
print("Generating Figure 2: ASCb...")

fig2 = plt.figure(figsize=(16, 3.5))
ax2 = fig2.add_subplot(111)

aln_ascb_spiders = WEBLOGOS_DIR / "ASCb_spiders_aligned.faa"
df_ascb_spiders = alignment_to_counts_matrix(aln_ascb_spiders)
n_ascb_spiders = sum(1 for _ in SeqIO.parse(str(WEBLOGOS_DIR / "ASCb_spiders_raw.faa"), "fasta"))

lm.Logo(df_ascb_spiders, ax=ax2, shade_below=0.5, fade_below=0.5, 
        vpad=0.05, width=0.98, font_name='Arial')
ax2.set_xlabel("Alignment Position", fontsize=13, fontweight='bold')
ax2.set_ylabel("Information (bits)", fontsize=13, fontweight='bold')
ax2.set_title(f"ASCb - Spiders (n={n_ascb_spiders})", 
              fontsize=15, fontweight='bold', pad=10)
ax2.tick_params(labelsize=11)

plt.tight_layout()
fig2.savefig(OUTPUT_DIR / "Figure2_ASCb_bHLH.png", dpi=300, bbox_inches='tight')
fig2.savefig(OUTPUT_DIR / "Figure2_ASCb_bHLH.svg", bbox_inches='tight')
plt.close(fig2)
print(f"  ✓ Saved: Figure2_ASCb_bHLH.png/.svg")

# ================================================================================
# FIGURE 3: ASCc (Spiders + Insects)
# ================================================================================
print("Generating Figure 3: ASCc...")

fig3 = plt.figure(figsize=(16, 6))
gs3 = gridspec.GridSpec(2, 1, figure=fig3, height_ratios=[1, 1], hspace=0.7)

# ASCc Spiders (top)
ax3_top = fig3.add_subplot(gs3[0])
aln_ascc_spiders = WEBLOGOS_DIR / "ASCc_spiders_aligned.faa"
df_ascc_spiders = alignment_to_counts_matrix(aln_ascc_spiders)
n_ascc_spiders = sum(1 for _ in SeqIO.parse(str(WEBLOGOS_DIR / "ASCc_spiders_raw.faa"), "fasta"))

lm.Logo(df_ascc_spiders, ax=ax3_top, shade_below=0.5, fade_below=0.5, 
        vpad=0.05, width=0.98, font_name='Arial')
ax3_top.set_xlabel("Alignment Position", fontsize=13, fontweight='bold')
ax3_top.set_ylabel("Information (bits)", fontsize=13, fontweight='bold')
ax3_top.set_title(f"ASCc - Spiders (n={n_ascc_spiders})", 
                  fontsize=15, fontweight='bold', pad=10)
ax3_top.tick_params(labelsize=11)

# ASCc Insects (bottom)
ax3_bottom = fig3.add_subplot(gs3[1])
aln_ascc_insects = WEBLOGOS_DIR / "ASCc_insects_aligned.faa"
df_ascc_insects = alignment_to_counts_matrix(aln_ascc_insects)
n_ascc_insects = sum(1 for _ in SeqIO.parse(str(WEBLOGOS_DIR / "ASCc_insects_raw.faa"), "fasta"))

lm.Logo(df_ascc_insects, ax=ax3_bottom, shade_below=0.5, fade_below=0.5, 
        vpad=0.05, width=0.98, font_name='Arial')
ax3_bottom.set_xlabel("Alignment Position", fontsize=13, fontweight='bold')
ax3_bottom.set_ylabel("Information (bits)", fontsize=13, fontweight='bold')
ax3_bottom.set_title(f"ASCc - Insects (n={n_ascc_insects})", 
                     fontsize=15, fontweight='bold', pad=10)
ax3_bottom.tick_params(labelsize=11)

plt.tight_layout()
fig3.savefig(OUTPUT_DIR / "Figure3_ASCc_bHLH.png", dpi=300, bbox_inches='tight')
fig3.savefig(OUTPUT_DIR / "Figure3_ASCc_bHLH.svg", bbox_inches='tight')
plt.close(fig3)
print(f"  ✓ Saved: Figure3_ASCc_bHLH.png/.svg")

print("\n" + "="*70)
print("PUBLICATION FIGURES COMPLETE")
print("="*70)
print(f"\nAll figures saved to: {OUTPUT_DIR.absolute()}")
print("\nGenerated files:")
print("  - Figure1_ASCa_bHLH.png/.svg (Spiders + Insects)")
print("  - Figure2_ASCb_bHLH.png/.svg (Spiders only)")
print("  - Figure3_ASCc_bHLH.png/.svg (Spiders + Insects)")

Generating Figure 1: ASCa...


  plt.tight_layout()


  ✓ Saved: Figure1_ASCa_bHLH.png/.svg
Generating Figure 2: ASCb...
  ✓ Saved: Figure2_ASCb_bHLH.png/.svg
Generating Figure 3: ASCc...


  plt.tight_layout()


  ✓ Saved: Figure3_ASCc_bHLH.png/.svg

PUBLICATION FIGURES COMPLETE

All figures saved to: /Users/gorkemdurmaz/Desktop/asc_project_10/publication_figures

Generated files:
  - Figure1_ASCa_bHLH.png/.svg (Spiders + Insects)
  - Figure2_ASCb_bHLH.png/.svg (Spiders only)
  - Figure3_ASCc_bHLH.png/.svg (Spiders + Insects)
