# PAML site-model tests: run codeml per cluster (submitted as an sbatch job)

In [None]:
# Import all necessary packages
import os
import pandas as pd
import subprocess
from pathlib import Path
import random, shutil, subprocess, re
from io import StringIO
from typing import List, Tuple
from Bio import AlignIO, SeqIO, Phylo
from Bio.Align import MultipleSeqAlignment
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import shutil

## Workflow 

In [None]:
# Output layout (one iter_NN directory per iteration, n_iters in total):
# {PAML_ROOT}/{family}/{cluster}/
#   ├─ {cluster}_nogaps.fa         (once per cluster)
#   ├─ iter_01/
#   │    ├─ subset_species.fasta
#   │    ├─ subset.phy
#   │    ├─ subset_fullID_to_species.tsv
#   │    ├─ tree_pruned.nwk
#   │    ├─ codeml.ctl
#   │    └─ mlc.txt                (codeml output)
#   ├─ iter_02/  (same files)
#   └─ iter_03/  (same files)

In [None]:
# Gene families whose clusters are analysed.
# Exactly one `families` assignment should be active at a time.

# X-chromosome family names (uncomment to use):
# families = ['CSF2RA', 'SPANX', 'TBL1X' ,'VCX' ,'TMSB' ,'MAGEB',
#  'TCEAL8' ,'H2A','endogenous','SPACA5',
#  'SSX' ,'GAGE' ,'NUDT10' ,'CENPVL',
#  'FLJ39060' ,'XAGE1' ,'FAM156', 'SPIN',
#  'ZXD' ,'CXorf49' ,'DMRTC1' ,'FAM236', 'PABPC', 'RPL36A', 'ARMCX' ,'NXF',
#  'TCP11X2' ,'GPRASP', 'RAB40A' ,'H2BW', 'CT47' ,'RHOXF2' ,'SMIM10' ,'ETD',
#  'INTS6L', 'CT45A', 'CXorf51', 'EOLA' ,'HSFX' ,'TMEM185A', 'CSAG', 'PNMA',
#  'PWWP4', 'OPN1LW', 'TEX28', 'LAGE3', 'IKBKG' ,'F8A1',
#  'collagen' ,'LOC129475109','LOC115932372']

# Y-chromosome family names (active):
families = [
    "CDY1",
    "glutamate",
    "TSPY8",
    "DAZ1",
    "BPY2",
    "RBMY1B",
    "MTRNR2",
    "proline",
    "VCY1B",
    "HSFY1",
    "keratin",
    "FRG1",
    "centriole",
    "FAM47A",
    "zinc",
    "isoenzyme",
    "retrovirus",
    "TATA-box",
]


In [None]:

# Root directory holding the downloaded sequences and the PAML working tree.
data_dir = Path("/home/emma/Amplicons/Workspaces/emma/downloaded_data")

# species codes + fixed species tree (all taxa; will be pruned per iteration)
species_targets = ["HomSap","PanTro","PanPan","GorGor","PonAbe","PonPyg","SymSyn","MacFas"]
# Unrooted tree (three children at the root)
tree_newick = "(((((PanTro,PanPan), HomSap), GorGor), (PonPyg,PonAbe)), SymSyn, MacFas);"

# tools
perl_fasta2phy = data_dir / "PAML" / "FASTAtoPHYL.pl"   # your Perl converter
codeml_bin = "codeml"                                    # or Path("/full/path/to/codeml")

# iterations
n_iters = 6        # number of one-sequence-per-species subsets drawn per cluster
base_seed = None   # None -> unseeded RNG; set an int to make the draws repeatable

# base output dir
# Must match the chromosome of the active `families` list and of the
# `sequences_*` input directories. Those currently select the Y chromosome,
# so results are written under y_chromosome (writing them under x_chromosome
# would mix Y results into the X output tree).
#PAML_ROOT = data_dir / "PAML" / "x_chromosome" # for the X chromosome
PAML_ROOT = data_dir / "PAML" / "y_chromosome" # for the Y chromosome

PAML_ROOT.mkdir(parents=True, exist_ok=True)

In [None]:
# Collect, for every family, the names of the cluster FASTA files that contain
# more than one sequence. Results go into `cluster_list_per_family`.
# Change the directory sequences_y to sequences_x when working on the X chromosome.
# NOTE(review): this cell reads from "sequences_y"; confirm that this is the
# intended directory for the alignments used by the later workflow cell.

cluster_list_per_family = {}

for family in families:
    blastdb_dir = f"{data_dir}/sequences_y/{family}_selected_isoform/blastdb"

    # make sure the alignments directory exists
    cluster_alignments = f"{blastdb_dir}/cluster_alignments"
    os.makedirs(cluster_alignments, exist_ok=True)

    # every *.fa file in cluster_fastas is one candidate cluster (name = basename)
    cluster_dir = f"{blastdb_dir}/cluster_fastas"
    all_clusters = []
    for fn in os.listdir(cluster_dir):
        if fn.endswith(".fa"):
            all_clusters.append(os.path.splitext(fn)[0])

    # keep only clusters whose FASTA holds more than one record
    filtered = []
    for name in all_clusters:
        path = os.path.join(cluster_dir, f"{name}.fa")
        header_count = 0
        with open(path) as handle:
            for line in handle:
                if line.startswith(">"):
                    header_count += 1
        if header_count > 1:
            filtered.append(name)

    # report (but do not drop) clusters that contain repeated sequence IDs
    for name in filtered:
        path = os.path.join(cluster_dir, f"{name}.fa")
        seen, dups = set(), set()
        with open(path) as handle:
            for line in handle:
                if not line.startswith(">"):
                    continue
                # the ID is the first whitespace-delimited token after '>'
                seqid = line[1:].split()[0]
                (dups if seqid in seen else seen).add(seqid)
        if dups:
            print(f"[{family}] {name}.fa has duplicate IDs: {', '.join(dups)}")

    # remember the surviving clusters for the workflow
    cluster_list_per_family[family] = filtered

    print(f"{family}: keeping {len(filtered)} clusters")


In [None]:
# All functions
# Clean up the alignment -> codeml has a hard time with gaps. So drop entire codons if any sequence has a gap inside that codon
def make_codonwise_nogap_alignment(inp_path, out_path, gap_chars="-"):
    """Write a codon-wise gap-free copy of a FASTA alignment.

    The alignment is read in frame starting at column 0. A codon (three
    consecutive columns) is dropped when ANY sequence has a gap character in
    ANY of its three columns. Trailing columns that do not form a complete
    codon are discarded. Record IDs are kept; descriptions are blanked.

    Args:
        inp_path: Input FASTA alignment (str or Path).
        out_path: FASTA file to write (str or Path).
        gap_chars: Characters treated as gaps (each character individually).

    Raises:
        ValueError: If no codon is left after the filtering.
    """
    from pathlib import Path
    from Bio import AlignIO
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq

    aln = AlignIO.read(str(inp_path), "fasta")
    aln_len = aln.get_alignment_length()
    floor_len = (aln_len // 3) * 3  # ignore an incomplete trailing codon

    gapset = set(gap_chars)
    # Convert every sequence to a plain string exactly once; doing it per
    # kept column would make trimming quadratic in the alignment length.
    rows = [(rec.id, str(rec.seq)) for rec in aln]

    # Start columns of the codons that are gap-free in every sequence.
    kept_starts = [
        start
        for start in range(0, floor_len, 3)
        if not any(gapset.intersection(seq[start:start + 3]) for _, seq in rows)
    ]

    if not kept_starts:
        raise ValueError("No codon columns left after codon-wise gap removal.")

    trimmed = MultipleSeqAlignment([
        SeqRecord(
            Seq("".join(seq[start:start + 3] for start in kept_starts)),
            id=rec_id,
            description="",
        )
        for rec_id, seq in rows
    ])

    AlignIO.write(trimmed, str(out_path), "fasta")
    kept_codons = len(kept_starts)
    total_codons = floor_len // 3
    # Path(...) so that a plain string out_path works here as it does above.
    print(f"codonwise nogaps -> {Path(out_path).name} (kept {kept_codons}/{total_codons} codons)")

def pick_one_per_species(fasta_path: Path, targets: List[str], rng: random.Random) -> Tuple[List[SeqRecord], dict, int, int]:
    """Randomly choose one sequence per target species from a FASTA file.

    The species of a record is the part of its ID after the last underscore.
    Records whose species is not in *targets* are ignored; the sequences of
    the remaining records are upper-cased.

    Args:
        fasta_path: FASTA file to sample from.
        targets: Species codes to sample, in the order they should be returned.
        rng: Random generator used for the per-species choice.

    Returns:
        ``(picked, id_map, n_picked, length)`` where *picked* holds the chosen
        records in *targets* order, *id_map* maps each chosen record ID to its
        species, and *length* is the common sequence length.

    Raises:
        ValueError: If fewer than two species are represented, or if the
            chosen sequences differ in length.
    """
    by_species = {sp: [] for sp in targets}
    for record in SeqIO.parse(str(fasta_path), "fasta"):
        species = record.id.split("_")[-1]
        if species not in by_species:
            continue
        record.seq = record.seq.upper()
        by_species[species].append(record)

    picked = []
    id_map = {}
    for sp in targets:
        candidates = by_species[sp]
        if not candidates:
            continue  # species absent from this cluster
        chosen = rng.choice(candidates)
        picked.append(chosen)
        id_map[chosen.id] = sp

    if len(picked) < 2:
        raise ValueError("Need at least 2 target species in this cluster.")

    lengths = {len(rec.seq) for rec in picked}
    if len(lengths) != 1:
        raise ValueError(f"Unequal lengths after nogaps: {sorted(lengths)}")

    (common_length,) = lengths
    return picked, id_map, len(picked), common_length

def write_species_fasta(picked: List[SeqRecord], out_fa: Path):
    """Write *picked* to *out_fa* as FASTA, using the species code as header.

    The species code is the part of each record ID after the last underscore;
    every sequence is written on a single line.
    """
    with open(out_fa, "w") as handle:
        handle.writelines(
            f">{rec.id.split('_')[-1]}\n{str(rec.seq)}\n" for rec in picked
        )

def prune_tree_to_species(tree_newick: str, present: List[str], out_tree: Path):
    """Prune a Newick tree down to the taxa in *present* and write it out.

    Args:
        tree_newick: Newick string of the full species tree.
        present: Taxon names that must remain in the tree.
        out_tree: Destination file for the pruned tree (Newick format).

    Raises:
        ValueError: If *present* names a taxon that is not a tip of the tree.
    """
    tree = Phylo.read(StringIO(tree_newick), "newick")

    tip_names = {tip.name for tip in tree.get_terminals()}
    not_in_tree = sorted(set(present) - tip_names)
    if not_in_tree:
        raise ValueError(f"Tree missing taxa present in alignment: {not_in_tree}")

    # Collect the tips to remove first so the tree is not modified while
    # its terminals are being enumerated.
    unwanted = [tip for tip in tree.get_terminals() if tip.name not in present]
    for tip in unwanted:
        tree.prune(tip)

    Phylo.write(tree, str(out_tree), "newick")


def fasta_to_phylip_with_perl(perl_script: Path, species_fa: Path, nseq: int, length: int, out_phy: Path):
    """Convert *species_fa* to PHYLIP with the external Perl script.

    The script is run inside the directory of *species_fa* with the FASTA
    file name, the number of sequences and the alignment length as arguments.
    This function expects the script to leave ``<fasta stem>.phy`` in that
    directory and then moves that file to *out_phy*.

    Raises:
        subprocess.CalledProcessError: If the Perl script exits non-zero.
        FileNotFoundError: If the expected ``.phy`` file was not produced.
    """
    workdir = species_fa.parent  # run here so the .phy is created next to the FASTA
    subprocess.run(
        ["perl", str(perl_script), species_fa.name, str(nseq), str(length)],
        check=True,
        cwd=str(workdir),
    )

    produced = workdir / (species_fa.stem + ".phy")
    if produced.exists():
        shutil.move(str(produced), str(out_phy))
        return
    raise FileNotFoundError(f"Expected {produced} not found after Perl conversion.")

def write_ctl(ctl_path: Path, seqfile: str, treefile: str, outfile: str):
    """Write a codeml control file for the site-model run.

    Only the three file names vary; every other setting is fixed in the
    template below (codon data, NSsites = 0 1 2 7 8, CodonFreq = 7, ...).

    Args:
        ctl_path: Control file to create (overwritten if it exists).
        seqfile: Sequence file name as codeml should see it.
        treefile: Tree file name as codeml should see it.
        outfile: Name of the codeml output file.
    """
    ctl = f"""seqfile = {seqfile}
treefile = {treefile}
outfile = {outfile}

noisy = 3                              * Display moderate amount of information on the screen
verbose = 1                            * Detailed output file
seqtype = 1                            * Codon data
ndata = 1                              * One gene alignment
icode = 0                              * Universal genetic code
cleandata = 0                          * Do not remove sites with ambiguity data (because gaps already removed before)
model = 0                              * One ω for all branches (M0 and site models)
NSsites = 0 1 2 7 8                    * Models M0 (0), M1a (1), M2a (2), M7 (7), and M8 (8)
CodonFreq = 7                          * Use mutation-selection model
estFreq = 0                            * Use observed frequencies to calculate fitness/freq pars
clock = 0                              * Assume no clock
fix_omega = 0                          * Enables option to estimate omega
omega = 0.5                            * Initial omega value

"""
    # The template contains a non-ASCII character (ω), so the encoding is
    # pinned instead of relying on the locale default, which may be unable to
    # encode it or may produce different bytes on another machine.
    ctl_path.write_text(ctl, encoding="utf-8")

def run_codeml(codeml_bin, ctl_path: Path, workdir: Path):
    """Run codeml on a control file from inside *workdir*.

    Only the file name of *ctl_path* is passed to the program, so the control
    file must be located in *workdir*.

    Raises:
        subprocess.CalledProcessError: If the program exits with a non-zero status.
    """
    command = [str(codeml_bin), ctl_path.name]
    subprocess.run(command, check=True, cwd=str(workdir))

In [None]:
# run the Workflow
# For every cluster: build the gap-free alignment once, then draw n_iters
# one-sequence-per-species subsets and run codeml on each of them.

# Fail fast when an external tool is missing.
if shutil.which("perl") is None:
    raise SystemExit("Perl not found. Install perl or add it to PATH.")
if shutil.which(str(codeml_bin)) is None:
    raise SystemExit("codeml not found. Set 'codeml_bin' to its full path or add to PATH.")
if not perl_fasta2phy.exists():
    raise SystemExit(f"Perl converter not found: {perl_fasta2phy}")

for family in families:
    clusters = cluster_list_per_family.get(family, [])
    print(f"\n=== FAMILY: {family} ({len(clusters)} clusters) ===")
    for cluster in clusters:
        # Input alignment to clean (the trimmed alignments live here).
        # The sequences_y... directory has to be switched to the X-chromosome
        # equivalent when running the X chromosome.
        inp = data_dir / "sequences_y_longestisoform" / f"{family}_selected_isoform" / "blastdb" / "cluster_alignments" / f"{cluster}_NT.fa"
        if not inp.exists():
            print(f"  [skip] missing alignment: {inp}")
            continue

        cluster_dir = PAML_ROOT / family / cluster
        cluster_dir.mkdir(parents=True, exist_ok=True)

        # 1) no-gap alignment once per cluster (reused if it already exists)
        nogap = cluster_dir / f"{cluster}_nogaps.fa"
        if not nogap.exists():
            try:
                make_codonwise_nogap_alignment(inp, nogap)
            except (ValueError, OSError) as e:
                # One unusable alignment (e.g. no gap-free codon left) must not
                # abort the whole batch; skip the cluster like a failed iteration.
                print(f"  {family}/{cluster}: SKIP -> {e}")
                continue

        # RNG for this cluster; Random(None) is unseeded, an int seed is repeatable
        rng = random.Random(base_seed)

        # 2) iterations
        for it in range(1, n_iters+1):
            iter_dir = cluster_dir / f"iter_{it:02d}"
            iter_dir.mkdir(exist_ok=True)

            try:
                # pick one seq per species
                picked, id_map, nseq, length = pick_one_per_species(nogap, species_targets, rng)
                present = sorted({r.id.split("_")[-1] for r in picked})

                # write subset files
                subset_fa   = iter_dir / "subset_species.fasta"
                subset_phy  = iter_dir / "subset.phy"
                subset_map  = iter_dir / "subset_fullID_to_species.tsv"
                subset_tree = iter_dir / "tree_pruned.nwk"

                write_species_fasta(picked, subset_fa)
                with open(subset_map, "w") as f:
                    f.write("full_id\tspecies\n")
                    for full, sp in id_map.items():
                        f.write(f"{full}\t{sp}\n")

                prune_tree_to_species(tree_newick, present, subset_tree)

                # FASTA -> PHYLIP
                fasta_to_phylip_with_perl(perl_fasta2phy, subset_fa, nseq, length, subset_phy)

                # codeml control + run (file names are relative to iter_dir,
                # which is codeml's working directory)
                ctl = iter_dir / "codeml.ctl"
                write_ctl(ctl, "subset.phy", "tree_pruned.nwk", "mlc.txt")
                run_codeml(codeml_bin, ctl, iter_dir)

                print(f"  {family}/{cluster} iter {it:02d}: OK ({nseq} spp × {length} bp)")

            except Exception as e:
                # Deliberate best-effort: report the failed iteration and carry on.
                print(f"  {family}/{cluster} iter {it:02d}: SKIP -> {e}")
