# Y chromosome ampliconic clustering updated

In [1]:
# Import all necessary packages
import functools
import itertools
import os
import re
import subprocess
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Bio import Phylo
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# STEP 1. Define species information

# Define base directories.
# The default is the original workstation location; set the AMPLICON_DATA_DIR
# environment variable to run the notebook against a copy stored elsewhere.
data_dir = os.environ.get(
    "AMPLICON_DATA_DIR",
    "/home/emma/Amplicons/Workspaces/emma/downloaded_data",
)
work_dir = os.path.join(data_dir, "work_dir", "y_multicopy")


def _species_entry(species, accession, chr_y, chr_x, assembly,
                   cat_gff=None, chry_gff=True, rna=False):
    """Build one element of `data` for a species.

    All NCBI paths live under
    ``{data_dir}/references/{species}/ncbi_dataset/data/{accession}``.

    :param species: Species identifier (e.g. 'PanTro'); also used in file names.
    :param accession: NCBI assembly accession, i.e. the dataset directory name.
    :param chr_y: Sequence accession of the Y chromosome.
    :param chr_x: Sequence accession of the X chromosome.
    :param assembly: Middle part of the reference FASTA file name
                     (``{accession}_{assembly}_genomic.fna``).
    :param cat_gff: File name of the CAT annotation inside
                    ``CAT/consensus_gene_set``; None omits the key.
    :param chry_gff: Whether to include 'path_to_annotation_NCBI_chry'.
    :param rna: Whether to include the 'rna' key.
    :return: ``{'species': species, 'data': {...paths and accessions...}}``.
    """
    ncbi_dir = f"{data_dir}/references/{species}/ncbi_dataset/data/{accession}"
    info = {
        'chr_y': chr_y,
        'chr_x': chr_x,
        'path_to_annotation_NCBI': f"{ncbi_dir}/genomic.gff",  # annotation file
    }
    if chry_gff:
        info['path_to_annotation_NCBI_chry'] = f"{ncbi_dir}/genomic_chrY.gff"  # Y specific annotation
    if cat_gff is not None:
        # alternate annotation format (generated by the CAT pipeline)
        info['path_to_annotation_CAT'] = f"{data_dir}/CAT/consensus_gene_set/{cat_gff}"
    info['ref'] = f"{ncbi_dir}/{accession}_{assembly}_genomic.fna"  # reference genome file
    if rna:
        info['rna'] = f"{ncbi_dir}/rna.fna"  # RNA sequences
    info.update({
        'prot': f"{ncbi_dir}/protein.faa",                        # protein sequences (FASTA format)
        'cds': f"{ncbi_dir}/cds_from_genomic.fna",                # coding DNA seq derived from genome
        'gff_x': f"{ncbi_dir}/genomic_chrX.gff",                  # chrX annotation file
        'fasta_x': f"{ncbi_dir}/{species}_X.fasta",               # only X reference genome
        'fasta_y': f"{ncbi_dir}/{species}_Y.fasta",               # only Y reference genome
        'gff_x_cds': f"{ncbi_dir}/genomic_chrX_cds_isoform.gff",  # gff file with only CDS annotation filtered
        'gff_y_cds': f"{ncbi_dir}/genomic_chrY_cds_isoform.gff",  # gff file with only CDS annotation filtered
    })
    return {'species': species, 'data': info}


# Define the list of dictionaries (data) containing genome information for different species.
# Per-species exceptions: only PanTro lists an RNA file, HomSap uses the hg38 CAT
# annotation, and MacFas has neither a chrY-specific NCBI annotation nor a CAT annotation.
data = [
    _species_entry('PanTro', 'GCF_028858775.2', "NC_072422.2", "NC_072421.2",
                   'NHGRI_mPanTro3-v2.0_pri', cat_gff='GCF_028858775.2.gff3', rna=True),
    _species_entry('HomSap', 'GCF_009914755.1', "NC_060948.1", "NC_060947.1",
                   'T2T-CHM13v2.0', cat_gff='hg38.gff3'),
    _species_entry('PanPan', 'GCF_029289425.2', "NC_073273.2", "NC_073272.2",
                   'NHGRI_mPanPan1-v2.0_pri', cat_gff='GCF_029289425.2.gff3'),
    _species_entry('GorGor', 'GCF_029281585.2', "NC_073248.2", "NC_073247.2",
                   'NHGRI_mGorGor1-v2.0_pri', cat_gff='GCF_029281585.2.gff3'),
    _species_entry('PonPyg', 'GCF_028885625.2', "NC_072397.2", "NC_072396.2",
                   'NHGRI_mPonPyg2-v2.0_pri', cat_gff='GCF_028885625.2.gff3'),
    _species_entry('PonAbe', 'GCF_028885655.2', "NC_072009.2", "NC_072008.2",
                   'NHGRI_mPonAbe1-v2.0_pri', cat_gff='GCF_028885655.2.gff3'),
    _species_entry('SymSyn', 'GCF_028878055.3', "NC_072448.2", "NC_072447.2",
                   'NHGRI_mSymSyn1-v2.1_pri', cat_gff='GCF_028878055.3.gff3'),
    _species_entry('MacFas', 'GCF_037993035.2', "NC_132903.1", "NC_088395.1",
                   'T2T-MFA8v1.1', chry_gff=False),
]

# Common name used for each species identifier
species_to_sequence_spec = dict(
    PanTro='chimpanzee',
    HomSap='human',
    PanPan='bonobo',
    GorGor='gorilla',
    PonPyg='b-orang',
    PonAbe='s-orang',
    SymSyn='siamang',
    MacFas='macaque',
)

# Walk `data` once: index the per-species information by species identifier
# and collect the identifiers in the order in which they are listed
species_info = {}
species_list = []
for entry in data:
    species_info[entry['species']] = entry['data']
    species_list.append(entry['species'])
species_info
species_list

['PanTro',
 'HomSap',
 'PanPan',
 'GorGor',
 'PonPyg',
 'PonAbe',
 'SymSyn',
 'MacFas']

In [3]:
# Load the tab-separated gene table (coordinates, class, family symbol and
# palindrome membership of every gene) into `genes`
genes_table_path = f"{work_dir}/protein_extracted_longest/clusters_merged/gene_details_updated_with_palindromes_coordinates.tsv"
genes = pd.read_table(genes_table_path)

Unnamed: 0,Species,Gene,Gene_symbol,Start,End,Strand,Class,gene_family_symbol,in_palindrome,palindrome_name
0,PanTro,testis-specific chromodomain protein Y 1,LOC745547,5358723,5360941,+,AMPLICONIC,CDY1,yes,Q1B
1,PanTro,"glutamate dehydrogenase 1, mitochondrial-like",LOC750007,5539042,5540684,+,AMPLICONIC,"glutamate dehydrogenase 1, mitochondrial-like",no,
2,PanTro,testis-specific Y-encoded protein 3-like,LOC129135297,6661276,6664016,+,ANCESTRAL,TSPY8,no,
3,PanTro,testis-specific Y-encoded protein 3,LOC107973386,6672036,6674778,+,ANCESTRAL,TSPY8,no,
4,PanTro,testis-specific Y-encoded protein 3,LOC112207446,6682798,6685540,+,ANCESTRAL,TSPY8,no,
...,...,...,...,...,...,...,...,...,...,...
399,MacFas,testis-specific Y-encoded protein 2-like,LOC141409530,11981124,11983881,-,Unknown,TSPY8,no,
400,MacFas,testis-specific Y-encoded protein 2-like,LOC141409557,12091184,12093921,+,Unknown,TSPY8,no,
401,MacFas,deleted in azoospermia protein 1-like,LOC141409531,12794702,12850544,+,Unknown,DAZ1,no,
402,MacFas,testis-specific chromodomain protein Y 2-like,LOC141409533,13091147,13093961,-,Unknown,CDY1,no,


In [11]:
# Distinct gene-family labels present in the gene table, and how many there are
unique_vals = genes.loc[:, 'gene_family_symbol'].unique()
print(unique_vals)
len(unique_vals)

['CDY1' 'glutamate dehydrogenase 1, mitochondrial-like' 'TSPY8' 'DAZ1'
 'BPY2' 'RBMY1B' 'MTRNR2-like 17' 'proline-rich protein, Y-linked' 'VCY1B'
 'HSFY1' 'keratin, type I cytoskeletal 18-like' 'protein FRG1-like'
 'centriole and centriolar satellite protein OFD1-like'
 'protein FAM47A-like' 'zinc finger protein 285-like'
 'adenylate kinase isoenzyme 6-like'
 'endogenous retrovirus group K member 19 Env polyprotein-like'
 'TATA-box binding protein associated factor 11 like protein 2-like']


18

## Extract coding sequences per family 

In [12]:
# Left out because not ampliconic (checked individually before: these do not reach 95% identity over 80% coverage): 'uncharacterized LOC129138873'

# notes: 
# 'glutamate dehydrogenase 1, mitochondrial-like' is called glutamate
# 'MTRNR2-like 17' is called MTRNR2
# 'proline-rich protein, Y-linked' is named proline
#  'keratin, type I cytoskeletal 18-like' is called keratin
# 'protein FRG1-like' is named FRG1
# 'centriole and centriolar satellite protein OFD1-like' is called centriole 
# 'protein FAM47A-like' is called FAM47A
# 'zinc finger protein 285-like' is called zinc
# 'adenylate kinase isoenzyme 6-like' is called isoenzyme
# 'endogenous retrovirus group K member 19 Env polyprotein-like' is called retrovirus
# 'TATA-box binding protein associated factor 11 like protein 2-like' is called TATA-box


In [13]:
# Short family labels to process; trim this list to restrict the analysis to
# certain families
families = (
    "CDY1 glutamate TSPY8 DAZ1 "
    "BPY2 RBMY1B MTRNR2 proline VCY1B "
    "HSFY1 keratin FRG1 "
    "centriole FAM47A zinc isoenzyme "
    "retrovirus TATA-box"
).split()
len(families)

1

### Extract the coding Regions
From each gene extract the CDS and merge all the species together in one large fasta file. <br>
Translate the sequences at the end so that they can be checked against the protein files of the reference genomes.

In [15]:
# Extract the sequences

def parse_attributes(attr_str):
    """Split a ';'-separated 'key=value' attribute string into a dict.

    Keys are stripped and lower-cased, values are stripped. Fields without an
    '=' are ignored; only the first '=' of a field separates key from value.
    """
    parsed = {}
    for field in attr_str.strip().split(";"):
        key, separator, value = field.partition("=")
        if separator:
            parsed[key.strip().lower()] = value.strip()
    return parsed

def get_isoform(attrs):
    """Return the isoform label for a parsed attribute dict.

    The label is taken from an 'isoform <label>' mention in the 'product'
    attribute (upper-cased); otherwise the 'transcript_id', then the
    'protein_id' attribute is returned; "NA" when none of these is available.
    """
    hit = re.search(
        r'isoform\s+(X\d+|\d+|[a-z])\b',
        attrs.get("product", ""),
        re.IGNORECASE,
    )
    if hit is not None:
        # normalize to uppercase
        return hit.group(1).upper()

    fallbacks = (attrs[key] for key in ("transcript_id", "protein_id") if key in attrs)
    return next(fallbacks, "NA")

def select_best_isoform(isoforms_dict, refseq_iso):
    """Pick one isoform label from the keys of `isoforms_dict`.

    Priority order:
      1. `refseq_iso`, when given and present
      2. "NA" (unnamed isoform)
      3. "X1" (exact match only), then "1", then "A"
      4. the lowest X-prefixed number, else the lowest plain number,
         else the alphabetically first remaining label
    """
    labels = list(isoforms_dict.keys())

    if refseq_iso and refseq_iso in labels:
        return refseq_iso

    # Fixed labels, checked in priority order
    for preferred in ("NA", "X1", "1", "A"):
        if preferred in labels:
            return preferred

    # Bucket the remaining labels: X-prefixed numbers, plain numbers, the rest
    x_ranked, plain_ranked, other_labels = [], [], []
    for label in labels:
        x_hit = re.match(r'^X(\d+)$', label, re.IGNORECASE)
        plain_hit = None if x_hit else re.match(r'^(\d+)$', label)
        if x_hit:
            x_ranked.append((int(x_hit.group(1)), label))
        elif plain_hit:
            plain_ranked.append((int(plain_hit.group(1)), label))
        else:
            other_labels.append(label)

    for ranked in (x_ranked, plain_ranked):
        if ranked:
            return min(ranked)[1]

    if other_labels:
        return min(other_labels)

    return sorted(labels)[0]

@functools.lru_cache(maxsize=1)
def _load_genome_cached(genome_fasta, _mtime):
    """Parse a FASTA file into {record id: SeqRecord}, remembering the last file parsed.

    `_mtime` only takes part in the cache key, so a FASTA that was rewritten
    on disk is parsed again instead of being served from the cache.
    """
    return SeqIO.to_dict(SeqIO.parse(genome_fasta, "fasta"))


def _load_genome(genome_fasta):
    """Return the parsed genome, re-using the previous parse when the same path is asked for again."""
    if isinstance(genome_fasta, (str, os.PathLike)):
        path = os.fspath(genome_fasta)
        return _load_genome_cached(path, os.path.getmtime(path))
    # Anything that is not a path (e.g. an open handle) is parsed directly, as before.
    return SeqIO.to_dict(SeqIO.parse(genome_fasta, "fasta"))


def extract_cds(genome_fasta, gff_file, gene_name):
    """Extract the CDS of one gene, keeping only its best isoform.

    All CDS rows of `gff_file` whose 'gene' attribute equals `gene_name` are
    grouped by isoform label (see `get_isoform`), each isoform's fragments are
    concatenated in transcript order, and one isoform is chosen with
    `select_best_isoform`.

    :param genome_fasta: Path to the genome FASTA holding the sequences named
                         in the first GFF column.
    :param gff_file: Path to the GFF annotation file.
    :param gene_name: Value of the 'gene' attribute to look for.
    :return: ``{}`` when the gene has no CDS rows, otherwise
             ``{isoform: (cds_sequence_string, attributes_of_last_CDS_row)}``
             with a single entry.
    :raises KeyError: if a CDS row names a sequence absent from the FASTA.
    """
    # The FASTA used to be re-parsed for every gene; consecutive calls for the
    # same file now share a single parse.
    genome = _load_genome(genome_fasta)
    records = {}       # isoform -> list of (seqid, start0, end, phase, strand)
    seen_coords = {}   # isoform -> set of (seqid, start0, end, strand) already stored
    attrs_map = {}     # isoform -> attributes of the last CDS row seen for it
    refseq_iso = None

    # Collect all CDS features for this gene
    with open(gff_file) as fh:
        for line in fh:
            if line.startswith("#"):
                continue
            cols = line.rstrip("\n").split("\t")
            if len(cols) < 9 or cols[2] != "CDS":
                continue

            seqid, _, _, start, end, _, strand, frame, attr_str = cols[:9]
            attrs = parse_attributes(attr_str)

            if attrs.get("gene") != gene_name:
                continue

            iso = get_isoform(attrs)

            # GFF coordinates are 1-based inclusive; convert to a Python slice
            fragment = (seqid, int(start) - 1, int(end), int(frame), strand)
            coords = (fragment[0], fragment[1], fragment[2], fragment[4])

            iso_fragments = records.setdefault(iso, [])
            iso_seen = seen_coords.setdefault(iso, set())
            # Skip fragments listed more than once (the phase is not part of the comparison)
            if coords not in iso_seen:
                iso_seen.add(coords)
                iso_fragments.append(fragment)

            attrs_map[iso] = attrs

            if "refseq select" in attrs.get("tag", "").lower():
                refseq_iso = iso

    if not records:
        return {}

    # Assemble CDS sequence for each isoform
    isoform_sequences = {}
    for iso, frags in records.items():
        # Minus-strand genes are read from the highest coordinate downwards
        rev = (frags[0][4] == "-")
        frags.sort(key=lambda x: x[1], reverse=rev)

        pieces = []
        for position, (sid, s, e, frm, strand) in enumerate(frags):
            subseq = genome[sid].seq[s:e]
            if strand == "-":
                subseq = subseq.reverse_complement()
            if position == 0:
                # Only the first fragment is trimmed by its phase so that the
                # assembled sequence starts on a codon boundary
                subseq = subseq[frm:]
            pieces.append(str(subseq))

        isoform_sequences[iso] = "".join(pieces)

    # Select best isoform
    best_iso = select_best_isoform(isoform_sequences, refseq_iso)

    return {best_iso: (isoform_sequences[best_iso], attrs_map[best_iso])}

# ========== Main processing loop ================
# For every family: select the genes whose family symbol contains the family
# label and write, per species, one FASTA with the selected-isoform CDS of
# each of those genes.
for family in families:
    print(f"\n=== Processing family {family!r} ===")
    in_family = genes["gene_family_symbol"].str.contains(family, na=False)
    filtered_genes = genes[in_family]

    intermediate_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform"
    os.makedirs(intermediate_dir, exist_ok=True)

    for species in species_list:
        info = species_info[species]
        genome, gff_file = info["fasta_y"], info["gff_y_cds"]
        out_fa = f"{intermediate_dir}/{species}_{family}.fa"

        with open(out_fa, "w") as fout:
            sp_genes = filtered_genes[filtered_genes["Species"] == species]
            written = 0

            for gene in sp_genes["Gene_symbol"]:
                iso_dict = extract_cds(genome, gff_file, gene)

                if iso_dict:
                    for iso, (seq, attrs) in iso_dict.items():
                        pid = attrs.get("protein_id", "")
                        # The protein id is appended to the header only when known
                        header = f">{gene}_isoform_{iso}" + (f";protein_id={pid}" if pid else "")
                        fout.write(f"{header}\n{seq}\n")
                        written += 1
                else:
                    print(f"    [!] No CDS for {gene} in {species}")

            print(f"  • {species}: wrote {written}/{len(sp_genes)} genes → {out_fa}")


=== Processing family 'CDY1' ===
  • PanTro: wrote 4/4 genes → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_y_updated/CDY1_selected_isoform/PanTro_CDY1.fa
  • HomSap: wrote 4/4 genes → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_y_updated/CDY1_selected_isoform/HomSap_CDY1.fa
  • PanPan: wrote 2/2 genes → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_y_updated/CDY1_selected_isoform/PanPan_CDY1.fa
  • GorGor: wrote 1/1 genes → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_y_updated/CDY1_selected_isoform/GorGor_CDY1.fa
  • PonPyg: wrote 13/13 genes → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_y_updated/CDY1_selected_isoform/PonPyg_CDY1.fa
  • PonAbe: wrote 22/22 genes → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_y_updated/CDY1_selected_isoform/PonAbe_CDY1.fa
  • SymSyn: wrote 5/5 genes → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_y_updated/CDY1_selected_isofo

In [17]:
# combine fasta files
# Concatenate the per-species FASTA files of every family into one
# all-species file; each header gets the species appended as a suffix.
for family in families:
    print(f"\n→ Processing family {family!r}\n")

    # combine step
    intermediate_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform"
    combined_fasta   = f"{intermediate_dir}/all_species_{family}.fa"
    with open(combined_fasta, "w") as outfile:
        for species in species_list:
            species_fasta = f"{intermediate_dir}/{species}_{family}.fa"
            species_suffix = f"_{species}"
            try:
                with open(species_fasta) as infile:
                    for line in infile:
                        if not line.startswith(">"):
                            # sequence lines are copied unchanged
                            outfile.write(line)
                            continue
                        header = line.strip()
                        if not header.endswith(species_suffix):
                            header += species_suffix
                        outfile.write(header + "\n")
            except FileNotFoundError:
                print(f"[Warning] {species_fasta} not found.")
    print(f"→ Combined FASTA for {family} at {combined_fasta}")


→ Processing family 'CSF2RA'

→ Combined FASTA for CSF2RA at /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/all_species_CSF2RA.fa


In [18]:
##TRANSLATE 
def translate_record(record, frame=0):
    """
    Build a protein SeqRecord from a nucleotide SeqRecord.

    :param record: A SeqRecord object containing the nucleotide sequence.
    :param frame: Number of leading bases to drop before translating (0, 1, or 2).
    :return: A new SeqRecord with the translated sequence, the id of `record`
             and the description "translated"; stop codons are kept as '*'.
    """
    in_frame = record.seq[frame:]
    return SeqRecord(
        in_frame.translate(to_stop=False),
        id=record.id,
        description="translated",
    )

def translate_fasta(input_file, output_file, frame=0):
    """
    Translate every nucleotide record of a FASTA file and write the resulting
    protein records to another FASTA file.

    :param input_file: Path to the input FASTA file.
    :param output_file: Path to the output FASTA file.
    :param frame: The reading frame (0, 1, or 2) to start translation.
    """
    # All records are translated before the output file is written
    translated_records = [
        translate_record(nucleotide_record, frame)
        for nucleotide_record in SeqIO.parse(input_file, "fasta")
    ]
    SeqIO.write(translated_records, output_file, "fasta")

# Translate the combined all-species FASTA of every family
for family in families:
    # per-family directory (created when missing)
    intermediate_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform"
    os.makedirs(intermediate_dir, exist_ok=True)

    # input: combined CDS FASTA, output: its translation next to it
    in_fasta, out_fasta = (
        f"{intermediate_dir}/all_species_{family}.fa",
        f"{intermediate_dir}/all_species_{family}_translated.fa",
    )

    print(f"Translating {in_fasta} → {out_fasta}")
    translate_fasta(in_fasta, out_fasta, frame=0)

Translating /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/all_species_CSF2RA.fa → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/all_species_CSF2RA_translated.fa


## Create Ampliconic clustering across species using BLAST

In [19]:
### Create ampliconic clusters based on Identity using BLASTN
# Build one nucleotide BLAST database per family from its combined FASTA.
for family in families:
    print(f"▶ makeblastdb for {family!r}")
    family_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform"
    # Make sure the database directory exists before makeblastdb writes into it
    os.makedirs(f"{family_dir}/blastdb", exist_ok=True)
    # Argument list instead of a shell string: paths are passed verbatim, so
    # spaces or shell metacharacters in a path cannot break the command
    cmd = [
        "makeblastdb",
        "-in", f"{family_dir}/all_species_{family}.fa",
        "-dbtype", "nucl",
        "-out", f"{family_dir}/blastdb/blastdb",
    ]
    subprocess.run(cmd, check=True)

▶ makeblastdb for 'CSF2RA'


Building a new DB, current time: 10/31/2025 14:52:06
New DB name:   /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/blastdb/blastdb
New DB title:  /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/all_species_CSF2RA.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 9 sequences in 0.08725 seconds.




In [20]:
# pairwise comparison of each gene coding sequence against all other genes in the database
# Output columns (tabular format 6), in this order:
#   qseqid sseqid pident mismatch gapopen gaps qcovs qcovshsp evalue
# Any code reading the result file has to use this column order.
for family in families:
    print(f"Processing family: {family}")
    family_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform"
    # Argument list instead of a shell string: no quoting or escaping needed
    cmd = [
        "blastn",
        "-query", f"{family_dir}/all_species_{family}.fa",
        "-db", f"{family_dir}/blastdb/blastdb",
        # NOTE: the file name says "blastp" although blastn is run; it is kept
        # because it is the name the result file is written under
        "-out", f"{family_dir}/blastdb/results.blastp.tsv",
        "-outfmt", "6 qseqid sseqid pident mismatch gapopen gaps qcovs qcovshsp evalue",
    ]

    subprocess.run(cmd, check=True)

Processing family: CSF2RA


In [85]:
# Collect, per family, the BLAST hits that pass the identity / coverage /
# e-value filters as a directed adjacency list: query id -> [subject ids].
# Species-specific identity thresholds are applied.
edges = {}
identity_default = 95
identity_macfas = 90  # More lenient threshold for MacFas because it is already 5% sequence identity away from the other species
coverage = 80         # minimum query coverage (qcovs), in percent
score = 0.001         # maximum e-value

# Column order of results.blastp.tsv, i.e. the -outfmt string of the blastn step.
# The e-value is the 9th column; the 8th (qcovshsp) was previously compared
# against `score`, which rejected practically every hit.
blast_columns = ("qseqid", "sseqid", "pident", "mismatch", "gapopen",
                 "gaps", "qcovs", "qcovshsp", "evalue")

edges_per_family = {}

for family in families:
    print(f"Processing family: {family}")
    edges = {}
    file = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/results.blastp.tsv"

    with open(file, "r") as infile:
        for line_no, line in enumerate(infile, start=1):
            if not line.strip():
                continue  # tolerate blank lines
            cols = line.strip().split("\t")
            if len(cols) != len(blast_columns):
                # A different column count means the file was produced with another
                # -outfmt; reading it by position would silently filter on the wrong fields
                raise ValueError(
                    f"{file}, line {line_no}: expected {len(blast_columns)} columns "
                    f"({' '.join(blast_columns)}), found {len(cols)}"
                )
            hit = dict(zip(blast_columns, cols))

            # Determine which identity threshold to use
            query_id = hit["qseqid"]
            subject_id = hit["sseqid"]

            is_macfas_comparison = "MacFas" in query_id or "MacFas" in subject_id

            # Use appropriate identity threshold
            identity_threshold = identity_macfas if is_macfas_comparison else identity_default

            # Apply filtering with the appropriate threshold
            if (float(hit["pident"]) >= identity_threshold
                    and int(hit["qcovs"]) >= coverage
                    and float(hit["evalue"]) < score):
                edges.setdefault(query_id, []).append(subject_id)

    edges_per_family[family] = edges


Processing family: CSF2RA


In [86]:
# Keep only reciprocal hits: a pair (A, B) is retained when A lists B and B
# lists A. Each pair is stored once, as a sorted tuple, in order of discovery.
two_way_per_family = {}
vertices_per_family = {}

for family in families:
    print(f"Processing family: {family}")
    edges      = edges_per_family[family]
    edges_2way = []
    seen_pairs = set()   # set lookup replaces the former search through the list
    vertices   = set()

    for node_A, targets in edges.items():
        for node_B in targets:
            if node_A not in edges.get(node_B, ()):
                continue  # hit is not reciprocated
            pair = tuple(sorted((node_A, node_B)))
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)
            edges_2way.append(pair)
            vertices.add(node_A)
            vertices.add(node_B)

    two_way_per_family[family] = edges_2way
    vertices_per_family[family]   = vertices

Processing family: CSF2RA


In [87]:
def _connected_components(pairs):
    """Group the nodes of an undirected edge list into connected components.

    :param pairs: Iterable of 2-tuples (node, node); a pair of identical
                  nodes yields a single-node component.
    :return: List of sets, one per component, ordered by the first appearance
             of any of their members in `pairs`.
    """
    parent = {}

    def find(node):
        parent.setdefault(node, node)
        root = node
        while parent[root] != root:
            root = parent[root]
        # Path compression: point every visited node directly at the root
        while parent[node] != root:
            parent[node], node = root, parent[node]
        return root

    for left, right in pairs:
        root_left, root_right = find(left), find(right)
        if root_left != root_right:
            parent[root_right] = root_left

    components = {}
    for node in parent:  # dict order = order of first appearance
        components.setdefault(find(node), set()).add(node)
    return list(components.values())


clusters_per_family = {}          # declared as before; nothing in this cell fills it
merged_clusters_per_family = {}

for family in families:
    print(f"→ Clustering for family {family!r}")
    edges_2way = two_way_per_family[family]

    # Transitive clustering: genes connected through any chain of reciprocal
    # hits end up in the same cluster. The former greedy grouping followed by a
    # single merge pass could leave two clusters sharing a gene, e.g. for the
    # pairs (a,b), (c,d), (e,f), (e,a), (f,c).
    merged_clusters = _connected_components(edges_2way)

    # store
    merged_clusters_per_family[family] = merged_clusters

→ Clustering for family 'CSF2RA'


In [89]:
# Define which sequences belong into a cluster together and save one
# gene -> cluster table per family
for family in families:
    clusters = merged_clusters_per_family.get(family, [])
    if not clusters:
        print(f"No clusters for family {family!r}, skipping.")
        continue

    rows = [
        (
            gene,
            gene.rsplit('_', 1)[1],      # species is the suffix after the last underscore
            cluster_id
        )
        for cluster_id, cluster in enumerate(clusters, start=1)
        # Clusters are sets: iterate them sorted so that the row order of the
        # CSV does not change from one kernel session to the next
        for gene in sorted(cluster)
    ]

    # Create DataFrame
    df = pd.DataFrame(rows, columns=['gene_name', 'species', 'cluster'])

    # Save to CSV
    out_csv = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/gene_clustering_{family}.csv"
    df.to_csv(out_csv, index=False)
    print(f"  • Saved clustering table for {family!r} → {out_csv}")

  • Saved clustering table for 'CSF2RA' → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/blastdb/gene_clustering_CSF2RA.csv


In [91]:
# Gives every cluster a recognizable name automatically: a gene-symbol prefix
# when the cluster contains a non-LOC gene, otherwise its species combination
for family in families:
    print(f"→ Naming clusters for family {family!r}")

    # 1) Load the per-family clustering table
    csv_in = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/gene_clustering_{family}.csv"
    if not os.path.exists(csv_in):
        # No table is written for a family without clusters, so there is
        # nothing to name here
        print(f"  • No clustering table for {family!r}, skipping.")
        continue
    df = pd.read_csv(csv_in)

    # 2) Auto-name each cluster
    cluster_name = {}
    species_combo_counts = {}

    for cid, grp in df.groupby('cluster'):
        # prefer non-LOC genes
        non_loc = grp[~grp['gene_name'].str.startswith('LOC')]
        if not non_loc.empty:
            # take the shortest prefix before first '_'
            prefs = non_loc['gene_name'].str.partition('_')[0]
            base = prefs.loc[prefs.str.len().idxmin()]
        else:
            # fallback: name by species combination, numbered per combination
            combo = "_".join(sorted(grp['species'].unique()))
            cnt = species_combo_counts.get(combo, 0) + 1
            species_combo_counts[combo] = cnt
            base = f"{combo}_cluster{cnt}"

        cluster_name[cid] = base

    # 3) Resolve duplicate names: the largest cluster keeps the base name,
    #    the others get the suffixes _2, _3, ... by decreasing size
    sizes = df.groupby('cluster').size().to_dict()
    dups = defaultdict(list)
    for cid, name in cluster_name.items():
        dups[name].append(cid)

    for name, cids in dups.items():
        if len(cids) > 1:
            cids.sort(key=lambda x: sizes[x], reverse=True)
            for idx, cid in enumerate(cids[1:], start=2):
                cluster_name[cid] = f"{name}_{idx}"

    # 4) Map back and save
    df['cluster_name'] = df['cluster'].map(cluster_name)
    out_csv = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/{family}_clusters_named_auto.csv"
    df.to_csv(out_csv, index=False)
    print(f"  • Saved auto-named clusters → {out_csv}")


→ Naming clusters for family 'CSF2RA'
  • Saved auto-named clusters → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/blastdb/CSF2RA_clusters_named_auto.csv


## Show the cluster counts

In [93]:
for family in families:
    print(f"\n→ Building species×cluster matrix for {family!r}")

    # Read the table of genes with their auto-generated cluster names
    in_csv = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/{family}_clusters_named_auto.csv"
    df = pd.read_csv(in_csv)

    # Long format: one row per (species, cluster) with the number of genes
    grouped = df.groupby(['species', 'cluster_name']).size().reset_index(name='count')

    # Wide format: species as rows, clusters as columns, absent combinations set to 0
    wide = grouped.pivot(index='species', columns='cluster_name', values='count')
    wide = wide.fillna(0)
    cluster_species_df = wide.sort_index().sort_index(axis=1)

    # Save the matrix (index column 'species' is written as well)
    out_csv = (f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/species_cluster_counts_{family}.csv")
    cluster_species_df.to_csv(out_csv)
    print(f"  • Saved species×cluster counts → {out_csv}")


→ Building species×cluster matrix for 'CSF2RA'
  • Saved species×cluster counts → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/blastdb/species_cluster_counts_CSF2RA.csv


## Subset the fasta file of a gene family into its clusters

In [95]:
for family in families:
    print(f"\n→ Processing family {family!r}")

    # Read the gene table carrying the auto-generated cluster names
    csv_in = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/{family}_clusters_named_auto.csv"
    df = pd.read_csv(csv_in)

    # cluster name → list of gene names belonging to it
    cluster_to_genes = {
        name: members.tolist()
        for name, members in df.groupby("cluster_name")["gene_name"]
    }

    # Short overview of the cluster sizes
    for cluster, genes in cluster_to_genes.items():
        print(f"  Cluster: {cluster}, Number of genes: {len(genes)}")

    # All sequences of the family, indexed by record id (a repeated id keeps the last record)
    fasta_in = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/all_species_{family}.fa"
    records = list(SeqIO.parse(fasta_in, "fasta"))
    record_dict = dict((rec.id, rec) for rec in records)
    print(f"  Loaded {len(record_dict)} FASTA records")

    # Directory receiving one FASTA file per cluster
    cluster_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_fastas"
    os.makedirs(cluster_dir, exist_ok=True)

    # Write the per-cluster FASTAs; genes without a FASTA record are left out
    for cluster, gene_list in cluster_to_genes.items():
        selected = [rec for rec in map(record_dict.get, gene_list) if rec is not None]
        out_fa = f"{cluster_dir}/{family}_cluster_{cluster}.fa"
        SeqIO.write(selected, out_fa, "fasta")
        print(f"    • Wrote {len(selected)} records → {out_fa}")


→ Processing family 'CSF2RA'
  Cluster: CSF2RA, Number of genes: 7
  Cluster: CSF2RA_2, Number of genes: 1
  Cluster: PonPyg_cluster1, Number of genes: 1
  Loaded 9 FASTA records
    • Wrote 7 records → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/blastdb/cluster_fastas/CSF2RA_cluster_CSF2RA.fa
    • Wrote 1 records → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/blastdb/cluster_fastas/CSF2RA_cluster_CSF2RA_2.fa
    • Wrote 1 records → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/blastdb/cluster_fastas/CSF2RA_cluster_PonPyg_cluster1.fa


##  Make sure the correct gene names are in the coordinate file

In [97]:
# Attach each family's auto-named cluster label to the master gene-coordinate table
# and write one updated TSV per family.

# 1) Read once and filter out everything with no gene_family_symbol
gene_details_file = f"{work_dir}/protein_extracted_longest/clusters_merged/gene_details_updated_with_palindromes_coordinates.tsv"

master_df = pd.read_csv(gene_details_file, sep="\t")
master_df = master_df.dropna(subset=["gene_family_symbol"])

# 2) Loop over each family
for family in families:
    print(f"\n→ Updating gene details for family {family!r}")

    # Load family's auto-named clusters
    cluster_file = os.path.join(
        data_dir,
        "sequences_y_updated",
        f"{family}_selected_isoform",
        "blastdb",
        f"{family}_clusters_named_auto.csv"
    )
    cluster_df = pd.read_csv(cluster_file, sep=",")

    # Subset the master table to only genes in this family.
    # regex=False: the family name is matched as a literal substring, so characters
    # such as '.' or '+' in a family name cannot be misread as a pattern.
    filt = master_df[
        master_df['gene_family_symbol'].str.contains(family, na=False, regex=False)
    ]

    # Merge keys: gene prefix (text before the first '_') + species.
    # - drop_duplicates keeps one cluster label per key (first occurrence), as the
    #   combined all-families table does, so the merge cannot multiply master rows.
    # - the helper columns get names that cannot collide with master columns, which
    #   makes the clean-up below reliable (the old check looked for 'species_y',
    #   which this merge never creates, so 'species' leaked into the output).
    cluster_df['gene_prefix'] = cluster_df['gene_name'].str.split('_').str[0]
    mapping = (
        cluster_df[['gene_prefix', 'species', 'cluster_name']]
        .drop_duplicates(subset=['gene_prefix', 'species'])
        .rename(columns={'gene_prefix': '_map_gene_prefix', 'species': '_map_species'})
    )

    # Merge on Gene_symbol + Species → gene prefix + species (left join keeps all master rows)
    merged = filt.merge(
        mapping,
        left_on=['Gene_symbol', 'Species'],
        right_on=['_map_gene_prefix', '_map_species'],
        how='left'
    )

    # Rename the label column and drop the helper columns
    merged = (
        merged
        .rename(columns={'cluster_name': 'cluster'})
        .drop(columns=['_map_gene_prefix', '_map_species'])
    )

    print(f"  {len(merged)} rows after merge (with new ‘cluster’ col)")

    # Write out the updated table
    out_tsv = os.path.join(
        data_dir,
        "sequences_y_updated",
        f"{family}_selected_isoform",
        "blastdb",
        f"{family}_compl_gene_details_updated_with_palindromes_coordinates.tsv"
    )
    merged.to_csv(out_tsv, sep="\t", index=False)
    print(f"  • Saved → {out_tsv}")


→ Updating gene details for family 'CSF2RA'
  9 rows after merge (with new ‘cluster’ col)
  • Saved → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/blastdb/CSF2RA_compl_gene_details_updated_with_palindromes_coordinates.tsv


In [98]:
## Build one table holding the cluster label of every gene of every family

# 1) Master gene table, restricted to genes that have a family symbol
gene_details_file = f"{work_dir}/protein_extracted_longest/clusters_merged/gene_details_updated_with_palindromes_coordinates.tsv"

master_df = pd.read_csv(gene_details_file, sep="\t")
master_df = master_df.dropna(subset=["gene_family_symbol"])


# 2) One (gene_prefix, Species, cluster) mapping per family that has a cluster file
maps = []
for family in families:
    fn = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/{family}_clusters_named_auto.csv"
    if os.path.exists(fn):
        cf = pd.read_csv(fn)
        maps.append(pd.DataFrame({
            # gene prefix = text before the first underscore of the gene name
            "gene_prefix": cf["gene_name"].str.split("_").str[0],
            "Species":     cf["species"],
            "cluster":     cf["cluster_name"],
        }))
    else:
        print(f"[!] missing cluster file for {family}, skipping")

# 3) Stack the mappings; the first label seen for a (prefix, species) key wins
map_df = pd.concat(maps, ignore_index=True)
map_df = map_df.drop_duplicates(subset=["gene_prefix", "Species"])

# 4) Left-join the labels onto the master table
merged_all = master_df.merge(
    map_df,
    left_on=["Gene_symbol", "Species"],
    right_on=["gene_prefix", "Species"],
    how="left"
)

# 5) Remove the helper key column if the merge kept it
merged_all = merged_all.drop(columns=["gene_prefix"], errors="ignore")

# 6) Save table
out_file = f"{data_dir}/sequences_y_updated/all_families_gene_details_with_clusters.tsv"

merged_all.to_csv(out_file, sep="\t", index=False)
print(f"→ Wrote combined table with cluster info for all families → {out_file}")


→ Wrote combined table with cluster info for all families → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/all_families_gene_details_with_clusters.tsv


## Do MEGA analysis dN & dS calculation per cluster

### Define all the clusters 

In [101]:
cluster_list_per_family = {}

for family in families:
    # the alignments directory is needed by the alignment cells; create it up front
    cluster_alignments = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_alignments"
    os.makedirs(cluster_alignments, exist_ok=True)

    # every cluster FASTA of this family, identified by its file name without ".fa"
    cluster_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_fastas"
    all_clusters = []
    for fn in os.listdir(cluster_dir):
        if fn.endswith(".fa"):
            all_clusters.append(os.path.splitext(fn)[0])

    # keep clusters with more than one sequence and report repeated sequence IDs
    filtered = []
    for name in all_clusters:
        path = os.path.join(cluster_dir, f"{name}.fa")
        with open(path) as f:
            headers = [line for line in f if line.startswith(">")]
        if len(headers) <= 1:
            continue
        filtered.append(name)

        # the sequence ID is the first whitespace-delimited word after ">"
        seen, dups = set(), set()
        for header in headers:
            seqid = header[1:].split()[0]
            if seqid in seen:
                dups.add(seqid)
            else:
                seen.add(seqid)
        if dups:
            print(f"[{family}] {name}.fa has duplicate IDs: {', '.join(dups)}")

    # remember the kept clusters for the alignment / dN-dS cells
    cluster_list_per_family[family] = filtered

    print(f"{family}: keeping {len(filtered)} clusters")

CSF2RA: keeping 1 clusters


### Make a Codon based Multi-Sequence Alignment

In [102]:
### Codon-aware alignment of every cluster
# STEP 1: MACSE alignSequences

# make sure each family has a directory for its alignments
for family in families:
    ds_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_alignments"
    os.makedirs(ds_dir, exist_ok=True)

for family, cluster_list in cluster_list_per_family.items():
    print(f"\n→ Aligning {len(cluster_list)} clusters for family {family!r}")
    cluster_fastas = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_fastas"
    cluster_alignments = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_alignments"

    for cluster in cluster_list:
        # input: the cluster FASTA; outputs: nucleotide and amino-acid alignments
        cmd = " ".join([
            "macse -prog alignSequences",
            f"-seq {cluster_fastas}/{cluster}.fa",
            f"-out_NT {cluster_alignments}/{cluster}_NT.fa",
            f"-out_AA {cluster_alignments}/{cluster}_AA.fa",
        ])
        # run quietly; a non-zero exit status raises CalledProcessError
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    print(f"→ Done family {family!r}")


→ Aligning 1 clusters for family 'CSF2RA'
→ Done family 'CSF2RA'


In [103]:
## STEP 2: refine the alignments with MACSE refineAlignment
## Run this as a second, separate step: it takes very long when run together with the alignment itself.

for family, cluster_list in cluster_list_per_family.items():
    print(f"\n→ Aligning {len(cluster_list)} clusters for family {family!r}")
    cluster_alignments = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_alignments"

    # Refinement helps with alignments that are difficult; the nucleotide
    # alignment is read and then overwritten at the same path.
    for cluster in cluster_list:
        cmd = " ".join([
            "macse -prog refineAlignment",
            f"-align {cluster_alignments}/{cluster}_NT.fa",
            f"-out_NT {cluster_alignments}/{cluster}_NT.fa",
            f"-out_AA {cluster_alignments}/{cluster}_AA.fa",
        ])
        # run quietly; a non-zero exit status raises CalledProcessError
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    print(f"→ Done family {family!r}")


→ Aligning 1 clusters for family 'CSF2RA'
→ Done family 'CSF2RA'


In [104]:
## STEP 3: clean the alignments with MACSE exportAlignment
## so that they can be used by the downstream dN/dS analysis

for family, cluster_list in cluster_list_per_family.items():
    print(f"\n→ Aligning {len(cluster_list)} clusters for family {family!r}")
    cluster_alignments = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_alignments"

    # MACSE marks frameshifts / internal stop codons in its output (e.g. with "!").
    # Internal stops are replaced by "NNN" and frameshifts by gaps; the alignment
    # files are overwritten in place.
    # NOTE(review): -charForRemainingFS is given three characters ("---") — confirm
    # against the MACSE documentation that this is the intended value.
    for cluster in cluster_list:
        cmd = " ".join([
            "macse -prog exportAlignment",
            f"-align {cluster_alignments}/{cluster}_NT.fa",
            "-codonForInternalStop NNN",
            "-codonForInternalFS ---",
            "-charForRemainingFS ---",
            f"-out_NT {cluster_alignments}/{cluster}_NT.fa",
            f"-out_AA {cluster_alignments}/{cluster}_AA.fa",
        ])
        # run quietly; a non-zero exit status raises CalledProcessError
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    print(f"→ Done family {family!r}")


→ Aligning 1 clusters for family 'CSF2RA'
→ Done family 'CSF2RA'


### Calculate dN & dS and N & S counts for each pairwise comparison

In [105]:
## Synonymous substitution rate (dS) per cluster
# megacc settings file name: modified Nei-Gojobori, complete deletion
for family in families:
    # output directory for the dS results of this family
    ds_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_dS"
    os.makedirs(ds_dir, exist_ok=True)

    # analysis settings (shared .mao file) and the directory holding the alignments
    mao_file = f"{data_dir}/gene_family/sequences/MAGE_isoform_X1/blastdb/compute_ds_modNG_compldel.mao"
    alignment_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_alignments"

    # clusters kept for this family (more than one sequence)
    clusters = cluster_list_per_family[family]

    # one megacc run per cluster, on its nucleotide alignment
    for cluster in clusters:
        cmd = " ".join([
            "megacc",
            f"-a {mao_file}",
            f"-d {alignment_dir}/{cluster}_NT.fa",
            f"-o {ds_dir}/{cluster}",
        ])
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


In [106]:
## Nonsynonymous substitution rate (dN) per cluster
# megacc settings file name: modified Nei-Gojobori, complete deletion
for family in families:
    # output directory for the dN results of this family
    ds_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_dN"
    os.makedirs(ds_dir, exist_ok=True)

    # analysis settings (shared .mao file) and the directory holding the alignments
    mao_file = f"{data_dir}/gene_family/sequences/MAGE_isoform_X1/blastdb/compute_dN_modNG_compldel.mao"
    alignment_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_alignments"

    # clusters kept for this family (more than one sequence)
    clusters = cluster_list_per_family[family]

    # one megacc run per cluster, on its nucleotide alignment
    for cluster in clusters:
        cmd = " ".join([
            "megacc",
            f"-a {mao_file}",
            f"-d {alignment_dir}/{cluster}_NT.fa",
            f"-o {ds_dir}/{cluster}",
        ])
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

In [109]:
## Synonymous substitution COUNTS per cluster
# (counts of synonymous differences rather than rates)
for family in families:
    # output directory for the synonymous counts of this family
    ds_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_syn_count"
    os.makedirs(ds_dir, exist_ok=True)

    # analysis settings (shared .mao file) and the directory holding the alignments
    mao_file = f"{data_dir}/gene_family/sequences/MAGE_isoform_X1/blastdb/compute_syn_count_modNG_compldel.mao"
    alignment_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_alignments"

    # clusters kept for this family (more than one sequence)
    clusters = cluster_list_per_family[family]

    # one megacc run per cluster, on its nucleotide alignment
    for cluster in clusters:
        cmd = " ".join([
            "megacc",
            f"-a {mao_file}",
            f"-d {alignment_dir}/{cluster}_NT.fa",
            f"-o {ds_dir}/{cluster}",
        ])
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


In [110]:
## Nonsynonymous substitution COUNTS per cluster
# (counts of nonsynonymous differences rather than rates)
for family in families:
    # output directory for the nonsynonymous counts of this family
    ds_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_nonsyn_count"
    os.makedirs(ds_dir, exist_ok=True)

    # analysis settings (shared .mao file) and the directory holding the alignments
    mao_file = f"{data_dir}/gene_family/sequences/MAGE_isoform_X1/blastdb/compute_nonsyn_count_modNG_compldel.mao"
    alignment_dir = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/cluster_alignments"

    # clusters kept for this family (more than one sequence)
    clusters = cluster_list_per_family[family]

    # one megacc run per cluster, on its nucleotide alignment
    for cluster in clusters:
        cmd = " ".join([
            "megacc",
            f"-a {mao_file}",
            f"-d {alignment_dir}/{cluster}_NT.fa",
            f"-o {ds_dir}/{cluster}",
        ])
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


### Turn .meg files into csv files

In [113]:
# parser 
def parse_meg_file(path: Path) -> pd.DataFrame:
    """Parse a lower-left pairwise matrix from a MEGA ``.meg`` file.

    Parameters
    ----------
    path : Path
        File containing label lines of the form ``[ i] #label`` followed by
        matrix rows of the form ``[ i] v1 v2 ... v(i-1)``.

    Returns
    -------
    pd.DataFrame
        Square frame indexed and labelled by the taxon labels. Only the strict
        lower triangle carries values; the diagonal and upper triangle are NaN.
        A pair is also NaN when its value is a non-numeric placeholder
        (``?`` or ``n/c``) or when its row could not be parsed.
    """
    text = path.read_text().splitlines()

    # Taxon labels: "[ i] #label" lines; stop at the first bracketed line
    # without '#', which starts the matrix section.
    labels = []
    for line in text:
        s = line.strip()
        if re.match(r"^\[\s*\d", s) and "#" not in s:
            break
        m = re.match(r"^\[\s*(\d+)\]\s*#\s*(.+)$", s)
        if m:
            labels.append(m.group(2).strip())
    n = len(labels)

    # Start from NaN so a pair without a parsed value is reported as missing
    # instead of being mistaken for a distance of zero.
    full = np.full((n, n), np.nan)

    # One token per matrix cell: a number (decimal point optional, exponent
    # allowed) or a placeholder for a value that could not be estimated.
    token_re = re.compile(
        r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|\?|n/c", re.IGNORECASE
    )

    for line in text:
        s = line.strip()
        m = re.match(r"^\[\s*(\d+)\]\s*(.*)$", s)
        if not m:
            continue
        i = int(m.group(1))
        rest = m.group(2)
        # Skip out-of-range rows and label lines (digits inside a label such as
        # "LOC123.1" must not be read as matrix values).
        if not (1 <= i <= n) or rest.startswith("#"):
            continue
        tokens = token_re.findall(rest)
        # Row i of a lower-left matrix holds exactly i-1 values.
        if len(tokens) != i - 1:
            continue
        for k, tok in enumerate(tokens):
            v = np.nan if tok[0] in "?nN" else float(tok)
            full[i - 1, k] = full[k, i - 1] = v

    np.fill_diagonal(full, 0.0)
    df = pd.DataFrame(full, index=labels, columns=labels)
    # Keep the strict lower triangle only, so each pair is counted once.
    mask = np.tril(np.ones(df.shape, bool), k=-1)
    return df.where(mask)

# base data directory
# NOTE: this rebinds `data_dir` (a plain string in the configuration cell at the top
# of the notebook) to a pathlib.Path, which the cells below rely on for the `/` operator.
data_dir = Path("/home/emma/Amplicons/Workspaces/emma/downloaded_data")

In [116]:
# Convert every dS (synonymous rate) .meg matrix of each family into a CSV
for family in families:
    print(f"\n→ Processing dS matrices for family {family!r}")

    # megacc output directory and the sub-directory receiving the CSVs
    in_dir = data_dir / "sequences_y_updated" / f"{family}_selected_isoform" / "blastdb" / "cluster_dS"
    out_dir = in_dir / "matrix_csvs"
    out_dir.mkdir(exist_ok=True)

    # one lower-triangle CSV per .meg file, processed in sorted order
    meg_files = sorted(in_dir.glob("*.meg"))
    for meg in meg_files:
        out_file = out_dir / f"{meg.stem}_dS.csv"
        df_lower = parse_meg_file(meg)
        df_lower.to_csv(out_file)


→ Processing dS matrices for family 'CSF2RA'


In [117]:
# Convert every dN (nonsynonymous rate) .meg matrix of each family into a CSV
for family in families:
    print(f"\n→ Processing dN matrices for family {family!r}")

    # megacc output directory and the sub-directory receiving the CSVs
    in_dir = data_dir / "sequences_y_updated" / f"{family}_selected_isoform" / "blastdb" / "cluster_dN"
    out_dir = in_dir / "matrix_csvs"
    out_dir.mkdir(exist_ok=True)

    # one lower-triangle CSV per .meg file, processed in sorted order
    meg_files = sorted(in_dir.glob("*.meg"))
    for meg in meg_files:
        out_file = out_dir / f"{meg.stem}_dN.csv"
        df_lower = parse_meg_file(meg)
        df_lower.to_csv(out_file)


→ Processing dN matrices for family 'CSF2RA'


In [118]:
# the per-family loop: SYNONYMOUS substitution COUNTS
# (the header comment and log message used to say "rate" / "dS matrices",
#  copied over from the dS cell, although this cell converts the count matrices)
for family in families:
    print(f"\n→ Processing synonymous-count matrices for family {family!r}")

    # megacc output directory for the counts and the sub-directory receiving the CSVs
    in_dir  = data_dir / f"sequences_y_updated/{family}_selected_isoform/blastdb/cluster_syn_count"
    out_dir = in_dir / "matrix_csvs"
    out_dir.mkdir(exist_ok=True)

    # one lower-triangle CSV per .meg file, processed in sorted order
    for meg in sorted(in_dir.glob("*.meg")):
        df_lower = parse_meg_file(meg)
        out_file = out_dir / f"{meg.stem}_S_count.csv"
        df_lower.to_csv(out_file)
        #print(f"   • wrote {out_file}")



→ Processing dS matrices for family 'CSF2RA'


In [121]:
# the per-family loop: NONSYNONYMOUS substitution COUNTS
# (the header comment and log message used to say "rate" / "dS matrices",
#  copied over from the dS cell, although this cell converts the count matrices)
for family in families:
    print(f"\n→ Processing nonsynonymous-count matrices for family {family!r}")

    # megacc output directory for the counts and the sub-directory receiving the CSVs
    in_dir  = data_dir / f"sequences_y_updated/{family}_selected_isoform/blastdb/cluster_nonsyn_count"
    out_dir = in_dir / "matrix_csvs"
    out_dir.mkdir(exist_ok=True)

    # one lower-triangle CSV per .meg file, processed in sorted order
    for meg in sorted(in_dir.glob("*.meg")):
        df_lower = parse_meg_file(meg)
        out_file = out_dir / f"{meg.stem}_N_count.csv"
        df_lower.to_csv(out_file)
        #print(f"   • wrote {out_file}")



→ Processing dS matrices for family 'CSF2RA'


### Combine all the synonymous and nonsynonymous tables

In [122]:
# extract the number of codons 
def get_num_sites(cluster, family=None):
    """Return the number of sites recorded for a cluster, or None if unknown.

    The value is taken from the "No. of Sites" entry of the cluster's dN
    ``.meg`` file. If that entry is absent, the cluster's nucleotide alignment
    (``cluster_alignments/<family>_cluster_<cluster>_NT.fa``) is read instead
    and its length divided by 3 is returned. That fallback is the full aligned
    length in codons and does not account for sites removed by the analysis.

    Parameters
    ----------
    cluster : str
        Cluster name as used in the ``<family>_cluster_<cluster>`` file names.
    family : str, optional
        Gene family. When omitted, the notebook-level ``family`` variable (the
        current loop value) is used, which keeps existing calls working.

    Returns
    -------
    int or None
        Number of sites, or None when neither source yields a value.

    Raises
    ------
    FileNotFoundError
        If the ``.meg`` file of the cluster does not exist.
    """
    if family is None:
        family = globals()["family"]

    blast_dir = Path(data_dir) / f"sequences_y_updated/{family}_selected_isoform/blastdb"
    meg_path = blast_dir / "cluster_dN" / f"{family}_cluster_{cluster}.meg"
    text = meg_path.read_text()

    # accept both "No. of Sites = N" and "No. of Sites : N"
    m = re.search(r"No\.?\s*of\s*Sites\s*[:=]\s*(\d+)", text, flags=re.IGNORECASE)
    if m:
        return int(m.group(1))

    # Fallback: the .meg file holds a distance matrix, not sequences, so the
    # length has to come from the alignment that was fed to megacc. (The former
    # fallback called an un-imported AlignIO with an unsupported "mega" format
    # and therefore always returned None.)
    aln_path = blast_dir / "cluster_alignments" / f"{family}_cluster_{cluster}_NT.fa"
    try:
        lengths = {len(rec.seq) for rec in SeqIO.parse(str(aln_path), "fasta")}
    except OSError:
        return None
    if len(lengths) != 1:
        # no records, or sequences of unequal length: not a usable alignment
        return None
    return lengths.pop() // 3

### Between species

In [123]:
# Between-species summary: for every family and cluster, average the pairwise
# dS / dN rates and S / N counts over all gene pairs of each species pair, then
# add codon number, dN/dS, copy numbers and an adjusted dN/dS, and save one TSV per family.
# NOTE: all warnings are silenced from here on for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

for family in families:
    print(f"\n→ Processing between‐species stats for family {family}")

    # 1) define syn/nonsyn CSV directories
    syn_dir    = data_dir / f"sequences_y_updated/{family}_selected_isoform/blastdb/cluster_dS/matrix_csvs"
    nonsyn_dir = data_dir / f"sequences_y_updated/{family}_selected_isoform/blastdb/cluster_dN/matrix_csvs"

    # 2) cluster IDs
    # (recovered from the dS file names: "<family>_cluster_<ID>_dS.csv")
    clusters = []
    for f in syn_dir.glob(f"{family}_cluster_*_dS.csv"):
        m = re.match(rf"{family}_cluster_(.+)_dS\.csv", f.name)
        if m:
            clusters.append(m.group(1))
    clusters = sorted(set(clusters))
    print("  Found clusters:", clusters)

    # compute stats for a species pair in a lower‐triangle df
    # A cell is selected when its row label contains `a` and its column label
    # contains `b`, or the other way round (str.contains: pattern/substring match).
    # stack() drops the NaN cells; returns (mean, sample SD with ddof=1).
    def pair_stats(df, a, b):
        r = df.index.to_series().str.contains
        c = df.columns.to_series().str.contains
        mask = np.outer(r(a), c(b)) | np.outer(r(b), c(a))
        vals = df.where(mask).stack()
        return vals.mean(), vals.std(ddof=1)

    # 3) load syn/nonsyn matrices and compute mean/SD
    # (`species_list` is defined in an earlier cell of the notebook)
    records = []
    for cluster in clusters:
        syn_df    = (pd.read_csv(syn_dir/f"{family}_cluster_{cluster}_dS.csv", index_col=0)
                       .sort_index().sort_index(axis=1))
        nonsyn_df = (pd.read_csv(nonsyn_dir/f"{family}_cluster_{cluster}_dN.csv", index_col=0)
                       .sort_index().sort_index(axis=1))
        for sp1, sp2 in itertools.combinations(species_list, 2):
            m_s, sd_s = pair_stats(syn_df,    sp1, sp2)
            m_n, sd_n = pair_stats(nonsyn_df, sp1, sp2)
            records.append({
                "Cluster":      cluster,
                "Species1":     sp1,
                "Species2":     sp2,
                "Mean_Syn":     m_s,
                "SD_Syn":       sd_s,
                "Mean_Nonsyn":  m_n,
                "SD_Nonsyn":    sd_n
            })

    # keep only species pairs for which at least one statistic could be computed
    master = pd.DataFrame(records)
    cols = ["Mean_Syn","SD_Syn","Mean_Nonsyn","SD_Nonsyn"]
    master_clean = master.dropna(subset=cols, how="all")

    # 4) pull “No. of Sites” (codons) from each cluster's .meg
    # (get_num_sites reads the notebook-level `family` variable set by this loop)
    site_map = {cl: get_num_sites(cl) for cl in master_clean["Cluster"].unique()}
    master_clean["No_of_Codon"] = master_clean["Cluster"].map(site_map)

    # 5) compute dN/dS ratio
    # (plain division: a Mean_Syn of 0 gives inf, or NaN when Mean_Nonsyn is 0 too)
    master_clean["dNdS"] = master_clean["Mean_Nonsyn"] / master_clean["Mean_Syn"]

    # 6) annotate copy‐numbers
    # (looked up in the species×cluster count table; a missing species or cluster raises KeyError)
    counts_csv = (
        data_dir
        / f"sequences_y_updated/{family}_selected_isoform/blastdb/species_cluster_counts_{family}.csv"
    )
    counts_df = pd.read_csv(counts_csv).set_index("species")
    master_clean["Species1_num_copies"] = [
        counts_df.at[s, c]
        for s, c in zip(master_clean["Species1"], master_clean["Cluster"])
    ]
    master_clean["Species2_num_copies"] = [
        counts_df.at[s, c]
        for s, c in zip(master_clean["Species2"], master_clean["Cluster"])
    ]

    # 7) build S/N counts table and merge
    rec2 = []
    syn_cnt_dir    = data_dir / f"sequences_y_updated/{family}_selected_isoform/blastdb/cluster_syn_count/matrix_csvs"
    nonsyn_cnt_dir = data_dir / f"sequences_y_updated/{family}_selected_isoform/blastdb/cluster_nonsyn_count/matrix_csvs"
    for cluster in clusters:
        s_df = (pd.read_csv(syn_cnt_dir/f"{family}_cluster_{cluster}_S_count.csv", index_col=0)
                  .sort_index().sort_index(axis=1))
        n_df = (pd.read_csv(nonsyn_cnt_dir/f"{family}_cluster_{cluster}_N_count.csv", index_col=0)
                  .sort_index().sort_index(axis=1))
        for sp1, sp2 in itertools.combinations(species_list, 2):
            ms, ss = pair_stats(s_df, sp1, sp2)
            mn, sn = pair_stats(n_df, sp1, sp2)
            rec2.append({
                "Cluster":           cluster,
                "Species1":          sp1,
                "Species2":          sp2,
                "Mean_Syn_count":    ms,
                "SD_Syn_count":      ss,
                "Mean_Nonsyn_count": mn,
                "SD_Nonsyn_count":   sn
            })

    counts_total = (
        pd.DataFrame(rec2)
        .dropna(subset=["Mean_Syn_count","SD_Syn_count","Mean_Nonsyn_count","SD_Nonsyn_count"], how="all")
    )

    # left join keeps every rate row; all values are rounded to 4 decimals here,
    # so the quantities computed in step 8 are based on the rounded values
    final = (
        master_clean
        .merge(counts_total, on=["Cluster","Species1","Species2"], how="left")
        .round(4)
    )
    final[["Species1_num_copies","Species2_num_copies"]] = final[["Species1_num_copies","Species2_num_copies"]].astype(int)

    # 8) compute “potential synonymous sites” and adjusted dN/dS
    # pot_syn_sites = synonymous count / synonymous rate;
    # adj_dNdS divides dN by a dS recomputed with one extra synonymous difference
    final["pot_syn_sites"] = final["Mean_Syn_count"] / final["Mean_Syn"]
    final["adj_dNdS"]      = (final["Mean_Nonsyn"]) / (
        (final["Mean_Syn_count"] + 1) / final["pot_syn_sites"]
    )
    
    # 9) save
    out_tsv = (
        data_dir
        / f"sequences_y_updated/{family}_selected_isoform/blastdb/{family}_dN_dS_betweenspecies.tsv"
    )
    final.to_csv(out_tsv, sep="\t", index=False)
    print(f"  → saved {out_tsv}")


→ Processing between‐species stats for family CSF2RA
  Found clusters: ['CSF2RA']
  → saved /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/blastdb/CSF2RA_dN_dS_betweenspecies.tsv


### Within species

In [124]:
# Within-species summary: for every family and cluster, average the pairwise
# dS / dN rates and S / N counts over gene pairs belonging to the same species,
# add codon number, dN/dS and copy number, and save one TSV per family.
for family in families:
    print(f"\n→ Within‐species summary for family {family!r}")

    # 1) Directories for dS and dN matrices
    syn_dir    = data_dir / f"sequences_y_updated/{family}_selected_isoform/blastdb/cluster_dS/matrix_csvs"
    nonsyn_dir = data_dir / f"sequences_y_updated/{family}_selected_isoform/blastdb/cluster_dN/matrix_csvs"

    # 2) cluster IDs from the dS filenames
    clusters = sorted({
        re.match(rf"{family}_cluster_(.+)_dS\.csv", p.name).group(1)
        for p in syn_dir.glob(f"{family}_cluster_*_dS.csv")
        if re.match(rf"{family}_cluster_(.+)_dS\.csv", p.name)
    })
    print("  clusters:", clusters)

    # get mean & SD for a species in a lower‐triangle matrix
    # Called below with a == b, so the mask keeps the cells whose row AND column
    # labels both contain the species name (str.contains: pattern/substring match).
    # stack() drops the NaN cells; returns (mean, sample SD with ddof=1).
    def pair_stats(df, a, b):
        idx0 = df.index.to_series().str.contains
        idx1 = df.columns.to_series().str.contains
        mask = np.outer(idx0(a), idx1(b)) | np.outer(idx0(b), idx1(a))
        vals = df.where(mask).stack()
        return vals.mean(), vals.std(ddof=1)

    # 3) Build within‐species dS/dN rates table
    # (`species_list` is defined in an earlier cell of the notebook)
    rate_records = []
    for cluster in clusters:
        s_df = (pd.read_csv(syn_dir/f"{family}_cluster_{cluster}_dS.csv", index_col=0)
                  .sort_index().sort_index(axis=1))
        n_df = (pd.read_csv(nonsyn_dir/f"{family}_cluster_{cluster}_dN.csv", index_col=0)
                  .sort_index().sort_index(axis=1))
        for sp in species_list:
            m_s, sd_s = pair_stats(s_df, sp, sp)
            m_n, sd_n = pair_stats(n_df, sp, sp)
            rate_records.append({
                "Cluster":      cluster,
                "Species":      sp,
                "Mean_Syn":     m_s,
                "SD_Syn":       sd_s,
                "Mean_Nonsyn":  m_n,
                "SD_Nonsyn":    sd_n
            })
    # drop species for which no statistic could be computed in a cluster
    within_rates = pd.DataFrame(rate_records).dropna(
        subset=["Mean_Syn","SD_Syn","Mean_Nonsyn","SD_Nonsyn"],
        how="all"
    )

    # 4) Extract “No. of Sites” (codons) from each cluster’s .meg
    # (get_num_sites reads the notebook-level `family` variable set by this loop)
    site_map = {c: get_num_sites(c) for c in within_rates["Cluster"].unique()}
    within_rates["No_of_Codon"] = within_rates["Cluster"].map(site_map)

    # 5) Compute dN/dS ratio
    # (plain division: a Mean_Syn of 0 gives inf, or NaN when Mean_Nonsyn is 0 too)
    within_rates["dNdS"] = within_rates["Mean_Nonsyn"] / within_rates["Mean_Syn"]

    # 6) Annotate copy‐number from species×cluster counts
    cnt_csv = (data_dir
               / f"sequences_y_updated/{family}_selected_isoform/blastdb/"
               / f"species_cluster_counts_{family}.csv")
    cnt_df = pd.read_csv(cnt_csv).set_index("species")
    within_rates["num_copies"] = [
        cnt_df.at[row.Species, row.Cluster]
        for _, row in within_rates.iterrows()
    ]

    # 7) Build within‐species raw count table (S_count / N_count)
    count_records = []
    syn_cnt_dir    = data_dir / f"sequences_y_updated/{family}_selected_isoform/blastdb/cluster_syn_count/matrix_csvs"
    nonsyn_cnt_dir = data_dir / f"sequences_y_updated/{family}_selected_isoform/blastdb/cluster_nonsyn_count/matrix_csvs"
    for cluster in clusters:
        sf = syn_cnt_dir   / f"{family}_cluster_{cluster}_S_count.csv"
        nf = nonsyn_cnt_dir/ f"{family}_cluster_{cluster}_N_count.csv"
        # clusters without both count files are skipped (their count columns stay empty after the merge)
        if not (sf.exists() and nf.exists()):
            print(f"  ⚠ skipping counts for {cluster}: missing file")
            continue
        s_df = pd.read_csv(sf, index_col=0).sort_index().sort_index(axis=1)
        n_df = pd.read_csv(nf, index_col=0).sort_index().sort_index(axis=1)
        for sp in species_list:
            ms, ss = pair_stats(s_df, sp, sp)
            mn, sn = pair_stats(n_df, sp, sp)
            count_records.append({
                "Cluster":           cluster,
                "Species":           sp,
                "Mean_Syn_count":    ms,
                "SD_Syn_count":      ss,
                "Mean_Nonsyn_count": mn,
                "SD_Nonsyn_count":   sn
            })
    within_counts = pd.DataFrame(count_records).dropna(
        subset=["Mean_Syn_count","SD_Syn_count","Mean_Nonsyn_count","SD_Nonsyn_count"],
        how="all"
    )

    # 8) Merge rates + counts into one within‐species table
    within_species = within_rates.merge(
        within_counts,
        on=["Cluster","Species"],
        how="left"
    )
     # 9) Round decimals to 4 places, integers for codons & copies
    dec_cols = [
        "Mean_Syn","SD_Syn","Mean_Nonsyn","SD_Nonsyn",
        "Mean_Syn_count","SD_Syn_count",
        "Mean_Nonsyn_count","SD_Nonsyn_count","dNdS"
    ]
    within_species[dec_cols] = within_species[dec_cols].round(4)
    int_cols = ["No_of_Codon","num_copies"]
    # NOTE(review): the integer cast presumably fails if get_num_sites returned None
    # for a cluster (missing value in No_of_Codon) — confirm this cannot happen.
    within_species[int_cols] = within_species[int_cols].round(0).astype(int)

    # 10) Save the combined table
    out_file = (data_dir
                / f"sequences_y_updated/{family}_selected_isoform/blastdb/"
                / f"{family}_dN_dS_withinspecies.tsv")
    within_species.to_csv(out_file, sep="\t", index=False)
    print(f"  • saved combined within‐species table → {out_file}")


→ Within‐species summary for family 'CSF2RA'
  clusters: ['CSF2RA']
  • saved combined within‐species table → /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/CSF2RA_selected_isoform/blastdb/CSF2RA_dN_dS_withinspecies.tsv


### Combine all tables together

In [126]:
# BETWEEN SPECIES: stack the per-family tables into one

# 1) Load every family's between-species TSV and tag its rows with the family name
tables = []
for family in families:
    path = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/{family}_dN_dS_betweenspecies.tsv"
    df = pd.read_csv(path, sep="\t").assign(Family=family)
    tables.append(df)

# 2) One DataFrame with a fresh 0..n-1 index
combined = pd.concat(tables, ignore_index=True)

# 3) Write the combined table
out = f"{data_dir}/sequences_y_updated/all_families_dN_dS_betweenspecies.tsv"
combined.to_csv(out, sep="\t", index=False)

print(f"→ Wrote combined table with {len(combined)} rows to {out}")

→ Wrote combined table with 15 rows to /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/all_families_dN_dS_betweenspecies.tsv


In [127]:
# WITHIN SPECIES: stack the per-family tables into one
# 1) Load every family's within-species TSV and tag its rows with the family name
tables = []
for family in families:
    path = f"{data_dir}/sequences_y_updated/{family}_selected_isoform/blastdb/{family}_dN_dS_withinspecies.tsv"
    df = pd.read_csv(path, sep="\t").assign(Family=family)
    tables.append(df)

# 2) One DataFrame with a fresh 0..n-1 index
combined = pd.concat(tables, ignore_index=True)

# 3) Write the combined table
out = f"{data_dir}/sequences_y_updated/all_families_dN_dS_withinspecies.tsv"
combined.to_csv(out, sep="\t", index=False)

print(f"→ Wrote combined table with {len(combined)} rows to {out}")

→ Wrote combined table with 1 rows to /home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/all_families_dN_dS_withinspecies.tsv


## Merge dNdS dataframes with bootstrapped dataframe

In [4]:
## Load the bootstrap results
bootstrap_y = pd.read_csv(
    '/home/emma/Amplicons/Workspaces/emma/downloaded_data/Y_updated_bootstrap_results_20251101_140447/bootstrap_results.csv',
    sep=",",
)

# `bootstrap` is another name for the same DataFrame object (not a copy):
# the columns added below therefore also appear on `bootstrap_y`
bootstrap = bootstrap_y

def frac_zeros(val):
    """Return the fraction of values in a comma-separated numeric string that equal zero."""
    values = np.fromiter(map(float, val.split(",")), dtype=float)
    return (values == 0).mean()

# For the dN and the dS replicate strings, store the share of replicates that are exactly zero
for source_col, target_col in (
    ("dN_rates", "dN_fraction_zeros"),
    ("dS_rates", "dS_fraction_zeros"),
):
    bootstrap[target_col] = bootstrap[source_col].apply(frac_zeros)

# calculate mean dNdS + mean dS + mean dN
def calc_bootstrap_stats(dn_str, ds_str):
    """Summarise one row of bootstrap replicates.

    Parameters
    ----------
    dn_str, ds_str : str
        Comma-separated dN and dS values, one entry per bootstrap replicate.

    Returns
    -------
    tuple
        ``(mean_dnds, mean_dn, mean_ds)`` where ``mean_dnds`` is the mean of the
        per-replicate dN/dS ratios over the replicates with dS != 0 (NaN when no
        ratio is defined), and ``mean_dn`` / ``mean_ds`` are plain means over
        all replicates.
    """
    # Parse strings into float arrays
    dn = np.array([float(x) for x in dn_str.split(",")])
    ds = np.array([float(x) for x in ds_str.split(",")])

    # --- mean of ratios ---
    # Replicates with dS == 0 keep the NaN placeholder and are skipped by nanmean.
    ratios = np.divide(dn, ds, out=np.full_like(dn, np.nan), where=ds != 0)
    if np.isnan(ratios).all():
        # No defined ratio at all: return NaN directly rather than letting
        # np.nanmean emit "RuntimeWarning: Mean of empty slice".
        mean_dnds = np.nan
    else:
        mean_dnds = np.nanmean(ratios)

    # --- mean dN and mean dS ---
    mean_dn = np.mean(dn)
    mean_ds = np.mean(ds)

    return mean_dnds, mean_dn, mean_ds


# Compute the three summary statistics per row and spread the returned tuple over three columns
bootstrap[["mean_dNdS", "mean_dN", "mean_dS"]] = bootstrap.apply(
    lambda rec: calc_bootstrap_stats(rec["dN_rates"], rec["dS_rates"]),
    axis=1,
    result_type="expand",
)

def count_ratio_below_above_1(dn_str, ds_str, zero_over_zero="above"):
    """Count bootstrap replicates whose dN/dS ratio is below 1 and above 1.

    Parameters
    ----------
    dn_str, ds_str : str
        Comma-separated dN and dS values, one entry per bootstrap replicate.
    zero_over_zero : {"above", "ignore"}, default "above"
        How to treat replicates with dN == 0 and dS == 0, where the ratio is
        undefined. "above" keeps the historical behaviour (treated as +inf and
        counted as above 1). "ignore" leaves them out of both counts.

    Returns
    -------
    tuple of int
        ``(below, above)``. Replicates with a ratio of exactly 1 are in neither count.

    Raises
    ------
    ValueError
        If ``zero_over_zero`` is not one of the accepted options.
    """
    if zero_over_zero not in ("above", "ignore"):
        raise ValueError(
            f"zero_over_zero must be 'above' or 'ignore', got {zero_over_zero!r}"
        )

    # Parse comma-separated strings into float arrays
    dn = np.fromstring(dn_str, sep=",")
    ds = np.fromstring(ds_str, sep=",")

    # dN/dS; every replicate with dS == 0 is set to +inf (this includes 0/0)
    ratios = np.divide(dn, ds, out=np.full_like(dn, np.inf), where=ds != 0)

    # Replicates that take part in the counts
    if zero_over_zero == "ignore":
        counted = ~((dn == 0) & (ds == 0))
    else:
        counted = np.ones(ratios.shape, dtype=bool)

    below = int(np.sum((ratios < 1) & counted))
    above = int(np.sum((ratios > 1) & counted))  # +inf compares greater than 1

    return below, above

# Count, for every row, the bootstrap replicates with dN/dS below 1 and above 1
bootstrap[["dNdS_count_below1", "dNdS_count_above1"]] = bootstrap.apply(
    lambda row: pd.Series(count_ratio_below_above_1(row["dN_rates"], row["dS_rates"])),
    axis=1
)

# Significance threshold and the number of replicates the bootstrap run is expected to contain
ALPHA   = 0.05
BOOT_N  = 10000  # expected bootstraps per row (kept as a named constant for reference)

# Denominator: the number of replicates actually stored in each row (number of
# comma-separated entries), so the fractions stay correct even if a row does not
# contain exactly BOOT_N replicates.
n_boot = bootstrap["dN_rates"].str.count(",") + 1

# Fractions
bootstrap["frac_below1"] = bootstrap["dNdS_count_below1"] / n_boot
bootstrap["frac_above1"] = bootstrap["dNdS_count_above1"] / n_boot

# (optional) track the mass exactly at 1 (and any NaNs if a NaN-count column is present)
# This assumes no double-counting across the *_count_* columns.
if "dNdS_count_nan" in bootstrap.columns:
    bootstrap["frac_equal1"] = (n_boot - bootstrap["dNdS_count_below1"]
                                       - bootstrap["dNdS_count_above1"]
                                       - bootstrap["dNdS_count_nan"]) / n_boot
else:
    bootstrap["frac_equal1"] = (n_boot - bootstrap["dNdS_count_below1"]
                                       - bootstrap["dNdS_count_above1"]) / n_boot

bootstrap[["frac_below1","frac_above1","frac_equal1"]] = \
    bootstrap[["frac_below1","frac_above1","frac_equal1"]].clip(lower=0, upper=1)

# Classify each row from the bootstrap tail fractions:
#   positive  -> at most ALPHA of the replicates have dN/dS < 1
#   purifying -> at most ALPHA of the replicates have dN/dS > 1
# np.select takes the first matching condition, so rows meeting both are "neutral (~1)".
# NOTE: count_ratio_below_above_1 counts replicates with dS == 0 (including 0/0) as
# above 1, so rows whose replicates are all zero get frac_above1 == 1 and are
# labelled "positive" here.
positive   = bootstrap["frac_below1"] <= ALPHA
purifying  = bootstrap["frac_above1"] <= ALPHA
neutralish = positive & purifying      # e.g., ~all mass at exactly 1

bootstrap["selection"] = np.select(
    [neutralish,            positive,     purifying],
    ["neutral (~1)",        "positive",   "purifying"],
    default="nonsignificant"
)

#display
cols_to_round = ["frac_below1","frac_above1","frac_equal1"]
bootstrap[cols_to_round] = bootstrap[cols_to_round].round(4)
bootstrap


  mean_dnds = np.nanmean(ratios)


Unnamed: 0,family,cluster,species1,species2,dS_rates,dN_rates,dN_fraction_zeros,dS_fraction_zeros,mean_dNdS,mean_dN,mean_dS,dNdS_count_below1,dNdS_count_above1,frac_below1,frac_above1,frac_equal1,selection
0,CDY1,CDY1_cluster_PonAbe_cluster1,PonAbe,PonAbe,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",1.0000,1.0000,,0.000000,0.000000,0,10000,0.0000,1.0000,0.0,positive
1,CDY1,CDY1_cluster_MacFas_cluster1,MacFas,MacFas,"0.00216435,0.00211715,0.00849678,0.00447861,0....","0.0024692,0.00500556,0.0,0.00245011,0.00751001...",0.3675,0.3626,0.889887,0.002494,0.002151,3508,6492,0.3508,0.6492,0.0,nonsignificant
2,CDY1,CDY1_cluster_CDY1,HomSap,PanPan,"0.028284592499999997,0.0302337325,0.024942645,...","0.012828775,0.013636295,0.01277519,0.010479885...",0.0000,0.0000,0.687893,0.014729,0.023736,8824,1176,0.8824,0.1176,0.0,nonsignificant
3,CDY1,CDY1_cluster_CDY1,GorGor,PanPan,"0.01331115,0.03492027,0.0209043,0.02752504,0.0...","0.01061712,0.01363079,0.00968765,0.00911549,0....",0.0000,0.0000,0.459232,0.011556,0.027731,9765,235,0.9765,0.0235,0.0,purifying
4,CDY1,CDY1_cluster_CDY1,GorGor,HomSap,"0.020523384999999998,0.0186310075,0.0191644125...","0.01105862,0.01362904,0.011008205,0.0122820249...",0.0000,0.0000,0.546812,0.011145,0.022559,9532,468,0.9532,0.0468,0.0,purifying
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,isoenzyme,isoenzyme_cluster_PonAbe_PonPyg_cluster2,PonAbe,PonPyg,"0.00122474,0.0,0.0012729133333333335,0.0012300...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",1.0000,0.3626,0.000000,0.000000,0.001233,6374,3626,0.6374,0.3626,0.0,nonsignificant
186,isoenzyme,isoenzyme_cluster_PonAbe_PonPyg_cluster2,PonAbe,PonAbe,"0.00244948,0.0,0.002545826666666667,0.00246002...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",1.0000,0.3626,0.000000,0.000000,0.002465,6374,3626,0.6374,0.3626,0.0,nonsignificant
187,retrovirus,retrovirus_cluster_PonAbe_PonPyg_cluster1,PonAbe,PonAbe,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",1.0000,1.0000,,0.000000,0.000000,0,10000,0.0000,1.0000,0.0,positive
188,retrovirus,retrovirus_cluster_PonAbe_PonPyg_cluster1,PonAbe,PonPyg,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","0.00130599,0.00262513,0.00138077,0.00543626,0....",0.3695,1.0000,,0.001367,0.000000,0,10000,0.0000,1.0000,0.0,positive


In [5]:
# dNdS pairwise dataframes: the combined between-species and within-species tables
y_between_overview = pd.read_csv(
    '/home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_y_updated/all_families_dN_dS_betweenspecies.tsv',
    sep="\t",
)
y_within_overview = pd.read_csv(
    '/home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_y_updated/all_families_dN_dS_withinspecies.tsv',
    sep="\t",
)

# Express each within-species row as a pair of the species with itself, so that it
# carries the same Species1/Species2 and *_num_copies columns as the between-species table
y_within_modified = (
    y_within_overview
    .assign(
        Species1_num_copies=y_within_overview["num_copies"],
        Species2_num_copies=y_within_overview["num_copies"],
        Species1=y_within_overview["Species"],
        Species2=y_within_overview["Species"],
    )
    .drop(columns=["num_copies", "Species"])
)

# Drop the two columns that are removed from the between-species table before stacking
y_between_modified = y_between_overview.drop(columns=["pot_syn_sites", "adj_dNdS"])

# Put the within-species columns in the between-species column order
y_within_modified = y_within_modified.loc[:, y_between_modified.columns]

# Stack between-species rows on top of within-species rows
merged_overview_y = pd.concat(
    [y_between_modified, y_within_modified],
    ignore_index=True,
)
merged_overview_y

Unnamed: 0,Cluster,Species1,Species2,Mean_Syn,SD_Syn,Mean_Nonsyn,SD_Nonsyn,No_of_Codon,dNdS,Species1_num_copies,Species2_num_copies,Mean_Syn_count,SD_Syn_count,Mean_Nonsyn_count,SD_Nonsyn_count,Family
0,CDY1,PanTro,HomSap,0.0191,0.0033,0.0172,0.0016,517,0.9031,4,4,8.2500,1.4376,19.25,1.7701,CDY1
1,CDY1,PanTro,PanPan,0.0127,0.0028,0.0025,0.0008,517,0.1935,4,2,5.5000,1.1952,2.75,0.8864,CDY1
2,CDY1,PanTro,GorGor,0.0219,0.0030,0.0141,0.0009,517,0.6419,4,1,9.5000,1.2910,15.75,0.9574,CDY1
3,CDY1,PanTro,PonPyg,0.0641,0.0023,0.0523,0.0018,517,0.8153,4,12,27.6667,1.0018,58.50,1.9813,CDY1
4,CDY1,PanTro,PonAbe,0.0643,0.0026,0.0521,0.0015,517,0.8102,4,20,27.7250,1.1276,58.30,1.6812,CDY1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,PonAbe_PonPyg_cluster1,PonAbe,PonAbe,0.0197,0.0170,0.0091,0.0060,203,0.4625,3,3,3.3333,2.8868,4.00,2.6458,isoenzyme
186,PonAbe_PonPyg_cluster2,PonAbe,PonAbe,0.0025,0.0021,0.0000,0.0000,311,0.0000,3,3,0.6667,0.5774,0.00,0.0000,isoenzyme
187,SymSyn_cluster1,SymSyn,SymSyn,0.0000,,0.0000,,251,,2,2,0.0000,,0.00,,isoenzyme
188,PonAbe_PonPyg_cluster1,PonAbe,PonAbe,0.0000,,0.0000,,339,,2,2,0.0000,,0.00,,retrovirus


In [6]:
# Prepare the bootstrap table for a left merge onto merged_overview

# 1) Bootstrap columns carried into the merge: four key columns plus the summary statistics
boot_cols = [
    "family", "cluster", "species1", "species2",
    "mean_dNdS", "mean_dN", "mean_dS",
    "frac_below1", "frac_above1", "selection"
]

# 2) Select those columns and rename the keys to the capitalised names used in merged_overview;
# 3) reduce Cluster to the text after the first "_cluster_" (values without that
#    marker are kept whole), with surrounding whitespace removed
boot_sub = (
    bootstrap.loc[:, boot_cols]
    .rename(columns={
        "family": "Family",
        "cluster": "Cluster",
        "species1": "Species1",
        "species2": "Species2"
    })
    .assign(
        Cluster=lambda frame: (
            frame["Cluster"].astype(str)
            .str.split(pat="_cluster_", n=1, expand=False)
            .str[-1]
            .str.strip()
        )
    )
)

def with_canonical_species(df):
    """Return a copy of ``df`` with an order-independent species pair added.

    ``_SpeciesA`` / ``_SpeciesB`` hold the whitespace-stripped ``Species1`` /
    ``Species2`` values arranged so that A sorts before or equal to B when
    compared case-insensitively; an A–B pair and a B–A pair therefore get the
    same two helper values. The input frame is not modified.
    """
    first = df["Species1"].astype(str).str.strip()
    second = df["Species2"].astype(str).str.strip()
    # True where the pair is already in canonical order
    keep_order = first.str.lower().le(second.str.lower())
    return df.assign(
        _SpeciesA=first.where(keep_order, second),
        _SpeciesB=second.where(keep_order, first),
    )

# Add order-independent species keys to both tables
mo = with_canonical_species(merged_overview_y)
bs = with_canonical_species(boot_sub)

merge_keys = ["Family", "Cluster", "_SpeciesA", "_SpeciesB"]

# Keep one bootstrap row per key (the first occurrence) so the merge cannot duplicate overview rows
bs = bs.drop_duplicates(subset=merge_keys)

# Left merge on Family, Cluster, and canonical species: every overview row is kept,
# rows without a bootstrap match get NaN in the bootstrap columns.
# validate="many_to_one" makes pandas check that the bootstrap keys really are unique.
final_df_y = pd.merge(
    mo,
    bs[merge_keys + [
        "mean_dNdS", "mean_dN", "mean_dS", "frac_below1", "frac_above1", "selection"
    ]],
    how="left",
    on=merge_keys,
    validate="many_to_one"
)

# Drop helper columns
final_df_y = final_df_y.drop(columns=["_SpeciesA", "_SpeciesB"])

# Rows whose bootstrap mean dN and mean dS are both exactly 0 are relabelled
# "purifying", overriding the label assigned from the bootstrap fractions.
mask_both_zero = (final_df_y["mean_dN"] == 0) & (final_df_y["mean_dS"] == 0)
final_df_y.loc[mask_both_zero, "selection"] = "purifying"

# Save under data_dir, the same location the table is later read back from
final_df_y.to_csv(
    f"{data_dir}/sequences_y_updated/Bootstrap_all_families_dN_dS_between_within_species_y_updated2.csv",
    index=False
)
final_df_y

Unnamed: 0,Cluster,Species1,Species2,Mean_Syn,SD_Syn,Mean_Nonsyn,SD_Nonsyn,No_of_Codon,dNdS,Species1_num_copies,...,SD_Syn_count,Mean_Nonsyn_count,SD_Nonsyn_count,Family,mean_dNdS,mean_dN,mean_dS,frac_below1,frac_above1,selection
0,CDY1,PanTro,HomSap,0.0191,0.0033,0.0172,0.0016,517,0.9031,4,...,1.4376,19.25,1.7701,CDY1,1.005204,0.017221,0.019046,0.5920,0.4080,nonsignificant
1,CDY1,PanTro,PanPan,0.0127,0.0028,0.0025,0.0008,517,0.1935,4,...,1.1952,2.75,0.8864,CDY1,0.239645,0.002482,0.012703,0.9882,0.0118,purifying
2,CDY1,PanTro,GorGor,0.0219,0.0030,0.0141,0.0009,517,0.6419,4,...,1.2910,15.75,0.9574,CDY1,0.717800,0.014045,0.021878,0.8558,0.1442,nonsignificant
3,CDY1,PanTro,PonPyg,0.0641,0.0023,0.0523,0.0018,517,0.8153,4,...,1.0018,58.50,1.9813,CDY1,0.839586,0.052261,0.064113,0.8326,0.1674,nonsignificant
4,CDY1,PanTro,PonAbe,0.0643,0.0026,0.0521,0.0015,517,0.8102,4,...,1.1276,58.30,1.6812,CDY1,0.834389,0.052064,0.064268,0.8384,0.1616,nonsignificant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,PonAbe_PonPyg_cluster1,PonAbe,PonAbe,0.0197,0.0170,0.0091,0.0060,203,0.4625,3,...,2.8868,4.00,2.6458,isoenzyme,0.599171,0.009121,0.019669,0.8713,0.1287,nonsignificant
186,PonAbe_PonPyg_cluster2,PonAbe,PonAbe,0.0025,0.0021,0.0000,0.0000,311,0.0000,3,...,0.5774,0.00,0.0000,isoenzyme,0.000000,0.000000,0.002465,0.6374,0.3626,nonsignificant
187,SymSyn_cluster1,SymSyn,SymSyn,0.0000,,0.0000,,251,,2,...,,0.00,,isoenzyme,,0.000000,0.000000,0.0000,1.0000,purifying
188,PonAbe_PonPyg_cluster1,PonAbe,PonAbe,0.0000,,0.0000,,339,,2,...,,0.00,,retrovirus,,0.000000,0.000000,0.0000,1.0000,purifying


In [5]:
# Reload the saved between-/within-species dN/dS table with its bootstrap columns
# (used for the overview of within-species positive selection)
dnds_total_path = f"{data_dir}/sequences_y_updated/Bootstrap_all_families_dN_dS_between_within_species_y_updated2.csv"
dnds_total = pd.read_csv(dnds_total_path)
dnds_total

Unnamed: 0,Cluster,Species1,Species2,Mean_Syn,SD_Syn,Mean_Nonsyn,SD_Nonsyn,No_of_Codon,dNdS,Species1_num_copies,...,SD_Syn_count,Mean_Nonsyn_count,SD_Nonsyn_count,Family,mean_dNdS,mean_dN,mean_dS,frac_below1,frac_above1,selection
0,CDY1,PanTro,HomSap,0.0191,0.0033,0.0172,0.0016,517,0.9031,4,...,1.4376,19.25,1.7701,CDY1,1.005204,0.017221,0.019046,0.5920,0.4080,nonsignificant
1,CDY1,PanTro,PanPan,0.0127,0.0028,0.0025,0.0008,517,0.1935,4,...,1.1952,2.75,0.8864,CDY1,0.239645,0.002482,0.012703,0.9882,0.0118,purifying
2,CDY1,PanTro,GorGor,0.0219,0.0030,0.0141,0.0009,517,0.6419,4,...,1.2910,15.75,0.9574,CDY1,0.717800,0.014045,0.021878,0.8558,0.1442,nonsignificant
3,CDY1,PanTro,PonPyg,0.0641,0.0023,0.0523,0.0018,517,0.8153,4,...,1.0018,58.50,1.9813,CDY1,0.839586,0.052261,0.064113,0.8326,0.1674,nonsignificant
4,CDY1,PanTro,PonAbe,0.0643,0.0026,0.0521,0.0015,517,0.8102,4,...,1.1276,58.30,1.6812,CDY1,0.834389,0.052064,0.064268,0.8384,0.1616,nonsignificant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,PonAbe_PonPyg_cluster1,PonAbe,PonAbe,0.0197,0.0170,0.0091,0.0060,203,0.4625,3,...,2.8868,4.00,2.6458,isoenzyme,0.599171,0.009121,0.019669,0.8713,0.1287,nonsignificant
186,PonAbe_PonPyg_cluster2,PonAbe,PonAbe,0.0025,0.0021,0.0000,0.0000,311,0.0000,3,...,0.5774,0.00,0.0000,isoenzyme,0.000000,0.000000,0.002465,0.6374,0.3626,nonsignificant
187,SymSyn_cluster1,SymSyn,SymSyn,0.0000,,0.0000,,251,,2,...,,0.00,,isoenzyme,,0.000000,0.000000,0.0000,1.0000,purifying
188,PonAbe_PonPyg_cluster1,PonAbe,PonAbe,0.0000,,0.0000,,339,,2,...,,0.00,,retrovirus,,0.000000,0.000000,0.0000,1.0000,purifying
