# X chromosome ampliconic clustering updated

Goal of script: Make ampliconic clusters based on nucleotide sequences

In [None]:
# Import all necessary packages
import os
import pandas as pd
import subprocess
import matplotlib.pyplot as plt
from Bio import Phylo
from Bio import SeqIO
from Bio.Seq import Seq
import re
from Bio.SeqRecord import SeqRecord
from collections import defaultdict
from pathlib import Path
import numpy as np
import itertools

In [None]:
# Define base directories
# The data root defaults to the original workstation path, but it can be redirected
# without editing the notebook by setting the AMPLICON_DATA_DIR environment variable
# before the kernel is started.
data_dir = os.environ.get(
    "AMPLICON_DATA_DIR",
    "/home/emma/Amplicons/Workspaces/emma/downloaded_data",
)
# Working directory for the X-chromosome multicopy analysis, nested under the data root
work_dir = os.path.join(data_dir, "work_dir", "x_multicopy")

# Define the list of dictionaries (data) containing genome information for different species.
# Each entry has two keys:
#   'species' - short species identifier (also used as a key in species_to_sequence_spec below)
#   'data'    - dictionary of sequence identifiers and file paths for that species
# Keys inside 'data' (not every species defines every key):
#   'chr_x' / 'chr_y'               - sequence identifiers of the X and Y chromosomes
#                                     (presumably RefSeq accessions - confirm against the reference FASTA)
#   'path_to_annotation_NCBI'       - genomic.gff of the assembly
#   'path_to_annotation_NCBI_chry'  - genomic_chrY.gff (not defined for MacFas)
#   'path_to_annotation_CAT'        - gff3 from the CAT consensus gene set (not defined for MacFas)
#   'ref'                           - genomic FASTA of the assembly
#   'rna'                           - rna.fna (only defined for PanTro)
#   'prot' / 'cds'                  - protein.faa and cds_from_genomic.fna
#   'gff_x' / 'fasta_x'             - genomic_chrX.gff and the per-species <species>_X.fasta
#   'gff_x_cds'                     - genomic_chrX_cds_isoform.gff
# In the cells visible in this notebook section, only 'fasta_x' and 'gff_x_cds' are read
# (by the CDS extraction loop).
data = [
    # Entry for PanTro - the only entry that defines 'rna'
    {'species':'PanTro',
     'data': {'chr_y': "NC_072422.2",
              'chr_x': "NC_072421.2",
              'path_to_annotation_NCBI': f"{data_dir}/references/PanTro/ncbi_dataset/data/GCF_028858775.2/genomic.gff",
              'path_to_annotation_NCBI_chry': f"{data_dir}/references/PanTro/ncbi_dataset/data/GCF_028858775.2/genomic_chrY.gff", 
              'path_to_annotation_CAT': f"{data_dir}/CAT/consensus_gene_set/GCF_028858775.2.gff3",
              'ref':  f"{data_dir}/references/PanTro/ncbi_dataset/data/GCF_028858775.2/GCF_028858775.2_NHGRI_mPanTro3-v2.0_pri_genomic.fna", 
              'rna':  f"{data_dir}/references/PanTro/ncbi_dataset/data/GCF_028858775.2/rna.fna",
              'prot': f"{data_dir}/references/PanTro/ncbi_dataset/data/GCF_028858775.2/protein.faa", 
              'cds': f"{data_dir}/references/PanTro/ncbi_dataset/data/GCF_028858775.2/cds_from_genomic.fna", 
              'gff_x': f"{data_dir}/references/PanTro/ncbi_dataset/data/GCF_028858775.2/genomic_chrX.gff", 
              'fasta_x': f"{data_dir}/references/PanTro/ncbi_dataset/data/GCF_028858775.2/PanTro_X.fasta", 
              'gff_x_cds': f"{data_dir}/references/PanTro/ncbi_dataset/data/GCF_028858775.2/genomic_chrX_cds_isoform.gff", 
 }},
    # Entry for HomSap
    # NOTE(review): 'ref' points at a T2T-CHM13v2.0 FASTA while 'path_to_annotation_CAT' is hg38.gff3 -
    # confirm that this pairing is intended.
    {'species':'HomSap',
     'data': {'chr_y': "NC_060948.1",
              'chr_x': "NC_060947.1",
              'path_to_annotation_NCBI': f"{data_dir}/references/HomSap/ncbi_dataset/data/GCF_009914755.1/genomic.gff",
              'path_to_annotation_NCBI_chry': f"{data_dir}/references/HomSap/ncbi_dataset/data/GCF_009914755.1/genomic_chrY.gff",
              'path_to_annotation_CAT': f"{data_dir}/CAT/consensus_gene_set/hg38.gff3",
              'ref': f"{data_dir}/references/HomSap/ncbi_dataset/data/GCF_009914755.1/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna",
              'cds': f"{data_dir}/references/HomSap/ncbi_dataset/data/GCF_009914755.1/cds_from_genomic.fna",
              'prot': f"{data_dir}/references/HomSap/ncbi_dataset/data/GCF_009914755.1/protein.faa",
              'gff_x': f"{data_dir}/references/HomSap/ncbi_dataset/data/GCF_009914755.1/genomic_chrX.gff",
              'fasta_x': f"{data_dir}/references/HomSap/ncbi_dataset/data/GCF_009914755.1/HomSap_X.fasta",
              'gff_x_cds': f"{data_dir}/references/HomSap/ncbi_dataset/data/GCF_009914755.1/genomic_chrX_cds_isoform.gff",

              }},
     # Entry for PanPan
     {'species':'PanPan',
     'data': {'chr_y': "NC_073273.2",
              'chr_x': "NC_073272.2",
              'path_to_annotation_NCBI': f"{data_dir}/references/PanPan/ncbi_dataset/data/GCF_029289425.2/genomic.gff",
              'path_to_annotation_NCBI_chry': f"{data_dir}/references/PanPan/ncbi_dataset/data/GCF_029289425.2/genomic_chrY.gff",
              'path_to_annotation_CAT': f"{data_dir}/CAT/consensus_gene_set/GCF_029289425.2.gff3",
              'ref': f"{data_dir}/references/PanPan/ncbi_dataset/data/GCF_029289425.2/GCF_029289425.2_NHGRI_mPanPan1-v2.0_pri_genomic.fna",
              'cds': f'{data_dir}/references/PanPan/ncbi_dataset/data/GCF_029289425.2/cds_from_genomic.fna',
              'prot': f'{data_dir}/references/PanPan/ncbi_dataset/data/GCF_029289425.2/protein.faa',
              'gff_x': f"{data_dir}/references/PanPan/ncbi_dataset/data/GCF_029289425.2/genomic_chrX.gff",
              'fasta_x': f"{data_dir}/references/PanPan/ncbi_dataset/data/GCF_029289425.2/PanPan_X.fasta",
              'gff_x_cds': f"{data_dir}/references/PanPan/ncbi_dataset/data/GCF_029289425.2/genomic_chrX_cds_isoform.gff",
              }},
      # Entry for GorGor
      {'species':'GorGor',
     'data': {'chr_y': "NC_073248.2",
              'chr_x': "NC_073247.2",
              'path_to_annotation_NCBI': f"{data_dir}/references/GorGor/ncbi_dataset/data/GCF_029281585.2/genomic.gff",
              'path_to_annotation_NCBI_chry': f"{data_dir}/references/GorGor/ncbi_dataset/data/GCF_029281585.2/genomic_chrY.gff",
              'path_to_annotation_CAT': f"{data_dir}/CAT/consensus_gene_set/GCF_029281585.2.gff3",
              'ref': f'{data_dir}/references/GorGor/ncbi_dataset/data/GCF_029281585.2/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri_genomic.fna',
              'cds': f'{data_dir}/references/GorGor/ncbi_dataset/data/GCF_029281585.2/cds_from_genomic.fna',
              'prot': f'{data_dir}/references/GorGor/ncbi_dataset/data/GCF_029281585.2/protein.faa',
              'gff_x': f"{data_dir}/references/GorGor/ncbi_dataset/data/GCF_029281585.2/genomic_chrX.gff",
              'fasta_x': f"{data_dir}/references/GorGor/ncbi_dataset/data/GCF_029281585.2/GorGor_X.fasta",
                            'gff_x_cds': f"{data_dir}/references/GorGor/ncbi_dataset/data/GCF_029281585.2/genomic_chrX_cds_isoform.gff",
              }},
    # Entry for PonPyg
    {'species':'PonPyg',
     'data': {'chr_y': "NC_072397.2",
              'chr_x': "NC_072396.2",
              'path_to_annotation_NCBI': f"{data_dir}/references/PonPyg/ncbi_dataset/data/GCF_028885625.2/genomic.gff",
              'path_to_annotation_NCBI_chry': f"{data_dir}/references/PonPyg/ncbi_dataset/data/GCF_028885625.2/genomic_chrY.gff",
              'path_to_annotation_CAT': f"{data_dir}/CAT/consensus_gene_set/GCF_028885625.2.gff3",
              'ref': f'{data_dir}/references/PonPyg/ncbi_dataset/data/GCF_028885625.2/GCF_028885625.2_NHGRI_mPonPyg2-v2.0_pri_genomic.fna',
              'cds': f'{data_dir}/references/PonPyg/ncbi_dataset/data/GCF_028885625.2/cds_from_genomic.fna',
              'prot': f'{data_dir}/references/PonPyg/ncbi_dataset/data/GCF_028885625.2/protein.faa',
              'gff_x': f"{data_dir}/references/PonPyg/ncbi_dataset/data/GCF_028885625.2/genomic_chrX.gff",
              'fasta_x': f"{data_dir}/references/PonPyg/ncbi_dataset/data/GCF_028885625.2/PonPyg_X.fasta",
              'gff_x_cds': f"{data_dir}/references/PonPyg/ncbi_dataset/data/GCF_028885625.2/genomic_chrX_cds_isoform.gff",
              }},
    # Entry for PonAbe
    {'species':'PonAbe',
     'data': {'chr_y': "NC_072009.2",
              'chr_x': "NC_072008.2",
              'path_to_annotation_NCBI': f"{data_dir}/references/PonAbe/ncbi_dataset/data/GCF_028885655.2/genomic.gff",
              'path_to_annotation_NCBI_chry': f"{data_dir}/references/PonAbe/ncbi_dataset/data/GCF_028885655.2/genomic_chrY.gff",
              'path_to_annotation_CAT': f"{data_dir}/CAT/consensus_gene_set/GCF_028885655.2.gff3",
              'ref': f'{data_dir}/references/PonAbe/ncbi_dataset/data/GCF_028885655.2/GCF_028885655.2_NHGRI_mPonAbe1-v2.0_pri_genomic.fna',
              'cds': f'{data_dir}/references/PonAbe/ncbi_dataset/data/GCF_028885655.2/cds_from_genomic.fna',
              'prot': f'{data_dir}/references/PonAbe/ncbi_dataset/data/GCF_028885655.2/protein.faa',
              'gff_x': f"{data_dir}/references/PonAbe/ncbi_dataset/data/GCF_028885655.2/genomic_chrX.gff",
              'fasta_x': f"{data_dir}/references/PonAbe/ncbi_dataset/data/GCF_028885655.2/PonAbe_X.fasta",
              'gff_x_cds': f"{data_dir}/references/PonAbe/ncbi_dataset/data/GCF_028885655.2/genomic_chrX_cds_isoform.gff",
              }},    
    # Entry for SymSyn
    {'species':'SymSyn',
      'data': {'chr_y': "NC_072448.2",
               'chr_x': "NC_072447.2",
               'ref': f'{data_dir}/references/SymSyn/ncbi_dataset/data/GCF_028878055.3/GCF_028878055.3_NHGRI_mSymSyn1-v2.1_pri_genomic.fna',
               'path_to_annotation_NCBI': f"{data_dir}/references/SymSyn/ncbi_dataset/data/GCF_028878055.3/genomic.gff",
               'path_to_annotation_NCBI_chry': f"{data_dir}/references/SymSyn/ncbi_dataset/data/GCF_028878055.3/genomic_chrY.gff",
               'path_to_annotation_CAT': f"{data_dir}/CAT/consensus_gene_set/GCF_028878055.3.gff3",
               'cds': f'{data_dir}/references/SymSyn/ncbi_dataset/data/GCF_028878055.3/cds_from_genomic.fna',
               'prot': f'{data_dir}/references/SymSyn/ncbi_dataset/data/GCF_028878055.3/protein.faa',
               'gff_x': f"{data_dir}/references/SymSyn/ncbi_dataset/data/GCF_028878055.3/genomic_chrX.gff",
               'fasta_x': f"{data_dir}/references/SymSyn/ncbi_dataset/data/GCF_028878055.3/SymSyn_X.fasta",
               'gff_x_cds': f"{data_dir}/references/SymSyn/ncbi_dataset/data/GCF_028878055.3/genomic_chrX_cds_isoform.gff",
               }},
    # Entry for MacFas - defines neither 'path_to_annotation_NCBI_chry' nor 'path_to_annotation_CAT'
    {'species':'MacFas',
      'data': {'chr_y': "NC_132903.1",
               'chr_x': "NC_088395.1",
               'ref': f'{data_dir}/references/MacFas/ncbi_dataset/data/GCF_037993035.2/GCF_037993035.2_T2T-MFA8v1.1_genomic.fna',
               'path_to_annotation_NCBI': f"{data_dir}/references/MacFas/ncbi_dataset/data/GCF_037993035.2/genomic.gff",
               'cds': f'{data_dir}/references/MacFas/ncbi_dataset/data/GCF_037993035.2/cds_from_genomic.fna',
               'prot': f'{data_dir}/references/MacFas/ncbi_dataset/data/GCF_037993035.2/protein.faa',
               'gff_x': f"{data_dir}/references/MacFas/ncbi_dataset/data/GCF_037993035.2/genomic_chrX.gff",
               'fasta_x': f"{data_dir}/references/MacFas/ncbi_dataset/data/GCF_037993035.2/MacFas_X.fasta",
               'gff_x_cds': f"{data_dir}/references/MacFas/ncbi_dataset/data/GCF_037993035.2/genomic_chrX_cds_isoform.gff",
               }},
]

# Common-name label for each species identifier
species_to_sequence_spec = dict(
    PanTro='chimpanzee',
    HomSap='human',
    PanPan='bonobo',
    GorGor='gorilla',
    PonPyg='b-orang',
    PonAbe='s-orang',
    SymSyn='siamang',
    MacFas='macaque',
)

# Derived from `data` by iterating over its entries:
#   species_list - species identifiers, in the order the entries appear in `data`
#   species_info - lookup from species identifier to that entry's 'data' dictionary
species_list = [entry['species'] for entry in data]
species_info = {entry['species']: entry['data'] for entry in data}
species_info
species_list

In [None]:
## Load the dataframe with gene coordinates and family information

# Tab-separated table of gene details. Later cells read its
# "gene_family_symbol", "Species" and "Gene_symbol" columns.
genes = pd.read_csv(
    f"{work_dir}/protein_extracted_longest/clusters_merged/gene_details_updated_with_palindromes_coordinates_x.tsv",
    sep='\t',
)
genes

In [None]:
# Distinct gene-family symbols present in the gene table, and how many there are
unique_vals = genes.loc[:, 'gene_family_symbol'].unique()
print(unique_vals)
len(unique_vals)

## Extract coding sequences per family 

In [None]:
# Notes on the shortened family names used in the `families` list:
# 'endogenous retrovirus group K member 6 Env polyprotein-like' is called endogenous
# 'putative uncharacterized protein FLJ39060' is called FLJ39060
# 'collagen alpha-4(IV) chain-like' is called collagen
# 'uncharacterized LOC129475109' is called LOC129475109
# 'uncharacterized LOC115932372' is called LOC115932372
# INTS6L contains the SAGE gene family

In [None]:
# Gene families to process. Every later cell loops over this list, so edit it
# here to restrict the analysis to a subset.
families = [
    'CSF2RA', 'SPANX', 'TBL1X', 'VCX', 'TMSB', 'MAGEB',
    'TCEAL8', 'H2A', 'endogenous', 'SPACA5',
    'SSX', 'GAGE', 'NUDT10', 'CENPVL',
    'FLJ39060', 'XAGE1', 'FAM156', 'SPIN',
    'ZXD', 'CXorf49', 'DMRTC1', 'FAM236',
    'PABPC', 'RPL36A', 'ARMCX', 'NXF',
    'TCP11X2', 'GPRASP', 'RAB40A', 'H2BW',
    'CT47', 'RHOXF2', 'SMIM10', 'ETD',
    'INTS6L', 'CT45A', 'CXorf51', 'EOLA',
    'HSFX', 'TMEM185A', 'CSAG', 'PNMA',
    'PWWP4', 'OPN1LW', 'TEX28', 'LAGE3',
    'IKBKG', 'F8A1',
    'collagen', 'LOC129475109', 'LOC115932372', 'MAGED1',
]
len(families)


### Extract the coding Regions
From each gene, extract the CDS and merge the sequences of all species into one large FASTA file. <br>
Finally, translate the sequences so that they can be checked against the protein files of the reference genomes.

In [None]:
# Extract the sequences
import re
import os
from Bio import SeqIO

def parse_attributes(attr_str):
    """Parse a ';'-separated list of key=value pairs into a dictionary.

    Keys are stripped and lower-cased, values are stripped. Fields without
    an '=' are ignored; only the first '=' in a field separates key from
    value. When a key occurs more than once, the last value wins.
    """
    parsed = {}
    for field in attr_str.strip().split(";"):
        key, separator, value = field.partition("=")
        if not separator:
            continue
        parsed[key.strip().lower()] = value.strip()
    return parsed

def get_isoform(attrs):
    """Return the isoform label for a parsed attribute dictionary.

    The label is taken from an "isoform <label>" phrase in the 'product'
    attribute, where <label> is X followed by digits, a plain number, or a
    single letter; it is returned upper-cased. Without such a phrase the
    'transcript_id' is used, then the 'protein_id', and finally the
    placeholder "NA".
    """
    named = re.search(
        r'isoform\s+(X\d+|\d+|[a-z])\b',
        attrs.get("product", ""),
        re.IGNORECASE,
    )
    if named is not None:
        return named.group(1).upper()  # normalise e.g. "x2" / "b" to "X2" / "B"

    # No isoform phrase: fall back to the first identifier that is present
    return next(
        (attrs[key] for key in ("transcript_id", "protein_id") if key in attrs),
        "NA",
    )

def select_best_isoform(isoforms_dict, refseq_iso):
    """Pick one isoform label from the keys of ``isoforms_dict``.

    Priority order:
      1. ``refseq_iso`` when it is set and is one of the keys
      2. "NA" (unnamed isoform)
      3. "X1" (exact match)
      4. "1", then "A"
      5. the lowest X-number (X4 before X10), else the lowest plain number,
         else the alphabetically first remaining label

    An empty ``isoforms_dict`` raises IndexError.
    """
    names = list(isoforms_dict)

    if refseq_iso and refseq_iso in names:
        return refseq_iso

    # Fixed labels, checked in priority order
    for fixed_choice in ("NA", "X1", "1", "A"):
        if fixed_choice in names:
            return fixed_choice

    # Sort the remaining labels into three groups; the numeric groups store
    # (number, label) pairs so that they compare numerically rather than as text.
    x_ranked, plain_ranked, other_names = [], [], []
    for name in names:
        x_hit = re.match(r'^X(\d+)$', name, re.IGNORECASE)
        if x_hit:
            x_ranked.append((int(x_hit.group(1)), name))
            continue
        plain_hit = re.match(r'^(\d+)$', name)
        if plain_hit:
            plain_ranked.append((int(plain_hit.group(1)), name))
        else:
            other_names.append(name)

    for ranked in (x_ranked, plain_ranked):
        if ranked:
            return min(ranked)[1]

    if other_names:
        return min(other_names)

    # Only reached when there are no candidates at all
    return sorted(names)[0]

# Parsed genomes keyed by FASTA path. extract_cds is called once per gene, and
# re-reading a chromosome-sized FASTA on every call would dominate the run time.
# Trade-off: every distinct FASTA that is requested stays in memory for the life
# of the kernel (or until this cell is re-run, which resets the cache).
_GENOME_CACHE = {}


def _load_genome(genome_fasta):
    """Return {record id: SeqRecord} for a FASTA file, parsing it once per path."""
    if genome_fasta not in _GENOME_CACHE:
        _GENOME_CACHE[genome_fasta] = SeqIO.to_dict(SeqIO.parse(genome_fasta, "fasta"))
    return _GENOME_CACHE[genome_fasta]


def extract_cds(genome_fasta, gff_file, gene_name):
    """Extract the CDS of one gene, keeping only the best isoform.

    :param genome_fasta: Path to the FASTA file holding the sequences that the
        GFF refers to. The parsed file is cached per path, so changes made to
        the file on disk after the first call are not picked up.
    :param gff_file: Path to a tab-separated GFF file; only rows whose feature
        type (3rd column) is "CDS" are used.
    :param gene_name: Value of the ``gene`` attribute to select.
    :return: ``{}`` when the gene has no CDS rows; otherwise a one-entry dict
        ``{isoform: (cds_sequence, attrs)}`` where ``cds_sequence`` is a str and
        ``attrs`` is the parsed attribute dict of the last CDS row seen for
        that isoform.
    :raises KeyError: if a CDS row names a sequence id missing from the FASTA.
    """
    genome = _load_genome(genome_fasta)
    records = {}       # isoform → list of (seqid, start0, end, phase, strand)
    seen_coords = {}   # isoform → set of (seqid, start0, end, strand) already stored
    attrs_map = {}     # isoform → parsed attrs
    refseq_iso = None

    # Collect all CDS features for this gene
    with open(gff_file) as fh:
        for line in fh:
            if line.startswith("#"):
                continue
            cols = line.rstrip("\n").split("\t")
            if len(cols) < 9 or cols[2] != "CDS":
                continue

            seqid, _, _, start, end, _, strand, frame, attr_str = cols[:9]
            attrs = parse_attributes(attr_str)

            if attrs.get("gene") != gene_name:
                continue

            iso = get_isoform(attrs)

            # A non-numeric phase column (e.g. ".") is treated as phase 0
            # instead of aborting the whole extraction with a ValueError.
            phase = int(frame) if frame.isdigit() else 0

            # GFF coordinates are 1-based inclusive; convert to a 0-based
            # half-open interval so it can be used directly as a slice.
            coords = (seqid, int(start) - 1, int(end), strand)

            # Keep each coordinate only once per isoform (set lookup instead
            # of rebuilding a list of coordinates for every row).
            iso_seen = seen_coords.setdefault(iso, set())
            iso_frags = records.setdefault(iso, [])
            if coords not in iso_seen:
                iso_seen.add(coords)
                iso_frags.append((seqid, coords[1], coords[2], phase, strand))

            # Store attributes (the last row of an isoform wins)
            attrs_map[iso] = attrs

            # Remember the isoform carrying the RefSeq Select tag
            if "refseq select" in attrs.get("tag", "").lower():
                refseq_iso = iso

    if not records:
        return {}

    # Assemble the CDS sequence of each isoform
    isoform_sequences = {}
    for iso, frags in records.items():
        # Order fragments 5'→3': ascending start on "+", descending on "-".
        # The strand of the first stored fragment decides for the whole isoform.
        rev = (frags[0][4] == "-")
        frags.sort(key=lambda x: x[1], reverse=rev)

        pieces = []
        for position, (sid, s, e, frm, strand) in enumerate(frags):
            subseq = genome[sid].seq[s:e]
            if strand == "-":
                subseq = subseq.reverse_complement()
            if position == 0:
                # Only the first fragment is trimmed by its phase, so that the
                # joined sequence starts on a codon boundary.
                subseq = subseq[frm:]
            pieces.append(str(subseq))

        isoform_sequences[iso] = "".join(pieces)

    # Select best isoform
    best_iso = select_best_isoform(isoform_sequences, refseq_iso)

    return {best_iso: (isoform_sequences[best_iso], attrs_map[best_iso])}

# ========== Main processing loop ================
# For every family: select its genes, then write one FASTA per species with the
# selected-isoform CDS of each of those genes.
for family in families:
    print(f"\n=== Processing family {family!r} ===")
    # Family labels are plain substrings of gene_family_symbol, so match them
    # literally (regex=False); otherwise a label containing a character such as
    # "(" or "+" would be interpreted as a regular expression.
    filtered_genes = genes[
        genes["gene_family_symbol"].str.contains(family, na=False, regex=False)
    ]

    intermediate_dir = f"{data_dir}/sequences_x_updated/{family}_selected_isoform"
    os.makedirs(intermediate_dir, exist_ok=True)

    for species in species_list:
        info = species_info[species]
        genome = info["fasta_x"]
        gff_file = info["gff_x_cds"]
        out_fa = f"{intermediate_dir}/{species}_{family}.fa"

        # The output file is created even when the species has no gene in this
        # family, so the combine step always finds one file per species.
        with open(out_fa, "w") as fout:
            sp_genes = filtered_genes[filtered_genes["Species"] == species]
            written = 0

            for gene in sp_genes["Gene_symbol"]:
                iso_dict = extract_cds(genome, gff_file, gene)

                if not iso_dict:
                    print(f"    [!] No CDS for {gene} in {species}")
                    continue

                # extract_cds returns at most one (best) isoform per gene
                for iso, (seq, attrs) in iso_dict.items():
                    pid = attrs.get("protein_id", "")
                    header = f">{gene}_isoform_{iso}"
                    if pid:
                        header += f";protein_id={pid}"
                    fout.write(header + "\n" + seq + "\n")
                    written += 1

            print(f"  • {species}: wrote {written}/{len(sp_genes)} genes → {out_fa}")

In [None]:
# Combine the per-species FASTA files of each family into one file.
# Every header gets the species identifier appended (unless it already ends
# with it), so sequences stay distinguishable after merging.
for family in families:
    print(f"\n→ Processing family {family!r}\n")

    intermediate_dir = f"{data_dir}/sequences_x_updated/{family}_selected_isoform"
    combined_fasta   = f"{intermediate_dir}/all_species_{family}.fa"

    with open(combined_fasta, "w") as outfile:
        for species in species_list:
            species_fasta = f"{intermediate_dir}/{species}_{family}.fa"

            try:
                infile = open(species_fasta)
            except FileNotFoundError:
                # A missing species file is reported and skipped
                print(f"[Warning] {species_fasta} not found.")
                continue

            with infile:
                for line in infile:
                    if not line.startswith(">"):
                        # Sequence lines are copied through unchanged
                        outfile.write(line)
                        continue
                    header = line.strip()
                    if not header.endswith(f"_{species}"):
                        header += f"_{species}"
                    outfile.write(header + "\n")

    print(f"→ Combined FASTA for {family} at {combined_fasta}")

In [None]:
##TRANSLATE 

## Translate the coding sequence to check if there are stop codons within 
def translate_record(record, frame=0):
    """
    Translates a SeqRecord's nucleotide sequence into a protein sequence.

    Stop codons are kept as '*' so that internal stops remain visible. A
    trailing partial codon (one or two leftover bases) is not translated.

    :param record: A SeqRecord object containing the nucleotide sequence.
    :param frame: The reading frame (0, 1, or 2) to start translation.
    :return: A new SeqRecord with the translated protein sequence, the same id
             as ``record`` and the description "translated".
    """
    # Adjust the sequence for the reading frame
    trimmed_seq = record.seq[frame:]
    # Drop the incomplete trailing codon explicitly. Biopython ignores it
    # anyway, but it emits a "Partial codon" warning that is documented as a
    # possible future error.
    usable_length = len(trimmed_seq) - len(trimmed_seq) % 3
    # Translate into protein (keeps '*' for stop codons)
    protein_seq = trimmed_seq[:usable_length].translate(to_stop=False)
    # Create a new SeqRecord for the protein
    return SeqRecord(protein_seq, id=record.id, description="translated")

def translate_fasta(input_file, output_file, frame=0):
    """
    Translate every nucleotide record of a FASTA file and write the proteins
    to a second FASTA file.

    All records are translated before the output file is written.

    :param input_file: Path to the input FASTA file.
    :param output_file: Path to the output FASTA file.
    :param frame: The reading frame (0, 1, or 2) to start translation.
    """
    proteins = [
        translate_record(nucleotide_record, frame)
        for nucleotide_record in SeqIO.parse(input_file, "fasta")
    ]
    SeqIO.write(proteins, output_file, "fasta")

# Translate the combined CDS FASTA of every family (reading frame 0)
for family in families:
    # Per-family directory; created here in case it does not exist yet
    intermediate_dir = f"{data_dir}/sequences_x_updated/{family}_selected_isoform"
    os.makedirs(intermediate_dir, exist_ok=True)

    # Input: combined CDS of all species; output: the same records as protein
    in_fasta, out_fasta = (
        f"{intermediate_dir}/all_species_{family}.fa",
        f"{intermediate_dir}/all_species_{family}_translated.fa",
    )

    print(f"Translating {in_fasta} → {out_fasta}")
    translate_fasta(in_fasta, out_fasta, frame=0)

## Create Ampliconic clustering across species using BLAST

In [None]:
### Create ampliconic clusters based on Identity using BLASTN

# Use the makeblastdb command to create a BLAST database from the complete FASTA file of coding sequences.
# makeblastdb is a tool that creates a database from sequence data. It prepares the input FASTA file for fast querying using BLAST.
# -dbtype nucl -> defines the type of sequence in the database. nucl indicates that the sequences are nucleotide (prot would be for protein).
# The directory that stores the output per gene family is created automatically.

# Build one nucleotide BLAST database per family from its combined FASTA file
for family in families:
    print(f"▶ makeblastdb for {family!r}")
    family_dir = f"{data_dir}/sequences_x_updated/{family}_selected_isoform"

    # Create the database directory up front, so the run does not depend on
    # makeblastdb creating missing directories itself.
    os.makedirs(f"{family_dir}/blastdb", exist_ok=True)

    # Argument list instead of a shell string: every path reaches makeblastdb
    # as a single argument even if it contains spaces or shell metacharacters.
    cmd = [
        "makeblastdb",
        "-in", f"{family_dir}/all_species_{family}.fa",
        "-dbtype", "nucl",
        "-out", f"{family_dir}/blastdb/blastdb",
    ]
    # check=True raises CalledProcessError when makeblastdb exits non-zero
    subprocess.run(cmd, check=True)

In [None]:
# Then do a pairwise comparison of each gene coding sequence against all other genes in the database.
# The output (.blastp.tsv) provides detailed alignments and similarities for each nucleotide query against the database.

# Run a BLAST search of the coding sequences of all species against the BLAST database created above.
# blast all against DB
     # blastn -> a tool for comparing nucleotide sequences (query) against the database made above
     # -query -> specifies the INPUT query file. This is the FASTA file of all the genes of all the species (same file as above)
     # -db -> specifies the BLAST DATABASE to search against (the one created by makeblastdb)
     # -out -> specifies the OUTPUT file for the results of the BLAST search. The results are saved in a tab-separated file (.blastp.tsv)
     # -outfmt -> specifies the output format for BLAST results. 6 means tabular format with custom columns.
    # Selected columns: qseqid (query sequence ID), sseqid (subject (database) sequence ID), pident (percentage of identical matches), mismatch (number of mismatches), gapopen (number of gap openings),
    # gaps (total number of gaps), qcovs (query coverage per subject sequence), qcovshsp (query coverage per HSP (high-scoring pair)), evalue (expected value, which measures the significance of the match).

for family in families:
    print(f"Processing family: {family}")
    cmd = f"blastn -query {data_dir}/sequences_x_updated/{family}_selected_isoform/all_species_{family}.fa \
    -db {data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/blastdb \
    -out {data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/results.blastp.tsv \
    -outfmt \"6 qseqid sseqid pident mismatch gapopen gaps qcovs qcovshsp evalue\" "

    subprocess.run(cmd, shell=True, check=True)

In [None]:
# Turn the BLAST hit table of every family into a directed similarity graph.
#
#   edges            dict: query sequence ID -> list of subject sequence IDs whose
#                    hit passed all three filters below
#   edges_per_family dict: family -> that family's edges dict
#
# A hit is kept when
#   pident  >= identity threshold   (95 %, or 90 % when either sequence ID contains "MacFas")
#   qcovs   >= coverage             (80 % query coverage per subject)
#   evalue  <  score                (0.001)
#
# Columns are addressed by NAME. The table has nine columns
# (qseqid sseqid pident mismatch gapopen gaps qcovs qcovshsp evalue), so the
# e-value is column 8; column 7 is qcovshsp. Filtering on index 7, as this cell
# used to do, compared the per-HSP coverage with 0.001 instead of the e-value.

# Filtering thresholds
edges = {}
identity_default = 95
identity_macfas = 90  # More lenient threshold for MacFas because it is already 5% sequence identity away from the other species
coverage = 80
score = 0.001

# Column layout of results.blastp.tsv; must mirror the -outfmt string of the blastn cell
blast_columns = ["qseqid", "sseqid", "pident", "mismatch", "gapopen",
                 "gaps", "qcovs", "qcovshsp", "evalue"]
col_idx = {col_name: position for position, col_name in enumerate(blast_columns)}

edges_per_family = {}

for family in families:
    print(f"Processing family: {family}")
    edges = {}
    file = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/results.blastp.tsv"

    with open(file, "r") as infile:
        for line_no, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)
            cols = line.split("\t")
            if len(cols) != len(blast_columns):
                # A different column count means the file was not written with the
                # expected -outfmt; fail loudly rather than filter on wrong columns.
                raise ValueError(
                    f"{file}, line {line_no}: expected {len(blast_columns)} "
                    f"tab-separated columns, found {len(cols)}"
                )

            query_id = cols[col_idx["qseqid"]]
            subject_id = cols[col_idx["sseqid"]]

            # Use the lower identity threshold when either sequence is from MacFas
            # (detected by the substring "MacFas" in the sequence ID)
            is_macfas_comparison = "MacFas" in query_id or "MacFas" in subject_id
            identity_threshold = identity_macfas if is_macfas_comparison else identity_default

            if (float(cols[col_idx["pident"]]) >= identity_threshold
                    and int(cols[col_idx["qcovs"]]) >= coverage
                    and float(cols[col_idx["evalue"]]) < score):
                # Create the adjacency list on first use, then append the subject
                edges.setdefault(query_id, []).append(subject_id)

    edges_per_family[family] = edges


In [None]:
# Keep only reciprocal ("two-way") edges: A -> B is kept only if B -> A also
# passed the BLAST filters.
#
#   two_way_per_family   family -> list of (node, node) tuples, each tuple sorted
#                        so that a pair is stored once regardless of direction
#   vertices_per_family  family -> set of all sequences that occur in a kept pair
#
# A sequence that hits itself satisfies the reciprocity test, so (A, A) pairs are kept.
# Membership tests use sets (seen_pairs, targets) instead of scanning lists, which
# keeps the cell linear in the number of hits; the order of edges_2way is unchanged.

two_way_per_family = {}
vertices_per_family = {}

for family in families:
    print(f"Processing family: {family}")
    edges = edges_per_family[family]
    # Set view of every adjacency list for constant-time "does B point back to A?" checks
    targets = {node: set(hits) for node, hits in edges.items()}
    edges_2way = []
    seen_pairs = set()   # pairs already appended to edges_2way
    vertices = set()

    for node_A in edges:
        for node_B in edges[node_A]:
            # node_B must itself be a query and must list node_A among its hits
            if node_A not in targets.get(node_B, ()):
                continue
            pair = tuple(sorted((node_A, node_B)))
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)
            edges_2way.append(pair)
            vertices.add(node_A)
            vertices.add(node_B)

    two_way_per_family[family] = edges_2way
    vertices_per_family[family] = vertices

In [None]:
# Group sequences into clusters by transitive (single-linkage) clustering on the
# two-way edges: two sequences end up in the same cluster whenever a chain of
# two-way edges connects them, i.e. clusters are the connected components of the graph.
#
# The components are computed with a union-find structure. The earlier two-pass
# approach (attach each edge to the first matching cluster, then merge each cluster
# into the first overlapping one) stopped at the first match and could therefore
# leave one sequence in two different clusters. For example, the edges
# (A,B), (C,D), (E,F), (E,A), (F,C) produced {A,B,E,F} and {C,D,F}.
#
# Output: merged_clusters_per_family[family] is a list of disjoint sets of sequence IDs.


def _connected_components(edge_list):
    """Return the connected components of an undirected graph.

    Parameters
    ----------
    edge_list : iterable of 2-tuples
        Each tuple names the two endpoints of one edge (a self-edge is allowed).

    Returns
    -------
    list of set
        One set of nodes per component, in order of first appearance in edge_list.
    """
    parent = {}

    def find(node):
        # Register unseen nodes as their own root
        parent.setdefault(node, node)
        root = node
        while parent[root] != root:
            root = parent[root]
        # Path compression: point every node on the walked path directly at the root
        while parent[node] != root:
            parent[node], node = root, parent[node]
        return root

    for node_a, node_b in edge_list:
        root_a, root_b = find(node_a), find(node_b)
        if root_a != root_b:
            parent[root_b] = root_a

    components = {}
    for node in parent:
        components.setdefault(find(node), set()).add(node)
    return list(components.values())


# clusters_per_family is only declared here; nothing in this cell fills it
clusters_per_family = {}
merged_clusters_per_family = {}

for family in families:
    print(f"→ Clustering for family {family!r}")
    edges_2way = two_way_per_family[family]

    # every connected component of the two-way graph is one cluster
    merged_clusters = _connected_components(edges_2way)

    # store per family
    merged_clusters_per_family[family] = merged_clusters

In [None]:
# Quick sanity check: number of clusters found per gene family.
# (Looking up the empty-string key raised a KeyError unless a family is literally named ''.)
{family_name: len(family_clusters) for family_name, family_clusters in merged_clusters_per_family.items()}

In [None]:
# Write, per family, a table that assigns every sequence to its cluster.
# Columns: gene_name (sequence ID), species (text after the last "_" of the ID),
# cluster (1-based position of the cluster in merged_clusters_per_family[family]).
for family in families:
    clusters = merged_clusters_per_family.get(family, [])
    if not clusters:
        print(f"No clusters for family {family!r}, skipping.")
        continue

    # One (gene_name, species, cluster_id) row per sequence.
    # Each cluster is a set, and the iteration order of a set of strings changes
    # between kernel sessions; sorting makes the row order (and everything derived
    # from it in later cells) reproducible.
    rows = [
        (
            gene,
            gene.rsplit('_', 1)[1],      # species is the suffix after the last underscore
            cluster_id
        )
        for cluster_id, cluster in enumerate(clusters, start=1)
        for gene in sorted(cluster)
    ]

    # Create the DataFrame
    df = pd.DataFrame(rows, columns=['gene_name', 'species', 'cluster'])

    # Save to CSV
    out_csv = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/gene_clustering_{family}.csv"
    df.to_csv(out_csv, index=False)
    print(f"  • Saved clustering table for {family!r} → {out_csv}")

In [None]:
# Give every cluster a recognisable name, derived from the genes it contains.
for family in families:
    print(f"→ Naming clusters for family {family!r}")

    # Per-family clustering table written by the previous cell
    csv_in = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/gene_clustering_{family}.csv"
    df = pd.read_csv(csv_in)

    # First pass: one base name per cluster
    cluster_name = {}
    species_combo_counts = {}

    for cid, grp in df.groupby('cluster'):
        is_loc = grp['gene_name'].str.startswith('LOC')
        non_loc = grp[~is_loc]
        if non_loc.empty:
            # Only LOC genes: name the cluster after its species combination,
            # numbered per combination
            combo = "_".join(sorted(grp['species'].unique()))
            cnt = species_combo_counts.get(combo, 0) + 1
            species_combo_counts[combo] = cnt
            base = f"{combo}_cluster{cnt}"
        else:
            # Use the shortest gene-name prefix (text before the first '_')
            # among the non-LOC genes
            prefs = non_loc['gene_name'].str.partition('_')[0]
            shortest_at = prefs.str.len().idxmin()
            base = prefs.loc[shortest_at]

        cluster_name[cid] = base

    # Second pass: when several clusters share a base name, the largest keeps it
    # and the others receive the suffixes _2, _3, ... in order of decreasing size
    sizes = df.groupby('cluster').size().to_dict()
    dups = defaultdict(list)
    for cid, name in cluster_name.items():
        dups[name].append(cid)

    for name, cids in dups.items():
        if len(cids) <= 1:
            continue
        cids.sort(key=lambda x: sizes[x], reverse=True)
        for idx, cid in enumerate(cids[1:], start=2):
            cluster_name[cid] = f"{name}_{idx}"

    # Attach the names and save
    df['cluster_name'] = df['cluster'].map(cluster_name)
    out_csv = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/{family}_clusters_named_auto.csv"
    df.to_csv(out_csv, index=False)
    print(f"  • Saved auto-named clusters → {out_csv}")


## Show the cluster counts

In [None]:
# Count, for every species (rows) and every named cluster (columns), how many
# genes of that species fall into that cluster.
#
# Input: the per-family table <family>_clusters_named_auto.csv with (at least) the
# columns gene_name, species and cluster_name, for example:
#   gene_name          species  cluster_name
#   PAGE4_GorGor       GorGor   PAGE4
#   PAGE4_HomSap       HomSap   PAGE4
#   XAGE2_GorGor       GorGor   XAGE2

for family in families:
    print(f"\n→ Building species×cluster matrix for {family!r}")

    # Named-cluster table of this family
    in_csv = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/{family}_clusters_named_auto.csv"
    df = pd.read_csv(in_csv)

    # Long format: one row per (species, cluster_name) with its gene count
    pair_sizes = df.groupby(['species', 'cluster_name']).size()
    grouped = pair_sizes.reset_index(name='count')

    # Wide format: species as rows, clusters as columns; absent combinations become 0
    wide_counts = grouped.pivot(index='species', columns='cluster_name', values='count')
    cluster_species_df = wide_counts.fillna(0).sort_index().sort_index(axis=1)

    # Save the matrix next to the other per-family outputs
    out_csv = (f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/species_cluster_counts_{family}.csv")
    cluster_species_df.to_csv(out_csv)
    print(f"  • Saved species×cluster counts → {out_csv}")

## Subset the fasta file of a gene family into its clusters

In [None]:
# Split the combined FASTA of each family into one FASTA file per named cluster.
for family in families:
    print(f"\n→ Processing family {family!r}")

    # Named-cluster table of this family
    csv_in = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/{family}_clusters_named_auto.csv"
    df = pd.read_csv(csv_in)

    # cluster name -> list of the gene names assigned to it
    cluster_to_genes = {
        name: list(members)
        for name, members in df.groupby("cluster_name")["gene_name"]
    }

    # Short summary of the cluster sizes
    for cluster, genes in cluster_to_genes.items():
        print(f"  Cluster: {cluster}, Number of genes: {len(genes)}")

    # Read the family FASTA once and index the records by their ID
    fasta_in = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/all_species_{family}.fa"
    records = list(SeqIO.parse(fasta_in, "fasta"))
    record_dict = {rec.id: rec for rec in records}
    print(f"  Loaded {len(record_dict)} FASTA records")

    # Output directory for the per-cluster FASTA files
    cluster_dir = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/cluster_fastas"
    os.makedirs(cluster_dir, exist_ok=True)

    # One FASTA per cluster; gene names without a FASTA record are left out
    for cluster, gene_list in cluster_to_genes.items():
        selected = []
        for gene in gene_list:
            rec = record_dict.get(gene)
            if rec is not None:
                selected.append(rec)
        out_fa = f"{cluster_dir}/{family}_cluster_{cluster}.fa"
        SeqIO.write(selected, out_fa, "fasta")
        print(f"    • Wrote {len(selected)} records → {out_fa}")

## Make sure the correct gene names are in the coordinate file

In [None]:
# Add the auto-generated cluster name to the gene-details table, one output file per family.

# 1) Read the master table once and drop every row without a gene_family_symbol
gene_details_file = f"{work_dir}/protein_extracted_longest/clusters_merged/gene_details_updated_with_palindromes_coordinates.tsv"

master_df = pd.read_csv(gene_details_file, sep="\t")
master_df = master_df.dropna(subset=["gene_family_symbol"])

# 2) Loop over each family
for family in families:
    print(f"\n→ Updating gene details for family {family!r}")

    # Load that family's auto-named clusters
    cluster_file = os.path.join(
        data_dir,
        "sequences_x_updated",
        f"{family}_selected_isoform",
        "blastdb",
        f"{family}_clusters_named_auto.csv"
    )
    cluster_df = pd.read_csv(cluster_file, sep=",")

    # Subset the master table to the rows whose gene_family_symbol contains the family name
    filt = master_df[
        master_df['gene_family_symbol'].str.contains(family, na=False)
    ]

    # Merge key on the cluster side: text before the first "_" of gene_name, plus species
    cluster_df['gene_prefix'] = cluster_df['gene_name'].str.split('_').str[0]
    mapping = cluster_df[['gene_prefix', 'species', 'cluster_name']]

    # Merge on Gene_symbol + Species → gene_prefix + species
    merged = filt.merge(
        mapping,
        left_on=['Gene_symbol', 'Species'],
        right_on=['gene_prefix', 'species'],
        how='left'
    )

    # Rename the new column and remove the merge helper columns.
    # pandas only appends _x/_y suffixes to column names present on BOTH sides.
    # 'Species' (left) and 'species' (right) differ in case, so the helper normally
    # arrives as plain 'species'; dropping only 'species_y' left it in the output.
    # 'species_y' is still handled for a master table that has its own 'species' column.
    merged = merged.rename(columns={'cluster_name': 'cluster'})
    drop_cols = [c for c in ('gene_prefix', 'species', 'species_y') if c in merged.columns]
    if drop_cols:
        merged = merged.drop(columns=drop_cols)

    # (Optional) quick peek
    print(f"  {len(merged)} rows after merge (with new ‘cluster’ col)")

    # Write out the updated table
    out_tsv = os.path.join(
        data_dir,
        "sequences_x_updated",
        f"{family}_selected_isoform",
        "blastdb",
        f"{family}_compl_gene_details_updated_with_palindromes_coordinates.tsv"
    )
    merged.to_csv(out_tsv, sep="\t", index=False)
    print(f"  • Saved → {out_tsv}")

In [None]:
## One combined table: master gene details plus the cluster name, for all families

# 1) Load the master gene-details table and keep only rows with a gene_family_symbol
gene_details_file = f"{work_dir}/protein_extracted_longest/clusters_merged/gene_details_updated_with_palindromes_coordinates.tsv"

master_df = pd.read_csv(gene_details_file, sep="\t")
master_df = master_df.dropna(subset=["gene_family_symbol"])


# 2) Collect the per-family cluster assignments into one list of small frames
maps = []
for family in families:
    fn = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/{family}_clusters_named_auto.csv"
    if not os.path.exists(fn):
        print(f"[!] missing cluster file for {family}, skipping")
        continue

    cf = pd.read_csv(fn)
    # gene prefix (text before the first underscore) is matched against Gene_symbol
    cf["gene_prefix"] = cf["gene_name"].str.split("_").str[0]
    # keep only the merge keys and the cluster name, renamed to the master table's spelling
    maps.append(cf[["gene_prefix", "species", "cluster_name"]].rename(
        columns={"species": "Species", "cluster_name": "cluster"}
    ))

# One mapping table for all families; the first assignment wins when a
# (gene_prefix, Species) combination occurs more than once.
if maps:
    map_df = pd.concat(maps, ignore_index=True).drop_duplicates(
        subset=["gene_prefix", "Species"]
    )
else:
    # pd.concat([]) raises ValueError; with no cluster file at all, use an empty
    # mapping so the merge still runs and every gene gets an empty 'cluster'.
    print("[!] no cluster files found; 'cluster' will be empty for every gene")
    map_df = pd.DataFrame(columns=["gene_prefix", "Species", "cluster"])


# 3) Merge once onto master_df
#    left_on Gene_symbol + Species  → right_on gene_prefix + Species
merged_all = master_df.merge(map_df,
    left_on=["Gene_symbol", "Species"],
    right_on=["gene_prefix", "Species"],
    how="left"
)

# remove the helper key (errors="ignore": no-op when the column is absent)
merged_all = merged_all.drop(columns=["gene_prefix"], errors="ignore")

# 4) Save the combined table
out_file = f"{data_dir}/sequences_x_updated/all_families_gene_details_with_clusters2.tsv"

merged_all.to_csv(out_file, sep="\t", index=False)
print(f"→ Wrote combined table with cluster info for all families → {out_file}")


## MEGA analysis dN & dS calculation per cluster

### Define all the clusters 

In [None]:
# List, per family, the clusters that contain more than one sequence.
# cluster_list_per_family maps family -> list of cluster FASTA basenames (no ".fa").
cluster_list_per_family = {}

for family in families:
    blastdb_dir = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb"

    # The alignment directory is created here so that later cells can write into it
    cluster_alignments = f"{blastdb_dir}/cluster_alignments"
    os.makedirs(cluster_alignments, exist_ok=True)

    # Basename of every .fa file in the per-cluster FASTA directory
    cluster_dir = f"{blastdb_dir}/cluster_fastas"
    all_clusters = [
        os.path.splitext(fn)[0]
        for fn in os.listdir(cluster_dir)
        if fn.endswith(".fa")
    ]

    filtered = []
    for name in all_clusters:
        path = os.path.join(cluster_dir, f"{name}.fa")
        with open(path) as f:
            headers = [line for line in f if line.startswith(">")]

        # Single-sequence (or empty) FASTAs are not kept
        if len(headers) <= 1:
            continue
        filtered.append(name)

        # Sanity check: report sequence IDs that occur more than once in the file
        # (ID = first whitespace-delimited word after ">")
        seen, dups = set(), set()
        for header in headers:
            seqid = header[1:].split()[0]
            if seqid in seen:
                dups.add(seqid)
            else:
                seen.add(seqid)
        if dups:
            print(f"[{family}] {name}.fa has duplicate IDs: {', '.join(dups)}")

    # Remember the kept clusters for the alignment and dN/dS cells
    cluster_list_per_family[family] = filtered

    print(f"{family}: keeping {len(filtered)} clusters")

### Make a Codon based Multi-Sequence Alignment

In [None]:
### Codon-based alignment, step 1: align every multi-sequence cluster with MACSE

# Make sure every family has its cluster_alignments output directory
for family in families:
    ds_dir = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/cluster_alignments"
    os.makedirs(ds_dir, exist_ok=True)

for family, cluster_list in cluster_list_per_family.items():
    print(f"\n→ Aligning {len(cluster_list)} clusters for family {family!r}")
    cluster_alignments = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/cluster_alignments"
    cluster_fastas = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/cluster_fastas"

    for cluster in cluster_list:
        # macse alignSequences: reads the cluster FASTA and writes a nucleotide
        # alignment (_NT.fa) and an amino-acid alignment (_AA.fa)
        cmd = " ".join([
            "macse -prog alignSequences",
            f"-seq {cluster_fastas}/{cluster}.fa",
            f"-out_NT {cluster_alignments}/{cluster}_NT.fa",
            f"-out_AA {cluster_alignments}/{cluster}_AA.fa",
        ])
        # MACSE output is discarded; check=True still aborts on a non-zero exit code
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    print(f"→ Done family {family!r}")

In [None]:
## Codon-based alignment, step 2: refine the MACSE alignments
## (improves alignments that are difficult)
# Run this cell separately, after the alignment cell: it takes very long.

for family, cluster_list in cluster_list_per_family.items():
    # The message used to say "Aligning", copied from the alignment cell
    print(f"\n→ Refining {len(cluster_list)} cluster alignments for family {family!r}")
    cluster_alignments = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/cluster_alignments"

    for cluster in cluster_list:
        # The refined alignment overwrites the input nucleotide alignment in place
        nt_alignment = f"{cluster_alignments}/{cluster}_NT.fa"
        # Argument list (no shell): paths with spaces or metacharacters stay intact
        cmd = [
            "macse", "-prog", "refineAlignment",
            "-align", nt_alignment,
            "-out_NT", nt_alignment,
            "-out_AA", f"{cluster_alignments}/{cluster}_AA.fa",
        ]
        # run quietly; check=True still aborts on a non-zero exit code
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    print(f"→ Done family {family!r}")

In [None]:
## Codon-based alignment, step 3: clean the MACSE alignments
## so that they can be used by the downstream dN/dS analysis

for family, cluster_list in cluster_list_per_family.items():
    # The message used to say "Aligning", copied from the alignment cell
    print(f"\n→ Cleaning {len(cluster_list)} cluster alignments for family {family!r}")
    cluster_alignments = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/cluster_alignments"

    # MACSE marks frameshifts and internal stop codons in its output (e.g. with "!").
    # exportAlignment rewrites them: internal stop codons become NNN, frameshift
    # codons become ---.
    for cluster in cluster_list:
        # The cleaned alignment overwrites the input nucleotide alignment in place
        nt_alignment = f"{cluster_alignments}/{cluster}_NT.fa"
        # Argument list (no shell): paths with spaces or metacharacters stay intact
        cmd = [
            "macse", "-prog", "exportAlignment",
            "-align", nt_alignment,
            "-codonForInternalStop", "NNN",
            "-codonForInternalFS", "---",
            # NOTE(review): the option name suggests a single character; confirm that
            # MACSE accepts the three-character value "---" here
            "-charForRemainingFS", "---",
            "-out_NT", nt_alignment,
            "-out_AA", f"{cluster_alignments}/{cluster}_AA.fa",
        ]
        # run quietly; check=True still aborts on a non-zero exit code
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    print(f"→ Done family {family!r}")

### Calculate dN & dS and N & S counts for each pairwise comparison

In [None]:
## Synonymous substitution rate (dS) per cluster, computed with megacc
# Analysis settings come from compute_ds_modNG_compldel.mao
# (modified Nei-Gojobori, complete deletion, according to the original notes)
for family in families:
    blastdb_dir = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb"

    # Output directory for the dS results of this family
    ds_dir = f"{blastdb_dir}/cluster_dS"
    os.makedirs(ds_dir, exist_ok=True)

    # Multi-sequence clusters selected earlier
    clusters = cluster_list_per_family[family]

    # One megacc run per cluster: -a settings file, -d codon alignment, -o output prefix
    for cluster in clusters:
        cmd = " ".join((
            "megacc",
            f"-a {data_dir}/compute_ds_modNG_compldel.mao",
            f"-d {blastdb_dir}/cluster_alignments/{cluster}_NT.fa",
            f"-o {ds_dir}/{cluster}",
        ))
        # megacc output is discarded; check=True aborts on a non-zero exit code
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


In [None]:
## Nonsynonymous substitution rate (dN) per cluster, computed with megacc
# Analysis settings come from compute_dN_modNG_compldel.mao
# (modified Nei-Gojobori, complete deletion, according to the original notes)
for family in families:
    blastdb_dir = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb"

    # Output directory for the dN results of this family
    # (the variable keeps the name ds_dir used by the sibling cells)
    ds_dir = f"{blastdb_dir}/cluster_dN"
    os.makedirs(ds_dir, exist_ok=True)

    # Multi-sequence clusters selected earlier
    clusters = cluster_list_per_family[family]

    # One megacc run per cluster: -a settings file, -d codon alignment, -o output prefix
    for cluster in clusters:
        cmd = " ".join((
            "megacc",
            f"-a {data_dir}/compute_dN_modNG_compldel.mao",
            f"-d {blastdb_dir}/cluster_alignments/{cluster}_NT.fa",
            f"-o {ds_dir}/{cluster}",
        ))
        # megacc output is discarded; check=True aborts on a non-zero exit code
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

In [None]:
## Synonymous substitution COUNTS per cluster, computed with megacc
# (actual counts of synonymous differences; settings come from
#  compute_syn_count_modNG_compldel.mao)
for family in families:
    blastdb_dir = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb"

    # Output directory for the synonymous counts of this family
    # (the variable keeps the name ds_dir used by the sibling cells)
    ds_dir = f"{blastdb_dir}/cluster_syn_count"
    os.makedirs(ds_dir, exist_ok=True)

    # Multi-sequence clusters selected earlier
    clusters = cluster_list_per_family[family]

    # One megacc run per cluster: -a settings file, -d codon alignment, -o output prefix
    for cluster in clusters:
        cmd = " ".join((
            "megacc",
            f"-a {data_dir}/compute_syn_count_modNG_compldel.mao",
            f"-d {blastdb_dir}/cluster_alignments/{cluster}_NT.fa",
            f"-o {ds_dir}/{cluster}",
        ))
        # megacc output is discarded; check=True aborts on a non-zero exit code
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


In [None]:
## Nonsynonymous substitution COUNTS per cluster, computed with megacc
# (actual counts of nonsynonymous differences; settings come from
#  compute_nonsyn_count_modNG_compldel.mao)
for family in families:
    blastdb_dir = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb"

    # Output directory for the nonsynonymous counts of this family
    # (the variable keeps the name ds_dir used by the sibling cells)
    ds_dir = f"{blastdb_dir}/cluster_nonsyn_count"
    os.makedirs(ds_dir, exist_ok=True)

    # Multi-sequence clusters selected earlier
    clusters = cluster_list_per_family[family]

    # One megacc run per cluster: -a settings file, -d codon alignment, -o output prefix
    for cluster in clusters:
        cmd = " ".join((
            "megacc",
            f"-a {data_dir}/compute_nonsyn_count_modNG_compldel.mao",
            f"-d {blastdb_dir}/cluster_alignments/{cluster}_NT.fa",
            f"-o {ds_dir}/{cluster}",
        ))
        # megacc output is discarded; check=True aborts on a non-zero exit code
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


### Turn .meg files into csv files

In [None]:
# Parser for MEGA pairwise-distance files (define once)
def parse_meg_file(path: Path) -> pd.DataFrame:
    """Read a lower-left pairwise matrix from a MEGA ``.meg`` file.

    The file is expected to contain
      * label lines of the form ``[ 1] #name`` (one per sequence), followed by
      * matrix rows of the form ``[ i] v1 v2 ... v(i-1)``, i.e. row ``i`` holds
        the values of sequence ``i`` against sequences ``1 .. i-1``.

    Parameters
    ----------
    path : Path
        Location of the ``.meg`` file.

    Returns
    -------
    pd.DataFrame
        Square frame indexed and labelled by the sequence names. Only the strict
        lower triangle carries values; the diagonal and the upper triangle are NaN.
        A lower-triangle cell is NaN when its token is not a number (for example a
        ``?`` placeholder) or when its row could not be parsed (token count differs
        from ``i - 1``).
    """
    text = path.read_text().splitlines()

    # Pass 1: collect the labels. Stop at the first bracketed line without '#',
    # which is where the matrix part begins.
    labels = []
    for line in text:
        s = line.strip()
        if re.match(r"^\[\s*\d", s) and "#" not in s:
            break
        m = re.match(r"^\[\s*(\d+)\]\s*#\s*(.+)$", s)
        if m:
            labels.append(m.group(2).strip())

    n = len(labels)
    # NaN, not 0.0, marks "no value": a row that cannot be parsed must not look
    # like a distance of zero.
    full = np.full((n, n), np.nan)

    # Pass 2: fill the matrix from the rows "[i] v1 ... v(i-1)"
    for line in text:
        m = re.match(r"^\[\s*(\d+)\]\s*(.*)$", line.strip())
        if not m:
            continue
        i = int(m.group(1))
        rest = m.group(2)
        # Label lines share the "[i]" prefix; skip them so that digits inside a
        # sequence name (e.g. "GENE2.5_x") are never read as matrix values.
        if rest.startswith("#"):
            continue
        if not (1 <= i <= n):
            continue
        tokens = rest.split()
        if len(tokens) != i - 1:
            continue
        for k, tok in enumerate(tokens):
            try:
                v = float(tok)       # also accepts integers and exponent notation
            except ValueError:
                v = np.nan           # placeholder for a value that was not computed
            full[i - 1, k] = full[k, i - 1] = v

    np.fill_diagonal(full, 0.0)
    df = pd.DataFrame(full, index=labels, columns=labels)
    # Keep the strict lower triangle only; everything else becomes NaN
    mask = np.tril(np.ones(df.shape, bool), k=-1)
    return df.where(mask)

# base data directory as a pathlib.Path, so that the cells below can build paths with "/".
# Derived from the data_dir string defined in the configuration cell at the top of the
# notebook instead of repeating the absolute path; safe to re-run (Path(Path) is a no-op).
data_dir = Path(data_dir)

In [None]:
# Convert the dS matrices (synonymous RATE) of every family from .meg to CSV
for family in families:
    print(f"\n→ Processing dS matrices for family {family!r}")

    # megacc dS output of this family and the sub-folder that receives the CSVs
    in_dir = data_dir / "sequences_x_updated" / f"{family}_selected_isoform" / "blastdb" / "cluster_dS"
    out_dir = in_dir / "matrix_csvs"
    out_dir.mkdir(exist_ok=True)

    # One CSV per .meg file, processed in sorted file order
    meg_files = sorted(in_dir.glob("*.meg"))
    for meg in meg_files:
        out_file = out_dir / f"{meg.stem}_dS.csv"
        df_lower = parse_meg_file(meg)
        df_lower.to_csv(out_file)

In [None]:
# Convert the dN matrices (nonsynonymous RATE) of every family from .meg to CSV
for family in families:
    print(f"\n→ Processing dN matrices for family {family!r}")

    # megacc dN output of this family and the sub-folder that receives the CSVs
    in_dir = data_dir / "sequences_x_updated" / f"{family}_selected_isoform" / "blastdb" / "cluster_dN"
    out_dir = in_dir / "matrix_csvs"
    out_dir.mkdir(exist_ok=True)

    # One CSV per .meg file, processed in sorted file order
    meg_files = sorted(in_dir.glob("*.meg"))
    for meg in meg_files:
        out_file = out_dir / f"{meg.stem}_dN.csv"
        df_lower = parse_meg_file(meg)
        df_lower.to_csv(out_file)

In [None]:
# Convert the synonymous COUNT matrices of every family from .meg to CSV
for family in families:
    # The message used to say "dS matrices", copied from the dS cell
    print(f"\n→ Processing synonymous-count matrices for family {family!r}")

    # megacc synonymous-count output of this family and the sub-folder for the CSVs
    in_dir  = data_dir / f"sequences_x_updated/{family}_selected_isoform/blastdb/cluster_syn_count"
    out_dir = in_dir / "matrix_csvs"
    out_dir.mkdir(exist_ok=True)

    # One CSV per .meg file, processed in sorted file order
    for meg in sorted(in_dir.glob("*.meg")):
        df_lower = parse_meg_file(meg)
        out_file = out_dir / f"{meg.stem}_S_count.csv"
        df_lower.to_csv(out_file)


In [None]:
# Convert the nonsynonymous COUNT matrices of every family from .meg to CSV
for family in families:
    # The message used to say "dS matrices", copied from the dS cell
    print(f"\n→ Processing nonsynonymous-count matrices for family {family!r}")

    # megacc nonsynonymous-count output of this family and the sub-folder for the CSVs
    in_dir  = data_dir / f"sequences_x_updated/{family}_selected_isoform/blastdb/cluster_nonsyn_count"
    out_dir = in_dir / "matrix_csvs"
    out_dir.mkdir(exist_ok=True)

    # One CSV per .meg file, processed in sorted file order
    for meg in sorted(in_dir.glob("*.meg")):
        df_lower = parse_meg_file(meg)
        out_file = out_dir / f"{meg.stem}_N_count.csv"
        df_lower.to_csv(out_file)


### Combine all the synonymous and nonsynonymous tables

In [None]:
# Helper that extracts the number of sites (codons) of a cluster; it is called
# later while building the combined table.
def get_num_sites(cluster, family=None):
    """Return the number of sites reported for one cluster, or None.

    Parameters
    ----------
    cluster : str
        Cluster name as used in ``{family}_cluster_{cluster}.meg``.
    family : str, optional
        Gene family. When omitted, the notebook-level variable ``family`` is used
        (typically the loop variable of the calling cell), so existing calls
        ``get_num_sites(cluster)`` keep working.

    Returns
    -------
    int or None
        1. The integer after "No. of Sites" in the cluster's dN ``.meg`` file, if present.
        2. Otherwise the length of the first sequence of the cluster's codon
           alignment (``cluster_alignments/..._NT.fa``) divided by 3. This is the
           full alignment length, which may exceed the number of sites actually
           analysed when columns are deleted.
        3. None when the alignment is missing, unreadable or empty.

    Raises
    ------
    NameError
        If ``family`` is not given and no notebook-level ``family`` exists.
    FileNotFoundError
        If the ``.meg`` file does not exist.
    """
    if family is None:
        # Backward compatibility: fall back to the notebook-level loop variable
        if "family" not in globals():
            raise NameError("name 'family' is not defined; pass family=... explicitly")
        family = globals()["family"]

    # Path(...) makes this work whether data_dir is a str or a Path
    blastdb_dir = Path(data_dir) / f"sequences_x_updated/{family}_selected_isoform/blastdb"
    meg_path = blastdb_dir / "cluster_dN" / f"{family}_cluster_{cluster}.meg"
    text = meg_path.read_text()

    # 1) Reported value, accepting "=" or ":" as separator with any spacing.
    #    NOTE(review): MEGA output appears to use "No. of Sites : N"; confirm
    #    against a real .meg file.
    m = re.search(r"No\.?\s*of\s*Sites\s*[:=]\s*(\d+)", text, flags=re.IGNORECASE)
    if m:
        return int(m.group(1))

    # 2) Fallback: number of columns of the codon alignment, divided by 3.
    #    The .meg file holds a distance matrix, not sequences, so the length is
    #    read from the first record of the nucleotide alignment instead.
    aln_path = blastdb_dir / "cluster_alignments" / f"{family}_cluster_{cluster}_NT.fa"
    first_len = 0
    n_headers = 0
    try:
        with open(aln_path) as handle:
            for line in handle:
                if line.startswith(">"):
                    n_headers += 1
                    if n_headers > 1:
                        break            # only the first record is needed
                elif n_headers == 1:
                    first_len += len(line.strip())
    except OSError:
        return None
    return first_len // 3 if first_len else None

### Between species

In [None]:
import warnings
# NOTE(review): this silences *all* warnings for the rest of the kernel
# session. It is kept so later cells behave as before; consider narrowing it.
warnings.filterwarnings("ignore")
# — assume you already have:
#    families    : list of family names, e.g. ['NXF','ABC',…]
#    data_dir    : Path to "/home/emma/Amplicons/Workspaces/emma/downloaded_data"
#    species_list: list of species codes


def pair_stats(df, a, b):
    """
    Mean and sample SD of the matrix cells that pair species `a` with `b`.

    Row and column labels are matched with ``str.contains``, so `a` and `b`
    are treated as patterns that must occur inside the sequence labels.
    Cells are taken in both orientations (a-row/b-column and b-row/a-column)
    and NaN cells are dropped by ``stack()`` before the statistics are
    computed.

    Returns
    -------
    (mean, sd) : tuple of float
        Both are NaN when no cell matches; sd (ddof=1) is NaN for one value.
    """
    r = df.index.to_series().str.contains
    c = df.columns.to_series().str.contains
    mask = np.outer(r(a), c(b)) | np.outer(r(b), c(a))
    vals = df.where(mask).stack()
    return vals.mean(), vals.std(ddof=1)


for family in families:
    print(f"\n→ Processing between‐species stats for family {family}")

    # 1) define syn/nonsyn CSV directories
    syn_dir    = data_dir / f"sequences_x_updated/{family}_selected_isoform/blastdb/cluster_dS/matrix_csvs"
    nonsyn_dir = data_dir / f"sequences_x_updated/{family}_selected_isoform/blastdb/cluster_dN/matrix_csvs"

    # 2) discover cluster IDs from the dS file names
    clusters = []
    for f in syn_dir.glob(f"{family}_cluster_*_dS.csv"):
        m = re.match(rf"{family}_cluster_(.+)_dS\.csv", f.name)
        if m:
            clusters.append(m.group(1))
    clusters = sorted(set(clusters))
    print("  Found clusters:", clusters)

    # 3) load syn/nonsyn matrices and compute mean/SD per species pair
    records = []
    for cluster in clusters:
        syn_df    = (pd.read_csv(syn_dir/f"{family}_cluster_{cluster}_dS.csv", index_col=0)
                       .sort_index().sort_index(axis=1))
        nonsyn_df = (pd.read_csv(nonsyn_dir/f"{family}_cluster_{cluster}_dN.csv", index_col=0)
                       .sort_index().sort_index(axis=1))
        for sp1, sp2 in itertools.combinations(species_list, 2):
            m_s, sd_s = pair_stats(syn_df,    sp1, sp2)
            m_n, sd_n = pair_stats(nonsyn_df, sp1, sp2)
            records.append({
                "Cluster":      cluster,
                "Species1":     sp1,
                "Species2":     sp2,
                "Mean_Syn":     m_s,
                "SD_Syn":       sd_s,
                "Mean_Nonsyn":  m_n,
                "SD_Nonsyn":    sd_n
            })

    if not records:
        # No dS matrices (or fewer than two species): the column-less frame
        # built below would make dropna(subset=...) raise KeyError.
        print("  ⚠ no cluster/species-pair records – skipping family")
        continue

    master = pd.DataFrame(records)
    cols = ["Mean_Syn","SD_Syn","Mean_Nonsyn","SD_Nonsyn"]
    # Explicit copy: columns are added below, and writing into a dropna()
    # result is what triggers pandas' chained-assignment warning.
    master_clean = master.dropna(subset=cols, how="all").copy()

    # 4) pull “No. of Sites” (codons) from each cluster's .meg
    #    (get_num_sites reads the current `family` loop variable)
    site_map = {cl: get_num_sites(cl) for cl in master_clean["Cluster"].unique()}
    master_clean["No_of_Codon"] = master_clean["Cluster"].map(site_map)

    # 5) compute dN/dS ratio (inf/NaN where Mean_Syn is 0)
    master_clean["dNdS"] = master_clean["Mean_Nonsyn"] / master_clean["Mean_Syn"]

    # 6) annotate copy‐numbers from the species × cluster count table
    counts_csv = (
        data_dir
        / f"sequences_x_updated/{family}_selected_isoform/blastdb/species_cluster_counts_{family}.csv"
    )
    counts_df = pd.read_csv(counts_csv).set_index("species")
    master_clean["Species1_num_copies"] = [
        counts_df.at[s, c]
        for s, c in zip(master_clean["Species1"], master_clean["Cluster"])
    ]
    master_clean["Species2_num_copies"] = [
        counts_df.at[s, c]
        for s, c in zip(master_clean["Species2"], master_clean["Cluster"])
    ]

    # 7) build S/N counts table and merge
    rec2 = []
    syn_cnt_dir    = data_dir / f"sequences_x_updated/{family}_selected_isoform/blastdb/cluster_syn_count/matrix_csvs"
    nonsyn_cnt_dir = data_dir / f"sequences_x_updated/{family}_selected_isoform/blastdb/cluster_nonsyn_count/matrix_csvs"
    for cluster in clusters:
        s_df = (pd.read_csv(syn_cnt_dir/f"{family}_cluster_{cluster}_S_count.csv", index_col=0)
                  .sort_index().sort_index(axis=1))
        n_df = (pd.read_csv(nonsyn_cnt_dir/f"{family}_cluster_{cluster}_N_count.csv", index_col=0)
                  .sort_index().sort_index(axis=1))
        for sp1, sp2 in itertools.combinations(species_list, 2):
            ms, ss = pair_stats(s_df, sp1, sp2)
            mn, sn = pair_stats(n_df, sp1, sp2)
            rec2.append({
                "Cluster":           cluster,
                "Species1":          sp1,
                "Species2":          sp2,
                "Mean_Syn_count":    ms,
                "SD_Syn_count":      ss,
                "Mean_Nonsyn_count": mn,
                "SD_Nonsyn_count":   sn
            })

    counts_total = (
        pd.DataFrame(rec2)
        .dropna(subset=["Mean_Syn_count","SD_Syn_count","Mean_Nonsyn_count","SD_Nonsyn_count"], how="all")
    )

    final = (
        master_clean
        .merge(counts_total, on=["Cluster","Species1","Species2"], how="left")
        .round(4)
    )
    final[["Species1_num_copies","Species2_num_copies"]] = final[["Species1_num_copies","Species2_num_copies"]].astype(int)

    # 8) compute “potential synonymous sites” and adjusted dN/dS
    #    pot_syn_sites = S_count / dS; adj_dNdS = dN / ((S_count + 1) / pot_syn_sites).
    # NOTE(review): both are derived from the values already rounded to 4
    # decimals, and pot_syn_sites is inf/NaN when Mean_Syn is 0 — confirm
    # that this is intended.
    final["pot_syn_sites"] = final["Mean_Syn_count"] / final["Mean_Syn"]
    final["adj_dNdS"]      = (final["Mean_Nonsyn"]) / (
        (final["Mean_Syn_count"] + 1) / final["pot_syn_sites"]
    )

    # 9) save
    out_tsv = (
        data_dir
        / f"sequences_x_updated/{family}_selected_isoform/blastdb/{family}_dN_dS_betweenspecies.tsv"
    )
    final.to_csv(out_tsv, sep="\t", index=False)
    print(f"  → saved {out_tsv}")

### Within species

In [None]:
# ── ASSUMED DEFINED upstream ─────────────────────────────────────────
# families     = ['NXF','ABC','XYZ', …]
# data_dir     = Path("/home/emma/Amplicons/Workspaces/emma/downloaded_data")
# species_list = ["HomSap","PanTro","PanPan","GorGor","PonPyg","PonAbe","SymSyn","MacFas"]


def pair_stats(df, a, b):
    """
    Mean and sample SD of the matrix cells that pair species `a` with `b`.

    Labels are matched with ``str.contains``. Called with ``a == b`` in this
    cell, so it summarises the within-species cells. NaN cells are dropped by
    ``stack()``; both results are NaN when nothing matches, and the SD
    (ddof=1) is NaN for a single value.
    """
    idx0 = df.index.to_series().str.contains
    idx1 = df.columns.to_series().str.contains
    mask = np.outer(idx0(a), idx1(b)) | np.outer(idx0(b), idx1(a))
    vals = df.where(mask).stack()
    return vals.mean(), vals.std(ddof=1)


for family in families:
    print(f"\n→ Within‐species summary for family {family!r}")

    # 1) Directories for dS and dN matrices
    syn_dir    = data_dir / f"sequences_x_updated/{family}_selected_isoform/blastdb/cluster_dS/matrix_csvs"
    nonsyn_dir = data_dir / f"sequences_x_updated/{family}_selected_isoform/blastdb/cluster_dN/matrix_csvs"

    # 2) Discover cluster IDs from the dS filenames
    clusters = sorted({
        re.match(rf"{family}_cluster_(.+)_dS\.csv", p.name).group(1)
        for p in syn_dir.glob(f"{family}_cluster_*_dS.csv")
        if re.match(rf"{family}_cluster_(.+)_dS\.csv", p.name)
    })
    print("  clusters:", clusters)

    # 3) Build within‐species dS/dN rates table
    rate_records = []
    for cluster in clusters:
        s_df = (pd.read_csv(syn_dir/f"{family}_cluster_{cluster}_dS.csv", index_col=0)
                  .sort_index().sort_index(axis=1))
        n_df = (pd.read_csv(nonsyn_dir/f"{family}_cluster_{cluster}_dN.csv", index_col=0)
                  .sort_index().sort_index(axis=1))
        for sp in species_list:
            m_s, sd_s = pair_stats(s_df, sp, sp)
            m_n, sd_n = pair_stats(n_df, sp, sp)
            rate_records.append({
                "Cluster":      cluster,
                "Species":      sp,
                "Mean_Syn":     m_s,
                "SD_Syn":       sd_s,
                "Mean_Nonsyn":  m_n,
                "SD_Nonsyn":    sd_n
            })

    if not rate_records:
        # No dS matrices (or empty species_list): the column-less frame built
        # below would make dropna(subset=...) raise KeyError.
        print("  ⚠ no cluster/species records – skipping family")
        continue

    # Explicit copy: columns are added below, and writing into a dropna()
    # result is what triggers pandas' chained-assignment warning.
    within_rates = pd.DataFrame(rate_records).dropna(
        subset=["Mean_Syn","SD_Syn","Mean_Nonsyn","SD_Nonsyn"],
        how="all"
    ).copy()

    # 4) Extract “No. of Sites” (codons) from each cluster’s .meg
    #    (get_num_sites reads the current `family` loop variable)
    site_map = {c: get_num_sites(c) for c in within_rates["Cluster"].unique()}
    within_rates["No_of_Codon"] = within_rates["Cluster"].map(site_map)

    # 5) Compute dN/dS ratio (inf/NaN where Mean_Syn is 0)
    within_rates["dNdS"] = within_rates["Mean_Nonsyn"] / within_rates["Mean_Syn"]

    # 6) Annotate copy‐number from species×cluster counts
    cnt_csv = (data_dir
               / f"sequences_x_updated/{family}_selected_isoform/blastdb/"
               / f"species_cluster_counts_{family}.csv")
    cnt_df = pd.read_csv(cnt_csv).set_index("species")
    within_rates["num_copies"] = [
        cnt_df.at[sp, cl]
        for sp, cl in zip(within_rates["Species"], within_rates["Cluster"])
    ]

    # 7) Build within‐species raw count table (S_count / N_count)
    count_cols = [
        "Cluster", "Species",
        "Mean_Syn_count", "SD_Syn_count",
        "Mean_Nonsyn_count", "SD_Nonsyn_count",
    ]
    count_records = []
    syn_cnt_dir    = data_dir / f"sequences_x_updated/{family}_selected_isoform/blastdb/cluster_syn_count/matrix_csvs"
    nonsyn_cnt_dir = data_dir / f"sequences_x_updated/{family}_selected_isoform/blastdb/cluster_nonsyn_count/matrix_csvs"
    for cluster in clusters:
        sf = syn_cnt_dir   / f"{family}_cluster_{cluster}_S_count.csv"
        nf = nonsyn_cnt_dir/ f"{family}_cluster_{cluster}_N_count.csv"
        if not (sf.exists() and nf.exists()):
            print(f"  ⚠ skipping counts for {cluster}: missing file")
            continue
        s_df = pd.read_csv(sf, index_col=0).sort_index().sort_index(axis=1)
        n_df = pd.read_csv(nf, index_col=0).sort_index().sort_index(axis=1)
        for sp in species_list:
            ms, ss = pair_stats(s_df, sp, sp)
            mn, sn = pair_stats(n_df, sp, sp)
            count_records.append({
                "Cluster":           cluster,
                "Species":           sp,
                "Mean_Syn_count":    ms,
                "SD_Syn_count":      ss,
                "Mean_Nonsyn_count": mn,
                "SD_Nonsyn_count":   sn
            })
    # Explicit columns keep the frame well-formed when every count file was
    # skipped above (otherwise dropna(subset=...) raises KeyError).
    within_counts = pd.DataFrame(count_records, columns=count_cols).dropna(
        subset=["Mean_Syn_count","SD_Syn_count","Mean_Nonsyn_count","SD_Nonsyn_count"],
        how="all"
    )

    # 8) Merge rates + counts into one within‐species table
    within_species = within_rates.merge(
        within_counts,
        on=["Cluster","Species"],
        how="left"
    )

    # 9) Round decimals to 4 places, integers for codons & copies
    dec_cols = [
        "Mean_Syn","SD_Syn","Mean_Nonsyn","SD_Nonsyn",
        "Mean_Syn_count","SD_Syn_count",
        "Mean_Nonsyn_count","SD_Nonsyn_count","dNdS"
    ]
    within_species[dec_cols] = within_species[dec_cols].round(4)
    int_cols = ["No_of_Codon","num_copies"]
    for col in int_cols:
        # Nullable Int64: get_num_sites may return None, which a plain
        # astype(int) cannot represent. Missing values are written as
        # empty fields in the TSV.
        within_species[col] = (
            pd.to_numeric(within_species[col]).round(0).astype("Int64")
        )

    # 10) Save the combined table
    out_file = (data_dir
                / f"sequences_x_updated/{family}_selected_isoform/blastdb/"
                / f"{family}_dN_dS_withinspecies.tsv")
    within_species.to_csv(out_file, sep="\t", index=False)
    print(f"  • saved combined within‐species table → {out_file}")

### Combine all tables together

In [None]:
# BETWEEN SPECIES
#
# Stack the per-family between-species TSVs into one table.
# Expects `families` and `data_dir` to be defined by earlier cells.

# 1) Load every family's table and tag its rows with the family name
tables = []
for family in families:
    path = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/{family}_dN_dS_betweenspecies.tsv"
    df = pd.read_csv(path, sep="\t").assign(Family=family)
    tables.append(df)

# 2) One long frame with a fresh 0..n-1 index
combined = pd.concat(tables, ignore_index=True)

# 3) Write the combined table next to the per-family folders
out = f"{data_dir}/sequences_x_updated/all_families_dN_dS_betweenspecies.tsv"
combined.to_csv(out, sep="\t", index=False)

print(f"→ Wrote combined table with {len(combined)} rows to {out}")

In [None]:
# WITHIN SPECIES
#
# Stack the per-family within-species TSVs into one table.
# Expects `families` and `data_dir` to be defined by earlier cells.

# 1) Load every family's table and tag its rows with the family name
tables = []
for family in families:
    path = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/{family}_dN_dS_withinspecies.tsv"
    df = pd.read_csv(path, sep="\t").assign(Family=family)
    tables.append(df)

# 2) One long frame with a fresh 0..n-1 index
combined = pd.concat(tables, ignore_index=True)

# 3) Write the combined table next to the per-family folders
out = f"{data_dir}/sequences_x_updated/all_families_dN_dS_withinspecies.tsv"
combined.to_csv(out, sep="\t", index=False)

print(f"→ Wrote combined table with {len(combined)} rows to {out}")

In [None]:
### Cluster counts overview
# Load the per-gene detail tables (one row per gene, including its cluster)
# for the four data sets: Y / X "updated" and Y / X "longestisoform".
genes_y = pd.read_csv(
    f"{data_dir}/sequences_y_updated/all_families_gene_details_with_clusters.tsv",
    sep="\t",
)
genes_x = pd.read_csv(
    f"{data_dir}/sequences_x_updated/all_families_gene_details_with_clusters.tsv",
    sep="\t",
)
genes_y_long = pd.read_csv(
    f"{data_dir}/sequences_y_longestisoform/all_families_gene_details_with_clusters.tsv",
    sep="\t",
)
genes_x_long = pd.read_csv(
    f"{data_dir}/sequences_x_longestisoform/all_families_gene_details_with_clusters.tsv",
    sep="\t",
)

In [None]:
# Y chromosome ("updated" set): genes per Family x Cluster, one column per species.
# Group sizes are pivoted so every species becomes a column (0 where a species
# has no gene in that cluster); the grouping keys are then renamed for display.
counts_y = (
    genes_y
    .groupby(['gene_family_symbol', 'cluster', 'Species'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
    .rename(columns={'gene_family_symbol': 'Family', 'cluster': 'Cluster'})
)

# Persist the wide count table
counts_y.to_csv(f"{data_dir}/sequences_y_updated/cluster_counts_perspecies.tsv", sep="\t", index=False)

In [None]:
# X chromosome ("updated" set): genes per Family x Cluster, one column per species.
# Group sizes are pivoted so every species becomes a column (0 where a species
# has no gene in that cluster); the grouping keys are then renamed for display.
counts_x = (
    genes_x
    .groupby(['gene_family_symbol', 'cluster', 'Species'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
    .rename(columns={'gene_family_symbol': 'Family', 'cluster': 'Cluster'})
)

# Persist the wide count table
counts_x.to_csv(f"{data_dir}/sequences_x_updated/cluster_counts_perspecies.tsv", sep="\t", index=False)

In [None]:
# Y chromosome ("longestisoform" set): genes per Family x Cluster, one column per species.
# Group sizes are pivoted so every species becomes a column (0 where a species
# has no gene in that cluster); the grouping keys are then renamed for display.
counts_y_long = (
    genes_y_long
    .groupby(['gene_family_symbol', 'cluster', 'Species'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
    .rename(columns={'gene_family_symbol': 'Family', 'cluster': 'Cluster'})
)

# Persist the wide count table
counts_y_long.to_csv(f"{data_dir}/sequences_y_longestisoform/cluster_counts_perspecies.tsv", sep="\t", index=False)

In [None]:
# X chromosome ("longestisoform" set): genes per Family x Cluster, one column per species.
# Group sizes are pivoted so every species becomes a column (0 where a species
# has no gene in that cluster); the grouping keys are then renamed for display.
counts_x_long = (
    genes_x_long
    .groupby(['gene_family_symbol', 'cluster', 'Species'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
    .rename(columns={'gene_family_symbol': 'Family', 'cluster': 'Cluster'})
)

# Persist the wide count table
counts_x_long.to_csv(f"{data_dir}/sequences_x_longestisoform/cluster_counts_perspecies.tsv", sep="\t", index=False)

## After bootstrapping merge dNdS analysis
#### The bootstrapping itself is done in another script!

In [None]:
## bootstrap information
# Results table written by the separate bootstrapping script.
bootstrap_y = pd.read_csv(
    '/home/emma/Amplicons/Workspaces/emma/downloaded_data/X_updated_bootstrap_results_20251101_140844/bootstrap_results.csv',
    sep=",",
)

# `bootstrap` is an alias (not a copy): columns added to it below also
# appear on `bootstrap_y`.
bootstrap = bootstrap_y
# add new columns that state the amount of times you have "0" in the bootstrapped list
import numpy as np

# Helper: share of exact zeros in a comma-separated list of numbers
def frac_zeros(val):
    """
    Return the fraction of entries equal to 0 in `val`.

    `val` is a string of comma-separated numbers; every token is converted
    with ``float`` (a non-numeric token raises ``ValueError``).
    """
    values = np.fromiter((float(token) for token in val.split(",")), dtype=float)
    return (values == 0).mean()

# Fraction of zero-valued bootstrap replicates, for the dN and the dS lists
for rate_col, zero_col in (
    ("dN_rates", "dN_fraction_zeros"),
    ("dS_rates", "dS_fraction_zeros"),
):
    bootstrap[zero_col] = bootstrap[rate_col].map(frac_zeros)

# calculate mean dNdS + mean dS + mean dN
# keep in mind paired bootrsapped. 
# calculate the mean of the ratios mean(dN[i]/dS[i])
import numpy as np

def calc_bootstrap_stats(dn_str, ds_str):
    """
    Summarise paired bootstrap replicates of dN and dS.

    Both arguments are comma-separated strings of numbers; entry i of each
    list is treated as one pair.

    Returns
    -------
    (mean_dnds, mean_dn, mean_ds)
        mean_dnds is the mean of the per-replicate ratios dN[i]/dS[i];
        replicates with dS == 0 are left as NaN and ignored by the mean.
        mean_dn and mean_ds are plain means over all replicates.
    """
    dn, ds = (
        np.array([float(token) for token in text.split(",")])
        for text in (dn_str, ds_str)
    )

    # Pre-fill with NaN so positions with dS == 0 stay undefined
    ratios = np.full_like(dn, np.nan)
    np.divide(dn, ds, out=ratios, where=ds != 0)

    return np.nanmean(ratios), np.mean(dn), np.mean(ds)


# Row-wise: mean dN/dS, mean dN and mean dS of every bootstrap row
def _row_bootstrap_stats(row):
    """Wrap calc_bootstrap_stats so apply() expands the tuple into three columns."""
    return pd.Series(calc_bootstrap_stats(row["dN_rates"], row["dS_rates"]))

bootstrap[["mean_dNdS", "mean_dN", "mean_dS"]] = bootstrap.apply(_row_bootstrap_stats, axis=1)


# counts but without filtering out the ones with division of dS  by zero -> this would just give infinite which would count as above 1 ? 
import numpy as np
import pandas as pd

def count_ratio_below_above_1(dn_str, ds_str):
    """
    Count bootstrap replicates with dN/dS below 1 and above 1.

    Both arguments are comma-separated strings of numbers, parsed with
    ``np.fromstring``. A replicate with dS == 0 gets the ratio +inf (also
    when dN is 0) and is therefore counted as "above 1". Ratios exactly
    equal to 1 are counted in neither group.

    Returns
    -------
    (below, above) : tuple of int
    """
    dn = np.fromstring(dn_str, sep=",")
    ds = np.fromstring(ds_str, sep=",")

    # Pre-fill with +inf so positions with dS == 0 keep that value
    ratios = np.full_like(dn, np.inf)
    np.divide(dn, ds, out=ratios, where=ds != 0)

    n_below = int((ratios < 1).sum())
    n_above = int((ratios > 1).sum())
    return n_below, n_above

# Row-wise: number of replicates with dN/dS < 1 and > 1
def _row_ratio_counts(row):
    """Wrap count_ratio_below_above_1 so apply() expands the tuple into two columns."""
    return pd.Series(count_ratio_below_above_1(row["dN_rates"], row["dS_rates"]))

bootstrap[["dNdS_count_below1", "dNdS_count_above1"]] = bootstrap.apply(_row_ratio_counts, axis=1)

# Selection call from the bootstrap distribution of dN/dS
ALPHA   = 0.05
BOOT_N  = 10000  # total bootstraps
# NOTE(review): BOOT_N must equal the number of replicates stored per row by
# the bootstrapping script — confirm.

below_counts = bootstrap["dNdS_count_below1"]
above_counts = bootstrap["dNdS_count_above1"]

# Share of replicates on either side of 1
bootstrap["frac_below1"] = below_counts / BOOT_N
bootstrap["frac_above1"] = above_counts / BOOT_N

# Whatever is neither below nor above 1 (nor, if such a column is present,
# counted as NaN) is the mass sitting exactly at 1.
leftover = BOOT_N - below_counts - above_counts
if "dNdS_count_nan" in bootstrap.columns:
    leftover = leftover - bootstrap["dNdS_count_nan"]
bootstrap["frac_equal1"] = leftover / BOOT_N

# Keep all three fractions inside [0, 1]
frac_cols = ["frac_below1","frac_above1","frac_equal1"]
bootstrap[frac_cols] = bootstrap[frac_cols].clip(lower=0, upper=1)

# One-sided calls: at most ALPHA of the replicates fall on the opposite side.
# Rows satisfying both conditions are labelled "neutral (~1)"; that case is
# listed first because np.select takes the first matching condition.
positive   = bootstrap["frac_below1"].le(ALPHA)
purifying  = bootstrap["frac_above1"].le(ALPHA)
neutralish = positive & purifying

conditions = [neutralish, positive, purifying]
labels = ["neutral (~1)", "positive", "purifying"]
bootstrap["selection"] = np.select(conditions, labels, default="nonsignificant")

# Tidy display (rounding happens after the labels were assigned)
cols_to_round = ["frac_below1","frac_above1","frac_equal1"]
bootstrap[cols_to_round] = bootstrap[cols_to_round].round(4)
bootstrap


In [None]:
# Pairwise dN/dS overview: stack the between-species and within-species tables.
# NOTE: the variables carry a `y_` prefix but are read from sequences_x_updated.
y_between_overview = pd.read_csv(
    '/home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/all_families_dN_dS_betweenspecies.tsv',
    sep="\t",
)
y_within_overview = pd.read_csv(
    '/home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/all_families_dN_dS_withinspecies.tsv',
    sep="\t",
)

# --- Step 1: give the within-species table the between-species layout ---
# Each single-species column becomes a (Species1, Species2) pair holding the
# same value twice.
y_within_modified = (
    y_within_overview
    .rename(columns={"num_copies": "Species1_num_copies", "Species": "Species1"})
    .assign(
        Species2_num_copies=lambda frame: frame["Species1_num_copies"],
        Species2=lambda frame: frame["Species1"],
    )
)

# --- Step 2: drop the columns that only exist in the between-species table ---
y_between_modified = y_between_overview.drop(columns=["pot_syn_sites", "adj_dNdS"])

# --- Step 3: same columns, same order, then concatenate ---
y_within_modified = y_within_modified[y_between_modified.columns]

merged_overview_y = pd.concat([y_between_modified, y_within_modified], ignore_index=True)
merged_overview_y

In [None]:
# left horizontal merge of merged_overview and bootstrap dataframes
import pandas as pd

# 1) Columns of the bootstrap table that are carried into the overview
boot_cols = [
    "family", "cluster", "species1", "species2",
    "mean_dNdS", "mean_dN", "mean_dS",
    "frac_below1", "frac_above1", "selection"
]

# 2) Key columns renamed to the capitalised names used by merged_overview
key_renames = {
    "family": "Family",
    "cluster": "Cluster",
    "species1": "Species1",
    "species2": "Species2"
}
boot_sub = bootstrap[boot_cols].copy().rename(columns=key_renames)

# 3) Cluster: keep only what follows the first "_cluster_" (values without
#    that marker are kept whole), with surrounding whitespace stripped
cluster_parts = boot_sub["Cluster"].astype(str).str.split(pat="_cluster_", n=1, expand=False)
boot_sub["Cluster"] = cluster_parts.str[-1].str.strip()

# --- starting from your prepared `boot_sub` (with Cluster cleaned) and `merged_overview` ---

def with_canonical_species(df):
    """
    Return a copy of `df` with an order-independent species pair added.

    Species1/Species2 are cast to str and stripped, then written to the helper
    columns `_SpeciesA` / `_SpeciesB` so that `_SpeciesA` is the name that
    sorts first case-insensitively. A–B and B–A rows therefore share the same
    helper key. The input frame and its Species1/Species2 columns are left
    unchanged.
    """
    first = df["Species1"].astype(str).str.strip()
    second = df["Species2"].astype(str).str.strip()

    # True where the pair is stored in reverse (case-insensitive) order
    swapped = first.str.lower() > second.str.lower()

    result = df.copy()
    result["_SpeciesA"] = first.mask(swapped, second)
    result["_SpeciesB"] = second.mask(swapped, first)
    return result

# Add the order-independent species key to both tables
mo = with_canonical_species(merged_overview_y)
bs = with_canonical_species(boot_sub)

# Keep the first bootstrap row per canonical key so the left merge below
# cannot multiply overview rows
bs = bs.drop_duplicates(subset=["Family", "Cluster", "_SpeciesA", "_SpeciesB"])

# Left merge: every overview row is kept; bootstrap columns are NaN where no
# bootstrap result exists for that Family / Cluster / species pair
final_df_y = pd.merge(
    mo,
    bs[[
        "Family", "Cluster", "_SpeciesA", "_SpeciesB",
        "mean_dNdS", "mean_dN", "mean_dS", "frac_below1", "frac_above1", "selection"
    ]],
    how="left",
    on=["Family", "Cluster", "_SpeciesA", "_SpeciesB"]
)

# Drop helper columns
final_df_y = final_df_y.drop(columns=["_SpeciesA", "_SpeciesB"])

# Rows whose bootstrap means of dN and dS are both exactly 0 are labelled
# "purifying", overriding the bootstrap-based call.
mask_both_zero = (final_df_y["mean_dN"] == 0) & (final_df_y["mean_dS"] == 0)
final_df_y.loc[mask_both_zero, "selection"] = "purifying"

# Written relative to data_dir (instead of a hard-coded absolute path) so the
# file lands exactly where the next cell reads it from.
final_df_y.to_csv(f"{data_dir}/sequences_x_updated/Bootstrap_all_families_dN_dS_between_within_species_x_updated2.csv", index=False)


In [None]:
# Overview of the within species positive selection:
# reload the merged dN/dS + bootstrap table saved by the previous cell
dnds_total = pd.read_csv(
    f"{data_dir}/sequences_x_updated/Bootstrap_all_families_dN_dS_between_within_species_x_updated2.csv"
)
dnds_total

In [None]:
# extract within species 
# only positive selection filtering
import numpy as np
import pandas as pd

# 1) Numeric view of dNdS: text that cannot be parsed as a number becomes NaN
df = dnds_total.assign(dNdS_num=pd.to_numeric(dnds_total["dNdS"], errors="coerce"))

# 2) Discard rows whose ratio is NaN or ±inf
df = df.loc[np.isfinite(df["dNdS_num"])]

# 3) Within-species rows (Species1 == Species2) with dN/dS above 1
cols = ["Family", "Cluster", "Species1", "Species1_num_copies", "dNdS", "selection"]
is_within = df["Species1"].eq(df["Species2"])
is_above_one = df["dNdS_num"].gt(1)
dnds_filtered = df.loc[is_within & is_above_one, cols]

# save
dnds_filtered.to_csv('/home/emma/Amplicons/Workspaces/emma/downloaded_data/sequences_x_updated/within_positive_selection.csv', index=False)

dnds_filtered

In [None]:
# One row per Family/Cluster: the species involved, their dN/dS values
# (rounded to 2 decimals) and their copy numbers, each collected in a list
df = dnds_filtered.copy()
df["dNdS"] = df["dNdS"].astype(float).round(2)

result = (
    df.groupby(["Family", "Cluster"])
      .agg(
          Species1=("Species1", lambda x: list(x.unique())),
          dNdS=("dNdS", list),
          Species1_num_copies=("Species1_num_copies", list),
      )
      .reset_index()
)

result