In [None]:
import pandas as pd
import numpy as np

import uuid
import ast
import os

from Bio import AlignIO
from Bio import SeqIO
from Bio import AlignIO, Phylo
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from collections import Counter

from ete3 import Tree, TreeStyle, NodeStyle, faces, AttrFace

import scipy.stats as stats
from scipy.cluster.hierarchy import linkage, dendrogram

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff

from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate and map the Codon rarity at each position of a multiple sequence alignment

### Function to calculate the codon rarity at each position of a multiple sequence alignment:
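CRS = Codon Rarity Score, AA = Amino Acid, occ = Occurrence, f_c = Frequency of codon, len = Length, aln = Alignment, gaps = Gaps, n_aln = Number of sequences in the alignment, n_c = Number of occurrences of a codon in the alignment, n_AA = Total number of occurrences of the amino acid in the alignment, n_cAA = Number of synonymous codons for the amino acid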
$$ CRS_{position}= {\sum \limits _{AA_{AA}} ^{n_{aln}}(\sum \limits _{occ=1} ^{n_{aln}} {AA_{occ}} * f_c) \over {len_{total}(alignment)} - (gaps)} $$
$$ f_c = { \sum n_c \over \sum n_{AA} } * { 1 \over n_{cAA} } $$

## Set data to analyse

In [None]:
# Input files and run settings for this analysis.
alignment_file = "/Users/dominiquefastus/master_project/Data/protFAMS/peptM7/nustruTREE/MSA/peptM7_nustru_secstru_filtered_protein_aligned.fasta"
nt_fasta_file = None
nustruDB = "/Users/dominiquefastus/master_project/Data/protFAMS/peptM7/peptM7_nustru_secstru_filtered.csv"
tree_file = "/Users/dominiquefastus/master_project/Data/protFAMS/peptM7/nustruTREE/TREE/rerooted_tree_file.nwk"
output_path = "/Users/dominiquefastus/master_project/NuStru/nustruEVOL/"
working_name = os.path.basename(alignment_file)
family_name = None

# Unique id so that files written by this run do not collide with earlier runs.
job_id = uuid.uuid4()

nustrudb = pd.read_csv(nustruDB)
protein_alignment = AlignIO.read(alignment_file, "fasta")
if nt_fasta_file is not None:
    nucleotide_sequences = SeqIO.parse(nt_fasta_file, "fasta")
else:
    # No nucleotide FASTA given: build one from the nustruDB table, in the
    # same order as the records of the protein alignment.
    nt_fasta_path = os.path.join(output_path, f"{job_id}_nucleotide_sequences.fasta")
    skipped_ids = []
    with open(nt_fasta_path, "a") as f:
        for record in protein_alignment:
            matches = nustrudb.loc[nustrudb["primary_id"] == record.id]
            if matches.empty:
                # Best effort: records without a database entry are skipped,
                # but reported below instead of being dropped silently.
                skipped_ids.append(record.id)
                continue
            nucleotide_id = matches["nucleotide_id"].values[0]
            nucleotide_sequence = matches["nucleotide_sequence"].values[0]
            f.write(f">{nucleotide_id}\n{nucleotide_sequence}\n")
    if skipped_ids:
        # NOTE(review): a skipped record leaves the nucleotide FASTA with fewer
        # entries than the alignment, so row indices no longer match when the
        # codons are aligned to the protein rows in fasta_to_array.
        print(f"Skipped {len(skipped_ids)} aligned record(s) without an entry in nustruDB: {skipped_ids}")
    nucleotide_sequences = SeqIO.parse(nt_fasta_path, "fasta")

## Define functions

In [None]:
def fasta_to_array(fasta, align_to=None, codon=False):
    """Turn sequence records into a matrix of residues or codons.

    Args:
        fasta: Iterable of records exposing ``.seq`` (and ``.id``).
        align_to: Optional 2-D character array of an aligned protein matrix.
            For every ``'-'`` found in it, a ``'---'`` codon is inserted at
            the same row/column of the codon rows. Only usable together with
            ``codon=True`` because the rows must still be Python lists.
        codon: When true, each sequence is split into consecutive triplets
            instead of single characters.

    Returns:
        With ``codon=False``: a NumPy array with one character per cell.
        With ``codon=True`` and no ``align_to``: a list of codon lists.
        With ``codon=True`` and ``align_to``: a NumPy array of codons whose
        last column has been removed.
    """
    if codon:
        all_seqs = []
        for record in fasta:
            nucleotides = str(record.seq)
            # A trailing incomplete triplet is kept as a shorter string.
            all_seqs.append(
                [nucleotides[start:start + 3] for start in range(0, len(nucleotides), 3)]
            )
    else:
        all_seqs = np.array([list(str(record.seq)) for record in fasta])

    if align_to is not None:
        gap_positions = np.where(align_to == '-')
        # np.where lists positions row by row with ascending columns, so each
        # insertion lands on the final alignment column.
        for row, col in zip(gap_positions[0], gap_positions[1]):
            all_seqs[row].insert(col, '---')

        # deleting stop codons as no protein assigned to them
        all_seqs = np.delete(np.array(all_seqs), -1, axis=1)

    return all_seqs

def filter_columns_by_gap_threshold(seq_arr, threshold=0.5):
    """Drop the columns of a 2-D array that contain too many zero entries.

    A cell equal to ``0`` is counted as a gap. A column is kept when its
    fraction of gap cells is less than or equal to ``threshold``.

    Args:
        seq_arr: 2-D NumPy array (rows are sequences, columns are positions).
        threshold: Largest gap fraction a column may have and still be kept.

    Returns:
        Tuple ``(filtered_seq_arr, deleted_columns)``: the array restricted to
        the kept columns and the list of removed column indices.
    """
    n_rows = seq_arr.shape[0]

    # Gap fraction of every column, computed in one pass
    gap_fraction = np.sum(seq_arr == 0, axis=0) / n_rows
    is_valid = gap_fraction <= threshold

    valid_columns = np.flatnonzero(is_valid).tolist()
    deleted_columns = np.flatnonzero(~is_valid).tolist()

    return seq_arr[:, valid_columns], deleted_columns

def cub_msa_table(prot_seq_arr=None, cod_seq_arr=None):
    """Build a codon usage table from an aligned protein/codon array pair.

    For every codon the value is ``f_c = (n_c / n_AA) * (1 / n_cAA)`` where
    ``n_c`` is the number of cells of ``cod_seq_arr`` equal to the codon,
    ``n_AA`` the number of cells of ``prot_seq_arr`` equal to the amino acid
    and ``n_cAA`` the number of synonymous codons of that amino acid.
    Stop codons are ignored.

    Args:
        prot_seq_arr: Array of single-letter amino acids.
        cod_seq_arr: Array of codon strings (compared case-sensitively).

    Returns:
        Nested dict ``{amino_acid: {codon: frequency}}`` with frequencies
        rounded to 6 decimals. Amino acids that do not occur in
        ``prot_seq_arr`` get a frequency of 0.0 for all their codons.
    """
    synonymous_codons = {
        # '*': ('TAA', 'TAG', 'TGA'), ignoring stop codons
        'A': ('GCA', 'GCC', 'GCG', 'GCT'),
        'C': ('TGC', 'TGT'),
        'D': ('GAC', 'GAT'),
        'E': ('GAA', 'GAG'),
        'F': ('TTC', 'TTT'),
        'G': ('GGA', 'GGC', 'GGG', 'GGT'),
        'H': ('CAC', 'CAT'),
        'I': ('ATA', 'ATC', 'ATT'),
        'K': ('AAA', 'AAG'),
        'L': ('CTA', 'CTC', 'CTG', 'CTT', 'TTA', 'TTG'),
        'M': ('ATG',),
        'N': ('AAC', 'AAT'),
        'P': ('CCA', 'CCC', 'CCG', 'CCT'),
        'Q': ('CAA', 'CAG'),
        'R': ('AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGT'),
        'S': ('AGC', 'AGT', 'TCA', 'TCC', 'TCG', 'TCT'),
        'T': ('ACA', 'ACC', 'ACG', 'ACT'),
        'V': ('GTA', 'GTC', 'GTG', 'GTT'),
        'W': ('TGG',),
        'Y': ('TAC', 'TAT'),
    }

    cub_table = {}
    for aa, codons in synonymous_codons.items():
        n_AA = np.count_nonzero(prot_seq_arr == aa)
        nc_AA = len(codons)

        cub_table[aa] = {}
        for codon in codons:
            nc = np.count_nonzero(cod_seq_arr == codon)

            # An amino acid missing from the alignment would divide by zero;
            # its codons are reported with frequency 0 instead.
            fc = (nc / n_AA) / nc_AA if n_AA else 0.0

            cub_table[aa][codon] = round(fc, 6)

    return cub_table

def map_rarity(protein_alignment, nustrudb, cu_table):
    """Map codon frequencies onto every cell of a protein alignment.

    For each non-gap residue the codon is read from the sequence's
    ``nucleotide_sequence`` in ``nustrudb`` (matched on ``primary_id``), at
    the residue's ungapped position, upper-cased and looked up in
    ``cu_table[amino_acid][codon]``. Gap cells keep the value 0.

    Args:
        protein_alignment: Alignment supporting ``len()``, iteration over
            records with ``.id``, ``alignment[0]`` and ``alignment[:, col]``.
        nustrudb: DataFrame with ``primary_id`` and ``nucleotide_sequence``.
        cu_table: Nested dict ``{amino_acid: {codon: frequency}}``.

    Returns:
        Tuple ``(alignment_value_matrix, seq_name, seq_pos, residue_sum,
        residue_max, sequence_sum)``: the value matrix, the sequence ids, the
        column indices, the per-column sum divided by the number of sequences,
        the per-column maximum, and a dict with the per-sequence sum divided
        by the alignment length (rounded to 5 decimals).

    Raises:
        IndexError: If a sequence with residues has no row in ``nustrudb``.
        KeyError: If a residue/codon pair is missing from ``cu_table``.

    NOTE(review): ``residue_sum`` divides by the total number of sequences,
    gaps included, whereas the CRS formula in this notebook subtracts the
    gaps from the denominator. Confirm which one is intended.
    """
    n_columns = len(protein_alignment[0])
    alignment_value_matrix = np.zeros((len(protein_alignment), n_columns))
    seq_name = [record.id for record in protein_alignment]
    seq_pos = list(range(n_columns))

    # Gaps seen so far per sequence; converts an alignment column into the
    # residue index of the ungapped sequence.
    gaps_seen = {name: 0 for name in seq_name}
    # Nucleotide sequence per id, fetched from the DataFrame once instead of
    # once per residue.
    nucleotide_cache = {}

    for position in range(n_columns):
        column = protein_alignment[:, position]
        for row, (aa, name) in enumerate(zip(column, seq_name)):
            if aa == '-':
                gaps_seen[name] += 1
                continue

            if name not in nucleotide_cache:
                nucleotide_cache[name] = nustrudb[nustrudb["primary_id"] == name]["nucleotide_sequence"].values[0]

            codon_start = (position - gaps_seen[name]) * 3
            codon = nucleotide_cache[name][codon_start:codon_start + 3].upper()
            alignment_value_matrix[row, position] = cu_table[aa][codon]

    n_sequences = len(seq_name)
    residue_sum = [col_total / n_sequences for col_total in np.sum(alignment_value_matrix, axis=0)]
    residue_max = list(np.max(alignment_value_matrix, axis=0))

    alignment_length = len(alignment_value_matrix[0])
    sequence_sum = {
        name: round(row_total / alignment_length, 5)
        for row_total, name in zip(np.sum(alignment_value_matrix, axis=1), seq_name)
    }

    return alignment_value_matrix, seq_name, seq_pos, residue_sum, residue_max, sequence_sum

def sum_distances_to_root(tree):
    """Return the summed branch length from every leaf up to the root.

    Args:
        tree: Tree object providing ``iter_leaves()``; nodes provide
            ``is_root()``, ``dist``, ``up`` and ``name``.

    Returns:
        Dict mapping each leaf name to the sum of ``dist`` over the leaf and
        its ancestors below the root, rounded to 5 decimals.
    """
    def _path_length(node):
        # Walk upwards, adding each branch length until the root is reached
        total = 0
        while not node.is_root():
            total += node.dist
            node = node.up
        return total

    return {leaf.name: round(_path_length(leaf), 5) for leaf in tree.iter_leaves()}

## Transform alignment and sequences to arrays and align the gaps if necessary

In [None]:
# Protein alignment as a 2-D character array (gaps stay '-')
all_seqs_protein = fasta_to_array(fasta=protein_alignment)
# Nucleotide sequences split into codons, with '---' inserted wherever the protein row has a gap
all_seqs_nt = fasta_to_array(nucleotide_sequences, align_to=all_seqs_protein, codon=True)

## Calculate the codon frequency for each amino acid based on the alignment

#### $$ f_c = { \sum n_c \over \sum n_{AA} } * { 1 \over n_{cAA} } $$

In [None]:
def cub_msa_table(prot_seq_arr=None, cod_seq_arr=None):
    """Compute the codon frequency of every codon from an alignment.

    Implements ``f_c = (n_c / n_AA) * (1 / n_cAA)``: the share of an amino
    acid's occurrences encoded by the codon, scaled by the number of
    synonymous codons of that amino acid. Stop codons are ignored.

    Args:
        prot_seq_arr: Array of single-letter amino acids.
        cod_seq_arr: Array of codon strings (compared case-sensitively).

    Returns:
        Nested dict ``{amino_acid: {codon: frequency}}`` with frequencies
        rounded to 5 decimals. Codons of an amino acid that never occurs in
        ``prot_seq_arr`` get the frequency 0.0.
    """
    synonymous_codons = {
        # '*': ('TAA', 'TAG', 'TGA'), ignoring stop codons
        'A': ('GCA', 'GCC', 'GCG', 'GCT'),
        'C': ('TGC', 'TGT'),
        'D': ('GAC', 'GAT'),
        'E': ('GAA', 'GAG'),
        'F': ('TTC', 'TTT'),
        'G': ('GGA', 'GGC', 'GGG', 'GGT'),
        'H': ('CAC', 'CAT'),
        'I': ('ATA', 'ATC', 'ATT'),
        'K': ('AAA', 'AAG'),
        'L': ('CTA', 'CTC', 'CTG', 'CTT', 'TTA', 'TTG'),
        'M': ('ATG',),
        'N': ('AAC', 'AAT'),
        'P': ('CCA', 'CCC', 'CCG', 'CCT'),
        'Q': ('CAA', 'CAG'),
        'R': ('AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGT'),
        'S': ('AGC', 'AGT', 'TCA', 'TCC', 'TCG', 'TCT'),
        'T': ('ACA', 'ACC', 'ACG', 'ACT'),
        'V': ('GTA', 'GTC', 'GTG', 'GTT'),
        'W': ('TGG',),
        'Y': ('TAC', 'TAT'),
    }

    cub_table = {}
    for aa, codons in synonymous_codons.items():
        # total number of the amino acid in the alignment
        n_AA = np.count_nonzero(prot_seq_arr == aa)

        # number of synonymous codons for the amino acid
        nc_AA = len(codons)

        cub_table[aa] = {}
        for codon in codons:
            # number of occurrences of the codon in the alignment
            nc = np.count_nonzero(cod_seq_arr == codon)

            # frequency of the codon; an amino acid that is absent from the
            # alignment would divide by zero, so its codons are set to 0
            fc = (nc / n_AA) / nc_AA if n_AA else 0.0

            # round the frequency to 5 decimal places
            cub_table[aa][codon] = round(fc, 5)

    return cub_table
        
# Codon frequency table derived from the aligned protein and codon arrays
cub_msa_table_ddla = cub_msa_table(all_seqs_protein, all_seqs_nt)
    

## Calculate the Codon Rarity Score for each position of the alignment

#### $$ CRS_{position}= {\sum \limits _{AA_{AA}} ^{n_{aln}}(\sum \limits _{occ=1} ^{n_{aln}} {AA_{occ}} * f_c) \over {len_{total}(alignment)} - (gaps)} $$

In [None]:
# Codon frequency table for this alignment, then the per-cell rarity values
cub_msa_table_ddla = cub_msa_table(all_seqs_protein, all_seqs_nt)
(
    alignment_value_matrix,
    seq_name,
    seq_pos,
    residue_sum,
    residue_max,
    sequence_sum,
) = map_rarity(protein_alignment, nustrudb, cub_msa_table_ddla)

# Sort every column independently in descending order; after this step a row
# no longer corresponds to a single sequence.
sorted_alignment_value_matrix = np.flip(np.sort(alignment_value_matrix, axis=0), axis=0)


In [None]:
# np.savez(f"{output_path}/cub_msa_table_ddla.npz", alignment_value_matrix=alignment_value_matrix, seq_name=seq_name, seq_pos=seq_pos, residue_sum=residue_sum, residue_max=residue_max, sequence_sum=sequence_sum)

In [None]:
# Remove gap-rich columns from both matrices. Sorting within a column does not
# change how many zeros it holds, so both calls report the same deleted columns.
alignment_value_matrix_filtered, deleted_columns = filter_columns_by_gap_threshold(
    seq_arr=alignment_value_matrix
)
sorted_alignment_value_matrix_filtered, deleted_columns = filter_columns_by_gap_threshold(
    seq_arr=sorted_alignment_value_matrix
)

In [None]:
# Drop the entries of the removed columns from the per-position lists.
# Indices are deleted from the back so earlier deletions do not shift later ones.
# The lists are modified in place, so running this cell twice deletes twice.
for per_position_values in (seq_pos, residue_max, residue_sum):
    for index in sorted(deleted_columns, reverse=True):
        del per_position_values[index]


In [None]:
# Consecutive 0-based positions for the remaining (gap-filtered) columns
seq_pos_new = list(range(len(residue_max)))

## Visualize the Codon Rarity Score for each position of the alignment in MSA

In [None]:
# Per-position score table with a centered rolling mean (window of 10) as the smoothed curve
df = pd.DataFrame({'x': seq_pos, 'y': residue_sum}).assign(
    smoothed_y=lambda frame: frame['y'].rolling(window=10, center=True).mean()
)

In [None]:
# Three stacked panels sharing the x axis:
# row 1 smoothed score, row 2 column-sorted heatmap, row 3 heatmap in sequence order
fig1 = make_subplots(rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.02)

unsorted_heatmap = go.Heatmap(z=alignment_value_matrix_filtered, y=seq_name, colorscale="blues", showlegend=False)
sorted_heatmap = go.Heatmap(z=sorted_alignment_value_matrix_filtered, y=seq_name, colorscale="blues")
smoothed_curve = go.Scatter(x=seq_pos_new, y=df['smoothed_y'], mode="lines", line=dict(color="navy"))

fig1.add_trace(unsorted_heatmap, row=3, col=1)
fig1.add_trace(sorted_heatmap, row=2, col=1)
fig1.add_trace(smoothed_curve, row=1, col=1)

# Only the y axis of the line plot is shown
for axis_key, is_visible in (('yaxis1', True), ('yaxis2', False), ('yaxis3', False)):
    fig1['layout'][axis_key]['visible'] = is_visible

for extension in ("png", "html"):
    target = f"{output_path}/{working_name}_codon_rarity_heatmap.{extension}"
    if extension == "png":
        fig1.write_image(target)
    else:
        fig1.write_html(target)

In [None]:
def get_secondary_structure(seq_names, aln_prot_arr, nustrudb):
    """Align each sequence's secondary structure string to the protein alignment.

    The ``secondary_structure`` column of ``nustrudb`` holds the string form
    of a dict ``{1-based residue position: structure code}``. It is parsed
    with ``ast.literal_eval`` and expanded to one code per residue; residues
    without an entry get ``'-'``. Gap columns of the alignment get ``'0'``.

    Args:
        seq_names: Sequence ids, in the same order as the rows of
            ``aln_prot_arr``.
        aln_prot_arr: 2-D character array of the protein alignment.
        nustrudb: DataFrame with ``primary_id`` and ``secondary_structure``.

    Returns:
        NumPy array with one row of structure codes per sequence.

    Raises:
        IndexError: If a sequence id has no row in ``nustrudb``.
    """
    aligned_secstru_list = []

    # Rows are addressed by position rather than by seq_names.index(), so
    # duplicate ids still map to their own alignment row.
    for row_index, seq in enumerate(seq_names):
        raw_secstru = nustrudb[nustrudb["primary_id"] == seq]["secondary_structure"].values[0]
        secstru_dict = ast.literal_eval(raw_secstru)

        # One code per residue; positions missing from the dict stay '-'
        max_pos = max(secstru_dict.keys(), default=0)
        secstru_list = ['-'] * max_pos
        for pos, ss in secstru_dict.items():
            secstru_list[pos - 1] = ss  # Adjust for 1-based to 0-based indexing

        aligned_secstru = []
        secstru_idx = 0
        for aa in aln_prot_arr[row_index]:
            if aa == '-':
                aligned_secstru.append('0')
                continue
            # Residues beyond the last annotated position are treated like
            # any other unannotated residue instead of raising IndexError.
            if secstru_idx < len(secstru_list):
                aligned_secstru.append(secstru_list[secstru_idx])
            else:
                aligned_secstru.append('-')
            secstru_idx += 1

        aligned_secstru_list.append(aligned_secstru)

    return np.array(aligned_secstru_list)

def secstru_to_numeric(secstru_arr):
    """Encode secondary structure codes as integers for plotting.

    Mapping: ``'H'`` -> 1, ``'E'`` -> 2, ``'-'`` -> 3; ``'0'`` and every
    other code -> 0.

    Args:
        secstru_arr: Iterable of rows, each an iterable of structure codes.

    Returns:
        NumPy array of the encoded rows.
    """
    numeric_code = {'H': 1, 'E': 2, '-': 3}
    return np.array(
        [[numeric_code.get(element, 0) for element in secstru] for secstru in secstru_arr]
    )

def plot_secondary_structure_frequency(secstru_numeric, smoothed_y):
    """Plot per-position secondary structure frequencies with a smoothed curve.

    Args:
        secstru_numeric: 2-D array of numeric structure codes with one row per
            sequence (1 = helix, 2 = sheet, 3 = coil, 0 = gap, following the
            legend labels used below).
        smoothed_y: Values drawn as a dashed line over the same positions.

    The figure is saved as ``secondary_structure_peptm7_no_gaps`` in the
    current working directory; nothing is returned.
    """
    per_position = secstru_numeric.T  # positions as rows, sequences as columns
    n_sequences = per_position.shape[1]
    positions = np.arange(per_position.shape[0])

    # Fraction of sequences in each state at every position
    state_labels = ((1, 'Helix (H)'), (2, 'Sheet (E)'), (3, 'Coil (C)'), (0, 'Gap (0)'))
    state_frequencies = [
        (np.sum(per_position == state, axis=1) / n_sequences, label)
        for state, label in state_labels
    ]

    fig = plt.figure(figsize=(14, 10))
    gs = GridSpec(2, 1, height_ratios=[3, 1])

    # Top panel: one line per state plus the smoothed curve
    ax1 = fig.add_subplot(gs[0])
    for frequency, label in state_frequencies:
        ax1.plot(positions, frequency, label=label)
    ax1.plot(positions, smoothed_y, label='Smoothed Y', linewidth=2, linestyle='--')

    ax1.set_xlabel('Position')
    ax1.set_ylabel('Frequency / Smoothed Y')
    ax1.set_title('Frequency of Secondary Structure Types Over Position with Smoothed Y')
    ax1.legend()

    # Bottom panel: the axes are created but left empty (the bar plot of the
    # same frequencies is currently disabled).
    fig.add_subplot(gs[1])

    plt.tight_layout()

    plt.savefig("secondary_structure_peptm7_no_gaps")

# Secondary structure per alignment cell, encoded as integers
aligned_secstru_arr = get_secondary_structure(
    seq_names=seq_name, aln_prot_arr=all_seqs_protein, nustrudb=nustrudb
)
secstru_numeric = secstru_to_numeric(secstru_arr=aligned_secstru_arr)
# Remove the same gap-rich columns that were filtered from the rarity matrix
secstru_numeric_ng = np.delete(secstru_numeric, deleted_columns, axis=1)
plot_secondary_structure_frequency(secstru_numeric=secstru_numeric_ng, smoothed_y=df['smoothed_y'])

In [None]:
# Split the column-sorted matrix into two consecutive groups of 200 rows.
# The second group starts at row 200 so that no row is dropped between them.
split1 = sorted_alignment_value_matrix_filtered[0:200, :]
split2 = sorted_alignment_value_matrix_filtered[200:400, :]

# NOTE(review): both groups are divided by the total number of sequences, not
# by the number of rows in the group. Confirm this normalisation is intended.
residue_sum1 = [col_total / len(seq_name) for col_total in np.sum(split1, axis=0)]
df1 = pd.DataFrame({
    'x': seq_pos,
    'y': residue_sum1
})

residue_sum2 = [col_total / len(seq_name) for col_total in np.sum(split2, axis=0)]
df2 = pd.DataFrame({
    'x': seq_pos,
    'y': residue_sum2
})

# Compute the rolling mean with a window of 10
df1['smoothed_y'] = df1['y'].rolling(window=10, center=True).mean()
df2['smoothed_y'] = df2['y'].rolling(window=10, center=True).mean()

# First group in blue, second group in red
fig = go.Figure()
fig.add_scatter(x=seq_pos_new, y=df1['smoothed_y'], mode="lines", line=dict(color="blue"))
fig.add_scatter(x=seq_pos_new, y=df2['smoothed_y'], mode="lines", line=dict(color="red"))
fig.write_image(f"{output_path}/seperate.png")

In [None]:
# Cluster the sequences by their codon rarity profiles
dendro = ff.create_dendrogram(alignment_value_matrix_filtered, labels=seq_name, orientation='right')
# Tick labels of the dendrogram, i.e. the sequence names in leaf order
heatmap_labels = dendro['layout']['yaxis']['ticktext']

# The heatmap rows must follow the dendrogram's leaf order; otherwise each row
# would be drawn next to the label of a different sequence.
row_of_label = {}
for row_index, label in enumerate(seq_name):
    row_of_label.setdefault(label, row_index)
row_order = [row_of_label[label] for label in heatmap_labels]

# Create and add the heatmap in the second column
heatmap = go.Heatmap(z=alignment_value_matrix_filtered[row_order, :], x=seq_pos_new, y=heatmap_labels,  colorscale="blues")

fig2 = make_subplots(rows=1, cols=2, shared_yaxes=True, shared_xaxes=True,
                    horizontal_spacing=0.01, subplot_titles=("Dendrogram", "Heatmap"))

for data in dendro['data']:
    fig2.add_trace(data, row=1, col=1)

fig2.add_trace(heatmap, row=1, col=2)
fig2.update_layout(width=1000, height=800, showlegend=False)
fig2.write_image(f"{output_path}/codon_rarity_dendrogram_heatmap.png")
fig2.write_html(f"{output_path}/codon_rarity_dendrogram_heatmap.html")

# Phylogenetic tree analysis

In [None]:
tree = Tree(tree_file)
leaf_distances = sum_distances_to_root(tree)

# Pair the two measures by sequence id. Only ids present in both the tree and
# the alignment are used, so a missing id cannot shift the pairing.
shared_ids = sorted(set(leaf_distances) & set(sequence_sum))
unmatched_ids = set(leaf_distances) ^ set(sequence_sum)
if unmatched_ids:
    print(f"Ignoring {len(unmatched_ids)} id(s) not present in both tree and alignment")
x_values = [leaf_distances[key] for key in shared_ids]
y_values = [sequence_sum[key] for key in shared_ids]

x_array = np.array(x_values)
y_array = np.array(y_values)

pearson_corr, _ = stats.pearsonr(x_array, y_array)
spearman_corr, _ = stats.spearmanr(x_array, y_array)
kendall_corr, _ = stats.kendalltau(x_array, y_array)

slope, intercept, r, p, stderr = stats.linregress(x_array, y_array)
print(r)
plt.figure(figsize=(8, 5))
plt.style.use('ggplot')
sns.regplot(x=x_values, y=y_values,
            scatter_kws={'color': 'blue', 'alpha': 0.5}, 
            line_kws={'color': 'navy', })
# The annotation is placed below the figure area (y = -0.1 in figure coordinates)
plt.text(0.02, -0.1, f"Pearson Correlation: {round(pearson_corr, 5)}\nSpearman Correlation: {round(spearman_corr, 5)}\nKendall Correlation: {round(kendall_corr, 5)}", fontsize=12, transform=plt.gcf().transFigure)
plt.title('Divergence of Codon Rarity for IS1 element transposase InsA')
plt.xlabel('Branch Lengths from Root (divergence)')
plt.ylabel('Codon Rarity Score (sum of all residues)')

# bbox_inches="tight" enlarges the saved area so that the annotation below
# the figure is part of the image instead of being cut off.
plt.savefig(f"{output_path}/divergence_codon_rarity.png", bbox_inches="tight")

In [None]:
# Load the tree from the Newick file again (rebinds `tree`)
tree = Tree(tree_file)

# Value per leaf name used for colouring: the per-sequence scores from map_rarity
leaf_values = sequence_sum

# Function to get a color based on the value
def get_color(value, min_val, max_val):
    """Map a value onto a green-to-red hex colour.

    Args:
        value: Value to colour.
        min_val: Value mapped to pure green (``#00ff00``).
        max_val: Value mapped to pure red (``#ff0000``).

    Returns:
        Colour string of the form ``#rrgg00``. Values outside
        ``[min_val, max_val]`` are clamped to the nearest end. When
        ``min_val == max_val`` the green end is returned.
    """
    value_range = max_val - min_val
    if value_range == 0:
        # All values identical: avoid dividing by zero
        norm_val = 0.0
    else:
        # Normalize the value between 0 and 1, clamping so that the hex
        # components always stay within 00..ff
        norm_val = min(1.0, max(0.0, (value - min_val) / value_range))
    # Generate a color from green to red
    r = int(255 * norm_val)
    g = int(255 * (1 - norm_val))
    return f'#{r:02x}{g:02x}00'

# Value range used to normalise the leaf colours
all_leaf_values = list(leaf_values.values())
min_val = min(all_leaf_values)
max_val = max(all_leaf_values)

# Give every leaf that has a value a coloured node; other leaves are left unchanged
for leaf in tree:
    if leaf.name not in leaf_values:
        continue

    style = NodeStyle()
    style["fgcolor"] = get_color(leaf_values[leaf.name], min_val, max_val)
    style["size"] = 10
    leaf.set_style(style)

# Tree style: hide leaf names, branch lengths and branch support values
ts = TreeStyle()
# ts.mode = "c"
for option in ("show_leaf_name", "show_branch_length", "show_branch_support"):
    setattr(ts, option, False)
#ts.scale = 20
#ts.branch_vertical_margin = 50

# Render the tree to an image file
# tree.render('%%inline', w=1400, h=1800, tree_style=ts)
tree.render('peptM7_tree.png', w=1400, h=1800, tree_style=ts)


In [None]:
codon_data = nustrudb

# Nucleotide sequence per nucleotide id. This rebinds `nucleotide_sequences`
# to a dict.
sequences_by_id = codon_data.set_index('nucleotide_id')
nucleotide_sequences = sequences_by_id['nucleotide_sequence'].to_dict()


# Function to extract codons from nucleotide sequences
def extract_codons(nucleotide_sequences):
    """Count every complete codon across a collection of sequences.

    Args:
        nucleotide_sequences: Dict whose values are nucleotide strings; the
            strings are read in frame from position 0 and a trailing
            incomplete triplet is ignored.

    Returns:
        ``Counter`` mapping each codon string to its number of occurrences.
    """
    codon_counter = Counter()
    for sequence in nucleotide_sequences.values():
        # range(..., len - 2, 3) only yields starts of full triplets
        codon_counter.update(
            sequence[start:start + 3] for start in range(0, len(sequence) - 2, 3)
        )
    return codon_counter

codon_counter = extract_codons(nucleotide_sequences=nucleotide_sequences)

# A codon counts as common when it makes up at least 1% of all counted codons;
# every other observed codon is rare.
total_codons = sum(codon_counter.values())
threshold = total_codons * 0.01  # 1% threshold
common_codons = set()
rare_codons = set()
for observed_codon, observed_count in codon_counter.items():
    if observed_count >= threshold:
        common_codons.add(observed_codon)
    else:
        rare_codons.add(observed_codon)

# Step 2: Read alignment and tree data
alignment = AlignIO.read(alignment_file, 'fasta')
tree = Tree(tree_file, format=1)

# Function to map codon rarity
def map_codon_rarity(sequences, common_codons, rare_codons):
    """Label every codon of every sequence as common, rare or unknown.

    Args:
        sequences: Dict mapping a header to its nucleotide string.
        common_codons: Collection of codons considered common.
        rare_codons: Collection of codons considered rare.

    Returns:
        Dict mapping each header to a list with one label per complete codon:
        ``'common'``, ``'rare'`` or ``'unknown'`` (codon in neither
        collection). Membership in ``common_codons`` is checked first.
    """
    def _label(codon):
        if codon in common_codons:
            return 'common'
        if codon in rare_codons:
            return 'rare'
        return 'unknown'

    return {
        header: [_label(seq[start:start + 3]) for start in range(0, len(seq) - 2, 3)]
        for header, seq in sequences.items()
    }

# Common/rare/unknown label for every codon of every nucleotide sequence
codon_rarity_map = map_codon_rarity(
    sequences=nucleotide_sequences,
    common_codons=common_codons,
    rare_codons=rare_codons,
)

# Function to assign rarity status to internal nodes by majority vote over their children
def ancestral_reconstruction(tree, codon_rarity_map):
    """Annotate every node of ``tree`` with a per-codon ``rarity`` list.

    Leaves take their list from ``codon_rarity_map`` (looked up by leaf
    name). Leaves that are missing from the map get a list of ``'unknown'``
    with the length of the first entry of the map. Internal nodes are then
    filled bottom-up: at every codon position the label carried by most
    children is chosen. Ties go to the label seen first among the children,
    which keeps the result reproducible between runs.

    Args:
        tree: Tree providing ``iter_leaves()`` and ``traverse("postorder")``;
            nodes provide ``is_leaf()``, ``get_children()``, ``add_feature()``
            and ``name``.
        codon_rarity_map: Dict mapping a leaf name to its list of labels.

    Returns:
        None. The nodes are modified in place.
    """
    # Length of the placeholder for unmapped leaves (0 when the map is empty)
    n_codons = len(next(iter(codon_rarity_map.values()), []))

    for leaf in tree.iter_leaves():
        leaf.add_feature("rarity", codon_rarity_map.get(leaf.name, ['unknown'] * n_codons))

    def _majority(column):
        # max() over the Counter keeps first-seen order for equal counts
        counts = Counter(column)
        return max(counts, key=counts.get)

    def reconcile_rarity(node):
        if node.is_leaf():
            return
        child_rarities = [child.rarity for child in node.get_children() if hasattr(child, "rarity")]
        if child_rarities:
            # zip stops at the shortest child list
            node.add_feature("rarity", [_majority(column) for column in zip(*child_rarities)])

    # Postorder visits children before their parent
    for node in tree.traverse("postorder"):
        reconcile_rarity(node)

# Annotate all nodes of the tree with their codon rarity lists (modifies the tree in place)
ancestral_reconstruction(tree=tree, codon_rarity_map=codon_rarity_map)

def layout(node):
    """Layout function that adds name and rarity faces to a tree node.

    Leaves get their ``name`` attribute as a face in column 0. Every node
    that has a ``rarity`` attribute gets its labels, joined by commas, as a
    text face in column 1. Both faces are placed at ``branch-right``.
    """
    if node.is_leaf():
        name_face = AttrFace("name", fsize=10)
        faces.add_face_to_node(name_face, node, column=0, position="branch-right")

    if not hasattr(node, "rarity"):
        return

    rarity_text = ','.join(node.rarity)
    faces.add_face_to_node(faces.TextFace(rarity_text, fsize=8), node, column=1, position="branch-right")

# Tree style that draws nodes with the custom layout function and a title
ts = TreeStyle()
ts.layout_fn = layout
ts.show_leaf_name = False  # leaf names are drawn by the layout function instead
title_face = faces.TextFace("Phylogenetic Tree with Codon Rarity", fsize=12)
ts.title.add_face(title_face, column=0)

tree.render('%%inline', w=1400, h=1800, tree_style=ts)

# Calculate and map the Codon conservation at each position of a multiple sequence alignment

### Function to calculate the codon conservation at each position of a multiple sequence alignment:
CCS = Codon Conservation Score,
$$ CCS_{position} = ? $$