In [2]:
import pandas as pd
import numpy as np

import uuid
import os

from Bio import AlignIO
from Bio import SeqIO
from Bio import AlignIO

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

from ete3 import Tree, TreeStyle, NodeStyle

# Calculate and map the Codon rarity at each position of a multiple sequence alignment

##### The calculations follow the other script: the codon rarity score is calculated on the MSA and normalized by the number of synonymous codons. Gaps in the alignment and the alignment length are taken into account. Unlike in the other script, the codon rarity is not analyzed per position here but for the whole sequence: the codon rarities of a sequence are summed and divided by the length of the alignment (gap positions contribute zero). The tree that is based on the alignment is then used to map the codon rarity scores to its leaves, and the phylogenetic tree is colored by the Codon Rarity Score values.

### Set data to analyse

In [5]:
savefig = True # define if figures should be saved

# assign the paths to the files need for the analysis
alignment_file = "/Users/dominiquefastus/master_project/NuStru/Example/examples_family/example_fam1_aln.fasta" 
nustrudb =  "/Users/dominiquefastus/master_project/NuStru/Example/examples_family/example_fam1_nustrudb.csv"
tree_file = "/Users/dominiquefastus/master_project/NuStru/Example/examples_family/example_fam1_tree.tree.rooted"
# NOTE(review): this path starts at the filesystem root, unlike the input paths above - confirm it is intended
output_path = "/Example/examples_family/"

# get a working name from the input file to use in the output files
# (this is the file name of the alignment including its extension)
working_name = os.path.basename(alignment_file)

nt_fasta_file = None  # nucleotide sequences in fasta format (optional, so no extra file will be created)

# for the temporary created files, like the nucleotide sequences, create a unique job_id
job_id = uuid.uuid4()

# read the nucleotide structure database as csv
# (from here on `nustrudb` is the DataFrame, no longer the path)
nustrudb = pd.read_csv(nustrudb)

# read the protein fasta alignment and nucleotide fasta sequences if provided
protein_alignment = AlignIO.read(alignment_file, "fasta")
if nt_fasta_file is not None:
    nucleotide_sequences = SeqIO.parse(nt_fasta_file, "fasta")
else:
    # else create a temporary file with the nucleotide sequences corresponding to the protein sequences
    # the file is opened once; errors such as a missing output directory or a missing
    # column are no longer swallowed, only records without a database entry are skipped
    with open(f"{output_path}/{job_id}_nucleotide_sequences.fasta", "a") as f:
        for record in protein_alignment:
            # look up the database row for this alignment record a single time
            matching_rows = nustrudb.loc[nustrudb["primary_id"] == record.id]
            if matching_rows.empty:
                # best effort: records that are not in the nustrudb are left out
                continue

            # get the nucleotide id and sequence from the nustrudb
            # and write the id and sequence to the temporary file in the defined output path
            nucleotide_id = matching_rows["nucleotide_id"].values[0]
            nucleotide_sequence = matching_rows["nucleotide_sequence"].values[0]
            f.write(f">{nucleotide_id}\n{nucleotide_sequence}\n")
    
    # then read the temporary created nucleotide sequences
    # (SeqIO.parse returns an iterator, so the records can only be consumed once)
    nucleotide_sequences = SeqIO.parse(f"{output_path}/{job_id}_nucleotide_sequences.fasta", "fasta")

### Define functions

In [None]:
# note some of the functions are from the msa_codon_structure notebook, they are copied here for the analysis
# could be put in a separate file and imported...

def fasta_to_array(fasta, align_to=None, codon=False):
    """Convert sequence records into an array of residues or codons.

    Parameters
    ----------
    fasta : iterable
        Records exposing a ``seq`` attribute (e.g. the result of
        ``SeqIO.parse`` or an alignment object). It is iterated exactly once.
    align_to : numpy.ndarray, optional
        Character array of an alignment. For every ``'-'`` found at
        ``(row, column)`` a ``'---'`` entry is inserted at ``column`` of the
        sequence in ``row``. Row ``i`` of ``align_to`` is paired with the
        ``i``-th record of ``fasta``. Only usable together with ``codon=True``,
        because the insertion needs list rows.
    codon : bool, default False
        If True every sequence is split into consecutive chunks of three
        characters; otherwise it is split into single characters.

    Returns
    -------
    numpy.ndarray or list
        * ``codon=False``: array of single characters.
        * ``codon=True`` without ``align_to``: list of codon lists.
        * ``codon=True`` with ``align_to``: array of codons and ``'---'`` gap
          entries with the last column removed.
    """
    if codon:
        # split every nucleotide sequence into triplets; a trailing chunk that is
        # shorter than three characters is kept as it is
        all_seqs = []
        for record in fasta:
            nucleotides = str(record.seq)
            all_seqs.append([nucleotides[start:start + 3] for start in range(0, len(nucleotides), 3)])
    else:
        # one character per cell, this can always be done for protein sequences
        all_seqs = np.array([list(str(record.seq)) for record in fasta])

    if align_to is not None:
        # np.where reports the gap coordinates in row-major order, so the columns of
        # one row arrive in ascending order and each insert lands on its final index
        gap_rows, gap_columns = np.where(align_to == '-')
        for row, column in zip(gap_rows, gap_columns):
            # since we align the codons we use 3 dashes for a gap to show nucleotide triplets
            all_seqs[row].insert(column, '---')

        # convert the aligned sequences with gaps to a numpy array
        all_seqs = np.array(all_seqs)
        # drop the last column (the stop codons, as no amino acid is assigned to them)
        all_seqs = np.delete(all_seqs, -1, axis=1)

    return all_seqs

def cub_msa_table(prot_seq_arr=None, cod_seq_arr=None):
    """Create a codon usage bias table for the multiple sequence alignment.

    For every codon the value is
    ``(codon count / amino acid count) / number of synonymous codons``,
    rounded to 6 decimal places.

    Parameters
    ----------
    prot_seq_arr : numpy.ndarray
        Array of one-letter amino acids; used to count each amino acid.
    cod_seq_arr : numpy.ndarray
        Array of codon strings; used to count each codon.

    Returns
    -------
    dict
        ``{amino_acid: {codon: frequency}}`` for the 20 amino acids (stop
        codons are ignored). Amino acids that do not occur in
        ``prot_seq_arr`` get a frequency of 0.0 for all of their codons.
    """
    cub_table = {
    # '*': {'TAA': None, 'TAG': None, 'TGA': None}, ignoring stop codons
    'A': {'GCA': None, 'GCC': None, 'GCG': None, 'GCT': None},
    'C': {'TGC': None, 'TGT': None},
    'D': {'GAC': None, 'GAT': None},
    'E': {'GAA': None, 'GAG': None},
    'F': {'TTC': None, 'TTT': None},
    'G': {'GGA': None, 'GGC': None, 'GGG': None, 'GGT': None},
    'H': {'CAC': None, 'CAT': None},
    'I': {'ATA': None, 'ATC': None, 'ATT': None},
    'K': {'AAA': None, 'AAG': None},
    'L': {'CTA': None, 'CTC': None, 'CTG': None, 'CTT': None, 'TTA': None, 'TTG': None},
    'M': {'ATG': None},
    'N': {'AAC': None, 'AAT': None},
    'P': {'CCA': None, 'CCC': None, 'CCG': None, 'CCT': None},
    'Q': {'CAA': None, 'CAG': None},
    'R': {'AGA': None, 'AGG': None, 'CGA': None, 'CGC': None, 'CGG': None, 'CGT': None},
    'S': {'AGC': None, 'AGT': None, 'TCA': None, 'TCC': None, 'TCG': None, 'TCT': None},
    'T': {'ACA': None, 'ACC': None, 'ACG': None, 'ACT': None},
    'V': {'GTA': None, 'GTC': None, 'GTG': None, 'GTT': None},
    'W': {'TGG': None},
    'Y': {'TAC': None, 'TAT': None}}
    
    for aa, codons in cub_table.items():
        # n_AA is the number of occurences of the amino acid in the protein sequences
        n_AA = np.count_nonzero(prot_seq_arr == aa)
                
        # nc_AA is the number of synonymous codons for the amino acid
        nc_AA = len(codons)
        
        for codon in codons:
            if n_AA == 0:
                # the amino acid does not occur in the alignment: there is nothing to
                # divide by, so its codons carry no usage information
                codons[codon] = 0.0
                continue

            # nc is the number of occurences of the codon in the sequences
            nc = np.count_nonzero(cod_seq_arr == codon)
            
            # fc is the frequency of the codon in the alignment: the codon count divided by
            # the amino acid count, normalized by the number of synonymous codons
            fc = (nc / n_AA) * 1 / nc_AA

            # round the frequency to 6 decimal places
            codons[codon] = round(fc, 6)
        
    return cub_table

def map_rarity(protein_alignment, nustrudb, cu_table):
    """Map codon rarity values onto the alignment and average them per sequence.

    Parameters
    ----------
    protein_alignment
        Alignment object that supports ``len()``, iteration over records with
        an ``id``, integer indexing and column access via ``[:, position]``.
    nustrudb : pandas.DataFrame
        Table with the columns ``primary_id`` and ``nucleotide_sequence``;
        ``primary_id`` is matched against the record ids of the alignment.
    cu_table : dict
        ``{amino_acid: {codon: value}}`` as returned by ``cub_msa_table``.

    Returns
    -------
    dict
        ``{record id: sum of the codon values / alignment length}``, rounded
        to 5 decimal places. Gap positions contribute 0 to the sum.

    Raises
    ------
    KeyError
        If a residue or its codon is not contained in ``cu_table``.
    IndexError
        If a sequence with at least one residue has no row in ``nustrudb``.
    """
    alignment_length = len(protein_alignment[0])
    seq_name = [seq.id for seq in protein_alignment] # get the sequence names or ids from the alignment

    # create an empty matrix to store the rarity values (gaps keep the initial 0)
    alignment_value_matrix = np.zeros((len(protein_alignment), alignment_length))

    # number of gaps seen so far for each sequence; needed to translate an alignment
    # column into the index of the residue in the ungapped sequence
    pos_count_dict = dict.fromkeys(seq_name, 0)

    # nucleotide sequence per id, filled on first use so the DataFrame is filtered
    # once per sequence instead of once per residue
    nucleotide_by_id = {}

    # loop over the columns of the alignment
    for position in range(alignment_length):
        # loop over the amino acids of the column together with their sequence ids
        for i, (aa, seq) in enumerate(zip(protein_alignment[:, position], seq_name)):
            if aa == '-':
                # gaps have no information, only count them for this sequence
                pos_count_dict[seq] += 1
                continue

            if seq not in nucleotide_by_id:
                nucleotide_by_id[seq] = nustrudb[nustrudb["primary_id"] == seq]["nucleotide_sequence"].values[0]
            sequence = nucleotide_by_id[seq]

            # residue index without gaps, times 3 to address the codon
            codon_start = (position - pos_count_dict[seq]) * 3
            # store the value at the equivalent position in the alignment value matrix
            alignment_value_matrix[i, position] = cu_table[aa][sequence[codon_start:codon_start + 3].upper()]
        
    # mean rarity per sequence: row sum divided by the alignment length
    sequence_mean = {}
    for row_sum, name in zip(np.sum(alignment_value_matrix, axis=1), seq_name):
        sequence_mean[name] = round((row_sum / alignment_length), 5)
    
    return sequence_mean

def sum_distances_to_root(tree):
    """Return the summed branch length between every leaf and the root.

    The result maps each leaf name to the sum of the ``dist`` values on the
    path from the leaf up to (not including) the root, rounded to 5 decimals.
    """
    def _path_length(node):
        # climb towards the root and add up the branch lengths on the way
        total = 0
        while not node.is_root():
            total += node.dist
            node = node.up
        return total

    return {leaf.name: round(_path_length(leaf), 5) for leaf in tree.iter_leaves()}

### Get the mean Codon Rarity Score for each sequence

In [None]:
# convert the fasta files to numpy arrays
# split the nucleotide sequences into codons and align them to the protein sequences
all_seqs_protein = fasta_to_array(protein_alignment)
all_seqs_nt = fasta_to_array(fasta=nucleotide_sequences, align_to=all_seqs_protein, codon=True)

# calculate the codon usage table based on the protein and nucleotide sequences
# the table gets its own name: binding it to `cub_msa_table` would replace the function
# with its result, so the function could not be called again afterwards
codon_usage_table = cub_msa_table(prot_seq_arr=all_seqs_protein, cod_seq_arr=all_seqs_nt)

# map the rarity values onto the protein alignment and average them per sequence
sequence_mean = map_rarity(protein_alignment, nustrudb, codon_usage_table)

# See relationship between codon rarity and evolution in phylogenetic tree

### Define functions for plotting

In [None]:
def get_color(value, min_val, max_val):
    """Map a value onto a green-to-red hex color.

    Parameters
    ----------
    value : float
        Value to color.
    min_val, max_val : float
        Range used for the normalization; ``min_val`` maps to green
        (``#00ff00``) and ``max_val`` to red (``#ff0000``).

    Returns
    -------
    str
        Color as ``#rrgg00``. Values outside of the range are clamped to the
        nearest end. If ``min_val`` equals ``max_val`` there is no gradient
        and the color of the lower end (green) is returned.
    """
    value_range = max_val - min_val
    if value_range == 0:
        # all values are identical, avoid dividing by zero
        norm_val = 0.0
    else:
        # normalize the value between 0 and 1
        norm_val = (value - min_val) / value_range

    # keep the channels within 0..255, otherwise the hex string would be malformed
    norm_val = min(1.0, max(0.0, norm_val))

    # lower values are green, higher values are red
    r = int(255 * norm_val)
    g = int(255 * (1 - norm_val))
    return f'#{r:02x}{g:02x}00'

def compute_mean_value(node, leaf_values):
    """Average the values of all leaves below a node.

    Only leaves whose name is a key of ``leaf_values`` are considered.
    Returns ``None`` when none of the leaves has a value.
    """
    collected = []
    for descendant in node.get_leaves():
        # leaves without a known value are left out of the mean
        if descendant.name in leaf_values:
            collected.append(leaf_values[descendant.name])

    if not collected:
        return None
    return sum(collected) / len(collected)

### Regression analysis for the rarity values and the distances to the root of the tree (rarity over divergence)

In [None]:
# load the phylogenetic tree
tree = Tree(tree_file)

# calculate the distances of the leaves to the root of the tree
leaf_distances = sum_distances_to_root(tree)

# pair distance and rarity by id: only ids that are present in both the tree and the
# alignment are used, so that x and y always describe the same sequence
shared_ids = sorted(set(leaf_distances) & set(sequence_mean))
unmatched_ids = set(leaf_distances) ^ set(sequence_mean)
if unmatched_ids:
    print(f"Skipping {len(unmatched_ids)} ids that are missing from either the tree or the alignment\n")

x_values = [leaf_distances[key] for key in shared_ids] # distance to the root for each shared id
y_values = [sequence_mean[key] for key in shared_ids] # rarity value for each shared id

# convert to numpy arrays
x_array = np.array(x_values)
y_array = np.array(y_values)

# calculate pearson correlation
pearson_corr, _ = stats.pearsonr(x_array, y_array)

# perform linear regression
slope, intercept, r, p, stderr = stats.linregress(x_array, y_array)

# determine min and max values for normalization
min_val = min(sequence_mean.values())
max_val = max(sequence_mean.values())

# create a list of colors based on the rarity values
colors = [get_color(value, min_val, max_val) for value in y_values]

plt.figure(figsize=(8, 5))
plt.style.use('ggplot')

sns.regplot(x=x_values, y=y_values, 
            scatter_kws={'color': colors, 'alpha': 0.4},  
            line_kws={'color': 'navy',})

# print and save the pearson correlation
print(f"Pearson Correlation: {round(pearson_corr, 5)}\n")
with open(f"{output_path}/{working_name}_divergence_codon_rarity_correlations.txt", "w") as f:
    f.write(f"Pearson Correlation: {round(pearson_corr, 5)}\n")

plt.xlabel('Branch Lengths from Root (divergence)')
plt.ylabel('Codon Rarity Score (sum of all residues)')
plt.ylim(0, 1)

if savefig:
    plt.savefig(f"{output_path}/{working_name}_divergence_codon_rarity.png", dpi=600)

### Circular phylogenetic tree colored by codon rarity over the whole sequence

In [None]:
# the leaf values are the rarity values for the sequences
leaf_values = sequence_mean

# get the min and max values for normalization
min_val = min(leaf_values.values())
max_val = max(leaf_values.values())

# color for internal nodes that have no leaf with a rarity value below them
fallback_color = "black"

# loop over the nodes in the tree and set the style based on the rarity values
for node in tree.traverse():
    if node.is_leaf():
        # leaves are colored by their own rarity value
        # leaves without a rarity value keep their current style
        if node.name in leaf_values:
            value = leaf_values[node.name]
            color = get_color(value, min_val, max_val)
            
            lstyle = NodeStyle()
            lstyle["fgcolor"] = color
            lstyle["size"] = 0
            lstyle["vt_line_color"] = color 
            lstyle["hz_line_color"] = color
            lstyle["vt_line_width"] = 0.2
            lstyle["hz_line_width"] = 0.2
            
            node.set_style(lstyle)
    else:
        # internal nodes are colored by the mean value of their leaf descendants
        mean_value = compute_mean_value(node, leaf_values)
        if mean_value is not None:
            color = get_color(mean_value, min_val, max_val)
        else:
            # without this branch `color` would be undefined or still hold the
            # color of the previously visited node
            color = fallback_color
            
        nstyle = NodeStyle()
        nstyle["size"] = 0
        
        nstyle["vt_line_color"] = color
        nstyle["hz_line_color"] = color
        nstyle["vt_line_width"] = 0.2
        nstyle["hz_line_width"] = 0.2
        
        node.set_style(nstyle)

# apply a specific style to the root node
# this replaces the style that the loop above assigned to the root
rstyle = NodeStyle()
rstyle["fgcolor"] = "black"
rstyle["shape"] = "square"
rstyle["size"] = 0
tree.set_style(rstyle)
                    
# tree style for circular tree
# no branch lengths, no leaf names, no support values
ts = TreeStyle()
ts.mode = "c"
ts.show_leaf_name = False
ts.show_branch_length = False
ts.show_branch_support = False
ts.root_opening_factor = 0.1

if savefig:
    tree.render(f"{output_path}/{working_name}_tree_codon_rarity.png", w=1400, h=1600, tree_style=ts, dpi=600)
tree.render("%%inline", w=1400, h=1600, tree_style=ts)