In [None]:
from Bio import AlignIO
from Bio import SeqIO
import pandas as pd
import numpy as np
import uuid
import os

from ete3 import Tree

from pycirclize import Circos

from plotly.subplots import make_subplots
import plotly.graph_objects as go


import matplotlib.pyplot as plt
import seaborn as sns

# Calculate and map the Codon rarity at each position of a multiple sequence alignment

### Function to calculate the codon rarity at each position of a multiple sequence alignment:
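CRS = Codon Rarity Score, AA = Amino Acid, occ = Occurrence, f_c = Frequency of codon, len = Length, aln = Alignment, gaps = Gaps, n_aln = Number of sequences in the alignment, n_c = Number of occurrences of the codon in the alignment, n_AA = Number of occurrences of the amino acid in the alignment, n_cAA = Number of synonymous codons for the amino acid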
$$ CRS_{position}= {\sum \limits _{AA_{AA}} ^{n_{aln}}(\sum \limits _{occ=1} ^{n_{aln}} {AA_{occ}} * f_c) \over {len_{total}(alignment)} - (gaps)} $$
$$ f_c = { \sum n_c \over \sum n_{AA} } * { 1 \over n_{cAA} } $$

## Set data to analyse

In [None]:
# --- Input configuration -----------------------------------------------------
# Protein multiple sequence alignment (FASTA); defines the rows/columns of every matrix below.
alignment_file = "/Users/dominiquefastus/Downloads/nustruTREE/MSA/peptM7_nustru_secstru_filtered_protein_aligned.fasta"
# Optional FASTA with the coding sequences; when None they are taken from the nustruDB table.
nt_fasta_file = None
# CSV table read below for the columns primary_id, nucleotide_id and nucleotide_sequence.
nustruDB = "/Users/dominiquefastus/master_project/Data/protFAMS/peptM7/peptM7_nustru_secstru_filtered.csv"
tree_file = "/Users/dominiquefastus/master_project/NuStru/nustruEVOL/Tools/rerooted_tree_file.nwk"
output_path = "/Users/dominiquefastus/master_project/NuStru/nustruEVOL/"
working_name = os.path.basename(alignment_file)
family_name = None

# Unique id so that the generated nucleotide FASTA never collides with an earlier run.
job_id = uuid.uuid4()

nustrudb = pd.read_csv(nustruDB)
protein_alignment = AlignIO.read(alignment_file, "fasta")
if nt_fasta_file is not None:
    nucleotide_sequences = SeqIO.parse(nt_fasta_file, "fasta")
else:
    nucleotide_fasta_path = f"{output_path}/{job_id}_nucleotide_sequences.fasta"
    missing_ids = []

    # Open the output once instead of re-opening it in append mode for every record.
    with open(nucleotide_fasta_path, "w") as f:
        for record in protein_alignment:
            matches = nustrudb.loc[nustrudb["primary_id"] == record.id]
            if matches.empty:
                # Keep the best-effort behaviour (skip), but remember what was skipped.
                missing_ids.append(record.id)
                continue
            nucleotide_id = matches["nucleotide_id"].values[0]
            nucleotide_sequence = matches["nucleotide_sequence"].values[0]
            f.write(f">{nucleotide_id}\n{nucleotide_sequence}\n")

    if missing_ids:
        # A skipped record shifts every following nucleotide row relative to the
        # protein alignment, so make the problem visible instead of hiding it.
        print(f"Warning: {len(missing_ids)} aligned sequence(s) have no entry in nustruDB "
              f"and were skipped: {missing_ids}")

    nucleotide_sequences = SeqIO.parse(nucleotide_fasta_path, "fasta")

## Define functions

In [None]:
def fasta_to_array(fasta, align_to=None, codon=False):
    """Convert sequence records into an array of residues or codons.

    Parameters
    ----------
    fasta : iterable
        Records exposing a ``seq`` attribute (anything ``str()`` can turn into
        the sequence text). Iterated exactly once.
    align_to : array-like, optional
        Character array of the aligned proteins (one row per record, ``'-'``
        for gaps). When given, a ``'---'`` codon is inserted into row *r* at
        column *c* for every gap at ``align_to[r, c]`` and the last column of
        the result is removed (the stop codon, which has no aligned residue).
        Rows are matched to ``align_to`` purely by order.
    codon : bool, default False
        Split every sequence into consecutive triplets instead of single
        characters.

    Returns
    -------
    numpy.ndarray or list
        ``numpy.ndarray`` of characters when ``codon`` is False, or of codons
        when ``align_to`` is given. With ``codon=True`` and no ``align_to`` the
        plain list of codon lists is returned.

    Raises
    ------
    ValueError
        If the number of records differs from the number of rows in
        ``align_to``, or if the rows do not share one length after gap
        insertion (e.g. a coding sequence without terminal stop codon).
    """
    if codon:
        all_seqs = []
        for record in fasta:
            nucleotides = str(record.seq)
            all_seqs.append([nucleotides[i:i + 3] for i in range(0, len(nucleotides), 3)])
    else:
        all_seqs = np.array([list(str(record.seq)) for record in fasta])

    if align_to is not None:
        align_to = np.asarray(align_to)
        rows = [list(row) for row in all_seqs]

        # Rows are paired with the alignment by position only, so a missing
        # record would silently shift every following sequence.
        if len(rows) != align_to.shape[0]:
            raise ValueError(
                f"got {len(rows)} sequences but the alignment has {align_to.shape[0]} rows"
            )

        # np.where yields gaps row by row with ascending columns, so inserting
        # in that order keeps later column indices valid.
        gap_rows, gap_cols = np.where(align_to == '-')
        for row_index, col_index in zip(gap_rows, gap_cols):
            rows[row_index].insert(col_index, '---')

        if len({len(row) for row in rows}) > 1:
            raise ValueError(
                "sequences have different lengths after gap insertion; check that every "
                "nucleotide sequence matches its aligned protein (including the stop codon)"
            )

        # deleting stop codons as no protein assigned to them
        all_seqs = np.delete(np.array(rows), -1, axis=1)

    return all_seqs

def filter_columns_by_gap_threshold(seq_arr, threshold=0.5):
    """Drop the columns of a score matrix that consist mostly of gaps.

    A cell counts as a gap when its value equals 0. A column is kept when its
    share of gap cells is at most ``threshold``.

    Parameters
    ----------
    seq_arr : numpy.ndarray
        2-D matrix (rows x columns) of scores.
    threshold : float, default 0.5
        Highest gap fraction a column may have and still be kept.

    Returns
    -------
    tuple
        ``(filtered_array, deleted_columns)``: the matrix restricted to the
        kept columns and the list of removed column indices in ascending order.
    """
    n_rows, n_cols = seq_arr.shape[0], seq_arr.shape[1]

    # Fraction of zero cells per column, computed for all columns at once.
    gap_fraction = np.sum(seq_arr == 0, axis=0) / n_rows
    keep_mask = gap_fraction <= threshold

    kept_columns = np.flatnonzero(keep_mask).tolist()
    dropped_columns = np.flatnonzero(~keep_mask).tolist()

    return seq_arr[:, kept_columns], dropped_columns

def cub_msa_table(prot_seq_arr=None, cod_seq_arr=None):
    """Build the codon usage table of an alignment.

    For every codon the value is ``(n_c / n_AA) * 1 / n_cAA`` where ``n_c`` is
    how often the codon occurs in ``cod_seq_arr``, ``n_AA`` how often its amino
    acid occurs in ``prot_seq_arr`` and ``n_cAA`` the number of synonymous
    codons of that amino acid. Values are rounded to 6 decimals.

    Parameters
    ----------
    prot_seq_arr : numpy.ndarray
        Array of one-letter amino acids (gaps are simply never counted).
    cod_seq_arr : numpy.ndarray
        Array of upper-case codons.

    Returns
    -------
    dict
        ``{amino_acid: {codon: frequency}}`` for the 20 standard amino acids.
        Amino acids that do not occur in the alignment get frequency 0.0 for
        all of their codons (the original code raised ZeroDivisionError here).
    """
    synonymous_codons = {
        # '*': ('TAA', 'TAG', 'TGA') -- stop codons are ignored
        'A': ('GCA', 'GCC', 'GCG', 'GCT'),
        'C': ('TGC', 'TGT'),
        'D': ('GAC', 'GAT'),
        'E': ('GAA', 'GAG'),
        'F': ('TTC', 'TTT'),
        'G': ('GGA', 'GGC', 'GGG', 'GGT'),
        'H': ('CAC', 'CAT'),
        'I': ('ATA', 'ATC', 'ATT'),
        'K': ('AAA', 'AAG'),
        'L': ('CTA', 'CTC', 'CTG', 'CTT', 'TTA', 'TTG'),
        'M': ('ATG',),
        'N': ('AAC', 'AAT'),
        'P': ('CCA', 'CCC', 'CCG', 'CCT'),
        'Q': ('CAA', 'CAG'),
        'R': ('AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGT'),
        'S': ('AGC', 'AGT', 'TCA', 'TCC', 'TCG', 'TCT'),
        'T': ('ACA', 'ACC', 'ACG', 'ACT'),
        'V': ('GTA', 'GTC', 'GTG', 'GTT'),
        'W': ('TGG',),
        'Y': ('TAC', 'TAT'),
    }

    cub_table = {}
    for aa, codons in synonymous_codons.items():
        n_AA = np.count_nonzero(prot_seq_arr == aa)
        nc_AA = len(codons)

        cub_table[aa] = {}
        for codon in codons:
            nc = np.count_nonzero(cod_seq_arr == codon)

            # An amino acid missing from the alignment has no usable frequency.
            fc = (nc / n_AA) * 1 / nc_AA if n_AA else 0.0

            cub_table[aa][codon] = round(fc, 6)

    return cub_table

def map_rarity(protein_alignment, nustrudb, cu_table):
    """Map codon usage frequencies onto every cell of a protein alignment.

    Parameters
    ----------
    protein_alignment
        Alignment supporting ``len()``, iteration over records with an ``id``,
        ``alignment[0]`` and column access ``alignment[:, position]``.
    nustrudb : pandas.DataFrame
        Table with the columns ``primary_id`` and ``nucleotide_sequence``; the
        first row matching a record id supplies its coding sequence.
    cu_table : dict
        ``{amino_acid: {codon: frequency}}`` as returned by ``cub_msa_table``.

    Returns
    -------
    tuple
        ``(alignment_value_matrix, seq_name, seq_pos, residue_sum,
        residue_max, sequence_sum)`` where the matrix holds the codon frequency
        per cell (0 for gaps), ``residue_sum`` is the column sum divided by the
        number of sequences, ``residue_max`` the column maximum and
        ``sequence_sum`` maps each id to its row sum divided by the alignment
        length (rounded to 5 decimals).

    Raises
    ------
    IndexError
        If a non-gap residue belongs to an id that is not in ``nustrudb``.
    KeyError
        If a residue/codon pair is not present in ``cu_table``.
    """
    seq_name = [record.id for record in protein_alignment]
    n_positions = len(protein_alignment[0])
    seq_pos = list(range(n_positions))
    alignment_value_matrix = np.zeros((len(protein_alignment), n_positions))

    # Gaps seen so far per row; indexed by row so duplicate ids cannot share a counter.
    gaps_seen = [0] * len(seq_name)
    # Coding sequences are looked up once per id instead of once per matrix cell.
    coding_sequences = {}

    for position in range(n_positions):
        column = protein_alignment[:, position]
        for i, (aa, seq) in enumerate(zip(column, seq_name)):
            if aa == '-':
                # Gap cells keep their initial 0.
                gaps_seen[i] += 1
                continue

            if seq not in coding_sequences:
                coding_sequences[seq] = nustrudb[nustrudb["primary_id"] == seq]["nucleotide_sequence"].values[0]

            # Index of this residue in the ungapped protein == index of its codon.
            codon_index = position - gaps_seen[i]
            codon = coding_sequences[seq][codon_index * 3:codon_index * 3 + 3].upper()
            alignment_value_matrix[i, position] = cu_table[aa][codon]

    # NOTE(review): the notebook's formula divides by (alignment size - gaps),
    # whereas this divides by all sequences, gaps included - confirm which is intended.
    residue_sum = list(np.sum(alignment_value_matrix, axis=0) / len(seq_name))
    residue_max = list(np.max(alignment_value_matrix, axis=0))

    sequence_sum = {}
    for row_total, name in zip(np.sum(alignment_value_matrix, axis=1), seq_name):
        sequence_sum[name] = round((row_total / len(alignment_value_matrix[0])), 5)

    return alignment_value_matrix, seq_name, seq_pos, residue_sum, residue_max, sequence_sum

def sum_distances_to_root(tree):
    """Return the summed branch length from every leaf up to the root.

    Parameters
    ----------
    tree
        Tree node offering ``iter_leaves()``; every node needs ``is_root()``,
        ``dist``, ``up`` and (for leaves) ``name``.

    Returns
    -------
    dict
        ``{leaf name: distance}`` with the distance rounded to 5 decimals.
    """
    def _path_length(node):
        # Add up branch lengths while climbing until the root is reached.
        total = 0
        while True:
            if node.is_root():
                return total
            total += node.dist
            node = node.up

    return {leaf.name: round(_path_length(leaf), 5) for leaf in tree.iter_leaves()}

## Transform alignment and sequences to arrays and align the gaps if necessary

In [None]:
# Aligned proteins as a character array (one row per sequence, gaps as '-').
all_seqs_protein = fasta_to_array(protein_alignment)
# Coding sequences split into codons, with '---' inserted wherever the protein
# alignment has a gap and the final (stop) codon column removed.
# NOTE(review): rows are matched to the protein alignment by order only, and
# nucleotide_sequences comes from SeqIO.parse - presumably a one-shot iterator,
# so re-running this cell alone may see no records; confirm.
all_seqs_nt = fasta_to_array(fasta=nucleotide_sequences, align_to=all_seqs_protein, codon=True)

In [None]:
# Sanity check: dimensions of the protein alignment array.
print(all_seqs_protein.shape)

## Calculate the codon frequency for each amino acid based on the alignment

#### $$ f_c = { \sum n_c \over \sum n_{AA} } * { 1 \over n_{cAA} } $$

In [None]:
def cub_msa_table(prot_seq_arr=None, cod_seq_arr=None):
    """Build the codon usage table of an alignment.

    For every codon the value is ``(n_c / n_AA) * 1 / n_cAA`` (see the formula
    in the heading of this section), rounded to 5 decimals.

    Parameters
    ----------
    prot_seq_arr : numpy.ndarray
        Array of one-letter amino acids.
    cod_seq_arr : numpy.ndarray
        Array of upper-case codons.

    Returns
    -------
    dict
        ``{amino_acid: {codon: frequency}}`` for the 20 standard amino acids.
        Amino acids that never occur in ``prot_seq_arr`` get 0.0 for all of
        their codons (the original code raised ZeroDivisionError here).
    """
    cub_table = {
    # '*': {'TAA': None, 'TAG': None, 'TGA': None}, ignoring stop codons
    'A': {'GCA': None, 'GCC': None, 'GCG': None, 'GCT': None},
    'C': {'TGC': None, 'TGT': None},
    'D': {'GAC': None, 'GAT': None},
    'E': {'GAA': None, 'GAG': None},
    'F': {'TTC': None, 'TTT': None},
    'G': {'GGA': None, 'GGC': None, 'GGG': None, 'GGT': None},
    'H': {'CAC': None, 'CAT': None},
    'I': {'ATA': None, 'ATC': None, 'ATT': None},
    'K': {'AAA': None, 'AAG': None},
    'L': {'CTA': None, 'CTC': None, 'CTG': None, 'CTT': None, 'TTA': None, 'TTG': None},
    'M': {'ATG': None},
    'N': {'AAC': None, 'AAT': None},
    'P': {'CCA': None, 'CCC': None, 'CCG': None, 'CCT': None},
    'Q': {'CAA': None, 'CAG': None},
    'R': {'AGA': None, 'AGG': None, 'CGA': None, 'CGC': None, 'CGG': None, 'CGT': None},
    'S': {'AGC': None, 'AGT': None, 'TCA': None, 'TCC': None, 'TCG': None, 'TCT': None},
    'T': {'ACA': None, 'ACC': None, 'ACG': None, 'ACT': None},
    'V': {'GTA': None, 'GTC': None, 'GTG': None, 'GTT': None},
    'W': {'TGG': None},
    'Y': {'TAC': None, 'TAT': None}}

    for aa, codon_frequencies in cub_table.items():

        # total number of occurrences of the amino acid in the alignment
        n_AA = np.count_nonzero(prot_seq_arr == aa)

        # number of synonymous codons for the amino acid
        nc_AA = len(codon_frequencies)

        for codon in codon_frequencies:

            # number of occurrences of this codon in the alignment
            nc = np.count_nonzero(cod_seq_arr == codon)

            # calculate the frequency of the codon; an amino acid that is absent
            # from the alignment has no usable frequency, so it is set to 0
            fc = (nc / n_AA) * 1 / nc_AA if n_AA else 0.0

            # round the frequency to 5 decimal places and assign it to the codon usage bias table
            codon_frequencies[codon] = round(fc, 5)

    return cub_table
        
# Codon usage table computed from the aligned proteins and their codons.
cub_msa_table_ddla = cub_msa_table(prot_seq_arr=all_seqs_protein, cod_seq_arr=all_seqs_nt)
    

## Calculate the Codon Rarity Score for each position of the alignment

#### $$ CRS_{position}= {\sum \limits _{AA_{AA}} ^{n_{aln}}(\sum \limits _{occ=1} ^{n_{aln}} {AA_{occ}} * f_c) \over {len_{total}(alignment)} - (gaps)} $$

In [None]:
# Codon usage table of the alignment, then the per-cell rarity matrix and its summaries.
cub_msa_table_ddla = cub_msa_table(prot_seq_arr=all_seqs_protein, cod_seq_arr=all_seqs_nt)
(alignment_value_matrix, seq_name, seq_pos,
 residue_sum, residue_max, sequence_sum) = map_rarity(protein_alignment, nustrudb, cub_msa_table_ddla)

# Every column sorted on its own in descending order; after this a row no
# longer belongs to a single sequence.
sorted_alignment_value_matrix = np.sort(alignment_value_matrix, axis=0)[::-1]


In [None]:
# Sequence ids in alignment (row) order.
print(seq_name)

In [None]:
# Persist the rarity matrix and its summaries as a single .npz archive.
# NOTE(review): sequence_sum is a dict, so it is presumably stored as a pickled
# object array and needs allow_pickle=True when loaded - confirm.
np.savez(f"{output_path}/cub_msa_table_ddla.npz", alignment_value_matrix=alignment_value_matrix, seq_name=seq_name, seq_pos=seq_pos, residue_sum=residue_sum, residue_max=residue_max, sequence_sum=sequence_sum)

In [None]:
# Remove columns whose share of zero (gap) cells exceeds the default threshold of 0.5.
alignment_value_matrix_filtered,deleted_columns = filter_columns_by_gap_threshold(alignment_value_matrix)
# The second call overwrites deleted_columns; sorting each column does not change
# how many zeros it holds, so both calls remove the same columns.
sorted_alignment_value_matrix_filtered,deleted_columns = filter_columns_by_gap_threshold(sorted_alignment_value_matrix)

In [None]:
# Drop the entries of the per-position lists that belong to the filtered-out
# columns. The guard makes the cell safe to re-run: the lists are only pruned
# while they still have the length of the unfiltered matrix, whereas the
# original in-place `del` removed further positions on every execution.
if len(seq_pos) == alignment_value_matrix.shape[1]:
    dropped_columns = set(deleted_columns)
    seq_pos = [value for index, value in enumerate(seq_pos) if index not in dropped_columns]
    residue_max = [value for index, value in enumerate(residue_max) if index not in dropped_columns]
    residue_sum = [value for index, value in enumerate(residue_sum) if index not in dropped_columns]


In [None]:
# Both lengths should be equal after the filtered columns were removed.
print(len(seq_pos))
print(len(residue_max))
# Consecutive 0-based positions for plotting (seq_pos keeps the original column indices).
seq_pos_new = [i for i in range(len(residue_max))]

In [None]:
import ast
def get_secstru(seq_name, seq_pos, aln_prot_arr, nustrudb):
    """Look up the secondary structure entry of every sequence (work in progress).

    For each id in ``seq_name`` the first ``secondary_structure`` value of the
    matching ``primary_id`` row in ``nustrudb`` is collected and the collection
    is turned into a NumPy array. The array is not used or returned yet; the
    function only prints ``aln_prot_arr`` and returns None. ``seq_pos`` is
    currently unused.
    """
    per_sequence_structures = [
        nustrudb[nustrudb["primary_id"] == name]["secondary_structure"].values[0]
        for name in seq_name
    ]

    # Built for later use; nothing consumes it so far.
    secstru_arr = pd.DataFrame(per_sequence_structures).to_numpy()

    print(aln_prot_arr)
    
        
# Currently only prints the protein alignment array; get_secstru returns nothing yet.
get_secstru(seq_name=seq_name, seq_pos=None, aln_prot_arr=all_seqs_protein, nustrudb=nustrudb)

## Visualize the Codon Rarity Score for each position of the alignment in MSA

In [None]:
# Per-position rarity score (column sum divided by the number of sequences).
df = pd.DataFrame({
    'x': seq_pos,
    'y': residue_sum
})

# Compute the rolling mean with a window of 10
df['smoothed_y'] = df['y'].rolling(window=10, center=True).mean()

# Three stacked panels sharing the x axis:
#   row 1 - smoothed per-position score,
#   row 2 - heatmap of the column-wise sorted matrix,
#   row 3 - heatmap of the matrix in alignment order.
fig1 = make_subplots(rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.02)
fig1.add_trace((go.Heatmap(z=alignment_value_matrix_filtered, y=seq_name, colorscale="blues", showlegend=False)), row=3, col=1)
# NOTE(review): the sorted matrix was sorted per column, so its rows do not
# correspond to the sequences passed as y labels (that axis is hidden below).
fig1.add_trace((go.Heatmap(z=sorted_alignment_value_matrix_filtered, y=seq_name, colorscale="blues")), row=2, col=1)
fig1.add_trace((go.Scatter(x=seq_pos_new, y=df['smoothed_y'], mode="lines", line=dict(color="navy"))), row=1, col=1)
# Only the y axis of the line plot stays visible.
fig1['layout']['yaxis1']['visible']=True
fig1['layout']['yaxis2']['visible']=False
fig1['layout']['yaxis3']['visible']=False
# NOTE(review): static image export presumably needs an image engine such as kaleido - confirm.
fig1.write_image(f"{output_path}/{working_name}_codon_rarity_heatmap.png")
fig1.write_html(f"{output_path}/{working_name}_codon_rarity_heatmap.html")

In [None]:
def _smoothed_profile(block, positions, n_sequences, window=10):
    """Column sums of `block` divided by `n_sequences`, with a centred rolling mean."""
    scores = [col_sum / n_sequences for col_sum in np.sum(block, axis=0)]
    profile = pd.DataFrame({'x': positions, 'y': scores})
    profile['smoothed_y'] = profile['y'].rolling(window=window, center=True).mean()
    return scores, profile


# Two consecutive blocks of 200 rows of the column-wise sorted matrix.
# The second block starts at row 200; the former start of 201 skipped that row.
# NOTE(review): the block size of 200 is hard-coded; rows beyond 400 are ignored.
split1 = sorted_alignment_value_matrix_filtered[0:200,:]
split2 = sorted_alignment_value_matrix_filtered[200:400,:]

# Both blocks are divided by the total number of sequences (as in map_rarity),
# so the two curves stay on the same scale.
residue_sum1, df1 = _smoothed_profile(split1, seq_pos, len(seq_name))
residue_sum2, df2 = _smoothed_profile(split2, seq_pos, len(seq_name))

fig = go.Figure()
fig.add_scatter(x=seq_pos_new, y=df1['smoothed_y'], mode="lines", line=dict(color="blue"))
fig.add_scatter(x=seq_pos_new, y=df2['smoothed_y'], mode="lines", line=dict(color="red"))
fig.write_image(f"{output_path}/seperate.png")

In [None]:
import plotly.figure_factory as ff
from scipy.cluster.hierarchy import linkage, dendrogram
import plotly.graph_objects as go


# Cluster the sequences by their rarity profiles.
dendro = ff.create_dendrogram(alignment_value_matrix_filtered, labels=seq_name, orientation='right')

# Tick labels/values of the dendrogram give the leaf order and the leaf positions.
heatmap_labels = list(dendro['layout']['yaxis']['ticktext'])
heatmap_y = list(dendro['layout']['yaxis']['tickvals'])

# The labels are in leaf order while the matrix is in alignment order, so the
# rows are reordered to match; otherwise every row would carry another
# sequence's label.
row_of_sequence = {name: row for row, name in enumerate(seq_name)}
leaf_order = [row_of_sequence[label] for label in heatmap_labels]
heatmap_matrix = alignment_value_matrix_filtered[leaf_order, :]

# Create and add the heatmap in the second column; it is drawn at the
# dendrogram's leaf coordinates so both panels line up on the shared y axis.
heatmap = go.Heatmap(z=heatmap_matrix, x=seq_pos_new, y=heatmap_y, colorscale="blues")

fig2 = make_subplots(rows=1, cols=2, shared_yaxes=True, shared_xaxes=True,
                    horizontal_spacing=0.01, subplot_titles=("Dendrogram", "Heatmap"))

for data in dendro['data']:
    fig2.add_trace(data, row=1, col=1)

fig2.add_trace(heatmap, row=1, col=2)
# Show the sequence names at the leaf positions instead of raw coordinates.
fig2.update_yaxes(tickmode="array", tickvals=heatmap_y, ticktext=heatmap_labels, row=1, col=1)
fig2.update_layout(width=1000, height=800, showlegend=False)
fig2.write_image(f"{output_path}/codon_rarity_dendrogram_heatmap.png")
fig2.write_html(f"{output_path}/codon_rarity_dendrogram_heatmap.html")

In [None]:
import scipy.stats as stats

tree = Tree(tree_file)
leaf_distances = sum_distances_to_root(tree)

# Pair distance and rarity by sequence name. Sorting the two dictionaries
# independently only lines up when both hold exactly the same names.
shared_names = sorted(set(leaf_distances) & set(sequence_sum))
unmatched_names = set(leaf_distances) ^ set(sequence_sum)
if unmatched_names:
    print(f"Warning: {len(unmatched_names)} name(s) occur only in the tree or only in the alignment and are ignored")

x_values = [leaf_distances[key] for key in shared_names]
y_values = [sequence_sum[key] for key in shared_names]

x_array = np.array(x_values)
y_array = np.array(y_values)

pearson_corr, _ = stats.pearsonr(x_array, y_array)
spearman_corr, _ = stats.spearmanr(x_array, y_array)
kendall_corr, _ = stats.kendalltau(x_array, y_array)

slope, intercept, r, p, stderr = stats.linregress(x_array, y_array)
print(r)

# Select the style before the figure is created so it applies to the whole figure.
plt.style.use('ggplot')
plt.figure(figsize=(8, 5))
sns.regplot(x=x_values, y=y_values,
            scatter_kws={'color': 'blue', 'alpha': 0.5},
            line_kws={'color': 'navy', })
plt.text(0.02, -0.1, f"Pearson Correlation: {round(pearson_corr, 5)}\nSpearman Correlation: {round(spearman_corr, 5)}\nKendall Correlation: {round(kendall_corr, 5)}", fontsize=12, transform=plt.gcf().transFigure)
plt.title('Divergence of Codon Rarity for Peptidase M7')
plt.xlabel('Branch Lengths from Root (divergence)')
plt.ylabel('Codon Rarity Score (sum of all residues)')
# Save before show(): once the figure has been shown, savefig writes an empty
# canvas. bbox_inches="tight" keeps the text placed below the axes in the file.
plt.savefig(f"{output_path}/divergence_codon_rarity.png", bbox_inches="tight")
plt.show()

In [None]:
# Per-sequence rarity score (row sum divided by the alignment length), keyed by sequence id.
print(sequence_sum)

# Visualize the Codon Rarity Change in Circle Plot

In [None]:
# Disabled draft of the Circos plot, kept as a bare string literal so none of it runs.
# NOTE(review): as the last expression of the cell, the string itself is echoed as cell output.
'''sectors = {}

for seq in seq_name[0:10]:
    sectors[seq] = alignment_value_matrix.shape[1]
    
circos = Circos(sectors, space=5)

for sector in circos.sectors:
    
    rarity = alignment_value_matrix[seq_name[0:10].index(sector.name)]
    # add sequence track
    sector.text(f"{sector.name}", r=110, size=10)
    track = sector.add_track((95, 100))
    track.axis()
    track.text(sector.name, color="white", size=12)
    track.xticks_by_interval(50)
    
    # add codon rarity track
    line_track = sector.add_track((85, 95), r_pad_ratio=0.1)
    line_track.axis()
    line_track.line(seq_pos[0:10],rarity[0:10])

all_seqs_nt = all_seqs_nt[0:10]
reference = all_seqs_nt[0:10]
for seq_num in range(0, len(all_seqs_nt[0:10])):
    compare = np.tile(all_seqs_nt[seq_num], (len(all_seqs_nt), 1))

    # Finding the differences
    differences = compare != reference

    # Getting the indices where the differences occur
    diff_indices = np.where(differences)

    for codon_change in zip(diff_indices[0], diff_indices[1]):
        seq_cod =  seq_name[0:10][codon_change[0]]
        circos.link((f"{seq_name[0:10][0]}", codon_change[1], codon_change[1]), (f"{seq_name[0:10][codon_change[0]]}", codon_change[1], codon_change[1]))

fig = circos.savefig(f"{output_path}/codon_rarity_circos.png")
'''

In [None]:
# Disabled draft of the secondary-structure variance plot, kept as a bare string literal so none of it runs.
# NOTE(review): as the last expression of the cell, the string itself is echoed as cell output.
'''
dicts = []
for seq in seq_name:
    dicts.append(nustrudb[nustrudb["primary_id"] == seq]["secondary_structure"].values[0].replace('"', ''))

# Map secondary structures to numerical values
structure_to_number = {'-': 0, 'E': 1, 'H': 2, 'S': 3, 'T': 4, 'B': 5, 'G': 6, 'I': 7, 'P': 8}

# Assume all dictionaries are the same length
positions = range(1, 331)  # Update based on the actual number of positions

# Initialize an empty list to store values for each position
values = {pos: [] for pos in positions}

# Fill values for each position from all dictionaries
for d in dicts:
    for pos, struct in d.items():
        values[pos].append(structure_to_number[struct])

# Calculate variance at each position
variances = {pos: np.var(val) for pos, val in values.items()}

# Plotting
plt.figure(figsize=(15, 5))
plt.plot(list(variances.keys()), list(variances.values()), marker='o')
plt.title('Variance of Secondary Structure Along Residue Positions')
plt.xlabel('Residue Position')
plt.ylabel('Variance')
plt.grid(True)
plt.show()'''

# Calculate and map the Codon conservation at each position of a multiple sequence alignment

### Function to calculate the codon conservation at each position of a multiple sequence alignment:
CCS = Codon Conservation Score,
$$ CCS_{position} = ? $$