# Calculate and map the Codon rarity at each position of a multiple sequence alignment

### Function to calculate the codon rarity at each position of a multiple sequence alignment:
CR = Codon Rarity, AA = Amino Acid, occ = Occurence, f_c = Frequency of codon, len = Length, aln = Alignment, gaps = Gaps, n_aln = Number of sequences in the alignment
$$ CR_{position}= {\sum \limits _{AA_{AA}} ^{n_{aln}}(\sum \limits _{occ=1} ^{n_{aln}} {AA_{occ}} * f_c) \over {len_{total}(alignment)} - (gaps)} $$
$$ f_c = { \sum n_c \over \sum n_{AA} } * { 1 \over n_{cAA} } $$

In [137]:
from Bio import AlignIO
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import pathlib
import os 

In [27]:
name_of_protein_alignment = "test_ddla_protein_aligned.fasta"
name_of_nucleotide_alignment = "test_ddla_nucleotide_aligned.fasta"

working_dir = "/Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE/MSA"

protein_alignment = AlignIO.read(f"{working_dir}/{name_of_protein_alignment}", "fasta")
nucleotide_alignment = AlignIO.read(f"{working_dir}/{name_of_nucleotide_alignment}", "fasta")

print(nucleotide_alignment[:, 570:573])

Alignment with 5 rows and 3 columns
aaa A0A7Y8AQ80
aaa A0A6I6DLM7
aaa A0A212ICI3
aaa A0A7Y8BMW5
aaa A0A4Y8J7B5


In [1]:
import python_codon_tables as pct
e_coli_pct = pct.get_codons_table("e_coli_316407")
print(e_coli_pct)

{'*': {'TAA': 0.64, 'TAG': 0.07, 'TGA': 0.29}, 'A': {'GCA': 0.21, 'GCC': 0.27, 'GCG': 0.36, 'GCT': 0.16}, 'C': {'TGC': 0.56, 'TGT': 0.44}, 'D': {'GAC': 0.37, 'GAT': 0.63}, 'E': {'GAA': 0.69, 'GAG': 0.31}, 'F': {'TTC': 0.43, 'TTT': 0.57}, 'G': {'GGA': 0.11, 'GGC': 0.41, 'GGG': 0.15, 'GGT': 0.34}, 'H': {'CAC': 0.43, 'CAT': 0.57}, 'I': {'ATA': 0.07, 'ATC': 0.42, 'ATT': 0.51}, 'K': {'AAA': 0.76, 'AAG': 0.24}, 'L': {'CTA': 0.04, 'CTC': 0.1, 'CTG': 0.5, 'CTT': 0.1, 'TTA': 0.13, 'TTG': 0.13}, 'M': {'ATG': 1.0}, 'N': {'AAC': 0.55, 'AAT': 0.45}, 'P': {'CCA': 0.19, 'CCC': 0.12, 'CCG': 0.53, 'CCT': 0.16}, 'Q': {'CAA': 0.35, 'CAG': 0.65}, 'R': {'AGA': 0.04, 'AGG': 0.02, 'CGA': 0.06, 'CGC': 0.4, 'CGG': 0.1, 'CGT': 0.38}, 'S': {'AGC': 0.28, 'AGT': 0.15, 'TCA': 0.12, 'TCC': 0.15, 'TCG': 0.15, 'TCT': 0.15}, 'T': {'ACA': 0.13, 'ACC': 0.44, 'ACG': 0.27, 'ACT': 0.16}, 'V': {'GTA': 0.15, 'GTC': 0.22, 'GTG': 0.37, 'GTT': 0.26}, 'W': {'TGG': 1.0}, 'Y': {'TAC': 0.43, 'TAT': 0.57}}


In [20]:
codon_position_start = 0
alignment_value_matrix = np.zeros((len(protein_alignment), len(protein_alignment[0])))

for position in range(len(protein_alignment[0])):
    aa_list= []
    codon_list = []
    
    for i,a in enumerate(protein_alignment[:,position]):
        aa_list.append(a)
    
    codon_position_end = codon_position_start + 3
    for i,n in enumerate(nucleotide_alignment[:,codon_position_start:codon_position_end]):
        codon_list.append(n.seq[0] + n.seq[1] + n.seq[2])

    for i, (aa, codon) in enumerate(zip(aa_list, codon_list)):
        if codon == '---':
            alignment_value_matrix[i, position] = 0
        else:
            if aa == '-':
                alignment_value_matrix[i, position] = 0
            else:
                alignment_value_matrix[i, position] = e_coli_pct[aa][codon.upper()]
    codon_position_start += 3

print(alignment_value_matrix[:,144])

residue_mean = []
for col_mean in np.mean(alignment_value_matrix, axis=0):
    residue_mean.append(col_mean)
    
seq_name = [seq.id for seq in protein_alignment]



[1. 1. 1. 1. 1.]


In [138]:
import numpy as np
from Bio import AlignIO
from Bio import SeqIO
import re
name_of_protein_alignment = "test_ddla_protein_aligned.fasta"
name_of_nucleotide = "test_ddla_nucleotide.fasta"

working_dir = "/Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE/MSA"

protein_alignment = AlignIO.read(f"{working_dir}/{name_of_protein_alignment}", "fasta")
nucleotide_alignment = SeqIO.parse(f"{working_dir}/{name_of_nucleotide}", "fasta")
# nucleotide_alignment = SeqIO.parse(f"/Users/dominiquefastus/master_project/NuStru/nustruMOTIF/test.fasta", "fasta")
# nucleotide_alignment = SeqIO.parse(f"/Users/dominiquefastus/master_project/NuStru/nustruDB/test_ddla_nucleotide.fasta", "fasta")

def fasta_to_array(fasta, align_to=None, codon=False):
    all_seqs = []
    all_ids = []
    
    if codon:
        for (ind,record) in enumerate(fasta):
            all_seqs.append(list(str(record.seq)))
            all_seqs[ind] = [''.join(map(str, all_seqs[ind][i:i+3])) for i in range(0, len(all_seqs[ind]), 3)]
            
    else:           
        for record in fasta:
            all_seqs.append(list(str(record.seq)))
            all_ids.append(record.id)
        
        all_seqs = np.array(all_seqs)

    if align_to is not None:
        gap_indeces = np.where(align_to == '-')
        
        for gap_index in zip(gap_indeces[0], gap_indeces[1]):
            all_seqs[gap_index[0]].insert(gap_index[1], '---')
        
        all_seqs = np.array(all_seqs)
        # deleting stop codons as no protein assigned to them
        all_seqs = np.delete(all_seqs, -1, axis=1)
            
    
    
    all_ids = np.array(all_ids).reshape(len(all_ids), 1)
    # all_seqs = np.append(all_seqs, all_ids, axis=1)
    
    return all_seqs


all_seqs_protein = fasta_to_array(protein_alignment, codon=False)
all_seqs_nt = fasta_to_array(fasta=nucleotide_alignment, align_to=all_seqs_protein, codon=True)




$$ f_c = { \sum n_c \over \sum n_{AA} } * { 1 \over n_{cAA} } $$

In [139]:
def cub_msa_table(prot_seq_arr=None, cod_seq_arr=None):
    cub_table = {
    # '*': {'TAA': None, 'TAG': None, 'TGA': None}, ignoring stop codons
    'A': {'GCA': None, 'GCC': None, 'GCG': None, 'GCT': None},
    'C': {'TGC': None, 'TGT': None},
    'D': {'GAC': None, 'GAT': None},
    'E': {'GAA': None, 'GAG': None},
    'F': {'TTC': None, 'TTT': None},
    'G': {'GGA': None, 'GGC': None, 'GGG': None, 'GGT': None},
    'H': {'CAC': None, 'CAT': None},
    'I': {'ATA': None, 'ATC': None, 'ATT': None},
    'K': {'AAA': None, 'AAG': None},
    'L': {'CTA': None, 'CTC': None, 'CTG': None, 'CTT': None, 'TTA': None, 'TTG': None},
    'M': {'ATG': None},
    'N': {'AAC': None, 'AAT': None},
    'P': {'CCA': None, 'CCC': None, 'CCG': None, 'CCT': None},
    'Q': {'CAA': None, 'CAG': None},
    'R': {'AGA': None, 'AGG': None, 'CGA': None, 'CGC': None, 'CGG': None, 'CGT': None},
    'S': {'AGC': None, 'AGT': None, 'TCA': None, 'TCC': None, 'TCG': None, 'TCT': None},
    'T': {'ACA': None, 'ACC': None, 'ACG': None, 'ACT': None},
    'V': {'GTA': None, 'GTC': None, 'GTG': None, 'GTT': None},
    'W': {'TGG': None},
    'Y': {'TAC': None, 'TAT': None}}
    
    for aa in cub_table.keys():
           
        # total number of the amino acid or total number of codon for the amino acid
        n_AA = np.count_nonzero(prot_seq_arr == aa)
                
        # total number of codons for each amino acid
        nc_AA = len(cub_table[aa].keys())
        
        for codon in cub_table[aa].keys():
            
            # number of a codon in the alignment for each amino acid
            nc = np.count_nonzero(cod_seq_arr == codon)
            
            # caluclate the frequency of codons in the alignment for each amino acid
            fc =(nc / n_AA) * 1/nc_AA
            
            
            # round the frequency to 5 decimal places and assign it to the codon usage bias table
            cub_table[aa][codon] = round(fc,5)
        
    return cub_table
        
cub_msa_table_ddla = cub_msa_table(prot_seq_arr=all_seqs_protein, cod_seq_arr=all_seqs_nt)
    

$$ CR_{position}= {\sum \limits _{AA_{AA}} ^{n_{aln}}(\sum \limits _{occ=1} ^{n_{aln}} {AA_{occ}} * f_c) \over {len_{total}(alignment)} - (gaps)} $$

In [140]:
import pandas as pd

def map_rarity(protein_alignment, nustrudb, cu_table):
    codon_position_start = 0
    alignment_value_matrix = np.zeros((len(protein_alignment), len(protein_alignment[0])))
    seq_name = [seq.id for seq in protein_alignment]
    seq_pos = [i for i in range(len(protein_alignment[0]))]

    sart_count = [0 for i in range(len(seq_name))]
    pos_count_dict = {seq_name[i]: 0 for i in range(len(seq_name))}
    for position in range(len(protein_alignment[0])):

        for i, (aa, seq) in enumerate(zip(protein_alignment[:,position],seq_name)):
            if aa == '-':
                alignment_value_matrix[i, position] = 0
                pos_count_dict[seq] += 1
            else:
                prot_position = pos_count_dict[seq]
                position_adj = position - prot_position
                
                sequence = nustrudb[nustrudb["primary_id"] == seq]["nucleotide_sequence"].values[0]
                alignment_value_matrix[i, position] = cu_table[aa][sequence[position_adj*3:position_adj*3+3].upper()]

    residue_mean = []
    for col_mean in np.sum(alignment_value_matrix, axis=0):
        residue_mean.append(col_mean / len(seq_name))
    
    return alignment_value_matrix, seq_name, seq_pos, residue_mean

nustrudb = pd.read_csv("/Users/dominiquefastus/master_project/NuStru/nustruDB/DDLA_uniprot_sec_struct_04.csv")
alignment_value_matrix, seq_name, seq_pos, residue_mean = map_rarity(protein_alignment, nustrudb, cu_table=cub_msa_table_ddla)
print(residue_mean)

[0.2, 0.054700000000000006, 0.2, 0.0040219999999999995, 0.045454, 0.054166000000000006, 1.0, 0.077904, 0.262498, 0.06639400000000001, 0.02246, 0.033622, 0.06409400000000001, 0.12299, 0.076726, 0.23636200000000002, 0.06409400000000001, 0.06409400000000001, 0.11086399999999999, 0.01814, 0.086758, 0.25470000000000004, 0.294118, 0.2641, 0.11983000000000002, 0.025841999999999997, 0.060796, 0.17308, 0.03019, 0.086486, 0.177502, 0.184618, 0.186494, 0.064942, 0.26409800000000005, 0.07081, 0.11376, 0.251924, 0.153596, 0.10296200000000003, 0.08631600000000002, 0.254546, 0.2359, 0.125914, 0.151726, 0.10276, 0.096898, 0.109734, 0.17931, 0.251924, 0.262498, 0.221082, 0.086914, 0.218644, 1.0, 0.294118, 0.062347999999999994, 0.300944, 0.251924, 0.09151400000000001, 0.08663799999999999, 0.269092, 0.241444, 0.10276, 0.10565200000000001, 0.271794, 0.157594, 0.267948, 0.315382, 0.15909, 0.102162, 0.10206999999999997, 0.20747, 0.098034, 0.03817, 0.24499200000000002, 0.13945400000000002, 0.02343, 0.262394,