In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

from collections import Counter
from scipy.stats import entropy
from scipy.stats import chi2_contingency

# KL divergence analysis: how the usage probability of synonymous codons differs between secondary structure elements

### Load the dataset

In [None]:
# switch: set to True to write the figures created in this notebook to PNG files
savefig = False

# read the nustrudb table (structure and nucleotide information) into a dataframe;
# the 'secondary_structure' column is stored as a Python literal in the csv and
# is parsed back into a Python object with ast.literal_eval while loading
nustrudb = pd.read_csv(
    '/Example/examples_organism/example_nustru_ecoli.csv',
    converters={'secondary_structure': ast.literal_eval},
)

### Define functions and codon table needed for the analysis

In [None]:
# amino acid (one-letter code) -> list of its synonymous codons
# the amino acids are the keys so that one Counter per amino acid can be used;
# the three stop codons (TAA, TAG, TGA) are not part of the table
standard_codons = {
    amino_acid: codon_string.split()
    for amino_acid, codon_string in (
        ('A', 'GCT GCC GCA GCG'),
        ('C', 'TGT TGC'),
        ('D', 'GAT GAC'),
        ('E', 'GAA GAG'),
        ('F', 'TTT TTC'),
        ('G', 'GGT GGC GGA GGG'),
        ('H', 'CAT CAC'),
        ('I', 'ATT ATC ATA'),
        ('K', 'AAA AAG'),
        ('L', 'TTA TTG CTT CTC CTA CTG'),
        ('M', 'ATG'),
        ('N', 'AAT AAC'),
        ('P', 'CCT CCC CCA CCG'),
        ('Q', 'CAA CAG'),
        ('R', 'CGT CGC CGA CGG AGA AGG'),
        ('S', 'TCT TCC TCA TCG AGT AGC'),
        ('T', 'ACT ACC ACA ACG'),
        ('V', 'GTT GTC GTA GTG'),
        ('W', 'TGG'),
        ('Y', 'TAT TAC'),
    )
}

def split_into_codons(sequence):
    """Return the consecutive, non-overlapping triplets of the sequence.

    Trailing nucleotides that do not fill a complete codon are dropped, so a
    sequence whose length is not divisible by 3 loses its incomplete rest.
    """
    # end of the last complete codon; everything behind it is an incomplete rest
    usable_length = len(sequence) - len(sequence) % 3

    codons = []
    for start in range(0, usable_length, 3):
        codons.append(sequence[start:start + 3])
    return codons
    
def count_codons(nustrudb):
    """Count synonymous codons overall and per secondary structure type.

    Parameters
    ----------
    nustrudb : pandas.DataFrame
        Needs the columns 'nucleotide_sequence' (sequence that is split into
        codons) and 'secondary_structure' (mapping from the 1-based codon
        position to a one-letter structure code).

    Returns
    -------
    aa_codon_counts : dict
        amino acid -> Counter(codon -> occurrences in all sequences)
    aa_structure_counts : dict
        amino acid -> 'Helix' / 'Coil' / 'Sheet' -> Counter(codon -> occurrences
        at positions annotated with a structure code of that type)
    """
    # reverse lookup codon -> amino acid, built once instead of scanning all
    # synonymous codon lists again for every single codon of every sequence
    codon_to_aa = {
        codon: aa
        for aa, codons_list in standard_codons.items()
        for codon in codons_list
    }

    # aggregation of the one-letter structure codes into the three structure types
    # (the codes look like DSSP annotations - TODO confirm against the data source);
    # codes that are not listed here are not counted for any structure type
    structure_type_of_code = {
        'H': 'Helix', 'I': 'Helix', 'G': 'Helix',
        '-': 'Coil', 'T': 'Coil', 'S': 'Coil',
        'E': 'Sheet', 'B': 'Sheet',
    }

    # counters for the codons of each amino acid in the whole dataset
    aa_codon_counts = {aa: Counter() for aa in standard_codons}

    # counters for the codons of each amino acid, separated by structure type
    aa_structure_counts = {aa: {'Helix': Counter(), 'Coil': Counter(), 'Sheet': Counter()} for aa in standard_codons}

    # only two columns are needed, so iterate over them directly instead of
    # building a Series for every row with iterrows
    for sequence, structure in zip(nustrudb['nucleotide_sequence'], nustrudb['secondary_structure']):
        # the structure mapping starts at position 1 and not 0
        for position, codon in enumerate(split_into_codons(sequence), start=1):
            aa = codon_to_aa.get(codon)
            if aa is None:
                # triplet that is not in the codon table (e.g. a stop codon)
                continue

            aa_codon_counts[aa][codon] += 1

            # positions without structure annotation only count for the overall usage
            if position in structure:
                structure_type = structure_type_of_code.get(structure[position])
                if structure_type is not None:
                    aa_structure_counts[aa][structure_type][codon] += 1

    return aa_codon_counts, aa_structure_counts


def print_chi_squared_results(data, significance_level = 0.05):
    """Test per amino acid whether codon usage depends on the structure type.

    For every amino acid in ``standard_codons`` a contingency table (rows:
    'Helix', 'Coil', 'Sheet'; columns: the synonymous codons) is built from
    ``data`` and tested with ``scipy.stats.chi2_contingency``. Despite its
    name the function prints nothing, it only returns the results.

    Parameters
    ----------
    data : dict
        amino acid -> structure type -> mapping of codon -> count
    significance_level : float
        p-values below this threshold are reported as significant. No
        multiple testing correction (e.g. Bonferroni) is applied.

    Returns
    -------
    results_values : dict
        amino acid -> (chi2, p_value, message)
    results_print : dict
        amino acid -> formatted, printable result line

    Notes
    -----
    Structure types and codons that were never observed for an amino acid are
    removed from its table before testing, because ``chi2_contingency`` raises
    a ValueError when an expected frequency is zero. If no observation is left
    at all, chi2 and p-value are NaN and the message says that there is not
    enough data. Tables with a single remaining row or column (e.g. the
    single-codon amino acids) yield chi2 = 0 and p-value = 1.
    """
    # store statistical test results in dictionaries
    results_values = {}
    results_print = {}

    # loop over the amino acids and their synonymous codons
    for aa, codons in standard_codons.items():
        # contingency table: one row per structure type, one column per codon
        observed = np.array([
            [data[aa][structure].get(codon, 0) for codon in codons]
            for structure in ['Helix', 'Coil', 'Sheet']
        ])

        # an all-zero row or column gives expected frequencies of zero, which
        # the test rejects; the column mask is evaluated on the full table,
        # removing empty rows does not change the column sums
        observed = observed[observed.sum(axis=1) > 0][:, observed.sum(axis=0) > 0]

        if observed.size == 0:
            # the amino acid was not observed in any structure type
            chi2, p_value = float('nan'), float('nan')
            significance_msg = 'Not enough data to test'
        else:
            # perform the chi-squared test with the scipy method
            chi2, p_value, _, _ = chi2_contingency(observed)

            if p_value < significance_level:
                significance_msg = 'Significant difference in distribution'
            else:
                significance_msg = 'No significant difference'

        # store the raw values and the printable line for each amino acid
        results_values[aa] = (chi2, p_value, significance_msg)
        results_print[aa] = f"{aa}: Chi-squared = {chi2:e}, p-value = {p_value:e} -> {significance_msg}"

    return results_values, results_print

In [None]:
# count the codons of the whole dataset: once overall per amino acid (aa_codon_counts)
# and once separated by secondary structure type (aa_structure_counts)
aa_codon_counts, aa_structure_counts = count_codons(nustrudb)

In [None]:
# dictionaries that are filled below:
#   aa_overall_probs[aa][codon]              -> probability of the codon among the synonymous codons of aa
#   aa_structure_probs[structure][aa][codon] -> the same probability, restricted to one structure
#   aa_kl_divergences[aa][structure]         -> KL divergence of the structure distribution from the overall one
aa_overall_probs = {}
aa_structure_probs = {'Helix': {}, 'Coil': {}, 'Sheet': {}}
aa_kl_divergences = {}

for aa, codons_list in standard_codons.items():
    # overall probability of each synonymous codon, independent of the structure
    overall_counts = aa_codon_counts[aa]
    total_overall = sum(overall_counts.values())
    if total_overall:
        aa_overall_probs[aa] = {codon: overall_counts[codon] / total_overall for codon in codons_list}
    else:
        aa_overall_probs[aa] = {codon: 0 for codon in codons_list}

    # reference distribution for the KL divergence, in the order of codons_list
    overall_probs_list = [aa_overall_probs[aa][codon] for codon in codons_list]

    aa_kl_divergences[aa] = {}
    for structure in ('Helix', 'Coil', 'Sheet'):
        # probability of each synonymous codon within this structure
        structure_counts = aa_structure_counts[aa][structure]
        total_structure = sum(structure_counts.values())
        if total_structure:
            structure_probs = {codon: structure_counts[codon] / total_structure for codon in codons_list}
        else:
            structure_probs = {codon: 0 for codon in codons_list}
        aa_structure_probs[structure][aa] = structure_probs

        # entropy with two distributions is the KL divergence of the first from the second;
        # it is one value per amino acid and structure, not one per codon,
        # and stays None when the amino acid was never seen in this structure
        structure_probs_list = [structure_probs[codon] for codon in codons_list]
        if total_structure:
            kl_div = entropy(structure_probs_list, overall_probs_list)
        else:
            kl_div = None
        aa_kl_divergences[aa][structure] = kl_div


### Plot the probabilities for each synonymous codon

In [None]:
# create line plots for all 20 amino acids and their probabilities (5 x 4 grid, one panel each)
fig, axes = plt.subplots(nrows=5, ncols=4, figsize=(20, 20), sharey=False)
axes = axes.flatten()  # flatten the axes array for easy iteration

# for each synonymous codon plot the probabilities overall vs for each structure
for ax, (aa, codons) in zip(axes, standard_codons.items()):
    overall_probs = [aa_overall_probs[aa][codon] for codon in codons]
    helix_probs = [aa_structure_probs['Helix'][aa].get(codon, 0) for codon in codons]
    sheet_probs = [aa_structure_probs['Sheet'][aa].get(codon, 0) for codon in codons]
    coil_probs = [aa_structure_probs['Coil'][aa].get(codon, 0) for codon in codons]

    ax.plot(codons, overall_probs, label='Overall', marker='o', color='red')
    ax.plot(codons, helix_probs, label='Helix', marker='o', color='orange', alpha=0.6)
    ax.plot(codons, sheet_probs, label='Sheet', marker='o', color='lightgreen', alpha=0.6)
    ax.plot(codons, coil_probs, label='Coil', marker='o', color='coral', alpha=0.6)

    # 0.8 stays the common upper limit so that the panels are comparable, but a
    # probability above it would be cut off: amino acids with a single codon
    # always have probability 1, so those panels get the full range instead
    highest_prob = max(overall_probs + helix_probs + sheet_probs + coil_probs)
    ax.set_ylim(0, 0.8 if highest_prob <= 0.8 else 1.05)
    ax.set_title(f'{aa} Codon Usage')
    ax.set_xlabel('Codons')
    ax.set_ylabel('Probability')
    ax.legend()

plt.tight_layout(rect=[0, 0.03, 1, 0.97])
if savefig:
    plt.savefig('pdb_codon_probabilities_across_structures.png')

### Plot the KL divergence for each amino acid

In [None]:
# table of the KL divergences: one row per amino acid, one column per structure;
# missing values (amino acid never seen in a structure) are shown as 0
kl_df = pd.DataFrame.from_dict(aa_kl_divergences, orient='index').fillna(0)

# draw the heatmap on an explicitly created figure and axes
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(kl_df, annot=False, cmap='viridis', linewidths=0.5, ax=heatmap_ax)
heatmap_ax.set_xlabel('Secondary Structure')
heatmap_ax.set_ylabel('Amino Acid')
if savefig:
    heatmap_fig.savefig('pdb_kl_divergence_heatmap.png')

# Investigate the frequencies of codons in each secondary structure element

In [None]:
# frequency of every counted codon within one structure type, for each amino acid
aa_structure_frequencies = {aa: {} for aa in standard_codons}

# as for the probabilities, the count of a codon is divided by the
# total count of all synonymous codons found in the same structure type
for aa, structures in aa_structure_counts.items():
    for structure_type, counts in structures.items():
        total = sum(counts.values())
        aa_structure_frequencies[aa][structure_type] = {
            counted_codon: occurrences / total
            for counted_codon, occurrences in counts.items()
        }

# long format for plotting: one record per amino acid, codon and structure type
# codons that were never counted in a structure type get the frequency 0
list_of_codon_frequencies = [
    {
        'Amino Acid': amino_acid,
        'Codon': synonymous_codon,
        'Structure': structure_name,
        'Frequency': frequencies_by_structure[structure_name].get(synonymous_codon, 0),
    }
    for amino_acid, frequencies_by_structure in aa_structure_frequencies.items()
    for synonymous_codon in standard_codons[amino_acid]
    for structure_name in ('Helix', 'Coil', 'Sheet')
]

# create a dataframe which makes it easier to plot the data
df_of_codon_frequencies = pd.DataFrame(list_of_codon_frequencies)

### Plot the frequencies for each synonymous codon

In [None]:
# grid of bar plots with the codon distributions, one panel per amino acid
fig, axes = plt.subplots(nrows=5, ncols=4, figsize=(20, 20), sharey=True)
fig.suptitle('Codon Frequency Distributions Across Structures for Each Amino Acid', fontsize=16)
axes = axes.flatten()

# the amino acids in the order of the codon table
amino_acids = standard_codons.keys()

# pair every amino acid with its panel and plot the
# frequencies of its synonymous codons, grouped by structure
for ax, aa in zip(axes, amino_acids):
    is_current_aa = df_of_codon_frequencies['Amino Acid'] == aa
    sns.barplot(
        data=df_of_codon_frequencies[is_current_aa],
        x='Codon',
        y='Frequency',
        hue='Structure',
        palette='viridis',
        ax=ax,
    )
    ax.set_title(aa)
    ax.set_xlabel('Codons')
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
if savefig:
    plt.savefig('pdb_codon_frequencies_across_structures.png')

### Perform a chi-squared test

In [None]:
# the printable lines (second return value) show for which amino acids the
# codon frequencies differ significantly between the structures
chi_squared_summary = print_chi_squared_results(data=aa_structure_counts)[1]
print(*chi_squared_summary.values(), sep='\n')

## Cross-validation with random sampling: test whether the codon frequency differences between the structure elements are also significant for subgroups

#### Since the dataset is so large, random sampling can help to check whether the codon and structure frequencies still hold for random subsets of the samples. This minimizes statistical errors that arise from the large dataset.

In [None]:
def analyze_sample(df_sample):
    """Return the codon counts per structure type for one sample.

    Runs count_codons on the sample and keeps only its second result
    (amino acid -> structure type -> codon counts); the overall codon
    counts are discarded.
    """
    return count_codons(df_sample)[1]

sample_size = 10  # number of rows drawn for each random sample
number_of_samples = 20  # number of random samples to analyze
significance_level = 0.05  # p-value threshold to call a sample significant

# we store the codon counts per structure for all samples
all_samples_counts = []

for sample_seed in range(number_of_samples):
    # the running index is used as seed: every sample is different,
    # but the whole analysis is reproducible on a re-run
    sample = nustrudb.sample(n=sample_size, random_state=sample_seed)
    all_samples_counts.append(analyze_sample(sample))

# chi-squared test for every sample, only the raw values (first return value) are kept
chi_squared_results_all_samples = [
    print_chi_squared_results(sample_counts, significance_level)[0]
    for sample_counts in all_samples_counts
]

# summarize results across all samples
aggregated_results = {aa: {'significant': 0, 'non_significant': 0} for aa in standard_codons}

# count the number of times the frequencies were significantly different
for sample_results in chi_squared_results_all_samples:
    for aa, (_, p_value, _) in sample_results.items():
        outcome = 'significant' if p_value < significance_level else 'non_significant'
        aggregated_results[aa][outcome] += 1

# print the results of the cross validation test; the number has to come from
# the tally of the amino acid that is currently printed
for aa, tally in aggregated_results.items():
    print(f"{aa}: {tally['significant']} out of {number_of_samples} samples showed significant difference in distribution")