# Correlation analysis for the E. coli dataset

In [37]:
# RNA codon -> one-letter amino-acid code; the stop codons UAG, UAA and UGA
# map to 'X'. Each row pairs four codons with the four residues they encode,
# position by position.
genetic_code_RNA2AA = {
    codon: residue
    for codon_group, residues in (
        ('UUU UUC UUA UUG', 'FFLL'),
        ('CUU CUC CUA CUG', 'LLLL'),
        ('AUU AUC AUA AUG', 'IIIM'),
        ('GUU GUC GUA GUG', 'VVVV'),
        ('UCU UCC UCA UCG', 'SSSS'),
        ('CCU CCC CCA CCG', 'PPPP'),
        ('ACU ACC ACA ACG', 'TTTT'),
        ('GCU GCC GCA GCG', 'AAAA'),
        ('UAU UAC UAG UAA', 'YYXX'),
        ('CAU CAC CAA CAG', 'HHQQ'),
        ('AAU AAC AAA AAG', 'NNKK'),
        ('GAU GAC GAA GAG', 'DDEE'),
        ('UGU UGC UGG UGA', 'CCWX'),
        ('CGU CGC CGA CGG', 'RRRR'),
        ('AGU AGC AGA AGG', 'SSRR'),
        ('GGU GGC GGA GGG', 'GGGG'),
    )
    for codon, residue in zip(codon_group.split(), residues)
}

# Amino-acid name -> list of the DNA codons (T rather than U) that encode it.
# The 'STOP' entry collects the three stop codons.
synonymous_codons = {
    residue_name: codon_string.split()
    for residue_name, codon_string in (
        ('Alanine', 'GCT GCC GCA GCG'),
        ('Arginine', 'CGT CGC CGA CGG AGA AGG'),
        ('Asparagine', 'AAT AAC'),
        ('Aspartic acid', 'GAT GAC'),
        ('Cysteine', 'TGT TGC'),
        ('Glutamine', 'CAA CAG'),
        ('Glutamic acid', 'GAA GAG'),
        ('Glycine', 'GGT GGC GGA GGG'),
        ('Histidine', 'CAT CAC'),
        ('Isoleucine', 'ATT ATC ATA'),
        ('Leucine', 'CTT CTC CTA CTG TTA TTG'),
        ('Lysine', 'AAA AAG'),
        ('Methionine', 'ATG'),  # Start codon as well
        ('Phenylalanine', 'TTT TTC'),
        ('Proline', 'CCT CCC CCA CCG'),
        ('Serine', 'TCT TCC TCA TCG AGT AGC'),
        ('Threonine', 'ACT ACC ACA ACG'),
        ('Tryptophan', 'TGG'),
        ('Tyrosine', 'TAT TAC'),
        ('Valine', 'GTT GTC GTA GTG'),
        ('STOP', 'TAA TAG TGA'),  # Stop codons
    )
}

In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from Bio.Seq import Seq

# Load the input CSV into a DataFrame. The filtering loop further down reads
# its primary_id, protein_sequence and nucleotide_sequence columns.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
pandas_Data = pd.read_csv('/Users/dominiquefastus/master_project/NuStru/nustruDB/NEW_ECOLI_FULL_uniprot_02.csv')

def translate_RNA2prot(seqRNA, codon_table=None):
    """Translate a nucleotide sequence into a one-letter protein string.

    The sequence is upper-cased and every 'T' is replaced by 'U', so DNA and
    RNA input in either case is accepted. It is then read codon by codon
    (three bases at a time, starting at the first base) and each codon is
    looked up in the codon table.

    Args:
        seqRNA: Nucleotide sequence whose length is a multiple of three.
        codon_table: Optional mapping from RNA codon to one-letter residue.
            Defaults to the module-level ``genetic_code_RNA2AA``.

    Returns:
        The translated residues joined into one string; ``''`` for empty
        input. Stop codons are translated to whatever the table maps them to.

    Raises:
        IndexError: If the sequence length is not a multiple of three. The
            exception type matches what the previous implementation raised
            when it ran past the end of the sequence.
        KeyError: If a codon is not present in the codon table.
    """
    if codon_table is None:
        # Resolved at call time so the default follows the module-level table.
        codon_table = genetic_code_RNA2AA

    rna = seqRNA.upper().replace('T', 'U')
    if len(rna) % 3 != 0:
        raise IndexError(
            f"sequence length {len(rna)} is not a multiple of 3; "
            "cannot split it into complete codons"
        )

    return ''.join(codon_table[rna[i:i + 3]] for i in range(0, len(rna), 3))


#### WRITE THE VALIDATED PROTEIN SEQUENCES AS A FASTA-FORMATTED DATABASE #####

# A row is written only when its nucleotide sequence
#   * starts with the ATG start codon,
#   * has a length that is a multiple of three,
#   * contains nothing but the characters A, T, G and C, and
#   * translates (with its last codon removed) to exactly the protein sequence
#     stored in the same row.
# NOTE(review): the output is FASTA text although the file name ends in ".csv".
dna_alphabet = frozenset('ATGC')

with open('/Users/dominiquefastus/master_project/NuStru/nustruDB/filtered_NEW_ECOLI_FULL_uniprot_02.csv', 'w') as fdb:
    for row in pandas_Data.itertuples(index=False):
        entry_id = row.primary_id  # not named `id`, which would shadow the builtin
        protein_sequence = row.protein_sequence
        nucleotide_sequence = row.nucleotide_sequence

        # Empty CSV cells are read by pandas as float NaN, which cannot be
        # sliced; rows without a nucleotide sequence are skipped.
        if not isinstance(nucleotide_sequence, str):
            continue
        if not nucleotide_sequence.startswith('ATG'):
            continue
        if len(nucleotide_sequence) % 3 != 0:
            continue
        # Single pass over the sequence instead of four separate count() calls.
        if not dna_alphabet.issuperset(nucleotide_sequence):
            continue

        # The last codon is dropped before translating; it is expected to be
        # the stop codon, but that is not verified here.
        trad_RNA = translate_RNA2prot(nucleotide_sequence[:-3])
        if protein_sequence == trad_RNA:
            fdb.write(f">{entry_id}\n{protein_sequence}\n")


In [46]:
import pandas as pd
# Load the table used by the codon / secondary-structure analysis below, which
# reads its nucleotide_sequence and secondary_structure columns.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
nustru_data = pd.read_csv('/Users/dominiquefastus/master_project/NuStru/nustruDB/NEW_ECOLI_FULL_uniprot_02_sec_struc_updated.csv')

In [None]:
# Count, for one amino acid, how often each of its synonymous codons occurs in
# each secondary-structure class, and plot the counts as a heat map.
# Relies on `plt`, `sns`, `synonymous_codons` and `nustru_data` from earlier cells.
from collections import defaultdict
import ast

aa_name = 'Alanine'

# Codons that encode the selected amino acid. Built once so that the per-codon
# test below is a single set lookup instead of a scan over every amino acid.
# Raises KeyError immediately if aa_name is not a key of synonymous_codons.
target_codons = set(synonymous_codons[aa_name])

# Re-initialize the codon_structure_counts dictionary for a fresh analysis:
# codon -> secondary-structure label -> number of occurrences.
codon_structure_counts = defaultdict(lambda: defaultdict(int))

# Only the first 1000 entries of the dataset are analyzed.
for _, row in nustru_data.head(1000).iterrows():
    nucleotide_sequence = row['nucleotide_sequence']
    # The column holds a Python-literal dict stored as text.
    # NOTE(review): presumably keyed by 1-based integer residue position - confirm.
    secondary_structure_dict = ast.literal_eval(row['secondary_structure'])

    # Convert the nucleotide sequence to triplets (codons)
    codons = [nucleotide_sequence[i:i+3] for i in range(0, len(nucleotide_sequence), 3)]

    # Map each codon of the selected amino acid to its secondary structure.
    # Amino acid positions are 1-based, hence start=1.
    for aa_pos, codon in enumerate(codons, start=1):
        if codon in target_codons:
            # Positions missing from the annotation are counted under '-'.
            structure = secondary_structure_dict.get(aa_pos, '-')
            codon_structure_counts[codon][structure] += 1

# Convert the counts to a DataFrame for visualization
# (columns: codons, rows: secondary-structure labels; absent pairs become 0).
codon_structure_df = pd.DataFrame(codon_structure_counts).fillna(0)

# Plotting the heat map
plt.figure(figsize=(10, 8))
sns.heatmap(codon_structure_df, annot=True, cmap="YlGnBu", fmt=".0f")
plt.title(f'Codon Usage for {aa_name} in Different Secondary Structures')
plt.xlabel('Codon')
plt.ylabel('Secondary Structure')
plt.show()
