In [None]:
from collections import Counter
import pandas as pd
import numpy as np
import ast

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the first 1000 rows of the table. The secondary_structure column is parsed
# from its literal text into a dict and then turned into a list of (key, value) pairs
# (keys are presumably residue positions -- TODO confirm against the CSV).
data = pd.read_csv(
    '/Users/dominiquefastus/master_project/Data/NEW_ECOLI_FULL_uniprot_02_sec_struc_filtered_final.csv',
    converters={'secondary_structure': ast.literal_eval},
    nrows=1000,
)
data['secondary_structure'] = data['secondary_structure'].apply(lambda x: list(x.items()))

# Split the protein sequence (20-residue edges) and the nucleotide sequence
# (60-base edges, i.e. 20 codons) into start / between / end columns.
for column, edge in (('protein_sequence', 20), ('nucleotide_sequence', 60)):
    sequences = data[column]
    data[f'{column}_start'] = sequences.str[:edge]
    data[f'{column}_between'] = sequences.str[edge:-edge]
    data[f'{column}_end'] = sequences.str[-edge:]


def _join_structure(pairs):
    """Concatenate the values of a list of (key, value) pairs into one string."""
    return "".join(dict(pairs).values())


# Split the secondary structure the same way (first 20 / middle / last 20 entries),
# storing each part as a plain string of structure symbols.
structure_pairs = data['secondary_structure']
data['secstru_sequence_start'] = structure_pairs.apply(lambda pairs: _join_structure(pairs[:20]))
data['secstru_sequence_between'] = structure_pairs.apply(lambda pairs: _join_structure(pairs[20:-20]))
data['secstru_sequence_end'] = structure_pairs.apply(lambda pairs: _join_structure(pairs[-20:]))

# Finally replace the full-length pair list with its string form as well
data['secondary_structure'] = structure_pairs.apply(_join_structure)

In [None]:
# Amino acid (one-letter code) -> list of its synonymous DNA codons.
# '*' collects the three stop codons.
codon_table = {
    'F': ['TTT', 'TTC'],
    'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
    'I': ['ATT', 'ATC', 'ATA'],
    'M': ['ATG'],
    'V': ['GTT', 'GTC', 'GTA', 'GTG'],
    'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
    'P': ['CCT', 'CCC', 'CCA', 'CCG'],
    'T': ['ACT', 'ACC', 'ACA', 'ACG'],
    'A': ['GCT', 'GCC', 'GCA', 'GCG'],
    'Y': ['TAT', 'TAC'],
    'H': ['CAT', 'CAC'],
    'Q': ['CAA', 'CAG'],
    'N': ['AAT', 'AAC'],
    'K': ['AAA', 'AAG'],
    'D': ['GAT', 'GAC'],
    'E': ['GAA', 'GAG'],
    'C': ['TGT', 'TGC'],
    'W': ['TGG'],
    'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
    'G': ['GGT', 'GGC', 'GGA', 'GGG'],
    '*': ['TAA', 'TGA', 'TAG'],
}

def calculate_rscu(nucleotide_sequence):
    """Compute relative synonymous codon usage (RSCU) for a single sequence.

    The sequence is read in frame from index 0 in steps of three. For every
    amino acid in the module-level ``codon_table`` (amino acid -> codons),
    each codon's RSCU is its count divided by the count expected if all
    synonymous codons were used equally (family total / family size).

    Parameters
    ----------
    nucleotide_sequence : str
        Nucleotide sequence; a trailing fragment shorter than three bases and
        triplets absent from ``codon_table`` never match and so are ignored.

    Returns
    -------
    dict
        Codon -> RSCU for every codon in ``codon_table``; codons of amino
        acids that do not occur in the sequence get 0.
    """
    triplets = (
        nucleotide_sequence[pos:pos + 3]
        for pos in range(0, len(nucleotide_sequence), 3)
    )
    codon_counts = Counter(triplets)

    rscu = {}
    for aa, codons in codon_table.items():
        family_total = sum(codon_counts[codon] for codon in codons)
        if family_total > 0:
            # Count each codon would have under uniform usage within the family
            expected = family_total / len(codons)
            for codon in codons:
                rscu[codon] = codon_counts[codon] / expected
        else:
            for codon in codons:
                rscu[codon] = 0
    return rscu

# Compute a per-sequence RSCU dictionary for each of the three sequence regions
for region in ('start', 'between', 'end'):
    data[f'rscu_{region}'] = data[f'nucleotide_sequence_{region}'].apply(calculate_rscu)

# Preview the edge regions
data[['primary_id', 'rscu_start', 'rscu_end']].head()


In [None]:
def quantify_secondary_structure(structure_sequence):
    """Return the fraction of positions occupied by each structure symbol.

    Parameters
    ----------
    structure_sequence : str
        Sequence of secondary-structure symbols, one per position.

    Returns
    -------
    dict
        Symbol -> share of ``len(structure_sequence)``; empty for empty input.
    """
    length = len(structure_sequence)
    fractions = {}
    for symbol, occurrences in Counter(structure_sequence).items():
        fractions[symbol] = occurrences / length
    return fractions

# Quantify the structure composition of each of the three sequence regions
for region in ('start', 'between', 'end'):
    data[f'secondary_structure_quantified_{region}'] = (
        data[f'secstru_sequence_{region}'].apply(quantify_secondary_structure)
    )

# Preview the start region
data[['primary_id', 'secondary_structure_quantified_start']].head()


In [None]:
def aggregate_rscu(rscu):
    """Collapse a codon -> RSCU dictionary into one number: the mean of its values.

    Parameters
    ----------
    rscu : dict
        Mapping of codon to RSCU value.

    Returns
    -------
    numpy.float64
        Arithmetic mean over all values in ``rscu``.
    """
    per_codon = [value for value in rscu.values()]
    return np.mean(per_codon)

# Reduce each per-sequence RSCU dictionary to its mean, region by region
for region in ('start', 'between', 'end'):
    data[f'rscu_aggregate_{region}'] = data[f'rscu_{region}'].apply(aggregate_rscu)

# Preview the start region
data[['primary_id', 'rscu_aggregate_start']].head()


In [None]:
# Standard genetic code as a codon -> amino acid lookup ('*' = stop).
# NOTE: this rebinds `codon_table`, which an earlier cell defined in the opposite
# direction (amino acid -> list of codons).
#
# The table is generated in T, C, A, G order for the first, second and third base;
# each row of `_AMINO_ACIDS` holds the 16 codons sharing one first base.
_BASES = 'TCAG'
_AMINO_ACIDS = (
    'FFLLSSSSYY**CC*W'   # T..
    'LLLLPPPPHHQQRRRR'   # C..
    'IIIMTTTTNNKKSSRR'   # A..
    'VVVVAAAADDEEGGGG'   # G..
)
codon_table = {
    first + second + third: amino_acid
    for (first, second, third), amino_acid in zip(
        ((a, b, c) for a in _BASES for b in _BASES for c in _BASES),
        _AMINO_ACIDS,
    )
}

def split_into_codons(sequence):
    """Split a sequence into consecutive, non-overlapping triplets.

    Reading starts at index 0; a trailing fragment of one or two characters
    is dropped so that every returned item has length three.
    """
    usable_length = len(sequence) - len(sequence) % 3
    return [sequence[start:start + 3] for start in range(0, usable_length, 3)]

def calculate_rscu(data, column):
    """Compute pooled synonymous-codon usage over every sequence in a column.

    All sequences in ``data[column]`` are split into codons and counted
    together. Each codon's value is its count divided by the total count of
    all codons encoding the same amino acid (per the module-level
    ``codon_table``, codon -> amino acid), i.e. a fraction in (0, 1].

    Note: despite the name, this is not scaled by the number of synonymous
    codons, unlike the per-sequence ``calculate_rscu`` that an earlier cell
    defines under the same name (and which this definition shadows).

    Parameters
    ----------
    data : mapping of column name -> iterable of str
        In this notebook, the ``data`` DataFrame.
    column : str
        Name of the column holding nucleotide sequences.

    Returns
    -------
    dict
        Codon -> fraction among its synonymous codons, in order of first
        appearance. Triplets missing from ``codon_table`` (for example ones
        containing ambiguous bases) are left out rather than raising
        ``KeyError``.
    """
    codon_counts = Counter()
    for seq in data[column]:
        codon_counts.update(split_into_codons(seq))

    # Keep only triplets the genetic code knows about; filtering once here keeps
    # the per-amino-acid totals and the final ratios consistent with each other.
    known_counts = {
        codon: count for codon, count in codon_counts.items() if codon in codon_table
    }

    amino_acid_counts = Counter()
    for codon, count in known_counts.items():
        amino_acid_counts[codon_table[codon]] += count

    return {
        codon: count / amino_acid_counts[codon_table[codon]]
        for codon, count in known_counts.items()
    }

def map_rscu_to_codons(seq, rscu_values):
    """Translate a nucleotide sequence into its per-codon usage values.

    Parameters
    ----------
    seq : str
        Nucleotide sequence, split with ``split_into_codons``.
    rscu_values : dict
        Codon -> value lookup.

    Returns
    -------
    list
        One value per codon in sequence order; ``np.nan`` where the codon is
        not a key of ``rscu_values``.
    """
    mapped = []
    for codon in split_into_codons(seq):
        mapped.append(rscu_values.get(codon, np.nan))
    return mapped

def extract_secondary_structure(structure_string):
    """Return the structure symbols of a string as a list, with every '-' removed.

    Because '-' characters are dropped, list index i no longer corresponds to
    position i of ``structure_string`` once a '-' has been skipped.
    """
    without_gaps = structure_string.replace('-', '')
    return [*without_gaps]

def align_rscu_and_structure(rscu_values, secondary_elements):
    """Trim two sequences to their common length.

    Both inputs are cut from the end to the length of the shorter one, so
    that they can be paired index by index.

    Returns
    -------
    tuple
        ``(rscu_values[:n], secondary_elements[:n])`` with ``n`` the shorter
        of the two lengths.
    """
    n_rscu = len(rscu_values)
    n_structure = len(secondary_elements)
    shared = n_rscu if n_rscu < n_structure else n_structure
    return rscu_values[:shared], secondary_elements[:shared]

# Process each sequence range: compute pooled codon usage, pair every codon with the
# secondary-structure symbol at the same position, and collect the pairs in long format.
sequence_ranges = ['start', 'between', 'end']
flat_data_list = []

for range_ in sequence_ranges:
    nucleotide_col = f'nucleotide_sequence_{range_}'
    secstru_col = f'secstru_sequence_{range_}'

    # Codon usage pooled over every sequence of this range
    rscu_values = calculate_rscu(data, nucleotide_col)

    # One usage value per codon, in sequence order (NaN for codons without a value)
    data[f'rscu_values_{range_}'] = data[nucleotide_col].apply(lambda seq: map_rscu_to_codons(seq, rscu_values))

    # Structure symbols with '-' removed; kept as a column, but NOT used for pairing
    # because removing symbols shifts every later position.
    data[f'secondary_structure_elements_{range_}'] = data[secstru_col].apply(extract_secondary_structure)

    # Pair codon i with structure symbol i on the *unfiltered* structure string, trim
    # both to their common length, and only then drop the pairs whose symbol is '-'.
    # Filtering '-' before pairing would attach each later symbol to the wrong codon.
    # NOTE(review): for 'end', this assumes the last 60 bases and the last 20 structure
    # symbols cover the same residues -- confirm whether the sequences include a stop codon.
    aligned_rscu = []
    aligned_secondary = []
    for codon_values, structure in zip(data[f'rscu_values_{range_}'], data[secstru_col]):
        trimmed_values, trimmed_structure = align_rscu_and_structure(codon_values, list(structure))
        kept = [(value, symbol) for value, symbol in zip(trimmed_values, trimmed_structure) if symbol != '-']
        aligned_rscu.append([value for value, _ in kept])
        aligned_secondary.append([symbol for _, symbol in kept])

    # Explicit object Series so each cell holds one list, aligned on the frame's index
    data[f'aligned_rscu_{range_}'] = pd.Series(aligned_rscu, index=data.index, dtype=object)
    data[f'aligned_secondary_{range_}'] = pd.Series(aligned_secondary, index=data.index, dtype=object)

    # Flatten the per-sequence lists into one row per (codon, structure symbol) pair
    flat_data = pd.DataFrame({
        'RSCU': [value for values in aligned_rscu for value in values],
        'Secondary Structure': [symbol for symbols in aligned_secondary for symbol in symbols],
        'Range': range_.capitalize()
    })

    flat_data_list.append(flat_data)

# Combine all flat data into a single DataFrame
combined_flat_data = pd.concat(flat_data_list, ignore_index=True)

# Box plot of codon-usage values: one group per sequence range on the x axis,
# one box per secondary-structure symbol (hue) inside each group.
fig, ax = plt.subplots(figsize=(18, 6))
sns.boxplot(
    x='Range', y='RSCU', hue='Secondary Structure', data=combined_flat_data,
    # Fix the category order explicitly instead of relabelling the ticks by hand,
    # so tick labels can never disagree with the data drawn above them.
    order=['Start', 'Between', 'End'],
    palette='viridis', gap=0.2, width=0.95, showfliers=False, ax=ax,
)

ax.set_title('Boxplot of RSCU Values by Secondary Structure Element')
ax.set_ylim(0, 1.1)
ax.legend(loc='upper center', ncol=7)
# The x axis shows the sequence range; the structure elements are the hue groups
ax.set_xlabel('Sequence Range')
ax.set_ylabel('RSCU Value')
plt.show()

In [None]:
# Overlapping KDE plots: density of the codon-usage values, one filled curve
# per secondary-structure symbol (all sequence ranges pooled).
ax = sns.kdeplot(
    data=combined_flat_data,
    x='RSCU',
    hue='Secondary Structure',
    fill=True,
    palette='viridis',
    linewidth=0.4,
)
ax.set_title('KDE Plot of RSCU Values by Secondary Structure Element')
plt.show()