In [None]:
import pandas as pd
import pickle
import ast

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

from CAI import RSCU

# Domain and Fold class analysis

##### The domain analysis is based on the relative synonymous codon usage (RSCU) or the codon rarity score (CRS) and needs the domain annotation for the provided data. The CRS can be obtained from the previous analysis of the MSA of protein families. The RSCU is calculated in this notebook based on each sequence. When analysing the domains and fold classes of organism data, i.e. not a protein family, the CRS can't be calculated and thus only the RSCU should be used.

### Set data to analyse

In [None]:
savefig = False  # set to True to write the figures below to output_path

# Load the nustrudb table. The 'secondary_structure' column holds the text form
# of a dict, so it is parsed with ast.literal_eval while the CSV is read.
nustrudb = pd.read_csv('/Example/examples_family/example_fam2_domain.csv', converters={'secondary_structure': ast.literal_eval})

# Collapse every parsed dict into one string by joining its values in insertion
# order (presumably one structure symbol per residue position -- TODO confirm).
nustrudb['secondary_structure'] = nustrudb['secondary_structure'].apply(lambda x: "".join(x.values()))

# Load the codon rarity score (CRS) of the protein family.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
# The context manager makes sure the file handle is closed again.
with open('/Example/examples_family/example_fam2_CRS.pkl', 'rb') as crs_file:
    c_rarity = pickle.load(crs_file)

output_path = "/Example/examples_family/"  # directory that receives figures and tables

### Define functions and codon table

In [None]:
# Standard genetic code including the three stop codons ('*').
# The table is generated instead of typed out: codons are enumerated with the
# bases in T, C, A, G order (first base slowest, third base fastest) and paired
# with the amino-acid letters listed in exactly that order.
_BASES = 'TCAG'
_AMINO_ACIDS = (
    'FFLLSSSSYY**CC*W'  # first base T
    'LLLLPPPPHHQQRRRR'  # first base C
    'IIIMTTTTNNKKSSRR'  # first base A
    'VVVVAAAADDEEGGGG'  # first base G
)

codon_table = {
    first + second + third: _AMINO_ACIDS[16 * i + 4 * j + k]
    for i, first in enumerate(_BASES)
    for j, second in enumerate(_BASES)
    for k, third in enumerate(_BASES)
}

def split_into_codons(sequence):
    """Cut a sequence into consecutive, non-overlapping triplets.

    Only complete triplets are returned: up to two trailing items that do not
    fill a whole codon are dropped without an error.
    """
    triplets = []
    # stop early enough that every window that is started still holds three items
    for start in range(0, len(sequence) - 2, 3):
        triplets.append(sequence[start:start + 3])
    return triplets

def extract_secondary_structure(structure_string):
    """Return the secondary structure string as a list with one symbol per entry."""
    # kept as a function so it can be applied to each row individually
    return [symbol for symbol in structure_string]

def map_rscu_to_codons(seq, rscu_values):
    """Look up a per-codon score for every codon of a sequence.

    Parameters
    ----------
    seq : str or sequence of str
        Either a nucleotide string, which is cut into codons with
        ``split_into_codons``, or an already split sequence of codons
        (for example a slice of a codon list).
    rscu_values : mapping
        Codon -> score lookup; only its ``get`` method is used.

    Returns
    -------
    list
        One entry per codon, in sequence order. A codon that is missing from
        ``rscu_values`` is represented by the string ``'nan'``; if the last
        entry is such a placeholder it is removed.
    """
    # A nucleotide string must be cut into triplets first. Iterating the raw
    # string would look up single characters, which never match a codon key.
    codons = split_into_codons(seq) if isinstance(seq, str) else list(seq)

    # score of each codon in sequence order, 'nan' where no score is known
    mapped_rscu_value = [rscu_values.get(codon, 'nan') for codon in codons]

    # most RSCU calculations ignore the stop codons and since it is of no interest
    # for the analysis, a trailing placeholder is dropped (guarded for empty input)
    if mapped_rscu_value and isinstance(mapped_rscu_value[-1], str) and mapped_rscu_value[-1] == 'nan':
        mapped_rscu_value.pop()

    return mapped_rscu_value

def rscu_method_calculation(seq_given=None, data=None, column=None):
    """Calculate RSCU values with the ``RSCU`` function of the CAI package.

    There are two modes, selected by the kind of input:

    * ``data`` is a DataFrame: the RSCU is calculated from all sequences in
      ``data[column]`` and the result of ``RSCU`` is returned unchanged
      (presumably a codon -> RSCU mapping -- see the CAI documentation).
      This can give rise to a species-wide or overall codon usage.
      ``data`` takes precedence over ``seq_given``.
    * otherwise: the RSCU is calculated from ``seq_given`` alone (sequence or
      protein specific codon usage) and returned as a list holding the value
      of every codon in sequence order (see ``map_rscu_to_codons``).

    Parameters
    ----------
    seq_given : str-like, optional
        Single nucleotide sequence; converted with ``str``.
    data : pandas.DataFrame, optional
        Table holding the reference set of sequences.
    column : str, optional
        Name of the sequence column in ``data``.

    Raises
    ------
    ValueError
        If no sequence is provided at all (no/empty ``seq_given`` and no
        DataFrame, or a DataFrame without rows).
    """
    if isinstance(data, pd.DataFrame):
        # reference-set mode: always return the lookup table, also for a
        # single-row table, so callers can rely on one return type
        sequences = data[column].values.tolist()
        if not sequences:
            raise ValueError("Can't calculate rscu as no sequences provided!")
        return RSCU(sequences)

    # single-sequence mode; str(None) would otherwise be analysed as the text 'None'
    if seq_given is None or str(seq_given) == "":
        raise ValueError("Can't calculate rscu as no sequences provided!")

    sequence = str(seq_given)

    # calculate the rscu for codons based on the sequence itself ...
    rscu_values = RSCU([sequence])

    # ... and list the values in the codon order of that sequence
    return map_rscu_to_codons(sequence, rscu_values)

def classify_fold(alpha, beta):
    """Derive a coarse fold class from the helix and strand content.

    The real fold class could not be mapped to the data, so the label is based
    on the two counts only: 'All Alpha' when only ``alpha`` is positive,
    'All Beta' when only ``beta`` is positive and 'Alpha/Beta' in every other
    case (this includes domains where both counts are zero).
    """
    helix_only = alpha > 0 and beta == 0
    strand_only = beta > 0 and alpha == 0

    if helix_only:
        return 'All Alpha'
    if strand_only:
        return 'All Beta'
    return 'Alpha/Beta'

## Assign data to domains from the initial dataset

In [None]:
# Build one record per annotated domain of every sequence: the helix / strand /
# remaining residue counts and the mean RSCU and CRS of the domain's codons.

def _mean_codon_score(codons, scores):
    """Mean score of `codons`; codons without a score ('nan' placeholders) are skipped.

    Returns float('nan') when no codon of the domain has a score (e.g. an empty
    domain range), instead of failing on the sum or on a division by zero.
    """
    values = [value for value in map_rscu_to_codons(codons, scores) if not isinstance(value, str)]
    return sum(values) / len(values) if values else float('nan')


# The RSCU lookup is derived from the complete data set, so it is the same for
# every row and is calculated once instead of once per row.
rscu_values = rscu_method_calculation(data=nustrudb, column='nucleotide_sequence')

domain_records = []

for _, row in nustrudb.iterrows():
    # codons and secondary structure symbols of the full sequence
    # (assumes both use the same position index -- TODO confirm)
    codons = split_into_codons(row['nucleotide_sequence'])
    secstru = extract_secondary_structure(row['secondary_structure'])

    # The domain annotation is stored as text in the CSV; literal_eval parses
    # plain Python literals only and, unlike eval, cannot execute code.
    domains = row['domains']
    if isinstance(domains, str):
        domains = ast.literal_eval(domains)

    # loop over the domains and extract the nucleotide sequence and secondary structure
    for domain in domains:
        # each entry is read as (start, end, name); start/end are used as
        # Python slice bounds, i.e. the end position is exclusive
        # NOTE(review): confirm the annotation is 0-based and end-exclusive
        domain_start = int(domain[0])
        domain_end = int(domain[1])
        domain_name = domain[2]

        domain_nucleotide_sequence = codons[domain_start:domain_end]
        domain_secstru_sequence = secstru[domain_start:domain_end]

        # count the residues per structure type: H, G, I are counted as helix,
        # E and B as strand, everything else as coil
        alpha = sum(domain_secstru_sequence.count(symbol) for symbol in 'HGI')
        beta = sum(domain_secstru_sequence.count(symbol) for symbol in 'EB')
        coils = len(domain_secstru_sequence) - alpha - beta

        domain_records.append({
            'primary_id': row['primary_id'],
            'domain': domain_name,
            'alpha': alpha,
            'beta': beta,
            'coils': coils,
            'RSCU': _mean_codon_score(domain_nucleotide_sequence, rscu_values),
            'CRS': _mean_codon_score(domain_nucleotide_sequence, c_rarity),
        })

# Create the table in one step (growing a DataFrame with concat inside the loop
# is quadratic). 'freq' and 'cath_fclass' are kept as unfilled placeholder columns.
domain_df = pd.DataFrame(
    domain_records,
    columns=['primary_id', 'domain', 'alpha', 'beta', 'coils', 'RSCU', 'CRS', 'freq', 'cath_fclass'],
)

# classify the fold class based on the alpha and beta content/ ratio
domain_df['fold_class'] = [
    classify_fold(alpha_count, beta_count)
    for alpha_count, beta_count in zip(domain_df['alpha'], domain_df['beta'])
]

## Analyse the top 20 most common domains

In [None]:
# Count how often every domain occurs (most frequent first), keep the names of
# the first 20 and select all rows that belong to one of them.
domain_counts = domain_df['domain'].value_counts()
top_domains = domain_counts.index[:20]
top_data = domain_df.loc[domain_df['domain'].isin(top_domains)]

### Plot the fold class distribution (Alpha helix / Beta sheet ratio)

In [None]:
# Name of the top_data column that is used to colour the points of the
# helix/sheet scatter plot.
codon_metric = 'CRS' # metric to be used for the analysis

In [None]:
# Scatter plot of the helix vs. sheet residue counts of the most common domains.
# Every domain gets its own marker; the point colour encodes `codon_metric`.
fig, ax = plt.subplots(figsize=(12, 8))

# the marker shapes are reused cyclically when there are more domains than shapes
markers = ('o', 's', '^', 'v', '<', '>', 'D', 'p', 'P', '*', 'X')
num_markers = len(markers)

# one colour scale shared by all domains, spanning the full range of the metric
norm = plt.Normalize(top_data[codon_metric].min(), top_data[codon_metric].max())
cmap = plt.cm.viridis

for i, domain in enumerate(top_domains):
    domain_data = top_data[top_data['domain'] == domain]
    ax.scatter(x=domain_data['alpha'], y=domain_data['beta'], c=domain_data[codon_metric], cmap=cmap,
               norm=norm, label=domain, marker=markers[i % num_markers])

# The colour bar is built from the shared norm/cmap, so it does not depend on
# the last scatter call (which does not exist when there are no domains) and it
# is labelled with the metric that is actually plotted.
fig.colorbar(plt.cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax, label=codon_metric)

ax.set_title(f'Helix vs. Sheet Content by Domain (colour: {codon_metric})')
# 'alpha' and 'beta' hold residue counts, not numbers of helices or sheets
ax.set_xlabel('Number of Alpha Helix Residues')
ax.set_ylabel('Number of Beta Sheet Residues')
ax.legend(loc='upper right', title='Domains')

if savefig:
    fig.savefig(f'{output_path}/helix_sheet_ratio_plot.png', dpi=600)

### Plot the domain distribution 

In [None]:
# Name of the top_data column that is shown on the x axis of the per-domain
# box plot.
codon_metric = 'RSCU' # metric to be used for the analysis

In [None]:
# Horizontal box plot of `codon_metric` for each of the most common domains,
# with the individual observations drawn on top as a strip plot.
plt.figure(figsize=(10, 6))
sns.set_theme(style="ticks")

# both layers share the same data and variable mapping
shared_kwargs = dict(data=top_data, x=codon_metric, y='domain', hue='domain')
sns.boxplot(**shared_kwargs)
sns.stripplot(palette='dark:.4', alpha=0.7, **shared_kwargs)
sns.despine(trim=True, left=True)

# hide the x tick marks but keep their (rotated) labels
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=True, rotation=90)
plt.ylabel("Domains")

if savefig:
    plt.savefig(f'{output_path}/domain_codon_usage.png', dpi=600, bbox_inches='tight')

### Save boxplot data

In [None]:
# Save the summary statistics behind the box plot: DataFrame.describe() of
# top_data, calculated separately for every domain, written as CSV.
# Note: the file is written on every run, independent of the `savefig` flag.
top_data.groupby('domain').describe().to_csv(f'{output_path}/domains_boxplot_statistics.csv')

## Analyse RSCU or CRS correlation with domain and fold class

### Perform a one-way ANOVA test to determine the significance of the correlation of RSCU and CRS with domain and fold class

In [None]:
def _one_way_anova(frame, group_column, value_column):
    """Run a one-way ANOVA of `value_column` between the groups of `group_column`.

    The values of every group are converted to float; missing or non-numeric
    entries and groups without any remaining value are left out.

    Returns
    -------
    (float, float)
        F-statistic and p-value of ``scipy.stats.f_oneway``; ``(nan, nan)``
        when fewer than two groups are available, because the test needs at
        least two samples.
    """
    samples = []
    for _, group in frame.groupby(group_column):
        values = pd.to_numeric(group[value_column], errors='coerce').dropna().to_numpy(dtype=float)
        if values.size:
            samples.append(values)

    if len(samples) < 2:
        return float('nan'), float('nan')

    f_stat, p_value = stats.f_oneway(*samples)
    return f_stat, p_value


anova_metric = "CRS"  # column that is compared between the groups

# group the fold class data by the fold class and perform an ANOVA test
f_stat_fold_class, p_value_fold_class = _one_way_anova(domain_df, "fold_class", anova_metric)

print("ANOVA results for Fold Class:")
print(f"F-statistic: {f_stat_fold_class}, p-value: {p_value_fold_class}")

# group the domain data by the domain and perform an ANOVA test
f_stat_domain, p_value_domain = _one_way_anova(domain_df, "domain", anova_metric)

print("\nANOVA results for Domain:")
print(f"F-statistic: {f_stat_domain}, p-value: {p_value_domain}")