In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy



Here, the secondary structure elements are assigned to their respective amino acid sequence ranges.
Afterwards, the DataFrame structural_elements records whether each structure element belongs to domain 1 or domain 2.
Source: DOI: 10.1107/S0907444994014496

In [3]:
# Secondary-structure elements and the inclusive residue range ("start-end")
# each one spans. The two lists are aligned by position.
structure_sequence_allignment = {
    'Element': ['h1', 'b1', 'b2', 'h2', 'h3a', 'h3b', 'h4', 'h5', 'h6', 'h7', 'h8', 'h9', 'h10', 'b3', 'b4', 'b5', 'h11'],
    'Range': ['27-40', '43-50', '56-60', '72-85', '99-101', '109-114', '119-128', '132-142', '145-154', '168-170',
              '183-195', '201-212', '221-224', '230-237', '244-251', '259-266', '272-288']
}

# Elements belonging to domain 1; every other element is treated as domain 2
domain1_elements = ['h1', 'b1', 'b2', 'h8', 'h10', 'b3', 'b4', 'b5', 'h11']
structural_elements = pd.DataFrame(structure_sequence_allignment)

# Assigning domain numbers to structure elements in one vectorized step.
# Selecting by column name (rather than positional iloc indices) keeps this
# correct if columns are ever reordered, and yields an integer column.
structural_elements["Domain"] = np.where(
    structural_elements["Element"].isin(domain1_elements), 1, 2
)

structural_elements

Unnamed: 0,Element,Range,Domain
0,h1,27-40,1
1,b1,43-50,1
2,b2,56-60,1
3,h2,72-85,2
4,h3a,99-101,2
5,h3b,109-114,2
6,h4,119-128,2
7,h5,132-142,2
8,h6,145-154,2
9,h7,168-170,2


Each position is assigned its structural element and domain, based on the structural_elements DataFrame.
For each row in the structural_elements DataFrame, the value in the Range column is split at the "-" separator, and the two resulting numbers are converted to integers using the "map" command.
This yields the start and end position of the element.
Next, all positions in position_elements_alignment that fall within the range of an element are labeled with that element in the "Structural_Element" column.
Afterwards, it is checked whether this element is part of domain 1. If it is, all corresponding rows in the "Domain" column are assigned a value of 1. If it is not, they are assigned a value of 2.

In [7]:
# Build a per-position lookup table: one row per residue position, annotated
# with the structural element and the domain it belongs to (if any).
# NOTE(review): positions 24-268 do not reach element h11 (272-288), so h11 is
# never annotated here - confirm that this position range is intended.
position_elements_alignment = pd.DataFrame({'Position': range(24, 269)})

# Positions that fall outside every element keep None in both columns
position_elements_alignment['Structural_Element'] = None
position_elements_alignment['Domain'] = None

# Iterate over the ranges in the structural_elements DataFrame
for index, row in structural_elements.iterrows():
    # 'Range' is a "start-end" string. map(int, ...) is required: without it,
    # start would be the int type itself and end a list of strings.
    start, end = map(int, row['Range'].split('-'))

    # Boolean mask of the positions covered by this element (both bounds inclusive)
    in_element = position_elements_alignment['Position'].between(start, end)

    position_elements_alignment.loc[in_element, 'Structural_Element'] = row['Element']
    # Domain 1 if the element is listed in domain1_elements, otherwise domain 2
    position_elements_alignment.loc[in_element, 'Domain'] = 1 if row['Element'] in domain1_elements else 2

# Use the Position column as the index. set_index drops the column by default,
# so no duplicate "Position" column remains.
position_elements_alignment = position_elements_alignment.set_index('Position')

Unnamed: 0_level_0,Structural_Element,Domain
Position,Unnamed: 1_level_1,Unnamed: 2_level_1
24,,
25,,
26,,
27,h1,1
28,h1,1
...,...,...
264,b5,1
265,b5,1
266,b5,1
267,,


From this point on, the position-effect and structural-element analysis is performed, based on the mean fitness values.
To do this, the DataFrame "mean_analyse" is created as a copy of the z-scored mean values (position_means_Z).
Additionally, each position is assigned its corresponding domain and structural element based on the position_elements_alignment DataFrame.

IMPORTANT: This only works if position_means_Z has been defined beforehand.

In [None]:
# Work on a copy so that position_means_Z itself is left untouched
mean_analyse = position_means_Z.copy()

# Prepend the per-position annotations as the first two columns; pandas aligns
# the inserted Series on the index.
# (new column label, source column in position_elements_alignment)
# NOTE: the label "Structual_Element" is spelled as in the original notebook;
# other cells may reference it, so it is left unchanged.
annotation_columns = [
    ("Structual_Element", "Structural_Element"),
    ("Domain", "Domain"),
]
for slot, (new_label, source_label) in enumerate(annotation_columns):
    mean_analyse.insert(slot, new_label, position_elements_alignment.loc[:, source_label])

mean_analyse

For the analysis of whether there is a relationship between fitness values and structural elements, the variance of the mean values for all positions within a structural element is calculated. This is done individually for all datasets and also for the mean of the datasets ("all_Varianz").

The variance within a structural element is compared to the variance across all positions. If the variance within the structural element is lower than the variance across all positions, it indicates a relationship between fitness and the structural element.

In [None]:
# For every structural element, compute the variance of the z-scored mean
# values over the positions the element spans, once per dataset, and store it
# in structural_elements as "<dataset>_Varianz".
dataset_labels = ["Stiffler", "Firnberg", "Deng", "all"]

for row_label, element in structural_elements.iterrows():
    # "start-end" string -> two integer bounds
    first_pos, last_pos = (int(bound) for bound in element['Range'].split('-'))

    # Label-based slice on the index of mean_analyse; both ends are included
    element_rows = mean_analyse.loc[first_pos:last_pos]

    for dataset in dataset_labels:
        structural_elements.at[row_label, f"{dataset}_Varianz"] = element_rows[f"mean_{dataset}_z"].var()

structural_elements

Here, the variance across all positions is displayed.

However, it is surprising that the variance is not 1, given that the fitness values have been z-normalized.
--> For most structural elements, the variance within the structural element is lower than the variance across all positions.
--> There is a relationship between the structural element and fitness.

In [None]:
# Datasets for which a variance across all positions is reported
variance_columns = ['Stiffler', 'Firnberg', 'Deng', 'all']

# Start from an empty table with one column per dataset
data_variance = pd.DataFrame(columns=variance_columns)

# Variance of each dataset's z-scored mean column, in the same order as the
# table columns
variance_values = [mean_analyse[f"mean_{name}_z"].var() for name in variance_columns]

# Store the values as the single row of the table
data_variance.loc[0] = variance_values

data_variance
