In [1]:
# import packages
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import Bio
import statsmodels.api as sm
from pathlib import Path
import ast
import re

#### This notebook can be used to generate the full dataset used in our works, but shouldn't be run because it may take a very long time
#### The finished dataframe 'prokaryotes_348columns.csv' is available in the 'data' folder (after running initialisation)

In [3]:
# import data from computer
path: str = "./data"  #folder where files are stored
species: pd.DataFrame = pd.read_csv(os.path.join(path, "cross-species.csv"), dtype=str)   # imports file from the folder as species, based on name
identifiers: pd.DataFrame = pd.read_csv(os.path.join(path, "identifiers.tsv"), sep='\t', dtype=str) # imports 3rd file from the folder, based on name
prokaryotes_auc : pd.DataFrame = pd.read_csv(os.path.join(path, "data_prokaryotes_auc.csv"), dtype=str)
Uniprot_ID_mapping : pd.DataFrame = pd.read_csv(os.path.join(path, "Uniprot_ID_mapping.tsv"), sep = '\t', dtype=str)

#import fasta file with biopython (Bio)
from Bio import SeqIO
fasta_id = []
fasta_seq = []
with open(os.path.join(path, "identifiers.fasta"), 'r') as handle:
    for record in SeqIO.parse(handle, "fasta"): 
        fasta_id.append(record.id) 
        fasta_seq.append(str(record.seq))
# Create df prot_seq which includes proper ID and sequences
fasta_id_df = pd.DataFrame([item.split('|') for item in fasta_id], columns=['0', 'From','ID']) # creates a df by splitting fasta_id list into 3 columns
fasta_id_EntryName:list = fasta_id_df['ID'] # creates a list with the Entry Names from the split df
prot_seq = pd.DataFrame(list(zip(fasta_id_EntryName, fasta_seq)), columns = ["ID", "Sequence"]) # creates a df by combining the Entry Names and the Sequences



#Define datatype for each column
species['fold_change'] = species['fold_change'].astype('float64') # changes fold_change column to float64
species['temperature'] = species['temperature'].astype('float64') # changes temperature column to int64
identifiers['Length'] = identifiers['Length'].astype('float64') # changes Length column to int64

#Update index
species = species.reset_index(drop=True) # resets index of species df
identifiers = identifiers.reset_index(drop=True) # resets index of identifiers df

Uniprot_ID_mapping = Uniprot_ID_mapping.drop(columns = ['From', 'Entry', 'Gene Ontology (cellular component)', 'Gene Ontology (biological process)', '3D', 'Subcellular location [CC]', 'Intramembrane', 'Topological domain', 'Transmembrane'], errors= 'ignore')



Add protein sequence from protein_seq dataset to identifiers to create identifiers_seq, based on 'ID' and 'Entry Name' respectively

Create joint data set from identifiers_seq and species_seq based on gene_name and Gene Names1 respectively

In [4]:
# merge dataframes based on Entry Name (df.identifiers) andID (df.prot_seq), if they are similar add the sequence to identifiers, if not add NaN
identifiers_seq = identifiers.merge(prot_seq, how='left', left_on='Entry Name', right_on='ID')[identifiers.columns.tolist() + ['Sequence']] # merges identifiers and prot_seq on Entry Name and ID
identifiers_seq.head()

#merge species and identifiers_seq based on gene_name and Gene Names1
species[['ProtID1', 'ProtID2']] = species['Protein_ID'].str.split("_", expand = True, n=1) # splits the Gene Names column into 2 columns
species_seq = pd.merge(left = species, right = identifiers_seq, how='outer', left_on='ProtID1', right_on='Entry') # merges species and identifiers_seq on gene_name and Gene Names1

# drop entrys with no sequence and unnessecary columns
species_seq.dropna(subset = ['Sequence'], inplace=True) # drops rows with NaN in the Sequence column
species_seq = species_seq.drop(columns=['ProtID1', 'ProtID2'])

Extract list of unique run_names from species and manually seperate in prokaryotes and eukaryotes. Create split datasets for eukaryotes and prokaryotes based on species_seq

In [5]:
# Get a list of unique entries in the 'run_name' column
#print(species['run_name'].unique())

# Based on uniqe run names all entries split into eukaryotes and prokaryotes (manually)
prokaryotes_list:list = ['Bacillus subtilis_168_lysate_R1', 'Escherichia coli lysate',
 'Geobacillus stearothermophilus NCA26 lysate',
 'Thermus thermophilus HB27 lysate', 'Thermus thermophilus HB27 cells',
 'Escherichia coli cells', 'Picrophilus torridus DSM9790 lysate', 'Oleispira antarctica_RB-8_lysate_R1']

#creates new dataframes that only contain prokaryotes or eukaryotes based
prokaryotes_all = species_seq[species_seq['run_name'].isin(prokaryotes_list)]

##reset index
prokaryotes_all = prokaryotes_all.reset_index(drop=True) # resets index of species df

In [6]:
prokaryotes_all_list = [
    'Thermus thermophilus HB27 lysate',                 #
    'Thermus thermophilus HB27 cells',                  #
    'Picrophilus torridus DSM9790 lysate',              #
    'Bacillus subtilis_168_lysate_R1',                  #
    'Escherichia coli lysate',                          #
    'Escherichia coli cells',                           #
    'Geobacillus stearothermophilus NCA26 lysate',
    'Oleispira antarctica_RB-8_lysate_R1'
    ]

prokaryotes_auc_list = [
    'T.thermophilus_P023431',                          # Thermus thermophilus HB27 lysate
    'T.thermophilus_cells_P023757',                     # Thermus thermophilus HB27 cells
    'P.torridus_P023430',                               # Picrophilus torridus DSM9790 lysate
    'B.subtilis_P023755',                               # Bacillus subtilis_168_lysate_R1                                                   
    'E.coli_cells_P023756',                             # Escherichia coli lysate
    'E.coli_P023428',                                   # Escherichia coli cells
    'G.stearothermophilus_P023429',                     # Geobacillus stearothermophilus NCA26 lysate
    'O.antarctica_P028248',                             # Oleispira antarctica_RB-8_lysate_R1  
]
# E.coli_ArcticExpress_P028249 not included

#Prokaryotes from R-shiny (https://meltomeatlas.proteomics.wzw.tum.de/master_meltomeatlasapp/)
'Escherichia coli lysate', 
'Escherichia coli cells', 
'Geobacillus stearothermophilus NCA26 lysate',
'Thermus thermophilus HB27 lysate',
'Thermus thermophilus HB27 cells',
'Picrophilus torridus DSM790 lysate',

('Picrophilus torridus DSM790 lysate',)

Adjust organism name in prokaryotes_auc to match names from prokaryotes_all

In [7]:
prokaryotes_auc['Sample'] = prokaryotes_auc['Sample'].replace(dict(zip(prokaryotes_auc_list, prokaryotes_all_list)))

#remove E.coli_ArcticExpress_P028249
prokaryotes_auc = prokaryotes_auc[prokaryotes_auc['Sample'] != 'E.coli_ArcticExpress_P028249']

Combine prokaryotes_all dataframe with prokaryotes_auc

In [8]:
prokaryotes_all['run_name_Protein_ID'] = prokaryotes_all['run_name'] + prokaryotes_all['Protein_ID']
prokaryotes_auc['Sample_Protein_ID'] = prokaryotes_auc['Sample'] + prokaryotes_auc['Protein_ID']

prokaryotes_all = pd.merge(left = prokaryotes_all, right = prokaryotes_auc , how='left', left_on='run_name_Protein_ID', right_on='Sample_Protein_ID') # merges species and identifiers_seq on gene_name and Gene Names1


drop unessesary columns

In [9]:
#drop columns
col_drop = ['Reviewed', 'run_name_Protein_ID', 'channel', 'From', 'Protein_ID_y',
            'Proteinname_P023428_E.coli', 'meltPoint_P023428_E.coli', 'gene_name_y',
            'Sample_Protein_ID', 'Sample', 'Entry', 'Gene Names', 'uniprot_ac']
for n in range(len(col_drop)):
    if col_drop[n] in list(prokaryotes_all.columns):
        prokaryotes_all = prokaryotes_all.drop(columns=[col_drop[n]])

#rename columns
prokaryotes_all = prokaryotes_all.rename(columns={'Protein_ID_x': 'Protein_ID'})
prokaryotes_all = prokaryotes_all.rename(columns={'gene_name_x': 'gene_name'})

#define variable
prokaryotes_all = prokaryotes_all.astype({'auc': 'float64'})



Combine Uniprot_ID_mapping with prokaryotes dataframe

In [10]:
prokaryotes_all = pd.merge(left = prokaryotes_all, right = Uniprot_ID_mapping, how='left', left_on='Entry Name', right_on='Entry Name') # merges species and identifiers_seq on gene_name and Gene Names1

Re-order dataframe columns

In [11]:
columns_ordered =['run_name', 'Organism', 'Protein_ID' ,'Entry Name' ,'gene_name', 
 'Protein names',  'Temperature dependence', 'Length', 'Sequence', 
 'temperature', 'fold_change', 'meltPoint', 'auc', 'Gene Ontology IDs', 
 'Gene Ontology (biological process)', 'Gene Ontology (cellular component)', 
 'Gene Ontology (molecular function)', 'KEGG','EC number', 'Helix', 'Turn', 'Beta strand',
       'AlphaFoldDB', 'PDB']

prokaryotes_all = prokaryotes_all[columns_ordered]

Create a list of each unique protein for each organism

In [12]:
# prokaryotes
prokaryotes = prokaryotes_all.drop_duplicates(subset=['run_name', 'Protein_ID'], keep='first') #create new dataframe with only unique proteins
prokaryotes = prokaryotes.reset_index(drop=True) #reset index

# define meltpoint as float64
prokaryotes['meltPoint'] = prokaryotes['meltPoint'].astype('float64') # changes meltpoint column to float64
prokaryotes['temperature'] = prokaryotes['temperature'].astype('float64')
prokaryotes['fold_change'] = prokaryotes['fold_change'].astype('float64')
prokaryotes['Length'] = prokaryotes['Length'].astype('float64')
prokaryotes['auc'] = prokaryotes['auc'].astype('float64')

In [13]:
prokaryotes_all.to_csv(os.path.join(path,'prokaryotes_all.csv'), index=False)
prokaryotes.to_csv(os.path.join(path,'prokaryotes_unique_prot.csv'), index=False)
prokaryotes['Entry Name'].to_csv(os.path.join(path, 'Uniprot_IDs.csv'), index=False)

Read fasta file with all secondary structure data from s4pred

In [14]:
path2: str = f"{os.path.abspath(os.path.join(os.getcwd()))}/data/sec-structure_prediction.fas"
with open(path2, 'r') as file:
    content = file.read()

Create list with secondary structure data for all proteins

In [15]:
contentl=content.split('#')
contentl.pop(0)

''

#### Import prokaryotes dataframe from csv and add columns for secondary structure

In [16]:
path: str = "./data"
prokaryotes: pd.DataFrame = pd.read_csv(os.path.join(path, "prokaryotes_unique_prot.csv"), dtype=str)

In [17]:
# add columns for secondary structures
prokaryotes['Helix1']=pd.Series(dtype=object)
prokaryotes['Turn1']=pd.Series(dtype=object)
prokaryotes['Sheet1']=pd.Series(dtype=object)
prokaryotes['Helix2']=pd.Series(dtype=object)
prokaryotes['Coil2']=pd.Series(dtype=object)
prokaryotes['Sheet2']=pd.Series(dtype=object)

#### Adding columns with secondary structure as lists of lists **(only ones from crystal structure)**

In [18]:
#regex patterns for following steps
pattern = r'\b\d+\.\.\d+\b'
pattern2 = r'\b\d+\b'
#filling helix1 column
for q in range(len(prokaryotes)):
    templist = []
    if pd.isnull(prokaryotes.loc[q, 'Helix']) == False:
        tempt = re.findall(pattern, prokaryotes.loc[q, 'Helix'])
        for m in range(len(tempt)):    
            tempt2 = re.findall(pattern2, tempt[m])
            tempt2 = list(map(int, tempt2))
            templist.append(list(range(tempt2[0], tempt2[1]+1)))
        prokaryotes.at[q,'Helix1'] = templist
    else: 
        prokaryotes.at[q, 'Helix1'] = np.NaN
#filling Turn1 column
for w in range(len(prokaryotes)):
    templist = []
    if pd.isnull(prokaryotes.loc[w, 'Turn']) == False:
        tempt = re.findall(pattern, prokaryotes.loc[w, 'Turn'])
        for m in range(len(tempt)):    
            tempt2 = re.findall(pattern2, tempt[m])
            tempt2 = list(map(int, tempt2))
            templist.append(list(range(tempt2[0], tempt2[1]+1)))
        prokaryotes.at[w,'Turn1'] = templist
    else: 
        prokaryotes.at[w, 'Turn1'] = np.NaN
#filling Sheet1 column
for w in range(len(prokaryotes)):
    templist = []
    if pd.isnull(prokaryotes.loc[w, 'Beta strand']) == False:
        tempt = re.findall(pattern, prokaryotes.loc[w, 'Beta strand'])
        for m in range(len(tempt)):    
            tempt2 = re.findall(pattern2, tempt[m])
            tempt2 = list(map(int, tempt2))
            templist.append(list(range(tempt2[0], tempt2[1]+1)))
        prokaryotes.at[w,'Sheet1'] = templist
    else: 
        prokaryotes.at[w, 'Sheet1'] = np.NaN

#### Adding columns with secondary structure from S4pred predictions for **all** proteins

In [19]:
for n in range(len(contentl)):
    pat = r'\d+\s\w\s\w'
    patl = re.findall(pat, contentl[n])
    for p in range(len(patl)):
        patl[p] = patl[p].split(' ')
    helic = []
    sheet = []
    coil = []
    for k in range(len(patl)):
        if patl[k][2] == 'H':
            helic.append(int(patl[k][0]))
        elif patl[k][2] == 'E':
            sheet.append(int(patl[k][0]))
        elif patl[k][2] == 'C':
            coil.append(int(patl[k][0]))
    helices = []
    sheets = []
    coils = []
    for val in helic:
        if val-1 not in helic:
            lower = val
        if val+1 not in helic:
            upper = val
            helices.append(list(range(lower,upper+1)))
    for coi in coil:
        if coi-1 not in coil:
            lower = coi
        if coi+1 not in coil:
            upper = coi
            coils.append(list(range(lower,upper+1)))
    for she in sheet:
        if she-1 not in sheet:
            lower = she
        if she+1 not in sheet:
            upper = she
            sheets.append(list(range(lower,upper+1)))
    prokaryotes.at[n, 'Helix2'] = helices
    prokaryotes.at[n, 'Coil2'] = coils
    prokaryotes.at[n, 'Sheet2'] = sheets

In [20]:
#separate lysate and cell data

prokaryotes = prokaryotes[prokaryotes['run_name'].str.contains('lysate', case=True)].dropna(subset=['meltPoint'])

In [21]:
prokaryotes.to_csv(os.path.join(path,'prokaryotes_sec_structure.csv'), index=False)

In [22]:
path: str = './data/'
prokaryotes: pd.DataFrame = pd.read_csv(os.path.join(path, "prokaryotes_sec_structure.csv"))
prokaryotes = prokaryotes[prokaryotes['run_name'].str.contains('lysate', case=True)].dropna(subset=['meltPoint'])
secs = ['Helix1','Turn1','Sheet1','Helix2','Sheet2']
# convert string to list because list information was while saving as csv
if isinstance(prokaryotes.iloc[0,28],str):
    for s in secs:
        prokaryotes[s] = prokaryotes[s].apply(lambda x: ast.literal_eval(x) if pd.isnull(x)==False else x)

#### Adding columns for total number of helices, sheets and coils

In [23]:
for s in secs:
    prokaryotes[f'{s}count'] = prokaryotes[s].apply(lambda x: len(x) if isinstance(x, list) else np.nan)

#### Adding columns for relative Helix and Sheet abundance

In [24]:
a = np.array(prokaryotes['Length'])
for s in secs:
    prokaryotes[f'{s}perc'] = prokaryotes[s].apply(lambda x: pd.Series(x).map(len).sum() if isinstance(x, list) else np.nan)
    prokaryotes[f'{s}perc'] = np.array(prokaryotes[f'{s}perc'])/a

#### Adding columns for average Helix and Sheet length

In [25]:
for s in secs:
    prokaryotes[f'{s}avg'] = prokaryotes[s].apply(lambda x: np.array([len(lst) for lst in np.array(x,dtype=object)]).mean() if isinstance(x, list) and len(x)>0 else np.nan)

#### Adding column for relative fraction of secondary structures (Helix and Beta sheet combined)

In [26]:
prokaryotes['secstr1'] = np.array(prokaryotes['Helix1perc']) + np.array(prokaryotes['Sheet1perc'])
prokaryotes['secstr2'] = np.array(prokaryotes['Helix2perc']) + np.array(prokaryotes['Sheet2perc'])

In [27]:
aacids = ['A', 'V', 'I', 'L', 'M', 'F', 'W','N', 'Q', 'S', 'T', 'Y','D', 'E','R', 'H', 'K', 'C', 'P', 'G' ]
from function import rel_aa_comp
aagl = []
for n in aacids:
    for m in aacids:
        if n != m and [m,n] not in aagl:
            aagl.append([n,m])

In [28]:
for g in aagl:
    prokaryotes[f'{g[0]}{g[1]}'] = prokaryotes['Sequence'].apply(lambda x: rel_aa_comp(x,[g[0],g[1]]))
    if abs(prokaryotes[f'{g[0]}{g[1]}'].corr(prokaryotes['meltPoint'])) < 0.2:  #Threshold for correlation
        prokaryotes = prokaryotes.drop(columns = [f'{g[0]}{g[1]}']).reset_index(drop=True)

#### Calculating amino acid percentage inside helices and sheets

In [29]:
import itertools
test = [[1,2,3],[4,5,6],[7,8,9]]
tests = str(list(itertools.chain.from_iterable(test)))
print(type(tests))

<class 'str'>


In [30]:
prokaryotes['helixind'] = prokaryotes['Helix2'].apply(lambda x: list(np.concatenate(np.array(x,dtype=object))) if len(x) > 0 else [])
prokaryotes['helixseq'] = prokaryotes.apply(lambda row: [row['Sequence'][i] for i in row['helixind'] if i < len(row['Sequence'])], axis=1)
for a in aacids:
    prokaryotes[f'{a}helix'] = prokaryotes['helixseq'].apply(lambda x: x.count(a)/len(x) if len(x) > 0 else np.nan) 

In [31]:
def p_val(corr, n, alpha):
    import math
    import scipy.stats as stats
    if math.sqrt((1-(corr**2))/(n-2)) != 0 and n-2 != 0:
        t = (corr)/(math.sqrt((1-(corr**2))/(n-2)))
        p = 1 - stats.t.cdf(t, n-2)
        return [p, p < alpha]

In [32]:
for a in aacids:
    prokaryotes[str(a)] = prokaryotes['Sequence'].apply(lambda x: x.count(a)/len(x) if len(x) > 0 else np.nan)

In [33]:
aat = []
for a in aacids:
    for b in aacids:
        aat.append(f'{a}{b}')

for a in aat:
    prokaryotes[f'{a}motif'] = prokaryotes['Sequence'].apply(lambda x: x.count(a)/len(x) if len(x) > 0 else np.nan)
    if abs(prokaryotes[f'{a}motif'].corr(prokaryotes['meltPoint'])) < 0.1:  #Threshold for correlation
        prokaryotes = prokaryotes.drop(columns = [f'{a}motif']).reset_index(drop=True)

In [34]:
prokaryotes['HydrophobicAA'] = prokaryotes['Sequence'].apply(lambda x: rel_aa_comp(x,['A','V','I','L','M','F','W']))
prokaryotes['ChargedAA'] = prokaryotes['Sequence'].apply(lambda x: rel_aa_comp(x,['R','H','K','D','E']))
prokaryotes['PolarAA'] = prokaryotes['Sequence'].apply(lambda x: rel_aa_comp(x,['N','Q','S','T','Y']))

#### PDB data import from AlphaFold database (chunk takes a long time because ~ 5000 pdb files are downloaded, download link is provided in README, but 1 GB of data)

In [None]:
path: str = "./data" #folder where files are stored
path2 = f"{os.path.abspath(os.path.join(os.getcwd(), '..'))}\data\pdbs"
prokaryotes: pd.DataFrame = pd.read_csv(os.path.join(path, "prokaryotes_unique_prot.csv"), dtype=str)   
prokaryotes2 = prokaryotes.dropna(subset=['AlphaFoldDB'])
tzui = []
for m in range(len(prokaryotes2)):
    if pd.isnull(prokaryotes.iloc[m,23]) == True:
        tzui.append(m)
prokaryotes3 = prokaryotes2.iloc[tzui,:]
inp = prokaryotes3['AlphaFoldDB']
inp.reset_index(drop=True, inplace=True)
for i in range(len(inp)):
    inp[i] = inp[i].replace(';','')
    inp[i] = f'AF-{inp[i]}-F1'
database_version = 'v4'
base_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
target_dir = os.path.join(base_dir, 'data', 'pdbs')
os.makedirs(target_dir, exist_ok=True)
for o in range(len(inp)):
    alphafold_ID = inp[o]
    path_finalpdb = os.path.join(target_dir, f'{alphafold_ID}.pdb')
    url = f'https://alphafold.ebi.ac.uk/files/{alphafold_ID}-model_{database_version}.pdb' 
    os.system(f'curl {url} -o "{path_finalpdb}"')

#### Integration of tertiary structure data

In [35]:
from function import salt_bridge
path = './data/pdbs'
Salt_bridges = salt_bridge(path)

  distance[:,0] = distance[:,0].astype('int')


In [36]:
# here I calculate the amount of brdiges as well as the sum of the bridge distances within the saltbridge dictionary
test = Salt_bridges['P10943']
amount_bridges = {}
for n, k in Salt_bridges.items():
    amount_bridges[n] = np.sum(~np.isnan(k)) - sum(Salt_bridges[n].shape) +2
sum_bridges = {}
for n, k in Salt_bridges.items():
    sum_bridges[n] = np.nansum(k) - np.nansum(Salt_bridges[n][0,:]) - np.nansum(Salt_bridges[n][:,0])


In [37]:
for n in range(len(prokaryotes)):
    if pd.isnull(prokaryotes.loc[n,'AlphaFoldDB']) == False:
        if prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')  in amount_bridges:
            prokaryotes.loc[n, 'A_Salty'] = amount_bridges[prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')]

for n in range(len(prokaryotes)):
    if pd.isnull(prokaryotes.loc[n,'AlphaFoldDB']) == False:
        if prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')  in sum_bridges:
            prokaryotes.loc[n, 'S_Salty'] = sum_bridges[prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')]


In [None]:
#calculate mean salt bridge length per protein and append it to the df
M_Salty = []

for n in range(len(prokaryotes)):
    if pd.isnull(prokaryotes.loc[n, 'S_Salty']) == False and pd.isnull(prokaryotes.loc[n, 'A_Salty']) == False and prokaryotes.loc[n, 'S_Salty'] !=0 and prokaryotes.loc[n, 'A_Salty'] !=0:
        M_Salty = (prokaryotes.loc[n, 'S_Salty']/prokaryotes.loc[n, 'A_Salty'])
        prokaryotes.loc[n, 'M_Salty'] = M_Salty
    elif pd.isnull(prokaryotes.loc[n, 'S_Salty']) == True:
        prokaryotes.loc[n, 'M_Salty'] = np.nan
    elif pd.isnull(prokaryotes.loc[n, 'A_Salty']) == True:
        prokaryotes.loc[n, 'M_Salty'] = np.nan
    elif prokaryotes.loc[n, 'S_Salty'] ==0:
        prokaryotes.loc[n, 'M_Salty'] = np.nan
    elif prokaryotes.loc[n, 'A_Salty'] ==0:
        prokaryotes.loc[n, 'M_Salty'] = np.nan


Hydrophobic patches - integration

In [None]:
from function import VdW_interaction
VdW_clus, VdW_vol = VdW_interaction('./data/pdbs' )


calculate amount of clusters/hydrophobic regions

In [None]:
#calculate the amount of cluster
amount_cluster = {}

for n,k in VdW_clus.items():
        amount_cluster[n] = len(k)

length_cluster = {}
cluster_list = []

#calculate the lenght of the lists i.e. the amount of AS per cluster/hydrophobic patch
def calculate_list_lengths(VdW_clus):
    length_cluster = {}
    for key, value in VdW_clus.items():
        total_length = 0
        
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                    total_length += len(sub_value)
        
        length_cluster[key] = total_length
    
    return length_cluster

length_cluster = calculate_list_lengths(VdW_clus)
                


now concat the dictionaries onto the prokaryote df

In [None]:
for n in range(len(prokaryotes)):
    if pd.isnull(prokaryotes.loc[n,'AlphaFoldDB']) == False:
        if prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')  in amount_cluster:
            prokaryotes.loc[n, 'Amount_Cluster'] = amount_cluster[prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')]

#now divide amount of clusters by amino acid sequenz

for n in range(len(prokaryotes)):
    prokaryotes.loc[n, 'relative_Amount_Cluster'] = prokaryotes.loc[n, 'Amount_Cluster']/prokaryotes.loc[n, 'Length']


for n in range(len(prokaryotes)):
    if pd.isnull(prokaryotes.loc[n,'AlphaFoldDB']) == False and prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')  in length_cluster:
        prokaryotes.loc[n, 'Cluster_length'] = length_cluster[prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')]
    prokaryotes.loc[n, 'relative_Cluster_length'] = prokaryotes.loc[n, 'Cluster_length']/prokaryotes.loc[n, 'Length']


now use VdW_vol to caclulate the relative total overlapping volume and further data

In [None]:
#calculating sum of the overlapping volume
sum_volumes = {}
for n, k in VdW_vol.items():
    sum_volumes[n] = np.nansum(k) - np.nansum(VdW_vol[n][0,:]) - np.nansum(VdW_vol[n][:,0])

print(sum_volumes)

#calculating the amount of overlapping points 
amount_overlapping = {}
for n, k in VdW_vol.items():
    amount_overlapping[n] = np.sum(~np.isnan(k)) - sum(VdW_vol[n].shape) +2

print(amount_overlapping)

#add the values to prokaryotes
for n in range(len(prokaryotes)):
    if pd.isnull(prokaryotes.loc[n,'AlphaFoldDB']) == False and prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')  in sum_bridges:
            prokaryotes.loc[n, 'Overlapping_Volume'] = sum_volumes[prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')]
    prokaryotes.loc[n, 'relative_Overlapping_Volume'] = prokaryotes.loc[n, 'Overlapping_Volume']/prokaryotes.loc[n, 'Length']


for n in range(len(prokaryotes)):
    if pd.isnull(prokaryotes.loc[n,'AlphaFoldDB']) == False and prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')  in amount_bridges:
            prokaryotes.loc[n, 'Overlapping_AS'] = amount_overlapping[prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')]
    prokaryotes.loc[n, 'relative_Overlapping_AS'] = prokaryotes.loc[n, 'Overlapping_AS']/prokaryotes.loc[n, 'Length']


for n in range(len(prokaryotes)):
    prokaryotes.loc[n, 'Overlapping_Volume_by_Overlapping_AS'] = prokaryotes.loc[n, 'Overlapping_Volume']/prokaryotes.loc[n, 'Overlapping_AS']
    prokaryotes.loc[n, 'relative_Overlapping_Volume_by_Overlapping_AS'] = (prokaryotes.loc[n, 'Overlapping_Volume']/prokaryotes.loc[n, 'Overlapping_AS'])/prokaryotes.loc[n, 'Length']


{'C0H3Q1': 343062.5631772673, 'C0H3V2': 360551.1470793815, 'C0H3V8': 152017.0014089145, 'C0H3Y1': 152114.26815110946, 'C0H3Z2': 83557.04121169771, 'C0H405': 121633.35457529366, 'C0H423': 215895.89164380904, 'C0H437': 53258.826126248605, 'C0H453': 116104.35297660701, 'C0SP82': 1220264.9627090078, 'C0SP85': 157349.43756347726, 'C0SP86': 1511758.2617319832, 'C0SP89': 1480232.9625747148, 'C0SP93': 488314.52510102466, 'C0SP94': 829479.1360115968, 'C0SP95': 1435674.957782208, 'C0SP98': 720796.9549783808, 'C0SPA0': 1294443.776143754, 'C0SPB0': 1089578.9915820053, 'C0SPB1': 310748.37453035754, 'C0SPB4': 764212.4658118361, 'C0SPC3': 338405.4727585046, 'G0ZKW2': 1639387.4381474704, 'I1W5V5': 368723.6072985027, 'O05217': 418187.969903914, 'O05220': 423448.5283512976, 'O05227': 518031.6061179957, 'O05234': 171340.4537196954, 'O05236': 430069.5208123765, 'O05239': 790514.9652858633, 'O05240': 809923.901248175, 'O05243': 216205.14769780327, 'O05248': 728439.2392119728, 'O05250': 1465729.5948363394, 

In [None]:
import pandas as pd
from function import p_val
oligolength = 4       #lengths of oligos to test, found for 3,4

sequences = prokaryotes['Sequence']
meltPoints = prokaryotes['meltPoint']

oligo_counts = {}

for seq in sequences:                               # Count oligos in sequences and assign count to dictionary
    for i in range(len(seq) - oligolength + 1):
        oligo = seq[i:i + oligolength]
        if oligo in oligo_counts:
            oligo_counts[oligo] += 1
        else:
            oligo_counts[oligo] = 1

filtered_oligos = {oligo: count for oligo, count in oligo_counts.items() if count > 20} # Filter oligos with count > 20

for oligo in filtered_oligos.keys():
    oligo_series = sequences.apply(lambda x: x.count(oligo))
    oligocorr = oligo_series.corr(meltPoints)
    if abs(oligocorr) > 0.2 and p_val(oligocorr, len(prokaryotes), 0.05)[1] == True:     #Filter only oligos with correlation > 0.2 and p-value < 0.05
        prokaryotes[f'{oligo}motif'] = oligo_series



In [None]:
path: str = './data/'
prokaryotes.to_csv(os.path.join(path, "prokaryotes_322columns.csv"), index=False)


Hydrogen bond processing

In [None]:
from function import H_bond_calc

path = './data/pqrs'
Hydrogen_bonds = H_bond_calc(path)


SyntaxError: f-string: expecting '}' (function.py, line 42)

In [None]:
path: str = './data/'
prokaryotes: pd.DataFrame = pd.read_csv(os.path.join(path, "prokaryotes_322columns.csv"))

  prokaryotes: pd.DataFrame = pd.read_csv(os.path.join(path, "prokaryotes_322columns.csv"))


In [None]:
H_bond_count = {}
for key, value in Hydrogen_bonds.items():
    count = np.count_nonzero(~np.isnan(Hydrogen_bonds[key][1:,1:,0]))
    H_bond_count[str(key)] = int(count)


for n in range(len(prokaryotes)):
    if pd.isnull(prokaryotes.loc[n,'AlphaFoldDB']) == False and prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')  in H_bond_count:
        prokaryotes.loc[n, 'H_Bonds'] = H_bond_count[prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')]
        prokaryotes.loc[n, 'relative_H_bond_count'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'Length']
        prokaryotes.loc[n, 'H_bond_count_by_Asalty'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'A_Salty']
        prokaryotes.loc[n, 'relative_H_bond_by_Ssalty'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'S_Salty']
        prokaryotes.loc[n, 'relative_H_bond_by_overlapping_as'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'Overlapping_AS']


  prokaryotes.loc[n, 'H_bond_count_by_Asalty'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'A_Salty']
  prokaryotes.loc[n, 'relative_H_bond_by_Ssalty'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'S_Salty']
  prokaryotes.loc[n, 'H_bond_count_by_Asalty'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'A_Salty']
  prokaryotes.loc[n, 'relative_H_bond_by_Ssalty'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'S_Salty']
  prokaryotes.loc[n, 'H_bond_count_by_Asalty'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'A_Salty']
  prokaryotes.loc[n, 'relative_H_bond_by_Ssalty'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'S_Salty']
  prokaryotes.loc[n, 'H_bond_count_by_Asalty'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'A_Salty']
  prokaryotes.loc[n, 'relative_H_bond_by_Ssalty'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'S_Salty']
  prokaryotes.loc[n, 'H_bond_count_by_Asalty'] = prokaryotes.loc[n, 'H_Bonds']/prokaryotes.loc[n, 'A_Salty']
  proka

In [None]:
from function import SASA_calc
path = './data/pdbs'
Surface_area = SASA_calc(path)


KeyboardInterrupt: 

In [None]:
for n in range(len(prokaryotes)):
    if pd.isnull(prokaryotes.loc[n,'AlphaFoldDB']) == False and prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')  in Surface_area:
        prokaryotes.loc[n, 'Surface_Area'] = Surface_area[prokaryotes.loc[n,'AlphaFoldDB'].replace(';','')]

In [None]:
path: str = './data/'
prokaryotes.to_csv(os.path.join(path, 'prokaryotes_323columns.csv'))

In [None]:
#Data preprocessing including meltPoint and addition of Sheet amino acid percentages and fix of helix amino acid percentages because S4pred counts from 1, python from 0
while os.path.basename(os.getcwd()) != 'topic04_02':
    os.chdir('..')
    print(os.getcwd())
path = './data'
prokaryotes = pd.read_csv(os.path.join(path, "prokaryotes_323columns.csv"))
droplist = [0,1,2,3,4,5,6,7,8,10,11,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,38,43,-1,-2,-3,-4,-5,-6,-12,-13,-14,-15,-16,-17,-18,-19,20,-21,-22,-23,-24]
col = prokaryotes.columns
Sheetind = pd.Series(np.zeros(prokaryotes.shape[0]))
Sheetind = prokaryotes['Sheet2'].apply(lambda x: list(map(int, re.findall(r'\d+', x) )) if type(x) == str else x)
Sheetseq = pd.Series(np.zeros(prokaryotes.shape[0]))
for n in range(len(Sheetseq)):
    Sheetseq[n] = np.array(list(prokaryotes.loc[n,'Sequence']))[Sheetind[n]]
prokaryotes = prokaryotes.drop(columns = ['helixind','helixseq'])
aacid = ['A', 'V', 'L', 'I', 'P', 'F', 'W', 'M', 'G', 'S', 'C', 'T', 'Y', 'N', 'Q', 'D', 'E', 'K', 'R', 'H']
for a in aacid:
    prokaryotes.drop(columns = f'{a}helix', inplace = True)
helixind = pd.Series(np.zeros(prokaryotes.shape[0]))
helixind = prokaryotes['Helix2'].apply(lambda x: np.array(list(map(int, re.findall(r'\d+', x) ))) -1 if type(x) == str else x)
for n in range(len(helixind)):
    if len(helixind[n]) != 0:
        prokaryotes.at[n,'helixseq'] = np.array(list(prokaryotes.loc[n,'Sequence']))[helixind[n]]
for a in aacid:
    prokaryotes[f'{a}helix'] = prokaryotes['helixseq'].apply(lambda x: list(x).count(a)/len(x) if type(x) == np.ndarray and np.ndim(x)!=0 else np.nan)
Sheetind = pd.Series(np.zeros(prokaryotes.shape[0]))
Sheetind = prokaryotes['Sheet2'].apply(lambda x: np.array(list(map(int, re.findall(r'\d+', x) ))) -1 if type(x) == str else x)
for n in range(len(Sheetseq)):
    if len(Sheetind[n])!=0:
        prokaryotes.at[n,'sheetseq'] = np.array(list(prokaryotes.loc[n,'Sequence']))[Sheetind[n]]
for a in aacid:
    prokaryotes[f'{a}sheet'] = prokaryotes['sheetseq'].apply(lambda x: list(x).count(a)/len(x) if type(x) == np.ndarray and np.ndim(x)!=0  and len(x) != 0 else np.nan)
if 'Unnamed: 0' in prokaryotes.columns:
    prokaryotes.drop(columns = ['Unnamed: 0.1','Unnamed: 0'], inplace = True)
prokaryotes.to_csv(os.path.join(path, "prokaryotes_348columns.csv"), index = False)

prokaryotes_348columns.csv was the final dataframe used in everything else. It includes 348 features of 6558 proteins and also other information like UniprotIDs and other miscellaneous things.