In [145]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import re
import ast

In [146]:
path: str = './data/'
prokaryotes: pd.DataFrame = pd.read_csv(os.path.join(path, "prokaryotes_sec_structure.csv"))

In [147]:
prokaryotes = prokaryotes[prokaryotes['run_name'].str.contains('lysate', case=True)].dropna(subset=['meltPoint'])

In [148]:
secs = ['Helix1','Turn1','Sheet1','Helix2','Sheet2']
if isinstance(prokaryotes.iloc[0,28],str):
    for s in secs:
        prokaryotes[s] = prokaryotes[s].apply(lambda x: ast.literal_eval(x) if pd.isnull(x)==False else x)

Adding columns for total number of helices, sheets and coils

In [149]:
for s in secs:
    prokaryotes[f'{s}count'] = prokaryotes[s].apply(lambda x: len(x) if isinstance(x, list) else np.nan)

Adding columns for relative Helix and Sheet abundance

In [150]:
a = np.array(prokaryotes['Length'])
for s in secs:
    prokaryotes[f'{s}perc'] = prokaryotes[s].apply(lambda x: pd.Series(x).map(len).sum() if isinstance(x, list) else np.nan)
    prokaryotes[f'{s}perc'] = np.array(prokaryotes[f'{s}perc'])/a

Adding columns for average Helix and Sheet length

In [151]:
for s in secs:
    prokaryotes[f'{s}avg'] = prokaryotes[s].apply(lambda x: np.array([len(lst) for lst in np.array(x,dtype=object)]).mean() if isinstance(x, list) and len(x)>0 else np.nan)

Adding column for relative fraction of secondary structures (Helix and Beta sheet combined)

In [152]:
prokaryotes['secstr1'] = np.array(prokaryotes['Helix1perc']) + np.array(prokaryotes['Sheet1perc'])
prokaryotes['secstr2'] = np.array(prokaryotes['Helix2perc']) + np.array(prokaryotes['Sheet2perc'])

In [153]:
aacids = ['A', 'V', 'I', 'L', 'M', 'F', 'W','N', 'Q', 'S', 'T', 'Y','D', 'E','R', 'H', 'K', 'C', 'P', 'G' ]
from function import rel_aa_comp
aagl = []
for n in aacids:
    for m in aacids:
        if n != m and [m,n] not in aagl:
            aagl.append([n,m])

In [154]:
for g in aagl:
    prokaryotes[f'{g[0]}{g[1]}'] = prokaryotes['Sequence'].apply(lambda x: rel_aa_comp(x,[g[0],g[1]]))
    if abs(prokaryotes[f'{g[0]}{g[1]}'].corr(prokaryotes['meltPoint'])) < 0.2:  #Threshold for correlation
        prokaryotes = prokaryotes.drop(columns = [f'{g[0]}{g[1]}']).reset_index(drop=True)

In [155]:
correlation = prokaryotes.corr(method='spearman',numeric_only=True)['meltPoint']

Calculating amino acid percentage in (helices and sheets)

In [156]:
import itertools
test = [[1,2,3],[4,5,6],[7,8,9]]
tests = str(list(itertools.chain.from_iterable(test)))
print(type(tests))

<class 'str'>


In [157]:
prokaryotes['helixind'] = prokaryotes['Helix2'].apply(lambda x: list(np.concatenate(np.array(x,dtype=object))) if len(x) > 0 else [])
prokaryotes['helixseq'] = prokaryotes.apply(lambda row: [row['Sequence'][i] for i in row['helixind'] if i < len(row['Sequence'])], axis=1)
for a in aacids:
    prokaryotes[f'{a}helix'] = prokaryotes['helixseq'].apply(lambda x: x.count(a)/len(x) if len(x) > 0 else np.nan) 

In [158]:
def p_val(corr, n, alpha):
    import math
    import scipy.stats as stats
    if math.sqrt((1-(corr**2))/(n-2)) != 0 and n-2 != 0:
        t = (corr)/(math.sqrt((1-(corr**2))/(n-2)))
        p = 1 - stats.t.cdf(t, n-2)
        return [p, p < alpha]

In [159]:
for a in aacids:
    prokaryotes[str(a)] = prokaryotes['Sequence'].apply(lambda x: x.count(a)/len(x) if len(x) > 0 else np.nan)

In [160]:
from function import pain
quantiles_prokaryotes = pain(data = prokaryotes, by = 'meltPoint', lower = 0.1, upper = 0.9)

In [161]:
quacorr = quantiles_prokaryotes.corr(method='spearman',numeric_only=True)['meltPoint']

In [162]:
aat = []
for a in aacids:
    for b in aacids:
        aat.append(f'{a}{b}')

for a in aat:
    prokaryotes[f'{a}motif'] = prokaryotes['Sequence'].apply(lambda x: x.count(a)/len(x) if len(x) > 0 else np.nan)
    if abs(prokaryotes[f'{a}motif'].corr(prokaryotes['meltPoint'])) < 0.1:  #Threshold for correlation
        prokaryotes = prokaryotes.drop(columns = [f'{a}motif']).reset_index(drop=True)

In [163]:
b = prokaryotes.corr(method='pearson',numeric_only=True)['meltPoint']
cp = b.apply(lambda x: p_val(x, len(prokaryotes), 0.05))
WICHTIG = []
for j in range(len(b)):
    if cp[j] is not None and cp[j][1] == True:
        WICHTIG.append(b.index[j])
len(WICHTIG)

  if cp[j] is not None and cp[j][1] == True:


116