In [351]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import Bio
import statsmodels.api as sm
import re
import ast

In [352]:
path: str = './data/'
prokaryotes: pd.DataFrame = pd.read_csv(os.path.join(path, "prokaryotes_sec_structure.csv"))

In [353]:
prokaryotes = prokaryotes.dropna(subset=['meltPoint'])

In [354]:
secs = ['Helix1','Turn1','Sheet1','Helix2','Sheet2']
if isinstance(prokaryotes.iloc[0,28],str):
    for s in secs:
        prokaryotes[s] = prokaryotes[s].apply(lambda x: ast.literal_eval(x) if pd.isnull(x)==False else x)

Adding columns for total number of helices, sheets and coils

In [355]:
for s in secs:
    prokaryotes[f'{s}count'] = prokaryotes[s].apply(lambda x: len(x) if isinstance(x, list) else np.nan)

Adding columns for relative Helix and Sheet abundance

In [356]:
a = np.array(prokaryotes['Length'])
for s in secs:
    prokaryotes[f'{s}perc'] = prokaryotes[s].apply(lambda x: pd.Series(x).map(len).sum() if isinstance(x, list) else np.nan)
    prokaryotes[f'{s}perc'] = np.array(prokaryotes[f'{s}perc'])/a

Adding columns for average Helix and Sheet length

In [357]:
for s in secs:
    prokaryotes[f'{s}avg'] = prokaryotes[s].apply(lambda x: np.array([len(lst) for lst in np.array(x,dtype=object)]).mean() if isinstance(x, list) and len(x)>0 else np.nan)

Adding column for relative fraction of secondary structures (Helix and Beta sheet combined)

In [358]:
prokaryotes['secstr1'] = np.array(prokaryotes['Helix1perc']) + np.array(prokaryotes['Sheet1perc'])
prokaryotes['secstr2'] = np.array(prokaryotes['Helix2perc']) + np.array(prokaryotes['Sheet2perc'])

In [320]:
print(type(prokaryotes['meltPoint'][0]))

<class 'numpy.float64'>


In [360]:
lower_threshold = prokaryotes['meltPoint'].quantile(0.1)
upper_threshold = prokaryotes['meltPoint'].quantile(0.9)
quantiles_prokaryotes = prokaryotes[(prokaryotes['meltPoint'] <= lower_threshold) | (prokaryotes['meltPoint'] >= upper_threshold)].reset_index()

In [97]:
quantiles_prokaryotes_temp_NaN.corr(method='spearman',numeric_only=True)['meltPoint']

Length        -0.076928
temperature    0.828756
fold_change    0.084312
meltPoint      1.000000
auc            0.907914
Helix1count    0.307083
Turn1count     0.026546
Sheet1count    0.210829
Helix2count   -0.080430
Sheet2count    0.053873
Helix1perc     0.105795
Turn1perc     -0.151912
Sheet1perc     0.111733
Helix2perc    -0.111735
Sheet2perc     0.130885
Helix1avg     -0.154751
Turn1avg       0.020562
Sheet1avg     -0.020224
Helix2avg     -0.095630
Sheet2avg     -0.017054
secstr1        0.238029
secstr2       -0.070365
Name: meltPoint, dtype: float64

In [361]:
aacids = ['A', 'V', 'I', 'L', 'M', 'F', 'W','N', 'Q', 'S', 'T', 'Y','D', 'E','R', 'H', 'K', 'C', 'P', 'G' ]
from function import rel_aa_comp
aagl = []
for n in aacids:
    for m in aacids:
        if n != m and [m,n] not in aagl:
            aagl.append([n,m])

In [370]:
for g in aagl:
    prokaryotes[f'{g[0]}{g[1]}'] = prokaryotes['Sequence'].apply(lambda x: rel_aa_comp(x,[g[0],g[1]]))
    if abs(prokaryotes[f'{g[0]}{g[1]}'].corr(prokaryotes['meltPoint'])) < 0.4:
        prokaryotes = prokaryotes.drop(columns = [f'{g[0]}{g[1]}']).reset_index(drop=True)

In [372]:
correlation = prokaryotes.corr(method='spearman',numeric_only=True)['meltPoint']

Calculating amino acid percentage in (helices and sheets)

In [168]:
import itertools
test = [[1,2,3],[4,5,6],[7,8,9]]
tests = str(list(itertools.chain.from_iterable(test)))
print(type(tests))

<class 'str'>


In [381]:
a = np.array(prokaryotes['Helix2'][0],dtype=object)
A = list(np.concatenate(a))
A

[18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 106,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140]

In [386]:
data = {
    'indices': [[4, 5, 6, 11, 12], [0, 1, 2], [10, 13, 14, 15]],
    'letters': [['a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'i', 'o', 'u', 'z', 't', 'r', 'e', 'w', 'q']] * 3
}
df = pd.DataFrame(data)

# Create the new column by selecting elements from 'letters' based on 'indices'
df['selected_letters'] = df.apply(lambda row: [row['letters'][i] for i in row['indices']], axis=1)

# Display the result
print(df)

             indices                                            letters  \
0  [4, 5, 6, 11, 12]  [a, s, d, f, g, h, j, k, i, o, u, z, t, r, e, ...   
1          [0, 1, 2]  [a, s, d, f, g, h, j, k, i, o, u, z, t, r, e, ...   
2   [10, 13, 14, 15]  [a, s, d, f, g, h, j, k, i, o, u, z, t, r, e, ...   

  selected_letters  
0  [g, h, j, z, t]  
1        [a, s, d]  
2     [u, r, e, w]  


In [389]:
prokaryotes['helixind'] = prokaryotes['Helix2'].apply(lambda x: list(np.concatenate(np.array(x,dtype=object))) if len(x) > 0 else [])
prokaryotes['helixseq'] = prokaryotes.apply(lambda row: [row['Sequence'][i] for i in row['helixind'] if i < len(row['Sequence'])], axis=1)

In [189]:
for a in aacids:
    for y in range(len(prokaryotes)):
        helixind = list(itertools.chain.from_iterable(prokaryotes.loc[y,'Helix2']))
        helixseq = ''
        for u in helixind:
            if u < len(prokaryotes.loc[y,'Sequence']):
                helixseq += prokaryotes.loc[y,'Sequence'][u]
        if len(prokaryotes.loc[y,'Helix2']) != 0:
            prokaryotes.loc[y,f'{a}helixcomp'] = rel_aa_comp(helixseq,a)

  prokaryotes.loc[y,f'{a}helixcomp'] = rel_aa_comp(helixseq,a)
  prokaryotes.loc[y,f'{a}helixcomp'] = rel_aa_comp(helixseq,a)
  prokaryotes.loc[y,f'{a}helixcomp'] = rel_aa_comp(helixseq,a)
  prokaryotes.loc[y,f'{a}helixcomp'] = rel_aa_comp(helixseq,a)
  prokaryotes.loc[y,f'{a}helixcomp'] = rel_aa_comp(helixseq,a)
  prokaryotes.loc[y,f'{a}helixcomp'] = rel_aa_comp(helixseq,a)
  prokaryotes.loc[y,f'{a}helixcomp'] = rel_aa_comp(helixseq,a)
  prokaryotes.loc[y,f'{a}helixcomp'] = rel_aa_comp(helixseq,a)


In [212]:
b = prokaryotes.corr(method='pearson',numeric_only=True)['meltPoint']

In [195]:
def p_val(corr, n, alpha):
    import math
    import scipy.stats as stats
    if math.sqrt((1-(corr**2))/(n-2)) != 0 and n-2 != 0:
        t = (corr)/(math.sqrt((1-(corr**2))/(n-2)))
        p = 1 - stats.t.cdf(t, n-2)
        return [p, p < alpha]

In [213]:
cp = b.apply(lambda x: p_val(x, 12500, 0.05))

In [223]:
WICHTIG = []
for j in range(len(b)):
    if cp[j] is not None and cp[j][1] == True and b[j] > 0.4:
        WICHTIG.append(b.index[j])
        

  if cp[j] is not None and cp[j][1] == True and b[j] > 0.4:


In [222]:
WICHTIG

['temperature',
 'fold_change',
 'AL',
 'AR',
 'AP',
 'VL',
 'VR',
 'VP',
 'LF',
 'LW',
 'LY',
 'LE',
 'LR',
 'LC',
 'LP',
 'LG',
 'MR',
 'FR',
 'WR',
 'WP',
 'YR',
 'YP',
 'YG',
 'ER',
 'EP',
 'EG',
 'RH',
 'RC',
 'RP',
 'RG',
 'PG',
 'Rhelixcomp']