In [15]:
import pandas as pd
import pprint as p
import re

# imports
# Insert the relative 'modules' directory into sys.path at position 1 so the
# files inside it can also be imported by their bare names.
# NOTE(review): this notebook imports the same files as `modules.<name>`, while
# the AttributeError recorded later in the notebook names module 'config'
# (not 'modules.config'). Presumably the package files import each other by
# bare name through this path entry -- confirm, since that would create
# separate module objects for `config` and `modules.config`.
import sys
sys.path.insert(1, 'modules')

from sklearn.utils import resample

In [4]:
# local imports
import modules.config as cfg
import modules.utils as u

import modules.atchley as a
import modules.parser as par
import modules.protein as prot

In [5]:
# Section banner: being the only expression in the cell, this string is echoed
# back as the cell's output.
"""ATCHLEY VALUE PARSING"""

'ATCHLEY VALUE PARSING'

In [12]:
# Read the tab-separated Atchley factor table from the configured input path
# and print the raw frame. The printed factor values use a non-ASCII minus
# sign; they are cleaned and converted to floats in a later cell.
df_atchley = pd.read_csv(cfg.f_in_atchley, sep='\t')
p.pprint(df_atchley)

amino.acid      f1      f2      f3      f4      f5
0           A  −0.591  −1.302  −0.733   1.570  −0.146
1           C  −1.343   0.465  −0.862  −1.020  −0.255
2           D   1.050   0.302  −3.656  −0.259  −3.242
3           E   1.357  −1.453   1.477   0.113  −0.837
4           F  −1.006  −0.590   1.891  −0.397   0.412
5           G  −0.384   1.652   1.330   1.045   2.064
6           H   0.336  −0.417  −1.673  −1.474  −0.078
7           I  −1.239  −0.547   2.131   0.393   0.816
8           K   1.831  −0.561   0.533  −0.277   1.648
9           L  −1.019  −0.987  −1.505   1.266  −0.912
10          M  −0.663  −1.524   2.219  −1.005   1.212
11          N   0.945   0.828   1.299  −0.169   0.933
12          P   0.189   2.081  −1.628   0.421  −1.392
13          Q   0.931  −0.179  −3.005  −0.503  −1.853
14          R   1.538  −0.055   1.502   0.440   2.897
15          S  −0.228   1.399  −4.760   0.670  −2.647
16          T  −0.032   0.326   2.213   0.908   1.313
17          V  −1.337  −0.279  

In [13]:
# Rename the label column to a valid identifier and make it the index, so each
# row is keyed by its one-letter amino-acid code.
# Written without inplace=True and guarded on the index name so the cell can be
# re-run: once 'amino_acid' is the index it is no longer a column, and calling
# set_index('amino_acid') a second time would raise KeyError.
df_atchley = df_atchley.rename(columns={'amino.acid': 'amino_acid'})
if df_atchley.index.name != 'amino_acid':
    df_atchley = df_atchley.set_index('amino_acid')
p.pprint(df_atchley)

f1      f2      f3      f4      f5
amino_acid                                        
A           −0.591  −1.302  −0.733   1.570  −0.146
C           −1.343   0.465  −0.862  −1.020  −0.255
D            1.050   0.302  −3.656  −0.259  −3.242
E            1.357  −1.453   1.477   0.113  −0.837
F           −1.006  −0.590   1.891  −0.397   0.412
G           −0.384   1.652   1.330   1.045   2.064
H            0.336  −0.417  −1.673  −1.474  −0.078
I           −1.239  −0.547   2.131   0.393   0.816
K            1.831  −0.561   0.533  −0.277   1.648
L           −1.019  −0.987  −1.505   1.266  −0.912
M           −0.663  −1.524   2.219  −1.005   1.212
N            0.945   0.828   1.299  −0.169   0.933
P            0.189   2.081  −1.628   0.421  −1.392
Q            0.931  −0.179  −3.005  −0.503  −1.853
R            1.538  −0.055   1.502   0.440   2.897
S           −0.228   1.399  −4.760   0.670  −2.647
T           −0.032   0.326   2.213   0.908   1.313
V           −1.337  −0.279  −0.544   1.242  −1.

In [16]:
# Cleaning values: the factor columns are read as text because negative numbers
# are written with a non-ASCII minus sign. Replace every run of non-ASCII
# characters with an ASCII '-' and convert the columns to float.
#
# The columns are selected explicitly with .loc[:, 'f1':'f5']. The previous
# df_atchley['f1':'f5'] is a ROW slice on the index; it only worked because
# iterating the (empty) result still yields the column names.
# .astype(str) makes the cell safe to re-run after the columns are already float.
factor_columns = df_atchley.loc[:, 'f1':'f5'].columns
for col in factor_columns:
    df_atchley[col] = (
        df_atchley[col]
        .astype(str)
        .str.replace(r'[^\x00-\x7F]+', '-', regex=True)
        .astype(float)
    )

In [17]:
# Build a mapping from each index label (amino-acid letter) to the list of its
# factor values, then hand it to the project's pickle helper in write mode.
atchley_by_residue = df_atchley.transpose()
dict_atchley = atchley_by_residue.to_dict(orient='list')
u.pickle_method(
    filename=cfg.f_out_atchley,
    method='wb',
    context=dict_atchley,
)

In [19]:
# unpickling saved dictionary
# Reads back from the same configured path the previous cell wrote to
# (cfg.f_out_atchley), this time in 'rb' mode, and prints the result as a
# round-trip check.
dict_atchley = u.pickle_method(filename=cfg.f_out_atchley, method='rb')
p.pprint(dict_atchley)

{'A': [-0.591, -1.302, -0.733, 1.57, -0.146],
 'C': [-1.343, 0.465, -0.862, -1.02, -0.255],
 'D': [1.05, 0.302, -3.656, -0.259, -3.242],
 'E': [1.357, -1.453, 1.477, 0.113, -0.837],
 'F': [-1.006, -0.59, 1.891, -0.397, 0.412],
 'G': [-0.384, 1.652, 1.33, 1.045, 2.064],
 'H': [0.336, -0.417, -1.673, -1.474, -0.078],
 'I': [-1.239, -0.547, 2.131, 0.393, 0.816],
 'K': [1.831, -0.561, 0.533, -0.277, 1.648],
 'L': [-1.019, -0.987, -1.505, 1.266, -0.912],
 'M': [-0.663, -1.524, 2.219, -1.005, 1.212],
 'N': [0.945, 0.828, 1.299, -0.169, 0.933],
 'P': [0.189, 2.081, -1.628, 0.421, -1.392],
 'Q': [0.931, -0.179, -3.005, -0.503, -1.853],
 'R': [1.538, -0.055, 1.502, 0.44, 2.897],
 'S': [-0.228, 1.399, -4.76, 0.67, -2.647],
 'T': [-0.032, 0.326, 2.213, 0.908, 1.313],
 'V': [-1.337, -0.279, -0.544, 1.242, -1.262],
 'W': [-0.595, 0.009, 0.672, -2.128, -0.184],
 'Y': [0.26, 0.83, 3.097, -0.838, 1.512]}


In [20]:
# Section banner: being the only expression in the cell, this string is echoed
# back as the cell's output.
"""INITIAL PRE-PROCESSING"""

'INITIAL PRE-PROCESSING'

In [21]:
# Parsing FASTA files
# Each training file is parsed separately; `is_toxic` passes the class flag for
# that file (1 for the toxic file, 0 for the atoxic file).
toxic_proteins = par.parse_fasta(path_fasta=cfg.f_train_toxic_fasta, is_toxic=1)
atoxic_proteins = par.parse_fasta(path_fasta=cfg.f_train_atoxic_fasta, is_toxic=0)

In [22]:
# Sanity check: report how many sequences each FASTA file yielded.
print(
    f'Total toxic sequences: {len(toxic_proteins)}',
    f'Total atoxic sequences: {len(atoxic_proteins)}',
    sep='\n',
)

Total toxic sequences: 6001
Total atoxic sequences: 49764


In [23]:
# CROPPING SEQUENCES (both classes)
# Both lists are passed through par.crop_sequences and rebound to the result.
# NOTE(review): this cell was headed "DOWNSAMPLING ATOXIC SEQUENCES", but the
# downsampling itself is the resample(...) call in a later cell. Judging by the
# "<= cfg.MAX_SEQ_LEN" counts printed afterwards, crop_sequences presumably
# drops sequences longer than cfg.MAX_SEQ_LEN -- confirm in modules/parser.
toxic_proteins = par.crop_sequences(toxic_proteins)
atoxic_proteins = par.crop_sequences(atoxic_proteins)

In [24]:
# Report how many sequences of each class remain after cropping.
# total_toxic_seqs is kept as a variable because the downsampling cell uses it
# as the target sample size.
total_toxic_seqs = len(toxic_proteins)
print(f'Total toxic sequences ( <= {cfg.MAX_SEQ_LEN}): {total_toxic_seqs}')
# Label fixed: this line reports the atoxic list (it previously said "toxic").
print(f'Total atoxic sequences ( <= {cfg.MAX_SEQ_LEN}): {len(atoxic_proteins)}')

Total toxic sequences ( <= 500): 5896
Total toxic sequences ( <= 500): 40282


In [27]:
# Downsample the atoxic list, without replacement, to the size of the toxic
# list (total_toxic_seqs), then confirm that both lists have the same length.
atoxic_proteins = resample(
    atoxic_proteins,
    replace=False,
    n_samples=total_toxic_seqs,
    random_state=cfg.RANDOM_SEED,
)

print(f'Total protein sequences in atoxic list post-downsampling: {len(atoxic_proteins)}')
if len(atoxic_proteins) == len(toxic_proteins):
    print('Lists are EQUAL length')
else:
    print("Lists are NOT equal length")

Total protein sequences in atoxic list post-downsampling: 5896
Lists are EQUAL length


In [29]:
# COMBINING & APPENDING SEQUENCES
# Join the toxic list and the downsampled atoxic list with `+`, then pass the
# combined collection to par.append_proteins.
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: module 'config' has no attribute 'DICT_ATCHLEY'".
# Presumably append_proteins reads DICT_ATCHLEY from the config module --
# confirm and define it there before relying on the cells below.

proteins = toxic_proteins + atoxic_proteins
par.append_proteins(proteins)

AttributeError: module 'config' has no attribute 'DICT_ATCHLEY'

In [None]:
# Report the size of the combined list and show the matrix_diff attribute of
# its first element.
# NOTE(review): matrix_diff is presumably populated by par.append_proteins -- confirm.
print(f'Total overall protein sequences: {len(proteins)}')
print(proteins[0].matrix_diff)

In [None]:
# Convert the combined protein list into df_proteins (presumably a DataFrame,
# given the .info() and ['toxic'].value_counts() calls in the following cells).
# NOTE(review): `hf` is not imported in any cell shown in this notebook (only
# cfg, u, a, par and prot are), so this line would raise NameError on a fresh
# kernel -- confirm which module provides proteins_to_df and import it.
df_proteins = hf.proteins_to_df(proteins)

In [None]:
# Print a heading, then the summary that df_proteins.info() writes out.
print('\nProtein training set info:')
df_proteins.info()

In [None]:
# Show the class balance of the combined frame: a heading explaining the label
# encoding, followed by the per-class row counts of the 'toxic' column.
print('\nChecking value counts for each class in df combined:\n1 == toxic\n0 == atoxic\n----------')
toxic_class_counts = df_proteins['toxic'].value_counts()
print(toxic_class_counts)