In [29]:
import os
import sys
import json
import pandas as pd
sys.path.append('../src/')
from configs import *
from utils_feature_engineering import *

from matplotlib import rcParams
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# Apply the project's matplotlib defaults. `fig_params` is not defined in this
# cell; presumably it comes from one of the wildcard imports above — TODO confirm.
rcParams.update(fig_params)

# Silence every Python warning for the rest of the session: no category or
# module filter is passed, so nothing is exempt.
import warnings
warnings.filterwarnings('ignore')

# Enable IPython's autoreload in mode 2, which re-imports modules before each
# cell executes, so edits to the helper modules take effect without a restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Settings

In [30]:
# Project directories, addressed relative to the current working directory.
data_folder, features_folder = (
    os.path.join('..', folder_name) for folder_name in ('data', 'features')
)

# FASTA inputs for the training set and the independent (test) set.
train_fasta_file, test_fasta_file = (
    os.path.join(data_folder, fasta_name)
    for fasta_name in ('MLCPP2_Training.fasta', 'MLCPP2_Independent.fasta')
)

# CSV files holding the target values that accompany each FASTA file.
train_targets_file, test_targets_file = (
    os.path.join(data_folder, targets_name)
    for targets_name in ('MLCPP2_TrainingCPPvalues.csv', 'MLCPP2_IndependentCPPvalues.csv')
)

## Parameters

In [31]:
# Parse the descriptor parameter settings from the JSON file in the data folder.
# The encoding is pinned to UTF-8 so the read does not depend on the platform's
# locale default (which is not UTF-8 on every system).
# NOTE(review): `params` is not referenced by the cells visible in this part of
# the notebook; the descriptor calls are given the file path instead. Confirm it
# is still needed.
with open(os.path.join(data_folder, 'Protein_parameters_setting.json'), 'r', encoding='utf-8') as f:
    params = json.load(f)

## Feature engineering

### Structure-based descriptors (Physicochemical properties)

In [17]:
# Build the starting table from the training and independent FASTA/target files.
# (The preview below shows ID, Sequence, CPP and Dataset columns.)
comb = create_comb(
    train_fasta_file,
    train_targets_file,
    test_fasta_file,
    test_targets_file,
)

In [18]:
# Compute the structure-based descriptors and keep the returned frame.
# `comb` is both the input and the result, so re-running this cell alone feeds
# the already-augmented frame back in; re-run the `create_comb` cell first.
comb = add_structure_based_descriptors(
    df=comb,
    dataset_name='comb',
    save_folder=features_folder,
    return_df=True,
)

In [20]:
# Print the (rows, columns) shape, then display the first five rows.
print(comb.shape)
comb.head(n=5)

(3487, 20)


Unnamed: 0,ID,Sequence,CPP,Dataset,MW,NRB,tPSA,Fsp3,cLogP,NAR,HBD,HBA,HBD_minus_HBA,NPA,NG,NetC,IsoP,Hydrophobicity,Aromaticity,Length
0,Positive_1,GRKGKHKRKKLP,1,train,1432.791,54,657.21,0.730159,-6.31646,1,24,21,3,8,2,7.029472,12.572391,-2.525,0.0,12
1,Positive_2,KFLNRFWHWLQLKPGQPMY,1,train,2489.986,76,893.24,0.508197,-1.56493,8,30,29,1,7,1,3.02957,10.901836,-0.726316,0.263158,19
2,Positive_3,RRRRRRRRRGPGVTWTPQAWFQWV,1,train,3165.69,99,1446.15,0.539007,-13.76747,7,57,38,19,12,9,8.994831,13.403098,-1.775,0.166667,24
3,Positive_4,AEKVDPVKLNLTLSAAAEALTGLGDK,1,train,2625.02,89,1113.07,0.730435,-11.3754,0,37,38,-1,5,0,-1.005546,4.544657,0.1,0.0,26
4,Positive_5,GLKKLARLFHKLLKLGC,1,train,1938.512,70,723.58,0.703297,-2.36803,2,27,25,2,6,1,4.887885,11.249135,0.341176,0.058824,17


### Sequence-based descriptors

In [32]:
# Start again from a fresh table: `comb` is rebuilt here, so the structure-based
# columns computed in the previous section are not carried into this one.
comb = create_comb(
    train_fasta_file,
    train_targets_file,
    test_fasta_file,
    test_targets_file,
)

In [33]:
# Compute the sequence-based descriptors that work on the original FASTA files,
# and keep the returned frame.
# NOTE(review): the recorded output of this cell shows no 'TPC type 1' timing
# line between 'CKSAAP type 1' and 'DPC type 1' — confirm that descriptor is
# actually computed when the cell runs.
comb = add_sequence_based_descriptors(
    df=comb,
    train_fasta_file=train_fasta_file,
    test_fasta_file=test_fasta_file,
    parameters_setting_file=os.path.join(data_folder, 'Protein_parameters_setting.json'),
    descriptors=[
        'AAC',
        'CKSAAP type 1',
        'TPC type 1',  # slow: this computation requires more than 20 minutes
        'DPC type 1',
        'DDE',
        'GAAC',
        'CKSAAGP type 1',
        'GDPC type 1',
        'GTPC type 1',
        'Moran',
        'Geary',
        'NMBroto',
        'CTDC',
        'CTDT',
        'CTDD',
        'CTriad',
        'KSCTriad',
        'SOCNumber',
        'QSOrder',
        'PAAC',
        'APAAC',
        'ASDC',
        'AC',
        'CC',
        'ACC',
    ],
    dataset_name='comb',
    save_folder=features_folder,
    return_df=True,
)

File imported successfully.
File imported successfully.
AAC: done in 0.144 s
File imported successfully.
File imported successfully.
CKSAAP type 1: done in 5.784 s
File imported successfully.
File imported successfully.
DPC type 1: done in 4.968 s
File imported successfully.
File imported successfully.
DDE: done in 6.391 s
File imported successfully.
File imported successfully.
GAAC: done in 0.069 s
File imported successfully.
File imported successfully.
CKSAAGP type 1: done in 1.582 s
File imported successfully.
File imported successfully.
GDPC type 1: done in 0.150 s
File imported successfully.
File imported successfully.
GTPC type 1: done in 0.481 s
File imported successfully.
File imported successfully.
Moran: done in 3.583 s
File imported successfully.
File imported successfully.
Geary: done in 3.674 s
File imported successfully.
File imported successfully.
NMBroto: done in 2.290 s
File imported successfully.
File imported successfully.
CTDC: done in 0.318 s
File imported successf

In [34]:
# Print the (rows, columns) shape, then display the first five rows.
print(comb.shape)
comb.head(n=5)

(3487, 5543)


Unnamed: 0,ID,Sequence,CPP,Dataset,AAC_A,AAC_C,AAC_D,AAC_E,AAC_F,AAC_G,...,ACC_BEGF750102_BHAR880101_lag.3,ACC_BHAR880101_BEGF750102_lag.1,ACC_BHAR880101_BEGF750102_lag.2,ACC_BHAR880101_BEGF750102_lag.3,ACC_BEGF750103_BHAR880101_lag.1,ACC_BEGF750103_BHAR880101_lag.2,ACC_BEGF750103_BHAR880101_lag.3,ACC_BHAR880101_BEGF750103_lag.1,ACC_BHAR880101_BEGF750103_lag.2,ACC_BHAR880101_BEGF750103_lag.3
0,Positive_1,GRKGKHKRKKLP,1,train,0.0,0.0,0.0,0.0,0.0,0.166667,...,-0.034404,0.072149,-0.067206,0.113576,0.029151,-0.131319,0.067029,-0.086821,-0.028583,-0.018773
1,Positive_2,KFLNRFWHWLQLKPGQPMY,1,train,0.0,0.0,0.0,0.0,0.105263,0.052632,...,0.005998,0.035633,0.033473,-0.054483,-0.016844,-0.061969,-0.335118,-0.121722,0.049401,0.279786
2,Positive_3,RRRRRRRRRGPGVTWTPQAWFQWV,1,train,0.041667,0.0,0.0,0.0,0.041667,0.083333,...,0.062613,-0.387712,-0.035798,-0.148861,0.287498,0.609847,-0.080366,0.328426,-0.000529,0.33326
3,Positive_4,AEKVDPVKLNLTLSAAAEALTGLGDK,1,train,0.192308,0.0,0.076923,0.076923,0.0,0.076923,...,0.026399,0.140638,0.02104,-0.067608,-0.115566,0.170423,-0.043706,-0.079227,0.07228,0.20074
4,Positive_5,GLKKLARLFHKLLKLGC,1,train,0.058824,0.058824,0.0,0.0,0.058824,0.117647,...,-0.162409,0.150431,0.157102,-0.339355,-0.269119,-0.007127,0.134447,-0.184672,-0.11532,0.265157


In [36]:
# Rebuild the base table, then compute the descriptors that are fed the
# equal-length FASTA variants instead of the original files.
# NOTE(review): this call uses the same dataset_name and save_folder as the
# previous descriptor cell — confirm the saved outputs do not overwrite each other.
comb = create_comb(
    train_fasta_file,
    train_targets_file,
    test_fasta_file,
    test_targets_file,
)

comb = add_sequence_based_descriptors(
    df=comb,
    train_fasta_file=os.path.join(data_folder, 'MLCPP2_Training_equal_length.fasta'),
    test_fasta_file=os.path.join(data_folder, 'MLCPP2_Independent_equal_length.fasta'),
    parameters_setting_file=os.path.join(data_folder, 'Protein_parameters_setting.json'),
    descriptors=[
        'EAAC',
        'EGAAC',
        'AAIndex',
        'BLOSUM62',
        'ZScale',
    ],
    dataset_name='comb',
    save_folder=features_folder,
    return_df=True,
)

File imported successfully.
File imported successfully.
EAAC: done in 0.216 s
File imported successfully.
File imported successfully.
EGAAC: done in 0.188 s
File imported successfully.
File imported successfully.
AAIndex: done in 0.116 s
File imported successfully.
File imported successfully.
BLOSUM62: done in 0.358 s
File imported successfully.
File imported successfully.
ZScale: done in 0.101 s


In [37]:
# Print the (rows, columns) shape, then display the first five rows.
print(comb.shape)
comb.head(n=5)

(3487, 282)


Unnamed: 0,ID,Sequence,CPP,Dataset,EAAC_SW.1.A,EAAC_SW.1.R,EAAC_SW.1.N,EAAC_SW.1.D,EAAC_SW.1.C,EAAC_SW.1.Q,...,ZScale_p5.z1,ZScale_p5.z2,ZScale_p5.z3,ZScale_p5.z4,ZScale_p5.z5,ZScale_p6.z1,ZScale_p6.z2,ZScale_p6.z3,ZScale_p6.z4,ZScale_p6.z5
0,Positive_1,GRKGKHKRKKLP,1,train,0.0,0.25,0.0,0.0,0.0,0.0,...,-4.28,-1.3,-1.49,-0.72,0.84,-1.66,0.27,1.84,0.7,2.0
1,Positive_2,KFLNRFWHWLQLKPGQPMY,1,train,0.0,0.0,0.0,0.0,0.0,0.25,...,-2.85,-0.22,0.47,1.94,-0.98,-2.54,2.44,0.43,0.04,-1.47
2,Positive_3,RRRRRRRRRGPGVTWTPQAWFQWV,1,train,0.0,0.25,0.0,0.0,0.0,0.25,...,-4.36,3.94,0.59,3.44,-1.59,-2.59,-2.64,-1.54,-0.85,-0.02
3,Positive_4,AEKVDPVKLNLTLSAAAEALTGLGDK,1,train,0.25,0.0,0.0,0.0,0.0,0.0,...,3.98,0.93,1.93,-2.46,0.75,2.29,0.89,-2.49,1.49,0.31
4,Positive_5,GLKKLARLFHKLLKLGC,1,train,0.0,0.0,0.0,0.0,0.0,0.0,...,2.05,-4.06,0.36,-0.82,-0.38,0.84,-1.67,3.71,0.18,-2.65
