In [3]:
import os
import sys
import json
import pandas as pd
sys.path.append('../src/')
from configs import *
from feature_engineering import *

from matplotlib import rcParams
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
rcParams.update(fig_params)

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Settings

In [4]:
# Root folders, relative to the notebook's working directory.
data_folder = os.path.join('..', 'data')
features_folder = os.path.join('..', 'features')

# FASTA sequence files (training set first, independent/test set second).
train_fasta_file, test_fasta_file = (
    os.path.join(data_folder, fasta_name)
    for fasta_name in ('MLCPP2_Training.fasta', 'MLCPP2_Independent.fasta')
)

# Target-value CSV files, in the same train/test order as the FASTA files.
train_targets_file, test_targets_file = (
    os.path.join(data_folder, targets_name)
    for targets_name in ('MLCPP2_TrainingCPPvalues.csv', 'MLCPP2_IndependentCPPvalues.csv')
)

## Feature engineering

### Structure-based descriptors (Physicochemical properties)

In [5]:
# Build the base frame from the MLCPP2 FASTA and target files
# (presumably training and independent sets combined -- confirm in feature_engineering).
comb = create_comb_MLCPP2(
    train_fasta_file,
    train_targets_file,
    test_fasta_file,
    test_targets_file,
)

In [6]:
# Compute the structure-based descriptors for the base frame; save_folder points
# at the features folder and return_df=True asks for the resulting frame back.
comb = add_structure_based_descriptors(
    df=comb,
    dataset_name='comb',
    save_folder=features_folder,
    return_df=True,
)

### Sequence-based descriptors

In [7]:
# Rebuild comb from the raw files so the sequence-based step starts from the
# frame returned by create_comb_MLCPP2, not from the structure-based step's output.
comb = create_comb_MLCPP2(
    train_fasta_file,
    train_targets_file,
    test_fasta_file,
    test_targets_file,
)

In [8]:
# Descriptor-parameter file; built from data_folder so it follows the Settings
# cell instead of repeating the '..'/'data' path by hand.
parameters_setting_file = os.path.join(data_folder, 'Protein_parameters_setting.json')

# Descriptors computed on the original FASTA files.
variable_length_descriptors = [
    'AAC',
    'CKSAAP type 1',
    'TPC type 1',  # slow: > 20 min (about 1603 s in the recorded run)
    'DPC type 1',
    'DDE',
    'GAAC',
    'CKSAAGP type 1',
    'GDPC type 1',
    'GTPC type 1',
    'Moran',
    'Geary',
    'NMBroto',
    'CTDC',
    'CTDT',
    'CTDD',
    'CTriad',
    'KSCTriad',
    'SOCNumber',
    'QSOrder',
    'PAAC',
    'APAAC',
    'ASDC',
    'AC',
    'CC',
    'ACC',
]

# Compute the listed sequence-based descriptors for comb; save_folder points at
# the features folder and return_df=True asks for the resulting frame back.
comb = add_sequence_based_descriptors(
    df=comb,
    train_fasta_file=train_fasta_file,
    test_fasta_file=test_fasta_file,
    parameters_setting_file=parameters_setting_file,
    descriptors=variable_length_descriptors,
    dataset_name='comb',
    save_folder=features_folder,
    return_df=True,
)

File imported successfully.
File imported successfully.
AAC: done in 0.150 s
File imported successfully.
File imported successfully.
CKSAAP type 1: done in 5.735 s
File imported successfully.
File imported successfully.
TPC type 1: done in 1602.966 s
File imported successfully.
File imported successfully.
DPC type 1: done in 5.196 s
File imported successfully.
File imported successfully.
DDE: done in 6.581 s
File imported successfully.
File imported successfully.
GAAC: done in 0.067 s
File imported successfully.
File imported successfully.
CKSAAGP type 1: done in 1.559 s
File imported successfully.
File imported successfully.
GDPC type 1: done in 0.149 s
File imported successfully.
File imported successfully.
GTPC type 1: done in 0.479 s
File imported successfully.
File imported successfully.
Moran: done in 3.521 s
File imported successfully.
File imported successfully.
Geary: done in 3.550 s
File imported successfully.
File imported successfully.
NMBroto: done in 2.219 s
File imported

In [9]:
# Start again from the raw frame so this cell does not depend on what an
# earlier cell left in comb.
comb = create_comb_MLCPP2(
    train_fasta_file,
    train_targets_file,
    test_fasta_file,
    test_targets_file,
)

# Descriptor-parameter file; built from data_folder so it follows the Settings
# cell instead of repeating the '..'/'data' path by hand.
parameters_setting_file = os.path.join(data_folder, 'Protein_parameters_setting.json')

# These descriptors are run on the *_equal_length FASTA files
# (presumably because they require sequences of equal length -- confirm).
equal_length_descriptors = [
    'EAAC',
    'EGAAC',
    'AAIndex',
    'BLOSUM62',
    'ZScale',
]

comb = add_sequence_based_descriptors(
    df=comb,
    train_fasta_file=os.path.join(data_folder, 'MLCPP2_Training_equal_length.fasta'),
    test_fasta_file=os.path.join(data_folder, 'MLCPP2_Independent_equal_length.fasta'),
    parameters_setting_file=parameters_setting_file,
    descriptors=equal_length_descriptors,
    dataset_name='comb',
    save_folder=features_folder,
    return_df=True,
)

File imported successfully.
File imported successfully.
EAAC: done in 0.219 s
File imported successfully.
File imported successfully.
EGAAC: done in 0.262 s
File imported successfully.
File imported successfully.
AAIndex: done in 0.116 s
File imported successfully.
File imported successfully.
BLOSUM62: done in 0.251 s
File imported successfully.
File imported successfully.
ZScale: done in 0.100 s
