In [1]:
def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Installed below as a replacement for ``warnings.warn`` so that warnings
    raised through that function are silently discarded.
    """
    return None
import warnings
warnings.warn = warn

In [1]:
import os

In [2]:
# data reading and parsing
import pandas as pd

In [16]:
# feature encoding
import ifeatpro.features as ipro
import pssmpro.features as ppro
import ngrampro as npro
from scipy import sparse, io
import numpy as np

# TE raw data files read and parse

## Raw data reading

In [7]:
train_raw = "../data/raw/TE_trainset.csv"  # rename for other datasets
test_raw = "../data/raw/TE_testset.csv"  # rename for other datasets

# Both files are read with header=None (the first row is treated as data), so
# the column names are supplied here. The test set is read without a label column.
df_train = pd.read_csv(
    train_raw,
    names=["enz_name", "enz_seq", "enz_label"],
    header=None,
)
df_test = pd.read_csv(
    test_raw,
    names=["enz_name", "enz_seq"],
    header=None,
)

In [8]:
df_train.head(2)

Unnamed: 0,enz_name,enz_seq,enz_label
0,A._hypogaea_l._(AhFatA),MLKVSCNGSDRVQFMAQCGFAGQPASVLVRRRSVSAVGFGYPMNRV...,1
1,Arabidopsis_thaliana,MVATSATSSFFPVPSSSLDPNGKGNKIGSTNLAGLNSTPNSGRMKV...,1


## Raw data parsing 

*for easier downstream applications*

In [22]:
# parse raw file
# normalise all sequences: upper case, gap characters removed
def up_seq(seq):
    """Return ``seq`` in upper case with every '-' character removed."""
    without_gaps = seq.replace('-', '')
    return without_gaps.upper()


# normalise the sequences (upper case, '-' removed) with up_seq
df_train["enz_seq"] = df_train.enz_seq.apply(up_seq)
df_test["enz_seq"] = df_test.enz_seq.apply(up_seq)

# get rid of sequences with illegitimate amino acids
df_train = df_train.loc[~df_train["enz_seq"].str.contains('B|J|O|U|X|Z')]
df_test = df_test.loc[~df_test["enz_seq"].str.contains('B|J|O|U|X|Z')]

# create enzyme alias (numbered in row order AFTER the filtering above)
enz_alias_train = [f'enz_{i}' for i in range(len(df_train['enz_name']))]
df_train = df_train.assign(enz_alias=enz_alias_train)
enz_alias_test = [f'test_enz_{i}' for i in range(len(df_test['enz_name']))]
df_test = df_test.assign(enz_alias=enz_alias_test)

# enzyme alias to original enzyme name mapping
enz_train_name_map = "../data/mappings/train_enz_map.csv"
enz_test_name_map = "../data/mappings/test_enz_map.csv"
df_train.loc[:, ["enz_alias", "enz_name"]].to_csv(enz_train_name_map, index=False, header=False)
df_test.loc[:, ["enz_alias", "enz_name"]].to_csv(enz_test_name_map, index=False, header=False)

# create fasta file of sequence
enz_train_fasta = "../data/seq/train_enz.fa"
enz_test_fasta = "../data/seq/test_enz.fa"

# `with` guarantees the files are flushed and closed even if a write fails
with open(enz_train_fasta, "w") as train_fasta_stream:
    for value in df_train.loc[:, ["enz_alias", "enz_seq"]].values:
        train_fasta_stream.write(f">{value[0]}\n{value[1]}\n")

with open(enz_test_fasta, "w") as test_fasta_stream:
    for value in df_test.loc[:, ["enz_alias", "enz_seq"]].values:
        test_fasta_stream.write(f">{value[0]}\n{value[1]}\n")

# create csv file of sequence
enz_train_csv = "../data/seq/train_enz.csv"
enz_test_csv = "../data/seq/test_enz.csv"

df_train.loc[:, ["enz_alias", "enz_seq"]].to_csv(enz_train_csv, header=False, index=False)
df_test.loc[:, ["enz_alias", "enz_seq"]].to_csv(enz_test_csv, header=False, index=False)

## Creating labels

In [23]:
# write the (alias, label) pairs of the training set as a header-less csv
train_labels = "../data/label/train_enz_label.csv"

df_train[["enz_alias", "enz_label"]].to_csv(train_labels, header=False, index=False)

# Numerically encoding TE sequences

The TE sequences are numerically encoded in 47 different ways:

1. 21 types of physicochemical encoding using ifeatpro tool created as a part of this project. [ifeatpro link](https://pypi.org/project/ifeatpro/)


2. 21 types of PSSM based encodings using pssmpro tool created as a part of this project. [pssmpro link](https://pypi.org/project/pssmpro/)


3. 2 types of ngram based encodings using ngrampro tool created as a part of this project. [ngrampro link](https://pypi.org/project/ngrampro/)


4. 3 types of kernel based encodings using an external tool written in R known as KeBABS. [KeBABS link](https://bioconductor.org/packages/release/bioc/vignettes/kebabs/inst/doc/kebabs.pdf)

## ifeatpro

ifeatpro [link](https://pypi.org/project/ifeatpro/) can be directly used with a fasta file that contains the protein sequences in fasta format. 

In [11]:
help(ipro.get_all_features)

Help on function get_all_features in module ifeatpro.features:

get_all_features(fasta_file, output_dir)
    A function to create 21 numerically encoded features for protein sequences
    :param fasta_file: The path to a file that contains all the protein sequences in fasta format
    :param output_dir: The path to a directory where the feature encoded files will be stored
    :return: None



In [12]:
train_fasta_file = "../data/seq/train_enz.fa"
test_fasta_file = "../data/seq/test_enz.fa"

train_output_dir = "../data/features/ifeatpro/train/"
test_output_dir = "../data/features/ifeatpro/test/"

os.makedirs(train_output_dir, exist_ok=True)
os.makedirs(test_output_dir, exist_ok=True)

# `ipro` is already the `ifeatpro.features` module (see the import cell), so
# the function is called on it directly -- the same way help() addresses it.
ipro.get_all_features(train_fasta_file, train_output_dir)
ipro.get_all_features(test_fasta_file, test_output_dir)

Descriptor type: aac
Descriptor type: cksaap
Descriptor type: tpc
Descriptor type: dpc
Descriptor type: dde
Descriptor type: gaac
Descriptor type: cksaagp
Descriptor type: gtpc
Descriptor type: gdpc
Descriptor type: moran
Descriptor type: geary
Descriptor type: nmbroto
Descriptor type: ctdc
Descriptor type: ctdt
Descriptor type: ctdd
Descriptor type: ctriad
Descriptor type: ksctriad
Descriptor type: socnumber
Descriptor type: qsorder
Descriptor type: paac
Descriptor type: apaac
Descriptor type: aac
Descriptor type: cksaap
Descriptor type: tpc
Descriptor type: dpc
Descriptor type: dde
Descriptor type: gaac
Descriptor type: cksaagp
Descriptor type: gtpc
Descriptor type: gdpc
Descriptor type: moran
Descriptor type: geary
Descriptor type: nmbroto
Descriptor type: ctdc
Descriptor type: ctdt
Descriptor type: ctdd
Descriptor type: ctriad
Descriptor type: ksctriad
Descriptor type: socnumber
Descriptor type: qsorder
Descriptor type: paac
Descriptor type: apaac


## pssmpro

pssmpro [link](https://pypi.org/project/pssmpro/) requires the PSSM profiles of the protein sequences as input. The PSSM profiles must be created first; they can then be numerically encoded using the functions pssmpro provides. pssmpro also provides a function to create the PSSM profiles themselves; it takes the path to the psiblast program and the prefix of an indexed BLAST database as arguments.

### Creating the pssm profiles

In [12]:
help(ppro.create_pssm_profile)

Help on function create_pssm_profile in module pssmpro.features:

create_pssm_profile(seq_file, out_dir, psiblast_exec, database_prefix, num_threads=24)
    A function to create psiblast or pssm profile for protein sequences
    :param seq_file: A csv file with name of the protein followed by its sequence separated by a comma
    :param out_dir: The directory where the user would like to store the pssm profiles of all the sequences
    :param psiblast_exec: The path of the psiblast executable. psiblast program needs to be installed
    :param database_prefix: The path of the indexed blast database directory prefix
    :param num_threads: Number of threads to use while creating the psiblast profile
    :return: The output directory where the psiblast/pssm profiles are stored



In [15]:
train_seq_file = "../data/seq/train_enz.csv"
test_seq_file = "../data/seq/test_enz.csv"

# Machine-specific locations. Set PSIBLAST_PATH / BLAST_DB_PREFIX in the
# environment to override the defaults below without editing the notebook.
psiblast_path = os.environ.get(
    "PSIBLAST_PATH",
    "/opt/aci/sw/ncbi-rmblastn/2.9.0_gcc-8.3.1-bxy/bin/psiblast",
)  # provide the path to your psiblast program
database_pre = os.environ.get(
    "BLAST_DB_PREFIX",
    "../../pssmpro_test_data/uniref50/uniref50db",
)  # database creation described in pssmpro link given above

In [14]:
output_dir_train = "../data/features/pssmpro/pssm_profiles/train/"
# create the directory that is actually passed to pssmpro below
# (`output_dir` is not defined anywhere in this notebook)
os.makedirs(output_dir_train, exist_ok=True)
# `ppro` is already the `pssmpro.features` module, so call the function on it directly
ppro.create_pssm_profile(train_seq_file, output_dir_train, psiblast_path, database_pre)

Generating psiblast profile for protein: enz_0
Generating psiblast profile for protein: enz_1
Generating psiblast profile for protein: enz_2
Generating psiblast profile for protein: enz_3
Generating psiblast profile for protein: enz_4
Generating psiblast profile for protein: enz_5
Generating psiblast profile for protein: enz_6
Generating psiblast profile for protein: enz_7
Generating psiblast profile for protein: enz_8
Generating psiblast profile for protein: enz_9
Generating psiblast profile for protein: enz_10
Generating psiblast profile for protein: enz_11
Generating psiblast profile for protein: enz_12
Generating psiblast profile for protein: enz_13
Generating psiblast profile for protein: enz_14
Generating psiblast profile for protein: enz_15
Generating psiblast profile for protein: enz_16
Generating psiblast profile for protein: enz_17
Generating psiblast profile for protein: enz_18
Generating psiblast profile for protein: enz_19
Generating psiblast profile for protein: enz_20
Ge

'../data/features/pssmpro/pssm_profiles'

In [16]:
output_dir_test = "../data/features/pssmpro/pssm_profiles/test/"
os.makedirs(output_dir_test, exist_ok=True)
# same call as for the training set, spelled with the parameter names from help()
ppro.create_pssm_profile(
    seq_file=test_seq_file,
    out_dir=output_dir_test,
    psiblast_exec=psiblast_path,
    database_prefix=database_pre,
)

Generating psiblast profile for protein: test_enz_0
Generating psiblast profile for protein: test_enz_1
Generating psiblast profile for protein: test_enz_2
Generating psiblast profile for protein: test_enz_3
Generating psiblast profile for protein: test_enz_4
Generating psiblast profile for protein: test_enz_5
Generating psiblast profile for protein: test_enz_6
Generating psiblast profile for protein: test_enz_7
Generating psiblast profile for protein: test_enz_8
Generating psiblast profile for protein: test_enz_9
Generating psiblast profile for protein: test_enz_10
Generating psiblast profile for protein: test_enz_11
Generating psiblast profile for protein: test_enz_12
Generating psiblast profile for protein: test_enz_13
Generating psiblast profile for protein: test_enz_14
Generating psiblast profile for protein: test_enz_15
Generating psiblast profile for protein: test_enz_16
Generating psiblast profile for protein: test_enz_17
Generating psiblast profile for protein: test_enz_18
Gen

Generating psiblast profile for protein: test_enz_154
Generating psiblast profile for protein: test_enz_155
Generating psiblast profile for protein: test_enz_156
Generating psiblast profile for protein: test_enz_157
Generating psiblast profile for protein: test_enz_158
Generating psiblast profile for protein: test_enz_159
Generating psiblast profile for protein: test_enz_160
Generating psiblast profile for protein: test_enz_161
Generating psiblast profile for protein: test_enz_162
Generating psiblast profile for protein: test_enz_163
Generating psiblast profile for protein: test_enz_164
Generating psiblast profile for protein: test_enz_165
Generating psiblast profile for protein: test_enz_166
Generating psiblast profile for protein: test_enz_167
Generating psiblast profile for protein: test_enz_168
Generating psiblast profile for protein: test_enz_169
Generating psiblast profile for protein: test_enz_170
Generating psiblast profile for protein: test_enz_171
Generating psiblast profile 

Generating psiblast profile for protein: test_enz_306
Generating psiblast profile for protein: test_enz_307
Generating psiblast profile for protein: test_enz_308
Generating psiblast profile for protein: test_enz_309
Generating psiblast profile for protein: test_enz_310
Generating psiblast profile for protein: test_enz_311
Generating psiblast profile for protein: test_enz_312
Generating psiblast profile for protein: test_enz_313
Generating psiblast profile for protein: test_enz_314
Generating psiblast profile for protein: test_enz_315
Generating psiblast profile for protein: test_enz_316
Generating psiblast profile for protein: test_enz_317
Generating psiblast profile for protein: test_enz_318
Generating psiblast profile for protein: test_enz_319
Generating psiblast profile for protein: test_enz_320
Generating psiblast profile for protein: test_enz_321
Generating psiblast profile for protein: test_enz_322
Generating psiblast profile for protein: test_enz_323
Generating psiblast profile 

Generating psiblast profile for protein: test_enz_458
Generating psiblast profile for protein: test_enz_459
Generating psiblast profile for protein: test_enz_460
Generating psiblast profile for protein: test_enz_461
Generating psiblast profile for protein: test_enz_462
Generating psiblast profile for protein: test_enz_463
Generating psiblast profile for protein: test_enz_464
Generating psiblast profile for protein: test_enz_465
Generating psiblast profile for protein: test_enz_466
Generating psiblast profile for protein: test_enz_467
Generating psiblast profile for protein: test_enz_468
Generating psiblast profile for protein: test_enz_469
Generating psiblast profile for protein: test_enz_470
Generating psiblast profile for protein: test_enz_471
Generating psiblast profile for protein: test_enz_472
Generating psiblast profile for protein: test_enz_473
Generating psiblast profile for protein: test_enz_474
Generating psiblast profile for protein: test_enz_475
Generating psiblast profile 

'../data/features/pssmpro/pssm_profiles/test/'

### Generating features from the profiles

In [7]:
help(ppro.get_all_features)

Help on function get_all_features in module pssmpro.features:

get_all_features(pssm_dir, store_dir='./')



In [19]:
# directories that receive the pssmpro feature files (passed as `store_dir` below)
train_out_dir = "../data/features/pssmpro/train/"
test_out_dir = "../data/features/pssmpro/test/"

In [17]:
os.makedirs(train_out_dir, exist_ok=True)
# encode the training-set PSSM profiles
ppro.get_all_features(
    pssm_dir="../data/features/pssmpro/pssm_profiles/train/",
    store_dir=train_out_dir,
)

In [20]:
os.makedirs(test_out_dir, exist_ok=True)
# encode the test-set PSSM profiles
ppro.get_all_features(
    pssm_dir="../data/features/pssmpro/pssm_profiles/test/",
    store_dir=test_out_dir,
)

['aac_pssm',
 'aadp_pssm',
 'aatp',
 'ab_pssm',
 'd_fpssm',
 'dp_pssm',
 'dpc_pssm',
 'edp',
 'eedp',
 'k_separated_bigrams_pssm',
 'medp',
 'pse_pssm',
 'pssm_ac',
 'pssm_cc',
 'pssm_composition',
 'rpm_pssm',
 'rpssm',
 's_fpssm',
 'smoothed_pssm',
 'tpc',
 'tri_gram_pssm']

## ngrampro

ngrampro [link](https://pypi.org/project/ngrampro/) based features are created online during model training. 

## KeBABS

Kernel-based features are created using an external software package called KeBABS. The R script used to generate the features, *Kernels-KeBABs.r*, is located in the *utils* directory.

### Running Kebabs

In [4]:
# Running the Rscript using a bash command

!Rscript ../utils/Kernels-KeBABs.r

Loading required package: Biostrings
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: 'BiocGenerics'

The following objects are masked from 'package:parallel':

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs

The following objects are masked from 'package:base':

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min

Loading required package: S4Vectors
Loading required package: stats4

Attaching pack

### Parsing Kebabs output

*required for downstream analysis* 

In [18]:
# root directory of the KeBABS output; save_feat_vec_files looks for one
# sub-directory per entry of `file_prefixes` underneath it
featfile_dir = "../data/features/kernel/"
file_prefixes = ['spec', 'gap', 'mism']

# NOTE: despite the "_file" suffix these are output DIRECTORIES (they are
# created below and passed as outdir_train / outdir_test)
output_train_file = "../data/features/kernel/train/"
output_test_file = "../data/features/kernel/test/"

os.makedirs(output_train_file, exist_ok=True)
os.makedirs(output_test_file, exist_ok=True)

In [19]:
def save_feat_vec_files(file_dir, outdir_train, outdir_test, file_prefix):
    """Split one kernel feature matrix into its train and test rows and save both parts.

    Reads ``<file_dir>/<file_prefix>/<file_prefix>_kern_sparsematrix.txt``
    (Matrix Market format) and ``<file_dir>/<file_prefix>/<file_prefix>_kern_rownames.txt``
    (one row name per matrix row). Rows whose name starts with ``enz`` go to
    the train part, rows whose name starts with ``test`` go to the test part.
    Each part is written as ``<file_prefix>mat.npz`` (sparse matrix) and
    ``<file_prefix>enz_names.txt`` (row names) into its output directory.

    :param file_dir: directory that contains one sub-directory per file prefix
    :param outdir_train: existing directory for the train part
    :param outdir_test: existing directory for the test part
    :param file_prefix: name of the sub-directory and prefix of the file names
    :return: None
    :raises ValueError: if a row name starts with neither ``enz`` nor ``test``,
        or if the number of row names differs from the number of matrix rows
    """
    # os.path.join keeps the paths valid whether or not the directory
    # arguments end with a path separator.
    feat_dir = os.path.join(file_dir, file_prefix)
    sp_mat_file = os.path.join(feat_dir, file_prefix + '_kern_sparsematrix.txt')
    enz_name_file = os.path.join(feat_dir, file_prefix + '_kern_rownames.txt')

    sp_mat = io.mmread(sp_mat_file).tocsr()
    # genfromtxt returns a 0-d array for a one-line file; force a 1-d array so
    # the loop and the fancy indexing below also work for a single row.
    enz_names = np.atleast_1d(np.genfromtxt(enz_name_file, dtype=str))

    if sp_mat.shape[0] != len(enz_names):
        raise ValueError(
            f'{len(enz_names)} row names but {sp_mat.shape[0]} matrix rows for prefix {file_prefix!r}'
        )

    train_enz_idx = []
    test_enz_idx = []

    for idx, enz_name in enumerate(enz_names):
        if enz_name.startswith('enz'):
            train_enz_idx.append(idx)
        elif enz_name.startswith('test'):
            test_enz_idx.append(idx)
        else:
            raise ValueError('Wrong Enzyme Prefix')

    X_train, X_test = sp_mat[train_enz_idx, :], sp_mat[test_enz_idx, :]
    enz_names_train, enz_names_test = enz_names[train_enz_idx], enz_names[test_enz_idx]

    # File names are kept exactly as before (prefix directly followed by
    # 'mat.npz' / 'enz_names.txt') so existing readers keep working.
    sparse.save_npz(os.path.join(outdir_train, file_prefix + 'mat.npz'), X_train)
    sparse.save_npz(os.path.join(outdir_test, file_prefix + 'mat.npz'), X_test)

    np.savetxt(os.path.join(outdir_train, file_prefix + 'enz_names.txt'), enz_names_train, fmt='%s')
    np.savetxt(os.path.join(outdir_test, file_prefix + 'enz_names.txt'), enz_names_test, fmt='%s')

    return

In [20]:
for fp in file_prefixes:
    save_feat_vec_files(featfile_dir, output_train_file, 
                        output_test_file, fp)

# EnZymClass

**En**semble model for en**Zym**e **Class**ification

## Training

### Best base learner decider

In [2]:
import sys
sys.path.append("../")

In [3]:
from utils import helper

In [4]:
import itertools
import numpy as np
import multiprocessing as mp
from itertools import groupby
from collections import Counter
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [5]:
# sequence and label files written by the parsing cells above
train_enz_seq_file = "../data/seq/train_enz.csv"
test_enz_seq_file = "../data/seq/test_enz.csv"
label_file = "../data/label/train_enz_label.csv"
# feature directories filled by the ifeatpro, KeBABS-parsing and pssmpro cells above
train_feature_dirs = ["../data/features/ifeatpro/train/", 
                      "../data/features/kernel/train/",
                      "../data/features/pssmpro/train/"]
test_feature_dirs = ["../data/features/ifeatpro/test/",
                    "../data/features/kernel/test/",
                    "../data/features/pssmpro/test/"]

In [6]:
# number of repetitions; passed as `N` to the functions below, which start one
# parallel task per value in range(N)
Num_sim = 10000

In [7]:
def check_performance(enz_file, test_enz_file, label_file, train_feat_dirs, test_feat_dirs, hyper_param_file, base_algo, k, opt, N):
    """Average the metrics returned by ``helper.check_base_performance`` over N parallel runs.

    ``helper.check_base_performance`` is called once for every value in
    ``range(N)``. Each call receives the first nine arguments of this function
    unchanged, followed by the run index as the last positional argument
    (presumably used as a random seed -- confirm in ``utils/helper.py``).

    :param enz_file: forwarded unchanged to the helper
    :param test_enz_file: forwarded unchanged to the helper
    :param label_file: forwarded unchanged to the helper
    :param train_feat_dirs: forwarded unchanged to the helper
    :param test_feat_dirs: forwarded unchanged to the helper
    :param hyper_param_file: forwarded unchanged to the helper
    :param base_algo: forwarded unchanged to the helper
    :param k: forwarded unchanged to the helper
    :param opt: forwarded unchanged to the helper
    :param N: number of runs
    :return: tuple ``(mean precision, mean recall, mean accuracy)``, each rounded to 2 decimals
    """
    shared_args = (enz_file, test_enz_file, label_file, train_feat_dirs,
                   test_feat_dirs, hyper_param_file, base_algo, k, opt)
    task_args = [shared_args + (run_idx,) for run_idx in range(N)]

    # starmap blocks until every task has finished; leaving the `with` block
    # then shuts the worker processes down instead of leaking them.
    with mp.Pool(mp.cpu_count()) as pool:
        metrics = pool.starmap(helper.check_base_performance, task_args)

    # every result is indexed as (precision, recall, accuracy)
    precision = [m[0] for m in metrics]
    recall = [m[1] for m in metrics]
    accuracy = [m[2] for m in metrics]

    return round(np.mean(precision), 2), round(np.mean(recall), 2), round(np.mean(accuracy), 2)

In [8]:
# Checking the performance of SVM

In [9]:
%%time
check_performance(train_enz_seq_file, None, label_file, train_feature_dirs, None, None, 'SVM', 5, False, Num_sim)

CPU times: user 716 ms, sys: 218 ms, total: 934 ms
Wall time: 1h 51min 26s


(0.88, 0.89, 0.8)

In [10]:
# Checking the performance of NN

In [11]:
%%time
check_performance(train_enz_seq_file, None, label_file, train_feature_dirs, None, None, 'NN', 5, False, Num_sim)

CPU times: user 646 ms, sys: 242 ms, total: 888 ms
Wall time: 2h 4min 47s


(0.81, 0.94, 0.79)

In [12]:
# Checking the performance of GBC

In [13]:
%%time
check_performance(train_enz_seq_file, None, label_file, train_feature_dirs, None, None, 'GBC', 5, False, Num_sim)

CPU times: user 504 ms, sys: 225 ms, total: 729 ms
Wall time: 1h 55min 56s


(0.82, 0.92, 0.79)

### Individual Hyperparameter Optimization

In [14]:
def most_frequent(arr):
    """Return the element that occurs most often in ``arr``.

    Raises ``IndexError`` when ``arr`` is empty.
    """
    tally = Counter(arr)
    top_element, _ = tally.most_common(1)[0]
    return top_element


def store_best_hps(enz_file, test_enz_file, label_file, train_feat_dirs, test_feat_dirs, hyper_param_file, base_algo, k, opt, N):
    """Run ``helper.best_hps`` N times in parallel and store the most frequent hyperparameters per feature.

    Every run returns a list of ``(feat_name, hp)`` entries, where ``hp`` is
    indexed as ``(regC, kernel, pca_comp)``. The entries of all runs are grouped
    by feature name and, for each feature, the most frequent value of each of
    the three hyperparameters is written to ``../data/results/hpopt/IndHPOpt.csv``.

    :param enz_file: forwarded unchanged to the helper
    :param test_enz_file: forwarded unchanged to the helper
    :param label_file: forwarded unchanged to the helper
    :param train_feat_dirs: forwarded unchanged to the helper
    :param test_feat_dirs: forwarded unchanged to the helper
    :param hyper_param_file: forwarded unchanged to the helper
    :param base_algo: forwarded unchanged to the helper
    :param k: forwarded unchanged to the helper
    :param opt: forwarded unchanged to the helper
    :param N: number of runs; the run index is the helper's last positional argument
    :return: None
    """
    out_file = '../data/results/hpopt/IndHPOpt.csv'
    # Make sure the output directory exists BEFORE the long computation, so a
    # missing folder cannot discard hours of work at the very end.
    os.makedirs(os.path.dirname(out_file), exist_ok=True)

    shared_args = (enz_file, test_enz_file, label_file, train_feat_dirs,
                   test_feat_dirs, hyper_param_file, base_algo, k, opt)
    task_args = [shared_args + (run_idx,) for run_idx in range(N)]

    # the `with` block shuts the worker processes down once starmap has returned
    with mp.Pool(mp.cpu_count()) as pool:
        per_run_hps = pool.starmap(helper.best_hps, task_args)

    # flatten in linear time (sum(lists, []) copies the growing list on every step)
    # and sort by feature name, which groupby requires
    all_hps = sorted(itertools.chain.from_iterable(per_run_hps), key=lambda x: x[0])

    with open(out_file, 'w') as f:
        f.write('feat_name,regC,kernel,pca_comp')
        f.write('\n')
        for feat_name, feat_info in groupby(all_hps, key=lambda x: x[0]):
            best_hp_list = [hp[1] for hp in feat_info]
            best_regC = most_frequent([x[0] for x in best_hp_list])
            best_kernel = most_frequent([x[1] for x in best_hp_list])
            best_ncomp = most_frequent([x[2] for x in best_hp_list])
            f.write(f"{feat_name},{best_regC},{best_kernel},{best_ncomp}")
            f.write('\n')

    return

In [15]:
%%time 
store_best_hps(train_enz_seq_file, None, label_file, train_feature_dirs, None, None, 'SVM', 5, True, Num_sim//10)

CPU times: user 2.5 s, sys: 963 ms, total: 3.46 s
Wall time: 11h 59min 14s


### Feature extraction technique performance

In [16]:
indhpoptfile = '../data/results/hpopt/IndHPOpt.csv'


def indfeat_measure(enz_file, test_enz_file, label_file, train_feat_dirs, test_feat_dirs, hyper_param_file, base_algo, k, opt, N):
    """Run ``helper.indfeat_performance`` N times in parallel and write per-feature summary statistics.

    Every run returns a list of ``(featname, metrics)`` entries, where
    ``metrics`` is indexed as ``(precision, recall, accuracy)``. For each
    feature name the min, max, mean and standard deviation of every metric
    (rounded to 2 decimals) are written as one row of
    ``../data/results/indfeatreport.csv``.

    :param enz_file: forwarded unchanged to the helper
    :param test_enz_file: forwarded unchanged to the helper
    :param label_file: forwarded unchanged to the helper
    :param train_feat_dirs: forwarded unchanged to the helper
    :param test_feat_dirs: forwarded unchanged to the helper
    :param hyper_param_file: forwarded unchanged to the helper
    :param base_algo: forwarded unchanged to the helper
    :param k: forwarded unchanged to the helper
    :param opt: forwarded unchanged to the helper
    :param N: number of runs; the run index is the helper's last positional argument
    :return: None
    """
    out_file = "../data/results/indfeatreport.csv"
    # create the output directory up front so a missing folder is not only
    # discovered after the long computation has finished
    os.makedirs(os.path.dirname(out_file), exist_ok=True)

    def summarise(values):
        """Return [min, max, mean, std] of ``values``, each rounded to 2 decimals."""
        return [round(min(values), 2), round(max(values), 2),
                round(np.mean(values), 2), round(np.std(values), 2)]

    shared_args = (enz_file, test_enz_file, label_file, train_feat_dirs,
                   test_feat_dirs, hyper_param_file, base_algo, k, opt)
    task_args = [shared_args + (run_idx,) for run_idx in range(N)]

    # the `with` block shuts the worker processes down once starmap has returned
    with mp.Pool(mp.cpu_count()) as pool:
        per_run_metrics = pool.starmap(helper.indfeat_performance, task_args)

    # flatten in linear time and sort by feature name, which groupby requires
    all_metrics = sorted(itertools.chain.from_iterable(per_run_metrics), key=lambda x: x[0])

    with open(out_file, 'w') as f:
        f.write('featname,min_precision,max_precision,mean_precision,std_precision,min_recall,max_recall,mean_recall,std_recall,min_accuracy,max_accuracy,mean_accuracy,std_accuracy')
        f.write('\n')
        for featname, featinfo in groupby(all_metrics, key=lambda x: x[0]):
            all_metric_list = [met[1] for met in featinfo]

            # column order: precision, recall, accuracy -- four statistics each
            row = [featname]
            for metric_idx in range(3):
                row.extend(summarise([m[metric_idx] for m in all_metric_list]))

            f.write(','.join(f'{value}' for value in row))
            f.write('\n')

    return

In [17]:
%%time
indfeat_measure(train_enz_seq_file, None, label_file, train_feature_dirs, None, indhpoptfile, 'SVM', 5, False, Num_sim)

CPU times: user 34.2 s, sys: 3.81 s, total: 38 s
Wall time: 1h 47min 25s


### Parametric sweep of ensemble model hyperparameter k

In [18]:
def ensemble_param_sweep(enz_file, test_enz_file, label_file, train_feat_dirs, test_feat_dirs, hyper_param_file, base_algo, ks, opt, N):
    """Evaluate the ensemble for every value in ``ks`` over N runs each and store the mean metrics.

    ``helper.ensemble_pred`` is called once for every ``(k, run index)``
    combination; ``helper.get_metrics`` turns each prediction into a result
    indexed as ``(precision, recall, accuracy)``. For every ``k`` the mean of
    each metric (rounded to 2 decimals) is written as one row of
    ``../data/results/en_param_sweep.csv``.

    :param enz_file: forwarded unchanged to ``helper.ensemble_pred``
    :param test_enz_file: forwarded unchanged to ``helper.ensemble_pred``
    :param label_file: forwarded unchanged to ``helper.ensemble_pred``
    :param train_feat_dirs: forwarded unchanged to ``helper.ensemble_pred``
    :param test_feat_dirs: forwarded unchanged to ``helper.ensemble_pred``
    :param hyper_param_file: forwarded unchanged to ``helper.ensemble_pred``
    :param base_algo: forwarded unchanged to ``helper.ensemble_pred``
    :param ks: iterable of values to sweep; each is passed in the ``k`` position
    :param opt: forwarded unchanged to ``helper.ensemble_pred``
    :param N: number of runs per value of ``ks``; the run index is the last positional argument
    :return: None
    """
    out_file = '../data/results/en_param_sweep.csv'
    # create the output directory up front so a missing folder is not only
    # discovered after the long computation has finished
    os.makedirs(os.path.dirname(out_file), exist_ok=True)

    # product() yields all runs of one k consecutively, which the groupby below relies on
    _iter = list(itertools.product(ks, range(N)))
    task_args = [(enz_file, test_enz_file, label_file, train_feat_dirs,
                  test_feat_dirs, hyper_param_file, base_algo, model_k, opt, run_idx)
                 for model_k, run_idx in _iter]

    # the `with` block shuts the worker processes down once starmap has returned
    with mp.Pool(mp.cpu_count()) as pool:
        all_preds = pool.starmap(helper.ensemble_pred, task_args)
    all_metrics = [helper.get_metrics(pred) for pred in all_preds]

    def get_mean(met):
        return round(np.mean(met), 2)

    with open(out_file, 'w') as f:
        f.write('model_k,mean_precision,mean_recall,mean_accuracy')
        f.write('\n')
        for model_k, model_info in itertools.groupby(zip(_iter, all_metrics), key=lambda x: x[0][0]):
            model_metrics = [m[1] for m in model_info]
            prec = [m[0] for m in model_metrics]
            rec = [m[1] for m in model_metrics]
            acc = [m[2] for m in model_metrics]

            means = ','.join(str(get_mean(met)) for met in (prec, rec, acc))
            f.write(f"{model_k},{means}")
            f.write('\n')

    return

In [19]:
%%time
ensemble_param_sweep(train_enz_seq_file, None, label_file, train_feature_dirs, None, indhpoptfile, 'SVM', [5,9,15,21,31], False, Num_sim)

CPU times: user 1min 19s, sys: 591 ms, total: 1min 20s
Wall time: 8h 53min 48s


In [20]:
pd.read_csv('../data/results/en_param_sweep.csv')

Unnamed: 0,model_k,mean_precision,mean_recall,mean_accuracy
0,5,0.87,0.89,0.8
1,9,0.87,0.88,0.79
2,15,0.87,0.87,0.79
3,21,0.86,0.87,0.78
4,31,0.85,0.87,0.78


### Ensemble performance evaluation

In [21]:
def ensemble_eval(enz_file, test_enz_file, label_file, train_feat_dirs, test_feat_dirs, hyper_param_file, base_algo, k, opt, N):
    """Evaluate the ensemble over N parallel runs and store the raw and the summarised metrics.

    ``helper.ensemble_pred`` is called once per value in ``range(N)`` and
    ``helper.get_metrics`` turns each prediction into a
    ``(precision, recall, accuracy)`` triple. Two files are written:

    * ``../data/results/ensemble_results.csv`` -- one ``precision,recall,accuracy`` line per run
    * ``../data/results/ensemble_report.csv`` -- header plus one row with the
      min, max, mean and standard deviation of every metric, rounded to 2 decimals

    :param enz_file: forwarded unchanged to ``helper.ensemble_pred``
    :param test_enz_file: forwarded unchanged to ``helper.ensemble_pred``
    :param label_file: forwarded unchanged to ``helper.ensemble_pred``
    :param train_feat_dirs: forwarded unchanged to ``helper.ensemble_pred``
    :param test_feat_dirs: forwarded unchanged to ``helper.ensemble_pred``
    :param hyper_param_file: forwarded unchanged to ``helper.ensemble_pred``
    :param base_algo: forwarded unchanged to ``helper.ensemble_pred``
    :param k: forwarded unchanged to ``helper.ensemble_pred``
    :param opt: forwarded unchanged to ``helper.ensemble_pred``
    :param N: number of runs; the run index is the last positional argument
    :return: None
    """
    results_file = '../data/results/ensemble_results.csv'
    report_file = '../data/results/ensemble_report.csv'
    # create the output directory up front so a missing folder is not only
    # discovered after the long computation has finished
    os.makedirs(os.path.dirname(results_file), exist_ok=True)

    def summarise(values):
        """Return [min, max, mean, std] of ``values``, each rounded to 2 decimals."""
        return [round(min(values), 2), round(max(values), 2),
                round(np.mean(values), 2), round(np.std(values), 2)]

    shared_args = (enz_file, test_enz_file, label_file, train_feat_dirs,
                   test_feat_dirs, hyper_param_file, base_algo, k, opt)
    task_args = [shared_args + (run_idx,) for run_idx in range(N)]

    # the `with` block shuts the worker processes down once starmap has returned
    with mp.Pool(mp.cpu_count()) as pool:
        all_preds = pool.starmap(helper.ensemble_pred, task_args)
    model_metrics = [helper.get_metrics(pred) for pred in all_preds]

    # column order: precision, recall, accuracy -- four statistics each
    report_values = []
    for metric_idx in range(3):
        report_values.extend(summarise([m[metric_idx] for m in model_metrics]))

    with open(results_file, 'w') as f:
        for prec, rec, acc in model_metrics:
            f.write(f"{prec},{rec},{acc}\n")

    with open(report_file, 'w') as f:
        f.write('ensemble,min_precision,max_precision,mean_precision,std_precision,min_recall,max_recall,mean_recall,std_recall,min_accuracy,max_accuracy,mean_accuracy,std_accuracy')
        f.write('\n')

        f.write('ensemble,' + ','.join(f'{value}' for value in report_values))
        f.write('\n')

    return

In [22]:
%%time
ensemble_eval(train_enz_seq_file, None, label_file, train_feature_dirs, None, indhpoptfile, 'SVM', 5, False, Num_sim)

CPU times: user 16.4 s, sys: 0 ns, total: 16.4 s
Wall time: 1h 47min 6s


In [23]:
def ensemble_store(enz_file, test_enz_file, label_file, train_feat_dirs, test_feat_dirs, hyper_param_file, base_algo, k, opt, N):
    """Run ``helper.ensemble_pred`` N times in parallel and store its raw output.

    For every run two comma-separated lines are appended to
    ``../data/results/ensemble_preds.csv``: first element ``[0]`` of the
    helper's result (the validation data), then element ``[1]`` (the
    prediction data).

    :param enz_file: forwarded unchanged to the helper
    :param test_enz_file: forwarded unchanged to the helper
    :param label_file: forwarded unchanged to the helper
    :param train_feat_dirs: forwarded unchanged to the helper
    :param test_feat_dirs: forwarded unchanged to the helper
    :param hyper_param_file: forwarded unchanged to the helper
    :param base_algo: forwarded unchanged to the helper
    :param k: forwarded unchanged to the helper
    :param opt: forwarded unchanged to the helper
    :param N: number of runs; the run index is the helper's last positional argument
    :return: None
    """
    out_file = "../data/results/ensemble_preds.csv"
    # create the output directory up front so a missing folder is not only
    # discovered after the long computation has finished
    os.makedirs(os.path.dirname(out_file), exist_ok=True)

    shared_args = (enz_file, test_enz_file, label_file, train_feat_dirs,
                   test_feat_dirs, hyper_param_file, base_algo, k, opt)
    task_args = [shared_args + (run_idx,) for run_idx in range(N)]

    # the `with` block shuts the worker processes down once starmap has returned
    with mp.Pool(mp.cpu_count()) as pool:
        all_preds = pool.starmap(helper.ensemble_pred, task_args)

    with open(out_file, "w") as f:
        for rs_pred in all_preds:
            f.write(",".join(map(str, rs_pred[0])))
            f.write("\n")
            f.write(",".join(map(str, rs_pred[1])))
            f.write("\n")
    return

In [24]:
%%time
ensemble_store(train_enz_seq_file, None, label_file, train_feature_dirs, None, indhpoptfile, 'SVM', 5, False, Num_sim)

CPU times: user 1.07 s, sys: 0 ns, total: 1.07 s
Wall time: 1h 46min 28s
