# Prediction of test subjects

In [1]:
import sys
import tqdm
import multiprocessing as mp
from collections import Counter
from operator import itemgetter

In [2]:
# Make the parent directory importable; presumably this is where the `utils`
# package imported in the next cell lives (confirm against the repo layout).
sys.path.append('../')

In [3]:
from utils import helper

In [4]:
# ---- Input CSVs: sequences, labels, name map and hyper-parameter results ----
enz_file = '../data/seq/train_enzyme_sequence.csv'
label_file = '../data/label/train_enzyme_labels.csv'
test_enz_file = '../data/seq/test_enzyme_sequence.csv'
test_enz_map_file = '../data/mappings/test_enzyme_name_map.csv'
indhpoptfile = '../data/results/hpopt/IndHPOpt.csv'

# ---- Training feature-vector directories (iFeature, kernel, PSSM) ----
ifeatdatadir = '../data/features/ifeature/featvec/train/'
kerneldatadir = '../data/features/kernel/featvec/train/'
pssmdatadir = '../data/features/possum/featvec/train/'

# ---- Test feature-vector directories, in the same order as the training ones ----
ifeattestdatadir = '../data/features/ifeature/featvec/test/'
kerneltestdatadir = '../data/features/kernel/featvec/test/'
pssmtestdatadir = '../data/features/possum/featvec/test/'

# Bundled directory lists handed to the prediction functions.
trainfeatdirs = [
    ifeatdatadir,
    kerneldatadir,
    pssmdatadir,
]
testfeatdirs = [
    ifeattestdatadir,
    kerneltestdatadir,
    pssmtestdatadir,
]

In [5]:
def test_pred_multifunc(enz_file, test_enz_file, label_file, train_feat_dirs, test_feat_dirs, hyper_param_file, base_algo, k, opt, N, test_enz_map_file):
    pool = mp.Pool(mp.cpu_count())
    
    iter_func = zip([enz_file for _ in range(N)],
           [test_enz_file for _ in range(N)],
           [label_file for _ in range(N)],
           [train_feat_dirs for _ in range(N)],
           [test_feat_dirs for _ in range(N)],
           [hyper_param_file for _ in range(N)],
           [base_algo for _ in range(N)],
           [k for _ in range(N)],
           [opt for _ in range(N)],
           range(N),
           [test_enz_map_file for _ in range(N)],)
    
    all_names = list(pool.starmap(helper.function_predict_test, iter_func))
    return all_names


def save_test_preds(enz_file, test_enz_file, label_file, train_feat_dirs, test_feat_dirs, hyper_param_file, base_algo, k, opt, N, test_enz_map_file):
    """Run the N parallel test predictions and write them to two CSV files.

    Output files (both overwritten):
        ../data/predictions/thyme/test_multipreds.csv
            one line per run: ``<run index>,<name>,<name>,...``
        ../data/predictions/thyme/testset_multipreds.csv
            every distinct predicted name across all runs, one per line,
            in sorted order so the file is reproducible between runs.

    All arguments are forwarded unchanged to ``test_pred_multifunc``. Each
    per-run result is expected to be a list of name strings, since the names
    are joined with ``","`` and written as text.

    Returns:
        None.
    """
    test_preds = test_pred_multifunc(enz_file, test_enz_file, label_file, train_feat_dirs, test_feat_dirs, hyper_param_file, base_algo, k, opt, N, test_enz_map_file)

    filename_all = '../data/predictions/thyme/test_multipreds.csv'
    filename_set = '../data/predictions/thyme/testset_multipreds.csv'

    # Flatten with a set comprehension: linear in the total number of names,
    # unlike sum(list_of_lists, []) which re-copies the accumulator each step.
    unique_preds = {enz for enz_list in test_preds for enz in enz_list}
    with open(filename_set, 'w') as f:
        # Sorted because set iteration order of strings varies between
        # interpreter sessions (hash randomisation).
        for enz in sorted(unique_preds):
            f.write(enz)
            f.write('\n')

    with open(filename_all, 'w') as f:
        for rs, enz_list in zip(range(N), test_preds):
            f.write(f'{rs},{",".join(enz_list)}')
            f.write('\n')
    return

In [6]:
%%time
# Run the full pipeline: N=1000 runs with base_algo='SVM', k=5, opt=False.
# Results are written by save_test_preds to ../data/predictions/thyme/.
# The recorded output below shows a wall time of about 19.5 minutes.
save_test_preds(enz_file, test_enz_file, label_file, trainfeatdirs, testfeatdirs, indhpoptfile, 'SVM', 5, False, 1000, test_enz_map_file)

CPU times: user 300 ms, sys: 131 ms, total: 432 ms
Wall time: 19min 34s


# Count how often each enzyme was predicted across all runs (most frequent first)

In [7]:
def read_enzymes(filename):
    """Read every predicted enzyme name from a multi-prediction CSV file.

    Each line is expected to look like ``<run index>,<name>,<name>,...``.
    The first field is discarded and the remaining non-empty fields are
    collected in file order, duplicates included.

    Args:
        filename: path of the CSV file to read.

    Returns:
        A flat list of enzyme-name strings.
    """
    enzymes = []
    with open(filename, 'r') as f:
        for line in f:
            fields = line.strip().split(',')
            # A run without predictions is stored as "<run index>," and
            # splits into a trailing empty field; skip it so that '' is not
            # reported (and later counted) as an enzyme.
            enzymes.extend(name for name in fields[1:] if name)
    return enzymes


def save_enzyme_counts(filename, enz_list):
    """Write the frequency of each item in ``enz_list`` to ``filename``.

    The file is overwritten with one ``<item>,<count>`` line per distinct
    item, ordered from most to least frequent as given by
    ``Counter.most_common()``.

    Args:
        filename: path of the CSV file to write.
        enz_list: iterable of hashable items (enzyme names) to count.

    Returns:
        None.
    """
    ranked = Counter(enz_list).most_common()
    rows = [f'{enzyme},{hits}\n' for enzyme, hits in ranked]
    with open(filename, 'w') as out:
        out.writelines(rows)

In [8]:
# Use a dedicated name for the predictions CSV. Reusing `test_enz_file` here
# would overwrite the test sequence path that the prediction cell passes to
# save_test_preds, so re-running that cell afterwards would read the wrong file.
test_multipreds_file = '../data/predictions/thyme/test_multipreds.csv'

# Flatten all per-run predictions and store how often each enzyme occurs.
test_enz_preds = read_enzymes(test_multipreds_file)
save_enzyme_counts('../data/predictions/thyme/test_enzcount.csv', test_enz_preds)