# Prediction of the 9 test subjects

In [32]:
import sys
import tqdm
import multiprocessing as mp
from collections import Counter
from operator import itemgetter

In [2]:
# Add ../scripts/ to the module search path so modules stored there can be imported.
sys.path.append('../scripts/')

In [3]:
from MultiClassClassification import TEClassification

In [4]:
# Input files: sequences, test-name map, labels and the hyperparameter file
enz_file = "../data/seq/EnzymeSequence.csv"
test_enz_file = "../data/seq/TestEnzymeSequence.csv"
test_enz_map_file = "../data/TestEnzymeNameMap.csv"
label_file = "../data/label/EnzymeLabelsMultiClass.csv"
indhpoptfile = "../data/results/hpopt/IndHPOpt.csv"

# Feature-vector directories (iFeature, kernel, pssm) -- train files
ifeatdatadir = "../featEngg/offline/ifeatMethods/data/featvec/trainfiles/"
kerneldatadir = "../featEngg/offline/kernelMethods/data/featvec/thymefiles1/train/"
pssmdatadir = "../featEngg/offline/pssmMethods/data/featvec/trainfiles/"

trainfeatdirs = [
    ifeatdatadir,
    kerneldatadir,
    pssmdatadir,
]

# Feature-vector directories (iFeature, kernel, pssm) -- test files
ifeattestdatadir = "../featEngg/offline/ifeatMethods/data/featvec/testfiles/"
kerneltestdatadir = "../featEngg/offline/kernelMethods/data/featvec/thymefiles1/test/"
pssmtestdatadir = "../featEngg/offline/pssmMethods/data/featvec/testfiles/"

testfeatdirs = [
    ifeattestdatadir,
    kerneltestdatadir,
    pssmtestdatadir,
]

In [5]:
def get_test_map_dict(filename):
    """Read a comma-separated mapping file into a dictionary.

    Every non-blank line is split on commas; the first field becomes the key
    and the second field the value (any further fields are ignored). When a
    key occurs more than once, the last occurrence wins. Blank lines, such
    as a trailing empty line at the end of the file, are skipped instead of
    raising ``IndexError``.

    Args:
        filename: Path of the mapping file to read.

    Returns:
        dict mapping first-column strings to second-column strings.

    Raises:
        IndexError: If a non-blank line contains fewer than two fields.
    """
    map_dict = {}
    with open(filename, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Nothing to map on an empty / whitespace-only line.
                continue
            data = stripped.split(",")
            map_dict[data[0]] = data[1]
    return map_dict

def function_predict_test(rs):
    """Classify the test enzymes once and list those predicted as class 3.

    Loads the test name map, builds a ``TEClassification`` (SVM model,
    ``optimize=False``) from the module-level sequence, label, feature
    directory and hyperparameter settings, and pairs each entry of
    ``testenz_names`` with the matching entry of ``en.preds``.

    Args:
        rs: Value forwarded to ``TEClassification`` as ``random_seed``.

    Returns:
        list of mapped names (looked up in the test name map) for every
        enzyme whose prediction equals 3, in ``testenz_names`` order.
    """
    name_lookup = get_test_map_dict(test_enz_map_file)
    classifier = TEClassification(
        enz_file,
        test_enz_file,
        label_file,
        trainfeatdirs,
        testfeatdirs,
        hyperparamfile=indhpoptfile,
        random_seed=rs,
        model='SVM',
        optimize=False,
    )
    # NOTE(review): 3 is the class label being collected; its meaning comes
    # from the label file -- confirm.
    return [
        name_lookup[enz_name]
        for enz_name, label in zip(classifier.testenz_names, classifier.en.preds)
        if label == 3
    ]

def test_pred_multifunc(N):
    """Run ``function_predict_test`` for the seeds ``0 .. N-1`` in parallel.

    A process pool with one worker per CPU maps ``function_predict_test``
    over ``range(N)``; a tqdm progress bar tracks the completed runs.

    Args:
        N: Number of runs; run ``i`` receives ``i`` as its seed.

    Returns:
        list with ``N`` entries, entry ``i`` being the result of
        ``function_predict_test(i)``.
    """
    # Use the pool as a context manager so the worker processes are released
    # when the runs finish (previously the pool was never closed). All
    # results are collected inside the block, before the pool is torn down.
    with mp.Pool(mp.cpu_count()) as pool:
        all_names = list(tqdm.tqdm(pool.imap(function_predict_test, range(N)), total=N))
    return all_names

def save_test_preds(N):
    """Run ``N`` test-set predictions and write the results to two CSV files.

    ``testset_multipreds.csv`` receives every distinct predicted enzyme name,
    one per line, in sorted order. ``test_multipreds.csv`` receives one line
    per run in the form ``<seed>,<name>,<name>,...``; a run with no
    predicted enzymes is written as ``<seed>,``.

    Args:
        N: Number of runs (seeds ``0 .. N-1``) passed to
            ``test_pred_multifunc``.

    Returns:
        None.
    """
    test_preds = test_pred_multifunc(N)

    filename_all = '../data/predictions/test/test_multipreds.csv'
    filename_set = '../data/predictions/test/testset_multipreds.csv'

    # Flatten with a set comprehension (linear time; ``sum(lists, [])``
    # re-copies the accumulated list at every step) and sort so the file
    # content does not depend on set iteration order.
    unique_enzymes = sorted({enz for enz_list in test_preds for enz in enz_list})
    with open(filename_set, 'w') as f:
        f.writelines(f'{enz}\n' for enz in unique_enzymes)

    with open(filename_all, 'w') as f:
        # Entry i of test_preds belongs to seed i.
        for rs, enz_list in enumerate(test_preds):
            f.write(f'{rs},{",".join(enz_list)}\n')

In [6]:
# Run the test-set prediction for 1000 seeds (0-999) and write both result files.
save_test_preds(1000)

100%|██████████| 1000/1000 [11:52<00:00,  1.40it/s]


# Prediction of thyme1 sequences

In [7]:
# Thyme1 sequence file and the thyme name-map file
thyme1_enz_file = "../data/seq/ThymeEnzymeSequence1.csv"
thyme_enz_map_file = "../data/ThymeEnzymeNameMap.csv"


# Feature-vector directories (iFeature, kernel, pssm) for thymefiles1
ifeatthyme1datadir = "../featEngg/offline/ifeatMethods/data/featvec/thymefiles1/"
kernelthyme1datadir = "../featEngg/offline/kernelMethods/data/featvec/thymefiles1/thyme/"
pssmthyme1datadir = "../featEngg/offline/pssmMethods/data/featvec/thymefiles1/"

thyme1featdirs = [
    ifeatthyme1datadir,
    kernelthyme1datadir,
    pssmthyme1datadir,
]

In [8]:
def function_predict_thyme1(rs):
    """Classify the thyme1 sequences once and list those predicted as class 3.

    Loads the thyme name map, builds a ``TEClassification`` (SVM model,
    ``optimize=False``) with the thyme1 sequence file and feature
    directories as the prediction set, and pairs each entry of
    ``testenz_names`` with the matching entry of ``en.preds``.

    Args:
        rs: Value forwarded to ``TEClassification`` as ``random_seed``.

    Returns:
        list of mapped names (looked up in the thyme name map) for every
        sequence whose prediction equals 3, in ``testenz_names`` order.
    """
    name_lookup = get_test_map_dict(thyme_enz_map_file)
    classifier = TEClassification(
        enz_file,
        thyme1_enz_file,
        label_file,
        trainfeatdirs,
        thyme1featdirs,
        hyperparamfile=indhpoptfile,
        random_seed=rs,
        model='SVM',
        optimize=False,
    )
    # NOTE(review): 3 is the class label being collected; its meaning comes
    # from the label file -- confirm.
    return [
        name_lookup[enz_name]
        for enz_name, label in zip(classifier.testenz_names, classifier.en.preds)
        if label == 3
    ]

def thyme1_pred_multifunc(N):
    """Run ``function_predict_thyme1`` for the seeds ``0 .. N-1`` in parallel.

    A process pool with one worker per CPU maps ``function_predict_thyme1``
    over ``range(N)``; a tqdm progress bar tracks the completed runs.

    Args:
        N: Number of runs; run ``i`` receives ``i`` as its seed.

    Returns:
        list with ``N`` entries, entry ``i`` being the result of
        ``function_predict_thyme1(i)``.
    """
    # Use the pool as a context manager so the worker processes are released
    # when the runs finish (previously the pool was never closed). All
    # results are collected inside the block, before the pool is torn down.
    with mp.Pool(mp.cpu_count()) as pool:
        all_names = list(tqdm.tqdm(pool.imap(function_predict_thyme1, range(N)), total=N))
    return all_names

def save_thyme1_preds(N):
    """Run ``N`` thyme1 predictions and write the results to two CSV files.

    ``thyme1set_multipreds.csv`` receives every distinct predicted name, one
    per line, in sorted order. ``thyme1_multipreds.csv`` receives one line
    per run in the form ``<seed>,<name>,<name>,...``; a run with no
    predicted names is written as ``<seed>,``.

    Args:
        N: Number of runs (seeds ``0 .. N-1``) passed to
            ``thyme1_pred_multifunc``.

    Returns:
        None.
    """
    thyme1_preds = thyme1_pred_multifunc(N)

    filename_all = '../data/predictions/thyme/thyme1_multipreds.csv'
    filename_set = '../data/predictions/thyme/thyme1set_multipreds.csv'

    # Flatten with a set comprehension (linear time; ``sum(lists, [])``
    # re-copies the accumulated list at every step) and sort so the file
    # content does not depend on set iteration order.
    unique_enzymes = sorted({enz for enz_list in thyme1_preds for enz in enz_list})
    with open(filename_set, 'w') as f:
        f.writelines(f'{enz}\n' for enz in unique_enzymes)

    with open(filename_all, 'w') as f:
        # Entry i of thyme1_preds belongs to seed i.
        for rs, enz_list in enumerate(thyme1_preds):
            f.write(f'{rs},{",".join(enz_list)}\n')

In [9]:
# Run the thyme1 prediction for 1000 seeds (0-999) and write both result files.
save_thyme1_preds(1000)

100%|██████████| 1000/1000 [15:13<00:00,  1.09it/s]


# Prediction of thyme2 sequences

In [10]:
# Thyme2 sequence file
thyme2_enz_file = '../data/seq/ThymeEnzymeSequence2.csv'

# Training feature directories for the thyme2 run: same iFeature and pssm
# directories as `trainfeatdirs`, but the kernel features come from
# thymefiles2/train/. The directory gets its own name so that
# `kernelthyme2datadir` below is bound only once (it used to hold the train
# path first and was then rebound to the thyme path).
kerneltrainthyme2datadir = '../featEngg/offline/kernelMethods/data/featvec/thymefiles2/train/'

trainthyme2featdirs = [ifeatdatadir,kerneltrainthyme2datadir,pssmdatadir]

# Feature directories (iFeature, kernel, pssm) for thymefiles2
ifeatthyme2datadir = '../featEngg/offline/ifeatMethods/data/featvec/thymefiles2/'
kernelthyme2datadir = '../featEngg/offline/kernelMethods/data/featvec/thymefiles2/thyme/'
pssmthyme2datadir = '../featEngg/offline/pssmMethods/data/featvec/thymefiles2/'

thyme2featdirs = [ifeatthyme2datadir,kernelthyme2datadir,pssmthyme2datadir]

In [11]:
def function_predict_thyme2(rs):
    """Classify the thyme2 sequences once and list those predicted as class 3.

    Loads the thyme name map, builds a ``TEClassification`` (SVM model,
    ``optimize=False``) with the thyme2 training feature directories and
    the thyme2 sequence file and feature directories as the prediction set,
    and pairs each entry of ``testenz_names`` with the matching entry of
    ``en.preds``.

    Args:
        rs: Value forwarded to ``TEClassification`` as ``random_seed``.

    Returns:
        list of mapped names (looked up in the thyme name map) for every
        sequence whose prediction equals 3, in ``testenz_names`` order.
    """
    name_lookup = get_test_map_dict(thyme_enz_map_file)
    classifier = TEClassification(
        enz_file,
        thyme2_enz_file,
        label_file,
        trainthyme2featdirs,
        thyme2featdirs,
        hyperparamfile=indhpoptfile,
        random_seed=rs,
        model='SVM',
        optimize=False,
    )
    # NOTE(review): 3 is the class label being collected; its meaning comes
    # from the label file -- confirm.
    return [
        name_lookup[enz_name]
        for enz_name, label in zip(classifier.testenz_names, classifier.en.preds)
        if label == 3
    ]

def thyme2_pred_multifunc(N):
    """Run ``function_predict_thyme2`` for the seeds ``0 .. N-1`` in parallel.

    A process pool with one worker per CPU maps ``function_predict_thyme2``
    over ``range(N)``; a tqdm progress bar tracks the completed runs.

    Args:
        N: Number of runs; run ``i`` receives ``i`` as its seed.

    Returns:
        list with ``N`` entries, entry ``i`` being the result of
        ``function_predict_thyme2(i)``.
    """
    # Use the pool as a context manager so the worker processes are released
    # when the runs finish (previously the pool was never closed). All
    # results are collected inside the block, before the pool is torn down.
    with mp.Pool(mp.cpu_count()) as pool:
        all_names = list(tqdm.tqdm(pool.imap(function_predict_thyme2, range(N)), total=N))
    return all_names

def save_thyme2_preds(N):
    """Run ``N`` thyme2 predictions and write the results to two CSV files.

    ``thyme2set_multipreds.csv`` receives every distinct predicted name, one
    per line, in sorted order. ``thyme2_multipreds.csv`` receives one line
    per run in the form ``<seed>,<name>,<name>,...``; a run with no
    predicted names is written as ``<seed>,``.

    Args:
        N: Number of runs (seeds ``0 .. N-1``) passed to
            ``thyme2_pred_multifunc``.

    Returns:
        None.
    """
    thyme2_preds = thyme2_pred_multifunc(N)

    filename_all = '../data/predictions/thyme/thyme2_multipreds.csv'
    filename_set = '../data/predictions/thyme/thyme2set_multipreds.csv'

    # Flatten with a set comprehension (linear time; ``sum(lists, [])``
    # re-copies the accumulated list at every step) and sort so the file
    # content does not depend on set iteration order.
    unique_enzymes = sorted({enz for enz_list in thyme2_preds for enz in enz_list})
    with open(filename_set, 'w') as f:
        f.writelines(f'{enz}\n' for enz in unique_enzymes)

    with open(filename_all, 'w') as f:
        # Entry i of thyme2_preds belongs to seed i.
        for rs, enz_list in enumerate(thyme2_preds):
            f.write(f'{rs},{",".join(enz_list)}\n')

In [12]:
# Run the thyme2 prediction for 1000 seeds (0-999) and write both result files.
save_thyme2_preds(1000)

100%|██████████| 1000/1000 [15:15<00:00,  1.09it/s]


# Count how often each enzyme was predicted across all runs

## Test Enzymes

In [35]:
def read_enzymes(filename):
    """Collect the enzyme names stored in a per-run predictions file.

    Each line is split on commas; the first field is skipped and the
    remaining non-empty fields are appended to the result, so a name
    appears once for every line that lists it.

    Args:
        filename: Path of the comma-separated predictions file.

    Returns:
        list of enzyme names in file order, duplicates included.
    """
    enzymes = []
    with open(filename, 'r') as f:
        for line in f:
            data = line.strip().split(',')
            # A run without predictions is stored as a row such as "7,",
            # which splits into ['7', '']; drop the empty field so it is not
            # counted as an enzyme named ''.
            enzymes.extend(name for name in data[1:] if name)
    return enzymes


def save_enzyme_counts(filename, enz_list):
    """Write the frequency of each entry of ``enz_list`` to ``filename``.

    The file gets one ``<name>,<count>`` line per distinct entry, ordered
    from most to least frequent as returned by ``Counter.most_common``.

    Args:
        filename: Path of the output file (overwritten if it exists).
        enz_list: Iterable of enzyme names, duplicates included.

    Returns:
        None.
    """
    tally = Counter(enz_list)
    rows = [f'{name},{freq}\n' for name, freq in tally.most_common()]
    with open(filename, 'w') as out:
        out.writelines(rows)

In [36]:
# Use a dedicated name for the predictions file. Rebinding `test_enz_file`
# here would replace the sequence-file path that function_predict_test reads
# as a global, so a later re-run of save_test_preds would be handed the
# predictions CSV instead of the test sequences.
test_pred_file = '../data/predictions/test/test_multipreds.csv'

# Tally how often each test enzyme was predicted and save the counts.
test_enz_preds = read_enzymes(test_pred_file)
save_enzyme_counts('../data/predictions/test/test_enzcount.csv', test_enz_preds)

## Thyme Enzymes

In [37]:
# Use dedicated names for the predictions files. Rebinding `thyme1_enz_file`
# / `thyme2_enz_file` here would replace the sequence-file paths that
# function_predict_thyme1 / function_predict_thyme2 read as globals, so a
# later re-run of the predictions would be handed the predictions CSVs
# instead of the thyme sequences.
thyme1_pred_file = '../data/predictions/thyme/thyme1_multipreds.csv'

# Tally how often each thyme1 name was predicted and save the counts.
thyme1_enz_preds = read_enzymes(thyme1_pred_file)
save_enzyme_counts('../data/predictions/thyme/thyme1_enzcount.csv', thyme1_enz_preds)

thyme2_pred_file = '../data/predictions/thyme/thyme2_multipreds.csv'

# Tally how often each thyme2 name was predicted and save the counts.
thyme2_enz_preds = read_enzymes(thyme2_pred_file)
save_enzyme_counts('../data/predictions/thyme/thyme2_enzcount.csv', thyme2_enz_preds)