# Prediction of the 9 test subjects

In [1]:
import sys
import tqdm
import multiprocessing as mp

In [2]:
# Put ../scripts/ on the import path so modules stored there can be imported by later cells
sys.path.append('../scripts/')

In [3]:
from MultiClassClassification import TEClassification

In [4]:
# Input files: training sequences, test sequences, test-name mapping (read by
# get_test_map_dict), class labels, and the file passed to TEClassification as `hyperparamfile`
enz_file = '../data/seq/EnzymeSequence.csv'
test_enz_file = '../data/seq/TestEnzymeSequence.csv'
test_enz_map_file = '../data/TestEnzymeNameMap.csv'
label_file = '../data/label/EnzymeLabelsMultiClass.csv'
indhpoptfile = '../data/results/hpopt/IndHPOpt.csv'

# Training-set feature-vector directories, one per feature family (iFeature, kernel, PSSM)
ifeatdatadir = '../featEngg/offline/ifeatMethods/data/featvec/trainfiles/'
kerneldatadir = '../featEngg/offline/kernelMethods/data/featvec/trainfiles/'
pssmdatadir = '../featEngg/offline/pssmMethods/data/featvec/trainfiles/'

# Passed to TEClassification as the training feature dirs; the test list below uses the same family order
trainfeatdirs = [ifeatdatadir,kerneldatadir,pssmdatadir]

# Test-set feature-vector directories, same three feature families
ifeattestdatadir = '../featEngg/offline/ifeatMethods/data/featvec/testfiles/'
kerneltestdatadir = '../featEngg/offline/kernelMethods/data/featvec/testfiles/'
pssmtestdatadir = '../featEngg/offline/pssmMethods/data/featvec/testfiles/'

testfeatdirs = [ifeattestdatadir,kerneltestdatadir,pssmtestdatadir]

In [5]:
%%time
# Single timed run on the test sequences: seed 2, SVM model, optimize=False
te = TEClassification(
    enz_file,
    test_enz_file,
    label_file,
    trainfeatdirs,
    testfeatdirs,
    hyperparamfile=indhpoptfile,
    random_seed=2,
    model='SVM',
    optimize=False,
)

APAAC
GTPC
Moran
DPC
KSCTriad
QSOrder
CTDT
AAC
Geary
CKSAAGP
DDE
CTDD
PAAC
CTDC
NMBroto
CTriad
SOCNumber
TPC
CKSAAP
GAAC
GDPC
mismatchKernel
spectrumKernel
gappyKernel
aadp_pssm
dp_pssm
aatp
rpm_pssm
s_fpssm
medp
ab_pssm
eedp
pse_pssm
aac_pssm
pssm_ac
d_fpssm
k_separated_bigrams_pssm
pssm_cc
edp
tpc
smoothed_pssm
pssm_composition
dpc_pssm
tri_gram_pssm
rpssm
CPU times: user 43.3 s, sys: 3.67 s, total: 47 s
Wall time: 13 s


In [6]:
def get_test_map_dict(filename):
    """Read a comma-separated file into a dict mapping field 0 to field 1.

    Parameters
    ----------
    filename : str
        Path to a text file with at least two comma-separated fields per line.

    Returns
    -------
    dict
        ``{first_field: second_field}`` for every non-blank line. If a key
        appears more than once, the last occurrence wins.

    Raises
    ------
    IndexError
        If a non-blank line contains fewer than two fields.
    """
    map_dict = {}
    with open(filename, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Blank lines (e.g. a trailing empty line at end of file) carry no
            # mapping; indexing field 1 on them would raise IndexError.
            if not line:
                continue
            data = line.split(",")
            map_dict[data[0]] = data[1]
    return map_dict

def function_predict_test(rs):
    """Return the mapped names of test enzymes predicted as class 3 for seed ``rs``.

    A new TEClassification is built with ``random_seed=rs``. Each test enzyme
    whose entry in ``en.preds`` equals 3 is translated through the name map
    read from ``test_enz_map_file`` and collected in prediction order.
    """
    name_lookup = get_test_map_dict(test_enz_map_file)
    classifier = TEClassification(
        enz_file, test_enz_file, label_file, trainfeatdirs, testfeatdirs,
        hyperparamfile=indhpoptfile, random_seed=rs, model='SVM', optimize=False,
    )
    paired = zip(classifier.testenz_names, classifier.en.preds)
    return [name_lookup[enz] for enz, label in paired if label == 3]

def test_pred_multifunc(N):
    """Run ``function_predict_test`` for seeds 0..N-1 across a process pool.

    Parameters
    ----------
    N : int
        Number of seeds to evaluate; seeds are ``range(N)``.

    Returns
    -------
    list of list
        One list of predicted names per seed, in seed order
        (``Pool.imap`` yields results in input order).
    """
    # Use the pool as a context manager so the worker processes are shut down
    # once every result has been collected, instead of lingering after the call.
    with mp.Pool(mp.cpu_count()) as pool:
        all_names = list(tqdm.tqdm(pool.imap(function_predict_test, range(N)), total=N))
    return all_names

def save_test_preds():
    """Write the multi-seed test predictions in the global ``test_preds`` to two CSV files.

    ``test_preds`` must already exist as a list of name lists, one per seed
    (the shape returned by ``test_pred_multifunc``).

    Files written:
    * ``testset_multipreds.csv`` - the union of names over all seeds, one per
      line, in sorted order.
    * ``test_multipreds.csv`` - one line per seed: ``<seed index>,<name>,<name>,...``.
    """
    filename_all = '../data/predictions/test/test_multipreds.csv'
    filename_set = '../data/predictions/test/testset_multipreds.csv'

    # Set comprehension builds the union without the quadratic list copying of
    # sum(lists, []); sorting makes the file reproducible, since plain set
    # iteration order for strings can differ between interpreter runs.
    set_test_preds = sorted({enz for enz_list in test_preds for enz in enz_list})
    with open(filename_set, 'w') as f:
        for enz in set_test_preds:
            f.write(enz)
            f.write('\n')

    # enumerate() covers every run in test_preds; zipping against a fixed
    # range(24) would silently drop any runs beyond the 24th.
    with open(filename_all, 'w') as f:
        for rs, enz_list in enumerate(test_preds):
            f.write(f'{rs},{",".join(enz_list)}')
            f.write('\n')
    return

In [7]:
#test_preds = test_pred_multifunc(24)



In [8]:
#save_test_preds()

# Prediction of thyme1 sequences

In [9]:
# thyme1 sequence file and the thyme name-mapping file (read by get_test_map_dict)
thyme1_enz_file = '../data/seq/ThymeEnzymeSequence1.csv'
thyme_enz_map_file = '../data/ThymeEnzymeNameMap.csv'

# Only the kernel training features get a thyme1-specific directory; the iFeature and PSSM
# training dirs defined earlier are reused (presumably kernel features depend on the prediction set — TODO confirm)
kerneltrain1datadir = '../featEngg/offline/kernelMethods/data/featvec/thymefiles1/train/'

trainthyme1featdirs = [ifeatdatadir,kerneltrain1datadir,pssmdatadir]

# Feature-vector directories for the thyme1 sequences themselves (iFeature, kernel, PSSM)
ifeatthyme1datadir = '../featEngg/offline/ifeatMethods/data/featvec/thymefiles1/'
kernelthyme1datadir = '../featEngg/offline/kernelMethods/data/featvec/thymefiles1/thyme/'
pssmthyme1datadir = '../featEngg/offline/pssmMethods/data/featvec/thymefiles1/'

thyme1featdirs = [ifeatthyme1datadir,kernelthyme1datadir,pssmthyme1datadir]

In [10]:
def function_predict_thyme1(rs):
    """Return the mapped names of thyme1 sequences predicted as class 3 for seed ``rs``.

    A new TEClassification is built on the thyme1 feature directories with
    ``random_seed=rs``. Each sequence whose entry in ``en.preds`` equals 3 is
    translated through the name map read from ``thyme_enz_map_file`` and
    collected in prediction order.
    """
    name_lookup = get_test_map_dict(thyme_enz_map_file)
    classifier = TEClassification(
        enz_file, thyme1_enz_file, label_file, trainthyme1featdirs, thyme1featdirs,
        hyperparamfile=indhpoptfile, random_seed=rs, model='SVM', optimize=False,
    )
    paired = zip(classifier.testenz_names, classifier.en.preds)
    return [name_lookup[enz] for enz, label in paired if label == 3]

def thyme1_pred_multifunc(N):
    """Run ``function_predict_thyme1`` for seeds 0..N-1 across a process pool.

    Parameters
    ----------
    N : int
        Number of seeds to evaluate; seeds are ``range(N)``.

    Returns
    -------
    list of list
        One list of predicted names per seed, in seed order
        (``Pool.imap`` yields results in input order).
    """
    # Use the pool as a context manager so the worker processes are shut down
    # once every result has been collected, instead of lingering after the call.
    with mp.Pool(mp.cpu_count()) as pool:
        all_names = list(tqdm.tqdm(pool.imap(function_predict_thyme1, range(N)), total=N))
    return all_names

def save_thyme1_preds():
    """Write the multi-seed thyme1 predictions in the global ``thyme1_preds`` to two CSV files.

    ``thyme1_preds`` must already exist as a list of name lists, one per seed
    (the shape returned by ``thyme1_pred_multifunc``).

    Files written:
    * ``thyme1set_multipreds.csv`` - the union of names over all seeds, one
      per line, in sorted order.
    * ``thyme1_multipreds.csv`` - one line per seed: ``<seed index>,<name>,<name>,...``.
    """
    filename_all = '../data/predictions/thyme/thyme1_multipreds.csv'
    # NOTE(review): this path sits under predictions/test/ while filename_all is
    # under predictions/thyme/ - looks like a copy-paste leftover. Left unchanged
    # in case downstream code reads this location; confirm and move if unintended.
    filename_set = '../data/predictions/test/thyme1set_multipreds.csv'

    # Set comprehension builds the union without the quadratic list copying of
    # sum(lists, []); sorting makes the file reproducible, since plain set
    # iteration order for strings can differ between interpreter runs.
    set_thyme1_preds = sorted({enz for enz_list in thyme1_preds for enz in enz_list})
    with open(filename_set, 'w') as f:
        for enz in set_thyme1_preds:
            f.write(enz)
            f.write('\n')

    # enumerate() covers every run in thyme1_preds; zipping against a fixed
    # range(24) would silently drop any runs beyond the 24th.
    with open(filename_all, 'w') as f:
        for rs, enz_list in enumerate(thyme1_preds):
            f.write(f'{rs},{",".join(enz_list)}')
            f.write('\n')
    return

In [11]:
%%time
# Single timed run on the thyme1 sequences: seed 0, SVM model, optimize=False
te_thyme1 = TEClassification(
    enz_file,
    thyme1_enz_file,
    label_file,
    trainthyme1featdirs,
    thyme1featdirs,
    hyperparamfile=indhpoptfile,
    random_seed=0,
    model='SVM',
    optimize=False,
)

APAAC
GTPC
Moran
DPC
KSCTriad
QSOrder
CTDT
AAC
Geary
CKSAAGP
DDE
CTDD
PAAC
CTDC
NMBroto
CTriad
SOCNumber
TPC
CKSAAP
GAAC
GDPC
mismatchKernel
spectrumKernel
gappyKernel
aadp_pssm
dp_pssm
aatp
rpm_pssm
s_fpssm
medp
ab_pssm
eedp
pse_pssm
aac_pssm
pssm_ac
d_fpssm
k_separated_bigrams_pssm
pssm_cc
edp
tpc
smoothed_pssm
pssm_composition
dpc_pssm
tri_gram_pssm
rpssm
CPU times: user 7min 55s, sys: 16.7 s, total: 8min 11s
Wall time: 6min 56s


In [11]:
# Evaluate seeds 0-23 in parallel; thyme1_preds holds one list of predicted names per seed
thyme1_preds = thyme1_pred_multifunc(24)

  0%|          | 0/24 [00:00<?, ?it/s]

CTDC
CTDC
CTDCCTDCCTDC

CTDC
CTDC
CTDD

CTDD
CTDC
CTDDCTDC

CTDDCTDDCTDC


CTDDCTDC
CTDDCTDC


CTDC
CTDCCTDD
CTDD

CTDDCTDD

CTDC
CTDCCTDC

CTDD
SOCNumber
CTDCCTDCCTDC

CTDDSOCNumber


CTDDSOCNumber

CTDCSOCNumberCTDDSOCNumberCTDC


SOCNumber

SOCNumber
CTDD

CTDD
CTDD
CTDDAPAACCTDCCTDC
CTDD



SOCNumberSOCNumberAPAAC
SOCNumber
SOCNumber


APAAC
CTDDCTDDAPAAC
APAACAPAAC

SOCNumber

APAAC

CTDD
SOCNumberCTDD

SOCNumber
CTriadSOCNumber
APAACAPAACSOCNumber

CTriadAPAAC


SOCNumberSOCNumber


CTriadCTriadSOCNumberSOCNumber



CTriad
CTriadAPAACAPAAC
CTriad


SOCNumber
APAACAPAAC
APAACAPAAC


SOCNumberAPAACCTriadSOCNumberSOCNumber
APAAC
CTriadAPAAC

CTriad




APAACCTriad
APAACCTriad


CTriadCTriadCTriad
TPC


CTriadAPAACAPAACAPAAC

TPC
TPC

CTriadCTriad

TPCTPC

CTriad

TPCTPC

CTriad
CTriad
TPCTPCCTriad


CTriad
TPCCTriad

TPC
TPC
TPCTPCTPC

TPC
TPC
TPC

TPC
TPCTPC

TPC
TPC
TPC
CKSAAGP
CKSAAGPCKSAAGP


# Prediction of thyme2 sequences

In [13]:
# thyme2 sequence file (labels and the name map come from the variables defined earlier)
thyme2_enz_file = '../data/seq/ThymeEnzymeSequence2.csv'

# Only the kernel training features get a thyme2-specific directory; the iFeature and PSSM
# training dirs defined earlier are reused (presumably kernel features depend on the prediction set — TODO confirm)
kerneltrain2datadir = '../featEngg/offline/kernelMethods/data/featvec/thymefiles2/train/'

trainthyme2featdirs = [ifeatdatadir,kerneltrain2datadir,pssmdatadir]

# Feature-vector directories for the thyme2 sequences themselves (iFeature, kernel, PSSM)
ifeatthyme2datadir = '../featEngg/offline/ifeatMethods/data/featvec/thymefiles2/'
kernelthyme2datadir = '../featEngg/offline/kernelMethods/data/featvec/thymefiles2/thyme/'
pssmthyme2datadir = '../featEngg/offline/pssmMethods/data/featvec/thymefiles2/'

thyme2featdirs = [ifeatthyme2datadir,kernelthyme2datadir,pssmthyme2datadir]

In [21]:
def function_predict_thyme2(rs):
    """Return the mapped names of thyme2 sequences predicted as class 3 for seed ``rs``.

    A new TEClassification is built on the thyme2 feature directories with
    ``random_seed=rs``. Each sequence whose entry in ``en.preds`` equals 3 is
    translated through the name map read from ``thyme_enz_map_file`` and
    collected in prediction order.
    """
    name_lookup = get_test_map_dict(thyme_enz_map_file)
    classifier = TEClassification(
        enz_file, thyme2_enz_file, label_file, trainthyme2featdirs, thyme2featdirs,
        hyperparamfile=indhpoptfile, random_seed=rs, model='SVM', optimize=False,
    )
    paired = zip(classifier.testenz_names, classifier.en.preds)
    return [name_lookup[enz] for enz, label in paired if label == 3]

def thyme2_pred_multifunc(N):
    """Run ``function_predict_thyme2`` for seeds 0..N-1 across a process pool.

    Parameters
    ----------
    N : int
        Number of seeds to evaluate; seeds are ``range(N)``.

    Returns
    -------
    list of list
        One list of predicted names per seed, in seed order
        (``Pool.imap`` yields results in input order).
    """
    # Use the pool as a context manager so the worker processes are shut down
    # once every result has been collected, instead of lingering after the call.
    with mp.Pool(mp.cpu_count()) as pool:
        all_names = list(tqdm.tqdm(pool.imap(function_predict_thyme2, range(N)), total=N))
    return all_names

def save_thyme2_preds():
    """Write the multi-seed thyme2 predictions in the global ``thyme2_preds`` to two CSV files.

    ``thyme2_preds`` must already exist as a list of name lists, one per seed
    (the shape returned by ``thyme2_pred_multifunc``).

    Files written:
    * ``thyme2set_multipreds.csv`` - the union of names over all seeds, one
      per line, in sorted order.
    * ``thyme2_multipreds.csv`` - one line per seed: ``<seed index>,<name>,<name>,...``.
    """
    filename_all = '../data/predictions/thyme/thyme2_multipreds.csv'
    # NOTE(review): this path sits under predictions/test/ while filename_all is
    # under predictions/thyme/ - looks like a copy-paste leftover. Left unchanged
    # in case downstream code reads this location; confirm and move if unintended.
    filename_set = '../data/predictions/test/thyme2set_multipreds.csv'

    # Set comprehension builds the union without the quadratic list copying of
    # sum(lists, []); sorting makes the file reproducible, since plain set
    # iteration order for strings can differ between interpreter runs.
    set_thyme2_preds = sorted({enz for enz_list in thyme2_preds for enz in enz_list})
    with open(filename_set, 'w') as f:
        for enz in set_thyme2_preds:
            f.write(enz)
            f.write('\n')

    # enumerate() covers every run in thyme2_preds; zipping against a fixed
    # range(24) would silently drop any runs beyond the 24th.
    with open(filename_all, 'w') as f:
        for rs, enz_list in enumerate(thyme2_preds):
            f.write(f'{rs},{",".join(enz_list)}')
            f.write('\n')
    return

In [None]:
# Evaluate seeds 0-23 in parallel; thyme2_preds holds one list of predicted names per seed
thyme2_preds = thyme2_pred_multifunc(24)