In [1]:
# modules
import os
import sys
import numpy as np
import pandas as pd
import multiprocessing as mp
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

sys.path.append('../')
from baseModels.SVM.model import SVMRegressor
from ensemble.model import EnsembleRegression

In [2]:
class TERegression:
    """Fit one SVM regressor per feature file and ensemble the best of them.

    The constructor does all of the work: it loads the data, makes a single
    train/test split of the enzymes, fits an ``SVMRegressor`` on every
    ``*.csv.gz`` feature file found in the given directories, keeps the models
    with the lowest training error and hands their test predictions to
    ``EnsembleRegression``.

    Attributes set by ``__init__``:
        df: sequence file merged with the label file on column 0.
        enz_names, X, y: column 0, column 1 and the last column of ``df``.
        X_train, X_test, y_train, y_test, enz_train, enz_test: the split.
        featnames: feature-file names with ``.csv.gz`` removed.
        objects: one ``SVMRegressor`` per feature file.
        best_idx, best_models, best_model_names: the selected models.
        all_model_preds: ``ypredtest`` of every selected model.
        en: the ``EnsembleRegression`` built from those predictions.
    """

    # Tunables that used to be inline literals; the values are unchanged.
    FEATURE_SUFFIX = '.csv.gz'
    TEST_SIZE = 0.25
    N_TOP_MODELS = 27
    MAX_PCA_COMPONENTS = 40
    PCA_FRACTION = 0.75

    def __init__(self,enzseqdata,labelfile,featurefiledirs,random_seed=None):
        """Load the data, fit every per-feature model and build the ensemble.

        Args:
            enzseqdata: path to a header-less CSV whose column 0 is the key
                shared with the label file (presumably the enzyme name).
            labelfile: path to a header-less CSV keyed on column 0; its last
                column is used as the regression target.
            featurefiledirs: iterable of directories scanned (non-recursively)
                for ``*.csv.gz`` feature files.
            random_seed: seed passed to ``train_test_split`` and to every
                ``SVMRegressor``.
        """
        self.random_seed = random_seed
        # Stored up front because get_model_feat() reads it.
        self.label_file = labelfile

        # original data based on which everything is obtained
        seq_df = pd.read_csv(enzseqdata,header=None)
        # Use the `labelfile` argument; the previous code read a notebook-level
        # global named `label_file`, which only worked by accident.
        labels_df = pd.read_csv(labelfile,header=None)
        # Kept so get_model_feat() does not re-read the file for every feature set.
        self._labels_df = labels_df
        self.df = seq_df.merge(labels_df,on=0)

        self.enz_names = self.df[0].values
        self.X = self.df.iloc[:,1].values
        self.y = self.df.iloc[:,-1].values

        # training and testing data for general use
        (self.X_train, self.X_test,
         self.y_train, self.y_test,
         self.enz_train, self.enz_test) = train_test_split(
            self.X, self.y, self.enz_names,
            test_size=self.TEST_SIZE, random_state=self.random_seed)

        # paths and display names come from the same directory scan, so
        # featnames[i] always describes featfiles[i]
        featfiles, self.featnames = self._discover_feature_files(
            featurefiledirs, self.FEATURE_SUFFIX)

        # getting all SVM objects together
        self.objects = list(map(self.get_model_feat,featfiles))

        # select only the best models based on training
        self.best_idx,self.best_models = self.select_top_models(self.objects)
        self.best_model_names = np.array(self.featnames)[self.best_idx]

        # getting all model predictions together
        self.all_model_preds = [svmo.ypredtest for svmo in self.best_models]
        self.en = EnsembleRegression(self.all_model_preds,self.y_test)

    @staticmethod
    def _discover_feature_files(featurefiledirs, suffix='.csv.gz'):
        """Return ``(paths, names)`` for every file ending in ``suffix``.

        Each directory is scanned once, in the order ``os.scandir`` yields
        entries. ``paths`` are joined with ``os.path.join`` so a directory
        given without a trailing separator still works; ``names`` are the file
        names with ``suffix`` removed.
        """
        paths, names = [], []
        for directory in featurefiledirs:
            with os.scandir(directory) as entries:
                for entry in entries:
                    if entry.name.endswith(suffix):
                        paths.append(os.path.join(directory, entry.name))
                        names.append(entry.name.replace(suffix,''))
        return paths, names

    def get_model_feat(self,featfilename):
        """Fit an ``SVMRegressor`` on one feature file.

        The feature file (header-less CSV, column 0 is the join key) is merged
        with the labels and indexed by that key; rows are then picked with
        ``self.enz_train`` / ``self.enz_test`` so every model shares the same
        split. All columns but the last are features, the last is the target.

        Args:
            featfilename: path to the feature CSV.

        Returns:
            The ``SVMRegressor`` built from the train/test arrays.
        """
        feat_df = pd.read_csv(featfilename,header=None)
        # Reuse the labels loaded by __init__ when available; fall back to the
        # file for instances that were set up without going through __init__.
        labels_df = getattr(self, '_labels_df', None)
        if labels_df is None:
            labels_df = pd.read_csv(self.label_file,header=None)

        df_feat = feat_df.merge(labels_df,on=0).set_index(0)
        df_feat_train = df_feat.loc[self.enz_train]
        df_feat_test = df_feat.loc[self.enz_test]
        X_train_feat,y_train_feat = df_feat_train.iloc[:,0:-1].values,df_feat_train.iloc[:,-1].values
        X_test_feat,y_test_feat = df_feat_test.iloc[:,0:-1].values,df_feat_test.iloc[:,-1].values

        # Narrow feature sets get 75% of their width as the PCA component
        # count; anything at or above the cap uses the cap itself.
        n_features = X_train_feat.shape[1]
        if n_features < self.MAX_PCA_COMPONENTS:
            n_comp = int(self.PCA_FRACTION*n_features)
        else:
            n_comp = self.MAX_PCA_COMPONENTS

        obj = SVMRegressor(X_train_feat,X_test_feat,y_train_feat,y_test_feat,verbose=False,optimize=False, pca_comp=n_comp,random_seed=self.random_seed,regC=30,kern='rbf')
        return obj

    def select_top_models(self,SVMOs,n_top=None):
        """Pick the models with the smallest ``error_train``.

        Args:
            SVMOs: sequence of fitted models exposing ``error_train``.
            n_top: how many models to keep; defaults to ``N_TOP_MODELS`` (27).
                If fewer models are supplied, all of them are returned.

        Returns:
            ``(best_idx, best_models)``: the positions of the kept models in
            ``SVMOs`` ordered by ascending training error, and a NumPy array
            holding those models in the same order.
        """
        if n_top is None:
            n_top = self.N_TOP_MODELS
        svm_train_mses = [svmo.error_train for svmo in SVMOs]
        sorted_idx = np.argsort(svm_train_mses)
        best_idx = sorted_idx[:n_top]
        return best_idx,np.array(SVMOs)[best_idx]
        

In [3]:
%%time
if __name__=='__main__':
    # Sequence and label files
    enz_file = '../data/SeqFile/EnzymeSequence.csv'
    label_file = '../data/LabelFiles/EnzymeLabelsRegression.csv'

    # Feature dir for iFeature,kernel,pssm
    ifeatdatadir = '../featEngg/ifeatMethods/data/featvec/'
    kerneldatadir = '../featEngg/kernelMethods/data/featvec/'
    pssmdatadir = '../featEngg/pssmMethods/data/featvec/'
    featdirs = [ifeatdatadir,kerneldatadir,pssmdatadir]

    # Single reference run with a fixed seed.
    te = TERegression(enz_file,label_file,featdirs,random_seed=7)

    def multi_func(rs):
        """Build the full pipeline with seed `rs` and return its ensemble MSE."""
        te_i = TERegression(enz_file,label_file,featdirs,random_seed=rs)
        return te_i.en.mse

    # Number of seeds (0 .. n_runs-1) to evaluate.
    n_runs = 10000

    # The context manager shuts the worker processes down once map() has
    # returned, instead of leaving them alive for the rest of the session.
    # pool.map already returns a list, in the same order as the seeds.
    with mp.Pool(mp.cpu_count()) as pool:
        mses = pool.map(multi_func,range(n_runs))
    


CPU times: user 12.8 s, sys: 1.05 s, total: 13.9 s
Wall time: 1h 7min 53s


In [5]:
# Minimum, mean, maximum and standard deviation of `mses`, space-separated.
print(*(stat(mses) for stat in (min, np.mean, max, np.std)))

0.010830218731252417 0.044904942333331484 0.11592679965722981 0.011756249869242253


In [4]:
# Minimum, mean, maximum and standard deviation of `mses`, space-separated.
print(*(stat(mses) for stat in (min, np.mean, max, np.std)))

0.03598897944584124 0.08093976230667074 0.16690193708025364 0.015617864012292223
