In [1]:
# modules
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from IndModels.ModelsEdited import NGModel,GAACModel
import numpy as np
from AutoPos.AutoPosModel import AutoPosClass
from sklearn.feature_selection import chi2,mutual_info_regression
from mySVM.model import SVM,SVMRegressor
from Ensemble.model import EnsembleRegression
import multiprocessing as mp

In [2]:
class TERegression:
    
    def __init__(self,enzymedata,alignedenzymedata,enzymenamemap,
                ifeaturefilenames,ifeaturelabelfile,random_seed=None):
        
        self.random_seed = random_seed
        
        # original data based on which everything is obtained
        self.df = pd.read_csv(enzymedata,header=None)
        self.enz_names = self.df[0].values
        self.X = self.df.iloc[:,1].values
        self.y = self.df.iloc[:,-1].values
        
        # training and testing data for kmer and gaakmer - will be used for others as well
        self.X_train, self.X_test, self.y_train, self.y_test,self.enz_train,self.enz_test = train_test_split(self.X, self.y,self.enz_names, test_size=0.25, random_state=self.random_seed)
        
        #ng and gaang model
        self.ngmodel = NGModel(self.X_train,self.X_test,self.y_train,self.y_test,random_seed=self.random_seed,regression=True)
        self.gmodel = GAACModel(self.X_train,self.X_test,self.y_train,self.y_test,random_seed=self.random_seed,regression=True)

        # aligned data for autopos
        self.df_algn = pd.read_csv(alignedenzymedata,header=None).set_index(0)
        
        # make train test for autopos based on train test for original data
        self.df_algn_train = self.df_algn.loc[self.enz_train]
        self.df_algn_test = self.df_algn.loc[self.enz_test]
        
        self.X_train_algn,self.y_train_algn = self.df_algn_train[1].values,self.df_algn_train[2].values
        self.X_test_algn,self.y_test_algn = self.df_algn_test[1].values,self.df_algn_test[2].values
        
        assert all(self.y_test==self.y_test_algn)
        
        #autopos model
        self.apmodel = AutoPosClass(self.X_train_algn,self.X_test_algn,self.y_train_algn,self.y_test_algn,mutual_info_regression,None,50,imp=True,verbose=False,regression=True)
        
        # the ifeature vectors are mapped to the original enzyme names (will change this later)
        # get the alias to original name mappings
        self.df_enmap = pd.read_csv(enzymenamemap,header=None)
        self.enz_mapdict = dict(zip(self.df_enmap[0],self.df_enmap[1]))
        
        self.ifeat_label_file = ifeaturelabelfile
        
        # getting all SVM objects together 
        self.SVMobjects = list(map(self.get_model_ifeat,ifeaturefilenames))
        self.SVMobjects.extend([self.ngmodel.SVMobject,self.gmodel.SVMobject,self.apmodel.SVMobject])
        
        # select only the best models based on training
        self.best_idx,self.best_models = self.select_top_models(self.SVMobjects)
        
        # getting all model predictions together
        self.all_model_preds = [svmo.ypredtest for svmo in self.best_models]
        self.en = EnsembleRegression(self.all_model_preds,self.y_test)
        
        pass
    
    
    def get_model_ifeat(self,ifeatfilename):
        df1 = pd.read_csv(ifeatfilename,header=None)
        df2 = pd.read_csv(self.ifeat_label_file,header=None)
        df_feat = df1.merge(df2,on=0).set_index(0)
        enz_name_train = [self.enz_mapdict[al] for al in self.enz_train]
        enz_name_test = [self.enz_mapdict[al] for al in self.enz_test]
        df_feat_train = df_feat.loc[enz_name_train]
        df_feat_test = df_feat.loc[enz_name_test]
        X_train_feat,y_train_feat = df_feat_train.iloc[:,0:-1].values,df_feat_train.iloc[:,-1].values
        X_test_feat,y_test_feat = df_feat_test.iloc[:,0:-1].values,df_feat_test.iloc[:,-1].values
        assert all(self.y_test==y_test_feat)
        assert all(self.y_train==y_train_feat)
        if X_train_feat.shape[1]<40:
            n_comp = int(0.75*X_train_feat.shape[1])
        else:
            n_comp=40
        svm = SVMRegressor(X_train_feat,X_test_feat,y_train_feat,y_test_feat,verbose=False,optimize=False, pca_comp=n_comp,random_seed=self.random_seed)
        return svm
        
    def select_top_models(self,SVMOs):
        svm_train_accs = [svmo.error_train for svmo in SVMOs]
        sorted_idx = np.argsort(svm_train_accs)[::-1]
        best_idx = sorted_idx[:3]
        return best_idx,np.array(SVMOs)[best_idx]
        

In [3]:
%%time
if __name__=='__main__':
    # Sequence and label files for autopos,kmer and gaa-kmer
    datadir = 'Data/TE_ML_Data/'
    enz_file = datadir + 'EnzymeDatasetRegression.csv'
    enz_file_aligned = datadir + 'EnzymeDatasetAlignedRegression.csv'

    # Feature files for iFeature 
    ifeatdatadir = 'Data/FeatureVectors/iFeatureVectors/'
    ifeature_files = [ifeatdatadir+f.name for f in os.scandir(ifeatdatadir)]
    ifeatlabelfile = 'Data/Labels/'+'Labels_Regression.csv'

    # alias to original enzyme name
    enz_name_map = datadir + 'EnzymeNameMap.csv'
    te = TERegression(enz_file,enz_file_aligned,enz_name_map,
                          ifeature_files,ifeatlabelfile,random_seed=7)
    
    def multi_func(rs):
        te_i = TERegression(enz_file,enz_file_aligned,enz_name_map,
                          ifeature_files,ifeatlabelfile,random_seed=rs)
        return te_i.en.mse   
    
    pool = mp.Pool(mp.cpu_count())

    
    
    
    mses = list(pool.map(multi_func,range(10000)))
    


CPU times: user 12.5 s, sys: 1.47 s, total: 13.9 s
Wall time: 1h 21min 59s


In [4]:
print(min(mses),np.mean(mses),max(mses),np.std(mses))

0.03598897944584124 0.08093976230667074 0.16690193708025364 0.015617864012292223
