In [17]:
# modules
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from IndModels.ModelsEdited import NGModel,GAACModel
import numpy as np
from AutoPos.AutoPosModel import AutoPosClass
from sklearn.feature_selection import chi2,mutual_info_regression
from mySVM.model import SVM,SVMRegressor
from Ensemble.model import Ensemble
import multiprocessing as mp

In [18]:
class TERegression:
    
    def __init__(self,enzymedata,alignedenzymedata,enzymenamemap,
                ifeaturefilenames,ifeaturelabelfile,random_seed=None):
        
        self.random_seed = random_seed
        
        # original data based on which everything is obtained
        self.df = pd.read_csv(enzymedata,header=None)
        self.enz_names = self.df[0].values
        self.X = self.df.iloc[:,1].values
        self.y = self.df.iloc[:,-1].values
        
        # training and testing data for kmer and gaakmer - will be used for others as well
        self.X_train, self.X_test, self.y_train, self.y_test,self.enz_train,self.enz_test = train_test_split(self.X, self.y,self.enz_names, test_size=0.25, random_state=self.random_seed)
        
        #ng and gaang model
        self.ngmodel = NGModel(self.X_train,self.X_test,self.y_train,self.y_test,random_seed=self.random_seed,regression=True)
        self.gmodel = GAACModel(self.X_train,self.X_test,self.y_train,self.y_test,random_seed=self.random_seed,regression=True)

        # aligned data for autopos
        self.df_algn = pd.read_csv(alignedenzymedata,header=None).set_index(0)
        
        # make train test for autopos based on train test for original data
        self.df_algn_train = self.df_algn.loc[self.enz_train]
        self.df_algn_test = self.df_algn.loc[self.enz_test]
        
        self.X_train_algn,self.y_train_algn = self.df_algn_train[1].values,self.df_algn_train[2].values
        self.X_test_algn,self.y_test_algn = self.df_algn_test[1].values,self.df_algn_test[2].values
    
        
        #autopos model
        self.apmodel = AutoPosClass(self.X_train_algn,self.X_test_algn,self.y_train_algn,self.y_test_algn,mutual_info_regression,None,50,imp=True,verbose=False,regression=True)
        
        # the ifeature vectors are mapped to the original enzyme names (will change this later)
        # get the alias to original name mappings
        self.df_enmap = pd.read_csv(enzymenamemap,header=None)
        self.enz_mapdict = dict(zip(self.df_enmap[0],self.df_enmap[1]))
        
        self.ifeat_label_file = ifeaturelabelfile
        
        # getting all SVM objects together 
        self.SVMobjects = list(map(self.get_model_ifeat,ifeaturefilenames))
        self.SVMobjects.extend([self.ngmodel.SVMobject,self.gmodel.SVMobject,self.apmodel.SVMobject])
        
        # select only the best models based on training
        self.best_idx,self.best_models = self.select_top_7_models(self.SVMobjects)
        
        # getting all model predictions together
        self.all_model_preds = [svmo.ypredtest for svmo in self.best_models]
        self.en = Ensemble(self.all_model_preds,self.y_test)
        
        pass
    
    
    def get_model_ifeat(self,ifeatfilename):
        df1 = pd.read_csv(ifeatfilename,header=None)
        df2 = pd.read_csv(self.ifeat_label_file,header=None)
        df_feat = df1.merge(df2,on=0).set_index(0)
        enz_name_train = [self.enz_mapdict[al] for al in self.enz_train]
        enz_name_test = [self.enz_mapdict[al] for al in self.enz_test]
        df_feat_train = df_feat.loc[enz_name_train]
        df_feat_test = df_feat.loc[enz_name_test]
        X_train_feat,y_train_feat = df_feat_train.iloc[:,0:-1].values,df_feat_train.iloc[:,-1].values
        X_test_feat,y_test_feat = df_feat_test.iloc[:,0:-1].values,df_feat_test.iloc[:,-1].values
        if X_train_feat.shape[1]<40:
            n_comp = int(0.75*X_train_feat.shape[1])
        else:
            n_comp=40
        svm = SVMRegressor(X_train_feat,X_test_feat,y_train_feat,y_test_feat,verbose=False,optimize=False, pca_comp=n_comp,random_seed=self.random_seed)
        return svm
        
    def select_top_7_models(self,SVMOs):
        svm_train_accs = [svmo.acc_train for svmo in SVMOs]
        sorted_idx = np.argsort(svm_train_accs)[::-1]
        best_idx = sorted_idx[:3]
        return best_idx,np.array(SVMOs)[best_idx]
        

In [19]:
datadir = 'Data/TE_ML_Data/'
enz_file = datadir + 'EnzymeDatasetRegression.csv'
enz_file_aligned = datadir + 'EnzymeDatasetAlignedRegression.csv'


In [20]:
df = pd.read_csv(enz_file,header=None)
enz_names = df[0].values
X = df.iloc[:,1].values
y = df.iloc[:,-1].values

# training and testing data for kmer and gaakmer - will be used for others as well
X_train, X_test, y_train, y_test,enz_train,enz_test = train_test_split(X, y,enz_names, test_size=0.25, random_state=7)


In [21]:
enz_name_map = datadir + 'EnzymeNameMap.csv'

In [22]:
df_enmap = pd.read_csv(enz_name_map,header=None)
enz_mapdict = dict(zip(df_enmap[0],df_enmap[1]))
        

In [23]:
ifeatdatadir = 'Data/FeatureVectors/iFeatureVectors/'
ifeature_files = [ifeatdatadir+f.name for f in os.scandir(ifeatdatadir)]
ifeatlabelfile = 'Data/Labels/'+'Labels_Regression.csv'

In [24]:
def get_model_ifeat(ifeatfilename):
    df1 = pd.read_csv(ifeatfilename,header=None)
    df2 = pd.read_csv(ifeatlabelfile,header=None)
    df_feat = df1.merge(df2,on=0).set_index(0)
    enz_name_train = [enz_mapdict[al] for al in enz_train]
    enz_name_test = [enz_mapdict[al] for al in enz_test]
    df_feat_train = df_feat.loc[enz_name_train]
    df_feat_test = df_feat.loc[enz_name_test]
    X_train_feat,y_train_feat = df_feat_train.iloc[:,0:-1].values,df_feat_train.iloc[:,-1].values
    X_test_feat,y_test_feat = df_feat_test.iloc[:,0:-1].values,df_feat_test.iloc[:,-1].values
    if X_train_feat.shape[1]<40:
        n_comp = int(0.75*X_train_feat.shape[1])
    else:
        n_comp=40
    svm = SVMRegressor(X_train_feat,X_test_feat,y_train_feat,y_test_feat,verbose=False,optimize=False, pca_comp=n_comp,random_seed=self.random_seed)
    return svm

In [29]:
df1 = pd.read_csv(ifeature_files[0],header=None)
df2 = pd.read_csv(ifeatlabelfile,header=None)
df_feat = df1.merge(df2,on=0).set_index(0)


In [33]:
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,231,232,233,234,235,236,237,238,239,240
0,A._hypogaea_l._AhFatA,1.059601,1.035644,1.030299,1.037473,1.117120,0.969175,1.048442,1.002341,0.924542,...,1.083196,0.959607,1.037849,0.905386,0.922135,1.044258,0.997960,0.977693,0.971270,1.018606
1,Arabidopsis_thaliana,1.038211,1.086110,0.997523,1.042527,1.048423,0.960117,0.959658,0.996927,0.938147,...,0.994237,1.054042,1.046776,0.916757,0.967150,0.958708,1.051377,1.027984,1.049874,0.986125
2,Auxenochlorella_protothecoides,0.991754,1.044612,1.091602,1.097133,1.008558,0.861974,1.011384,0.987881,1.036885,...,1.070561,1.062828,0.901601,0.948079,0.927181,0.945350,0.989628,1.067135,1.016484,1.020914
3,Brassica_juncea_BjFatB1,1.046705,1.086684,1.011947,1.040608,1.070325,0.945798,0.939856,0.961976,0.941488,...,1.001038,1.039166,1.029962,0.932098,1.011826,0.909744,1.055923,1.108728,1.063555,1.011879
4,Brassica_juncea_BjFatB2,1.047791,1.103053,1.005984,1.040264,1.046655,0.941978,0.946726,0.963830,0.923890,...,1.018626,1.030433,1.041315,0.906356,0.985699,0.924480,1.065097,1.096078,1.085648,1.011539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,UcFatB1R197M-M199H-T231K,1.089223,1.078905,0.996980,0.958548,1.069109,1.081807,0.968990,1.020585,1.000026,...,1.010365,1.015678,0.933337,0.961092,1.067753,1.023884,1.048715,1.006420,1.076937,1.030424
112,UcFatB1T231K,1.091090,1.085235,1.004887,0.957201,1.066033,1.073516,0.973669,1.016755,0.987042,...,1.017074,1.012758,0.931386,0.964969,1.064291,1.018962,1.048085,1.005716,1.076174,1.032665
113,Ulmus_americana,1.075791,1.087098,1.037337,1.024645,1.007201,0.980951,0.918619,1.035046,0.969454,...,0.977342,0.955235,1.012640,0.960966,1.002977,0.947901,1.104033,1.009689,1.063204,1.017038
114,Umbellularia_californica_UcFatB1,1.087501,1.096164,1.027661,0.944367,1.058115,1.052678,1.055625,1.018800,0.966755,...,1.051973,0.962804,0.930152,1.021401,1.038595,1.040446,1.021367,1.025952,1.067458,1.065059


In [31]:
df_feat

Unnamed: 0_level_0,1_x,2,3,4,5,6,7,8,9,10,...,232,233,234,235,236,237,238,239,240,1_y
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A._hypogaea_l._AhFatA,1.059601,1.035644,1.030299,1.037473,1.117120,0.969175,1.048442,1.002341,0.924542,1.034378,...,0.959607,1.037849,0.905386,0.922135,1.044258,0.997960,0.977693,0.971270,1.018606,0.042200
Arabidopsis_thaliana,1.038211,1.086110,0.997523,1.042527,1.048423,0.960117,0.959658,0.996927,0.938147,1.050950,...,1.054042,1.046776,0.916757,0.967150,0.958708,1.051377,1.027984,1.049874,0.986125,0.000000
Auxenochlorella_protothecoides,0.991754,1.044612,1.091602,1.097133,1.008558,0.861974,1.011384,0.987881,1.036885,1.076971,...,1.062828,0.901601,0.948079,0.927181,0.945350,0.989628,1.067135,1.016484,1.020914,0.012846
Brassica_juncea_BjFatB1,1.046705,1.086684,1.011947,1.040608,1.070325,0.945798,0.939856,0.961976,0.941488,1.027882,...,1.039166,1.029962,0.932098,1.011826,0.909744,1.055923,1.108728,1.063555,1.011879,0.000000
Brassica_juncea_BjFatB2,1.047791,1.103053,1.005984,1.040264,1.046655,0.941978,0.946726,0.963830,0.923890,1.023000,...,1.030433,1.041315,0.906356,0.985699,0.924480,1.065097,1.096078,1.085648,1.011539,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sorghum_bicolor_4,0.978596,1.103386,1.111992,0.922916,1.003046,1.035498,0.992215,0.973949,1.039455,1.038537,...,1.047308,0.981318,0.978263,1.004468,1.050323,1.052657,1.040071,0.879311,0.970605,0.060000
UcFatB1T231K,1.091090,1.085235,1.004887,0.957201,1.066033,1.073516,0.973669,1.016755,0.987042,1.023753,...,1.012758,0.931386,0.964969,1.064291,1.018962,1.048085,1.005716,1.076174,1.032665,0.827146
Ulmus_americana,1.075791,1.087098,1.037337,1.024645,1.007201,0.980951,0.918619,1.035046,0.969454,1.046533,...,0.955235,1.012640,0.960966,1.002977,0.947901,1.104033,1.009689,1.063204,1.017038,0.810000
Umbellularia_californica_UcFatB1,1.087501,1.096164,1.027661,0.944367,1.058115,1.052678,1.055625,1.018800,0.966755,1.038958,...,0.962804,0.930152,1.021401,1.038595,1.040446,1.021367,1.025952,1.067458,1.065059,0.827146


In [32]:
get_model_ifeat(ifeature_files[0])

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Index(['UcFatB1R197M-M199H-T231K', 'UcFatB1197M-M199H'], dtype='object', name=0). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

In [8]:
%%time
if __name__=='__main__':
    # Sequence and label files for autopos,kmer and gaa-kmer
    datadir = 'Data/TE_ML_Data/'
    enz_file = datadir + 'EnzymeDatasetRegression.csv'
    enz_file_aligned = datadir + 'EnzymeDatasetAlignedRegression.csv'

    # Feature files for iFeature 
    ifeatdatadir = 'Data/FeatureVectors/iFeatureVectors/'
    ifeature_files = [ifeatdatadir+f.name for f in os.scandir(ifeatdatadir)]
    ifeatlabelfile = 'Data/Labels/'+'Labels_Regression.csv'

    # alias to original enzyme name
    enz_name_map = datadir + 'EnzymeNameMap.csv'
    te = TERegression(enz_file,enz_file_aligned,enz_name_map,
                          ifeature_files,ifeatlabelfile,random_seed=7)
    
    def multi_func(rs):
        te_i = TERegression(enz_file,enz_file_aligned,enz_name_map,
                          ifeature_files,ifeatlabelfile,random_seed=rs)
        return te_i.en.acc    
    
    pool = mp.Pool(mp.cpu_count())

    
    
    
#     accs = list(pool.map(multi_func,range(10)))
    


KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Index(['Cuphea_viscosisssima_CvB2MT2', 'Cuphea_viscosisssima_CvB2MT47',\n       'A._hypogaea_l._AhFatA', 'Cuphea_viscosisssima_CvB2MT48',\n       'Cuphea_leptopoda_Cl2FatB2',\n       ...\n       'Cuphea_viscosisssima_CvB2MT24', 'Cuphea_viscosisssima_CvB2MT18',\n       'Cuphea_viscosisssima_CvB2MT44', 'Brassica_juncea_BjFatB1',\n       'Cuphea_viscosisssima_CvB2MT42'],\n      dtype='object', name=0, length=61). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

In [4]:
print(min(accs),np.mean(accs),max(accs),np.std(accs))

NameError: name 'accs' is not defined