In [1]:
# modules
import re
import os
import sys
import itertools
import numpy as np
import pandas as pd
import multiprocessing as mp
from sklearn.metrics import precision_score,recall_score,accuracy_score,f1_score
from sklearn.model_selection import train_test_split

sys.path.append('../')
from ensemble.model import Ensemble
from baseModels.SVM.model import SVM
from featEngg.online.kmerMethods.models import ngModel,gaangModel

In [6]:
class TEClassification:
    """Train one SVM per feature representation and combine the best ones in an ensemble.

    All of the work happens in ``__init__``: the sequence/label CSVs are merged, split
    into training and validation parts, one ``SVM`` object is built for every
    ``*.csv.gz`` feature file plus two k-mer based feature sets, the ``n_models``
    objects with the highest ``acc_valid`` are kept and their predictions are handed
    to ``Ensemble``.

    Attributes set during construction (non exhaustive):
        X_train, X_valid, y_train, y_valid, enz_train, enz_valid: the split data.
        featnames: feature-set names, in the same order as ``objects``.
        objects: one ``SVM`` object per feature set.
        best_idx, best_models, best_model_names: the selected top models.
        en: the ``Ensemble`` built from the selected models' predictions.
        precision: only set when no test data is given; precision restricted to
            label 3 on the validation split.
    """

    def __init__(self,enzseqdata,testenzseqdata,labelfile,trainfeaturefiledirs,testfeaturefiledirs,
                 hyperparamfile,random_seed=None,n_models=17,validation_fraction=0.25):
        """Load the data, fit every base model and build the ensemble.

        Args:
            enzseqdata: header-less CSV; column 0 is the enzyme name, column 1 is
                used as the model input (presumably the sequence -- TODO confirm).
            testenzseqdata: header-less CSV with the same layout for the test set;
                only read when ``testfeaturefiledirs`` is truthy.
            labelfile: header-less CSV keyed on column 0 (enzyme name); its last
                column is used as the label.
            trainfeaturefiledirs: iterable of directories holding ``*.csv.gz``
                training feature files.
            testfeaturefiledirs: matching directories for the test set, or a falsy
                value to run in validation-only mode.
            hyperparamfile: CSV with a ``feat_name`` column and ``pca_comp``,
                ``regC`` and ``kernel`` columns.
            random_seed: forwarded to ``train_test_split`` and to every ``SVM``.
            n_models: number of top models kept for the ensemble.
            validation_fraction: fraction of the data held out for validation.
        """
        self.random_seed = random_seed
        self.n_models = n_models
        self.validation_fraction = validation_fraction
        # Test mode is switched on purely by the presence of test feature directories.
        self.test = bool(testfeaturefiledirs)

        # original data based on which everything is obtained
        seq_df = pd.read_csv(enzseqdata,header=None)
        label_df = pd.read_csv(labelfile,header=None)
        self.train_df = seq_df.merge(label_df,on=0)

        self.enz_names = self.train_df[0].values
        self.X = self.train_df.iloc[:,1].values
        self.y = self.train_df.iloc[:,-1].values

        # training and validation data for general use
        (self.X_train,self.X_valid,
         self.y_train,self.y_valid,
         self.enz_train,self.enz_valid) = train_test_split(
            self.X,self.y,self.enz_names,
            test_size=self.validation_fraction,random_state=self.random_seed)

        self.label_file = labelfile

        # test data
        if self.test:
            self.test_df = pd.read_csv(testenzseqdata,header=None)
            self.testenz_names = self.test_df[0].values
            self.X_test = self.test_df.iloc[:,1].values
        else:
            self.X_test = None

        self.df_hyperparam = pd.read_csv(hyperparamfile).set_index('feat_name')

        # kmer and gaakmer
        ng = ngModel(self.X_train,self.X_valid,self.X_test)
        gaang = gaangModel(self.X_train,self.X_valid,self.X_test)
        kmernames = ['kmer','gaakmer']
        kmerObjs = [self.get_model_online(m.Xtrain,m.Xvalid,self.y_train,self.y_valid,m.Xtest)
                    for m in (ng,gaang)]

        # generate a list of names from the directories (scanned once, in sorted order)
        train_entries = self._list_feature_files(trainfeaturefiledirs)
        trainfeatfiles = [path for path,_ in train_entries]
        trainfilenames = [fname for _,fname in train_entries]
        self.featnames = [fname.replace('.csv.gz','') for fname in trainfilenames]

        if self.test:
            test_entries = self._list_feature_files(testfeaturefiledirs)
            testfilenames = [fname for _,fname in test_entries]
            # Train and test files are paired by position, so the names must line up.
            assert trainfilenames==testfilenames, \
                'train and test feature directories must contain the same feature files'
            testfeatfiles = [path for path,_ in test_entries]
            func_iter = list(zip(trainfeatfiles,self.featnames,testfeatfiles))
        else:
            func_iter = list(zip(trainfeatfiles,self.featnames))

        # getting all SVM objects together
        self.objects = list(itertools.starmap(self.get_model_offline,func_iter))

        self.featnames.extend(kmernames)
        self.objects.extend(kmerObjs)

        # select only the best models based on validation accuracy
        self.best_idx,self.best_models = self.select_top_models(self.objects)
        self.best_model_names = np.array(self.featnames)[self.best_idx]

        # getting all model predictions together for ensemble
        if not self.test:
            self.all_model_preds = [o.ypredvalid for o in self.best_models]
            self.en = Ensemble(self.all_model_preds,self.y_valid)
            # Precision is restricted to label 3 (presumably the class of interest -- TODO confirm).
            self.precision = precision_score(self.y_valid,self.en.preds,labels=[3],average='micro')
        else:
            self.all_model_preds = [o.yhattest for o in self.best_models]
            self.en = Ensemble(self.all_model_preds)

    @staticmethod
    def _list_feature_files(featdirs):
        """Return ``(path, filename)`` pairs for every ``*.csv.gz`` file in ``featdirs``.

        Directories are visited in the order given; within a directory the files
        are sorted by name because ``os.scandir`` yields entries in arbitrary
        order, which would make train/test pairing and model order depend on the
        filesystem.
        """
        entries = []
        for featdir in featdirs:
            filenames = sorted(f.name for f in os.scandir(featdir) if f.name.endswith('.csv.gz'))
            entries.extend((os.path.join(featdir,fname),fname) for fname in filenames)
        return entries

    def get_model_online(self,X_train,X_valid,y_train,y_valid,X_test=None):
        """Build an ``SVM`` object for a feature matrix computed in memory.

        Uses fixed hyper-parameters (``regC=30``, RBF kernel). The number of PCA
        components is 55, or 75% of the feature count (rounded down) when the
        matrix has fewer than 55 columns.

        Args:
            X_train, X_valid: feature matrices; only ``X_train.shape[1]`` is read here.
            y_train, y_valid: matching label arrays.
            X_test: optional test feature matrix.

        Returns:
            The constructed ``SVM`` object.
        """
        n_features = X_train.shape[1]
        pca_components = int(0.75*n_features) if n_features<55 else 55

        # Pass the computed value: a hard-coded 55 would exceed the number of
        # available features for narrow matrices.
        obj = SVM(X_train,X_valid,y_train,y_valid,X_test,pca_comp=pca_components,
                           regC=30,kern='rbf',optimize=False,
                           verbose=False,random_seed=self.random_seed)

        return obj

    def get_model_offline(self,featfilename,featname,testfeatfilename=None):
        """Build an ``SVM`` object for a feature file stored on disk.

        Args:
            featfilename: header-less CSV of training features keyed on column 0.
            featname: row label in the hyper-parameter table for this feature set.
            testfeatfilename: header-less CSV of test features; read only in test mode.

        Returns:
            The constructed ``SVM`` object.

        Raises:
            ValueError: if the configured number of PCA components exceeds the
                number of feature columns.
        """
        df1 = pd.read_csv(featfilename,header=None)
        df2 = pd.read_csv(self.label_file,header=None)
        df_feat = df1.merge(df2,on=0).set_index(0)
        # Re-use the split made in __init__ so every model sees the same rows.
        df_feat_train = df_feat.loc[self.enz_train]
        df_feat_valid = df_feat.loc[self.enz_valid]
        # The last column is the label brought in by the merge; the rest are features.
        X_train_feat,y_train_feat = df_feat_train.iloc[:,0:-1].values,df_feat_train.iloc[:,-1].values
        X_valid_feat,y_valid_feat = df_feat_valid.iloc[:,0:-1].values,df_feat_valid.iloc[:,-1].values

        pca_components = self.df_hyperparam.loc[featname,'pca_comp']
        regCparam = self.df_hyperparam.loc[featname,'regC']
        kernparam = self.df_hyperparam.loc[featname,'kernel']

        if X_train_feat.shape[1]<pca_components:
            raise ValueError('Wrong Hyper Parameter')

        if self.test:
            df_feat_test = pd.read_csv(testfeatfilename,header=None).set_index(0)
            X_test_feat = df_feat_test.loc[self.testenz_names].values
            # Best-effort diagnostic: name the file whose train/test widths disagree.
            if X_train_feat.shape[1] != X_test_feat.shape[1]:
                print(featfilename)
        else:
            X_test_feat = None

        obj = SVM(X_train_feat,X_valid_feat,y_train_feat,y_valid_feat,X_test_feat,pca_comp=pca_components,
                           regC=regCparam,kern=kernparam,optimize=False,
                           verbose=False,random_seed=self.random_seed)
        return obj

    def select_top_models(self,Os):
        """Pick the ``n_models`` objects with the highest ``acc_valid``.

        Args:
            Os: sequence of model objects exposing an ``acc_valid`` attribute.

        Returns:
            ``(best_idx, best_models)``: the indices into ``Os`` in descending
            accuracy order and the corresponding objects as a NumPy array.
        """
        o_valid_accs = [o.acc_valid for o in Os]
        sorted_idx = np.argsort(o_valid_accs)[::-1]
        best_idx = sorted_idx[:self.n_models]
        return best_idx,np.array(Os)[best_idx]

    

def get_precision(y,yhat):
    """Return the precision for label 3, rounded to two decimals.

    Args:
        y: true labels.
        yhat: predicted labels.
    """
    score = precision_score(y,yhat,labels=[3],average='micro')
    return round(score,2)


def get_recall(y,yhat):
    """Return the recall for label 3, rounded to two decimals.

    Args:
        y: true labels.
        yhat: predicted labels.
    """
    score = recall_score(y,yhat,labels=[3],average='micro')
    return round(score,2)


def get_f1_score(y,yhat):
    """Return the micro-averaged F1 score over all labels, rounded to two decimals.

    Args:
        y: true labels.
        yhat: predicted labels.
    """
    # NOTE(review): unlike get_precision/get_recall this is not restricted to
    # labels=[3]; the recorded report shows F1 identical to accuracy -- confirm
    # that an all-label micro average is what is wanted here.
    score = f1_score(y,yhat,average='micro')
    return round(score,2)

def get_accuracy(y,yhat):
    """Return the accuracy, rounded to two decimals.

    Args:
        y: true labels.
        yhat: predicted labels.
    """
    score = accuracy_score(y,yhat)
    return round(score,2)

def write_model_stats(model_metrics,filename=None):
    """Append the mean of each metric as one ``ensemble`` row to a CSV file.

    Args:
        model_metrics: iterable of ``(precision, recall, f1_score, accuracy)``
            records, one per run.
        filename: destination CSV. When omitted, the notebook-level global
            ``filename_model`` is used, which must then already be defined.

    Returns:
        Whatever ``DataFrame.to_csv`` returns for the given destination.

    Raises:
        ValueError: if ``model_metrics`` is empty (a row of NaNs would otherwise
            be appended to the results file).
    """
    rows = list(model_metrics)
    if not rows:
        raise ValueError('model_metrics is empty; nothing to summarise')

    if filename is None:
        # Backward-compatible fallback to the path defined in a notebook cell.
        filename = filename_model

    metric_names = ('precision','recall','f1_score','accuracy')
    datadict = {name:[np.mean([row[pos] for row in rows])]
                for pos,name in enumerate(metric_names)}

    df = pd.DataFrame(datadict,index=['ensemble'])
    # Append mode without a header: each call adds exactly one data row.
    return df.to_csv(filename,mode='a',index=True,header=False)

def write_model_report(model_metrics):
    """Summarise the per-run metrics as a one-row DataFrame indexed by ``ensemble``.

    Args:
        model_metrics: sequence of ``(precision, recall, f1_score, accuracy)`` records.

    Returns:
        DataFrame whose columns are ``'<stat> <metric>'`` for the stats
        min, max, mean and std (in that order) of each metric.
    """
    metric_names = ('precision','recall','f1_score','accuracy')
    per_metric = {name:[m[pos] for m in model_metrics] for pos,name in enumerate(metric_names)}

    # Evaluate the reducers in the original order (mean, min, max, std) ...
    reducers = {'mean':np.mean,'min':min,'max':max,'std':np.std}
    summary = {(stat,name):reduce_fn(per_metric[name])
               for stat,reduce_fn in reducers.items()
               for name in metric_names}

    # ... but lay the columns out as min, max, mean, std.
    datadict = {f'{stat} {name}':[summary[(stat,name)]]
                for stat in ('min','max','mean','std')
                for name in metric_names}

    return pd.DataFrame(datadict,index=['ensemble'])


def save_model_metrics(metrics,filename='../data/results/EnsembleResults/MultiClassEnsemble.csv'):
    """Write the per-run metrics to a header-less CSV file, one run per line.

    Args:
        metrics: iterable of 4-item records ``(precision, recall, f1_score, accuracy)``.
        filename: destination path; an existing file is overwritten. Missing
            parent directories are created so a long run is not lost to a
            ``FileNotFoundError`` at the very end.

    Returns:
        None.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir,exist_ok=True)

    with open(filename, 'w') as f:
        for prec,rec,f1,acc in metrics:
            f.write(f'{prec},{rec},{f1},{acc}')
            f.write('\n')

    return

In [3]:
%%time
if __name__=='__main__':
    # Sequence and label files
    enz_file = '../data/seq/EnzymeSequence.csv'
    label_file = '../data/label/EnzymeLabelsMultiClass.csv'

    hyperparam_file = '../data/results/HyperParameterOptimization/IndHPOpt.csv'

    # Feature dir for iFeature,kernel,pssm
    ifeatdatadir = '../featEngg/offline/ifeatMethods/data/featvec/trainfiles/'
    kerneldatadir = '../featEngg/offline/kernelMethods/data/featvec/trainfiles/'
    pssmdatadir = '../featEngg/offline/pssmMethods/data/featvec/trainfiles/'

    trainfeatdirs = [ifeatdatadir,kerneldatadir,pssmdatadir]

    def multi_func(rs):
        """Fit a 5-model ensemble with random seed ``rs`` and score it on its validation split.

        Returns:
            ``(precision, recall, f1_score, accuracy)`` as produced by the
            ``get_*`` helper functions.
        """
        te_i = TEClassification(enz_file,None,label_file,trainfeatdirs,None,hyperparam_file,random_seed=rs,n_models=5)
        yi = te_i.y_valid
        yhati = te_i.en.preds
        return (get_precision(yi,yhati),
                get_recall(yi,yhati),
                get_f1_score(yi,yhati),
                get_accuracy(yi,yhati))

    # One run per seed in 0..9999. The context manager shuts the worker
    # processes down once map() has returned, instead of leaving the pool open.
    with mp.Pool(mp.cpu_count()) as pool:
        models = pool.map(multi_func,range(10000))
    


CPU times: user 588 ms, sys: 187 ms, total: 775 ms
Wall time: 53min 8s


In [4]:
# Output paths for the summary CSVs. `filename_model` is read as a global by
# write_model_stats(), so this cell has to run before that function is called.
filename_model = '../data/results/feat_results.csv'
# Not read by any live code in the cells shown here.
filename_model_report = '../data/results/feat_report.csv'

In [5]:
# Summarise the per-seed metrics (min / max / mean / std); the returned DataFrame is the cell output.
write_model_report(models)

Unnamed: 0,min precision,min recall,min f1_score,min accuracy,max precision,max recall,max f1_score,max accuracy,mean precision,mean recall,mean f1_score,mean accuracy,std precision,std recall,std f1_score,std accuracy
ensemble,0.5,0.5,0.55,0.55,1.0,1.0,1.0,1.0,0.898873,0.916701,0.829638,0.829638,0.081997,0.078663,0.06185,0.06185


In [7]:
# Persist the raw per-seed metric tuples as CSV rows.
save_model_metrics(models)

In [6]:
# Append the mean of each metric to the results CSV; relies on `filename_model` from an earlier cell.
write_model_stats(models)