In [1]:
# Per-feature results table; it is loaded below with its first column as the
# index and sorted by accuracy to pick the best-performing features.
filename = '../data/results/feat_results.csv'

In [2]:
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
import itertools 
import multiprocessing as mp
import numpy as np
import sys
from sklearn.metrics import precision_score,recall_score,accuracy_score,f1_score

sys.path.append('../')
from baseModels.SVM.model import SVM

In [3]:
# Destination CSV for the aggregated min/max/mean/std report that
# write_model_report produces.
filename_model_report = '../data/results/feat_report.csv'

In [4]:
# Load the per-feature results; the first CSV column becomes the row index.
df = pd.read_csv(filename,index_col=0)

In [5]:
# Show the six rows with the highest accuracy, best first.
df.sort_values('accuracy',ascending=False).head(6)

Unnamed: 0,precision,recall,f1_score,accuracy
DDE,0.895216,0.907472,0.806711,0.806711
CTriad,0.888547,0.893962,0.793264,0.793264
KSCTriad,0.888547,0.893962,0.793264,0.793264
TPC,0.892713,0.89274,0.791088,0.791088
ensemble,0.874912,0.891455,0.790993,0.790993
gappyKernel,0.880781,0.857368,0.787154,0.787154


In [6]:
# Feature names carried forward from the accuracy ranking shown above.
# The 'ensemble' row of that table is not included in this list.
best_feats = ['DDE','CTriad','KSCTriad','TPC','gappyKernel'] 

In [7]:
# Hyperparameter optimization results, one row per feature.
hyperparam_file = '../data/results/HyperParameterOptimization/IndHPOpt.csv'

# Indexed by 'feat_name' so model_evaluate can look up the 'pca_comp', 'regC'
# and 'kernel' columns for a given feature with .loc.
df_hyperparam = pd.read_csv(hyperparam_file).set_index('feat_name')

In [8]:
# Label CSV; model_evaluate reads it without a header row and merges it with
# each feature file on column 0.
label_file = '../data/label/EnzymeLabelsMultiClass.csv'

In [9]:
ifeat_feature_dir = '../featEngg/offline/ifeatMethods/data/featvec/trainfiles/'
# Keep only the gzipped CSV feature vectors whose feature name is in best_feats.
# os.scandir yields entries in arbitrary order, so the paths are sorted to give
# a reproducible file order (and therefore report row order) across runs.
ifeat_feature_files = sorted(
    ifeat_feature_dir + entry.name
    for entry in os.scandir(ifeat_feature_dir)
    if entry.name.endswith('.csv.gz') and entry.name.replace('.csv.gz', '') in best_feats
)

In [10]:
kernel_feature_dir = '../featEngg/offline/kernelMethods/data/featvec/trainfiles/'
# Keep only the gzipped CSV feature vectors whose feature name is in best_feats.
# os.scandir yields entries in arbitrary order, so the paths are sorted to give
# a reproducible file order (and therefore report row order) across runs.
kernel_feature_files = sorted(
    kernel_feature_dir + entry.name
    for entry in os.scandir(kernel_feature_dir)
    if entry.name.endswith('.csv.gz') and entry.name.replace('.csv.gz', '') in best_feats
)

In [11]:
# NOTE(review): this path is identical to kernel_feature_dir, and
# pssm_feature_files is not included when feature_files is assembled.
# Confirm the intended PSSM directory and whether these files should be evaluated.
pssm_feature_dir = '../featEngg/offline/kernelMethods/data/featvec/trainfiles/'
# Scan and prefix with pssm_feature_dir itself (previously the kernel directory
# variable was used here, so changing pssm_feature_dir had no effect).
# Sorted because os.scandir yields entries in arbitrary order.
pssm_feature_files = sorted(
    pssm_feature_dir + entry.name
    for entry in os.scandir(pssm_feature_dir)
    if entry.name.endswith('.csv.gz') and entry.name.replace('.csv.gz', '') in best_feats
)

In [12]:
# Files to evaluate: the iFeature-based and kernel-based selections combined.
# pssm_feature_files is not part of this list.
feature_files = ifeat_feature_files + kernel_feature_files

In [13]:
def get_featname_from_file(filename):
    """Return the feature name encoded in a ``<dir>/<name>.csv.gz`` path.

    Parameters
    ----------
    filename : str
        Path (or bare file name) ending in ``.csv.gz``; directories are
        separated by ``/``.

    Returns
    -------
    str
        The final path component with the ``.csv.gz`` suffix removed.

    Raises
    ------
    ValueError
        If ``filename`` does not end in ``<name>.csv.gz``.
    """
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    # The directory part is optional so a bare file name is accepted as well.
    match = re.match(r"^(?:.*/)?([^/]+)\.csv\.gz$", filename)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.
        raise ValueError('Not a .csv.gz feature file: {!r}'.format(filename))
    return match.group(1)
    

In [14]:
def model_evaluate(filename,random_seed):
    """Fit and score one SVM on a single random split of one feature file.

    Parameters
    ----------
    filename : str
        Path to a headerless feature CSV; its feature name (taken from the
        file name) selects the row of ``df_hyperparam`` to use.
    random_seed : int
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(featname, precision, recall, f1_score, accuracy)`` as produced by
        the ``get_*`` helper functions.

    Raises
    ------
    ValueError
        If the configured number of PCA components exceeds the number of
        feature columns.
    """
    featname = get_featname_from_file(filename)

    # Both CSVs are read without a header row and joined on column 0.
    feature_table = pd.read_csv(filename, header=None)
    label_table = pd.read_csv(label_file, header=None)
    merged = feature_table.merge(label_table, on=0)

    enz_names = merged[0].values
    X = merged.iloc[:, 1:-1].values  # everything between the key and the last column
    y = merged.iloc[:, -1].values    # last column of the merged frame

    # Hyperparameters stored for this feature.
    pca_components = df_hyperparam.loc[featname, 'pca_comp']
    regCparam = df_hyperparam.loc[featname, 'regC']
    kernparam = df_hyperparam.loc[featname, 'kernel']

    # 75/25 split; the key column is split alongside X and y.
    split = train_test_split(X, y, enz_names, test_size=0.25, random_state=random_seed)
    X_train, X_valid, y_train, y_valid, enz_train, enz_valid = split

    if pca_components > X_train.shape[1]:
        raise ValueError('Wrong Hyper Parameter')

    svm = SVM(X_train, X_valid, y_train, y_valid, verbose=False, optimize=False,
              pca_comp=pca_components, kern=kernparam, regC=regCparam)

    # Presumably the validation labels and the model's predictions for them;
    # verify against the SVM class.
    y_true = svm.yvalid
    y_pred = svm.ypredvalid

    return (
        featname,
        get_precision(y_true, y_pred),
        get_recall(y_true, y_pred),
        get_f1_score(y_true, y_pred),
        get_accuracy(y_true, y_pred),
    )

def get_precision(y,yhat):
    """Precision restricted to label 3 (micro average), rounded to 2 decimals."""
    score = precision_score(y, yhat, average='micro', labels=[3])
    return round(score, 2)


def get_recall(y,yhat):
    """Recall restricted to label 3 (micro average), rounded to 2 decimals."""
    score = recall_score(y, yhat, average='micro', labels=[3])
    return round(score, 2)


def get_f1_score(y,yhat):
    """Micro-averaged F1 over all labels, rounded to 2 decimals.

    NOTE(review): unlike get_precision and get_recall, no ``labels=[3]``
    restriction is applied here. Confirm whether that difference is intended.
    """
    score = f1_score(y, yhat, average='micro')
    return round(score, 2)

def get_accuracy(y,yhat):
    """Accuracy of ``yhat`` against ``y``, rounded to 2 decimals."""
    score = accuracy_score(y, yhat)
    return round(score, 2)


In [16]:
# Worker pool sized to the CPU count reported by mp.cpu_count().
pool = mp.Pool(mp.cpu_count())
# One task per (feature file, seed) pair: every file is paired with the seeds
# 0..9999, and all seeds of one file are adjacent in the list.
iterable = list(itertools.product(feature_files,range(10000)))

In [17]:
%%time
# Evaluate every (feature file, seed) pair in parallel. starmap already
# returns a list in input order, so no extra list() copy is needed.
try:
    models = pool.starmap(model_evaluate, iterable)
finally:
    # Release the worker processes once the batch is done (or has failed);
    # re-create the pool before running this cell again.
    pool.close()
    pool.join()

CPU times: user 380 ms, sys: 90.5 ms, total: 471 ms
Wall time: 28min 48s


In [22]:
def write_model_report(model_metrics, report_path=None):
    """Aggregate per-run metrics by feature name and write them to a CSV.

    Parameters
    ----------
    model_metrics : iterable of tuple
        Rows of ``(name, precision, recall, f1_score, accuracy)``, e.g. the
        values returned by ``model_evaluate``. Rows sharing a name do not
        need to be adjacent.
    report_path : str, optional
        Destination CSV. Defaults to the module-level
        ``filename_model_report``.

    Returns
    -------
    None
        The return value of ``DataFrame.to_csv`` when writing to a path.

    Notes
    -----
    The report has one row per feature name (in order of first appearance)
    and the columns ``min``/``max``/``mean``/``std`` x ``precision``/
    ``recall``/``f1_score``/``accuracy``. ``std`` is ``np.std`` with its
    default arguments.
    """
    metric_names = ('precision', 'recall', 'f1_score', 'accuracy')
    # Column order: all min columns, then max, mean and std.
    aggregations = (('min', min), ('max', max), ('mean', np.mean), ('std', np.std))
    column_names = [stat + ' ' + metric
                    for stat, _ in aggregations
                    for metric in metric_names]

    # Bucket rows by feature name. Unlike itertools.groupby this also merges
    # rows that are not adjacent, so a name can never appear twice in the index.
    index_names = []
    rows_by_name = {}
    for row in model_metrics:
        name = row[0]
        if name not in rows_by_name:
            rows_by_name[name] = []
            index_names.append(name)
        rows_by_name[name].append(row[1:])

    datadict = {column: [] for column in column_names}
    for name in index_names:
        rows = rows_by_name[name]
        for position, metric in enumerate(metric_names):
            values = [r[position] for r in rows]
            for stat, aggregate in aggregations:
                datadict[stat + ' ' + metric].append(aggregate(values))

    report = pd.DataFrame(datadict, index=index_names, columns=column_names)
    target = filename_model_report if report_path is None else report_path
    return report.to_csv(target, mode='w', index=True, header=True)

In [23]:
# Summarise all evaluation runs per feature and write the report CSV.
write_model_report(models)