In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import itertools 
import multiprocessing as mp
import numpy as np
import sys
from sklearn.metrics import precision_score,recall_score,accuracy_score,f1_score

sys.path.append('../')
from baseModels.SVM.model import SVM
from featEngg.online.kmerMethods.models import ngModel,gaangModel

In [2]:
# Relative paths to the two input CSVs that are loaded (with header=None) and
# merged on their first column further down.
# Presumably: enzyme sequences and their multi-class labels -- confirm against the data files.
enzseqdata = '../data/seq/EnzymeSequence.csv'
labelfile = '../data/label/EnzymeLabelsMultiClass.csv'

In [3]:
# Results CSV that write_model_stats() appends to (mode='a', no header row),
# so every run of the notebook adds rows to this file rather than replacing it.
filename_model = '../data/results/feat_results.csv'

In [4]:
# Both files are headerless, so pandas labels their columns 0, 1, ...;
# column 0 is the shared key the two tables are joined on.
df1 = pd.read_csv(enzseqdata, header=None)
df2 = pd.read_csv(labelfile, header=None)
df = pd.merge(df1, df2, on=0)

# Key column, second column and last column of the merged frame.
# Presumably: enzyme names, raw sequences and class labels -- confirm against the CSVs.
enz_names = df[0].values
X = df.iloc[:, 1].values
y = df.iloc[:, -1].values

In [5]:
def get_model_online(X_train,X_valid,y_train,y_valid):
    """Construct an ``SVM`` object with a PCA size capped at 55 components.

    When the training matrix has fewer than 55 columns, 75% of the column
    count (truncated to an int) is used instead of 55.

    Parameters
    ----------
    X_train, X_valid : array-like with a ``shape`` attribute
        Feature matrices; ``X_train.shape[1]`` is read as the feature count.
    y_train, y_valid : array-like
        Labels, forwarded unchanged to ``SVM``.

    Returns
    -------
    The ``SVM`` instance built from the four arrays.
    """
    n_features = X_train.shape[1]
    pca_components = 55 if n_features >= 55 else int(0.75*n_features)
    return SVM(X_train,X_valid,y_train,y_valid,pca_comp=pca_components,verbose=False)


In [6]:
def model_evaluate(random_seed):
    """Evaluate the ngModel and gaangModel feature sets on one random 75/25 split.

    Reads the notebook-level ``X``, ``y`` and ``enz_names`` arrays, splits them
    with ``random_state=random_seed``, builds both feature models, wraps each in
    the object returned by ``get_model_online`` and scores its validation
    predictions.

    Parameters
    ----------
    random_seed : int
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(precision, recall, f1, accuracy)`` for the ngModel features followed
        by the same four values for the gaangModel features (8 items in total).
    """
    split = train_test_split(X, y, enz_names, test_size=0.25, random_state=random_seed)
    # The last two entries are the enzyme-name halves of the split; they are not used here.
    X_train, X_valid, y_train, y_valid = split[:4]

    ng_i = ngModel(X_train, X_valid)
    gaang_i = gaangModel(X_train, X_valid)
    svm_objs = (
        get_model_online(ng_i.Xtrain, ng_i.Xvalid, y_train, y_valid),
        get_model_online(gaang_i.Xtrain, gaang_i.Xvalid, y_train, y_valid),
    )

    metrics = []
    for svm_obj in svm_objs:
        y_true = svm_obj.yvalid
        y_pred = svm_obj.ypredvalid
        metrics.append(get_precision(y_true, y_pred))
        metrics.append(get_recall(y_true, y_pred))
        metrics.append(get_f1_score(y_true, y_pred))
        metrics.append(get_accuracy(y_true, y_pred))

    return tuple(metrics)

In [7]:
def get_precision(y,yhat):
    """Return micro-averaged precision restricted to label 3, rounded to 2 decimals.

    ``y`` holds the true labels and ``yhat`` the predicted labels.
    """
    # NOTE(review): label 3 is presumably the class of interest -- confirm.
    score = precision_score(y, yhat, labels=[3], average='micro')
    return round(score, 2)


def get_recall(y,yhat):
    """Return micro-averaged recall restricted to label 3, rounded to 2 decimals.

    ``y`` holds the true labels and ``yhat`` the predicted labels.
    """
    # NOTE(review): label 3 is presumably the class of interest -- confirm.
    score = recall_score(y, yhat, labels=[3], average='micro')
    return round(score, 2)


def get_f1_score(y,yhat):
    """Return the micro-averaged F1 score over all labels, rounded to 2 decimals.

    ``y`` holds the true labels and ``yhat`` the predicted labels.
    """
    # NOTE(review): unlike get_precision/get_recall this is not restricted to
    # labels=[3] -- confirm whether that difference is intentional.
    score = f1_score(y, yhat, average='micro')
    return round(score, 2)

def get_accuracy(y,yhat):
    """Return the accuracy of ``yhat`` against ``y``, rounded to 2 decimals."""
    score = accuracy_score(y, yhat)
    return round(score, 2)

def write_model_stats(model_metrics, filename=None):
    """Append the mean of every metric, per feature model, to a results CSV.

    Parameters
    ----------
    model_metrics : sequence of 8-item sequences
        One entry per evaluation run, laid out as
        ``(precision, recall, f1, accuracy)`` for the k-mer model followed by
        the same four values for the gaa-kmer model (the order returned by
        ``model_evaluate``).
    filename : str or path-like, optional
        CSV file to append to. Defaults to the notebook-level
        ``filename_model`` so existing calls keep their behaviour.

    Returns
    -------
    None
        The return value of ``DataFrame.to_csv`` when it is given a path.

    Notes
    -----
    Two rows are appended (index ``k-mer`` and ``gaa-kmer``) with the columns
    precision, recall, f1_score, accuracy; no header row is written.
    """
    if filename is None:
        filename = filename_model

    metric_names = ['precision','recall','f1_score','accuracy']
    index_names = ['k-mer','gaa-kmer']

    # Each run stores the k-mer metrics at positions 0..3 and the gaa-kmer
    # metrics at positions 4..7, so a model's block starts at 0 or len(metric_names).
    block_starts = (0, len(metric_names))
    datadict = {
        name: [np.mean([mo[start + pos] for mo in model_metrics]) for start in block_starts]
        for pos, name in enumerate(metric_names)
    }

    df = pd.DataFrame(datadict,index=index_names)
    return df.to_csv(filename,mode='a',index=True,header=False)


In [8]:
%%time
pool = mp.Pool(mp.cpu_count())
models = list(pool.map(model_evaluate,range(10000)))

CPU times: user 164 ms, sys: 88.6 ms, total: 252 ms
Wall time: 1min 29s


In [9]:
# Average the per-split metrics collected in `models` and append the two
# summary rows (k-mer, gaa-kmer) to the results CSV.
write_model_stats(models)

# Previous model metric generation - no longer useful 

In [None]:
def get_model_stats(accs):
    """Summarise train/test scores of the NG and GAANG models over many runs.

    Parameters
    ----------
    accs : sequence
        One entry per run, indexed as ``accs[run][model][split]`` where
        ``model`` is 0 for NG and 1 for GAANG, and ``split`` is 0 for the
        train score and 1 for the test score.

    Returns
    -------
    dict
        ``{'NG': {...}, 'GAANG': {...}}``; each inner dict holds
        ``train_min``, ``train_mean``, ``train_max``, ``train_std``,
        ``test_min``, ``test_mean``, ``test_max`` and ``test_std`` in that order.

    Raises
    ------
    ValueError
        If ``accs`` is empty (``min`` of an empty list).
    """
    def _summary(prefix, values):
        # min/mean/max/std of one score list, keyed as '<prefix>_<stat>'.
        return {prefix + '_min': min(values), prefix + '_mean': np.mean(values),
                prefix + '_max': max(values), prefix + '_std': np.std(values)}

    stats = {}
    for name, model_pos in (('NG', 0), ('GAANG', 1)):
        train_scores = [run[model_pos][0] for run in accs]
        test_scores = [run[model_pos][1] for run in accs]
        # Train keys first, then test keys, to keep the column order of the summary table.
        stats[name] = {**_summary('train', train_scores), **_summary('test', test_scores)}
    return stats

In [8]:
# NOTE(review): `model_accs` is not assigned in any visible cell of this notebook,
# so this cell appears to fail with NameError on a fresh kernel -- confirm where it
# should come from, or remove this legacy section.
results_dict = get_model_stats(model_accs)

In [9]:
# Transpose so that each model ('NG', 'GAANG') is a row and each statistic a column.
results_df = pd.DataFrame(results_dict).T

In [10]:
# Display the summary table.
results_df

Unnamed: 0,train_min,train_mean,train_max,train_std,test_min,test_mean,test_max,test_std
NG,0.72093,0.824529,0.930233,0.036551,0.37931,0.68689,0.965517,0.080228
GAANG,0.732558,0.847505,0.94186,0.028474,0.37931,0.708759,1.0,0.080503


In [11]:
# Write the summary table (including the model-name index) to disk; this replaces any existing file.
results_df.to_csv('../data/SimResults/kmerFeatResults.csv',index=True)