In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import itertools 
import multiprocessing as mp
import numpy as np
import sys

sys.path.append('../')
from baseModels.SVM.model import SVM
from featEngg.online.kmerMethods.models import ngModel,gaangModel

In [2]:
# Input CSV locations, given relative to the directory the notebook runs from.
enzseqdata, labelfile = (
    '../data/SeqFile/EnzymeSequence.csv',
    '../data/LabelFiles/EnzymeLabelsMultiClass.csv',
)

In [3]:
# Neither CSV has a header row, so pandas labels the columns 0, 1, ...;
# column 0 is the key the two tables are joined on (pandas' default inner join).
df1 = pd.read_csv(enzseqdata, header=None)
df2 = pd.read_csv(labelfile, header=None)
df = pd.merge(df1, df2, on=0)

# Key column, second column and last column of the merged table as arrays.
# Presumably these are enzyme names, sequences and class labels - confirm
# against the CSV layout.
enz_names, X, y = df[0].values, df.iloc[:, 1].values, df.iloc[:, -1].values

In [4]:
def get_model_online(X_train,X_valid,y_train,y_valid):
    """Fit the project ``SVM`` wrapper on one split and report its accuracies.

    The ``pca_comp`` value handed to ``SVM`` is 55, unless the training matrix
    has fewer than 55 columns, in which case it is 75% of the column count
    truncated to an integer.

    Args:
        X_train: training feature matrix; only ``shape[1]`` is read here.
        X_valid: validation feature matrix.
        y_train: training labels.
        y_valid: validation labels.

    Returns:
        Tuple ``(acc_train, acc_valid)`` read from the constructed ``SVM`` object.
    """
    n_columns = X_train.shape[1]
    pca_components = 55 if n_columns >= 55 else int(0.75 * n_columns)

    classifier = SVM(
        X_train,
        X_valid,
        y_train,
        y_valid,
        pca_comp=pca_components,
        verbose=False,
    )
    return classifier.acc_train, classifier.acc_valid


In [5]:
def model_evaluate(random_seed):
    """Score the NG and GAANG feature models on one random 75/25 split.

    Reads the module-level ``X``, ``y`` and ``enz_names`` arrays, splits them
    with ``random_state=random_seed``, builds both feature models on the split,
    and evaluates each with ``get_model_online``.

    Args:
        random_seed: seed passed to ``train_test_split`` as ``random_state``.

    Returns:
        ``((ng_train_acc, ng_valid_acc), (gaang_train_acc, gaang_valid_acc))``.
    """
    # The name arrays are split alongside X and y but are not used afterwards.
    X_train, X_valid, y_train, y_valid, _names_train, _names_valid = train_test_split(
        X, y, enz_names, test_size=0.25, random_state=random_seed
    )

    # Build both feature models before fitting any classifier (same order as before).
    feature_models = [builder(X_train, X_valid) for builder in (ngModel, gaangModel)]

    scores = []
    for feats in feature_models:
        acc_train, acc_valid = get_model_online(feats.Xtrain, feats.Xvalid, y_train, y_valid)
        scores.append((acc_train, acc_valid))
    return tuple(scores)

In [6]:
def get_model_stats(accs):
    """Summarise per-run accuracies for the NG and GAANG models.

    Args:
        accs: sequence of runs, each shaped
            ``((ng_train, ng_test), (gaang_train, gaang_test))``.

    Returns:
        Dict keyed by ``'NG'`` and ``'GAANG'``; each value maps
        ``train_min/mean/max/std`` and ``test_min/mean/max/std`` to the
        corresponding statistic over all runs (``np.mean`` / ``np.std``).
    """
    def _spread(values):
        # min, mean, max, std of one list of accuracies
        return min(values), np.mean(values), max(values), np.std(values)

    summary = {}
    # Position 0 of every run holds the NG pair, position 1 the GAANG pair.
    for position, label in enumerate(('NG', 'GAANG')):
        pairs = [run[position] for run in accs]
        tr_min, tr_mean, tr_max, tr_std = _spread([pair[0] for pair in pairs])
        te_min, te_mean, te_max, te_std = _spread([pair[1] for pair in pairs])
        summary[label] = {
            'train_min': tr_min,
            'train_mean': tr_mean,
            'train_max': tr_max,
            'train_std': tr_std,
            'test_min': te_min,
            'test_mean': te_mean,
            'test_max': te_max,
            'test_std': te_std,
        }
    return summary

In [7]:
%%time
pool = mp.Pool(mp.cpu_count())
model_accs = list(pool.map(model_evaluate,range(10000)))

CPU times: user 124 ms, sys: 89.6 ms, total: 213 ms
Wall time: 1min 24s


In [8]:
# Collapse the per-seed accuracy pairs into min/mean/max/std per model.
results_dict = get_model_stats(accs=model_accs)

In [9]:
# Transpose so that each row is a model and each column a statistic.
results_df = pd.DataFrame(results_dict).transpose()

In [10]:
# Show the summary table (one row per model, one column per statistic) as the cell output.
results_df

Unnamed: 0,train_min,train_mean,train_max,train_std,test_min,test_mean,test_max,test_std
NG,0.72093,0.824529,0.930233,0.036551,0.37931,0.68689,0.965517,0.080228
GAANG,0.732558,0.847505,0.94186,0.028474,0.37931,0.708759,1.0,0.080503


In [11]:
# Save the summary table, index labels included, as a CSV file.
results_df.to_csv(
    path_or_buf='../data/SimResults/kmerFeatResults.csv',
    index=True,
)