In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import itertools 
import multiprocessing as mp
import numpy as np
import sys

sys.path.append('../')
from baseModels.SVM.model import SVM

In [2]:
# Directory holding the gzipped CSV feature files, one file per feature-extraction method.
feature_dir = '../featEngg/offline/pssmMethods/data/featvec/trainfiles/'
# os.scandir yields entries in arbitrary, filesystem-dependent order; sorting the
# names keeps the task list and the result tables in the same order on every run.
feature_files = sorted(f.name for f in os.scandir(feature_dir) if f.name.endswith('.csv.gz'))

In [3]:
# Header-less CSV that model_evaluate joins to every feature file on column 0.
label_file = '../data/LabelFiles/EnzymeLabelsMultiClass.csv'

In [4]:
def model_evaluate(filename,random_seed):
    """Evaluate the project SVM on one feature file for one random split.

    The feature file (read from ``feature_dir``) and the label file
    (``label_file``) are both loaded without headers and joined on column 0.
    In the merged frame, column 0 is kept as the sample identifier, the last
    column is used as the target, and everything in between as the features.

    Parameters
    ----------
    filename : str
        Name of a feature file inside ``feature_dir``.
    random_seed : int
        Passed as ``random_state`` to ``train_test_split`` (25% held out).

    Returns
    -------
    tuple
        ``(acc_train, acc_valid)`` read from the constructed ``SVM`` object.
    """
    features = pd.read_csv(feature_dir+filename,header=None)
    labels = pd.read_csv(label_file,header=None)
    merged = features.merge(labels,on=0)

    sample_ids = merged[0].values
    X = merged.iloc[:,1:-1].values
    y = merged.iloc[:,-1].values

    # pca_comp is capped at 55; narrower feature sets use 75% of their columns.
    # (presumably the number of PCA components -- confirm against the SVM class)
    n_features = X.shape[1]
    n_comp = int(0.75*n_features) if n_features<55 else 55

    # The identifiers are split alongside X and y but are not used afterwards.
    (X_train, X_valid,
     y_train, y_valid,
     _ids_train, _ids_valid) = train_test_split(
        X, y, sample_ids, test_size=0.25, random_state=random_seed)

    model = SVM(X_train,X_valid,y_train,y_valid,verbose=False,optimize=False,
                pca_comp=n_comp,kern='rbf',regC=30)
    return model.acc_train,model.acc_valid
 

In [5]:
# Show how many CPUs multiprocessing reports on this machine.
mp.cpu_count()

24

In [6]:
# Worker pool with one process per CPU reported by multiprocessing.
n_workers = mp.cpu_count()
pool = mp.Pool(processes=n_workers)

In [7]:
# Every (feature file, seed) pair: each file is evaluated with seeds 0..9999,
# files varying slowest so all seeds of one file are contiguous.
iterable = [(fname, seed) for fname in feature_files for seed in range(10000)]

In [8]:
%%time
model_accs = list(pool.starmap(model_evaluate,iterable))

CPU times: user 823 ms, sys: 136 ms, total: 958 ms
Wall time: 39min 8s


In [9]:
# Regroup the flat result list into {model name: {seed: (train acc, valid acc)}}.
# iterable and model_accs are aligned element-by-element.
mydict = {}

for m,acc in zip(iterable,model_accs):
    # The model name is the feature file name without its '.csv.gz' part.
    model_name,seed = m[0].replace('.csv.gz',''),m[1]
    mydict.setdefault(model_name,{})[seed] = acc

In [10]:
def get_model_stats(mname):
    """Summarise the per-seed accuracies stored in ``mydict[mname]``.

    Each value in ``mydict[mname]`` is indexed as a pair: element 0 feeds the
    ``train_*`` statistics and element 1 the ``test_*`` statistics.

    Parameters
    ----------
    mname : str
        Key into the module-level ``mydict``.

    Returns
    -------
    dict
        Min, mean, max and standard deviation (``np.std``) for both series,
        under the keys ``train_min`` ... ``test_std``.
    """
    runs = list(mydict[mname].values())
    train = [run[0] for run in runs]
    test = [run[1] for run in runs]

    stats = dict()
    stats['train_min'] = min(train)
    stats['train_mean'] = np.mean(train)
    stats['train_max'] = max(train)
    stats['train_std'] = np.std(train)
    stats['test_min'] = min(test)
    stats['test_mean'] = np.mean(test)
    stats['test_max'] = max(test)
    stats['test_std'] = np.std(test)
    return stats

In [11]:
# Spot-check the summary for a single model before aggregating all of them.
get_model_stats('rpssm')

{'train_min': 0.7906976744186046,
 'train_mean': 0.8806046511627909,
 'train_max': 0.9534883720930233,
 'train_std': 0.02382097007822772,
 'test_min': 0.27586206896551724,
 'test_mean': 0.6043137931034482,
 'test_max': 0.8620689655172413,
 'test_std': 0.0781228185616967}

In [12]:
# One summary dict per model, keyed by model name.
results_dict = {mname: get_model_stats(mname) for mname in mydict}
    

In [13]:
# Build with models as columns, then flip so each row is a model and each
# column a statistic.
results_df = pd.DataFrame(results_dict).transpose()

In [14]:
# Persist the summary table; index=True writes the row labels as the first column.
results_df.to_csv('../data/SimResults/pssmResults.csv',index=True)

In [15]:
# Display the models ordered by ascending mean training accuracy.
results_df.sort_values(by='train_mean')

Unnamed: 0,train_min,train_mean,train_max,train_std,test_min,test_mean,test_max,test_std
tpc,0.534884,0.735767,1.0,0.157587,0.275862,0.60181,0.931034,0.081838
edp,0.697674,0.784023,0.895349,0.027032,0.344828,0.629269,0.896552,0.076577
smoothed_pssm,0.790698,0.864966,0.965116,0.025264,0.344828,0.601103,0.896552,0.075538
aatp,0.755814,0.870126,1.0,0.047861,0.37931,0.695155,0.965517,0.07821
aac_pssm,0.802326,0.877252,0.965116,0.023327,0.37931,0.674748,0.965517,0.079295
rpssm,0.790698,0.880605,0.953488,0.023821,0.275862,0.604314,0.862069,0.078123
tri_gram_pssm,0.848837,0.905642,0.976744,0.018149,0.344828,0.642059,0.931034,0.078523
k_separated_bigrams_pssm,0.848837,0.910498,0.976744,0.017229,0.344828,0.649148,0.931034,0.078182
ab_pssm,0.848837,0.912169,0.988372,0.018123,0.344828,0.653583,0.931034,0.079058
aadp_pssm,0.860465,0.918052,0.976744,0.017358,0.310345,0.650966,0.931034,0.081237
