In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from mySVM.model import SVM
import itertools 
import multiprocessing as mp
import numpy as np

In [2]:
# Directory containing one feature-vector CSV per feature set; the file stem
# (e.g. 'rpssm') is later used as the model name.
feature_dir = 'pssmMethods/data/featvec/'
# os.scandir yields entries in arbitrary order, so sort the names to make the
# job list (and the row order of the saved summary) reproducible across runs.
# is_file() skips any directory that merely happens to end in '.csv'.
feature_files = sorted(
    f.name for f in os.scandir(feature_dir)
    if f.is_file() and f.name.endswith('.csv')
)

In [3]:
# Location of the label CSV; model_evaluate reads label_dir + label_file and
# uses its first column as the join key against the feature tables.
label_dir = 'Data/Labels/'
label_file = 'Labels_Classification.csv'

In [4]:
# CSV whose columns 0 and 1 are loaded below as an identifier -> identifier lookup.
enz_map_file = 'Data/TE_ML_Data/EnzymeNameMap.csv'

In [5]:
# Load the header-less mapping file and build a lookup from the values in
# column 0 to the values in column 1. model_evaluate uses it to translate the
# identifiers in the feature files before joining them with the labels.
enz_map_df = pd.read_csv(enz_map_file, header=None)
enz_map_dict = {
    source_name: target_name
    for source_name, target_name in zip(enz_map_df[0].values, enz_map_df[1].values)
}

In [6]:
def model_evaluate(filename,random_seed):
    """Fit one SVM for a single feature file and a single train/test split.

    Parameters
    ----------
    filename : str
        Name of a header-less CSV inside ``feature_dir``. Its first column is
        translated through ``enz_map_dict`` and used as the join key against
        the label table.
    random_seed : int
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(svm.acc_train, svm.acc_test)`` as exposed by the ``SVM`` object.

    Raises
    ------
    KeyError
        If an identifier in the feature file has no entry in ``enz_map_dict``.
    """
    features = pd.read_csv(feature_dir + filename, header=None)
    labels = pd.read_csv(label_dir + label_file, header=None).set_index(0)

    # Translate the identifiers so they line up with the label index.
    features[0] = features[0].apply(lambda name: enz_map_dict[name])

    # Inner join on the identifier; reset_index puts it back as column 0.
    merged = (
        features.set_index(0)
        .merge(labels, left_index=True, right_index=True)
        .reset_index()
    )

    # Layout after the join: identifier | feature columns ... | last label column.
    enz_names = merged.iloc[:, 0].values
    X = merged.iloc[:, 1:-1].values
    y = merged.iloc[:, -1].values

    # Use 40 PCA components, or 75% of the feature count for narrow tables.
    n_features = X.shape[1]
    n_comp = 40 if n_features >= 40 else int(0.75 * n_features)

    # The identifiers are split alongside X and y but are not used afterwards.
    X_train, X_test, y_train, y_test, enz_train, enz_test = train_test_split(
        X, y, enz_names, test_size=0.25, random_state=random_seed
    )

    # NOTE(review): SVM presumably fits and scores on construction, since the
    # accuracies are read right after -- confirm in mySVM.model.
    svm = SVM(
        X_train, X_test, y_train, y_test,
        verbose=False, optimize=False, pca_comp=n_comp, kern='rbf', regC=30,
    )
    return svm.acc_train, svm.acc_test
 

In [7]:
# Show how many CPUs multiprocessing reports for this machine.
mp.cpu_count()

24

In [8]:
# Worker pool with one process per reported CPU; used by the starmap call below.
pool = mp.Pool(mp.cpu_count())

In [9]:
# Job list: every feature file paired with each seed 0..999, grouped by file
# (all seeds of the first file, then all seeds of the next, ...).
iterable = [(fname, seed) for fname in feature_files for seed in range(1000)]

In [10]:
%%time
model_accs = list(pool.starmap(model_evaluate,iterable))

CPU times: user 277 ms, sys: 87 ms, total: 364 ms
Wall time: 3min 39s


In [11]:
# Regroup the flat result list as {model name: {seed: (train_acc, test_acc)}}.
mydict = dict()

for (fname, seed), acc in zip(iterable, model_accs):
    # The model name is the feature file name without '.csv'.
    model_name = fname.replace('.csv','')
    mydict.setdefault(model_name, {})[seed] = acc

In [12]:
def get_model_stats(mname):
    """Summarise the per-seed accuracies recorded for one model.

    Parameters
    ----------
    mname : str
        Key into the module-level ``mydict``, whose value maps each seed to a
        ``(train_accuracy, test_accuracy)`` pair.

    Returns
    -------
    dict
        Min, mean, max and standard deviation (``np.std``) of the train and
        test accuracies, under the keys ``train_*`` and ``test_*``.

    Raises
    ------
    KeyError
        If ``mname`` is not present in ``mydict``.
    """
    train_accs = []
    test_accs = []
    # One pass over the per-seed pairs; the seeds themselves are not needed.
    for pair in mydict[mname].values():
        train_accs.append(pair[0])
        test_accs.append(pair[1])

    stats = {
        'train_min': min(train_accs),
        'train_mean': np.mean(train_accs),
        'train_max': max(train_accs),
        'train_std': np.std(train_accs),
        'test_min': min(test_accs),
        'test_mean': np.mean(test_accs),
        'test_max': max(test_accs),
        'test_std': np.std(test_accs),
    }
    return stats

In [13]:
# Spot-check the summary statistics for a single model.
get_model_stats('rpssm')

{'train_min': 0.7558139534883721,
 'train_mean': 0.8351860465116279,
 'train_max': 0.9069767441860465,
 'train_std': 0.02347490979621664,
 'test_min': 0.3793103448275862,
 'test_mean': 0.6091379310344828,
 'test_max': 0.8620689655172413,
 'test_std': 0.08210065311177074}

In [14]:
# Summary statistics for every model, keyed by model name.
results_dict = dict((mname, get_model_stats(mname)) for mname in mydict)
    

In [15]:
# The outer dict keys become columns, so transpose to get one row per model
# and one column per statistic.
results_df = pd.DataFrame(results_dict).T

In [16]:
# Persist the summary table; index=True writes the model names as the first column.
results_df.to_csv('Data/SimResults/KernelResults.csv',index=True)

In [17]:
# Display the models ordered by ascending mean training accuracy; results_df
# itself is left unchanged.
results_df.sort_values('train_mean')

Unnamed: 0,train_min,train_mean,train_max,train_std,test_min,test_mean,test_max,test_std
edp,0.651163,0.72414,0.802326,0.02713,0.37931,0.60669,0.862069,0.078762
tpc,0.55814,0.7355,1.0,0.155679,0.310345,0.593759,0.827586,0.085159
smoothed_pssm,0.732558,0.804849,0.872093,0.022655,0.310345,0.542552,0.793103,0.076028
aatp,0.72093,0.830547,0.965116,0.05213,0.413793,0.634276,0.862069,0.08213
rpssm,0.755814,0.835186,0.906977,0.023475,0.37931,0.609138,0.862069,0.082101
aac_pssm,0.790698,0.856523,0.918605,0.022215,0.310345,0.642517,0.931034,0.083433
medp,0.77907,0.858756,0.930233,0.022359,0.37931,0.59131,0.862069,0.080324
eedp,0.790698,0.86064,0.930233,0.022034,0.37931,0.591483,0.862069,0.079696
ab_pssm,0.77907,0.860791,0.94186,0.026082,0.37931,0.589897,0.793103,0.079202
pssm_cc,0.813953,0.874895,0.94186,0.0214,0.310345,0.634,0.896552,0.081914
