In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from mySVM.model import SVM
import itertools 
import multiprocessing as mp
import numpy as np

In [2]:
# Directory holding the kernel feature-vector tables (gzip-compressed CSV files).
feature_dir = 'KernelMethods/featvec/'
# os.scandir yields entries in arbitrary, filesystem-dependent order. Sorting
# makes the work list (and everything derived from it, such as the row order
# of the saved results table) reproducible across machines. The `with` block
# closes the directory iterator explicitly.
with os.scandir(feature_dir) as entries:
    feature_files = sorted(entry.name for entry in entries if entry.name.endswith('.csv.gz'))

In [3]:
# Location of the label table; model_evaluate reads label_dir + label_file
# with header=None and indexes it by its first column.
label_dir = 'Data/Labels/'
label_file = 'Labels_Classification.csv'

In [4]:
# Header-less CSV whose first two columns are loaded below into enz_map_dict
# (column 0 -> column 1).
enz_map_file = 'Data/TE_ML_Data/EnzymeNameMap.csv'

In [5]:
# Load the header-less name map and build a lookup from its first column
# to its second column (later duplicates of a key overwrite earlier ones).
enz_map_df = pd.read_csv(enz_map_file, header=None)
enz_map_dict = {
    source_name: target_name
    for source_name, target_name in zip(enz_map_df[0].values, enz_map_df[1].values)
}

In [6]:
def model_evaluate(filename,random_seed,test_size=0.25,max_pca_comp=40):
    """Fit one SVM on a single feature file for a single random train/test split.

    Parameters
    ----------
    filename : str
        Name of a feature CSV inside ``feature_dir``. Its ``'Unnamed: 0'``
        column holds identifiers that are translated through
        ``enz_map_dict`` before being matched against the label table.
    random_seed : int
        Forwarded to ``train_test_split`` as ``random_state``.
    test_size : float, optional
        Forwarded to ``train_test_split`` as ``test_size`` (default 0.25).
    max_pca_comp : int, optional
        Upper bound for the value passed to ``SVM`` as ``pca_comp``
        (default 40). When the feature table has fewer columns than this
        bound, 75% of the column count (truncated to an int) is used instead.

    Returns
    -------
    tuple
        ``(svm.acc_train, svm.acc_test)`` of the fitted model.

    Raises
    ------
    KeyError
        If an identifier in the feature file is missing from ``enz_map_dict``.
    """
    features = pd.read_csv(os.path.join(feature_dir, filename))
    labels = pd.read_csv(os.path.join(label_dir, label_file), header=None).set_index(0)

    # Translate identifiers with an explicit lookup (not Series.map(dict)) so
    # that an unknown identifier raises KeyError instead of silently becoming
    # NaN and being dropped by the inner join below.
    features['Unnamed: 0'] = features['Unnamed: 0'].map(lambda raw_id: enz_map_dict[raw_id])
    features = features.set_index('Unnamed: 0')

    # Inner join on the identifier index; the last column of the merged table
    # is the label, everything before it is a feature.
    merged = features.merge(labels, left_index=True, right_index=True)
    X = merged.iloc[:, :-1].values
    y = merged.iloc[:, -1].values

    n_features = X.shape[1]
    if n_features < max_pca_comp:
        n_comp = int(0.75 * n_features)
    else:
        n_comp = max_pca_comp

    # Only X and y are split: the identifiers are not used downstream, and the
    # row permutation depends only on the sample count and random_state, so
    # the resulting split is the same as when a third array is passed along.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_seed)
    svm = SVM(X_train, X_test, y_train, y_test, verbose=False, optimize=False, pca_comp=n_comp)
    return svm.acc_train, svm.acc_test
 

In [7]:
# Show the CPU count reported by multiprocessing; the same value sizes the
# worker pool created in the next cell.
mp.cpu_count()

8

In [8]:
# One worker process per CPU reported by multiprocessing.
n_workers = mp.cpu_count()
pool = mp.Pool(processes=n_workers)

In [9]:
# Every (feature file, seed) pair for seeds 0..99, files varying slowest;
# each tuple is one argument set for model_evaluate.
iterable = [(feature_file, seed) for feature_file in feature_files for seed in range(100)]

In [10]:
%%time
model_accs = list(pool.starmap(model_evaluate,iterable))

CPU times: user 37.6 ms, sys: 26 ms, total: 63.5 ms
Wall time: 52.4 s


In [11]:
# Regroup the flat result list into {model name: {seed: (train_acc, test_acc)}}.
# The model name is the feature file name with '.csv.gz' removed.
mydict = {}

for (feature_file, seed), acc in zip(iterable, model_accs):
    model_name = feature_file.replace('.csv.gz','')
    mydict.setdefault(model_name, {})[seed] = acc

In [12]:
def get_model_stats(mname):
    """Summarise the per-seed accuracies stored in ``mydict[mname]``.

    Each stored value is indexed as ``value[0]`` (train accuracy) and
    ``value[1]`` (test accuracy). Returns a dict with the min, mean, max and
    population standard deviation (``np.std``) of both, under the keys
    ``train_*`` and ``test_*``.
    """
    runs = mydict[mname].values()
    train_accs = [run[0] for run in runs]
    test_accs = [run[1] for run in runs]

    stats = {}
    stats['train_min'] = min(train_accs)
    stats['train_mean'] = np.mean(train_accs)
    stats['train_max'] = max(train_accs)
    stats['train_std'] = np.std(train_accs)
    stats['test_min'] = min(test_accs)
    stats['test_mean'] = np.mean(test_accs)
    stats['test_max'] = max(test_accs)
    stats['test_std'] = np.std(test_accs)
    return stats

In [13]:
# Spot-check the summary for one model before computing it for all of them.
get_model_stats('mismatchKernel')

{'train_min': 0.7241379310344828,
 'train_mean': 0.7767816091954022,
 'train_max': 0.8390804597701149,
 'train_std': 0.02565974498862939,
 'test_min': 0.5172413793103449,
 'test_mean': 0.6786206896551724,
 'test_max': 0.8620689655172413,
 'test_std': 0.07329188465797692}

In [14]:
# Summary statistics for every model recorded in mydict.
results_dict = {model: get_model_stats(model) for model in mydict}
    

In [15]:
# Build the frame with models as columns, then flip it so that each row is a
# model and each column is a statistic.
results_df = pd.DataFrame(results_dict).transpose()

In [16]:
# Persist the summary table; index=True keeps the model names as the first column.
results_df.to_csv('Data/SimResults/KernelResults.csv',index=True)

In [17]:
# Display the models ordered by mean training accuracy (ascending).
results_df.sort_values('train_mean')

Unnamed: 0,train_min,train_mean,train_max,train_std,test_min,test_mean,test_max,test_std
mismatchKernel,0.724138,0.776782,0.83908,0.02566,0.517241,0.678621,0.862069,0.073292
gappyKernel,0.724138,0.778391,0.83908,0.025757,0.482759,0.676552,0.862069,0.074403
spectrumKernel,0.747126,0.796437,0.873563,0.027696,0.482759,0.676897,0.827586,0.070442
