In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from mySVM.model import SVM
import itertools 
import multiprocessing as mp
import numpy as np

In [2]:
feature_dir = 'Data/FeatureVectors/iFeatureVectors/'
# Collect the feature-vector CSV file names in a deterministic (sorted) order.
# os.scandir yields entries in arbitrary order and also returns
# sub-directories and stray non-CSV entries (e.g. .ipynb_checkpoints), which
# would otherwise be handed to model_evaluate as if they were feature files.
# The context manager closes the directory handle as soon as the listing ends.
with os.scandir(feature_dir) as entries:
    feature_files = sorted(
        entry.name for entry in entries
        if entry.is_file() and entry.name.endswith('.csv')
    )

In [3]:
# Directory and file name of the label table. model_evaluate combines these
# two values into the path it reads, then joins the labels onto each feature
# file using column 0 as the key.
label_dir = 'Data/Labels/'
label_file = 'Labels_Classification.csv'

In [4]:
def model_evaluate(filename,random_seed):
    """Fit the project SVM on one feature file and return its `acc_test`.

    The feature file (read from `feature_dir`) and the label file (read from
    `label_dir`/`label_file`) are both header-less CSVs joined on column 0.
    After the join, column 0 is the key, the last column is used as the
    target and everything in between is used as the feature matrix.

    Parameters
    ----------
    filename : str
        Name of a feature CSV located inside `feature_dir`.
    random_seed : int
        Passed to `train_test_split` as `random_state`, so each seed gives a
        different 75/25 train/test partition.

    Returns
    -------
    The `acc_test` attribute of the constructed `SVM` object (presumably the
    accuracy on the held-out 25% -- confirm against mySVM.model).
    """
    # os.path.join keeps the paths valid whether or not the directory
    # constants end with a separator.
    features = pd.read_csv(os.path.join(feature_dir, filename), header=None)
    labels = pd.read_csv(os.path.join(label_dir, label_file), header=None)
    merged = features.merge(labels, on=0)

    X = merged.iloc[:, 1:-1].values
    y = merged.iloc[:, -1].values

    # Value forwarded as `pca_comp`: 75% of the feature count for narrow
    # feature sets (fewer than 40 columns), otherwise a fixed 40.
    n_features = X.shape[1]
    if n_features < 40:
        n_comp = int(0.75 * n_features)
    else:
        n_comp = 40

    # The key column is not split: its train/test halves were never used.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=random_seed)
    svm = SVM(X_train, X_test, y_train, y_test,
              verbose=False, optimize=False, pca_comp=n_comp)
    return svm.acc_test
 

In [5]:
# Worker pool sized to mp.cpu_count(); the starmap call that evaluates every
# (feature file, seed) pair runs on it.
# NOTE(review): no visible cell closes or joins this pool -- confirm it is
# released (pool.close(); pool.join()) once the results have been collected.
pool = mp.Pool(mp.cpu_count())

In [6]:
# Every (feature file, random seed) pairing: seeds 0..99 for each file, with
# the seed varying fastest. Each tuple is one argument pair for model_evaluate.
iterable = [(fname, seed) for fname in feature_files for seed in range(100)]

In [7]:
%%time
model_accs = list(pool.starmap(model_evaluate,iterable))

CPU times: user 32.3 ms, sys: 21.2 ms, total: 53.5 ms
Wall time: 46.8 s


In [8]:
# Nested mapping: model name -> {random seed -> value returned by model_evaluate}.
mydict = dict()

for (fname, seed), acc in zip(iterable, model_accs):
    # Strip only a trailing '.csv'. str.replace would also delete the
    # substring from the middle of a name such as 'a.csv_v2.csv'.
    model_name = fname[:-len('.csv')] if fname.endswith('.csv') else fname
    # setdefault creates the per-model dict on first sight of the name.
    mydict.setdefault(model_name, {})[seed] = acc

In [12]:
def get_model_stats(mname):
    """Summarise the per-seed scores stored in `mydict` for one model.

    Parameters
    ----------
    mname : str
        Key into `mydict`; a KeyError propagates if it is absent.

    Returns
    -------
    dict
        Keys 'min', 'mean', 'max' and 'std' (in that order). 'mean' and 'std'
        come from np.mean / np.std with their default settings, so 'std' is
        the population form (ddof=0).
    """
    scores = list(mydict[mname].values())
    stats = {}
    stats['min'] = min(scores)
    stats['mean'] = np.mean(scores)
    stats['max'] = max(scores)
    stats['std'] = np.std(scores)
    return stats

In [13]:
# Summary over all seeds for the 'Geary' entry of mydict (keys are feature
# file names with '.csv' removed).
get_model_stats('Geary')

{'min': 0.4827586206896552,
 'mean': 0.683448275862069,
 'max': 0.8620689655172413,
 'std': 0.07401834782340484}