# AutoPos

In [1]:
from AutoPos.AutoPosModel import AutoPosClass
from sklearn.feature_selection import chi2
from IndModels.ModelsEdited import NGModel,Model,GAACModel
from Ensemble.model import Ensemble

from sklearn.model_selection import train_test_split
import helper
import pandas as pd

import multiprocessing as mp
import numpy as np

In [2]:
# Load the aligned enzyme dataset (the CSV has no header row): column 0 is read
# as the enzyme name, column 1 as the model input and the last column as the target.
Aligned_Data_File = 'Data/TE_ML_Data/EnzymeDatasetAlignedBinary.csv'
df = pd.read_csv(Aligned_Data_File,header=None)
enz_names = df[0].values
X = df.iloc[:,1].values
y = df.iloc[:,-1].values
# Hold out 25% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test,enz_train,enz_test = train_test_split(X, y,enz_names, test_size=0.25, random_state=7)
# chi2 is handed to AutoPosClass together with n_positions
# (presumably the scoring function and the number of positions to keep -- confirm in AutoPosModel).
sc_func = chi2
n_positions=50
# NOTE(review): imp=True here, while model_evaluate passes imp=False -- confirm which is intended.
apmodel = AutoPosClass(X_train,X_test,y_train,y_test,sc_func,None,n_positions,imp=True,verbose=False)



In [3]:
# Display the `acc_test` value stored on the AutoPos model's SVM object.
apmodel.SVMobject.acc_test

0.7931034482758621

# IndModels

In [4]:
# Load the second enzyme dataset, read the same way as the aligned one:
# column 0 -> enzyme name, column 1 -> model input, last column -> target.
enz_datafile = 'Data/TE_ML_Data/EnzymeDatasetBinary.csv'
df_k = pd.read_csv(enz_datafile,header=None)
enz_names_k = df_k[0].values
X_k = df_k.iloc[:,1].values
y_k = df_k.iloc[:,-1].values


# Same test_size and random_state as the aligned-data split in the AutoPos section.
X_train_k, X_test_k, y_train_k, y_test_k,enz_train_k,enz_test_k = train_test_split(X_k, y_k,enz_names_k, test_size=0.25, random_state=7)

# NOTE(review): `m` is not referenced again in the visible notebook -- confirm it is still needed.
m = Model(X_train_k,X_test_k,y_train_k,y_test_k)

ngmodel = NGModel(X_train_k,X_test_k,y_train_k,y_test_k)



gmodel = GAACModel(X_train_k,X_test_k,y_train_k,y_test_k)

# Display the `acc_test` values of the NG and GAAC models as a tuple.
ngmodel.SVMobject.acc_test,gmodel.SVMobject.acc_test

(0.8275862068965517, 0.8620689655172413)

# Ensemble

In [5]:
# The AutoPos predictions come from the aligned-data split, while the labels
# handed to Ensemble (y_test_k) come from the second dataset's split. Combining
# them is only meaningful when both splits hold out the same enzymes in the
# same order, so fail loudly here instead of scoring misaligned rows.
assert np.array_equal(enz_test, enz_test_k), "AutoPos and IndModels test splits are not aligned"
en = Ensemble([apmodel.SVMobject.ypredtest,ngmodel.SVMobject.ypredtest,gmodel.SVMobject.ypredtest],y_test_k)

In [6]:
# Display the `acc` value stored on the ensemble object.
en.acc

0.8275862068965517

# Multiprocessing

In [7]:
def model_evaluate(random_seed):
    """Fit the AutoPos, NG and GAAC models on one seeded split and score them.

    Both CSV files are re-read on every call and split with the same
    ``test_size`` and ``random_state``.

    Parameters
    ----------
    random_seed : int
        Passed as ``random_state`` to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(autopos_acc, ng_acc, gaac_acc, ensemble_acc, same_test_enzymes)``:
        the ``acc_test`` of each model's SVM object, the ensemble's ``acc``,
        and whether the held-out enzyme names of the two splits match
        element-wise.
    """
    def _load_and_split(csv_path):
        # Column 0 -> enzyme name, column 1 -> model input, last column -> target.
        frame = pd.read_csv(csv_path,header=None)
        names = frame[0].values
        inputs = frame.iloc[:,1].values
        targets = frame.iloc[:,-1].values
        return train_test_split(inputs, targets, names, test_size=0.25, random_state=random_seed)

    # AutoPos model on the aligned dataset.
    aligned_split = _load_and_split('Data/TE_ML_Data/EnzymeDatasetAlignedBinary.csv')
    X_train, X_test, y_train, y_test, _, enz_test = aligned_split
    apmodel = AutoPosClass(X_train,X_test,y_train,y_test,chi2,None,50,imp=False,verbose=False)

    # NG and GAAC models on the second dataset, split with the same seed.
    second_split = _load_and_split('Data/TE_ML_Data/EnzymeDatasetBinary.csv')
    X_train_k, X_test_k, y_train_k, y_test_k, _, enz_test_k = second_split
    ngmodel = NGModel(X_train_k,X_test_k,y_train_k,y_test_k)
    gmodel = GAACModel(X_train_k,X_test_k,y_train_k,y_test_k)

    # Combine the three models' test-set predictions, scored against y_test_k.
    test_predictions = [
        apmodel.SVMobject.ypredtest,
        ngmodel.SVMobject.ypredtest,
        gmodel.SVMobject.ypredtest,
    ]
    en = Ensemble(test_predictions,y_test_k)

    same_test_enzymes = all(enz_test_k == enz_test)
    return (
        apmodel.SVMobject.acc_test,
        ngmodel.SVMobject.acc_test,
        gmodel.SVMobject.acc_test,
        en.acc,
        same_test_enzymes,
    )


In [8]:
# Run the evaluation once, for seed 3, and display the returned tuple.
model_evaluate(3)

(0.8275862068965517,
 0.7586206896551724,
 0.7931034482758621,
 0.8275862068965517,
 True)

In [9]:
# Display the CPU count reported by multiprocessing; the same call sizes the
# worker pool created in the next cell.
mp.cpu_count()

24

In [10]:
# Create a worker pool with one process per reported CPU.
pool = mp.Pool(mp.cpu_count())

In [11]:
%%time
model_accs = list(pool.map(model_evaluate,range(10000)))

CPU times: user 314 ms, sys: 98 ms, total: 412 ms
Wall time: 3min 41s


In [12]:
def get_model_stats(accs):
    """Summarise per-run scores into min/mean/max/std for each model.

    Parameters
    ----------
    accs : sequence of sequences
        One entry per run; positions 0-3 of each entry hold the scores
        reported under the names 'AutoPos', 'NG', 'GAANG' and 'Ensemble'.
        Any further positions are ignored.

    Returns
    -------
    dict
        ``{model_name: {'min': ..., 'mean': ..., 'max': ..., 'std': ...}}``
        with mean and std computed by numpy.
    """
    labels = ['AutoPos','NG','GAANG','Ensemble']
    summary = {}
    for column, label in enumerate(labels):
        # Collect this model's score from every run.
        scores = [accs[run][column] for run in range(len(accs))]
        summary[label] = {
            'min': min(scores),
            'mean': np.mean(scores),
            'max': max(scores),
            'std': np.std(scores),
        }
    return summary

In [13]:
# Reduce the per-seed result tuples to per-model min/mean/max/std.
results_dict = get_model_stats(model_accs)

In [14]:
# Transpose so that each model is a row and each statistic a column.
results_df = pd.DataFrame(results_dict).T

In [15]:
# Write the summary table to disk; index=True keeps the model names as the first CSV column.
results_df.to_csv('Data/SimResults/myFeatResults.csv',index=True)

In [16]:
# Display the per-model summary table.
results_df

Unnamed: 0,min,mean,max,std
AutoPos,0.517241,0.862638,1.0,0.056384
NG,0.586207,0.850521,1.0,0.064964
GAANG,0.586207,0.853772,1.0,0.062313
Ensemble,0.586207,0.868141,1.0,0.059425
