In [1]:
import os
import sys
import numpy as np
import pandas as pd
import multiprocessing as mp
from itertools import starmap,groupby
from collections import Counter
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

sys.path.append('../')
from baseModels.SVM.model import SVM

In [2]:
class features:
    """Index of the feature-vector files found under ``../featEngg/``.

    After construction ``self.feature_dict`` is a nested mapping
    ``{algo_type: {feat_type: {feat_name: path}}}``.  The ``kmerMethods``
    feature type has no files on disk and is registered with the two
    placeholder entries ``'kmer'`` and ``'gaakmer'`` whose path is ``None``.
    """

    def __init__(self):
        feature_files_dir = '../featEngg/'
        self.feature_dict = {}
        self.get_feature_dict(feature_files_dir)

    def get_feature_dict(self,feature_files_dir):
        """Scan ``feature_files_dir`` and populate ``self.feature_dict``.

        Expected layout (``feature_files_dir`` must end with a path separator,
        because paths are built by plain string concatenation)::

            <feature_files_dir><algo_type>/<xxxMethods>/data/featvec/trainfiles/<feat_name>.csv.gz

        Only sub-entries whose name ends with ``'Methods'`` are considered.

        Raises:
            FileNotFoundError: if ``feature_files_dir`` or an expected
                ``trainfiles`` directory does not exist (raised by
                ``os.scandir``).
        """
        with os.scandir(feature_files_dir) as algo_entries:
            for algo_type in algo_entries:
                if not algo_type.is_dir():
                    continue
                algo_dir = feature_files_dir+algo_type.name+'/'
                with os.scandir(algo_dir) as type_entries:
                    for feat_type in type_entries:
                        if not feat_type.name.endswith('Methods'):
                            continue
                        if feat_type.name == 'kmerMethods':
                            # Merge into the existing per-algorithm dict instead of
                            # replacing it: directory order is arbitrary, so an
                            # assignment here would wipe every feature type that
                            # happened to be scanned before 'kmerMethods'.
                            self.feature_dict.setdefault(algo_type.name,{})[feat_type.name] = {'kmer':None,'gaakmer':None}
                            continue
                        train_dir = algo_dir+feat_type.name+'/data/featvec/trainfiles/'
                        with os.scandir(train_dir) as train_entries:
                            for file in train_entries:
                                feat_name = file.name.replace('.csv.gz','')
                                # Entries are created lazily so that an empty
                                # trainfiles directory registers nothing.
                                by_type = self.feature_dict.setdefault(algo_type.name,{})
                                by_type.setdefault(feat_type.name,{})[feat_name] = train_dir+file.name
        return

    def get_feat_iter(self,iters_to_repeat):
        """Build the argument tuples for repeated evaluation of offline features.

        Args:
            iters_to_repeat: number of repetitions per feature; the repetition
                index ``0 .. iters_to_repeat-1`` is the last tuple element.

        Returns:
            A list of ``(feat_name, feat_type, 'offline', iteration)`` tuples.
            All repetitions of one feature are contiguous.

        Raises:
            KeyError: if no ``'offline'`` algorithm type was indexed.
        """
        offline_feats = self.feature_dict['offline']
        return [(fn,ft,'offline',it)
                for ft,names in offline_feats.items()
                for fn in names
                for it in range(iters_to_repeat)]
        



class feature_hpopt(features):
    """Hyper-parameter optimisation of the SVM model for one feature file.

    Construction reads the sequence and label files, draws a 75/25
    train/validation split of the enzyme names, and fits an ``SVM`` (with
    ``optimize=True``) on the chosen feature file restricted to that split.
    """

    def __init__(self,feature_name='dp_pssm',feature_type='pssmMethods',algo_type='offline',random_seed=None,pca_components=45):
        """
        Args:
            feature_name: key of the feature inside ``feature_dict[algo_type][feature_type]``.
            feature_type: feature-type key (a ``...Methods`` directory name).
            algo_type: algorithm-type key of ``feature_dict``.
            random_seed: forwarded as ``random_state`` to ``train_test_split``.
            pca_components: default number of PCA components handed to the
                model; reduced in ``get_offline_model`` when the feature file
                has fewer columns than this.

        Raises:
            KeyError: if the requested feature is not in ``feature_dict``.
        """
        super().__init__()
        original_seq_file = '../data/SeqFile/EnzymeSequence.csv'
        label_file = '../data/LabelFiles/EnzymeLabelsMultiClass.csv'
        self.random_seed = random_seed
        self.default_pca_components = pca_components

        # original data based on which everything is obtained
        # Both files are header-less and joined on their first column.
        df1 = pd.read_csv(original_seq_file,header=None)
        df2 = pd.read_csv(label_file,header=None)
        self.train_df = df1.merge(df2,on=0)

        self.enz_names = self.train_df[0].values
        # Second column of the merged frame is used as X, last column as y
        # (presumably sequence and class label - confirm against the data files).
        self.X = self.train_df.iloc[:,1].values
        self.y = self.train_df.iloc[:,-1].values

        # training and validation data for general use
        self.X_train, self.X_valid, self.y_train, self.y_valid,self.enz_train,self.enz_valid = train_test_split(self.X, self.y,self.enz_names, test_size=0.25, random_state=self.random_seed)

        self.label_file = label_file
        self.seq_file = original_seq_file

        self.hpopt_feature_file = self.feature_dict[algo_type][feature_type][feature_name]
        self.model = self.get_offline_model(self.hpopt_feature_file)

    def get_offline_model(self,featfilename,testfeatfilename=None):
        """Build an ``SVM`` object from a feature file for the current split.

        Args:
            featfilename: header-less CSV whose first column is the enzyme
                name; it is joined with the label file on that column.
            testfeatfilename: optional header-less CSV of test features with
                the enzyme name in the first column.

        Returns:
            The constructed ``SVM`` object.  Without a test file the model is
            built with ``optimize=True`` and ``pca_comp=self.pca_components``.

        Side effects:
            Sets ``self.pca_components``.
        """
        df1 = pd.read_csv(featfilename,header=None)
        df2 = pd.read_csv(self.label_file,header=None)
        df_feat = df1.merge(df2,on=0).set_index(0)
        # Re-use the enzyme split drawn in __init__ so rows line up by name.
        df_feat_train = df_feat.loc[self.enz_train]
        df_feat_valid = df_feat.loc[self.enz_valid]
        # After the merge the label is the last column; everything before it is a feature.
        X_train_feat,y_train_feat = df_feat_train.iloc[:,0:-1].values,df_feat_train.iloc[:,-1].values
        X_valid_feat,y_valid_feat = df_feat_valid.iloc[:,0:-1].values,df_feat_valid.iloc[:,-1].values

        # Fewer feature columns than the requested component count:
        # fall back to 75% of the available columns.
        if X_train_feat.shape[1]<self.default_pca_components:
            self.pca_components = int(0.75*X_train_feat.shape[1])
        else:
            self.pca_components=self.default_pca_components

        if testfeatfilename is not None:
            df_feat_test = pd.read_csv(testfeatfilename,header=None).set_index(0)
            # `testenz_names` is never assigned by this class, so reading it
            # unconditionally raised AttributeError.  Honour it when a caller or
            # subclass has set it; otherwise use every row of the test file.
            test_names = getattr(self,'testenz_names',None)
            if test_names is None:
                X_test_feat = df_feat_test.values
            else:
                X_test_feat = df_feat_test.loc[test_names].values
            if X_train_feat.shape[1] != X_test_feat.shape[1]:
                # Diagnostic only: report the feature file whose train/test widths disagree.
                print(featfilename)
            obj = SVM(X_train_feat,X_valid_feat,y_train_feat,y_valid_feat,X_test_feat)
        else:
            obj = SVM(X_train_feat,X_valid_feat,y_train_feat,y_valid_feat,verbose=False,optimize=True,pca_comp=self.pca_components,multi_jobs=False)
        return obj

    def get_best_hps(self):
        """Return the values of ``self.model.grid.best_params_`` as a tuple.

        NOTE(review): the tuple order is the key order of ``best_params_``;
        callers that index it positionally should confirm that order against
        the parameter grid defined in the SVM model.
        """
        return tuple(self.model.grid.best_params_.values())


In [3]:
def feat_hp_func(feat_name,feat_type,algo_type,rs):
    """Run one hyper-parameter search and return its best parameters.

    Builds a ``feature_hpopt`` for the given feature with ``rs`` as the
    random seed and returns the tuple produced by its ``get_best_hps()``.
    """
    optimiser = feature_hpopt(
        feature_name=feat_name,
        feature_type=feat_type,
        algo_type=algo_type,
        random_seed=rs,
    )
    best_params = optimiser.get_best_hps()
    return best_params

def most_frequent(arr):
    """Return the element of ``arr`` that occurs most often.

    Ties are resolved by ``Counter.most_common`` (the element encountered
    first wins).  An empty ``arr`` raises ``IndexError``.
    """
    tallies = Counter(arr)
    winner, _occurrences = tallies.most_common(1)[0]
    return winner

In [4]:
def main(argument_iter):
    """Run the hyper-parameter searches in parallel and vote per feature.

    Args:
        argument_iter: iterable of ``(feat_name, feat_type, algo_type, seed)``
            tuples, one per call of ``feat_hp_func``.

    Returns:
        ``{feat_name: {'regC': ..., 'kernel': ..., 'pca_comp': ...}}`` where
        each value is the most frequent one over all runs of that feature.

    NOTE(review): positions 0/1/2 of each result tuple are taken to be
    regC/kernel/pca_comp; confirm this matches the order returned by
    ``feat_hp_func``.
    """
    # Materialise once: the arguments are consumed twice (dispatch + pairing
    # with results), which would silently yield nothing for a one-shot iterator.
    argument_list = list(argument_iter)

    # The context manager releases the worker processes even when a task
    # raises; starmap blocks until every result is available, so terminating
    # the pool on exit cannot cut work short.
    with mp.Pool(mp.cpu_count()) as pool:
        best_hps = pool.starmap(feat_hp_func,argument_list)

    # Collect results per feature name.  Unlike itertools.groupby this does
    # not require runs of the same feature to be contiguous in the input.
    hps_by_feat = {}
    for args,hps in zip(argument_list,best_hps):
        hps_by_feat.setdefault(args[0],[]).append(hps)

    best_hp_dict = {}
    for feat_name,best_hp_list in hps_by_feat.items():
        best_regC = most_frequent([x[0] for x in best_hp_list])
        best_kernel = most_frequent([x[1] for x in best_hp_list])
        best_ncomp = most_frequent([x[2] for x in best_hp_list])
        best_hp_dict[feat_name] = {'regC':best_regC,'kernel':best_kernel,'pca_comp':best_ncomp}
    return best_hp_dict

In [5]:
%%time
if __name__=='__main__':
    # Number of random train/validation splits evaluated per feature.
    n_repeats = 100
    out_file = '../data/SimResults/HyperParameterOptimization/IndHPOpt.csv'
    # Create the output directory before starting the long-running search so
    # that a missing directory cannot discard the results at the very end.
    os.makedirs(os.path.dirname(out_file),exist_ok=True)

    feat = features()
    args_list = feat.get_feat_iter(n_repeats)
    hps = main(args_list)

    # One CSV row per feature with its most frequently selected hyper-parameters.
    with open(out_file,'w') as f:
        f.write('feat_name,regC,kernel,pca_comp')
        f.write('\n')
        for key,best in hps.items():
            f.write(f"{key},{best['regC']},{best['kernel']},{best['pca_comp']}")
            f.write('\n')
            
            

CPU times: user 11.1 s, sys: 8.33 s, total: 19.4 s
Wall time: 4h 16min 40s
