In [1]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Swap the stdlib entry point for the no-op above so that every later
# warnings.warn(...) call in this kernel is silenced.
warnings.warn = warn

In [2]:
import sys
import itertools
import numpy as np
import tqdm
import multiprocessing as mp
from itertools import groupby
from collections import Counter
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [3]:
# Add the scripts directory to the module search path (the relative path is
# resolved against the notebook's working directory) so the import below works.
sys.path.append('../scripts/')

from MultiClassClassification import TEClassification

# Best base learner decider

The first step is to decide which base learning algorithm is best suited to this problem. We train all the base learners in the ensemble with each of the three learning algorithms (SVM, NN, and GBC) and record their performance using the evaluation scheme discussed in the paper.

In [4]:
# Input CSV files: enzyme sequences and their multi-class labels
enz_file = '../data/seq/EnzymeSequence.csv'
label_file = '../data/label/EnzymeLabelsMultiClass.csv'

# Directories with the training feature vectors, one per feature family
# (iFeature, kernel, PSSM)
ifeatdatadir = '../featEngg/offline/ifeatMethods/data/featvec/trainfiles/'
kerneldatadir = '../featEngg/offline/kernelMethods/data/featvec/thymefiles1/train/'
pssmdatadir = '../featEngg/offline/pssmMethods/data/featvec/trainfiles/'

# All three directories together; passed to every TEClassification call below
trainfeatdirs = [ifeatdatadir,kerneldatadir,pssmdatadir]

In [5]:
def check_base_performance(rs,base_algo):
    """Build one un-optimized TEClassification model and report its scores.

    Parameters
    ----------
    rs : value forwarded as ``random_seed``.
    base_algo : value forwarded as ``model`` (the notebook passes
        'SVM', 'NN' or 'GBC').

    Returns
    -------
    tuple
        ``(precision, recall, accuracy)`` read from the model's
        ``precision``, ``recall`` and ``en.acc`` attributes.
    """
    classifier = TEClassification(
        enz_file,
        None,
        label_file,
        trainfeatdirs,
        None,
        random_seed=rs,
        model=base_algo,
        optimize=False,
    )
    scores = (classifier.precision, classifier.recall, classifier.en.acc)
    return scores

def check_performance(base_algo, n_runs=10000):
    """Average the ensemble scores of one base algorithm over many random seeds.

    Runs ``check_base_performance(seed, base_algo)`` for every seed in
    ``range(n_runs)`` on a process pool sized to the machine's CPU count.

    Parameters
    ----------
    base_algo : value forwarded to ``check_base_performance`` as the
        base learning algorithm (the notebook passes 'SVM', 'NN' or 'GBC').
    n_runs : int, default 10000
        Number of seeds (0 .. n_runs-1) to simulate.

    Returns
    -------
    tuple
        ``(mean_precision, mean_recall, mean_accuracy)``, each rounded to
        two decimals.
    """
    seed_algo_pairs = [(seed, base_algo) for seed in range(n_runs)]

    # The context manager shuts the workers down when the block exits;
    # without it every call would leave a full set of worker processes behind.
    # starmap blocks until all results are in, so nothing is lost on exit.
    with mp.Pool(mp.cpu_count()) as pool:
        metrics = pool.starmap(check_base_performance, seed_algo_pairs)

    precision = [m[0] for m in metrics]
    recall = [m[1] for m in metrics]
    accuracy = [m[2] for m in metrics]

    return round(np.mean(precision),2), round(np.mean(recall),2), round(np.mean(accuracy),2)

In [6]:
%%time
check_performance('SVM')

CPU times: user 433 ms, sys: 172 ms, total: 605 ms
Wall time: 1h 2min 12s


(0.9, 0.92, 0.83)

In [7]:
%%time
check_performance('NN')

CPU times: user 692 ms, sys: 249 ms, total: 940 ms
Wall time: 1h 32min 46s


(0.84, 0.94, 0.81)

In [8]:
%%time
check_performance('GBC')

CPU times: user 576 ms, sys: 223 ms, total: 799 ms
Wall time: 1h 22min 25s


(0.84, 0.93, 0.81)

# Individual Hyperparameter Optimization

After selecting the base learning algorithm, we optimize the hyperparameters of each base learner to get optimal results from the ensemble model.

The steps are as follows:

1. Select the base learner and simulate the model with the `optimize` flag activated.
2. Train the ensemble and record the best hyperparameters learnt for each base model.
3. Run this 1000 times and record the hyperparameters for each base learner throughout the simulations.
4. Store the most frequent hyperparameters across the 1000 simulations for each base learner.

In [6]:
def best_hps(rs):
    """Train an SVM-based model with ``optimize=True`` and return its best hyperparameters.

    ``rs`` is forwarded as ``random_seed``; the result is whatever the
    model's ``get_best_hps()`` returns.
    """
    tuned_model = TEClassification(
        enz_file,
        None,
        label_file,
        trainfeatdirs,
        None,
        model='SVM',
        random_seed=rs,
        optimize=True,
    )
    return tuned_model.get_best_hps()


def most_frequent(arr):
    """Return the value that occurs most often in ``arr``.

    Ties go to whichever value ``Counter.most_common`` ranks first.
    Raises ``IndexError`` when ``arr`` is empty.
    """
    top_entries = Counter(arr).most_common(1)
    # top_entries is [(value, occurrences)]; only the value is needed.
    return top_entries[0][0]


def store_best_hps(start=0, n_runs=1000):
    """Find the most frequent best hyperparameters per feature and save them to CSV.

    Calls ``best_hps`` for every seed in ``range(start, start + n_runs)`` on a
    process pool, pools the results, and for each feature name writes the most
    common value of each hyperparameter to
    ``../data/results/hpopt/IndHPOpt.csv``.

    Each ``best_hps`` result is indexed below as a sequence of
    ``(feat_name, (regC, kernel, pca_comp))`` records.

    Parameters
    ----------
    start : int, default 0
        First random seed.
    n_runs : int, default 1000
        Number of consecutive seeds to simulate.

    Returns
    -------
    None
    """
    # The context manager shuts the workers down on exit instead of leaking
    # them. The imap iterator is fully consumed inside the block, so all
    # results are collected first.
    with mp.Pool(mp.cpu_count()) as pool:
        per_run_hps = list(tqdm.tqdm(pool.imap(best_hps, range(start, start + n_runs)), total=n_runs))

    # chain.from_iterable flattens in linear time; sum(lists, []) copies the
    # growing accumulator on every addition.
    # groupby needs its input sorted by the grouping key.
    all_hps = sorted(itertools.chain.from_iterable(per_run_hps), key=lambda x: x[0])

    with open('../data/results/hpopt/IndHPOpt.csv','w') as f:
        f.write('feat_name,regC,kernel,pca_comp')
        f.write('\n')
        for feat_name, feat_info in groupby(all_hps, key=lambda x: x[0]):
            best_hp_list = [hp[1] for hp in feat_info]
            # Each hyperparameter is voted on independently of the others.
            best_regC = most_frequent([x[0] for x in best_hp_list])
            best_kernel = most_frequent([x[1] for x in best_hp_list])
            best_ncomp = most_frequent([x[2] for x in best_hp_list])
            f.write(f"{feat_name},{best_regC},{best_kernel},{best_ncomp}")
            f.write('\n')

In [10]:
%%time 
store_best_hps()

100%|██████████| 1000/1000 [6:02:14<00:00, 21.73s/it]  


CPU times: user 7.53 s, sys: 1.62 s, total: 9.15 s
Wall time: 6h 2min 15s


# Feature extraction technique performance

Comparing the performance of individual feature extraction techniques.

After getting the best set of hyperparameters, change the model so that it accepts the individual hyperparameter file.

In [7]:
indhpoptfile = '../data/results/hpopt/IndHPOpt.csv'


def get_precision(y,yhat,label=3):
    """Precision of predictions ``yhat`` against ``y``, restricted to one class.

    Only ``label`` (default 3) is scored, via ``labels=[label]`` with micro
    averaging; the result is rounded to two decimals.
    """
    score = precision_score(y, yhat, labels=[label], average='micro')
    return round(score, 2)

def get_recall(y,yhat,label=3):
    """Recall of predictions ``yhat`` against ``y``, restricted to one class.

    Only ``label`` (default 3) is scored, via ``labels=[label]`` with micro
    averaging; the result is rounded to two decimals.
    """
    score = recall_score(y, yhat, labels=[label], average='micro')
    return round(score, 2)


def get_accuracy(y,yhat):
    """Overall accuracy of predictions ``yhat`` against ``y``, rounded to two decimals."""
    score = accuracy_score(y, yhat)
    return round(score, 2)

def get_metrics(val_iter):
    """Return ``(precision, recall, accuracy)`` for one ``(y, yhat)`` pair.

    ``val_iter`` is star-unpacked into each metric helper, so it must be a
    re-iterable two-item sequence such as a tuple.
    """
    precision = get_precision(*val_iter)
    recall = get_recall(*val_iter)
    accuracy = get_accuracy(*val_iter)
    return precision, recall, accuracy


def get_validation_iter(obj):
    """Return the ``(yvalid, ypredvalid)`` attribute pair of ``obj`` as a tuple."""
    truth = obj.yvalid
    predicted = obj.ypredvalid
    return (truth, predicted)


def indfeat_performance(rs):
    """Score every base model of one SVM ensemble built with seed ``rs``.

    The model is built with the hyperparameter file ``indhpoptfile`` and
    ``optimize=False``. Each entry of ``te.objects`` is scored on its
    validation labels and predictions.

    Returns
    -------
    list
        ``(feature_name, (precision, recall, accuracy))`` pairs, pairing
        ``te.featnames`` with the per-object metrics in order.
    """
    te = TEClassification(
        enz_file,
        None,
        label_file,
        trainfeatdirs,
        None,
        hyperparamfile=indhpoptfile,
        model='SVM',
        random_seed=rs,
        optimize=False,
    )
    per_model_metrics = [get_metrics(get_validation_iter(base)) for base in te.objects]
    return list(zip(te.featnames, per_model_metrics))


def indfeat_measure(n_runs=10000):
    """Summarise per-feature validation metrics over many seeds and save them to CSV.

    Runs ``indfeat_performance`` for every seed in ``range(n_runs)`` on a
    process pool and groups the resulting ``(featname, (precision, recall,
    accuracy))`` records by feature name. For each feature it writes the
    min / max / mean / std of each metric (rounded to two decimals) to
    ``../data/results/indfeatreport.csv``.

    Parameters
    ----------
    n_runs : int, default 10000
        Number of seeds (0 .. n_runs-1) to simulate.

    Returns
    -------
    None
    """
    # The context manager shuts the workers down on exit instead of leaking
    # them. The imap iterator is fully consumed inside the block.
    with mp.Pool(mp.cpu_count()) as pool:
        per_run_metrics = list(tqdm.tqdm(pool.imap(indfeat_performance, range(n_runs)), total=n_runs))

    # chain.from_iterable flattens in linear time; sum(lists, []) copies the
    # growing accumulator on every addition.
    # groupby needs its input sorted by the grouping key.
    all_metrics = sorted(itertools.chain.from_iterable(per_run_metrics), key=lambda x: x[0])

    with open("../data/results/indfeatreport.csv",'w') as f:
        f.write('featname,min_precision,max_precision,mean_precision,std_precision,min_recall,max_recall,mean_recall,std_recall,min_accuracy,max_accuracy,mean_accuracy,std_accuracy')
        f.write('\n')
        for featname, featinfo in groupby(all_metrics, key=lambda x:x[0]):
            all_metric_list = [met[1] for met in featinfo]

            fields = [f'{featname}']
            # Metric positions: 0 = precision, 1 = recall, 2 = accuracy.
            # Column order matches the header written above.
            for metric_idx in range(3):
                values = [m[metric_idx] for m in all_metric_list]
                fields.extend(f'{round(stat(values), 2)}' for stat in (min, max, np.mean, np.std))

            f.write(','.join(fields))
            f.write('\n')
        

In [12]:
indfeat_measure()

100%|██████████| 10000/10000 [59:21<00:00,  2.81it/s] 


# Parametric sweep of ensemble model hyperparameter k

With the individualized hyperparameters, it's time to check the ensemble performance. We run a parametric sweep of the ensemble model parameter k, which denotes the number of base models to select in the ensemble, to get the best estimate of this parameter.

In [8]:
def ensemble_param(k, rs):
    """Build an SVM ensemble with ``n_models=k`` and seed ``rs``; return ``(y_valid, en.preds)``.

    The model reads hyperparameters from ``indhpoptfile`` and is built with
    ``optimize=False``.
    """
    ensemble = TEClassification(
        enz_file,
        None,
        label_file,
        trainfeatdirs,
        None,
        hyperparamfile=indhpoptfile,
        model='SVM',
        random_seed=rs,
        n_models=k,
        optimize=False,
    )
    return ensemble.y_valid, ensemble.en.preds

def ensemble_metrics(val_iter):
    """Thin wrapper: delegate to ``get_metrics`` and return its result unchanged."""
    metrics = get_metrics(val_iter)
    return metrics


def ensemble_param_sweep(ks, n_runs=10000):
    """Sweep the ensemble size ``k`` and save the mean metrics per ``k`` to CSV.

    Runs ``ensemble_param(k, seed)`` on a process pool for every combination
    of ``k`` in ``ks`` and seed in ``range(n_runs)``. Each result is scored
    with ``get_metrics``, and one row per ``k`` (mean precision, recall and
    accuracy, rounded to two decimals) is written to
    ``../data/results/en_param_sweep.csv``.

    Parameters
    ----------
    ks : iterable
        Candidate values for the number of base models.
    n_runs : int, default 10000
        Number of seeds (0 .. n_runs-1) simulated for each ``k``.

    Returns
    -------
    None
    """
    # product() varies the seed fastest, so all runs of one k are contiguous.
    # The groupby below relies on that ordering.
    func_iter = list(itertools.product(ks, range(n_runs)))

    # The context manager shuts the workers down on exit instead of leaking
    # them. starmap blocks until all results are in.
    with mp.Pool(mp.cpu_count()) as pool:
        all_preds = pool.starmap(ensemble_param, func_iter)

    all_metrics = [get_metrics(pred) for pred in all_preds]

    with open('../data/results/en_param_sweep.csv','w') as f:
        f.write('model_k,mean_precision,mean_recall,mean_accuracy')
        f.write('\n')
        for model_k, model_info in itertools.groupby(zip(func_iter, all_metrics), key=lambda x:x[0][0]):
            model_metrics = [m[1] for m in model_info]
            # Metric positions: 0 = precision, 1 = recall, 2 = accuracy.
            means = [round(np.mean([m[idx] for m in model_metrics]), 2) for idx in range(3)]

            f.write(f"{model_k},{','.join(map(str, means))}")
            f.write('\n')

In [9]:
%%time
ensemble_param_sweep([5,9,15,21,31])

CPU times: user 1min 24s, sys: 409 ms, total: 1min 24s
Wall time: 4h 54min 25s


# Ensemble model performance

With the best base learning algorithm, the best sets of base learner hyperparameters, and the 5 top-ranked feature extraction techniques, we check the ensemble model performance.

## Ensemble model parameter k=5 provides the best performance. Hence 5 base learners will be used in the ensemble.

In [10]:
pd.read_csv('../data/results/en_param_sweep.csv')

Unnamed: 0,model_k,mean_precision,mean_recall,mean_accuracy
0,5,0.89,0.91,0.83
1,9,0.89,0.9,0.82
2,15,0.89,0.89,0.81
3,21,0.89,0.89,0.81
4,31,0.88,0.89,0.8


In [13]:
def ensemble_preds(rs):
    """Build the 5-model SVM ensemble for seed ``rs``; return ``(y_valid, en.preds)``.

    The model reads hyperparameters from ``indhpoptfile`` and is built with
    ``use_feat=None`` and ``optimize=False``.
    """
    ensemble = TEClassification(
        enz_file,
        None,
        label_file,
        trainfeatdirs,
        None,
        use_feat=None,
        hyperparamfile=indhpoptfile,
        model='SVM',
        random_seed=rs,
        n_models=5,
        optimize=False,
    )
    return ensemble.y_valid, ensemble.en.preds


def ensemble_eval(n_runs=10000):
    """Summarise the ensemble's validation metrics over many seeds and save them to CSV.

    Runs ``ensemble_preds`` for every seed in ``range(n_runs)`` on a process
    pool and scores each result with ``get_metrics``. A single row with the
    min / max / mean / std of precision, recall and accuracy (rounded to two
    decimals) is written to ``../data/results/ensemble_results.csv``.

    Parameters
    ----------
    n_runs : int, default 10000
        Number of seeds (0 .. n_runs-1) to simulate.

    Returns
    -------
    None
    """
    # The context manager shuts the workers down on exit instead of leaking
    # them. The imap iterator is fully consumed inside the block.
    with mp.Pool(mp.cpu_count()) as pool:
        all_preds = list(tqdm.tqdm(pool.imap(ensemble_preds, range(n_runs)), total=n_runs))

    model_metrics = [get_metrics(pred) for pred in all_preds]

    fields = ['ensemble']
    # Metric positions: 0 = precision, 1 = recall, 2 = accuracy.
    # Column order matches the header written below.
    for metric_idx in range(3):
        values = [m[metric_idx] for m in model_metrics]
        fields.extend(f'{round(stat(values), 2)}' for stat in (min, max, np.mean, np.std))

    with open('../data/results/ensemble_results.csv','w') as f:
        f.write('ensemble,min_precision,max_precision,mean_precision,std_precision,min_recall,max_recall,mean_recall,std_recall,min_accuracy,max_accuracy,mean_accuracy,std_accuracy')
        f.write('\n')

        f.write(','.join(fields))
        f.write('\n')

In [14]:
ensemble_eval()

100%|██████████| 10000/10000 [58:51<00:00,  2.83it/s] 


## Model Evaluation all three categories 