In [7]:
import numpy as np
import pandas as pd
import pickle as pkl

from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from NCPR_functions import load_NCPR
from NCPR_functions import train_test_split
from NCPR_functions import get_initial_accuracy

from tqdm import tqdm
from joblib import Parallel, delayed
from copy import deepcopy

In [3]:
def ensemble_predictions(xtest, model_list, weights_list):
    '''
    Weighted-majority vote over the predictions of several fitted models.

    Arguments
    xtest: the test set x values, one sample per entry
    model_list: a list of models that have already been fitted to the train set
    weights_list: the voting weight of each model in model_list, in the same order
        (ensemble() passes each model's accuracy score)

    Every model votes for the label it predicts, with its weight. Votes for the
    same label are added up and the label with the largest total wins. On a tie
    the label that received a vote first wins, i.e. the one predicted by the
    earliest model in model_list.

    Returns
    ensemble_predictions: np.array holding the label chosen for each sample of xtest
        (an empty array when xtest is empty)

    Raises
    ValueError: if xtest is not empty and model_list is empty
    '''
    xtest = np.asarray(xtest)
    n_samples = len(xtest)
    if n_samples == 0:
        return np.array([])
    if len(model_list) == 0:
        raise ValueError('model_list must contain at least one fitted model')

    # flatten every sample to one row, as X.reshape(1, -1) would do sample by sample
    samples = xtest.reshape(n_samples, -1)

    # one batched predict call per model rather than one call per (sample, model) pair
    model_predictions = [np.asarray(model.predict(samples)) for model in model_list]

    voted_labels = []
    for row in range(n_samples):
        # maps each predicted label to the summed weight of the models voting for it
        votes = {}
        for i, predictions in enumerate(model_predictions):
            label = predictions[row]
            votes[label] = votes.get(label, 0) + weights_list[i]

        # max() returns the first key with the largest total, so ties follow the
        # model order. Taking the maximum over the actual votes also means that a
        # label is still returned when no total is positive, instead of falling
        # back to a hard-coded 0 that may not be a real class label.
        voted_labels.append(max(votes, key=votes.get))

    return np.array(voted_labels)

In [5]:
def check_seq(seq1, seq2):
    '''
    Guards against leakage between two collections of samples.

    Arguments
    seq1: the reference samples (ensemble() passes the train set)
    seq2: the samples to check against seq1 (ensemble() passes the test set)

    Raises
    ValueError: if any entry of seq2 is identical to an entry of seq1

    Entries are compared as a whole. The plain `entry in seq1` test is not used
    because, for a NumPy array, it evaluates `(seq1 == entry).any()` and therefore
    fires as soon as a single feature value coincides anywhere in the array.
    '''
    def _as_key(entry):
        # hashable fingerprint of one entry: its shape plus its flattened values
        values = np.asarray(entry)
        return values.shape, tuple(values.ravel().tolist())

    # hashing seq1 once makes each lookup constant time
    known = {_as_key(entry) for entry in seq1}
    for entry in seq2:
        if _as_key(entry) in known:
            raise ValueError('Repeat between the train and test sequences')

In [4]:
def ensemble(xtrain, ytrain, xtest, ytest, models=None, model_names=None):
    '''
    Builds a weighted-vote ensemble and scores it on the test set.

    Arguments
    xtrain, ytrain: the train set
    xtest, ytest: the test set
    models: a list of model objects; each one is fitted to the train set by this
        function (in place), so anything the caller fitted before is replaced
    model_names: alternatively, a list of string model names; each name is handed
        to get_initial_accuracy, which returns a score and a model for it

    Exactly one of models and model_names must be given.

    Every model is weighted by its score and the weighted vote is computed by
    ensemble_predictions.

    Returns
    y_pred: the predicted y values for the test set as generated by the ensemble model
    accuracy: the accuracy score of the prediction

    Raises
    ValueError: if both or neither of models and model_names are given
    whatever check_seq raises when the train and test sets share a sample
    '''
    check_seq(xtrain, xtest)

    if models is None and model_names is not None:
        models = []
        weights = []
        # models and weights are filled together so that they stay in the same order
        for model_name in model_names:
            score, best_model = get_initial_accuracy(xtrain, ytrain, xtest, ytest, model_name, iterations=1)
            models.append(best_model)
            weights.append(score)
    elif models is not None and model_names is None:
        weights = []
        for model in models:
            model.fit(xtrain, ytrain)
            # NOTE(review): the weight is the accuracy on the very test set that is
            # scored below, so the reported ensemble accuracy is not independent of
            # the weights - consider a separate validation split.
            weights.append(accuracy_score(ytest, model.predict(xtest)))
    else:
        raise ValueError('Exactly one of models and model_names must be initialized')

    y_pred = ensemble_predictions(xtest, models, weights)
    accuracy = accuracy_score(ytest, y_pred)
    return y_pred, accuracy

from sklearn import datasets

# Smoke test: run the name-based ensemble on the small iris data set.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Repackage iris as a dict of feature rows keyed by row number plus a frame of labels.
# NOTE(review): presumably this mirrors what load_NCPR returns - confirm against NCPR_functions.
index = list(range(len(X)))
species = list(y)
dict_iris = dict(enumerate(X))

# the class label is stored under both 'species' and 'species index'
df_iris = pd.DataFrame(
    {
        'index': index,
        'species': species,
        'species index': species,
    }
)

xtrain, ytrain, xtest, ytest = train_test_split(dict_iris, df_iris, seed = 'random')

ensemble(
    xtrain,
    ytrain,
    xtest,
    ytest,
    model_names=['XGBoost', 'Linear SVC', 'Random Forest'],
)

def _load_pickle(path):
    '''Loads one pickled object and closes the file handle afterwards.'''
    # SECURITY: unpickling can execute arbitrary code - only load files that were
    # produced by a trusted run of this project.
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# Models saved by the hyper-parameter search.
# NOTE(review): the same names are assigned again where the estimators are built
# from explicit hyper-parameters further down - confirm which version is meant to be used.
linear_model = _load_pickle('hyperparam_results/linear_model.pkl')
xgb_model = _load_pickle('hyperparam_results/xgb_model.pkl')
tree_model = _load_pickle('hyperparam_results/tree_model.pkl')
forest_model = _load_pickle('hyperparam_results/forest_model.pkl')
rbf_model = _load_pickle('hyperparam_results/rbf_model.pkl')
sigmoid_model = _load_pickle('hyperparam_results/sig_model.pkl')

In [5]:
# Load the NCPR data from the embedding, annotation and FASTA files, then split it
# into train and test sets.
dict_data, df_p = load_NCPR(
    'NCPR_bert.npz',
    'uniprot-NCPR.tab',
    'uniprot-NCPR.fasta',
)
# NOTE(review): seed='random' presumably makes the split differ between runs - confirm in NCPR_functions.
(xtrain, ytrain, xtest, ytest) = train_test_split(dict_data, df_p, seed='random')

In [None]:
# Unfitted estimators with fixed hyper-parameters, one per ensemble member.
# No random_state is set on any of them.
linear_model = LinearSVC(
    C=1,
    loss='squared_hinge',
    penalty='l2',
)
xgb_model = XGBClassifier(
    eta=0.17,
    gamma=0.67,
    max_depth=14,
    min_child_weight=1,
    nthread=1,
)
tree_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    max_features=30,
    min_samples_leaf=4,
    splitter='best',
)
forest_model = RandomForestClassifier(
    criterion='entropy',
    max_depth=16,
    max_features='log2',
    n_estimators=120,
)
# three SVC variants that differ in kernel
rbf_model = SVC(kernel='rbf', C=100, gamma=0.01)
sigmoid_model = SVC(kernel='sigmoid', C=10, coef0=0, gamma='scale')
poly_model = SVC(kernel='poly', degree=4, C=10)

In [None]:
# Fit and score the seven-model ensemble 100 times on the same train/test split,
# using 20 joblib workers. Each entry of results is the (y_pred, accuracy) pair
# returned by ensemble().
ensemble_models = [
    linear_model,
    xgb_model,
    tree_model,
    forest_model,
    rbf_model,
    sigmoid_model,
    poly_model,
]
run_ensemble = delayed(ensemble)
results = Parallel(n_jobs=20)(
    run_ensemble(xtrain, ytrain, xtest, ytest, models=ensemble_models)
    for _ in tqdm(range(100))
)

# single-run equivalent, kept for reference:
#y_pred, accuracy = ensemble(xtrain, ytrain, xtest, ytest, models=[linear_model, xgb_model, tree_model, forest_model, rbf_model, sigmoid_model, poly_model])

In [6]:
# Build a deliberately contaminated evaluation set: 10 samples drawn from the train
# set followed by 10 samples drawn from the test set.
# np.random.choice samples with replacement by default, so an index may repeat.
train_index = np.random.choice(range(len(xtrain)), size=10)
test_index = np.random.choice(range(len(xtest)), size=10)

# feature rows are stacked along axis 0; labels are joined in the same order
new_xtest = np.concatenate((xtrain[train_index], xtest[test_index]), axis=0)
new_ytest = np.append(ytrain[train_index], ytest[test_index])

In [7]:
# Score the seven-model ensemble on the contaminated evaluation set.
# NOTE(review): ensemble() starts with check_seq(xtrain, xtest) and half of new_xtest
# was drawn from xtrain, so with check_seq as defined in this notebook this call is
# expected to raise rather than return - confirm how the recorded accuracy was obtained.
y_pred0, accuracy0 = ensemble(
    xtrain,
    ytrain,
    new_xtest,
    new_ytest,
    models=[
        linear_model,
        xgb_model,
        tree_model,
        forest_model,
        rbf_model,
        sigmoid_model,
        poly_model,
    ],
)

In [8]:
# Accuracy of the ensemble on new_xtest, the evaluation set that mixes sampled train rows with sampled test rows.
accuracy0

0.95

In [9]:
# Count the distinct 'index' values per species and rank the species, largest first.
groupby_species = (
    df_p.groupby(['species'])['index']
    .nunique()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)

# Take the 25 top-ranked species and draw 10 of them, without repeats, to leave out.
# The draw is unseeded, so the excluded species change from run to run.
biggest_species = groupby_species['species'].iloc[:25].values
to_exclude = np.random.choice(biggest_species, size=10, replace=False)

In [10]:
# Drop every row whose species was selected for exclusion ...
is_excluded = df_p['species'].isin(to_exclude)
reduced_df = df_p.loc[~is_excluded]

# ... and keep only the matching entries of dict_data, keyed by the 'index' column.
reduced_dict = {key: dict_data[key] for key in reduced_df['index']}

In [11]:
# Split the reduced data (train_test_split is called with its default seed here)
# and score the ensemble on it.
# NOTE(review): poly_model is not part of this model list although the other
# ensemble calls in this notebook include it - confirm that this is intended.
(xtrain1, ytrain1, xtest1, ytest1) = train_test_split(reduced_dict, reduced_df)
y_pred1, accuracy1 = ensemble(
    xtrain1,
    ytrain1,
    xtest1,
    ytest1,
    models=[
        linear_model,
        xgb_model,
        tree_model,
        forest_model,
        rbf_model,
        sigmoid_model,
    ],
)

In [12]:
# Accuracy of the six-model ensemble on the split built from the data without the excluded species.
accuracy1

0.9377593360995851