In [1]:
import numpy as np
def mean_predictions(probas):
    """
    Average probability values along axis 1.
    :param probas: 2-d array of probability values
    :return: the mean of every row of probas
    """
    row_means = np.mean(probas, axis=1)
    return row_means
def max_voting(preds):
    """
    Create max-voted (majority vote) predictions
    :param preds: 2-d array of prediction values, one row per sample
    :return: max voted predictions as a column vector of shape (n_rows, 1)
    :raises ValueError: if preds is not 2-dimensional
    """
    preds = np.asarray(preds)
    if preds.ndim != 2:
        raise ValueError(f"preds must be a 2-d array, got {preds.ndim} dimension(s)")

    voted = np.empty(preds.shape[0], dtype=preds.dtype)
    for row_idx, row in enumerate(preds):
        # count how often each distinct value occurs in this row
        values, counts = np.unique(row, return_counts=True)
        # np.unique returns sorted values and argmax keeps the first maximum,
        # so a tie between values resolves to the smallest tied value
        voted[row_idx] = values[np.argmax(counts)]

    # keep the column-vector shape the previous implementation returned
    return voted[:, None]

In [5]:
def rank_mean(probas):
    """
    Create mean predictions using ranks
    :param probas: 2-d array of probability values
    :return: mean ranks, one value per row of probas
    """
    # imported here because no visible cell of the notebook imports scipy.stats
    from scipy import stats

    probas = np.asarray(probas)
    # rank every column on its own
    ranked = [stats.rankdata(probas[:, i]) for i in range(probas.shape[1])]
    # stack once, after all columns are ranked; stacking inside the loop would
    # turn the list into an array and break the next append
    return np.mean(np.column_stack(ranked), axis=1)

In [6]:
import numpy as np
from functools import partial
from scipy.optimize import fmin
from sklearn import metrics

class OptimizeAUC:
    """
    Class for optimizing AUC.
    It searches for one weight per column of a 2d prediction array so that
    the AUC of the weighted row-wise sum is as high as possible.
    Only `_auc` knows about the metric, so replacing the score computed there
    is the small change needed to optimize ensemble weights for another metric.
    """

    def __init__(self):
        # None marks "not fitted yet"; fit() stores a 1d array of weights here
        self.coef_ = None

    def _auc(self, coef, X, y):
        """
        This function calculates and returns the negative AUC.
        :param coef: coef list, of the same length as number of models
        :param X: predictions, in this case a 2d array
        :param y: targets, in our case binary 1d array
        :return: -1.0 * AUC of the weighted predictions
        """
        # multiply coefficients with every column of the array
        # with predictions.
        # this means: element 1 of coef is multiplied by column 1
        # of the prediction array, element 2 of coef is multiplied
        # by column 2 of the prediction array and so on!
        x_coef = X * coef
        # create predictions by taking row wise sum
        predictions = np.sum(x_coef, axis=1)

        # calculate auc score
        auc_score = metrics.roc_auc_score(y, predictions)
        # return negative auc because fmin minimizes its objective
        return -1.0 * auc_score

    def fit(self, X, y):
        """
        Find the coefficients that minimize the negative AUC on (X, y).
        :param X: predictions, a 2d array with one column per model
        :param y: targets, in our case binary 1d array
        :return: self, so that calls can be chained
        """
        # bind the data so that fmin only has to vary the coefficients
        loss_partial = partial(self._auc, X=X, y=y)

        # dirichlet distribution. you can use any distribution you want
        # to initialize the coefficients.
        # the starting coefficients sum to 1; fmin itself is unconstrained.
        # draw a flat 1d start vector (one entry per column of X) instead of
        # a (1, n) array that fmin would have to flatten
        n_models = np.shape(X)[1]
        initial_coef = np.random.dirichlet(np.ones(n_models))
        # use scipy fmin to minimize the loss function, in our case auc
        self.coef_ = fmin(loss_partial, initial_coef, disp=True)
        return self

    def predict(self, X):
        """
        Combine the columns of X with the fitted coefficients.
        :param X: predictions, a 2d array with one column per model
        :return: row wise sum of the coefficient-weighted columns
        :raises RuntimeError: if called before fit
        """
        if self.coef_ is None:
            # without this guard an unfitted object would quietly produce
            # predictions from a placeholder coefficient
            raise RuntimeError(
                "OptimizeAUC is not fitted yet; call fit() before predict()."
            )
        # this is similar to _auc function
        x_coef = X * self.coef_
        predictions = np.sum(x_coef, axis=1)
        return predictions


In [10]:
from sklearn import linear_model
from sklearn import ensemble
import xgboost as xgb

def train_and_predict(classifier, train_data, test_data):
    """
    Fit a classifier on one dataset and predict probabilities for another.
    :param classifier: object exposing fit(X, y) and predict_proba(X)
    :param train_data: indexable whose item 0 is passed to fit as X and item 1 as y
    :param test_data: indexable whose item 0 is passed to predict_proba
    :return: column 1 of the predict_proba output
    """
    train_features = train_data[0]
    train_targets = train_data[1]
    classifier.fit(train_features, train_targets)

    class_probabilities = classifier.predict_proba(test_data[0])
    # keep only the second column of the probability matrix
    return class_probabilities[:, 1]

def evaluate_and_print(aucs, fold_name):
    """
    Print one line per AUC value, prefixed with the fold name.
    :param aucs: iterable of AUC values; models are numbered from 1 in iteration order
    :param fold_name: label printed at the start of every line
    :return: None
    """
    for model_number, score in enumerate(aucs, start=1):
        print(f"{fold_name}: Model {model_number} AUC = {score}")

# NOTE(review): xfold1, yfold1, xfold2 and yfold2 are not defined in any visible
# cell, and the recorded output of this cell is
# "NameError: name 'xfold1' is not defined". An earlier cell must create them
# before this cell is run.


def _fit_and_score_fold(train_data, test_data, fold_name):
    """
    Train the three base classifiers on one fold and score them on the other.
    :param train_data: (features, targets) pair used for fitting
    :param test_data: (features, targets) pair used for prediction and AUC
    :param fold_name: label printed in front of every AUC line
    :return: tuple (models, base_preds, avg_pred, stacked_preds, aucs);
        models and base_preds are ordered (logreg, rf, xgbc),
        avg_pred is the unweighted mean of the three base predictions,
        stacked_preds has the columns logreg, rf, xgbc, average and
        aucs holds one AUC per column of stacked_preds
    """
    # new default-configured models on every call, so the folds share nothing
    models = (
        linear_model.LogisticRegression(),
        ensemble.RandomForestClassifier(),
        xgb.XGBClassifier(),
    )
    # fitted in the order logreg, rf, xgbc
    base_preds = tuple(
        train_and_predict(model, train_data, test_data) for model in models
    )
    avg_pred = (base_preds[0] + base_preds[1] + base_preds[2]) / 3
    stacked_preds = np.column_stack(base_preds + (avg_pred,))

    aucs = [
        metrics.roc_auc_score(test_data[1], stacked_preds[:, i])
        for i in range(stacked_preds.shape[1])
    ]
    evaluate_and_print(aucs, fold_name)
    return models, base_preds, avg_pred, stacked_preds, aucs


def _optimize_and_score(fit_preds, fit_targets, eval_preds, eval_targets, eval_label):
    """
    Fit blending weights on one fold and report their AUC on the other fold.
    :param fit_preds: stacked predictions used to fit the weights
    :param fit_targets: targets matching fit_preds
    :param eval_preds: stacked predictions the fitted weights are applied to
    :param eval_targets: targets matching eval_preds
    :param eval_label: fold label used in the printed AUC line
    :return: tuple (optimizer, optimized predictions, optimized AUC)
    """
    optimizer = OptimizeAUC()
    # the last column is the plain average; only the base-model columns get weights
    optimizer.fit(fit_preds[:, :-1], fit_targets)
    optimized_preds = optimizer.predict(eval_preds[:, :-1])
    optimized_auc = metrics.roc_auc_score(eval_targets, optimized_preds)
    print(f"Optimized AUC, {eval_label} = {optimized_auc}")
    print(f"Coefficients = {optimizer.coef_}")
    return optimizer, optimized_preds, optimized_auc


# First Fold: train on fold 1, predict fold 2
(
    (logreg, rf, xgbc),
    (logreg_preds, rf_preds, xgbc_preds),
    avg_pred_fold2,
    fold2_preds,
    aucs_fold2,
) = _fit_and_score_fold((xfold1, yfold1), (xfold2, yfold2), "Fold-2")

# Second Fold: train on fold 2, predict fold 1
(
    (logreg, rf, xgbc),
    (logreg_preds, rf_preds, xgbc_preds),
    avg_pred_fold1,
    fold1_preds,
    aucs_fold1,
) = _fit_and_score_fold((xfold2, yfold2), (xfold1, yfold1), "Fold-1")

# Optimize Weights: fit on one fold's predictions, evaluate on the other's
opt, opt_preds_fold2, auc_fold2_optimized = _optimize_and_score(
    fold1_preds, yfold1, fold2_preds, yfold2, "Fold 2"
)

opt, opt_preds_fold1, auc_fold1_optimized = _optimize_and_score(
    fold2_preds, yfold2, fold1_preds, yfold1, "Fold 1"
)


NameError: name 'xfold1' is not defined