In [9]:
import numpy as np
from scipy import stats

In [2]:
def mean_predictions(probas): 
    """
    Create mean predictions
    :param probas: 2-d array of probability values :return: mean probability
    """
    return np.mean(probas, axis=1)

In [19]:
def max_voting(preds): 
    """
    Create mean predictions
    :param probas: 2-d array of prediction values
    :return: max voted predictions
    """
    idxs = np.argmax(preds, axis=1)
    return np.take_along_axis(preds, idxs[:, None], axis=1)

In [26]:
def rank_mean(probas): 
    """
    Create mean predictions using ranks
    :param probas: 2-d array of probability values :return: mean ranks
    """
    ranked = []
    for i in range(probas.shape[1]):
        rank_data = stats.rankdata(probas[:, i]) 
        ranked.append(rank_data)
    ranked = np.column_stack(ranked)
    return np.mean(ranked, axis=1)

In [27]:
probas = np.array([
    [0.1, 0.2],
    [0.4, 0.3],
    [0.35, 0.5]
])

In [28]:
rank_mean(probas)

array([1. , 2.5, 2.5])

In [29]:
probas[:, 0]

array([0.1 , 0.4 , 0.35])

In [30]:
probas[:, 1]

array([0.2, 0.3, 0.5])

In [57]:
import numpy as np
from functools import partial
from scipy.optimize import fmin
from sklearn import metrics


class OptimizeAUC:
    """
    Class for optimizing AUC.
    This class is all you need to find best weights for
    any model and for any metric and for any types of predictions.
    With very small changes, this class can be used for optimization of
    weights in ensemble models of _any_ type of predictions
    """
    def __init__(self):
        self.coef_ = 0

    def _auc(self, coef, X, y):
        """
        This functions calulates and returns AUC.
        :param coef: coef list, of the same length as number of models
        :param X: predictions, in this case a 2d array
        :param y: targets, in our case binary 1d array
        """
        # multiply coefficients with every column of the array with predictions.
        # this means: element 1 of coef is multiplied by column 1
        # of the prediction array, element 2 of coef is multiplied
        # by column 2 of the prediction array and so on!
        x_coef = X * coef

        # create predictions by taking row wise sum
        predictions = np.sum(x_coef, axis=1)

        # calculate auc score
        auc_score = metrics.roc_auc_score(y, predictions)

        # return negative auc
        return -1.0 * auc_score

    def fit(self, X, y):
        # remember partial from hyperparameter optimization chapter?
        loss_partial = partial(self._auc, X=X, y=y)

        # dirichlet distribution. you can use any distribution you want
        # to initialize the coefficients
        # we want the coefficients to sum to 1
        initial_coef = np.random.dirichlet(np.ones(X.shape[1]), size=1)

        # use scipy fmin to minimize the loss function, in our case auc
        self.coef_ = fmin(loss_partial, initial_coef, disp=True)

    def predict(self, X):
        # this is similar to _auc function
        x_coef = X * self.coef_
        predictions = np.sum(x_coef, axis=1)
        return predictions



In [58]:
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
import numpy as np


In [59]:
# make a binary classification dataset with 10k samples
# and 25 features
X, y = make_classification(n_samples=10000, n_features=25)

# split into two folds (for this example)
xfold1, xfold2, yfold1, yfold2 = model_selection.train_test_split(X,
                                                                  y, test_size=0.5, stratify=y
                                                                  )

In [60]:
xfold1

array([[ 1.17487004,  0.65548221, -0.47785476, ...,  0.90527309,
        -0.57431901, -0.36933863],
       [-0.4760209 , -1.60754802,  0.22496259, ...,  1.58323044,
        -1.10087941, -0.24167795],
       [ 0.29474326,  0.79031379,  3.39788006, ...,  0.89997396,
        -0.84451195,  0.77934592],
       ...,
       [ 0.55586449,  0.38940475, -1.79915524, ..., -1.18499049,
         1.1146075 , -1.03723971],
       [-1.37516793,  0.2134832 , -0.8950755 , ...,  0.26862395,
        -0.41450558,  0.91342018],
       [-0.95469001, -1.01887457,  0.90601411, ...,  0.5176188 ,
        -0.69597536,  1.32946223]])

In [61]:
# fit models on fold 1 and make predictions on fold 2
# we have 3 models:
# logistic regression, random forest and xgboost
logreg = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier()
xgbc = xgb.XGBClassifier()

# fit all models on fold 1 data
logreg.fit(xfold1, yfold1)
rf.fit(xfold1, yfold1)
xgbc.fit(xfold1, yfold1)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [62]:
# predict all models on fold 2
# take probability for class 1
pred_logreg = logreg.predict_proba(xfold2)[:, 1]
pred_rf = rf.predict_proba(xfold2)[:, 1]
pred_xgbc = xgbc.predict_proba(xfold2)[:, 1]

In [63]:
logreg.predict_proba(xfold2).shape

(5000, 2)

In [64]:
# create an average of all predictions
# that is the simplest ensemble
avg_pred = (pred_logreg + pred_rf + pred_xgbc) / 3

In [65]:
# a 2d array of all predictions
fold2_preds = np.column_stack((pred_logreg,
                                pred_rf,
                                pred_xgbc,
                                avg_pred
                                ))

In [66]:
fold2_preds.shape

(5000, 4)

In [67]:
fold2_preds

array([[0.98228548, 0.95      , 0.99819142, 0.97682563],
       [0.81878213, 0.96      , 0.99504292, 0.92460835],
       [0.2240907 , 0.23      , 0.08329701, 0.17912924],
       ...,
       [0.99936661, 0.96      , 0.99968922, 0.98635195],
       [0.77698095, 0.9       , 0.99749315, 0.89149137],
       [0.00384836, 0.08      , 0.00482325, 0.0295572 ]])

In [68]:
# calculate and store individual AUC values
aucs_fold2 = []
for i in range(fold2_preds.shape[1]):
    auc = metrics.roc_auc_score(yfold2, fold2_preds[:, i])
    aucs_fold2.append(auc)
print(f"Fold-2: LR AUC = {aucs_fold2[0]}")
print(f"Fold-2: RF AUC = {aucs_fold2[1]}")
print(f"Fold-2: XGB AUC = {aucs_fold2[2]}")
print(f"Fold-2: Average Pred AUC = {aucs_fold2[3]}")

Fold-2: LR AUC = 0.9670788799999999
Fold-2: RF AUC = 0.98100432
Fold-2: XGB AUC = 0.98042384
Fold-2: Average Pred AUC = 0.97925504


In [69]:
# now we repeat the same for the other fold
# this is not the ideal way, if you ever have to repeat code,
# create a function!
# fit models on fold 2 and make predictions on fold 1
logreg = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier()
xgbc = xgb.XGBClassifier()

logreg.fit(xfold2, yfold2)
rf.fit(xfold2, yfold2)
xgbc.fit(xfold2, yfold2)

pred_logreg = logreg.predict_proba(xfold1)[:, 1]
pred_rf = rf.predict_proba(xfold1)[:, 1]
pred_xgbc = xgbc.predict_proba(xfold1)[:, 1]
avg_pred = (pred_logreg + pred_rf + pred_xgbc) / 3

fold1_preds = np.column_stack(( pred_logreg,
                                pred_rf,
                                pred_xgbc,
                                avg_pred
                                ))

aucs_fold1 = []
for i in range(fold1_preds.shape[1]):
    auc = metrics.roc_auc_score(yfold1, fold1_preds[:, i])
    aucs_fold1.append(auc)

print(f"Fold-1: LR AUC = {aucs_fold1[0]}")
print(f"Fold-1: RF AUC = {aucs_fold1[1]}")
print(f"Fold-1: XGB AUC = {aucs_fold1[2]}")
print(f"Fold-1: Average prediction AUC = {aucs_fold1[3]}")

Fold-1: LR AUC = 0.9632784000000001
Fold-1: RF AUC = 0.97840488
Fold-1: XGB AUC = 0.9786933600000001
Fold-1: Average prediction AUC = 0.97658672


In [70]:
# find optimal weights using the optimizer
opt = OptimizeAUC()


In [71]:
fold1_preds

array([[2.89664496e-02, 1.20000000e-01, 3.57584562e-04, 4.97746780e-02],
       [1.79099851e-02, 6.00000000e-02, 1.65189465e-03, 2.65206266e-02],
       [9.69476399e-01, 9.90000000e-01, 9.99962807e-01, 9.86479735e-01],
       ...,
       [1.96954088e-02, 1.00000000e-02, 2.91284614e-05, 9.90817907e-03],
       [9.89492760e-01, 9.90000000e-01, 9.99889731e-01, 9.93127497e-01],
       [9.99114508e-01, 1.00000000e+00, 9.99945045e-01, 9.99686517e-01]])

In [76]:
fold1_preds[:, :-1].shape

(5000, 3)

In [72]:
fold1_preds[:, :-1]

array([[2.89664496e-02, 1.20000000e-01, 3.57584562e-04],
       [1.79099851e-02, 6.00000000e-02, 1.65189465e-03],
       [9.69476399e-01, 9.90000000e-01, 9.99962807e-01],
       ...,
       [1.96954088e-02, 1.00000000e-02, 2.91284614e-05],
       [9.89492760e-01, 9.90000000e-01, 9.99889731e-01],
       [9.99114508e-01, 1.00000000e+00, 9.99945045e-01]])

In [73]:
# dont forget to remove the average column
opt.fit(fold1_preds[:, :-1], yfold1)
opt_preds_fold2 = opt.predict(fold2_preds[:, :-1])
auc = metrics.roc_auc_score(yfold2, opt_preds_fold2)
print(f"Optimized AUC, Fold 2 = {auc}")
print(f"Coefficients = {opt.coef_}")

Optimization terminated successfully.
         Current function value: -0.978986
         Iterations: 55
         Function evaluations: 121
Optimized AUC, Fold 2 = 0.9809361600000001
Coefficients = [-0.08164794  0.72127158  0.68666043]


In [74]:
opt.fit(fold2_preds[:, :-1], yfold2)
opt_preds_fold1 = opt.predict(fold1_preds[:, :-1])
auc = metrics.roc_auc_score(yfold1, opt_preds_fold1)
print(f"Optimized AUC, Fold 1 = {auc}")
print(f"Coefficients = {opt.coef_}")

Optimization terminated successfully.
         Current function value: -0.981360
         Iterations: 41
         Function evaluations: 91
Optimized AUC, Fold 1 = 0.9789088
Coefficients = [-3.20702546e-04  9.86797083e-01  4.04664866e-01]


In [75]:
opt.coef_

array([-3.20702546e-04,  9.86797083e-01,  4.04664866e-01])

In [79]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# 加载数据集
data = load_iris()
X = data.data
y = (data.target == 2).astype(int)  # 将问题转化为二分类问题

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 训练基础模型
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = RandomForestClassifier()

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

# 生成基础模型的预测结果
preds1 = model1.predict_proba(X_train)[:, 1]
preds2 = model2.predict_proba(X_train)[:, 1]
preds3 = model3.predict_proba(X_train)[:, 1]

# 将基础模型的预测结果作为元模型的输入特征
stacked_features = np.column_stack((preds1, preds2, preds3))

# 训练元模型
meta_model = LogisticRegression()
meta_model.fit(stacked_features, y_train)

# 在测试集上进行预测
test_preds1 = model1.predict_proba(X_test)[:, 1]
test_preds2 = model2.predict_proba(X_test)[:, 1]
test_preds3 = model3.predict_proba(X_test)[:, 1]

# 将基础模型的预测结果作为元模型的输入特征
stacked_test_features = np.column_stack((test_preds1, test_preds2, test_preds3))

# 使用元模型进行最终预测
final_preds = meta_model.predict(stacked_test_features)

# 计算准确率
accuracy = accuracy_score(y_test, final_preds)
print("Stacking Model Accuracy:", accuracy)

Stacking Model Accuracy: 1.0


In [81]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# 加载数据集
data = load_iris()
X = data.data
y = (data.target == 2).astype(int)  # 将问题转化为二分类问题

# 定义基础模型
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = RandomForestClassifier()

# 定义元模型
meta_model = LogisticRegression()

# 使用K折交叉验证
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # 训练基础模型
    model1.fit(X_train, y_train)
    model2.fit(X_train, y_train)
    model3.fit(X_train, y_train)
    
    # 生成基础模型的预测结果
    preds1 = model1.predict_proba(X_train)[:, 1]
    preds2 = model2.predict_proba(X_train)[:, 1]
    preds3 = model3.predict_proba(X_train)[:, 1]
    
    # 将基础模型的预测结果作为元模型的输入特征
    stacked_features = np.column_stack((preds1, preds2, preds3))
    
    # 训练元模型
    meta_model.fit(stacked_features, y_train)
    
    # 在测试集上进行预测
    test_preds1 = model1.predict_proba(X_test)[:, 1]
    test_preds2 = model2.predict_proba(X_test)[:, 1]
    test_preds3 = model3.predict_proba(X_test)[:, 1]
    
    # 将基础模型的预测结果作为元模型的输入特征
    stacked_test_features = np.column_stack((test_preds1, test_preds2, test_preds3))
    
    # 使用元模型进行最终预测
    final_preds = meta_model.predict(stacked_test_features)
    
    # 计算准确率
    accuracy = accuracy_score(y_test, final_preds)
    accuracies.append(accuracy)

# 输出平均准确率
print("Stacking Model Average Accuracy:", np.mean(accuracies))

Stacking Model Average Accuracy: 0.9533333333333335


In [82]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# 加载数据集
data = load_iris()
X = data.data
y = data.target

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 定义基模型
base_models = [
    ('dt', DecisionTreeClassifier()),
    ('svc', SVC(probability=True))
]

# 定义元学习器
meta_model = LogisticRegression()

# 创建 Stacking 模型
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model)

# 训练 Stacking 模型
stacking_model.fit(X_train, y_train)

# 评估性能
accuracy = stacking_model.score(X_test, y_test)
print(f"Stacking Model Accuracy: {accuracy:.2f}")


Stacking Model Accuracy: 1.00


In [83]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# 加载数据集
data = load_iris()
X = data.data
y = (data.target == 2).astype(int)  # 将问题转化为二分类问题

# 定义基础模型
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = RandomForestClassifier()

# 定义元模型
meta_model = LogisticRegression()

# 使用K折交叉验证
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 存储每个模型的准确率
model1_accuracies = []
model2_accuracies = []
model3_accuracies = []
stacking_accuracies = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # 训练基础模型
    model1.fit(X_train, y_train)
    model2.fit(X_train, y_train)
    model3.fit(X_train, y_train)
    
    # 生成基础模型的预测结果
    preds1 = model1.predict_proba(X_train)[:, 1]
    preds2 = model2.predict_proba(X_train)[:, 1]
    preds3 = model3.predict_proba(X_train)[:, 1]
    
    # 将基础模型的预测结果作为元模型的输入特征
    stacked_features = np.column_stack((preds1, preds2, preds3))
    
    # 训练元模型
    meta_model.fit(stacked_features, y_train)
    
    # 在测试集上进行预测
    test_preds1 = model1.predict_proba(X_test)[:, 1]
    test_preds2 = model2.predict_proba(X_test)[:, 1]
    test_preds3 = model3.predict_proba(X_test)[:, 1]
    
    # 将基础模型的预测结果作为元模型的输入特征
    stacked_test_features = np.column_stack((test_preds1, test_preds2, test_preds3))
    
    # 使用元模型进行最终预测
    final_preds = meta_model.predict(stacked_test_features)
    
    # 计算每个基础模型的准确率
    model1_accuracy = accuracy_score(y_test, model1.predict(X_test))
    model2_accuracy = accuracy_score(y_test, model2.predict(X_test))
    model3_accuracy = accuracy_score(y_test, model3.predict(X_test))
    
    # 计算Stacking模型的准确率
    stacking_accuracy = accuracy_score(y_test, final_preds)
    
    # 存储准确率
    model1_accuracies.append(model1_accuracy)
    model2_accuracies.append(model2_accuracy)
    model3_accuracies.append(model3_accuracy)
    stacking_accuracies.append(stacking_accuracy)

# 输出每个模型的平均准确率
print("Logistic Regression Model Average Accuracy:", np.mean(model1_accuracies))
print("Decision Tree Model Average Accuracy:", np.mean(model2_accuracies))
print("Random Forest Model Average Accuracy:", np.mean(model3_accuracies))
print("Stacking Model Average Accuracy:", np.mean(stacking_accuracies))

Logistic Regression Model Average Accuracy: 0.9733333333333334
Decision Tree Model Average Accuracy: 0.9533333333333335
Random Forest Model Average Accuracy: 0.9600000000000002
Stacking Model Average Accuracy: 0.9533333333333335


In [91]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# 加载数据集
data = load_iris()
X = data.data
y = (data.target == 2).astype(int)  # 将问题转化为二分类问题

# 定义基础模型
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = RandomForestClassifier()
model4 = SVC(probability=True)  # 新增一个支持向量机模型

# 定义元模型
meta_model = GradientBoostingClassifier()  # 使用梯度提升决策树作为元模型

# 使用K折交叉验证
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# 存储每个模型的准确率
model1_accuracies = []
model2_accuracies = []
model3_accuracies = []
model4_accuracies = []
stacking_accuracies = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # 训练基础模型
    model1.fit(X_train, y_train)
    model2.fit(X_train, y_train)
    model3.fit(X_train, y_train)
    model4.fit(X_train, y_train)
    
    # 生成基础模型的预测结果
    preds1 = model1.predict_proba(X_train)[:, 1]
    preds2 = model2.predict_proba(X_train)[:, 1]
    preds3 = model3.predict_proba(X_train)[:, 1]
    preds4 = model4.predict_proba(X_train)[:, 1]
    
    # 将基础模型的预测结果作为元模型的输入特征
    stacked_features = np.column_stack((preds1, preds2, preds3, preds4))
    
    # 训练元模型
    meta_model.fit(stacked_features, y_train)
    
    # 在测试集上进行预测
    test_preds1 = model1.predict_proba(X_test)[:, 1]
    test_preds2 = model2.predict_proba(X_test)[:, 1]
    test_preds3 = model3.predict_proba(X_test)[:, 1]
    test_preds4 = model4.predict_proba(X_test)[:, 1]
    
    # 将基础模型的预测结果作为元模型的输入特征
    stacked_test_features = np.column_stack((test_preds1, test_preds2, test_preds3, test_preds4))
    
    # 使用元模型进行最终预测
        # 使用元模型进行最终预测
    final_preds = meta_model.predict(stacked_test_features)
    
    # 计算每个基础模型的准确率
    model1_accuracy = accuracy_score(y_test, model1.predict(X_test))
    model2_accuracy = accuracy_score(y_test, model2.predict(X_test))
    model3_accuracy = accuracy_score(y_test, model3.predict(X_test))
    model4_accuracy = accuracy_score(y_test, model4.predict(X_test))
    
    # 计算Stacking模型的准确率
    stacking_accuracy = accuracy_score(y_test, final_preds)
    
    # 存储准确率
    model1_accuracies.append(model1_accuracy)
    model2_accuracies.append(model2_accuracy)
    model3_accuracies.append(model3_accuracy)
    model4_accuracies.append(model4_accuracy)
    stacking_accuracies.append(stacking_accuracy)

# 输出每个模型的平均准确率
print("Logistic Regression Model Average Accuracy:", np.mean(model1_accuracies))
print("Decision Tree Model Average Accuracy:", np.mean(model2_accuracies))
print("Random Forest Model Average Accuracy:", np.mean(model3_accuracies))
print("SVM Model Average Accuracy:", np.mean(model4_accuracies))
print("Stacking Model Average Accuracy:", np.mean(stacking_accuracies))

Logistic Regression Model Average Accuracy: 0.9666666666666668
Decision Tree Model Average Accuracy: 0.9533333333333334
Random Forest Model Average Accuracy: 0.9600000000000002
SVM Model Average Accuracy: 0.9666666666666668
Stacking Model Average Accuracy: 0.9400000000000001
