# Finding good ensembles to submit in competition
We tried different types of ensembles (e.g., means and meta learners). Our best submissions used a stacked mean approach and weights found via an optimization. These submissions scored around 91.3% on the public and also private leaderboard.

In [1]:
import pandas as pd
import numpy as np
from functools import partial
import os
import random
import joblib
import json
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import accuracy_score
from scipy.optimize import differential_evolution
import tensorflow as tf
from tensorflow import keras 
import gc
from functools import reduce
from itertools import combinations, chain
from tqdm import tqdm
from sklearn.model_selection import KFold
from itertools import chain
import warnings
warnings.filterwarnings("ignore")

# Loading Out-of-Fold Predictions for some of our tested models

In [2]:
oof_predictions_v3 = joblib.load("../input/cassava-leaf-disease-ensemble-tests/oof_v04 (1).pkl")

In [3]:
oof_predictions_v3.head(3)

Unnamed: 0,image_id,label,resnext,b5,mobilenet,vit2020,b4,vit2019,b3
0,1000015157.jpg,0,"[0.53539234, 0.08993052, 0.032574702, 0.010111...","[0.17068666, 0.28306848, 0.40483505, 0.0044726...","[0.7490078, 0.014262087, 0.005231139, 0.000635...","[0.42902824, 0.045752272, 0.31388378, 0.005485...","[0.29917482, 0.16379356, 0.2504335, 0.06966855...","[0.4443596, 0.21404053, 0.24348898, 0.01078206...","[0.12975824, 0.41058695, 0.1588993, 0.02674331..."
1,1000201771.jpg,3,"[3.361342e-07, 5.867391e-07, 5.65951e-05, 0.99...","[2.970924e-05, 0.0012548971, 1.7456967e-05, 0....","[0.00012562818, 0.00018958043, 0.0020241379, 0...","[1.8922072e-05, 7.141115e-05, 3.6473248e-05, 0...","[0.017034736, 0.033697814, 0.028847465, 0.8960...","[8.799362e-06, 4.8696966e-05, 1.5933587e-05, 0...","[0.020895468, 0.025896251, 0.037321843, 0.8942..."
2,100042118.jpg,1,"[0.005370396, 0.07950499, 0.017187783, 0.10000...","[0.0038973298, 0.12563738, 0.008966217, 0.0198...","[0.019499786, 0.06108744, 0.005322082, 0.21714...","[0.0003110762, 0.00180416, 0.02432872, 0.00174...","[0.054800056, 0.3077832, 0.08947, 0.08236225, ...","[0.00047566573, 0.0014882263, 0.007440664, 0.0...","[0.020274838, 0.11426823, 0.034628984, 0.07325..."


# Mean and Stacked Mean combinations
We used itertools.combinations to check all possible combinations for mean ensembles. In addition to the individual models, we also combined some models in advance. This was an easy way to try out the different combinations including diverse levels. In most cases, the calculated cross-validation scores were quite close to the public leaderboard results. Even after the final submission, the combinations proved to be very stable.

In [4]:
# Build some promising stacks for evaluation
oof_predictions_v3.loc[:,"vits"] = oof_predictions_v3.apply(lambda x: [np.mean(v) for v in zip(x["vit2019"],x["vit2020"])], axis=1)
oof_predictions_v3.loc[:,"vit_resnext"] = oof_predictions_v3.apply(lambda x: [np.mean(v) for v in zip(x["vit2020"],x["resnext"])], axis=1)

columns = oof_predictions_v3.columns[2:].tolist()

In [5]:
combined = []
for i in range(len(columns)):
    combined.append(list(combinations(columns, i+1)))

def evaluate_ensemble(df, columns):
    return df[[*columns]].apply(lambda x: np.argmax([np.sum(v) for v in zip(*[x[c] for c in columns])]), axis=1).values

results = dict()
with tqdm(total=len(list(chain(*combined)))) as process_bar:
    for c in list(chain(*combined)):
        process_bar.update(1)  
        results[c] = accuracy_score(oof_predictions_v3.label.values, evaluate_ensemble(oof_predictions_v3, c))

100%|██████████| 511/511 [19:03<00:00,  2.24s/it]


In [10]:
# Get top-50 combinations
{k: results[k] for k in sorted(results, key=results.get, reverse=True)[0:50]}

{('mobilenet', 'b4', 'vit_resnext'): 0.9103145300743095,
 ('mobilenet', 'b3', 'vit_resnext'): 0.9092863485535355,
 ('mobilenet', 'b4', 'b3', 'vit_resnext'): 0.9092863485535355,
 ('mobilenet', 'b4', 'b3'): 0.909239613029864,
 ('b5', 'mobilenet', 'b4'): 0.9091928775061925,
 ('mobilenet', 'vit_resnext'): 0.9090994064588493,
 ('b5', 'mobilenet', 'vit_resnext'): 0.9089591998878347,
 ('b5', 'mobilenet', 'b3'): 0.9087722577931485,
 ('b5', 'mobilenet', 'b4', 'b3'): 0.908632051222134,
 ('b5', 'mobilenet', 'b4', 'vit_resnext'): 0.9085385801747908,
 ('b5', 'mobilenet', 'b3', 'vit_resnext'): 0.9085385801747908,
 ('mobilenet', 'vits'): 0.9082581670327616,
 ('mobilenet', 'b4', 'vits'): 0.9081646959854185,
 ('resnext', 'b5', 'mobilenet', 'b4'): 0.9081646959854185,
 ('resnext', 'mobilenet', 'b4'): 0.9080712249380755,
 ('b5', 'mobilenet', 'b4', 'b3', 'vit_resnext'): 0.9080244894144038,
 ('resnext', 'mobilenet', 'b4', 'vits'): 0.9079777538907323,
 ('resnext', 'mobilenet', 'b4', 'b3'): 0.9079310183670608

# Differential Evolution
Another technique that we also used to obtain good and stable results in the rankings was a prior optimization of the weights of the softmax logits of each model. For optimization we used the [Scipy differential_evolution](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.differential_evolution.html) method .

In [None]:
considered_models = oof_predictions_v3[["image_id","label","b4","vit2020","resnext","mobilenet"]]

In [None]:
kfold = KFold(n_splits=4)

yhats = considered_models.iloc[:,2:].values
y = considered_models.label.values
n_models = yhats.shape[1]

accuracy = []
for fold, (train_idx, test_idx) in enumerate(kfold.split(yhats, y)):
    
    print(f"Iteration {fold+1}")
    
    weights = np.array([1.0/n_models for _ in range(n_models)])
    bounds = [(0.0, 1.0) for _ in range(n_models)]
    minimizeargs = (np.take(yhats, train_idx, axis=0), np.take(y, train_idx, axis=0))
    
    def calculate_accuracy(y_true, y_pred):
        return np.average(y_true == y_pred)

    def loss_func(weights, Yhat, Y):
        w = np.mean(weights * Yhat, axis=1)
        return 1 - calculate_accuracy(Y, list(map(lambda x: np.argmax(x), w)))

    sol = differential_evolution(loss_func, bounds, minimizeargs, maxiter=20, tol=1e-5, disp=True, seed=8)
    
    
    # Calculate oof accuracy of optimized weights
    oof_accuracy = calculate_accuracy(np.take(y, test_idx, axis=0),
                                      list(map(lambda x: np.argmax(x), np.mean(
                                          np.take(yhats, test_idx, axis=0) * sol.x, axis=1))))
    
    print(f"{oof_accuracy}")
    
    accuracy.append((sol.x, oof_accuracy))

In [None]:
#weights for ensembles of four different models
accuracy