# MNIST: MLP ensemble

## Required imports

In [1]:
from collections import Counter
import keras
from keras.datasets import mnist
from keras.layers import Dense, Dropout
from keras.models import load_model, Sequential
from keras.optimizers import SGD
from keras.utils import np_utils
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


## Data preparation

For convenience, we define a function that does the data preparation for this learning task.

In [2]:
def prepare_data():
    """Load MNIST and convert it to the layout used by the MLP.

    Every image is flattened into a single row, cast to float32 and
    divided by 255.  The integer labels are one-hot encoded into 10
    columns, matching the 10-unit softmax output layer of the model.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``
    """
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    # Take the number of samples from the arrays themselves rather than
    # hard-coding 60000/10000; -1 lets NumPy infer the flattened image size.
    x_train = x_train.reshape(len(x_train), -1)
    x_test = x_test.reshape(len(x_test), -1)
    x_train = x_train.astype(np.float32)/255.0
    x_test = x_test.astype(np.float32)/255.0
    # Fix the one-hot width explicitly so that the train and test encodings
    # always agree, instead of each being inferred from its own labels.
    y_train = np_utils.to_categorical(y_train, num_classes=10)
    y_test = np_utils.to_categorical(y_test, num_classes=10)
    return x_train, y_train, x_test, y_test

In [3]:
# The upper-case X_train/Y_train hold the complete training set; the training
# cells split them into lower-case x_train/x_val and y_train/y_val.
X_train, Y_train, x_test, y_test = prepare_data()

## Model definition

Since a model will have to be created for each ensemble member, we define a function that defines and compiles our model.

In [4]:
def create_model():
    """Return a freshly initialised, compiled MLP classifier.

    The network takes 784 input features, has two hidden ReLU layers of
    512 units that are each followed by 20 % dropout, and ends in a
    10-unit softmax layer.  It is compiled with categorical cross-entropy
    loss, a default SGD optimizer, and accuracy as the reported metric.
    """
    layers = [
        Dense(512, activation='relu', input_shape=(784,)),
        Dropout(0.2),
        Dense(512, activation='relu'),
        Dropout(0.2),
        Dense(10, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer=SGD(),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

## Training

For reproducibility, we seed NumPy's global random number generator, which `train_test_split` uses to shuffle and split the data.

In [5]:
# Seeds NumPy's global random number generator.
np.random.seed(45984)

We train three models, and store them in a list.  Note that each is trained on a different training set, since `train_test_split` splits input/output into random subsets.

In [6]:
nr_models = 3
models = []
for i in range(nr_models):
    print(f'model {i + 1}')
    # Keep this order: the model is created before the data is split.
    # train_test_split draws from the seeded NumPy generator, and model
    # creation may do so as well, so swapping the two could change the results.
    model = create_model()
    # No random_state is passed, so every ensemble member gets its own
    # random train/validation split of the full training set.
    x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train)
    model.fit(
        x_train, y_train,
        validation_data=(x_val, y_val),
        batch_size=64, epochs=10, verbose=0,
    )
    models.append(model)

model 1
model 2
model 3


## Evaluation

We compute and show the accuracies of the individual models on the test data.

In [7]:
# evaluate() returns [loss, accuracy] for a model compiled with the single
# 'accuracy' metric; only the accuracy (index 1) is kept for each member.
accuracies = [
    member.evaluate(x_test, y_test, verbose=0)[1]
    for member in models
]
accuracies

[0.9514, 0.9506, 0.9488]

Next, we compute the prediction of each individual model on the test input.

In [8]:
# One row per ensemble member, one column per test sample.
y_infer = np.empty((len(models), len(y_test)), dtype=np.int64)
for i, model in enumerate(models):
    # predict() yields the softmax output for every sample; the index of the
    # largest value is the predicted class.  This is what the deprecated
    # Sequential.predict_classes() computed for a multi-class output, and it
    # keeps working on Keras releases where that method no longer exists.
    y_infer[i, :] = np.argmax(model.predict(x_test), axis=-1)

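We evaluate the ensemble by taking a majority vote over the predictions of the individual models. When there is a tie, the tied class that was predicted by the earliest model in the list wins.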

In [9]:
# Recover the integer class labels from the one-hot encoded test targets.
true_labels = np.argmax(y_test, axis=1)

mistakes = 0
# Each row of y_infer.T holds the votes of all models for one test sample.
for votes, truth in zip(y_infer.T, true_labels):
    # most_common(1) returns the class with the most votes; among tied
    # classes the one counted first (i.e. from the earliest model) wins.
    inferred = Counter(votes).most_common(1)[0][0]
    if inferred != truth:
        mistakes += 1
print(1.0 - mistakes/len(y_test))

0.9519


The accuracy is indeed (marginally) better than that of the most accurate individual model.

## However...

For the same investment of compute time, you might expect that training a single model for three times as many epochs will yield a better result.

In [15]:
# Reset NumPy's global generator to the same seed that was set before the
# ensemble was trained.
np.random.seed(45984)

In [16]:
# Single model given the ensemble's total epoch budget (3 models x 10 epochs).
# The model is created before the split, as in the ensemble training loop.
model = create_model()
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train)
model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=64, epochs=30, verbose=0,
);

In [17]:
# Returns [loss, accuracy] on the test data: the loss followed by the
# 'accuracy' metric configured in create_model().
model.evaluate(x_test, y_test, verbose=0)

[0.09286915022358298, 0.9707]

That intuition would be entirely justified.