# Introduction
I recently read an interesting [article](https://mlwave.com/kaggle-ensembling-guide/) on different ensemble methods commonly used in Kaggle competitions, and thought it would be interesting to apply them to a dataset on which I have previously achieved high accuracy.

Of course, the best way to achieve high accuracy on MNIST is probably to use a very deep CNN, but here I will try to create an ensemble of different neural networks, and train a stacked generalizer on them using blending.

In [1]:
import pandas as pd
import numpy as np

# Loading the dataset

In [2]:
# Load the labelled training data. The target is read from the 'label'
# column; every column after the first one is taken as a feature.
data_df = pd.read_csv('train.csv')
y = data_df['label'].values
X = data_df.iloc[:, 1:].values

In [3]:
X.shape

(42000, 784)

In [4]:
# Load the unlabelled test data; every column of the file is used as a feature.
test_df = pd.read_csv('test.csv')
X_test = test_df.values

## Pre-process

In [5]:
# Per-sample layout fed to the convolutional models: one channel of 28x28.
img_dimensions = (1, 28, 28)

# Turn every flat row into an image; -1 lets the sample count be inferred.
X = X.reshape((-1,) + img_dimensions)
X_test = X_test.reshape((-1,) + img_dimensions)

In [6]:
# Divide by 255 so the inputs become floats.
# NOTE(review): presumably the raw pixels are 8-bit (0..255), giving a [0, 1] range - confirm.
X, X_test = X / 255, X_test / 255

In [7]:
X.shape

(42000, 1, 28, 28)

## Shuffle and Fold

In [8]:
# Draw one random permutation and apply it to both arrays so that every
# sample stays paired with its label. No seed is set, so the order differs
# between runs.
idx = np.random.permutation(y.size)
X, y = X[idx], y[idx]

In [10]:
from sklearn.model_selection import StratifiedKFold

In [11]:
n_folds = 10
# random_state is deliberately not passed: it only has an effect together
# with shuffle=True, and recent scikit-learn releases raise a ValueError when
# it is set on a non-shuffling splitter. The samples were already permuted
# before this point, so the unshuffled folds are still randomly composed.
stratified_kfold = StratifiedKFold(n_splits=n_folds)

In [12]:
# Materialise the (train_indices, validation_indices) pairs once so the same
# folds can be iterated again for every model.
folds = list(stratified_kfold.split(X, y))

## Convert to tensor

In [13]:
import torch

In [14]:
# Training
X = torch.from_numpy(X).float()
y = torch.from_numpy(y).long()

# Test
X_test = torch.from_numpy(X_test).float()

# Models

In [15]:
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

## Multi Layer Perceptron

In [16]:
class MLP(nn.Module):
    """Fully connected classifier with two ELU-activated hidden layers.

    Expects inputs with 28*28 features per sample and returns 10 raw scores
    per sample; no softmax is applied in ``forward``.
    """

    def __init__(self, hidden_size):
        """Create the layers; ``hidden_size`` is the width of both hidden layers."""
        super().__init__()

        n_inputs, n_classes = 28 * 28, 10
        self.fc1 = nn.Linear(n_inputs, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, n_classes)

    def forward(self, x):
        """Return the output-layer scores for the batch ``x``."""
        hidden = F.elu(self.fc1(x))
        hidden = F.elu(self.fc2(hidden))
        return self.output(hidden)

## CNN1

In [17]:
class CNN1(nn.Module):
    """Small convolutional classifier: two conv layers, max-pooling and a dense head.

    For single-channel 28x28 inputs the feature map is 13x13 after ``conv1``,
    12x12 after ``conv2`` and 6x6 after pooling, which is where the
    ``64 * 6 * 6`` input size of ``fc1`` comes from. ``forward`` returns 10
    raw scores per sample; no softmax is applied.
    """

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=4, stride=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=2)

        # Regularisation; only active while the module is in training mode.
        self.dropout1 = nn.Dropout2d(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)

        # Dense classification head.
        self.fc1 = nn.Linear(64 * 6 * 6, 128)
        self.output = nn.Linear(128, 10)

    def forward(self, x):
        """Return the output-layer scores for the image batch ``x``."""
        features = F.relu(self.conv1(x))
        features = F.relu(self.conv2(features))
        features = self.dropout1(F.max_pool2d(features, 2))

        # Flatten to the width fc1 was built for (64 * 6 * 6).
        flat = features.view(-1, self.fc1.in_features)
        hidden = self.dropout2(F.relu(self.fc1(flat)))
        return self.output(hidden)
        
        

In [18]:
class CNN2(nn.Module):
    """Three-layer convolutional classifier with max- and average-pooling.

    For single-channel 28x28 inputs the spatial size evolves as
    28 -> 13 (conv1, kernel 4, stride 2) -> 12 (conv2, kernel 2)
    -> 6 (max-pool 2) -> 5 (conv3, kernel 2) -> 2 (avg-pool 2),
    so the dense head receives 64 * 2 * 2 features per sample.
    ``forward`` returns 10 raw scores per sample; no softmax is applied.
    """

    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(1, 32, 4, stride=2)
        self.conv2 = nn.Conv2d(32, 32, 2)
        self.conv3 = nn.Conv2d(32, 64, 2)

        # 64 channels of 2x2 remain after the second pooling step. The
        # previous 64 * 6 * 6 did not match the actual feature map, which made
        # the flattening either fail or merge several samples into one row.
        self.fc1 = nn.Linear(64 * 2 * 2, 128)
        self.output = nn.Linear(128, 10)

    def forward(self, x):
        """Return the output-layer scores for the image batch ``x``."""
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = F.avg_pool2d(F.relu(self.conv3(x)), 2)
        # Flatten per sample so the batch dimension can never be altered.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.output(x)
        return x
        
        

# Train

In [19]:
from tqdm import tqdm

# Function inspired by https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
def train_model(model, train_loader, optimizer, criterion, validation_loader = None, epochs = 2):
    """Train ``model`` and print the loss and accuracy of every phase of every epoch.

    Args:
        model: module called as ``model(inputs)``; its outputs are passed to
            ``criterion`` and arg-maxed over dimension 1 to get predictions.
        train_loader: iterable of ``(inputs, labels)`` batches used to update
            the weights.
        optimizer: optimizer that updates the parameters of ``model``.
        criterion: loss called as ``criterion(outputs, labels)``.
        validation_loader: optional iterable of ``(inputs, labels)`` batches
            that is evaluated after each training pass without updating the
            weights.
        epochs: number of passes over ``train_loader``.

    Returns:
        None. Progress is reported with ``print`` only.
    """
    phases = ['train']
    data_set_loaders = {'train' : train_loader, 'val' : validation_loader}
    # Compare with None explicitly: truth-testing a loader goes through len(),
    # which is not the question being asked here.
    if validation_loader is not None:
        phases.append('val')

    for epoch in range(epochs):

        print('Epoch {}/{}'.format(epoch + 1, epochs))
        print('-' * 10)

        for phase in phases:

            data_set_loader = data_set_loaders[phase]
            is_training = phase == 'train'

            # train(False) is the same as eval(): weights are only updated
            # from the training data.
            model.train(is_training)

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0

            for inputs, labels in data_set_loader:
                batch_size = inputs.size(0)

                if is_training:
                    optimizer.zero_grad()

                # Only track history during training
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    predictions = torch.argmax(outputs, dim=1)

                    # Only perform backpropagation during training
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # Save statistics. Weighting the loss by the batch size assumes
                # the criterion returns the batch mean - TODO confirm for
                # criteria other than the default CrossEntropyLoss.
                running_loss += loss.item() * batch_size
                running_corrects += (predictions == labels).sum().item()
                samples_seen += batch_size

            # Average over the samples that were actually processed, so the
            # numbers stay right when the loader yields fewer samples than the
            # dataset holds (e.g. drop_last=True), and guard against a loader
            # that yields nothing at all.
            if samples_seen:
                epoch_loss = running_loss / samples_seen
                epoch_acc = running_corrects / samples_seen
            else:
                epoch_loss = epoch_acc = float('nan')

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
                
            

In [47]:
def predict_proba(model, test_loader):
    """Run ``model`` over every batch of ``test_loader`` and stack the outputs.

    The model is switched to evaluation mode and gradients are disabled.
    Only the first element of each item yielded by the loader is fed to the
    model. Despite the name, the result is whatever ``model`` returns: no
    softmax is applied here.

    Returns:
        A single tensor with the per-batch outputs concatenated along
        dimension 0.
    """
    model.eval()

    with torch.no_grad():
        batch_outputs = [model(batch[0]) for batch in test_loader]

    return torch.cat(batch_outputs)

In [32]:
# Base models of the ensemble. The position in this list is the model's
# index along axis 1 of the blend arrays (0 = MLP, 1 = CNN1).
models = [MLP(100), CNN1()]

In [33]:
# Buffers for the base-model scores, laid out as (sample, model, class)
# with 10 class scores per model.
dataset_blend_train = np.zeros((len(X), len(models), 10))
dataset_blend_test = np.zeros((len(X_test), len(models), 10))

In [34]:
import torch.utils.data
# Two views of the same test data: flat 784-feature rows for the MLP and the
# unflattened image tensors for the convolutional model.
test_dataset_mlp = torch.utils.data.TensorDataset(X_test.reshape(-1, 28*28))
test_dataset_cnn = torch.utils.data.TensorDataset(X_test)


In [35]:
# Loss shared by all models; it is applied directly to the models' raw
# output-layer scores.
criterion = nn.CrossEntropyLoss()

In [36]:
def weight_reset(m):
    """Re-initialise ``m`` in place if it is a ``Conv2d`` or ``Linear`` layer.

    Intended for ``module.apply(weight_reset)``; any other module type is
    left untouched.
    """
    if not isinstance(m, (nn.Conv2d, nn.Linear)):
        return
    m.reset_parameters()

In [None]:
# Blending approach inspired by https://github.com/emanuele/kaggle_pbr/blob/master/blend.py
#
# For every model and every fold: re-initialise the model, train it on the
# fold's training part, store its scores for the held-out part in
# dataset_blend_train, and store its scores for the whole test set. The test
# scores are averaged over the folds into dataset_blend_test.

for j, model in enumerate(models):
    # The MLP consumes flat 784-feature rows; the other models take the
    # unflattened image tensors.
    needs_flat_input = isinstance(model, MLP)

    # The test data does not depend on the fold, so build its loader once.
    test_dataset = test_dataset_mlp if needs_flat_input else test_dataset_cnn
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128)

    dataset_blend_test_j = np.zeros((X_test.shape[0], len(folds), 10))
    for i, (train, val) in enumerate(folds):
        # Fresh weights and a fresh optimizer for every fold, so nothing
        # learned on an earlier fold carries over.
        model.apply(weight_reset)
        optimizer = optim.Adam(model.parameters())
        print("Fold", i)

        X_train, X_val = X[train], X[val]
        if needs_flat_input:
            X_train = X_train.reshape(-1, 28*28)
            X_val = X_val.reshape(-1, 28*28)

        train_dataset = torch.utils.data.TensorDataset(X_train, y[train])
        val_dataset = torch.utils.data.TensorDataset(X_val, y[val])

        # The loaders do not shuffle, so the validation predictions come back
        # in the same order as the indices in `val`.
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128)
        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=128)

        train_model(model, train_loader, optimizer, criterion, epochs = 4)
        dataset_blend_train[val, j] = predict_proba(model, val_loader).numpy()
        dataset_blend_test_j[:, i] = predict_proba(model, test_loader).numpy()
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

# Generalizer

In [52]:
from sklearn.linear_model import LogisticRegression

In [59]:
# Flatten (sample, model, class) into one row of 20 scores per sample.
blend_train_flat = dataset_blend_train.reshape(-1, 20)
blend_test_flat = dataset_blend_test.reshape(-1, 20)

# Second-level model fitted on the base models' scores.
generalizer = LogisticRegression()
generalizer.fit(blend_train_flat, y)

# y_train holds predictions for the same rows the generalizer was fitted on.
y_train = generalizer.predict(blend_train_flat)
y_test = generalizer.predict(blend_test_flat)



# Evaluate models

In [62]:
from sklearn.metrics import accuracy_score

In [63]:
accuracy_score(y_train, y)

0.9833571428571428

In [64]:
dataset_blend_train.shape

(42000, 2, 10)

In [67]:
# Predicted class per base model on the training blend: the arg-max over the
# class axis. Index 0 is the MLP and index 1 the CNN, following `models`.
mlp_preds = dataset_blend_train[:, 0].argmax(axis=1)
cnn_preds = dataset_blend_train[:, 1].argmax(axis=1)

In [68]:
accuracy_score(mlp_preds, y), accuracy_score(cnn_preds, y)

(0.9572857142857143, 0.9843571428571428)

In [71]:
# Predicted class per base model on the test blend (scores averaged over the
# folds): the arg-max over the class axis. Index 0 is the MLP, index 1 the CNN.
mlp_preds = dataset_blend_test[:, 0].argmax(axis=1)
cnn_preds = dataset_blend_test[:, 1].argmax(axis=1)

In [72]:
# Submission file with the CNN's test predictions. The 1-based ids are
# derived from the number of predictions instead of a hard-coded 28001, so
# ids and labels always have the same length and nothing is silently
# truncated.
submission_df = pd.DataFrame(
    {'ImageID': np.arange(1, len(cnn_preds) + 1), 'Label': cnn_preds},
    columns = ['ImageID', 'Label'],
)
submission_df.set_index('ImageID').to_csv('Submissions/submission_nb8_cnn.csv')

In [73]:
# Submission file with the generalizer's test predictions. The 1-based ids
# are derived from the number of predictions instead of a hard-coded 28001,
# so ids and labels always have the same length and nothing is silently
# truncated.
submission_df = pd.DataFrame(
    {'ImageID': np.arange(1, len(y_test) + 1), 'Label': y_test},
    columns = ['ImageID', 'Label'],
)
submission_df.set_index('ImageID').to_csv('Submissions/submission_nb8_generalizer.csv')

Test set accuracy is 0.98528 for the CNN, and 0.98342 for the generalizer. It's likely that the generalizer does not benefit from the predictions of the MLP, which only scored 95.7% accuracy on the training set. 

I should let both models train for more epochs, since 4 is quite few, and I should probably also add more models. An ensemble of just two models seems a little small.

# Summary
I implemented a stacked ensemble with blending of two models, a CNN and an MLP. I did not allow them to train for very long, which resulted in the MLP performing pretty poorly. As a result, the generalizer did not perform better than the CNN did on its own. I should re-run the experiment with more epochs, and should probably also include more models. However, if I am interested in experimenting with ensemble learning, it would probably be better to tackle a problem which is solved well by simpler models, so that I can avoid the long training times of the neural networks.