## Evaluate pixel scaling methods

This tutorial evaluates the effectiveness of the following three pixel scaling methods, each applied across the entire training dataset:

* Normalization

* Centering means

* Standardization

A simple __CNN__ is built to recognise images from the __MNIST__ dataset, and its accuracy is evaluated with each of the data scaling methods above.

## 1. Load required libs

In [1]:
import numpy as np
from keras.datasets import mnist
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import MaxPooling2D
from keras.layers import Conv2D
from keras.layers import Flatten

from matplotlib import pyplot as plt
from matplotlib import image
%matplotlib inline

Using TensorFlow backend.


## 2. Load dataset

In [2]:
def load_dataset():
    """Load MNIST and prepare it for a single-channel CNN.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. The image arrays gain a
        trailing channel axis of size 1 and the labels are one-hot encoded
        with ``to_categorical``.
    """
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    # reshape to have single channel; each split is reshaped from its OWN
    # dimensions so the test reshape does not depend on the training shape
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], X_train.shape[2], 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], X_test.shape[2], 1))
    # one-hot encode labels
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)
    return X_train, y_train, X_test, y_test

In [3]:
trainx, trainy, testx, testy = load_dataset()
# Sanity check: show the shape of the training images, then the training labels.
for split in (trainx, trainy):
    print(split.shape)

(60000, 28, 28, 1)
(60000, 10)


## 3. Define model

In [4]:
def create_model():
    """Build and compile the small CNN used for every scaling experiment.

    Two convolution + max-pooling stages feed a 64-unit dense layer and a
    10-way softmax output. The model is compiled with categorical
    cross-entropy loss, the Adam optimizer and accuracy as its metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        # feature extraction
        Conv2D(32, (3, 3), activation="relu", input_shape=(28, 28, 1)),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation="relu"),
        MaxPooling2D((2, 2)),
        # classification head
        Flatten(),
        Dense(64, activation="relu"),
        Dense(10, activation="softmax"),
    ]
    cnn = Sequential(layers)
    cnn.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=["acc"],
    )
    return cnn

## 4. Data preparation

In [5]:
# normalize images
def normalize_images(train, test):
    """Rescale pixel values by dividing them by 255.

    Args:
        train: Training images as a NumPy array.
        test: Test images as a NumPy array.

    Returns:
        Tuple ``(train_norm, test_norm)``: float32 copies of the inputs
        divided by 255.0.
    """
    # Cast first so the division happens in float32 rather than on integers.
    return (
        train.astype("float32") / 255.0,
        test.astype("float32") / 255.0,
    )

# center mean of images
def center_mean(train, test):
    """Shift both datasets by the global mean of the training pixels.

    Args:
        train: Training images as a NumPy array.
        test: Test images as a NumPy array.

    Returns:
        Tuple ``(train_cent, test_cent)``: float32 copies of the inputs with
        the training-set mean subtracted.
    """
    train_f = train.astype("float32")
    test_f = test.astype("float32")
    # The statistic comes from the training data only and is reused for test.
    train_mean = train_f.mean()
    return train_f - train_mean, test_f - train_mean

# standardize images
def standardize_images(train, test):
    """Standardize both datasets using the training mean and standard deviation.

    Args:
        train: Training images as a NumPy array.
        test: Test images as a NumPy array.

    Returns:
        Tuple ``(train_stan, test_stan)``: float32 copies of the inputs with
        the training-set mean subtracted and divided by the training-set
        standard deviation.
    """
    # convert from integers to floats
    train_f, test_f = (arr.astype("float32") for arr in (train, test))
    # statistics come from the training data only and are reused for test
    mu = train_f.mean()
    sigma = train_f.std()
    # center and scale both datasets
    return (train_f - mu) / sigma, (test_f - mu) / sigma

In [6]:
# Baseline statistics of the raw training pixels, for comparison below.
print(f"ORIG TRAIN MEAN: {trainx.mean()}")
print(f"ORIG TRAIN MAX: {trainx.max()} TRAIN MIN: {trainx.min()}")

# Normalization: report mean and range of the rescaled training images.
x1, _ = normalize_images(trainx, testx)
print("NORMALIZATION")
print(f"TRAIN MEAN: {x1.mean()}")
print(f"TRAIN MAX: {x1.max()} TRAIN MIN: {x1.min()}")

# Centering: report mean and range after subtracting the training mean.
x1, _ = center_mean(trainx, testx)
print("CENTERING")
print(f"TRAIN MEAN: {x1.mean()}")
print(f"TRAIN MAX: {x1.max()} TRAIN MIN: {x1.min()}")

# Standardization: report mean and standard deviation of the result.
z, _ = standardize_images(trainx, testx)
print("STANDARDIZATION")
print(f"TRAIN MEAN: {z.mean():.3f}")
print(f"TRAIN STD: {z.std():.3f}")

ORIG TRAIN MEAN: 33.318421449829934
ORIG TRAIN MAX: 255 TRAIN MIN: 0
NORMALIZATION
TRAIN MEAN: 0.13066062331199646
TRAIN MAX: 1.0 TRAIN MIN: 0.0
CENTERING
TRAIN MEAN: -1.9512917788233608e-05
TRAIN MAX: 221.68154907226562 TRAIN MIN: -33.31844711303711
STANDARDIZATION
TRAIN MEAN: -0.000
TRAIN STD: 1.000


## 5. Evaluation loop

In [7]:
def evaluation_loop(data_prep_func, n_repeats=10):
    """Train and score a fresh CNN ``n_repeats`` times with one scaling method.

    Args:
        data_prep_func: Callable taking ``(X_train, X_test)`` and returning
            the scaled ``(X_train, X_test)`` pair.
        n_repeats: Number of independent train/evaluate runs.

    Returns:
        List with the test-set accuracy of each run.
    """
    X_train, y_train, X_test, y_test = load_dataset()
    # The scaling does not depend on the repeat index, so prepare the data
    # once rather than recomputing it on every iteration.
    # NOTE(review): assumes data_prep_func is deterministic, as the scaling
    # functions in this notebook are.
    X_train_prep, X_test_prep = data_prep_func(X_train, X_test)

    scores = []
    for i in range(n_repeats):
        # Build a new model for every repeat. Reusing a single model would
        # keep training the same weights across repeats, so later scores
        # would reflect extra epochs instead of independent runs.
        model = create_model()
        model.fit(X_train_prep, y_train, epochs=5, batch_size=64, verbose=0)
        _, acc = model.evaluate(X_test_prep, y_test, verbose=0)
        scores.append(acc)
        print("> {:d} {:.3f}".format(i, acc))

    return scores

In [None]:
# Scaling methods to compare, in the order they appear on the box plot.
experiments = [
    ("Normalization", normalize_images),
    ("Center", center_mean),
    ("Standardize", standardize_images),
]

all_scores = []
for label, prep_func in experiments:
    scores = evaluation_loop(prep_func)
    # Report mean accuracy with its standard deviation in parentheses.
    print("{}: {:.3f} ({:.3f})".format(label, np.mean(scores), np.std(scores)))
    all_scores.append(scores)

# Plot the collected scores; the list is named `all_scores` (passing the
# undefined name `allscores` here raises a NameError).
plt.boxplot(all_scores, labels=["norm", "center", "std"])
plt.show()

> 0 0.990
