# CIFAR-10 Training

In [1]:
import datetime
import os
import pickle
import tarfile

import mlflow
import numpy as np
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.python.keras import backend as K

In [2]:
# Point MLflow at the tracking server named by the environment and select
# (or create) the experiment that the training run will be logged under.
print('Setting up MLflow experiment...')
experiment_name = 'cifar10-train'
mlflow_tracking_uri = os.getenv('MLFLOW_TRACKING_URI')
print(f'MLflow tracking uri: {mlflow_tracking_uri}')
mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_experiment(experiment_name)

print('Downloading Cifar10 dataset...')

# The archive is read from local disk; this step only unpacks it into ../data.
# The context manager closes the archive even when extraction raises.
# NOTE(review): extractall writes whatever member paths the archive contains;
# only run this against an archive from a trusted source.
with tarfile.open('../data/cifar-10-python.tar.gz') as tar:
    tar.extractall(path='../data')

  and should_run_async(code)


Setting up MLflow experiment...
MLflow tracking uri: http://mlflow:5000
Downloading Cifar10 dataset...


In [3]:
# reference: https://github.com/tensorflow/tensorflow/blob/9011878d87bdeff932e10e2b2d35570be5ef739e/tensorflow/python/keras/datasets/cifar.py#L26
def load_batch(fpath, label_key='labels'):
    """Read one pickled CIFAR batch file and return its images and labels.

    Arguments:
      fpath: path of the batch file to read.
      label_key: name of the entry that holds the labels in the
          unpickled dictionary.
    Returns:
      A tuple `(data, labels)`; `data` is reshaped to
      `(num_samples, 3, 32, 32)` and `labels` is returned as stored.
    """
    with open(fpath, 'rb') as batch_file:
        raw_batch = pickle.load(batch_file, encoding='bytes')
        # Keys are unpickled as bytes (encoding='bytes'); convert them to str
        # so entries can be looked up with ordinary string keys.
        batch = {key.decode('utf8'): value for key, value in raw_batch.items()}

    images, labels = batch['data'], batch[label_key]
    num_samples = images.shape[0]
    return images.reshape(num_samples, 3, 32, 32), labels


# reference: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/datasets/cifar10.py#L32
def load_data(dir_name='../data/cifar-10-batches-py'):
    """Load the extracted CIFAR-10 batches from disk into NumPy arrays.

    Reads `data_batch_1` .. `data_batch_5` for training and `test_batch`
    for testing, using `load_batch` for each file.

    Arguments:
      dir_name: directory that contains the extracted batch files.
          Defaults to the location the notebook extracts the archive to.
    Returns:
      A tuple `(x_train, y_train), (x_test, y_test)`. Images are `uint8`
      arrays, transposed to channels-last when the Keras backend is
      configured that way; labels are `uint8` column vectors of shape
      `(num_samples, 1)`.
    """
    num_train_batches = 5
    samples_per_batch = 10000
    num_train_samples = num_train_batches * samples_per_batch

    # Pre-allocate the training arrays and fill them one batch file at a time.
    x_train = np.empty((num_train_samples, 3, 32, 32), dtype='uint8')
    y_train = np.empty((num_train_samples,), dtype='uint8')

    for batch_index in range(1, num_train_batches + 1):
        fpath = os.path.join(dir_name, f'data_batch_{batch_index}')
        start = (batch_index - 1) * samples_per_batch
        stop = batch_index * samples_per_batch
        x_train[start:stop, :, :, :], y_train[start:stop] = load_batch(fpath)

    # The test split lives in a single file.
    x_test, y_test = load_batch(os.path.join(dir_name, 'test_batch'))

    # Labels become column vectors of shape (num_samples, 1).
    y_train = np.reshape(y_train, (len(y_train), 1))
    y_test = np.reshape(y_test, (len(y_test), 1))

    # Batches are stored channels-first; move channels to the last axis
    # when the backend expects that layout.
    if K.image_data_format() == 'channels_last':
        x_train = x_train.transpose(0, 2, 3, 1)
        x_test = x_test.transpose(0, 2, 3, 1)

    # Make the test arrays use the same dtypes as the training arrays.
    x_test = x_test.astype(x_train.dtype)
    y_test = y_test.astype(y_train.dtype)

    return (x_train, y_train), (x_test, y_test)

In [4]:
print(f'Loading train/test images...')
(train_images, train_labels), (test_images, test_labels) = load_data()
# Normalize pixel values to be between 0 and 1
train_images, test_images = train_images / 255.0, test_images / 255.0

print(f'Neural networking training for 1 epoch...')
# Let MLflow record parameters, metrics and the model from the Keras fit call.
mlflow.tensorflow.autolog()

with mlflow.start_run():
    # Small convolutional network: three conv layers with two max-pooling
    # steps, followed by a dense head that emits 10 raw logits.
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(10))

    # from_logits=True because the last Dense layer has no softmax.
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    # Give every run its own timestamped sub-directory. os.path.join supplies
    # the path separator that plain string concatenation dropped when the
    # variable has no trailing slash, and the fallback avoids a TypeError
    # (None + str) when TENSORBOARD_LOGS_DIR is not set. The fallback is the
    # directory the %tensorboard cell of this notebook reads from.
    logs_root = os.getenv('TENSORBOARD_LOGS_DIR', '/logs/tensorboard')
    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join(logs_root, run_stamp)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    history = model.fit(train_images,
                        train_labels,
                        epochs=1,
                        validation_data=(test_images, test_labels),
                        callbacks=[tensorboard_callback])

    # Final evaluation on the test split, printed for the notebook reader.
    test_loss, test_acc = model.evaluate(test_images,  test_labels, verbose=2)
    print(f'test_acc: {test_acc}')
    print(f'test_loss: {test_loss}')

Loading train/test images...
Neural networking training for 1 epoch...
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
313/313 - 3s - loss: 1.2179 - accuracy: 0.5615
test_acc: 0.5615000128746033
test_loss: 1.217909336090088


In [4]:
# Load (or reload) the TensorBoard notebook extension so that the
# %tensorboard magic is available in this kernel.
%reload_ext tensorboard

In [5]:
# Start TensorBoard inside the notebook, reading logs from /logs/tensorboard
# and serving on port 6006.
# NOTE(review): the training cell writes its logs under TENSORBOARD_LOGS_DIR;
# confirm that variable points at this directory.
%tensorboard --logdir /logs/tensorboard --port 6006