<a href="https://colab.research.google.com/github/ebaranas/colab/blob/master/MNIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf

class Model(object):
    def __init__(self, model_name, params):
        self.NUM_CLASSES = params['NUM_CLASSES']
        SUPPORTED_MODELS= {"FullyConnected": (self.FullyConnected, [2]),
                    "LowResFrameClassifier": (self.LowResFrameClassifier, [4]),
                    "SimpleFullyConnected":(self.SimpleFullyConnected, [2])
        }
        self.model_function, self.required_dims = SUPPORTED_MODELS.get(model_name, 'KeyError')
        
    def get_model_function(self):
        return self.model_function
    
    def get_required_dims(self):
        # For now first element of list is default.
        return self.required_dims
    
    def train(self, next_batch):
        logit, label = self.model_function(next_batch[0]), next_batch[1]
        loss = self.get_loss(logit, label)
        optimizer = tf.train.AdamOptimizer().minimize(loss)
        accuracy = self.get_accuracy(logit, label)
        return [loss, optimizer, accuracy]
    
    def get_loss(self, logit, label):
        softmax = tf.nn.softmax_cross_entropy_with_logits_v2(labels=label, logits=logit)
        return tf.reduce_sum(softmax)

    def get_accuracy(self, logit, label):
        prediction = tf.argmax(logit, 1)
        equality = tf.equal(prediction, tf.argmax(label, 1))
        return tf.reduce_mean(tf.cast(equality, tf.float32))
        
    def FullyConnected(self, feature):
        #bn = tf.layers.batch_normalization(feature)
        fc1 = tf.layers.dense(feature, 50)
        fc2 = tf.layers.dense(fc1, 50)
        fc2 = tf.layers.dropout(fc2)
        flat = tf.layers.flatten(fc2) # added flatten layer 08/10/2018 to correct shape
        fc3 = tf.layers.dense(flat, self.NUM_CLASSES)
        return fc3
        
    def SimpleFullyConnected(self, feature):
        #bn = tf.layers.batch_normalization(feature)
        fc1 = tf.layers.dense(feature, 30)
        final = tf.layers.dense(fc1, self.NUM_CLASSES)
        return final
    
    def LowResFrameClassifier(self, feature):
        #bn = tf.layers.batch_normalization(feature)
        conv1 = tf.layers.conv2d(feature, 32, (3, 3), activation="relu")
        conv2 = tf.layers.conv2d(conv1, 32, (3, 3), activation="relu")
        maxpool2d_a = tf.layers.max_pooling2d(conv2, pool_size=(2, 2), strides=(1, 1))
        dropout_a = tf.layers.dropout(maxpool2d_a, rate=0.25)
        
        conv3 = tf.layers.conv2d(dropout_a, 64, (3, 3), activation="relu")
        conv4 = tf.layers.conv2d(conv3, 64, (3, 3), activation="relu")
        maxpool2d_b = tf.layers.max_pooling2d(conv3, pool_size=(2, 2), strides=(1, 1))
        dropout_b = tf.layers.dropout(maxpool2d_b, rate=0.25)
        
        flat = tf.layers.flatten(dropout_b)
        dense = tf.layers.dense(flat, 256, activation="relu")
        dropout_c = tf.layers.dropout(dense, rate=0.25)
        
        final = tf.layers.dense(dropout_c, self.NUM_CLASSES, activation="softmax")
        return final

In [0]:
import numpy as np
from multiprocessing import Pool
import tensorflow as tf
import main as m

# Pipeline strategy interface
class PipelineStrategy(object):
    ''' Simulated abstract base class for pipeline strategy '''

    def preprocess(self, features, labels, reshape):
        raise NotImplementedError

    def split_dataset(self, mode):
        raise NotImplementedError
        
    def get_next_batch(self, mode):
        raise NotImplementedError  
    
    def feed_dict(self, next_batch, mode):
        raise NotImplementedError
    
    def initializers(self, iterator):
        raise NotImplementedError

    def transform(dataset):
        raise NotImplementedError

    def augment(feature, label):
        raise NotImplementedError
        
    def shuffle(dataset):
        raise NotImplementedError

# Pipeline strategy 1
class FeedDictPipeline(PipelineStrategy):
    def __init__(self, features, labels, params):
        self.EPOCHS_COMPLETED = 0
        self.INDEX_IN_EPOCH = 0
        print(params['NUM_EXAMPLES'])
        self.features, self.labels = self.shuffle(features, labels, params['NUM_EXAMPLES'])
        self.params = params

    def get_next_batch(self, mode=None):
        '''Placeholders only'''
        replace_index = lambda tuple_: (None,) + tuple_[1:] # to turn NHWC to ?HWC
        return tuple((tf.placeholder(self.features.dtype, replace_index(self.features.shape)),
        tf.placeholder(self.labels.dtype, replace_index(self.labels.shape))
        ))
    
    def feed_dict(self, next_batch, mode):
        '''Returns a dictionary of next batch'''
        features, labels, buff = self.split_dataset(mode)
        start = self.INDEX_IN_EPOCH
        self.INDEX_IN_EPOCH += self.params['BATCH_SIZE']
        if self.INDEX_IN_EPOCH > self.params['TRAIN_SIZE']: # fix to include validation
            self.EPOCHS_COMPLETED += 1
            self.shuffle(features, labels, buff)
            start = 0
            self.INDEX_IN_EPOCH = self.params['BATCH_SIZE']
            assert self.params['BATCH_SIZE'] <= self.params['TRAIN_SIZE']
        end = self.INDEX_IN_EPOCH
        return {next_batch: (self.features[start:end], self.labels[start:end])}

    def split_dataset(self, mode):
        if mode == 'training':
            return (self.features[:self.params['TRAIN_SIZE']], self.labels[:self.params['TRAIN_SIZE']], 
            self.params['TRAIN_SIZE'])
        elif mode == 'validation':
            return (self.features[self.params['TRAIN_SIZE']:], self.labels[self.params['TRAIN_SIZE']:],
            self.params['NUM_EXAMPLES'] - self.params['TRAIN_SIZE'])
        else:
            raise ArgumentError
    
    def initializers(iterator):
        return tf.global_variables_initializer()
    
    def transform(self, dataset):
        p = Pool(self.params['NUM_CORES'])
        return p.map(self.augment, zip(*dataset))

    def augment(self, example):
        return example
        
    def shuffle(self, features, labels, buff):
        perm = np.arange(buff)
        np.random.shuffle(perm)
        return features[perm], labels[perm]

# pipeline strategy 2
class DataAPIPipeline(PipelineStrategy):
    def __init__(self, features, labels, params):
        self.params = params
        self.full_dataset = tf.data.Dataset.from_tensor_slices((features, labels)).shuffle(self.params['NUM_EXAMPLES'])
        self.iterator = None
    
    def get_next_batch(self, mode):
        dataset, buff = self.split_dataset(mode)
        dataset = dataset.cache().shuffle(buff).repeat().batch(self.params['BATCH_SIZE']).prefetch(1)
        # dataset = dataset.shuffle(buff).repeat().batch(self.params['BATCH_SIZE'])
        self.iterator = dataset.make_initializable_iterator()
        return self.iterator.get_next()
    
    def feed_dict(self, next_batch=None, mode=None):
        return None 

    def split_dataset(self, mode):
        if mode == 'training':
            return (self.full_dataset.take(self.params['TRAIN_SIZE']), self.params['TRAIN_SIZE'])
        elif mode == 'validation':
            return (self.full_dataset.skip(self.params['TRAIN_SIZE']), 
            self.params['NUM_EXAMPLES'] - self.params['TRAIN_SIZE'])
        else:
            raise ArgumentError

    def initializers(self):
        return [tf.global_variables_initializer(), self.iterator.initializer] 

    def transform(self, dataset):
        return dataset.map(lambda feature, label: self.augment(feature, label))

    def augment(self, feature, label):
        #feature = tf.image.random_hue(feature, 0.5)
        pass
   
    def shuffle(self, dataset):
        pass

In [0]:
from mnist import MNIST
import os
import numpy as np

def read_raw_mnist(data_path):
    mndata = MNIST(data_path)
    images, labels = mndata.load_training()
    return np.asarray(images), np.asarray(labels)

def read_raw_bfimage(data_path):
    images = []
    labels = []
    for label_index, label in sorted(enumerate(os.listdir(data_path))):
        filename = data_path + label
        label = label.replace(".npy", "")
        for image in np.load(filename): # will load one npy file which may contain many examples
            images.append(image)
            labels.append(label_index)
    return np.asarray(images), np.asarray(labels)

SUPPORTED_DATA = {"mnist": read_raw_mnist, "bfimage": read_raw_bfimage}

def read_raw(data_name, data_path):
    read = (SUPPORTED_DATA.get(data_name, 'KeyError'))
    return read(data_path)

In [19]:
!pip install python-mnist
from mnist import MNIST



In [0]:
import os
import time
import numpy as np
import tensorflow as tf
from models import Model
from pipelines import DataAPIPipeline
from pipelines import FeedDictPipeline
import toml
from tensorflow.python.client import timeline
from multiprocessing import Pool
import data as D

params = {'DATA_NAME' = 'mnist', 'DATA_PATH' = './', 'DTYPE' = 'np.float32', 'HEIGHT' = 28, 
          'WIDTH' = 28, 'CHANNELS' = 1, 'NUM_CLASSES' = 10, 'BATCH_SIZE' = 48, 'TRAIN_RATIO' = 0.8, 'NUM_CORES' = 4}

class Benchmark(object):
    def __init__(self, pipeline, model_name, config_path):
        params = load_params(config_path)
        print("Training '{model}' on '{data}' using '{pipeline}'".format(model=model_name,
            data=params['DATA_NAME'], pipeline=pipeline))
        raw_features, raw_labels = D.read_raw(params['DATA_NAME'], params['DATA_PATH'])
        
        params.update({'NUM_EXAMPLES': len(raw_features)})
        params.update({'TRAIN_SIZE': int(params['TRAIN_RATIO']*params['NUM_EXAMPLES'])})
        self.params = params
        self.model = Model(model_name, params)
        SUPPORTED_PIPELINES = {"feed_dict": FeedDictPipeline, "data_API": DataAPIPipeline}
        PipelineClass = SUPPORTED_PIPELINES.get(pipeline, 'KeyError')
        
        reshape = self.should_reshape(rank=len(raw_features.shape))
        dtype = eval(params['DTYPE'])
        features, labels = self.preprocess(raw_features, raw_labels, dtype, reshape)
        
        self.pipeline = PipelineClass(features, labels, params)
        print("PARAMS: reshape=", reshape, " from ", raw_features.shape, " dtype=", dtype, " batch size=", 
        params['BATCH_SIZE'], " train size=", params['TRAIN_SIZE'])

    
    def preprocess(self, features, labels, dtype, reshape):
        assert len(features) == len(labels)
        if not isinstance(features, dtype):
            features = dtype(features)
        if reshape is not False:
            features = features.reshape(reshape)
        labels = np.array(Pool(self.params['NUM_CORES']).map(self.encode_to_one_hot, labels))
        return np.asarray(features), np.asarray(labels)
    
    def encode_to_one_hot(self, label):
        one_hot_encoding = np.zeros(self.params['NUM_CLASSES'])
        one_hot_encoding[label] = 1
        return one_hot_encoding    
    
    def run(self, mode, iterations, profile):
        next_batch = self.pipeline.get_next_batch(mode)
        fetches = self.model.train(next_batch)
        if profile is not False:
            # graph_writer = tf.summary.FileWriter("pipeline", sess.graph)
            run_metadata = tf.RunMetadata()
            options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        else:
            run_metadata = None
            options = None
        
        with tf.Session() as sess:
            sess.run(self.pipeline.initializers(), options=options, run_metadata=run_metadata)
            avg_acc = 0
            print("START BENCHMARK")
            start = time.time()
            for i in range(iterations):
                loss, optimizer, accuracy = sess.run(fetches,
                feed_dict=self.pipeline.feed_dict(next_batch, mode),
                options=options, run_metadata=run_metadata)
                avg_acc += accuracy
                if i % int(iterations/10) == 0:
                    print("Epoch: {}, loss: {:.3f}, {} accuracy: {:.2f}%".format(i, loss, mode, accuracy * 100))
            
            print("Average {} accuracy over {} iterations is {:.2f}%".format(mode, iterations, (avg_acc / iterations) * 100))
            end = time.time()
            time_per_run = end - start
            avg_acc = avg_acc/iterations
            
            print("END BENCHMARK. TIME: ", time_per_run)
        
            self.generate_trace(run_metadata, profile)
            return time_per_run, avg_acc
    
    def generate_trace(self, run_metadata, profile):
        if profile is False:
            return
        else:
            # Create the Timeline object, and write it to a json file
            fetched_timeline = timeline.Timeline(run_metadata.step_stats)
            chrome_trace = fetched_timeline.generate_chrome_trace_format()
            with open(profile, 'w') as f:
                f.write(chrome_trace)
            return

    def should_reshape(self, rank):
        if rank not in self.model.get_required_dims():
            if rank == 4: # e.g. bfimage, FullyConnected
                return (-1, self.params['HEIGHT']*self.params['WIDTH']*self.params['CHANNELS'])
            elif rank == 2: # e.g. mnist, LowResFrameClassifier
                return (-1, self.params['HEIGHT'], self.params['WIDTH'], self.params['CHANNELS'])
            else:
                ValueError("Invalid data shape")
        else:
            return False

def load_params(config_path, param={}):
    '''
    Load parameters from file.
    '''
    params = {}
    if not os.path.isfile(config_path):
        raise KeyError
    else:
        with open(config_path, 'r', encoding='utf-8') as f:
            params.update(toml.load(f))
        return params

In [23]:
import tensorflow as tf
import numpy as np
import os
import sys
from mnist import MNIST
import math

def main():
    # GPU utilization
    NUM_RUNS = 1
    PATH_TO_CHROME_TRACES = "/project/datasets-API/benchmark-package/chrome-traces/test.json"
    
    benchmark = Benchmark('data_API', 'FullyConnected')
    tot_acc, tot_time = 0, 0
    for i in range(NUM_RUNS):
        tot_time += benchmark.run('training', 500, profile=False)[0]
        # tot_acc += benchmark.run('validation', 1000, profile=False)[1]
    print("RESULTS: Train time=", tot_time/NUM_RUNS, ", Validation accuracy=", tot_acc/NUM_RUNS*100)

Training 'FullyConnected' on 'mnist'
PARAMS: reshape= (-1, 28, 28, 1)  from  (60000, 784)  dtype= <class 'numpy.float32'>  batch size= 64  train size= 48000
START BENCHMARK
Epoch: 0, loss: 2210.548, training accuracy: 4.69%
Epoch: 60, loss: 2425.855, training accuracy: 9.38%
Epoch: 120, loss: 2452.834, training accuracy: 3.12%
Epoch: 180, loss: 2334.035, training accuracy: 9.38%
Epoch: 240, loss: 2543.234, training accuracy: 4.69%
Epoch: 300, loss: 2299.873, training accuracy: 6.25%
Epoch: 360, loss: 2139.520, training accuracy: 10.94%
Epoch: 420, loss: 2525.710, training accuracy: 4.69%
Epoch: 480, loss: 2215.819, training accuracy: 4.69%
Epoch: 540, loss: 2343.555, training accuracy: 7.81%
END BENCHMARK. TIME:  97.03578329086304
RESULTS: Train time= 97.03578329086304 , Validation accuracy= 0.0
