In [5]:
import torch
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib as plt
import random
import time
import csv
# import GPUtil
# import psutil
import datetime

import torch
from torch.nn import Module
from torch import nn

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential, Model

In [6]:
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader, TensorDataset
from torchvision.transforms import ToTensor
from torch.autograd import Variable
from torchvision import transforms


In [7]:
import requests, gzip, os, hashlib
import numpy as np

# https://github.com/geohot/ai-notebooks/blob/master/mnist_from_scratch.ipynb
def fetchMNISTFromURL(url):
    """This function loads and returns the training or testing sets of MNIST dataset (depending on the url passed).
    
    
    Arguments
    ---------
    url: string
        The proportion of the original training dataset that is used during the training process.

    """
    
    fp = os.path.join('./datasets/mnist', hashlib.md5(url.encode('utf-8')).hexdigest())
    if os.path.isfile(fp):
        with open(fp, "rb") as f:
            data = f.read()
            print(type(gzip.decompress(data)))
    else:
        with open(fp, "wb") as f:
            data = requests.get(url).content
            f.write(data)
    return np.frombuffer(gzip.decompress(data), dtype=np.uint8).copy()


def fetchMNIST():
    """This function loads and returns the training and testing sets of the MNIST dataset.
    """
    X_train_mnist = fetchMNISTFromURL("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28))
    y_train_mnist = fetchMNISTFromURL("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")[8:]
    X_test_mnist = fetchMNISTFromURL("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28))
    y_test_mnist = fetchMNISTFromURL("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")[8:]
    
    return X_train_mnist, y_train_mnist, X_test_mnist, y_test_mnist

In [8]:
def load_and_preprocess_data(training_size):
    """This function loads and preprocesses (i.e., pads) the MNIST dataset. The function returns
    the training dataset (according to the passed training size), the testing dataset and the 
    larger testing dataset (i.e., the dataset that will be used during the inference phase).
    
    
    Arguments
    ---------
    training_size: float
        The proportion of the original training dataset that is used during the training process.
    """
    X_train, y_train, X_test, y_test = fetchMNIST()
    
    X_train = X_train[:int(training_size*X_train.shape[0])]
    y_train = y_train[:X_train.shape[0]]
    
    X_test_ext = X_test.copy()
    y_test_ext = y_test.copy()
    print(X_test_ext.shape)
    for j in range(100):
        X_test_ext = np.append(X_test_ext, X_test.copy(), axis=0)
        y_test_ext = np.append(y_test_ext, y_test.copy(), axis=0)
    print(X_test_ext.shape)

    X_train_padded = X_train.reshape(X_train.shape[0], 28, 28, 1)
    X_test_padded = X_test.reshape(X_test.shape[0], 28, 28, 1)
    X_test_padded_ext = X_test_ext.reshape(X_test_ext.shape[0], 28, 28, 1)

    X_train_padded = np.pad(X_train_padded, ((0,0),(2,2),(2,2), (0,0)), 'constant')
    X_test_padded = np.pad(X_test_padded, ((0,0),(2,2),(2,2), (0,0)), 'constant')
    X_test_padded_ext = np.pad(X_test_padded_ext, ((0,0),(2,2),(2,2), (0,0)), 'constant')
    
    X_train_padded = np.array(X_train_padded / 255.0)
    X_test_padded = np.array(X_test_padded / 255.0)
    X_test_padded_ext = np.array(X_test_padded_ext / 255.0)
    
    return X_train_padded, y_train, X_test_padded, y_test, X_test_padded_ext, y_test_ext

In [9]:
 # load_and_preprocess_data(0.1)

In [10]:
device_dict = {
    'cpu': {
        'PyTorch': 'cpu',
        'Keras': '/cpu:0',
        'TensorFlow': '/CPU:0'
    },
    'gpu': {
        'PyTorch': 'cuda',
        'Keras': '/gpu:0',
        'TensorFlow': '/GPU:0'
    }
}

weight_initialization_dict = { 
    'xavier': {
        'PyTorch': torch.nn.init.xavier_normal_,
        'Keras': tf.keras.initializers.GlorotNormal,
        'TensorFlow': tf.compat.v1.initializers.glorot_normal
    },
    'he': {
        'PyTorch': torch.nn.init.kaiming_normal_,
        'Keras': tf.keras.initializers.HeNormal,
        'TensorFlow': tf.compat.v1.keras.initializers.he_normal
    }
}





In [11]:
metrics = {
    'PyTorch': {},
    'Keras': {},
    'TensorFlow': {}
}

In [12]:
import arguments

In [13]:
i = arguments.i
training_size = arguments.training_size
batch_size = arguments.batch_size
n_epochs = arguments.n_epochs
learning_rate = arguments.learning_rate
data_type = arguments.data_type
device = arguments.device
weight_initialization = arguments.weight_initialization
framework = arguments.framework
dropout = arguments.dropout
phase = arguments.phase

In [14]:
# i = 0
# training_size = 1
# batch_size = 256
# n_epochs = 10
# learning_rate = 0.01
# data_type = 'float32'
# device = 'cpu'
# weight_initialization = 'xavier'
# framework = 'TensorFlow'
# dropout = 0.25
# phase = 'inference'

In [15]:
experiment = 'lenet5_{}{}_{}ts_{}batch_{}epochs_{}lr_{}dtype_{}_{}wi_{}dp'.format(framework, i,
                                                                              training_size, batch_size,
                                                                              n_epochs, learning_rate, data_type,
                                                                              device, weight_initialization, dropout)


In [16]:
#If the LSTM model dirrectory (i.e., the directory where the models are saved) does not exist, we create it.
if not os.path.isdir('./models'):
    os.mkdir('./models')
if not os.path.isdir('./models/lenet5'):
    os.mkdir('./models/lenet5')
if not os.path.isdir('./models/lenet5/{}'.format(experiment)):
    os.mkdir('./models/lenet5/{}'.format(experiment))

In [17]:
if not os.path.isdir('./Results/lenet5/'):
    os.mkdir('./Results/lenet5/')

In [18]:
X_train_padded, y_train, X_test_padded, y_test, X_test_padded_ext, y_test_ext = load_and_preprocess_data(training_size)

<class 'bytes'>
<class 'bytes'>
<class 'bytes'>
<class 'bytes'>
(10000, 28, 28)
(5010000, 28, 28)


MemoryError: Unable to allocate 38.2 GiB for an array with shape (5010000, 32, 32, 1) and data type float64

In [None]:
import torch
from torch.nn import Module
from torch import nn
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib as plt
import random
import time
import csv
# import GPUtil
# import psutil
import tensorflow as tf
import datetime

collect_time = 0

#Making sure the tensorflow doesn't take up all the VRAM available on GPU
if device == 'gpu':

    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

# print('GPU mem: {}'.format((GPUtil.getGPUs()[0].memoryUsed / GPUtil.getGPUs()[0].memoryTotal) * 100))
# print('CPU mem: {}'.format(psutil.virtual_memory().used / psutil.virtual_memory().total))



#we try to minimize the randomness as much as possible
torch.manual_seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(0)
# tf.random.set_seed(0)
random.seed(0)
np.random.seed(0)


print('Training {}'.format(experiment))

training_time = 0
inference_time = 0
accuracy = 0

train_start_timestamp = 0
train_end_timestamp = 0

inference_start_timestamp = 0
inference_end_timestamp = 0


# X_train_mnist_temp = X_train_mnist[:int(training_size*len(X_train_mnist))]
# y_train_mnist = y_train_mnist[:len(X_train_mnist_temp)]

# training_mnist = X_train_mnist_temp.reshape(X_train_mnist_temp.shape[0], 28, 28, 1)
# testing_mnist = X_test_mnist.reshape(X_test_mnist.shape[0], 28, 28, 1)

# training_mnist = np.pad(training_mnist, ((0,0),(2,2),(2,2), (0,0)), 'constant')
# testing_mnist = np.pad(testing_mnist, ((0,0),(2,2),(2,2), (0,0)), 'constant')

# if data_type == 'mixed':
#     training_mnist_norm = np.array(training_mnist / 255.0, dtype=np.float16)
#     testing_mnist_norm = np.array(testing_mnist / 255.0, dtype=np.float16)
# else:
#     training_mnist_norm = np.array(training_mnist / 255.0)
#     testing_mnist_norm = np.array(testing_mnist / 255.0)

if framework == 'PyTorch':
    
    
    
    from torch.utils.data import Dataset, DataLoader
    from torch.nn import CrossEntropyLoss
    from torch.optim import SGD
    from torch.utils.data import DataLoader, TensorDataset
    from torchvision.transforms import ToTensor
    from torch.autograd import Variable
    from torchvision import transforms

    import pytorch_lenet5
    
    pytorch_train_loader, pytorch_test_loader, pytorch_test_loader_ext = pytorch_lenet5.generate_pytorch_dataloader(X_train_padded, X_test_padded,
                                                                                                                  X_test_padded_ext, y_train,
                                                                                                                  y_test, y_test_ext, batch_size,
                                                                                                                  device_dict[device][framework])

    model = pytorch_lenet5.PyTorchLenet5Mod(weight_initialization_dict[weight_initialization][framework], dropout)
    model = model.to(device_dict[device][framework])
    
    if phase == 'training':
        from torch.optim import SGD
        
        optimizer = SGD(model.parameters(), lr=learning_rate)
        training_time, inference_time, accuracy, train_start_timestamp, train_end_timestamp = pytorch_lenet5.pytorch_training_phase(model, optimizer,
                                                                                                                                  pytorch_train_loader, pytorch_test_loader,
                                                                                                                                  n_epochs, device_dict[device][framework],
                                                                                                                                  data_type, experiment)
    elif phase == 'inference':
        inference_start_timestamp, inference_end_timestamp = pytorch_lenet5.pytorch_inference_phase(model, experiment, pytorch_test_loader_ext,
                                                                                                  device_dict[device][framework], data_type)
    
    #We take the mean time the model takes to infer a single sample.
    inference_time /= X_test_padded.shape[0]
    


if framework == 'Keras':

    import os
    os.environ['TF2_BEHAVIOR'] = '1'
    import tensorflow as tf

    import keras_lenet5
    
    tf.random.set_seed(0)

    if data_type == 'mixed':
        policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
        tf.keras.mixed_precision.experimental.set_policy(policy)
    
    if phase == 'training':
    
        model = keras_lenet5.initialize_keras_lenet5(weight_initialization_dict[weight_initialization][framework], dropout)
    
        optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
        loss_fn = keras.losses.sparse_categorical_crossentropy
        
        training_time, inference_time, accuracy, train_start_timestamp, train_end_timestamp = keras_lenet5.keras_training_phase(model, optimizer,
                                                                                                                              loss_fn, X_train_padded,
                                                                                                                              y_train, X_test_padded,
                                                                                                                              y_test, batch_size, n_epochs,
                                                                                                                              device_dict[device][framework],
                                                                                                                              data_type, experiment)
    elif phase == 'inference':
        inference_start_timestamp, inference_end_timestamp = keras_lenet5.keras_inference_phase(X_test_padded_ext, y_test_ext,
                                                                                              batch_size, device_dict[device][framework],
                                                                                              data_type, experiment)



    # if device == 'gpu':
    #     training_time, inference_time, accuracy, cpu_utilization, cpu_mem, gpu_utilization, gpu_mem = KerasLenet5(experiment, training_mnist_norm, y_train_mnist, testing_mnist_norm, y_test_mnist, batch_size, n_epochs, learning_rate, data_type, device_dict[device][framework], weight_initialization_dict[weight_initialization][framework])
    # else:
    #     training_time, inference_time, accuracy, cpu_utilization, cpu_mem, _, _ = KerasLenet5(experiment, training_mnist_norm, y_train_mnist, testing_mnist_norm, y_test_mnist, batch_size, n_epochs, learning_rate, data_type, device_dict[device][framework], weight_initialization_dict[weight_initialization][framework])

    # keras_lenet5_model.save('./models/{}'.format(experiment))

    print('{}\tTraining time: {}\tInference time: {}\tAccuracy: {}'.format(experiment, training_time,
                                                                          inference_time, accuracy))

if framework == 'TensorFlow':
    #the first version of tensorflow needs to be used, as tensorflow 2.0 uses keras by default
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

    tf.compat.v1.set_random_seed(0)
    
    import tensorflow_lenet5

    
    
    with tf.device(device_dict[device][framework]):
        with tf.compat.v1.variable_scope(name_or_scope='TensorFlowLenet5', reuse=tf.compat.v1.AUTO_REUSE,
                                         initializer=weight_initialization_dict[weight_initialization][framework]):

            model = tensorflow_lenet5.TensorFlowLenet5Mod(weight_initialization_dict[weight_initialization][framework], dropout)
            
            #Collecting the lenghts of the sequences (note that all the sequences are of the same length as they
            #have been padded).
            # lens_train = np.array([len(xi) for xi in X_train_padded], dtype='int32')
            # lens_test = np.array([len(xi) for xi in X_test_padded], dtype='int32')
            # lens_test_ext = np.array([len(xi) for xi in X_test_padded_ext], dtype='int32')
            if phase == 'training':
            # (model, learning_rate, X_train_padded,
            #           y_train, X_test_padded,
            #           y_test, batch_size, n_epochs, device, data_type,
            #           experiment):

                training_time, inference_time, accuracy, train_start_timestamp, train_end_timestamp = tensorflow_lenet5.tensorflow_training_phase(model, learning_rate, X_train_padded, y_train,
                                                                                                                                                  X_test_padded, y_test, batch_size, n_epochs,
                                                                                                                                                  device_dict[device][framework], data_type, experiment)
            elif phase == 'inference':
                
                inference_start_timestamp, inference_end_timestamp = tensorflow_lenet5.tensorflow_inference_phase(model, X_test_padded_ext,
                                                                                                                  y_test_ext, batch_size,
                                                                                                                  device_dict[device][framework],
                                                                                                                  data_type, experiment)

#Writing the results collected during the training or the inference phase
if phase == 'training':
    results = {
        'training_time': training_time,
        'inference_time': inference_time,
        'accuracy': accuracy,
        'train_start_timestamp': train_start_timestamp,
        'train_end_timestamp': train_end_timestamp
    }

    with open('./Results/lenet5/{}.txt'.format(experiment), 'w+', encoding='utf-8') as f:
        for fieldName in results.keys():
            f.write('{} = {}\n\n'.format(fieldName, results[fieldName]))
elif phase == 'inference':
    results = {
        'inference_start_timestamp': inference_start_timestamp,
        'inference_end_timestamp': inference_end_timestamp
    }
    
    with open('./Results/lenet5/{}.txt'.format(experiment), 'a', encoding='utf-8') as f:
        for fieldName in results.keys():
            f.write('{} = {}\n\n'.format(fieldName, results[fieldName]))

print(results)

print()

In [35]:
import time
import csv
import datetime

In [None]:
def read_results(is_complete=True):
    """This function reads the file that contains the results of the evaluation of the model.
    If this function is called after the inference phase is done (i.e., is_complete=True), then
    the function returns a dictionary containing the training time, the inference time, the accuracy,
    the training and the inference time stamps and the hardware utilization metrics. On the other hand,
    if the function is called before the inference phase (i.e., is_complete=False), then the function
    will return a dictionary containing all the above except for the hardware utilization metrics.
    
    Arguments:
    ----------
    is_complete: Boolean
        This boolean indicates whether the inference phase is done or not.
    """
    
    results = {}
    with open('./Results/lenet5/{}.txt'.format(experiment), 'r', encoding='utf-8') as f:
        
        s = f.read()

        results['training_time'] = float(s.split('training_time = ')[1].split('\n\n')[0])
        results['inference_time'] = float(s.split('inference_time = ')[1].split('\n\n')[0])
        results['accuracy'] = float(s.split('accuracy = ')[1].split('\n\n')[0])

        try:
            results['train_start_timestamp'] = datetime.datetime.strptime(s.split('train_start_timestamp = ')[1].split('\n\n')[0],
                                                                          '%Y-%m-%d %H:%M:%S.%f')
        except:
            results['train_start_timestamp'] = datetime.datetime.strptime(s.split('train_start_timestamp = ')[1].split('\n\n')[0],
                                                                          '%Y-%m-%d %H:%M:%S')

        try:
            results['train_end_timestamp'] = datetime.datetime.strptime(s.split('train_end_timestamp = ')[1].split('\n\n')[0],
                                                                        '%Y-%m-%d %H:%M:%S.%f')
        except:
            results['train_end_timestamp'] = datetime.datetime.strptime(s.split('train_end_timestamp = ')[1].split('\n\n')[0],
                                                                        '%Y-%m-%d %H:%M:%S')



        try:
            results['inference_start_timestamp'] = datetime.datetime.strptime(s.split('inference_start_timestamp = ')[1].split('\n\n')[0],
                                                                              '%Y-%m-%d %H:%M:%S.%f')
        except:
            results['inference_start_timestamp'] = datetime.datetime.strptime(s.split('inference_start_timestamp = ')[1].split('\n\n')[0],
                                                                              '%Y-%m-%d %H:%M:%S')

        try:
            results['inference_end_timestamp'] = datetime.datetime.strptime(s.split('inference_end_timestamp = ')[1].split('\n\n')[0],
                                                                            '%Y-%m-%d %H:%M:%S.%f')
        except:
            new_res['inference_end_timestamp'] = datetime.datetime.strptime(s.split('inference_end_timestamp = ')[1].split('\n\n')[0],
                                                                            '%Y-%m-%d %H:%M:%S')

            
        if is_complete:
            results['cpu_utilization_train'] = [float(samp) for samp in s.split('cpu_utilization_train = [')[1].split('\n\n')[0].replace(']', '').split(', ')]
            results['cpu_mem_train'] = [float(samp) for samp in s.split('cpu_mem_train = [')[1].split('\n\n')[0].replace(']', '').split(', ')]
            results['gpu_utilization_train'] = [float(samp) for samp in s.split('gpu_utilization_train = [')[1].split('\n\n')[0].replace(']', '').split(', ')]
            results['gpu_mem_train'] = [float(samp) for samp in s.split('gpu_mem_train = [')[1].split('\n\n')[0].replace(']', '').split(', ')]

            results['cpu_utilization_infer'] = [float(samp) for samp in s.split('cpu_utilization_infer = [')[1].split('\n\n')[0].replace(']', '').split(', ')]
            results['cpu_mem_infer'] = [float(samp) for samp in s.split('cpu_mem_infer = [')[1].split('\n\n')[0].replace(']', '').split(', ')]
            results['gpu_utilization_infer'] = [float(samp) for samp in s.split('gpu_utilization_infer = [')[1].split('\n\n')[0].replace(']', '').split(', ')]
            results['gpu_mem_infer'] = [float(samp) for samp in s.split('gpu_mem_infer = [')[1].split('\n\n')[0].replace(']', '').split(', ')]
        
    return results

In [None]:
def collect_sampled_metrics():
    """This function collects the hardware utilization metrics that corresond to the training and inference
    processes according to the time stamps of the training and inference (which are collected from the results
    file) and the time stamps at which the corresponding metric was sampled.
    """
    results = read_results(is_complete=False)

    results['cpu_utilization_train'] = []
    results['cpu_mem_train'] = []
    results['gpu_utilization_train'] = []
    results['gpu_mem_train'] = []
    
    results['cpu_utilization_infer'] = []
    results['cpu_mem_infer'] = []
    results['gpu_utilization_infer'] = []
    results['gpu_mem_infer'] = []


    with open('./Results/lenet5/metric_sampling.csv', 'r', encoding='utf-8') as csvFile:
        csvReader = csv.reader(csvFile, delimiter=',')

        for row in csvReader:
            if row[0] == 'CPU Utilization':
                continue

            sample_timestamp_str = row[4]

            try:
                sample_timestamp_datetime = datetime.datetime.strptime(sample_timestamp_str, '%Y-%m-%d %H:%M:%S.%f')
            except:
                sample_timestamp_datetime = datetime.datetime.strptime(sample_timestamp_str, '%Y-%m-%d %H:%M:%S')


            if results['inference_end_timestamp'] < sample_timestamp_datetime:
                break
            
            if results['inference_start_timestamp'] <= sample_timestamp_datetime and results['inference_end_timestamp'] >= sample_timestamp_datetime:
                results['cpu_utilization_infer'].append(float(row[0]))
                results['cpu_mem_infer'].append(float(row[1]))

                results['gpu_utilization_infer'].append(float(row[2]))
                results['gpu_mem_infer'].append(float(row[3]))
                continue

            if results['train_start_timestamp'] > sample_timestamp_datetime:
                continue

            if results['train_end_timestamp'] < sample_timestamp_datetime:
                continue

            
                
            results['cpu_utilization_train'].append(float(row[0]))
            results['cpu_mem_train'].append(float(row[1]))

            results['gpu_utilization_train'].append(float(row[2]))
            results['gpu_mem_train'].append(float(row[3]))


    with open('./Results/lenet5/{}.txt'.format(experiment), 'w+', encoding='utf-8') as f:
        for fieldName in results.keys():
            f.write('{} = {}\n\n'.format(fieldName, results[fieldName]))

In [None]:
#We only collect the hardware utilization metrics after the inference phase
if phase == 'inference':
    collect_sampled_metrics()