# Analyze how the batch size affects the performance across installed Caffe variants and models

**NB:** This is an early version of this notebook. Please see e.g. http://github.com/dividiti/ck-caffe-nvidia-tx1 for a more robust and up-to-date example.

## Includes

### Standard

In [None]:
import os
import sys
import json
import re

### Scientific

In [None]:
import IPython as ip
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp

In [None]:
# Report the versions of the scientific packages imported above, one per line.
for version_format, package in (
    ('IPython version: %s', ip),
    ('NumPy version: %s', np),
    ('SciPy version: %s', sp),
    ('Pandas version: %s', pd),
    ('Matplotlib version: %s', mp),
):
    print (version_format % package.__version__)

In [None]:
import matplotlib.pyplot as plt
# Only when the first character of the Matplotlib version string is '2',
# switch to the 'classic' style.
# NOTE(review): presumably meant to keep the pre-2.0 look of the plots; the check
# does not match later major versions - confirm whether that is intended.
if mp.__version__[0]=='2': mp.style.use('classic')
from matplotlib import cm
# IPython magic: render figures inline in the notebook.
%matplotlib inline

In [None]:
from IPython.display import display
def display_in_full(df):
    """Display `df` with pandas' row and column display limits set to its own size.

    NB: The limits are changed through the global pandas options and are not
    restored afterwards, so they also affect whatever is displayed later.
    """
    n_rows, n_columns = len(df.index), len(df.columns)
    pd.set_option('display.max_columns', n_columns)
    pd.set_option('display.max_rows', n_rows)
    display(df)

### Collective Knowledge

In [None]:
import ck.kernel as ck
# Report the version of the Collective Knowledge kernel imported above as `ck`.
print ('CK version: %s' % ck.__version__)

## Access Caffe experimental data

In [None]:
def get_experimental_results(tags='explore-batch-size-libs-models', repo_uoa='local'):
    """Load CK experiment results into a single pandas DataFrame.

    Searches the CK repository `repo_uoa` for 'experiment' entries matching `tags`,
    reads the 'ckp-<point>.0001.json' file of every point of every experiment found,
    and combines the per-repetition characteristics of all experiments.

    Args:
        tags: tags passed to the CK 'search' action.
        repo_uoa: CK repository to search.

    Returns:
        A DataFrame whose rows are indexed by ('batch size', 'repetition') and whose
        columns are indexed by ('characteristic', 'lib', 'model'), where the
        characteristics are 'memory (MB)', 'time (ms)' and 'success?'.

    Raises:
        RuntimeError: if a CK call returns an error, or if no experiment with
            at least one point is found.
    """
    module_uoa = 'experiment'

    def ck_access(request):
        # Call CK and raise (rather than exit the interpreter) on a non-zero return code.
        r = ck.access(request)
        if r['return']>0:
            raise RuntimeError("Error: %s" % r['error'])
        return r

    # Get (lib_tag, model_tag) from a list of tags that should be available in r['dict']['tags'].
    # Tags include some of the irrelevant tags (e.g. command name), a model tag and a lib tag.
    # NB: Since it's easier to list a few model tags than many lib tags, the latter list is not explicitly specified.
    def get_lib_model_tags(tags):
        irrelevant_tags = [ 'explore-batch-size-libs-models', 'time_gpu', 'time_cpu', 'default' ]
        known_model_tags = [ 'bvlc-alexnet', 'bvlc-googlenet', 'deepscale-squeezenet-1.0', 'deepscale-squeezenet-1.1' ]
        lib_model_tags = [ tag for tag in tags if tag not in irrelevant_tags ]
        model_tags = [ tag for tag in lib_model_tags if tag in known_model_tags ]
        # Whatever is neither irrelevant nor a known model is taken to be a lib tag.
        lib_tags = [ tag for tag in lib_model_tags if tag not in known_model_tags ]
        return (lib_tags[0], model_tags[0])

    # Turn a column holding one list per row (one element per repetition) into
    # separate columns labelled (characteristic, repetition).
    def expand_repetitions(df, characteristic):
        expanded = df[characteristic].apply(pd.Series)
        expanded.columns = [ [characteristic]*len(expanded.columns), expanded.columns ]
        return expanded

    r = ck_access({'action':'search', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'tags':tags})
    experiments = r['lst']

    df_all = None
    for experiment in experiments:
        data_uoa = experiment['data_uoa']
        r = ck_access({'action':'list_points', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'data_uoa':data_uoa})
        if not r['points']:
            # Nothing to tabulate for this experiment; an empty table cannot be indexed below.
            print ("Warning: no points in experiment '%s'; skipping" % data_uoa)
            continue

        # The tags are the same for all points of an experiment, so resolve them once.
        lib_tag, model_tag = get_lib_model_tags(r['dict']['tags'])

        results = []
        for point in r['points']:
            point_path = os.path.join(r['path'], 'ckp-%s.0001.json' % point)
            with open(point_path) as point_file:
                point_data_raw = json.load(point_file)
            # Keep only the repetitions that recorded a (non-empty) 'run_success' value.
            run_info_list = [
                characteristics['run']
                for characteristics in point_data_raw['characteristics_list']
                if characteristics['run'].get('run_success','')!=''
            ]
            # Select characteristics of interest.
            results.append({
                'lib'         : lib_tag,
                'model'       : model_tag,
                'batch size'  : int(point_data_raw['choices']['env']['CK_CAFFE_BATCH_SIZE']),
                'time (ms)'   : [ float(run_info.get('time_fw_ms',0)) for run_info in run_info_list ],
                'memory (MB)' : [ int(run_info.get('memory_mbytes',0)) for run_info in run_info_list ],
                'success?'    : [ run_info.get('run_success','n/a') for run_info in run_info_list ]
            })

        df_new = pd.DataFrame(data=results).set_index(['lib', 'model', 'batch size'])
        # Convert the per-repetition lists into separate columns and join them together.
        # NB: More beautiful code can be found e.g. at http://github.com/dividiti/ck-caffe-nvidia-tx1.
        df_new = expand_repetitions(df_new, 'memory (MB)') \
            .join(expand_repetitions(df_new, 'time (ms)')) \
            .join(expand_repetitions(df_new, 'success?'))
        df_new.columns.names = ['characteristic', 'repetition']
        # Move repetitions into the rows, and libs and models into the columns.
        df_new = df_new.stack('repetition').unstack(['lib', 'model'])
        df_all = df_new if df_all is None else df_all.join(df_new)

    if df_all is None:
        raise RuntimeError("Error: no experimental results found for tags '%s' in repository '%s'" % (tags, repo_uoa))
    return df_all

In [None]:
# Load all results tagged with the default tags from the 'ck-caffe-odroid-xu3-thresh' CK repository.
df_all = get_experimental_results(repo_uoa='ck-caffe-odroid-xu3-thresh')

## All execution time data indexed by repetitions

In [None]:
# Take the execution times, move the batch size from the rows into the columns
# (leaving the repetitions as rows), make the values numeric and sort the columns by lib and model.
# NB: `sort_index` is used instead of `sortlevel`, which newer pandas versions no longer provide.
df_time = df_all['time (ms)'].unstack('batch size').apply(pd.to_numeric).sort_index(level=['lib', 'model'], axis=1)
display_in_full(df_time)

## Mean execution time per batch

In [None]:
# Take the mean over the repetitions for each column of df_time, then lay the result out
# with one column per batch size.
# NB: `.loc` is used instead of `.ix`, which newer pandas versions no longer provide.
df_mean_time_per_batch = df_time.describe().loc['mean'].unstack(level='batch size')
display_in_full(df_mean_time_per_batch)

In [None]:
# The batch sizes, taken from the column labels of the mean-time-per-batch table.
batch_sizes = df_mean_time_per_batch.columns.tolist()
batch_sizes

## Mean execution time per image

In [None]:
# Divide each column (one per batch size) by its batch size to get the mean time per image.
df_mean_time_per_image = df_mean_time_per_batch / batch_sizes
display_in_full(df_mean_time_per_image)

## Best mean execution time per image

In [None]:
# For each row, the smallest mean time per image over all batch sizes.
df_mean_time_per_image.min(axis=1)

In [None]:
# What is the batch size that gives the minimum time per image (or the maximum number of images per second)?
# For each row, this is the column label (batch size) at which the mean time per image is smallest.
df_mean_time_per_image.idxmin(axis=1)

## Use the batch size with the best mean execution time per image

In [None]:
# Divide every column of df_time by its batch size. The list of batch sizes is repeated
# once per group of columns so that it has one divisor per column.
# NB: Uses integer division (//) for the repeat count: multiplying a list by the float
# produced by / fails under Python 3.
# NOTE(review): assumes the columns of df_time are ordered so that the batch size
# varies fastest within each group - confirm.
df_time_per_image = df_time / (batch_sizes*(len(df_time.columns)//len(batch_sizes)))
display_in_full(df_time_per_image)

In [None]:
# Build the column keys of the best-performing configurations: the best batch size of each row
# of df_mean_time_per_image is appended to that row's index, and the resulting index values are extracted.
df_min_time_per_image_index = pd.DataFrame(df_mean_time_per_image.idxmin(axis=1)).set_index(0, append=True).index.values
# Select only those columns from the per-image times, then move 'model' and 'lib' from the
# columns into the rows and order the row levels as (model, lib, repetition).
df_model_lib = df_time_per_image[df_min_time_per_image_index] \
     .stack(['model', 'lib']).reorder_levels(['model','lib','repetition'])
df_model_lib

In [None]:
# Mean and standard deviation over the repetitions for each (model, lib) pair.
df_model_lib_mean = df_model_lib.groupby(level=['model', 'lib']).mean()
df_model_lib_std  = df_model_lib.groupby(level=['model', 'lib']).std()

In [None]:
def plot(mean, std, ymax=0, title='Execution time per image (ms)'):
    """Draw `mean` as a bar chart with `std` as error bars.

    If `ymax` is 0 (the default), the largest value in `mean` is used instead.
    The y axis runs from 0 to 5% above `ymax`.
    """
    if ymax==0:
        ymax = mean.max().max()
    bar_chart_options = dict(
        yerr=std,
        title=title,
        kind='bar',
        ylim=[0,ymax*1.05],
        rot=0,
        figsize=[16, 8],
        grid=True,
        legend=True,
        colormap=cm.autumn,
    )
    mean.plot(**bar_chart_options)

## Plot by Caffe models

### All

In [None]:
# Move 'lib' into the columns, leaving 'model' as the rows, then plot.
mean = df_model_lib_mean.unstack('lib')
std  = df_model_lib_std.unstack('lib')
plot(mean, std)

### Selection: AlexNet, SqueezeNet

In [None]:
# Show the mean time per image for every (model, lib) pair.
df_model_lib_mean

In [None]:
# Restrict the means and standard deviations to AlexNet and SqueezeNet 1.1,
# move 'lib' into the columns, then plot.
# NB: `.loc` is used instead of `.ix`, which newer pandas versions no longer provide.
mean = df_model_lib_mean.loc[['bvlc-alexnet', 'deepscale-squeezenet-1.1']].unstack('lib')
std  = df_model_lib_std.loc[['bvlc-alexnet', 'deepscale-squeezenet-1.1']].unstack('lib')
plot(mean, std)

In [None]:
# Ratio of the second row to the first row of the selected models, per lib.
# NOTE(review): presumably SqueezeNet 1.1 relative to AlexNet - confirm the row order.
# NB: `.loc` is used instead of `.ix`, which newer pandas versions no longer provide.
df_model_lib_mean.loc[['bvlc-alexnet', 'deepscale-squeezenet-1.1']].unstack('lib').iloc[1] / \
df_model_lib_mean.loc[['bvlc-alexnet', 'deepscale-squeezenet-1.1']].unstack('lib').iloc[0]

### Selection: CUDA-level performance

## Plot by Caffe libs (variants)

### All

In [None]:
# Move 'model' into the columns, leaving 'lib' as the rows, then plot.
mean = df_model_lib_mean.unstack('model')
std  = df_model_lib_std.unstack('model')
plot(mean, std)

### Selection: AlexNet-level accuracy

In [None]:
# Models grouped here as having AlexNet-level accuracy.
alexnet_level_accuracy = [ 'bvlc-alexnet', 'deepscale-squeezenet-1.1' ]
# Restrict the means and standard deviations to those models, move 'model' into the columns, then plot.
# NB: `.loc` is used instead of `.ix`, which newer pandas versions no longer provide.
mean = df_model_lib_mean.loc[alexnet_level_accuracy].unstack('model')
std  = df_model_lib_std.loc[alexnet_level_accuracy].unstack('model')
plot(mean, std)

### Selection: AlexNet-level accuracy, CUDA-level performance

In [None]:
# NB: Disabled: `cuda_level_performance` is not defined in the visible part of this notebook.
# mean = mean.ix[cuda_level_performance]
# std = std.ix[cuda_level_performance]
# plot(mean, std)

## All memory size data

In [None]:
# Batch size of 2; repetition 0 (should be always available).
df_memory = df_all['memory (MB)'].loc[2].loc[0].unstack('lib')
plot(df_memory, pd.DataFrame(), title='Memory consumption (MB)')

In [None]:
# Ratio of the memory consumption of the second selected row to the first, per lib.
# NOTE(review): presumably SqueezeNet 1.1 relative to AlexNet - confirm the row order.
# NB: `.loc` is used instead of `.ix`, which newer pandas versions no longer provide.
df_memory.loc[['bvlc-alexnet', 'deepscale-squeezenet-1.1']].iloc[1] / \
df_memory.loc[['bvlc-alexnet', 'deepscale-squeezenet-1.1']].iloc[0]