# [PUBLIC] Analyse kernel profiling

<a id="data"></a>
## Get the experimental data from Dropbox

**NB:** Please ignore this section if you are not interested in re-running or modifying this notebook. 

The experimental data was collected on the experimental platform and archived as follows:
```
$ cd `ck find ck-caffe:script:dvdt-prof`
$ python explore-dvdt-prof-libs-models-benchmarking.py
$ ck zip local:experiment:dvdt-prof-* --archive_name=ck-caffe-dvdt-prof-<...>.zip
```
The data can be downloaded and extracted as follows:

```
$ wget http://dl.dropboxusercontent.com/u/<...>/ck-caffe/public/dvdt-prof-<...>.zip
$ ck add repo:ck-caffe-dvdt-prof-<...> --zip=ck-caffe-dvdt-prof-<...>.zip --quiet
```

## Includes

### Standard

In [None]:
import os
import sys
import json
import time

### Scientific

In [None]:
import IPython as ip
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp

In [None]:
# Report the versions of the scientific packages imported above, one per line.
print('\n'.join(
    '%s version: %s' % (label, module.__version__)
    for (label, module) in (
        ('IPython', ip),
        ('NumPy', np),
        ('SciPy', sp),
        ('Pandas', pd),
        ('Matplotlib', mp),
    )
))

In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
# import scipy.stats as st

In [None]:
from IPython.display import display

### Collective Knowledge

In [None]:
# Import the Collective Knowledge (CK) kernel and report which version is in use.
import ck.kernel as ck
print('CK version: %s' % ck.__version__)

In [None]:
# NB: Install dvdt-prof first e.g. as "ck install ck-caffe/package/tool-dvdt-prof-cjson".
r=ck.access({'action':'show', 'module_uoa':'env', 'tags':'tool,opencl,dvdt,prof'})
if r['return']>0:
    print ("Error: %s" % r['error'])
    exit(1)
# Get the path to the first returned environment entry.
dvdt_prof_dir=r['lst'][0]['meta']['env']['CK_ENV_TOOL_DVDT_PROF']
dvdt_prof_src_python=os.path.join(dvdt_prof_dir,'src','python')
sys.path.append(dvdt_prof_src_python)
import prof_wrangler as pw
import prof_common as pc
pw.test()
pc.test()

## Access experimental results

In [None]:
def get_experimental_results(repo_uoa='local', tags='dvdt-prof'):
    """Load dvdt-prof experiment points from CK into a single DataFrame.

    Searches `repo_uoa` for 'experiment' entries matching `tags`, lists the
    points of every entry, reads each point's 'ckp-<point>.0001.json' file and
    keeps one row per repetition whose 'run_success' entry is non-empty.

    Args:
        repo_uoa: CK repository to search and to list points from.
        tags: CK tags used to select the experiment entries.

    Returns:
        A DataFrame indexed by (platform, lib, model, batch_size, repetition_id)
        with the columns 'time (ms)', 'dvdt_prof_info' and 'per_layer_info',
        sorted by index. An empty DataFrame is returned when no entry yields
        any row.

    Raises:
        RuntimeError: if a CK request reports a non-zero return code.
    """
    module_uoa = 'experiment'
    # NB: Since it's easier to list all model tags than all lib tags, the latter list is not explicitly specified.
    irrelevant_tags = [ 'dvdt-prof', 'caffe' ]
    known_model_tags = [ 'bvlc-alexnet','bvlc-googlenet','deepscale-squeezenet-1.0','deepscale-squeezenet-1.1' ]

    found = ck.access({'action':'search', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'tags':tags})
    if found['return']>0:
        raise RuntimeError("Error: %s" % found['error'])

    dfs = []
    for experiment in found['lst']:
        data_uoa = experiment['data_uoa']
        listed = ck.access({'action':'list_points', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'data_uoa':data_uoa})
        if listed['return']>0:
            raise RuntimeError("Error: %s" % listed['error'])

        # Get (lib, model) from the entry's tags: once the irrelevant tags are dropped,
        # exactly one model tag and one lib tag must remain; otherwise skip the entry.
        lib_model_tags = [ tag for tag in listed['dict']['tags'] if tag not in irrelevant_tags ]
        model_tags = [ tag for tag in lib_model_tags if tag in known_model_tags ]
        lib_tags = [ tag for tag in lib_model_tags if tag not in known_model_tags ]
        if len(lib_tags)!=1 or len(model_tags)!=1:
            continue
        (lib, model) = (lib_tags[0], model_tags[0])

        for point in listed['points']:
            with open(os.path.join(listed['path'], 'ckp-%s.0001.json' % point)) as point_file:
                point_data_raw = json.load(point_file)
            # DataFrame columns.
            data = [
                {
                    # features
                    'platform' : point_data_raw['features']['platform']['platform']['model'],
                    # choices
                    'lib' : lib,
                    'model' : model,
                    'batch_size' : point_data_raw['choices']['env'].get('CK_CAFFE_BATCH_SIZE',[]),
                    # statistical repetitions
                    'repetition_id' : repetition_id,
                    # runtime characteristics
                    'time (ms)'     : np.float32(characteristics['run'].get('time_fw_ms',0)),
                    'dvdt_prof_info': characteristics['run'].get('dvdt_prof',[]),
                    'per_layer_info': characteristics['run'].get('per_layer_info',[]),
                }
                for (repetition_id, characteristics) in enumerate(point_data_raw['characteristics_list'])
                if characteristics['run'].get('run_success','')!=''
            ]
            if not data:
                # No usable repetition: an empty frame has no columns to index by.
                continue
            df = pd.DataFrame(data)
            df.columns.name = 'characteristics'
            df.index.name = 'index'
            df = df.set_index([ 'platform', 'lib', 'model', 'batch_size', 'repetition_id' ])
            dfs.append(df)

    # Concatenate once, after all entries have been read.
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs).sort_index()

In [None]:
# Load all dvdt-prof experiments from the local repository and lift the
# display limits so that the whole table is rendered.
df = get_experimental_results(repo_uoa='local', tags='dvdt-prof')
pd.set_option('display.max_columns', len(df.columns))
pd.set_option('display.max_rows', len(df.index))
df

## Check execution time distribution

In [None]:
# Summary statistics of the execution time over the repetitions of each
# (platform, lib, model, batch_size) configuration.
# NOTE(review): the factor 8 presumably matches the number of statistics
# reported by describe() - confirm.
pd.set_option('display.max_columns', len(df.columns))
pd.set_option('display.max_rows', len(df.index)*8)
df.groupby(level=df.index.names[:-1])[['time (ms)']].describe()

## Plot execution time

In [None]:
def plot(mean, std, rot=0):
    """Draw a bar chart titled 'Execution time (ms)' with error bars.

    Args:
        mean: object whose `plot` method draws the bars (a pandas DataFrame
            in this notebook).
        std: error-bar sizes, forwarded as `yerr`.
        rot: forwarded as `rot` to the plot call.
    """
    bar_options = dict(
        kind='bar',
        title='Execution time (ms)',
        yerr=std,
        rot=rot,
        figsize=[16, 8],
        colormap=cm.autumn,
        grid=True,
        legend=True,
    )
    axes = mean.plot(**bar_options)
    # Move the legend to the upper left corner.
    axes.legend(loc='upper left')

In [None]:
# Mean and standard deviation of the execution time over the repetitions of
# each configuration, with one column per library.
time_per_configuration = df.groupby(level=df.index.names[:-1])['time (ms)']
df_mean = time_per_configuration.mean().unstack('lib')
df_std = time_per_configuration.std().unstack('lib')
plot(df_mean, df_std, rot=45)

## Show profiling info

In [None]:
# For every (platform, lib, model, batch_size) configuration keep the row of
# the repetition with the smallest execution time.
# NB: .loc replaces the .ix indexer, which is no longer available in current
# pandas; the index tuples returned by idxmin() are passed as a list of keys.
fastest_repetitions = df.groupby(level=df.index.names[:-1])['time (ms)'].idxmin()
df_min = df \
    .loc[fastest_repetitions.tolist()] \
    .reset_index('repetition_id', drop=True)
df_min

In [None]:
# Profiling info of the fastest repetition for the chosen batch size,
# indexed by (model, lib).
batch_size = 1
# NB: sort_index() replaces sortlevel(), which is no longer available in current pandas.
df_model_lib = df_min[['dvdt_prof_info']] \
    .reset_index('platform', drop=True) \
    .reorder_levels([ 'batch_size', 'model', 'lib']) \
    .loc[batch_size] \
    .sort_index()
df_model_lib

In [None]:
# Time unit for the kernel enqueue tables built in the analysis sections below.
unit = 'ms'

## Analyse models

In [None]:
def analyse_model(df_model_lib, model, lib, min_pc=1.0, unit='ms'):
    """List the kernels that account for most of a model's execution time.

    Args:
        df_model_lib: DataFrame indexed by (model, lib) with a 'dvdt_prof_info'
            column holding the dvdt-prof trace.
        model: model label to select from `df_model_lib`.
        lib: library label to select from `df_model_lib`.
        min_pc: threshold on the '** Execution time (%) **' column; only rows
            strictly above it are returned.
        unit: time unit passed to both dvdt-prof helpers, so that the
            per-enqueue and the cumulative tables always agree.

    Returns:
        The cumulative table restricted to rows above `min_pc`, with its
        column index named '<model>, <lib>'.
    """
    trace = pw.index_calls(df_model_lib.loc[model].loc[lib]['dvdt_prof_info'])
    # All kernel enqueues.
    enqueues = pw.filter_calls(trace, ['clEnqueueNDRangeKernel'])
    df_kernel_enqueues = pw.df_kernel_enqueues(enqueues, unit=unit)
    # Kernel enqueues that take more than 'min_pc' % of the execution time.
    df_cum_time_num = pw.df_kernel_enqueues_cumulative_time_num(df_kernel_enqueues, unit)
    df_cum_time_num.columns.name = '%s, %s' % (model, lib)
    return df_cum_time_num[df_cum_time_num['** Execution time (%) **'] > min_pc]

In [None]:
# For each model, show the kernels that take more than 2% of the execution
# time with the 'opencl-clblast' library.
lib = 'opencl-clblast'
pd.set_option('display.max_columns', 3)
pd.set_option('display.max_rows', 20)
for model in ( 'bvlc-alexnet', 'bvlc-googlenet', 'deepscale-squeezenet-1.0', 'deepscale-squeezenet-1.1' ):
    display(analyse_model(df_model_lib, model, lib, min_pc=2.0))

## Analyse kernels

In [None]:
def analyse_kernel(df_model_lib, model, lib, kernel):
    """Tabulate every enqueue of `kernel` with its size arguments and GFLOPS.

    Args:
        df_model_lib: DataFrame indexed by (model, lib) with a 'dvdt_prof_info'
            column holding the dvdt-prof trace.
        model: model label to select from `df_model_lib`.
        lib: library label to select from `df_model_lib`.
        kernel: kernel name to analyse (matched against the 'name' of each
            enqueue).

    Returns:
        The kernel's rows of the enqueue table, extended with the columns
        'kSizeM', 'kSizeN', 'kSizeK' (values of kernel args 0, 1 and 2) and
        'GFLOPS' (2*M*N*K operations over the 'p3 - p2 (ms)' time).

    Raises:
        ValueError: if args 0, 1 and 2 were not each set exactly once on the
            kernel object of a matching enqueue, so the sizes are ambiguous.
    """
    trace = pw.index_calls(df_model_lib.loc[model].loc[lib]['dvdt_prof_info'])
    # All kernel enqueues.
    nqs = pw.filter_calls(trace, ['clEnqueueNDRangeKernel'])
    # NB: .loc replaces the .ix indexer, which is no longer available in current
    # pandas; copy so that adding columns never writes into a view.
    df_kernel = pw.df_kernel_enqueues(nqs, unit='ms').swaplevel().loc[kernel].copy()

    # All calls to set kernel args.
    set_args = pw.filter_calls(trace, ['clSetKernelArg'])

    # Group the raw values of args 0..2 by kernel object in a single pass,
    # instead of rescanning every clSetKernelArg call for every enqueue.
    size_args_by_kernel = {}
    for set_arg in set_args:
        if 0 <= set_arg['arg_index'] <= 2:
            size_args_by_kernel.setdefault(set_arg['kernel'], []).append(
                (set_arg['arg_index'], set_arg['arg_value']))

    # NOTE(review): assumes args 0, 1, 2 of the kernel are M, N, K - confirm for
    # kernels other than the Xgemm family.
    mnk_triples = []
    for nq in nqs:
        if nq['name'] != kernel:
            continue
        # Order by arg index, so M, N, K do not depend on the order of the calls.
        size_args = sorted(size_args_by_kernel.get(nq['kernel'], []), key=lambda arg: arg[0])
        if [ arg_index for (arg_index, _) in size_args ] != [0, 1, 2]:
            raise ValueError(
                "Cannot determine (M, N, K) for kernel %r (object %r): expected args 0, 1, 2 "
                "to be set exactly once, got arg indices %r"
                % (kernel, nq['kernel'], [ arg_index for (arg_index, _) in size_args ]))
        mnk_triples.append(tuple(pc.hex_str_as_int(arg_value) for (_, arg_value) in size_args))

    df_kernel['kSizeM'] = [ m for (m, _, _) in mnk_triples ]
    df_kernel['kSizeN'] = [ n for (_, n, _) in mnk_triples ]
    df_kernel['kSizeK'] = [ k for (_, _, k) in mnk_triples ]
    # 2*M*N*K operations; the time column is in ms, hence the 1e-3 factor.
    df_kernel['GFLOPS'] = (2.0*df_kernel['kSizeM']*df_kernel['kSizeN']*df_kernel['kSizeK']*1e-9) / (df_kernel['p3 - p2 (ms)']*1e-3)

    return df_kernel

In [None]:
# Show every enqueue of the XgemmDirectTN kernel for AlexNet with CLBlast.
# The 4 extra columns are the ones added by analyse_kernel() (kSizeM, kSizeN,
# kSizeK, GFLOPS).
# NOTE(review): 14 is presumably the column count of the enqueue table - confirm.
pd.set_option('display.max_columns', 14+4)
pd.set_option('display.max_rows', 30)
analyse_kernel(df_model_lib, 'bvlc-alexnet', 'opencl-clblast', 'XgemmDirectTN')

### TODO: Augment kernel enqueues with layer info

In [None]:
# def convert_time(t):
# #     get hh:mm:ss:decimals
#     h,m,s = t.split(".")[0].split(":")  #[0] hours, minutes, seconds [1] milliseconds ... 
#     tsec = (int(h)*3600) + (int(m)*60) + int(s)
#     total = float(tsec) + float("0." + t.split(".")[1]) 
#     return float(total)

In [None]:
# print len(trace)
# print len(trace_layer)
# epoch_layer=[]
# for tlc in trace_layer[1:len(trace_layer)]:
#     epoch_layer.append(convert_time(tlc['timestamp'].split(" ")[1]))
     
# print "````````````````````````````````````"
# print tlc['timestamp'].split(" ")
# print "````````````````````````````````````"
# print trace[len(trace)-1]['timestamp']['end']
# print "````````````````````````````````````"  
# p = len(epoch_layer)
# last_trace = 0


# # t = trace[0]['timestamp']['end'].split("T")[1]
# # t = convert_time(t)
# # print ("first converted %s last epoch layer %s" %(t,epoch_layer[0]))



# # t = trace[0]['timestamp']['end'].split("T")[1]
# # t = convert_time(t)
# # print ("second converted %s last epoch layer %s" %(t,epoch_layer[1]))

# lc = 0
# for i in range (0, len(trace)):
#     t = trace[i]['timestamp']['end'].split("T")[1]
#     nt = convert_time(t)
    
#     if (nt < epoch_layer[lc]):
#         print ("%s (%s) belongs to %s" %(nt, trace[i]['timestamp']['end'].split("T")[1],epoch_layer[lc]))
#     else:
#         print "##############################################"
#         print "new layer %s ( how many trace %s)"% (lc,i)
#         lc=lc+1


# t = trace[len(trace)-1]['timestamp']['end'].split("T")[1]
# t = convert_time(t)
# print ("last converted %s last epoch layer %s" %(t,epoch_layer[p-1]))