# Analyse kernel profiling

Sample command:
```
ck benchmark program:caffe --env.CK_CAFFE_BATCH_SIZE=1 \
  --deps.lib-caffe=cb3e77cde4b54140 --deps.caffemodel=ae96844061a5678d \
  --cmd_key=time_gpu --dvdt_prof --skip_stat_analysis \
  --tags=prof,alexnet --record --record_uoa=prof-training-alexnet \
  --repetitions=3
```

## Includes

### Standard

In [None]:
import os
import sys
import json
import time

### Scientific

In [None]:
import IPython as ip
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp

In [None]:
# Report the version of every scientific package imported above, so that the
# numbers produced by this notebook can be tied to a known environment.
version_reports = (
    ('IPython version: %s', ip),
    ('NumPy version: %s', np),
    ('SciPy version: %s', sp),
    ('Pandas version: %s', pd),
    ('Matplotlib version: %s', mp),
)
for template, module in version_reports:
    print(template % module.__version__)

In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
# import scipy.stats as st

### Collective Knowledge

In [None]:
# Collective Knowledge Python API; `ck.access` is what the cells below use to
# query experiment and environment entries.
import ck.kernel as ck
print('CK version: %s' % ck.__version__)

## Access experimental results

In [None]:
def get_experimental_results(tags, repo_uoa='local'):
    """Collect the recorded CK experiment points matching `tags` into a DataFrame.

    Args:
        tags: Comma-separated CK tags used to search for experiment entries.
        repo_uoa: CK repository to search and to read points from. Defaults
            to 'local', the value the original code meant to pass.

    Returns:
        A DataFrame with one row per run whose 'run_success' field is
        non-empty, indexed by ('program', 'tags', 'repetition'), with the
        columns 'time (ms)', 'per_layer_info' and 'dvdt_prof'.

    Raises:
        RuntimeError: If a CK request reports a non-zero return code.
        ValueError: If the search yields no experiment points at all.
    """
    module_uoa = 'experiment'

    def ck_access(request):
        # CK reports failures through a non-zero 'return' code plus an 'error'
        # string. Raise instead of calling exit(), which in a notebook acts on
        # the whole session rather than surfacing the error in the cell.
        reply = ck.access(request)
        if reply['return'] > 0:
            raise RuntimeError("Error: %s" % reply['error'])
        return reply

    found = ck_access({'action': 'search', 'repo_uoa': repo_uoa,
                       'module_uoa': module_uoa, 'tags': tags})
    dfs = []
    for experiment in found['lst']:
        # The key used to be misspelled 'repo_oua', so the repository was
        # never actually passed to CK.
        listing = ck_access({'action': 'list_points', 'repo_uoa': repo_uoa,
                             'module_uoa': module_uoa,
                             'data_uoa': experiment['data_uoa']})
        path = listing['path']
        for point in listing['points']:
            # Only the '.0001' file of each point is read.
            point_path = os.path.join(path, 'ckp-%s.0001.json' % point)
            with open(point_path) as point_file:
                point_data_raw = json.load(point_file)
            # DataFrame columns: one row per run with a non-empty 'run_success'.
            rows = [
                {
                    'time (ms)': np.float32(entry['run'].get('time_fw_ms', 0)),
                    'per_layer_info': entry['run'].get('per_layer_info', []),
                    'dvdt_prof': entry['run'].get('dvdt_prof', []),
                }
                for entry in point_data_raw['characteristics_list']
                if entry['run'].get('run_success', '') != ''
            ]
            df = pd.DataFrame(rows)
            df.columns.name = 'run characteristic'
            df.index.name = 'repetition'
            # DataFrame indices. The 'tags' level is a fixed placeholder.
            df['program'] = point_data_raw['choices']['data_uoa']
            df['tags'] = 'unknown'
            df = df.set_index(['program', 'tags'], append=True)
            df = df.reorder_levels(('program', 'tags', 'repetition'))
            dfs.append(df)
    if not dfs:
        # pd.concat([]) would raise an unhelpful "No objects to concatenate".
        raise ValueError("No experiment points found for tags '%s' in repo '%s'"
                         % (tags, repo_uoa))
    return pd.concat(dfs)

In [None]:
# Load the experiment points found by a CK search for the tags 'alexnet,prof'.
results = get_experimental_results('alexnet,prof')

## Show execution time

In [None]:
# Display the full results table.
results

In [None]:
# Only the execution time column (populated from 'time_fw_ms').
results[['time (ms)']]

In [None]:
# Only the raw 'dvdt_prof' column.
results[['dvdt_prof']]

## Plot execution time

In [None]:
def plot(mean, std):
    """Draw `mean` as a bar chart with `std` as error bars.

    The chart is titled 'Execution time (ms)' and its legend is placed in
    the upper left corner. Nothing is returned.
    """
    axes = mean.plot(
        kind='bar',
        yerr=std,
        title='Execution time (ms)',
        colormap=cm.autumn,
        figsize=[16, 8],
        rot=0,
        grid=True,
        legend=True,
    )
    axes.legend(loc='upper left')

## Show profiling info

In [None]:
# For now, analyse only the first row of the results table.
dvdt_prof_column = results['dvdt_prof']
trace = dvdt_prof_column.iloc[0]
trace_layer = results['per_layer_info'].iloc[0]
# An empty trace means there is nothing to analyse in the cells below.
if not trace:
    raise Exception("No OpenCL profiling information!")
# Show the index entry identifying the row that was picked.
dvdt_prof_column.index[0]


In [None]:
# Locate the dvdt_prof tool through its CK environment entry.
env_tags = 'tool,opencl,dvdt,prof'
r = ck.access({'action':'show', 'module_uoa':'env', 'tags':env_tags})
if r['return'] > 0:
    # Raise rather than exit(): the failure then shows up as a normal cell error.
    raise RuntimeError("Error: %s" % r['error'])
if not r['lst']:
    # Without this check an empty result fails below with a bare IndexError.
    raise RuntimeError("No CK environment found for tags '%s'" % env_tags)
# Get the path from the first returned environment entry.
dvdt_prof_dir = r['lst'][0]['meta']['env']['CK_ENV_TOOL_DVDT_PROF']
dvdt_prof_src_python = os.path.join(dvdt_prof_dir, 'src', 'python')
# Re-running this cell must not keep growing sys.path.
if dvdt_prof_src_python not in sys.path:
    sys.path.append(dvdt_prof_src_python)
# Import the dvdt_prof helper modules and run the test() entry point of each.
import prof_wrangler as pw
pw.test()
import prof_common as pc
pc.test()

In [None]:
# Both traces use the same time unit.
unit = unit2 = 'ms'
# Pass both traces through prof_wrangler.index_calls.
# NOTE(review): presumably re-running this cell indexes already-indexed traces - confirm.
trace = pw.index_calls(trace)
trace_layer = pw.index_calls(trace_layer)


In [None]:
# Number of entries in the OpenCL trace and in the per-layer trace.
# The parenthesised form works on both Python 2 and Python 3 and matches the
# version-reporting cell at the top of the notebook.
print(len(trace))
print(len(trace_layer))

### Kernel enqueues

In [None]:
# Partial trace only containing kernel enqueues.
kernel_enqueues = pw.filter_calls(trace, ['clEnqueueNDRangeKernel'])
# Kernel enqueues as a DataFrame.
# `unit` is 'ms' (set in an earlier cell); the later cells address this frame's
# index by the level names 'call_index' and 'name'.
df_kernel_enqueues = pw.df_kernel_enqueues(kernel_enqueues, unit)
df_kernel_enqueues

In [None]:
# Give every enqueue a count of 1 so that a grouped sum yields the number of
# enqueues per group.
df_kernel_enqueues['count'] = 1

In [None]:
# Column sums per kernel name (the 'name' index level).
df_kernel_enqueues.groupby(level='name').sum()

In [None]:
# df_kernel_enqueues.info(memory_usage=True)

In [None]:
# Number the enqueues cyclically (0, 1, 2, 3, 0, 1, ...) and expose that number
# as an extra 'kernel_index' index level.
# NOTE(review): assumes every repetition enqueues exactly 4 kernels - confirm.
num_enqueues_total = len(kernel_enqueues)
num_enqueues_per_repetition = 4
enqueue_positions = pd.Series(range(num_enqueues_total))
df_kernel_enqueues['kernel_index'] = (enqueue_positions % num_enqueues_per_repetition).values
df_kernel_enqueues = df_kernel_enqueues.set_index('kernel_index', append=True)
df_kernel_enqueues = df_kernel_enqueues.reorder_levels(['call_index','kernel_index','name'])

In [None]:
# Descriptive statistics of every column, grouped by the 'kernel_index' level.
df_kernel_enqueues_stats = df_kernel_enqueues.groupby(level='kernel_index').describe()
df_kernel_enqueues_stats

# KERNEL XgemmDirectNN

In [None]:
kernel_name = 'XgemmDirectNN'
# Partial trace only containing kernel enqueues.
kernel_enqueues = pw.filter_calls(trace, ['clEnqueueNDRangeKernel'])
# Kernel enqueues as a DataFrame.
df_kernel_enqueues = pw.df_kernel_enqueues(kernel_enqueues, unit)
# Rows of this kernel only. `.loc` replaces the `.ix` indexer, which has been
# removed from pandas; the copy keeps the column assignments below from
# writing into a slice of df_kernel_enqueues.
df_xgemm_directnn_enqueues = df_kernel_enqueues.swaplevel().loc[kernel_name].copy()
# Create the size columns up front so they can be assigned as a group below.
df_xgemm_directnn_enqueues['kSizeM'] = 'N/A'
df_xgemm_directnn_enqueues['kSizeN'] = 'N/A'
df_xgemm_directnn_enqueues['kSizeK'] = 'N/A'

setkernel_enqueues = pw.filter_calls(trace, ['clSetKernelArg'])
# Same partial trace as above; no need to filter the trace a second time.
kernel_entries = kernel_enqueues
# Kernel arguments 0, 1 and 2 are stored as kSizeM, kSizeN and kSizeK.
size_arg_indices = (0, 1, 2)
xgemm_entries = []
for enqueue in kernel_entries:
    if enqueue['name'] != kernel_name:
        continue
    # Every clSetKernelArg call made on this enqueue's kernel handle for one of
    # the size arguments, in trace order.
    sizes = tuple(
        pc.hex_str_as_int(set_arg['arg_value'])
        for set_arg in setkernel_enqueues
        if set_arg['kernel'] == enqueue['kernel']
        and set_arg['arg_index'] in size_arg_indices
    )
    # NOTE(review): the match is by kernel handle over the whole trace, so a
    # handle that is set up more than once yields more than three values.
    # Fail with a clear message instead of at the column assignment below.
    if len(sizes) != len(size_arg_indices):
        raise ValueError("Expected %d size arguments for kernel %s (handle %s), found %d"
                         % (len(size_arg_indices), kernel_name, enqueue['kernel'], len(sizes)))
    xgemm_entries.append(sizes)
print(len(xgemm_entries))

df_xgemm_directnn_enqueues[['kSizeM', 'kSizeN', 'kSizeK']] = xgemm_entries
df_xgemm_directnn_enqueues


# KERNEL XgemmDirectTN


In [None]:
kernel_name = 'XgemmDirectTN'
# Partial trace only containing kernel enqueues.
kernel_enqueues = pw.filter_calls(trace, ['clEnqueueNDRangeKernel'])
# Kernel enqueues as a DataFrame.
df_kernel_enqueues = pw.df_kernel_enqueues(kernel_enqueues, unit)
# Rows of this kernel only. `.loc` replaces the `.ix` indexer, which has been
# removed from pandas; the copy keeps the column assignments below from
# writing into a slice of df_kernel_enqueues.
# NOTE(review): the variable name says "directnn" although this cell handles
# XgemmDirectTN; the name is kept so that existing references keep working.
df_xgemm_directnn_enqueues = df_kernel_enqueues.swaplevel().loc[kernel_name].copy()
# Create the size columns up front so they can be assigned as a group below.
df_xgemm_directnn_enqueues['kSizeM'] = 'N/A'
df_xgemm_directnn_enqueues['kSizeN'] = 'N/A'
df_xgemm_directnn_enqueues['kSizeK'] = 'N/A'

setkernel_enqueues = pw.filter_calls(trace, ['clSetKernelArg'])
# Same partial trace as above; no need to filter the trace a second time.
kernel_entries = kernel_enqueues
# Kernel arguments 0, 1 and 2 are stored as kSizeM, kSizeN and kSizeK.
size_arg_indices = (0, 1, 2)
xgemm_entries = []
for enqueue in kernel_entries:
    if enqueue['name'] != kernel_name:
        continue
    # Every clSetKernelArg call made on this enqueue's kernel handle for one of
    # the size arguments, in trace order.
    sizes = tuple(
        pc.hex_str_as_int(set_arg['arg_value'])
        for set_arg in setkernel_enqueues
        if set_arg['kernel'] == enqueue['kernel']
        and set_arg['arg_index'] in size_arg_indices
    )
    # NOTE(review): the match is by kernel handle over the whole trace, so a
    # handle that is set up more than once yields more than three values.
    # Fail with a clear message instead of at the column assignment below.
    if len(sizes) != len(size_arg_indices):
        raise ValueError("Expected %d size arguments for kernel %s (handle %s), found %d"
                         % (len(size_arg_indices), kernel_name, enqueue['kernel'], len(sizes)))
    xgemm_entries.append(sizes)
print(len(xgemm_entries))

df_xgemm_directnn_enqueues[['kSizeM', 'kSizeN', 'kSizeK']] = xgemm_entries
df_xgemm_directnn_enqueues


# KERNEL XgemmDirectTT


In [None]:
kernel_name = 'XgemmDirectTT'
# Partial trace only containing kernel enqueues.
kernel_enqueues = pw.filter_calls(trace, ['clEnqueueNDRangeKernel'])
# Kernel enqueues as a DataFrame.
df_kernel_enqueues = pw.df_kernel_enqueues(kernel_enqueues, unit)
# Rows of this kernel only. `.loc` replaces the `.ix` indexer, which has been
# removed from pandas; the copy keeps the column assignments below from
# writing into a slice of df_kernel_enqueues.
# NOTE(review): the variable name says "directnn" although this cell handles
# XgemmDirectTT; the name is kept so that existing references keep working.
df_xgemm_directnn_enqueues = df_kernel_enqueues.swaplevel().loc[kernel_name].copy()
# Create the size columns up front so they can be assigned as a group below.
df_xgemm_directnn_enqueues['kSizeM'] = 'N/A'
df_xgemm_directnn_enqueues['kSizeN'] = 'N/A'
df_xgemm_directnn_enqueues['kSizeK'] = 'N/A'

setkernel_enqueues = pw.filter_calls(trace, ['clSetKernelArg'])
# Same partial trace as above; no need to filter the trace a second time.
kernel_entries = kernel_enqueues
# Kernel arguments 0, 1 and 2 are stored as kSizeM, kSizeN and kSizeK.
size_arg_indices = (0, 1, 2)
xgemm_entries = []
for enqueue in kernel_entries:
    if enqueue['name'] != kernel_name:
        continue
    # Every clSetKernelArg call made on this enqueue's kernel handle for one of
    # the size arguments, in trace order.
    sizes = tuple(
        pc.hex_str_as_int(set_arg['arg_value'])
        for set_arg in setkernel_enqueues
        if set_arg['kernel'] == enqueue['kernel']
        and set_arg['arg_index'] in size_arg_indices
    )
    # NOTE(review): the match is by kernel handle over the whole trace, so a
    # handle that is set up more than once yields more than three values.
    # Fail with a clear message instead of at the column assignment below.
    if len(sizes) != len(size_arg_indices):
        raise ValueError("Expected %d size arguments for kernel %s (handle %s), found %d"
                         % (len(size_arg_indices), kernel_name, enqueue['kernel'], len(sizes)))
    xgemm_entries.append(sizes)
print(len(xgemm_entries))

df_xgemm_directnn_enqueues[['kSizeM', 'kSizeN', 'kSizeK']] = xgemm_entries
df_xgemm_directnn_enqueues

### Kernel per layer
 


In [None]:
def convert_time(t):
    """Convert a 'hh:mm:ss[.fraction]' time string to seconds as a float.

    Args:
        t: Time of day such as '12:34:56.789'. The fractional part is
            optional; anything after a second '.' is ignored.

    Returns:
        hours * 3600 + minutes * 60 + seconds, plus the fractional part.

    Raises:
        ValueError: If the part before the '.' is not three ':'-separated
            integers, or the fractional part is not numeric.
    """
    parts = t.split(".")
    h, m, s = parts[0].split(":")
    total = float(int(h) * 3600 + int(m) * 60 + int(s))
    # A timestamp with no fractional part used to fail with an IndexError.
    if len(parts) > 1:
        total += float("0." + parts[1])
    return total

In [None]:
print(len(trace))
print(len(trace_layer))
# Per-layer timestamps converted by convert_time (seconds within the day).
# The first per-layer entry is skipped.
epoch_layer = [
    convert_time(layer_entry['timestamp'].split(" ")[1])
    for layer_entry in trace_layer[1:]
]
if not epoch_layer:
    # Without layer timestamps the lookups below would fail with an IndexError.
    raise Exception("No per-layer profiling information!")

print("````````````````````````````````````")
print(trace_layer[-1]['timestamp'].split(" "))
print("````````````````````````````````````")
print(trace[-1]['timestamp']['end'])
print("````````````````````````````````````")
p = len(epoch_layer)
last_trace = 0

# Walk the OpenCL calls in order and compare each call's end time with the
# timestamp of the current layer; a call ending at or after it closes the layer.
lc = 0
for i, call in enumerate(trace):
    if lc >= p:
        # Every layer timestamp has been passed; indexing epoch_layer[lc] here
        # used to raise an IndexError.
        print("%s call(s) from index %s onwards end after the last layer timestamp"
              % (len(trace) - i, i))
        break
    t = call['timestamp']['end'].split("T")[1]
    nt = convert_time(t)

    if nt < epoch_layer[lc]:
        print("%s (%s) belongs to %s" % (nt, t, epoch_layer[lc]))
    else:
        print("##############################################")
        print("new layer %s ( how many trace %s)" % (lc, i))
        lc = lc + 1


t = convert_time(trace[-1]['timestamp']['end'].split("T")[1])
print("last converted %s last epoch layer %s" % (t, epoch_layer[p-1]))
    