# [PUBLIC] Analyse kernel profiling

## Platforms

### Odroid-XU3

- [CPU] quad-core ARM Cortex-A15 @ 2000 MHz ("big");
- [CPU] quad-core ARM Cortex-A7 @ 1400 MHz ("LITTLE");
- [GPU] quad-core ARM Mali-T628 @ 600 MHz;
- [GPU] dual-core ARM Mali-T628 @ 600 MHz (not used);
- [GPU] OpenCL driver 12.0 ("r12p0");
- [GPU] OpenCL standard 1.2;
- [RAM] 2 GB

In [None]:
# Select the Odroid-XU3 data. This id is appended to the CK repository name
# ('ck-caffe-dvdt-prof-' + platform_id) and used in the bucket analysis directory name.
platform_id = 'odroid-xu3'

### Firefly-RK3399

- [CPU] dual-core ARM Cortex-A72 @ 1800 MHz ("big");
- [CPU] quad-core ARM Cortex-A53 @ 1416 MHz ("LITTLE");
- [GPU] quad-core ARM Mali-T860 @ 800 MHz;
- [GPU] OpenCL driver 13.0 ("r13p0-00rel0-git(a4271c9)");
- [GPU] OpenCL standard 1.2;
- [RAM] 4 GB

In [None]:
# platform_id = 'firefly-rk3399'

<a id="data"></a>
## Get the experimental data from Dropbox

**NB:** Please ignore this section if you are not interested in re-running or modifying this notebook. 

The experimental data was collected on the experimental platform and archived as follows:
```
$ cd `ck find ck-caffe:script:dvdt-prof`
$ python explore-dvdt-prof-libs-models-benchmarking.py
$ ck zip local:experiment:dvdt-prof-* --archive_name=ck-caffe-dvdt-prof-<platform_id>.zip
```
The data can be downloaded and extracted as follows:

```
$ wget http://dl.dropboxusercontent.com/u/<...>/ck-caffe/public/ck-caffe-dvdt-prof-<platform_id>.zip
$ ck add repo:ck-caffe-dvdt-prof-<platform_id> --zip=ck-caffe-dvdt-prof-<platform_id>.zip --quiet
```

## Includes

### Standard

In [None]:
import os
import sys
import json
import time
import math
import operator

### Scientific

In [None]:
import IPython as ip
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp

In [None]:
# Report the versions of the scientific packages imported above.
for _template, _module in (
    ('IPython version: %s', ip),
    ('NumPy version: %s', np),
    ('SciPy version: %s', sp),
    ('Pandas version: %s', pd),
    ('Matplotlib version: %s', mp),
):
    print(_template % _module.__version__)

In [None]:
# Plotting setup: use the 'classic' matplotlib style with a larger font.
import matplotlib.pyplot as plt
plt.style.use('classic') # https://matplotlib.org/users/dflt_style_changes.html
plt.rcParams.update({'font.size': 20})

# Colormaps for the bar charts below (cm.autumn is used by plot()).
from matplotlib import cm
# Render the figures inside the notebook.
%matplotlib inline

In [None]:
from IPython.display import display

### Collective Knowledge

In [None]:
# Collective Knowledge kernel; every CK query below goes through ck.access().
import ck.kernel as ck
print('CK version: %s' % ck.__version__)

In [None]:
# NB: Install dvdt-prof first e.g. as "ck install ck-caffe/package/tool-dvdt-prof-cjson".
# Look up the CK environment entries registered for the dvdt-prof tool.
r=ck.access({'action':'show', 'module_uoa':'env', 'tags':'tool,opencl,dvdt,prof'})
if r['return']>0:
    print ("Error: %s" % r['error'])
    exit(1)
# Report a missing environment explicitly instead of failing with an IndexError below.
if not r.get('lst'):
    print ("Error: %s" % "no CK environment entry found for tags 'tool,opencl,dvdt,prof'")
    exit(1)
# Get the path to the first returned environment entry.
dvdt_prof_dir=r['lst'][0]['meta']['env']['CK_ENV_TOOL_DVDT_PROF']
# Make the Python modules shipped with dvdt-prof importable.
dvdt_prof_src_python=os.path.join(dvdt_prof_dir,'src','python')
sys.path.append(dvdt_prof_src_python)
import prof_wrangler as pw
import prof_common as pc
# Run the test functions of both modules.
pw.test()
pc.test()

## Access experimental results

In [None]:
def get_experimental_results(repo_uoa, tags='dvdt-prof'):
    """Load the experimental results stored in a CK repository into a DataFrame.

    Searches `repo_uoa` for 'experiment' entries matching `tags`. Entries whose
    tags do not identify exactly one library and exactly one known model are
    skipped. For each remaining entry, every point file is read and one row
    is produced per statistical repetition.

    Args:
        repo_uoa: name of the CK repository to search.
        tags: tags passed to the CK 'search' action.

    Returns:
        A DataFrame indexed by (platform, lib, model, batch_size, repetition_id)
        with the columns 'time (ms)', 'dvdt_prof_info' and 'per_layer_info',
        sorted by index. An empty DataFrame if no points were loaded.

    NB: If a CK call reports an error, the error is printed and exit(1) is called.
    """
    module_uoa = 'experiment'
    r=ck.access({'action':'search', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'tags':tags})
    if r['return']>0:
        print ("Error: %s" % r['error'])
        exit(1)

    # Tags that identify neither a library nor a model.
    irrelevant_tags = [ 'dvdt-prof', 'caffe-time-opencl' ]
    # NB: Since it's easier to list all model tags than all lib tags, the latter list is not explicitly specified.
    known_model_tags = [ 'bvlc-alexnet', 'bvlc-googlenet', 'deepscale-squeezenet-1.0', 'deepscale-squeezenet-1.1' ]

    dfs = []
    for experiment in r['lst']:
        r_points = ck.access({
            'action':'list_points',
            'repo_uoa':experiment['repo_uoa'],
            'module_uoa':module_uoa,
            'data_uoa':experiment['data_uoa']
        })
        if r_points['return']>0:
            print ("Error: %s" % r_points['error'])
            exit(1)

        # Get (lib, model) from the entry tags: after dropping the irrelevant tags,
        # any tag that is not a known model tag is taken to be a lib tag.
        lib_model_tags = [ tag for tag in r_points['dict']['tags'] if tag not in irrelevant_tags ]
        model_tags = [ tag for tag in lib_model_tags if tag in known_model_tags ]
        lib_tags = [ tag for tag in lib_model_tags if tag not in known_model_tags ]
        # Skip entries that cannot be attributed to a single (lib, model) pair.
        if len(lib_tags)!=1 or len(model_tags)!=1:
            continue
        (lib, model) = (lib_tags[0], model_tags[0])

        for point in r_points['points']:
            with open(os.path.join(r_points['path'], 'ckp-%s.0001.json' % point)) as point_file:
                point_data_raw = json.load(point_file)
            characteristics_list = point_data_raw['characteristics_list']
            # DataFrame columns.
            data = [
                {
                    # features
                    'platform' : point_data_raw['features']['platform']['platform']['model'],
                    # choices
                    'lib' : lib,
                    'model' : model,
                    'batch_size' : np.int64(point_data_raw['choices']['env'].get('CK_CAFFE_BATCH_SIZE',[])),
                    # statistical repetitions
                    'repetition_id' : repetition_id,
                    # runtime characteristics
                    'time (ms)'     : np.float32(characteristics['run'].get('time_fw_ms',0)),
                    'dvdt_prof_info': characteristics['run'].get('dvdt_prof',[]),
                    'per_layer_info': characteristics['run'].get('per_layer_info',[]),
                }
                for (repetition_id, characteristics) in enumerate(characteristics_list)
                #if characteristics['run'].get('run_success','')!=''
            ]
            df = pd.DataFrame(data)
            df.columns.name = 'characteristics'
            df.index.name = 'index'
            df = df.set_index([ 'platform', 'lib', 'model', 'batch_size', 'repetition_id' ])
            dfs.append(df)

    # Concatenate once, after all experiments have been read. Concatenating an
    # empty list raises, so return an empty DataFrame in that case.
    if not dfs:
        return pd.DataFrame()
    # NB: sort_index() replaces sortlevel(), which has been removed from pandas.
    return pd.concat(dfs).sort_index()

In [None]:
# Load all results for the selected platform and show the complete table.
df = get_experimental_results(tags='dvdt-prof', repo_uoa='ck-caffe-dvdt-prof-'+platform_id)
pd.set_option('display.max_columns', len(df.columns))
pd.set_option('display.max_rows', len(df.index))
df

## Check execution time distribution

**NB:** The total execution time includes the profiling overhead, so it should not be interpreted as indicative of the platform's performance. The kernel execution time and the derived GFLOPS should be accurate.

In [None]:
# Raise the display limits so that the complete summary is shown.
pd.set_option('display.max_columns', len(df.columns))
pd.set_option('display.max_rows', len(df.index)*8)
# Summarise the time over all index levels except the last one (the repetition id).
df.groupby(level=df.index.names[:-1])[['time (ms)']].describe()

## Plot execution time

In [None]:
def plot(mean, std, rot=0):
    """Draw a bar chart of `mean` titled 'Execution time (ms)', with `std` as error bars.

    `rot` is the rotation of the tick labels. The legend is placed in the upper left corner.
    """
    axes = mean.plot(
        kind='bar', yerr=std, title='Execution time (ms)', colormap=cm.autumn,
        figsize=[16, 8], rot=rot, grid=True, legend=True)
    axes.legend(loc='upper left')

In [None]:
# Mean and standard deviation of the time over the repetitions, one column per library.
time_per_config = df.groupby(level=df.index.names[:-1])['time (ms)']
df_mean = time_per_config.mean().unstack('lib')
df_std = time_per_config.std().unstack('lib')
plot(df_mean, df_std, rot=45)

## Show profiling info

In [None]:
# For each configuration, keep the repetition with the minimum execution time.
# NB: .loc replaces the .ix indexer, which has been removed from pandas.
df_min = df \
    .loc[df.groupby(level=df.index.names[:-1])['time (ms)'].idxmin()] \
    .reset_index('repetition_id', drop=True)
df_min

In [None]:
# Select the profiling info for a single batch size, indexed by (model, lib).
batch_size = 1
# NB: sort_index() replaces sortlevel(), which has been removed from pandas.
df_model_lib = df_min[['dvdt_prof_info']] \
    .reset_index('platform', drop=True) \
    .reorder_levels([ 'batch_size', 'model', 'lib']) \
    .loc[batch_size] \
    .sort_index()
df_model_lib

In [None]:
# Time unit: passed to the dvdt-prof wrangler and used in the
# '** Execution time (<unit>) **' column names below.
unit='ms'

## Analyse models

In [None]:
# The values of the two index levels of df_model_lib: models first, then libs.
index_levels = df_model_lib.index.levels
models = index_levels[0]
libs = index_levels[1]

In [None]:
def concat(model, lib):
    """Return the two arguments formatted as a single 'model:lib' key."""
    key_format = '%s:%s'
    return key_format % (model, lib)

In [None]:
def analyse_model_lib(df_model_lib, model, lib, min_pc=1.0):
    """Summarise the kernel enqueues profiled for one model/library combination.

    Returns the rows of the cumulative table built by the dvdt-prof wrangler
    whose '** Execution time (%) **' value is greater than `min_pc`.

    NB: Reads the global `unit`.
    """
    profile = df_model_lib.loc[model].loc[lib]['dvdt_prof_info']
    calls = pw.index_calls(profile)
    # All kernel enqueues.
    enqueue_calls = pw.filter_calls(calls, ['clEnqueueNDRangeKernel'])
    enqueues = pw.df_kernel_enqueues(enqueue_calls, unit='ms')
    # Cumulative table, labelled with the 'model:lib' key.
    summary = pw.df_kernel_enqueues_cumulative_time_num(enqueues, unit)
    summary.columns.name = concat(model, lib)
    # Keep only the kernels above the threshold.
    above_threshold = summary['** Execution time (%) **'] > min_pc
    return summary[above_threshold]

In [None]:
# Analyse every (model, lib) combination; the results are keyed by 'model:lib'.
model_lib_analysis = {}
for model in models:
    for lib in libs:
        title = concat(model, lib)
        print('== %s ==' % title)
        try:
            analysis = analyse_model_lib(df_model_lib, model, lib, min_pc=0.0)
        except Exception:
            # Best effort: a combination that cannot be analysed is reported as missing.
            # NB: 'except Exception' (not a bare 'except') so that KeyboardInterrupt
            # and SystemExit are not swallowed.
            print('... missing ...'); print(''); continue
        model_lib_analysis[title] = analysis
        pd.options.display.max_columns = analysis.columns.size
        pd.options.display.max_rows = analysis.index.size
        display(analysis)
        print('')

### Compare no-tune/tune

In [None]:
# Per-kernel execution time without and with tuning, side by side.
# NB: Both frames use the same column name, which the suffixes tell apart.
model_lib_analysis['deepscale-squeezenet-1.1:opencl-clblast'][['** Execution time (ms) **']] \
    .join(
        model_lib_analysis['deepscale-squeezenet-1.1:opencl-clblast-tune'][['** Execution time (ms) **']],
        how='outer',
        lsuffix=' deepscale-squeezenet-1.1:opencl-clblast **',
        rsuffix=' deepscale-squeezenet-1.1:opencl-clblast-tune **'
    )

In [None]:
# Total of the execution time column for the 'opencl-clblast' analysis.
model_lib_analysis['deepscale-squeezenet-1.1:opencl-clblast'][['** Execution time (ms) **']].sum()

In [None]:
# Total of the execution time column for the 'opencl-clblast-tune' analysis.
model_lib_analysis['deepscale-squeezenet-1.1:opencl-clblast-tune'][['** Execution time (ms) **']].sum()

In [None]:
# Plot the per-kernel execution time without and with tuning.
# NB: An empty DataFrame is passed as std.
plot(
    mean=model_lib_analysis['deepscale-squeezenet-1.1:opencl-clblast'][['** Execution time (ms) **']].join(
        model_lib_analysis['deepscale-squeezenet-1.1:opencl-clblast-tune'][['** Execution time (ms) **']],
        how='outer',
        lsuffix=' deepscale-squeezenet-1.1:opencl-clblast **',
        rsuffix=' deepscale-squeezenet-1.1:opencl-clblast-tune **'
    ),
    std=pd.DataFrame(),
    rot=90)

## Analyse xGEMM kernels

In [None]:
def init_buckets(left=0, right=15):
    """Return all (M, N, K) triples whose components are 2**i for left <= i < right.

    The triples are ordered with the last component varying fastest.
    """
    sizes = [ 2 ** exponent for exponent in range(left, right) ]
    triples = []
    for size_m in sizes:
        for size_n in sizes:
            for size_k in sizes:
                triples.append((size_m, size_n, size_k))
    return triples

def distance(p1, p2):
    """Return the Euclidean distance between two (x, y, z) triples."""
    # NB: The triples are unpacked here rather than in the signature, because
    # tuple parameters are a syntax error in Python 3 (PEP 3113).
    (x1, y1, z1) = p1
    (x2, y2, z2) = p2
    dx = x2-x1
    dy = y2-y1
    dz = z2-z1
    s = np.float64(dx**2 + dy**2 + dz**2)
    return math.sqrt(s)

# Returns the bucket nearest to the triple according to the metric computed by distance().
# buckets is a list of power-of-two triples generated by init_buckets().
# If several buckets are equally near, the first one in the list is returned.
# If buckets is empty, (-1, -1, -1) is returned.
def get_nearest_bucket(buckets, triple):
    bucket = (-1, -1, -1)
    # NB: The builtin float is used because the np.float alias has been removed from NumPy.
    min_distance = float('inf')
    for cur_bucket in buckets:
        cur_distance = distance(triple, cur_bucket)
        if cur_distance < min_distance:
            min_distance = cur_distance
            bucket = cur_bucket
    return bucket

In [None]:
def analyse_xgemm_kernel(df_model_lib, model, lib, kernel):
    """Build a per-enqueue table for one xGEMM kernel of a model/library combination.

    For every enqueue of `kernel`, the table holds the execution time, the
    (M, N, K) triple taken from kernel arguments 0, 1 and 2, the nearest
    power-of-two bucket for that triple, and the derived Gflops and GFLOPS.

    Args:
        df_model_lib: DataFrame indexed by (model, lib) with a 'dvdt_prof_info' column.
        model: model index value.
        lib: library index value.
        kernel: name of the kernel to analyse.

    Returns:
        A DataFrame with the columns '** Execution time (ms) **',
        '** Number of enqueues **', 'kSizeM', 'bSizeM', 'kSizeN', 'bSizeN',
        'kSizeK', 'bSizeK', '** Gflops **' and '** GFLOPS **'.
    """
    # Get trace for lib and model.
    trace = pw.index_calls(df_model_lib.loc[model].loc[lib]['dvdt_prof_info'])
    # All calls to set kernel args.
    set_args = pw.filter_calls(trace, ['clSetKernelArg'])
    # All kernel enqueues.
    nqs = pw.filter_calls(trace, ['clEnqueueNDRangeKernel'])
    # Construct a DataFrame with info about kernel enqueues.
    # NB: .loc replaces the .ix indexer, which has been removed from pandas.
    df = pw.df_kernel_enqueues(nqs, unit='ms').swaplevel().loc[kernel]
    # NB: Take a copy, so that the columns added below are not written to a slice of another frame.
    df = df[['p3 - p2 (ms)', 'gws2']].copy()
    # As gws2 is always 1, we can use it to count the number of enqueues.
    df.columns = [ '** Execution time (ms) **', '** Number of enqueues **' ]
    df.columns.name = kernel
    # Augment the DataFrame with columns for the (M, N, K) triples.
    # The placeholder values are overwritten once the triples have been collected.
    df['kSizeM'] = 'M'; df['bSizeM'] = 'MM'
    df['kSizeN'] = 'N'; df['bSizeN'] = 'NN'
    df['kSizeK'] = 'K'; df['bSizeK'] = 'KK'
    # Initialise buckets.
    buckets = init_buckets()
    # Augment the DataFrame with the actual (M, N, K) triples.
    mnk_triples = []; mmnnkk_triples = []
    for nq in nqs:
        if nq['name'] == kernel:
            (M, N, K) = ('M', 'N', 'K'); (MM, NN, KK) = ('MM', 'NN', 'KK')
            # Scan the arg calls for the same kernel up to this enqueue; a later value
            # overwrites an earlier one.
            # NOTE(review): the break assumes that set_args is ordered by call_index - confirm.
            for set_arg in set_args:
                if (set_arg['call_index'] > nq['call_index']): break
                if (set_arg['kernel'] != nq['kernel']): continue
                arg_value = pc.hex_str_as_int(set_arg['arg_value'])
                if (set_arg['arg_index'] == 0): M = arg_value; MM = arg_value
                if (set_arg['arg_index'] == 1): N = arg_value; NN = arg_value
                if (set_arg['arg_index'] == 2): K = arg_value; KK = arg_value
            mnk_triples.append((M, N, K))
            mmnnkk_triples.append(get_nearest_bucket(buckets, (M, N, K)))
    df[['kSizeM', 'kSizeN', 'kSizeK']] = mnk_triples
    df[['bSizeM', 'bSizeN', 'bSizeK']] = mmnnkk_triples
    # Calculate Gflops and GFLOPS (Gflops/s).
    df['** Gflops **'] = 2*df['kSizeM']*df['kSizeN']*df['kSizeK']*1e-9
    df['** GFLOPS **'] = df['** Gflops **'] / (df['** Execution time (ms) **']*1e-3)
    return df

In [None]:
# Analyse every xGEMM kernel of every analysed (model, lib) combination;
# the results are keyed by 'model:lib:kernel'.
model_lib_kernel_analysis = {}
for model in models:
    for lib in libs:
        title = concat(model, lib)
        print('== %s ==' % title)
        # Test for a missing analysis explicitly rather than with a bare 'except',
        # which would also hide unrelated errors.
        if title not in model_lib_analysis:
            print(' ... missing ...'); print(''); continue
        analysis = model_lib_analysis[title]
        for kernel in analysis.index:
            # Only kernels with 'xgemm' in the name (in any case) are analysed.
            if 'xgemm' not in kernel.lower(): continue
            analysis_xgemm = analyse_xgemm_kernel(df_model_lib, model, lib, kernel)
            pd.options.display.max_columns = analysis_xgemm.columns.size
            pd.options.display.max_rows = analysis_xgemm.index.size
            display(analysis_xgemm)
            analysis_xgemm_stats = analysis_xgemm.describe()
            pd.options.display.max_columns = analysis_xgemm_stats.columns.size
            pd.options.display.max_rows = analysis_xgemm_stats.index.size
            display(analysis_xgemm_stats)
            model_lib_kernel_analysis[concat(title, kernel)] = analysis_xgemm
            print('')
        print('')

## Analyse xGEMM buckets for tuning

In [None]:
# Rows with a Gflops share (in percent) not above this threshold are hidden.
min_pc = 0
model_lib_bucket_analysis = {}
triple_columns = ['kSizeM', 'kSizeN', 'kSizeK']
bucket_columns = ['bSizeM', 'bSizeN', 'bSizeK']
time_column = '** Execution time (%s) **' % unit
for xgemm_analysis_key in model_lib_kernel_analysis:
    print('== %s ==' % xgemm_analysis_key)
    # Keep the actual and bucket triples in the index, so that they survive the aggregation.
    indexed = model_lib_kernel_analysis[xgemm_analysis_key].set_index(triple_columns + bucket_columns)
    # Sum the execution time and the number of enqueues per distinct triple.
    aggregated = indexed.groupby(level=indexed.index.names).sum()
    # Turn the actual triples back into columns; the bucket triples stay in the index.
    xgemm_analysis = aggregated.reset_index(level=triple_columns)
    xgemm_analysis.name = xgemm_analysis_key
    # Share of the total execution time, in percent.
    xgemm_analysis['** Execution time (%) **'] = \
        100 * (xgemm_analysis[time_column] / xgemm_analysis[time_column].sum())
    # The execution time is accumulated over several enqueues, so the operation count
    # is scaled by the number of enqueues before the GFLOPS are derived.
    gflops_per_enqueue = 2*xgemm_analysis['kSizeM']*xgemm_analysis['kSizeN']*xgemm_analysis['kSizeK']*1e-9
    xgemm_analysis['** Gflops **'] = xgemm_analysis['** Number of enqueues **'] * gflops_per_enqueue
    xgemm_analysis['** Gflops (%) **'] = 100 * (xgemm_analysis['** Gflops **'] / xgemm_analysis['** Gflops **'].sum())
    xgemm_analysis['** GFLOPS **'] = \
        xgemm_analysis['** Gflops **'] / (xgemm_analysis['** Execution time (ms) **']*1e-3)
    # Show the rows above the threshold, largest Gflops share first.
    pd.set_option('display.max_columns', len(xgemm_analysis.columns))
    pd.set_option('display.max_rows', len(xgemm_analysis.index))
    shown = xgemm_analysis[xgemm_analysis['** Gflops (%) **'] > min_pc]
    display(shown.sort_values(by=['** Gflops (%) **'], ascending=False))
    model_lib_bucket_analysis[xgemm_analysis_key] = xgemm_analysis
    print('')

In [None]:
# Directory for the per-kernel bucket statistics; created on first use.
bucket_analysis_dir = os.path.join(os.path.curdir, 'bucket-analysis-%s-tmp' % platform_id)
if not os.path.exists(bucket_analysis_dir):
    os.makedirs(bucket_analysis_dir)

for bucket_analysis_key in model_lib_bucket_analysis:
    print('== %s ==' % bucket_analysis_key)
    # Only analyses whose key contains 'tune' are processed.
    if 'tune' not in bucket_analysis_key:
        print('... skipping no-tune ...'); print(''); continue
    # Move the actual triples and the GFLOPS into the index, so that they are not summed below.
    bucket_analysis = model_lib_bucket_analysis[bucket_analysis_key] \
        .set_index(['kSizeM', 'kSizeN', 'kSizeK', '** GFLOPS **'], append=True)
    # Aggregate by the bucket size.
    bucket_analysis = bucket_analysis \
        .groupby(level=['bSizeM', 'bSizeN', 'bSizeK']).sum()
    # Sort by the kernel operations in the descending order.
    bucket_analysis = \
        bucket_analysis[bucket_analysis['** Gflops (%) **'] > min_pc] \
        .sort_values(by=['** Gflops (%) **'], ascending=False)
    # Display.
    pd.options.display.max_columns = len(bucket_analysis.columns)
    pd.options.display.max_rows = len(bucket_analysis.index)
    display(bucket_analysis)
    # Dump buckets analysis to JSON.
    bucket_stats = []
    for index, row in bucket_analysis.iterrows():
        stats = {}
        # NB: The bucket sizes are converted to builtin ints: the index values may be
        # NumPy integers, which the json module cannot serialise on Python 3.
        stats['bSizeM'] = int(index[0])
        stats['bSizeN'] = int(index[1])
        stats['bSizeK'] = int(index[2])
        stats['Number of enqueues']  = row['** Number of enqueues **']
        stats['Execution time (ms)'] = row['** Execution time (ms) **']
        stats['Execution time (%)']  = row['** Execution time (%) **']
        stats['Gflops (%)'] = row['** Gflops (%) **']
        stats['Gflops'] = row['** Gflops **']
        bucket_stats.append(stats)
    bucket_analysis_file = os.path.join(bucket_analysis_dir, '%s.json' % bucket_analysis_key)
    with open(bucket_analysis_file, 'w') as f:
        json.dump(bucket_stats, f, indent=2)
    print('')