# `libcusmm`: Explore the Training Data

This notebook allows you to explore the training data collected from autotuning before proceeding to training.

## Import libraries

In [None]:
import re, sys, os, json, random
import numpy as np
import pandas as pd
import dask.dataframe as dd
from nb_helper import *

## Read training data from autotuning folders 

Read from files of form `tune_*x*x*/raw_training_data_*x*x*_algo.csv`. 
If you would rather read from the aggregated Parquet files (recommended), skip ahead to the section "Read training data from Parquet files" below.

### Path to autotuning data

Provide the path to the autotuning data:
- You can use the bash cell below to navigate your filetree:

In [None]:
%%bash
# List the autotuning folders themselves (-d) rather than their contents.
# Replace AUTOTUNING_DATA_PATH with the path to your autotuning data before running.
ls -ad AUTOTUNING_DATA_PATH/tune_*x*x*/

- Then, copy the path you substituted for `AUTOTUNING_DATA_PATH` into the Python variable `autotuning_data_path` below:

In [None]:
# Path to the folder that contains the `tune_*x*x*` autotuning folders.
# A leading '~' is expanded to the user's home directory; any other path
# (absolute or relative) is passed through unchanged.
autotuning_data_path = os.path.expanduser('')
# Helper not defined in this notebook (presumably provided by nb_helper) -- it is
# expected to validate the path; confirm its exact behaviour there.
check_autotuning_data_path(autotuning_data_path)

### Set options

Set the following options appropriately:

In [None]:
# How many / which data folders to read. Options:
# - 'all': reads from all available data folders.
#   Beware, this might result in memory errors if large amounts of data are made available
# - a number: reads this number of data folders (e.g. 100)
# - a regex: reads the data folders with matching regex (e.g. tune_4x*x*)
to_read = 100

In [None]:
# Algorithms to explore. Options: all, tiny, small, medium
algorithm = get_algorithm_to_explore(
    'all'
)

In [None]:
# Build the list of autotuning folders to read, based on the `to_read` option
# and the autotuning data path chosen above.
folders_to_read = get_folders_to_read(
    to_read,
    autotuning_data_path,
)

### Read training data

In [None]:
# The helper returns two collections: the raw-data files and the derived-data files
# to read for the selected folders and algorithm.
(raw_files_to_read,
 derived_files_to_read) = get_files_to_read(folders_to_read, algorithm)

In [None]:
%%time
num_f = len(files_to_read)
data_raw = dd.read_csv(raw_files_to_read, dtype={}).set_index("Unnamed: 0")
data_derived = dd.read_csv(derived_files_to_read, dtype={}).set_index("Unnamed: 0")

In [None]:
# merge the two:
# TODO: the merge of `data_raw` and `data_derived` is not implemented in this cell.
# The inspection cells below operate on `data`, which this notebook only assigns
# in the Parquet section, so following the CSV path alone leaves `data` undefined.

## Read training data from Parquet files

Read from files of form `training_data_algorithm.parquet`.

### Path to autotuning data

Provide the path to the autotuning data:
- You can use the bash cell below to navigate your filetree:

In [None]:
%%bash
# List the aggregated Parquet files found directly under the data folder.
# Replace AUTOTUNING_DATA_PATH with the path to your training data before running.
ls -ad AUTOTUNING_DATA_PATH/*.parquet

- Then, copy the path you substituted for `AUTOTUNING_DATA_PATH` into the Python variable `training_data_path` below:

In [None]:
# Folder holding the aggregated `training_data_<algorithm>.parquet` files.
# A leading '~' is expanded to the user's home directory; any other path
# (absolute or relative) is passed through unchanged.
training_data_path = os.path.expanduser('../tune_dataset_V100/')

In [None]:
# Algorithm to explore. Options: tiny, small, medium, largeDB1, largeDB2
algorithm = 'small'

In [None]:
# Build the file name `training_data_<algorithm>.parquet` inside the training data
# folder, then open it as a Dask dataframe.
parquet_data_file = os.path.join(
    training_data_path,
    "training_data_{}.parquet".format(algorithm),
)
data = dd.read_parquet(parquet_data_file)

## Data inspection

### Data head

In [None]:
# Show the first rows of the data, `page_width` columns at a time.
# `page_width` is not defined in this notebook (presumably provided by nb_helper).
n_cols = len(data.columns.values)
for i in range(0, n_cols, page_width):
    column_page = data.iloc[:, i:i+page_width]
    display(column_page.head())

### Data description

In [None]:
# `sys.getsizeof` on a lazy Dask dataframe measures only the Python wrapper object,
# not the data it refers to. Summing the per-column memory usage gives the size
# the data would occupy in memory. Note: this triggers a computation over the data.
data_size_bytes = data.memory_usage(deep=True).sum().compute()
print('Data size        :', data_size_bytes/10**6, 'MB')
print('Number of columns:', len(data.columns.values))
print('Number of rows   : {:,}'.format(len(data.index)))

In [None]:
#for i in range(0, len(data.columns.values), page_width):
#    display(data.iloc[:,i:i+page_width].describe())

### Columns

In [None]:
# Print one line per column: the name left-aligned in a 40-character field,
# followed by the column's dtype.
for col, col_dtype in data.dtypes.items():
    print('{:<40} {}'.format(col, col_dtype))

In [None]:
# Feature categories
# Matrix dimensions identifying a kernel
mnk = ['m', 'n', 'k']
# Kernel parameters; only those present as columns in `data` are kept.
# The filter preserves the declaration order below: the original set intersection
# produced an arbitrary order that could change between kernel sessions,
# which reshuffled the columns of every table and plot built from this list.
kernel_pars = ['algorithm', 'threads_per_blk', 'grouping', 'minblocks',
               'tile_m', 'tile_n', 'w', 'v', 'nbytes_smem', 'nbytes_cmem', 'regs_per_thread']
available_columns = set(data.columns.values)
kernel_pars = [par for par in kernel_pars if par in available_columns]
# Performance columns
perf =  ['perf (Gflop/s)', 'perf_scaled']
# Columns listed as common features (not used further in the visible cells)
common = ['Gflops', 'mxnxk', 'size_a', 'size_b', 'size_c', 'nblks', 
          'warps_per_blk', 'nwarps', 'sm_desired', 'nthreads', 'ru_param_stack_unroll_factor']

### Features

Features in the left-most column correspond to "raw" parameters
* **green** kernel parameters 
* **grey** GPU card properties (taken from Nvidia/AMD documentation) 
* **pink** autotuning parameters (taken from DBCSR codebase) 

Other features correspond to derived parameters, computed from the "raw" parameters
* **yellow** matrix sizes
* **light grey** launch parameters
* **blue** and **purple** estimations of resource usages

![parameters dependency graph](libsmm_acc_predictive_modeling_features.png)

In [None]:
# Row-count threshold: very long operations are not performed on more rows than this
thresh = 3 * 10**5

In [None]:
import pandas_profiling

# Profile the whole dataframe when it is small enough; otherwise profile a random
# subsample whose expected size is `thresh` rows.
n_rows_data = len(data)
if n_rows_data > thresh:
    data_to_profile = data.sample(frac=thresh / n_rows_data)
else:
    data_to_profile = data

pandas_profiling.ProfileReport(data_to_profile.compute())

### Data visualization

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Get Series from Dask to Pandas.
# The three columns are materialised with a single compute() so the Dask graph is
# evaluated once instead of three times; the resulting Series are the same.
plot_columns = data[['mxnxk', 'perf (Gflop/s)', 'perf_scaled']].compute()
data_mxnxk = plot_columns['mxnxk']
data_perf = plot_columns['perf (Gflop/s)']
data_perf_scaled = plot_columns['perf_scaled']

In [None]:
# Scatter of performance against m*n*k, with a logarithmic x axis
fig, ax = plt.subplots()
ax.semilogx(data_mxnxk, data_perf, '.', markersize=1)
ax.set_xlabel('Training (m, n, k) triplets (in order of increasing m*n*k)')
ax.set_ylabel('Performance [Gflops]')

### Data visualization (scaled performance)

In [None]:
# Scatter of scaled performance against m*n*k, with linear axes
fig, ax = plt.subplots()
ax.plot(data_mxnxk, data_perf_scaled, '.', markersize=1)
ax.set_xlabel('Training (m, n, k) triplets (in order of increasing m*n*k)')
ax.set_ylabel('Performance scaled (overall)')

### Performance profile

In [None]:
# Choose (m, n, k) triplet
m_plot, n_plot, k_plot = (4, 4, 4)

In [None]:
# Select the rows of the chosen (m, n, k) triplet with one combined boolean mask.
# The original chained three [] selections, applying masks built on the unfiltered
# frame to already-filtered frames; that forces index realignment and can fail
# when index labels are not unique.
mnk_mask = (data['m'] == m_plot) & (data['n'] == n_plot) & (data['k'] == k_plot)
# Sort into a new frame (no in-place mutation) so re-running the cell is idempotent
data_mnk = data[mnk_mask].compute().sort_values(by='perf (Gflop/s)', ascending=True)
plt.plot(data_mnk['perf (Gflop/s)'].values)
plt.xlabel('parameter set')
plt.ylabel('perf (Gflop/s)')
plt.title('Performance profile for kernel ' + str(m_plot) + 'x'+ str(n_plot) + 'x'+ str(k_plot))

In [None]:
# Histogram of the performance of the selected (m, n, k) kernel, drawn with Bokeh
from bokeh.plotting import figure 
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.io import output_notebook, show
output_notebook()

# Create histogram: one row per bin with its count and its left/right edges
num_bins = 100 
hist, edges = np.histogram(data_mnk['perf (Gflop/s)'], bins=num_bins)
df_hist = pd.DataFrame({'hist': hist, 'left': edges[:-1], 'right': edges[1:]})
source = ColumnDataSource(df_hist)

# Create tool: hovering over a bar shows its count and its bounds
# (tooltip label spelling fixed to match the y-axis label)
hover = HoverTool(tooltips=[('# occurrences', '@hist'), ('low', '@left'), ('high', '@right')])

# Create the figure
p = figure(plot_width=800, plot_height=800, title="Performance histogram",
           toolbar_location=None, tools="")
p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Performance (GFlop/s)"
p.xaxis.major_label_orientation = 1.2
p.yaxis.axis_label = "# occurrences"
p.quad(source=source, bottom=0, top='hist', left='left', right='right', fill_color='blue')
p.add_tools(hover)
show(p)


In [None]:
# Histogram of the scaled performance of the selected (m, n, k) kernel, drawn with Bokeh
from bokeh.plotting import figure 
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.io import output_notebook, show
output_notebook()

# Create histogram: one row per bin with its count and its left/right edges
num_bins = 100 
hist, edges = np.histogram(data_mnk['perf_scaled'], bins=num_bins)
df_hist = pd.DataFrame({'hist': hist, 'left': edges[:-1], 'right': edges[1:]})
source = ColumnDataSource(df_hist)

# Create tool: hovering over a bar shows its count and its bounds
# (tooltip label spelling fixed to match the y-axis label)
hover = HoverTool(tooltips=[('# occurrences', '@hist'), ('low', '@left'), ('high', '@right')])

# Create the figure; the title names the scaled quantity so this plot can be told
# apart from the unscaled performance histogram
p = figure(plot_width=800, plot_height=800, title="Scaled performance histogram",
           toolbar_location=None, tools="")
p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Performance scaled"
p.xaxis.major_label_orientation = 1.2
p.yaxis.axis_label = "# occurrences"
p.quad(source=source, bottom=0, top='hist', left='left', right='right', fill_color='blue')
p.add_tools(hover)
show(p)

In [None]:
# Top slices of perf. distribution
# Keys are percentages below the maximum performance; each value is a list that
# receives the rows of `data_mnk` falling within that slice.
pars_autotuning_top = {perc: list() for perc in (5, 2, 1, 0.5)}

# Best-performing row of the selected kernel, restricted to the parameter columns
perf_column = data_mnk['perf (Gflop/s)']
max_perf = float(perf_column.max())
max_perf_idx = perf_column.idxmax()
max_perf_row = data_mnk.loc[max_perf_idx]
max_perf_cond = max_perf_row[mnk + kernel_pars + ['perf (Gflop/s)']]

print('Maximally performing parameter set:')
display(max_perf_cond)

summary_columns = kernel_pars + ['perf (Gflop/s)']
for perc, slice_rows in pars_autotuning_top.items():
    # Lower performance bound of the "top perc %" slice
    lim = max_perf - max_perf*perc/100
    blob = data_mnk.loc[data_mnk['perf (Gflop/s)'] >= lim]
    print('\ntop', perc, '%')
    display(blob[summary_columns].describe())
    slice_rows.append(blob)

### Pair plot 

In [None]:
# Pair plot of the (m, n, k), kernel-parameter and performance columns.
# Above `thresh` rows, a random subsample with an expected size of `thresh` rows
# is plotted instead of the full dataframe.
n_rows_data = len(data)
if n_rows_data > thresh:
    data_pairplot = data.sample(frac=thresh / n_rows_data)
else:
    data_pairplot = data

pairplot_columns = mnk + kernel_pars + perf
sns.pairplot(data_pairplot[pairplot_columns].compute().dropna())