# Data Exploration

In [None]:
# Add directory above current directory to path
import sys; sys.path.insert(0, '..')

from pathlib import Path
from dataset.dataset_builder import DatasetBuilder
from IPython.display import display
import pandas as pd

In [None]:
# Utility functions
def get_model_layer_statistics(dataset_builder, layer_type, file_paths):
    """Tabulate how many layers of one type each model contributes.

    Args:
        dataset_builder: Object exposing ``read_csv_and_convert_power(path)``.
            The length of whatever that call returns is used as the layer
            count (presumably one row per layer -- confirm against
            ``DatasetBuilder``).
        layer_type: Human-readable layer name; only used to build the header
            of the count column.
        file_paths: Iterable of ``pathlib.Path`` objects, one CSV per model.
            The name of each file's parent directory is used as the model
            name.

    Returns:
        A ``pandas.DataFrame`` with the columns ``'Model Name'`` and
        ``'Number of <layer_type> Layer'``, sorted by the count column in
        descending order with a fresh index. The frame is also shown via
        ``display`` as a side effect.
    """
    name_col, count_col = 'Model Name', f'Number of {layer_type} Layer'
    # Use ``.name`` rather than ``.stem``: ``stem`` strips everything after
    # the last dot, so a model directory such as ``resnet.v2`` would be
    # reported as ``resnet``. ``.name`` keeps the full directory name.
    rows = [
        (path.parent.name, len(dataset_builder.read_csv_and_convert_power(path)))
        for path in file_paths
    ]
    stats = pd.DataFrame(rows, columns=[name_col, count_col])
    sorted_df = stats.sort_values(by=count_col, ascending=False, ignore_index=True)
    display(sorted_df)
    return sorted_df

In [None]:
# Data Configuration

# Root of the training data; every sub-directory is treated as one model.
data_dir_path = Path("../training_data")

# Model names singled out for testing.
# NOTE(review): not referenced by the cells visible here -- confirm usage.
test_models = [
    "lenet",
    "resnet18",
    "vgg16",
]

In [None]:
# Collect the name of every sub-directory of the data root; plain files
# sitting next to the model folders are skipped.
models = [
    entry.name
    for entry in data_dir_path.iterdir()
    if entry.is_dir()
]

print(f"Models: {models}")
print(f"Total models: {len(models)}")

## Convolutional Layer

- There are 21 models with Convolutional layers.
- The **googlenet** model looks suspicious, as it has only 1 convolutional layer.


Insights into Data

- There are a total of 585 data points with 15 features (including power and runtime) for Convolutional Layers.
- Power:  `mean = 5.24` `max = 7.83` and `min = 1.18`
- Runtime: `mean = 0.08` `max = 0.64` and `min = 0.01`

In [None]:
from config.convolutional_features import CONV_FEATURES
# Recursively gather every convolutional.csv below the data root.
# NOTE: rglob already searches recursively, so the leading "**/" is redundant.
conv_files = [*data_dir_path.rglob("**/convolutional.csv")]

In [None]:
# Builder configured with the convolutional feature list.
conv_dataset = DatasetBuilder(features=CONV_FEATURES)

# Per-model count of convolutional layers (displayed and kept for later use).
conv_df = get_model_layer_statistics(
    dataset_builder=conv_dataset,
    layer_type="Convolutional",
    file_paths=conv_files,
)

In [None]:
# Hand every convolutional CSV to the builder; the result exposes
# input_features, power and runtime.
combined_conv_data = conv_dataset.merge_feature_data(conv_files)

print("Convolutional Layers data summary")
# Put the features, power and runtime side by side (column-wise) so that
# describe() summarises all of them in one table.
combined_conv_df = pd.concat(
    [
        combined_conv_data.input_features,
        combined_conv_data.power,
        combined_conv_data.runtime,
    ],
    axis=1,
)
display(combined_conv_df.describe())

## Pooling Layer

- Not all models have **Pooling Layers**.
- There are only 17 models with Pooling layers compared to 21 models with Convolutional layers.

Insights into Data

- There are a total of 61 data points with 13 features (including power and runtime) for Pooling Layers.
- Power:  `mean = 4.95` `max = 7.82` and `min = 1.18`
- Runtime: `mean = 0.03` `max = 0.14` and `min = 0.009`

In [None]:
from config.pooling_features import POOLING_FEATURES
# Recursively gather every pooling.csv below the data root.
# NOTE: rglob already searches recursively, so the leading "**/" is redundant.
pooling_files = [*data_dir_path.rglob("**/pooling.csv")]

In [None]:
# Builder configured with the pooling feature list.
pool_dataset = DatasetBuilder(features=POOLING_FEATURES)

# Per-model count of pooling layers (displayed and kept for later use).
pool_df = get_model_layer_statistics(
    dataset_builder=pool_dataset,
    layer_type="Pooling",
    file_paths=pooling_files,
)

In [None]:
# Hand every pooling CSV to the builder; the result exposes
# input_features, power and runtime.
combined_pool_data = pool_dataset.merge_feature_data(pooling_files)

print("Pooling Layers data summary")
# Put the features, power and runtime side by side (column-wise) so that
# describe() summarises all of them in one table.
combined_pool_df = pd.concat(
    [
        combined_pool_data.input_features,
        combined_pool_data.power,
        combined_pool_data.runtime,
    ],
    axis=1,
)
display(combined_pool_df.describe())

## Dense Layer

- Not all models have **Dense Layers**.
- There are only 17 models with Dense layers compared to 21 models with Convolutional layers.
- Some models have only 1 dense layer, usually the final fully connected (FC) layer.


Insights into Data

- There are a total of 398 data points with 5 features (including power and runtime) for Dense Layers.
- Power:  `mean = 5.55` `max = 7.82` and `min = 1.18`
- Runtime: `mean = 0.09` `max = 3.37` and `min = 0.01`

In [None]:
from config.dense_features import DENSE_FEATURES
# Recursively gather every dense.csv below the data root.
# NOTE: rglob already searches recursively, so the leading "**/" is redundant.
dense_files = [*data_dir_path.rglob("**/dense.csv")]

In [None]:
# Builder configured with the dense feature list.
dense_dataset = DatasetBuilder(features=DENSE_FEATURES)

# Per-model count of dense layers (displayed and kept for later use).
dense_df = get_model_layer_statistics(
    dataset_builder=dense_dataset,
    layer_type="Dense",
    file_paths=dense_files,
)

In [None]:
# Hand every dense CSV to the builder; the result exposes
# input_features, power and runtime.
combined_dense_data = dense_dataset.merge_feature_data(dense_files)

print("Dense Layers data summary")
# Put the features, power and runtime side by side (column-wise) so that
# describe() summarises all of them in one table.
combined_dense_df = pd.concat(
    [
        combined_dense_data.input_features,
        combined_dense_data.power,
        combined_dense_data.runtime,
    ],
    axis=1,
)
display(combined_dense_df.describe())

## Overall Summary

### Power

Both dense and convolutional layers have almost the same characteristics for power values. Pooling layers also have almost the same characteristics, with a slightly lower mean than the other two.

Digging a bit deeper, there is quite a bit of variation in the percentile values for each layer type.

### Runtime

Although the minimum and mean runtimes of dense and convolutional layers are nearly the same, dense layers have a much larger maximum runtime than convolutional layers.

Pooling layers are the quickest amongst the 3.