# Data Exploration

In [82]:
# Add directory above current directory to path
import sys; sys.path.insert(0, '..')

from pathlib import Path
from dataset.dataset_builder import DatasetBuilder
from IPython.display import display
import pandas as pd

In [83]:
# Utility functions
def get_model_layer_statistics(dataset_builder, layer_type, file_paths):
    layers, model_names = [], []
    col1, col2 = 'Model Name',  f'Number of {layer_type} Layer'
    for file in file_paths:
        layers.append(len(dataset_builder.read_csv_and_convert_power(file)))
        model_names.append(file.parent.stem)
    df = pd.DataFrame({col1: model_names, col2: layers})
    sorted_df = df.sort_values(by=col2, ascending=False, ignore_index=True)
    display(sorted_df)
    return sorted_df

In [84]:
# Data Configuration

data_dir_path = Path('../training_data')
test_models = ["lenet", "resnet18", "vgg16"]

In [85]:
models = [folder.name for folder in data_dir_path.iterdir() if folder.is_dir()]

print(f"Models: {models}")
print(f"Total models: {len(models)}")

Models: ['resnet34', 'resnext50_32x4d', 'resnext101_32x8d', 'googlenet', 'lenet', 'resnet50', 'resnext101_64x4d', 'resnet152', 'convnext_small', 'mobilenet_v3_large', 'vgg13', 'convnext_base', 'mobilenet_v3_small', 'convnext_tiny', 'vgg16', 'mobilenet_v2', 'vgg11', 'alexnet', 'resnet101', 'vgg19', 'resnet18']
Total models: 21


## Convolutional Layer

- There are 21 models with Convolutional layers.
- **googlenet** models looks fishy as it has only 1 CNN layer.


Insights into Data

- There are total 585 datapoints with 15 features (including power and runtime) for Convolutional Layers.
- Power:  `mean = 5.24` `max = 7.83` and `min = 1.18`
- Runtime: `mean = 0.08` `max = 0.64` and `min = 0.01`

In [86]:
from config.convolutional_features import CONV_FEATURES
conv_files = list(data_dir_path.rglob("**/convolutional.csv"))

In [87]:
conv_dataset = DatasetBuilder(features=CONV_FEATURES)
conv_df = get_model_layer_statistics(conv_dataset, layer_type='Convolutional', file_paths=conv_files)

Unnamed: 0,Model Name,Number of Convolutional Layer
0,resnet152,67
1,resnext101_64x4d,60
2,resnet101,45
3,mobilenet_v2,43
4,convnext_small,40
5,convnext_base,40
6,resnext101_32x8d,38
7,mobilenet_v3_large,36
8,resnet34,36
9,mobilenet_v3_small,30


In [88]:
combined_conv_data = conv_dataset.merge_feature_data(conv_files)

print("Convolutional Layers data summary")
combined_conv_df = pd.concat([combined_conv_data.input_features, combined_conv_data.power, combined_conv_data.runtime], axis=1)
display(combined_conv_df.describe())

Convolutional Layers data summary


Unnamed: 0,batch_size,input_size_0,input_size_1,input_size_2,output_size_0,output_size_1,output_size_2,kernel_0,kernel_1,padding_0,padding_1,stride_0,stride_1,power,runtime
count,585.0,585.0,585.0,585.0,585.0,585.0,585.0,585.0,585.0,585.0,585.0,585.0,585.0,585.0,585.0
mean,1.0,393.991453,33.870085,33.870085,446.919658,28.28547,28.28547,3.235897,3.235897,2.188034,2.188034,1.158974,1.158974,5.245252,0.084916
std,0.0,363.106132,44.075125,44.075125,408.452922,32.089807,32.089807,1.970435,1.970435,1.96864,1.96864,0.418361,0.418361,1.804808,0.096239
min,1.0,1.0,1.0,1.0,16.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.183146,0.012982
25%,1.0,128.0,14.0,14.0,128.0,14.0,14.0,2.0,2.0,0.0,0.0,1.0,1.0,4.724114,0.049459
50%,1.0,256.0,14.0,14.0,256.0,14.0,14.0,3.0,3.0,2.0,2.0,1.0,1.0,5.881474,0.054854
75%,1.0,512.0,28.0,28.0,512.0,28.0,28.0,3.0,3.0,2.0,2.0,1.0,1.0,6.540556,0.08863
max,1.0,2048.0,224.0,224.0,2048.0,224.0,224.0,11.0,11.0,6.0,6.0,4.0,4.0,7.831482,0.642334


## Pooling Layer

- Not all models have **Pooling Layers**.
- There are only 17 models with Pooling layers compared to 21 models with Convolutional layers.

Insights into Data

- There are total 61 datapoints with 13 features (including power and runtime) for Pooling Layers.
- Power:  `mean = 4.95` `max = 7.82` and `min = 1.18`
- Runtime: `mean = 0.03` `max = 0.14` and `min = 0.009`

In [89]:
from config.pooling_features import POOLING_FEATURES
pooling_files = list(data_dir_path.rglob("**/pooling.csv"))

In [90]:
pool_dataset = DatasetBuilder(features=POOLING_FEATURES)
pool_df = get_model_layer_statistics(pool_dataset, layer_type='Pooling', file_paths=pooling_files)

Unnamed: 0,Model Name,Number of Pooling Layer
0,mobilenet_v3_small,10
1,mobilenet_v3_large,9
2,vgg19,5
3,vgg16,5
4,vgg13,5
5,vgg11,5
6,alexnet,3
7,resnet34,2
8,resnext50_32x4d,2
9,resnext101_64x4d,2


In [91]:
combined_pool_data = pool_dataset.merge_feature_data(pooling_files)

print("Pooling Layers data summary")
combined_pool_df = pd.concat([combined_pool_data.input_features, combined_pool_data.power, combined_pool_data.runtime], axis=1)
display(combined_pool_df.describe())

Pooling Layers data summary


Unnamed: 0,batch_size,input_size_0,input_size_1,input_size_2,output_size_0,output_size_1,output_size_2,kernel_0,kernel_1,stride_0,stride_1,power,runtime
count,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
mean,1.0,484.196721,51.262295,51.262295,484.196721,23.131148,23.131148,51.262295,51.262295,1.540984,1.540984,4.950072,0.03206
std,0.0,588.710875,60.564539,60.564539,588.710875,31.756614,31.756614,60.564539,60.564539,0.502453,0.502453,2.628455,0.033769
min,1.0,16.0,7.0,7.0,16.0,1.0,1.0,7.0,7.0,1.0,1.0,1.183149,0.009791
25%,1.0,72.0,7.0,7.0,72.0,1.0,1.0,7.0,7.0,1.0,1.0,2.107938,0.013449
50%,1.0,256.0,27.0,27.0,256.0,7.0,7.0,27.0,27.0,2.0,2.0,5.44819,0.015197
75%,1.0,512.0,112.0,112.0,512.0,56.0,56.0,112.0,112.0,2.0,2.0,7.705176,0.039357
max,1.0,2048.0,224.0,224.0,2048.0,112.0,112.0,224.0,224.0,2.0,2.0,7.823878,0.140997


## Dense Layer

- Not all models have **Dense Layers**.
- There are only 17 models with Dense layers compared to 21 models with Convolutional layers.
- There can be models with 1 dense layers usually the last FC layer.


Insights into Data

- There are total 398 datapoints with 5 features (including power and runtime) for Dense Layers.
- Power:  `mean = 5.55` `max = 7.82` and `min = 1.18`
- Runtime: `mean = 0.09` `max = 3.37` and `min = 0.01`

In [92]:
from config.dense_features import DENSE_FEATURES
dense_files = list(data_dir_path.rglob("**/dense.csv"))

In [93]:
dense_dataset = DatasetBuilder(features=DENSE_FEATURES)
dense_df = get_model_layer_statistics(dense_dataset, layer_type='Dense', file_paths=dense_files)

Unnamed: 0,Model Name,Number of Dense Layer
0,resnet152,89
1,resnext101_32x8d,67
2,resnet101,60
3,resnext101_64x4d,45
4,mobilenet_v3_large,28
5,resnext50_32x4d,28
6,resnet50,27
7,mobilenet_v3_small,24
8,mobilenet_v2,10
9,vgg11,3


In [95]:
combined_dense_data = dense_dataset.merge_feature_data(dense_files)

print("Dense Layers data summary")
combined_dense_df = pd.concat([combined_dense_data.input_features, combined_dense_data.power, combined_dense_data.runtime], axis=1)
display(combined_dense_df.describe())

Dense Layers data summary


Unnamed: 0,batch_size,input_size,output_size,power,runtime
count,398.0,398.0,398.0,398.0,398.0
mean,1.0,174759.025126,178741.1,5.550736,0.098706
std,0.0,187214.403657,216511.3,1.772387,0.336419
min,1.0,8.0,8.0,1.183156,0.010512
25%,1.0,50176.0,50176.0,5.443578,0.034554
50%,1.0,200704.0,200704.0,5.897007,0.042994
75%,1.0,200704.0,200704.0,6.853875,0.081166
max,1.0,802816.0,1605632.0,7.828145,3.374961


## Overall Summary

### Power

Both dense and convolutional layers have almost the same characteritics for power values. Pooling layer also have almost the same characteristics with a bit lower mean compared to above 2.

Diggging a bit deeper, there is quite a bit variation in percentage quantiles for each layer.

### Runtime

Although the min and mean of dense and convolution layers is same, the dense layers have a quite larger maximum value for runtime compared to convolutional layers.

Pooling layers are the quickest amongst the 3.