# DataFrames Formatting - JMUBEN Dataset

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
PATH_EVALUATIONS = "./evaluations"
PATH_MLP_METRICS = PATH_EVALUATIONS + "/RNAs-MLP"
PATH_CNN_METRICS = PATH_EVALUATIONS + "/CNNs"
PATH_MLP_1HL = PATH_MLP_METRICS + "/metrics_mlp_1hl_experiments.csv"
PATH_MLP_2HL = PATH_MLP_METRICS + "/metrics_mlp_2hl_experiments.csv"

## 1. RNAs MLP dataframes

### 1.1. Opening dataframes

#### MLP One Hidden Layer

In [4]:
df_mlp_1hl = pd.read_csv(PATH_MLP_1HL)
df_mlp_1hl.head()

Unnamed: 0,Experiment,Architectures,Total Params,GFLOPS,Training Time (sec),Test Accuracy,Test F1 Weightet,Test Precision Weighted,Test Recall Weighted
0,1,4,81,8.002895e-08,1.799349,0.615485,0.525966,0.585156,0.615485
1,1,5,100,9.401882e-08,1.91451,0.667464,0.626959,0.619255,0.667464
2,1,6,119,2.995345e-07,0.721119,0.351039,0.314196,0.28517,0.351039
3,1,7,138,2.538573e-07,0.992684,0.595901,0.528823,0.506131,0.595901
4,1,8,157,7.266233e-08,3.963539,0.793567,0.777952,0.786391,0.793567


#### MLP Two Hidden Layer

In [5]:
df_mlp_2hl = pd.read_csv(PATH_MLP_2HL)
df_mlp_2hl.head()

Unnamed: 0,Experiment,Architectures,Total Params,GFLOPS,Training Time (sec),Test Accuracy,Test F1 Weightet,Test Precision Weighted,Test Recall Weighted
0,1,"(1, 14)",117,1.847504e-07,1.050065,0.320068,0.15521,0.102444,0.320068
1,1,"(2, 13)",137,4.619214e-08,5.065797,0.659266,0.567173,0.544528,0.659266
2,1,"(3, 12)",155,2.019025e-07,1.337279,0.573185,0.485153,0.455116,0.573185
3,1,"(4, 11)",171,1.465804e-07,2.060303,0.640307,0.5938,0.555253,0.640307
4,1,"(5, 10)",185,1.245983e-07,2.64851,0.614062,0.509854,0.632787,0.614062


### 1.2. Selecting unique architectures

#### Architectures from each dataframe

In [6]:
mlp_1hl_architectures = [str(architecture) for architecture in df_mlp_1hl["Architectures"].unique()]
mlp_2hl_architectures = [architecture for architecture in df_mlp_2hl["Architectures"].unique()]

#### All architectures ordered

In [7]:
architectures = mlp_1hl_architectures + mlp_2hl_architectures
architectures = architectures[:13] + architectures[-5:] + architectures[27:-5] + architectures[14:27]
architectures

['4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '(1, 5)',
 '(2, 4)',
 '(3, 3)',
 '(4, 2)',
 '(5, 1)',
 '(1, 6)',
 '(2, 5)',
 '(3, 4)',
 '(4, 3)',
 '(5, 2)',
 '(6, 1)',
 '(1, 12)',
 '(2, 11)',
 '(3, 10)',
 '(4, 9)',
 '(5, 8)',
 '(6, 7)',
 '(7, 6)',
 '(8, 5)',
 '(9, 4)',
 '(10, 3)',
 '(11, 2)',
 '(12, 1)',
 '(2, 13)',
 '(3, 12)',
 '(4, 11)',
 '(5, 10)',
 '(6, 9)',
 '(7, 8)',
 '(8, 7)',
 '(9, 6)',
 '(10, 5)',
 '(11, 4)',
 '(12, 3)',
 '(13, 2)',
 '(14, 1)']

### 1.3. Define columns and index

In [8]:
columns = df_mlp_2hl.columns[1:]
columns

Index(['Architectures', 'Total Params', 'GFLOPS', 'Training Time (sec)',
       'Test Accuracy', 'Test F1 Weightet', 'Test Precision Weighted',
       'Test Recall Weighted'],
      dtype='object')

In [9]:
index = [i for i in range(1, len(architectures)+1)]

### 1.4. Saving all metrics in a general dataframe

In [10]:
df_mlp = pd.DataFrame(columns=columns, index=index)
df_mlp["Architectures"] = architectures

In [11]:
different_format_columns = ["Total Params", "GFLOPS", "Training Time (sec)"]

In [12]:
for architecture in architectures:
    data = pd.DataFrame()
    
    if '(' in architecture:
        data = df_mlp_2hl.loc[df_mlp_2hl["Architectures"] == architecture]
    else:
        data = df_mlp_1hl.loc[df_mlp_1hl["Architectures"] == int(architecture)]
    
    for column in columns:
        av_metric, std_metric = float(), float()
        
        if column in different_format_columns:
            if column == "Total Params":
                df_mlp[column].loc[df_mlp["Architectures"] == architecture] = data["Total Params"].unique()[0]
            else:
                if column == "GFLOPS":
                    av_metric = data[column].mean()
                    std_metric = data[column].std()
                else:
                    av_metric = int(round(data["Training Time (sec)"].mean(), 0))
                    std_metric = int(round(data["Training Time (sec)"].std(), 0))
                
                df_mlp[column].loc[df_mlp["Architectures"] == architecture] = f"{av_metric} ± {std_metric}"
                
        elif column != "Architectures":
            av_metric = round(data[column].mean(), 6)
            std_metric = round(data[column].std(), 6)
            df_mlp[column].loc[df_mlp["Architectures"] == architecture] = f"{av_metric} ± {std_metric}"

In [13]:
df_mlp.to_csv(PATH_EVALUATIONS + "/metrics_MLPs.csv", header=True, index=False)

### 1.5. Show the five best architectures

In [14]:
df_mlp = df_mlp.sort_values("Test F1 Weightet", ascending=False, ignore_index=True)
df_mlp.head()

Unnamed: 0,Architectures,Total Params,GFLOPS,Training Time (sec),Test Accuracy,Test F1 Weightet,Test Precision Weighted,Test Recall Weighted
0,15,290,2.780355860883663e-07 ± 1.7413986951987645e-07,3 ± 2,0.728077 ± 0.025813,0.698414 ± 0.03677,0.692724 ± 0.04219,0.728077 ± 0.025813
1,14,271,2.910999226461182e-07 ± 6.834120478949927e-08,2 ± 0,0.732935 ± 0.02357,0.68231 ± 0.027017,0.712799 ± 0.075761,0.732935 ± 0.02357
2,"(7, 8)",207,1.318518990171966e-07 ± 1.0148747854794735e-07,4 ± 3,0.716823 ± 0.040743,0.67004 ± 0.043509,0.675844 ± 0.079326,0.716823 ± 0.040743
3,8,157,1.3081954646788009e-07 ± 5.189400670601304e-08,3 ± 1,0.722384 ± 0.061917,0.667223 ± 0.096691,0.668651 ± 0.105625,0.722384 ± 0.061917
4,16,309,2.7509807505306383e-07 ± 1.9293745513091e-08,2 ± 0,0.723617 ± 0.014291,0.658973 ± 0.019786,0.682832 ± 0.037071,0.723617 ± 0.014291


## 2. CNNs dataframes

### 2.1. Define architectures

In [15]:
architectures = ["MobileNetV2", "ShuffleNet", "VGG-16", "InceptionV3", "EfficientNetV2S"]

### 2.2. Saving all experiment metrics in one dataframe

In [16]:
df_cnn_experiments = pd.read_csv(f"{PATH_CNN_METRICS}/{architectures[0]}/metrics_{architectures[0]}.csv")
df_cnn_experiments.insert(0, "Architectures", architectures[0])

In [17]:
for i in range(1, len(architectures)):
    df_temp = pd.read_csv(f"{PATH_CNN_METRICS}/{architectures[i]}/metrics_{architectures[i]}.csv")
    df_new = df_cnn_experiments.append(df_temp, ignore_index=True)
    
    for j in range(len(df_temp)):
        df_new.loc[len(df_cnn_experiments)+j, "Architectures"] = architectures[i]
    
    df_cnn_experiments = df_new
    
df_cnn_experiments.to_csv(f"{PATH_CNN_METRICS}/metrics_experiments_CNNs.csv", index=False)

In [18]:
df_cnn_experiments.head()

Unnamed: 0,Architectures,Experiment,Total Params,GFLOPS,Epochs,Training Time (sec),Test Accuracy,Test F1 Weightet,Test Precision Weighted,Test Recall Weighted
0,MobileNetV2,1,2916421,0.002916,34,193.000553,0.142402,0.035501,0.020278,0.142402
1,MobileNetV2,2,2916421,0.120082,39,234.458109,0.142402,0.035501,0.020278,0.142402
2,MobileNetV2,3,2916421,0.237248,35,199.691904,0.324189,0.158736,0.105098,0.324189
3,ShuffleNet,1,1375525,0.004281,58,265.178504,0.999772,0.999772,0.999773,0.999772
4,ShuffleNet,2,1375525,0.121447,79,356.520217,0.999659,0.999659,0.999659,0.999659


### 2.3. Define data to save in a general dataframe

#### Define columns and index

In [19]:
columns = list(df_cnn_experiments.columns)
columns.remove("Experiment")
columns

['Architectures',
 'Total Params',
 'GFLOPS',
 'Epochs',
 'Training Time (sec)',
 'Test Accuracy',
 'Test F1 Weightet',
 'Test Precision Weighted',
 'Test Recall Weighted']

In [20]:
index = [i for i in range(1, 6)]

#### Saving all metrics

In [21]:
df_cnn = pd.DataFrame(columns=columns, index=index)
df_cnn["Architectures"] = architectures

In [22]:
different_format_columns = ["Total Params", "GFLOPS", "Epochs", "Training Time (sec)"]

In [23]:
for architecture in architectures:
    data = df_cnn_experiments.loc[df_cnn_experiments["Architectures"] == architecture] # pd.read_csv(f"{PATH_CNN_METRICS}/{architecture}/metrics_{architecture}.csv")
    
    for column in columns:
        av_metric, std_metric = float(), float()
        
        if column in different_format_columns:
            if column == "Total Params":
                df_cnn[column].loc[df_cnn["Architectures"] == architecture] = data["Total Params"].unique()[0]
            else:
                if column == "GFLOPS":
                    av_metric = data[column].mean()
                    std_metric = data[column].std()
                elif column == "Epochs":
                    av_metric = int(data[column].mean())
                    std_metric = int(data[column].std())
                else:
                    av_metric = int(round(data["Training Time (sec)"].mean(), 0))
                    std_metric = int(round(data["Training Time (sec)"].std(), 0))
                
                df_cnn[column].loc[df_cnn["Architectures"] == architecture] = f"{av_metric} ± {std_metric}"
                
        elif column != "Architectures":
            av_metric = round(data[column].mean(), 6)
            std_metric = round(data[column].std(), 6)
            df_cnn[column].loc[df_cnn["Architectures"] == architecture] = f"{av_metric} ± {std_metric}"

In [24]:
df_cnn.to_csv(PATH_EVALUATIONS + "/metrics_CNNs.csv", header=True, index=False)

#### Show the best architectures

In [25]:
df_cnn.sort_values("Test F1 Weightet", ascending=False, ignore_index=True)

Unnamed: 0,Architectures,Total Params,GFLOPS,Epochs,Training Time (sec),Test Accuracy,Test F1 Weightet,Test Precision Weighted,Test Recall Weighted
0,ShuffleNet,1375525,0.12144674699999998 ± 0.11716571999999999,69 ± 10,315 ± 46,0.999526 ± 0.000333,0.999526 ± 0.000333,0.999526 ± 0.000334,0.999526 ± 0.000333
1,InceptionV3,22854437,0.144318171 ± 0.11716572000000001,70 ± 14,512 ± 99,0.998976 ± 0.000626,0.998975 ± 0.000627,0.998975 ± 0.000627,0.998976 ± 0.000626
2,EfficientNetV2S,22854437,0.23433143999999997 ± 0.11716571999999999,74 ± 11,545 ± 88,0.998767 ± 0.001161,0.998766 ± 0.001163,0.998768 ± 0.001161,0.998767 ± 0.001161
3,VGG-16,67154757,0.211460016 ± 0.11716572,92 ± 8,1365 ± 120,0.998691 ± 0.001155,0.998688 ± 0.001159,0.998699 ± 0.001146,0.998691 ± 0.001155
4,MobileNetV2,2916421,0.12008178600000001 ± 0.11716572,36 ± 2,209 ± 22,0.202998 ± 0.104955,0.07658 ± 0.07115,0.048552 ± 0.048971,0.202998 ± 0.104955
