In [1]:
# Libraries
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [2]:
def get_experiment_name(ss_opt, fr_opt):
    """Return the legend label for one experiment variant.

    Args:
        ss_opt: Subsampling flag; enabled only when equal to the string 'True'.
        fr_opt: Feature-reduction flag; enabled only when equal to the string 'True'.

    Returns:
        'SS & FR' when both flags are enabled, 'SS' or 'FR' when only one is,
        and 'Full' when neither is.
    """
    # The flags are compared as strings, so any other value (including the
    # boolean True) is treated as "disabled".
    labels = {
        (True, True): 'SS & FR',
        (True, False): 'SS',
        (False, True): 'FR',
        (False, False): 'Full',
    }
    key = (bool(ss_opt == 'True'), bool(fr_opt == 'True'))
    return labels[key]

In [3]:
# Dataset identifiers; the selected one is the prefix of every result file name
datasets = [
    'aids',
    'students',
    'malware',
]

# Machine learning models, used both in result file names and as x-axis labels
models = [
    'LogisticRegression',
    'SVC',
    'RandomForestClassifier',
    'GradientBoostingClassifier',
    'DeepNeuralNetwork',
]

# Experiment options, kept as the strings 'False' / 'True' because they are
# interpolated verbatim into the result file names
subsampling_options = [str(flag) for flag in (False, True)]
feature_reduction_options = [str(flag) for flag in (False, True)]

# Input: dataset whose results are plotted by the cells below
DATASET = datasets[0]


Function to generate plots

In [4]:
def make_plots(metric):
    """Show grouped box plots of one result column for every model and variant.

    For each combination of model, subsampling option and feature-reduction
    option, reads ``../results/{DATASET}_{ss_opt}_{fr_opt}_{model}.csv`` and
    takes the column named ``metric``. The figure has one box trace per
    (subsampling, feature-reduction) variant, grouped by model on the x axis.

    Args:
        metric: Name of the CSV column to plot; also used as the figure title
            and the y-axis title.

    Raises:
        FileNotFoundError: If one of the expected result files is missing.
        KeyError: If a result file has no column named ``metric``.
    """
    # Load every file before building the figure so that a missing file or
    # column fails early, before anything is displayed.
    results = {}
    for model in models:
        for ss_opt in subsampling_options:
            for fr_opt in feature_reduction_options:
                filename = f'{DATASET}_{ss_opt}_{fr_opt}_{model}'
                path = f'../results/{filename}.csv'
                df = pd.read_csv(path)
                results[filename] = df[metric]

    fig = go.Figure()

    for ss_opt in subsampling_options:
        for fr_opt in feature_reduction_options:
            x, y = [], []
            for model in models:
                filename = f'{DATASET}_{ss_opt}_{fr_opt}_{model}'
                values = list(results[filename])
                # Repeat the model label once per row of *this* file. Using a
                # single shared row count would misalign x and y (and so
                # attribute values to the wrong model) whenever the result
                # files differ in length.
                x.extend([model] * len(values))
                y.extend(values)
            fig.add_trace(go.Box(y=y, x=x, name=get_experiment_name(ss_opt, fr_opt)))

    fig.update_layout(
        title=metric,
        yaxis_title=metric,
        boxmode='group'
    )
    fig.show()


In [5]:
# One grouped box plot per listed result column, in this order
for metric in ('accuracy', 'auc', 'log_loss'):
    make_plots(metric)

In [6]:
# One grouped box plot per timing column, in this order
for metric in ('preprocess_time', 'training_time', 'prediction_time'):
    make_plots(metric)

In [7]:
# Grouped box plot of the 'model_size' result column across models and experiment variants
make_plots('model_size')

In [8]:
# One grouped box plot per emissions column, in this order
for metric in ('emissions_prep', 'emissions_train', 'emissions_pred'):
    make_plots(metric)

# Combined measures

In [9]:
# Total time per row = preprocess_time + training_time + prediction_time,
# computed for every (model, subsampling, feature-reduction) result file.
results = {}
for model in models:
    for ss_opt in subsampling_options:
        for fr_opt in feature_reduction_options:
            filename = f'{DATASET}_{ss_opt}_{fr_opt}_{model}'
            path = f'../results/{filename}.csv'
            df = pd.read_csv(path)
            preprocess_time = df['preprocess_time'].to_numpy()
            training_time = df['training_time'].to_numpy()
            prediction_time = df['prediction_time'].to_numpy()
            results[filename] = preprocess_time + training_time + prediction_time

fig = go.Figure()

# One box trace per experiment variant; boxes are grouped by model on the x axis.
for ss_opt in subsampling_options:
    for fr_opt in feature_reduction_options:
        x, y = [], []
        for model in models:
            filename = f'{DATASET}_{ss_opt}_{fr_opt}_{model}'
            values = list(results[filename])
            # Repeat the model label once per row of *this* file; a single
            # shared row count would misalign x and y whenever the result
            # files differ in length.
            x.extend([model] * len(values))
            y.extend(values)
        fig.add_trace(go.Box(y=y, x=x, name=get_experiment_name(ss_opt, fr_opt)))

fig.update_layout(
    title='total_time',
    yaxis_title='total_time',
    boxmode='group'
)
fig.show()


In [10]:
# Combined measure per row = (preprocess_time + training_time + prediction_time)
# * model_size, computed for every (model, subsampling, feature-reduction)
# result file.
results = {}
for model in models:
    for ss_opt in subsampling_options:
        for fr_opt in feature_reduction_options:
            filename = f'{DATASET}_{ss_opt}_{fr_opt}_{model}'
            path = f'../results/{filename}.csv'
            df = pd.read_csv(path)
            preprocess_time = df['preprocess_time'].to_numpy()
            training_time = df['training_time'].to_numpy()
            prediction_time = df['prediction_time'].to_numpy()
            model_size = df['model_size'].to_numpy()
            total_time = preprocess_time + training_time + prediction_time
            results[filename] = total_time * model_size

fig = go.Figure()

# One box trace per experiment variant; boxes are grouped by model on the x axis.
for ss_opt in subsampling_options:
    for fr_opt in feature_reduction_options:
        x, y = [], []
        for model in models:
            filename = f'{DATASET}_{ss_opt}_{fr_opt}_{model}'
            values = list(results[filename])
            # Repeat the model label once per row of *this* file; a single
            # shared row count would misalign x and y whenever the result
            # files differ in length.
            x.extend([model] * len(values))
            y.extend(values)
        fig.add_trace(go.Box(y=y, x=x, name=get_experiment_name(ss_opt, fr_opt)))

fig.update_layout(
    title='total_time * model_size',
    yaxis_title='total_time * model_size',
    boxmode='group'
)
fig.show()
