# Performance Under Dataset Variation

In [7]:
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, log_loss
import seaborn as sns
import os
import ast
import sys
import re
from statistics import mean, stdev
statistics_path = os.path.abspath("../")
sys.path.append(statistics_path)
import stats_utils
from matplotlib.ticker import MaxNLocator

In [2]:
# Base path handed to the stats_utils loaders together with an experiment name.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
path_to_exp_statistics = "/Users/admin/Desktop/thesis/dataset/metrics/"

In [3]:
# Output location for generated figures; the plotting helpers join it with a file name.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
path_to_exp_images = "/Users/admin/Desktop/thesis_writing/experiment_images/performance_exp/"

## 1. Compute Accuracy Plot for every dataset in a single diagram

In [9]:
# Load the logs of one run per dataset (datasets 1-6, in order).
# NOTE(review): the dataset 3 experiment name does not follow the "_20r_1"
# pattern of the others — confirm it points at the intended run.
dfs = []
for exp_name in (
    "dataset_1_iid_bal_5cl_20r_1",
    "dataset_2_iid_bal_5cl_20r_1",
    "dataset_3_iid_bal_5cl_3",
    "dataset_4_iid_bal_5cl_20r_1",
    "dataset_5_iid_bal_5cl_20r_1",
    "dataset_6_iid_bal_5cl_20r_1",
):
    dfs.append(
        stats_utils.parse_experiments_statistics_to_df(
            path_to_exp_statistics, exp_name, csv_filename="logs.csv"
        )
    )

# Keep the individually named frames available alongside the list.
df1_try_1, df2_try_1, df3_try_1, df4_try_1, df5_try_1, df6_try_1 = dfs

In [31]:
def plot_acc_loss_from_dfs(dfs, get_accuracy_loss_values, path_to_exp_images, should_save=False, filename="accuracies_for_every_dataset"):
    """Plot the per-round accuracy of several experiments in one figure.

    Every frame in ``dfs`` contributes one curve, labelled ``dataset_<k>``
    where ``k`` is the 1-based position of the frame in ``dfs``.

    Args:
        dfs: Iterable of experiment frames. Each must have a
            ``devices_names`` column whose first entry is the string form of
            a list of client names.
        get_accuracy_loss_values: Callable ``(df, client_name)`` returning an
            ``(accuracy, loss)`` pair; only the accuracy sequence is plotted.
        path_to_exp_images: Directory the figure is written to when saving.
        should_save: When falsy the figure is shown; otherwise it is saved
            and closed.
        filename: File name used when saving. No extension is appended here;
            matplotlib's ``savefig`` falls back to its default format when
            the name has none.

    Raises:
        ValueError: If ``dfs`` yields no frames.
    """
    accs = {}
    for position, df in enumerate(dfs, start=1):
        # Positional access: the first row regardless of the frame's index labels.
        first_client_name = ast.literal_eval(df['devices_names'].iloc[0])[0]
        acc, _loss = get_accuracy_loss_values(df, first_client_name)
        accs[f"dataset_{position}"] = acc

    if not accs:
        raise ValueError("dfs must contain at least one experiment to plot")

    # Plot Accuracy
    plt.figure(figsize=(10, 6))
    for name, values in accs.items():
        # Each curve gets its own 1..n x-range so experiments with a
        # different number of rounds can still share the figure.
        plt.plot(range(1, len(values) + 1), values, label=name)
    plt.title("Accuracy per Round")
    plt.xlabel("Federated Round")
    plt.ylabel("Accuracy")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()

    if not should_save:
        plt.show()
    else:
        path_to_file = os.path.join(path_to_exp_images, filename)
        plt.savefig(path_to_file, dpi=300)
        plt.close()

In [8]:
# Preview the accuracy curves on screen (should_save keeps its default of False).
plot_acc_loss_from_dfs(
    dfs=dfs,
    get_accuracy_loss_values=stats_utils.get_accuracy_loss_values_for_dfs,
    path_to_exp_images=path_to_exp_images,
)

In [10]:
def plot_acc_loss_from_dfs(
    dfs,
    get_accuracy_loss_values,
    path_to_exp_images,
    should_save=False,
    filename="accuracies_for_every_dataset",
    label_names=None,
    title=""
):
    """Plot the per-round accuracy of several experiments in one figure.

    Args:
        dfs: Iterable of experiment frames. Each must have a
            ``devices_names`` column whose first entry is the string form of
            a list of client names.
        get_accuracy_loss_values: Callable ``(df, client_name)`` returning an
            ``(accuracy, loss)`` pair; only the accuracy sequence is plotted.
        path_to_exp_images: Directory the figure is written to when saving.
        should_save: When falsy the figure is shown; otherwise it is saved as
            ``<filename>.png`` and closed.
        filename: File name (without extension) used when saving.
        label_names: Legend labels, one per frame and in the same order as
            ``dfs``. When ``None``, frames are labelled ``dataset_<k>`` by
            their 1-based position.
        title: Figure title.

    Raises:
        ValueError: If ``dfs`` yields no frames.
        IndexError: If ``label_names`` is given but shorter than ``dfs``.
    """
    # 1) Extract the accuracy series per experiment. A list of pairs (rather
    #    than a dict keyed by label) keeps every curve even if labels repeat.
    curves = []
    for i, df in enumerate(dfs):
        label = f"dataset_{i + 1}" if label_names is None else label_names[i]
        # Positional access: the first row regardless of the frame's index labels.
        first_client = ast.literal_eval(df["devices_names"].iloc[0])[0]
        acc, _loss = get_accuracy_loss_values(df, first_client)
        curves.append((label, acc))

    if not curves:
        raise ValueError("dfs must contain at least one experiment to plot")

    # 2) The x-axis spans 1…n_rounds of the longest experiment
    n_rounds = max(len(values) for _, values in curves)

    # 3) Plot accuracy; each curve gets its own 1..len x-range so experiments
    #    with a different number of rounds can share the figure.
    fig, ax = plt.subplots(figsize=(10, 6))
    for label, values in curves:
        ax.plot(range(1, len(values) + 1), values, label=label)

    ax.set_title(title)
    ax.set_xlabel("Federated Round")
    ax.set_ylabel("Accuracy")
    ax.grid(True)
    ax.legend()

    # 4) One integer tick per round. set_xticks fixes the tick positions, so
    #    no separate integer locator is needed.
    ax.set_xticks(list(range(1, n_rounds + 1)))

    plt.tight_layout()

    # 5) Show or save
    if not should_save:
        plt.show()
    else:
        path_to_file = os.path.join(path_to_exp_images, filename + ".png")
        fig.savefig(path_to_file, dpi=300)
        plt.close(fig)

In [12]:
# Settings for the cross-dataset accuracy figure.
filename = "accuracies_for_every_dataset"
title = "Accuracy Convergence Across HAR Datasets"
# One legend entry per frame in `dfs`, in the same order.
label_names = [
    "HARSense",
    "UCI Smartphone HAR",
    "Pamap2",
    "MHealth",
    "PhysioNet Acc Data",
    "MotionSense",
]
plot_acc_loss_from_dfs(
    dfs,
    stats_utils.get_accuracy_loss_values_for_dfs,
    path_to_exp_images,
    should_save=True,
    filename=filename,
    label_names=label_names,
    title=title,
)

## 2. Compute confusion matrices for every dataset

In [13]:
# Load, per dataset (1-6, in order), the experiment logs together with the
# class-label names needed for the confusion matrices.
# NOTE(review): the dataset 3 experiment name does not follow the "_20r_2"
# pattern of the others — confirm it points at the intended run.
labels_names_list = []
dfs = []
for exp_name in (
    "dataset_1_iid_bal_5cl_20r_2",
    "dataset_2_iid_bal_5cl_20r_2",
    "dataset_3_iid_bal_5cl_3",
    "dataset_4_iid_bal_5cl_20r_2",
    "dataset_5_iid_bal_5cl_20r_2",
    "dataset_6_iid_bal_5cl_20r_2",
):
    dfs.append(
        stats_utils.parse_experiments_statistics_to_df(
            path_to_exp_statistics, exp_name, csv_filename="logs.csv"
        )
    )
    labels_names_list.append(
        stats_utils.load_label_names(path_to_exp_statistics, exp_name)
    )

# Keep the individually named frames available alongside the list.
df1_try_2, df2_try_2, df3_try_2, df4_try_2, df5_try_2, df6_try_2 = dfs

In [14]:
def generate_confusion_matrix_only(y_true, y_pred, label_names, path_to_exp_logs, filename, figsize=(8,6)):
    """Render the confusion matrix of ``y_true`` vs ``y_pred`` and save it.

    The matrix is drawn as an annotated heatmap whose tick labels on both
    axes are ``label_names``. The figure is written to
    ``path_to_exp_logs/filename`` at 300 dpi and then closed; nothing is
    returned or shown.

    NOTE(review): the matrix is computed without an explicit label list, so
    its size follows the classes present in the data — presumably that
    matches ``len(label_names)``; confirm for runs where a class is missing.
    """
    matrix = confusion_matrix(y_true, y_pred)

    # Draw on an explicit figure/axes pair instead of pyplot's implicit state.
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap="Blues",
        xticklabels=label_names,
        yticklabels=label_names,
        ax=ax,
    )
    ax.set_title("Confusion Matrix")
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    fig.tight_layout()

    # Persist the figure and release it.
    fig.savefig(os.path.join(path_to_exp_logs, filename), dpi=300)
    plt.close(fig)

In [15]:
# Save one confusion matrix per dataset, numbered from 1 in the order of `dfs`.
idx = 0  # stays 0 when there is nothing to iterate over
for idx, (df, labels) in enumerate(zip(dfs, labels_names_list), start=1):
    first_client_name = ast.literal_eval(df['devices_names'][0])[0]
    # The last row's y_true / y_pred cells are JSON objects keyed by client name.
    truth_by_client = json.loads(df['y_true'].iloc[-1])
    preds_by_client = json.loads(df['y_pred'].iloc[-1])
    y_true = truth_by_client[first_client_name]
    y_pred = preds_by_client[first_client_name]
    filename = f"confusion_matrix_{idx}"
    generate_confusion_matrix_only(y_true, y_pred, labels, path_to_exp_images, filename)

## 3. Summarizing the results over multiple experiments (only for dataset 1 for now)

In [None]:
# | Metric                           | Try 1    | Try 2    | Try 3     | Mean         | Std Dev     |
# | -------------------------------- | -------- | -------- | --------- | ------------ | ----------- |
# | Training Time (no compilation)   | 69.44 s  | 69.18 s  | 69.00 s   | **69.21 s**  | **0.22 s**  |
# | Training Time (with compilation) | 334.40 s | 309.18 s | 280.00 s  | **307.86 s** | **27.36 s** |
# | F1 Score                         | 0.77     | 0.79     | 0.74      | **0.77**     | **0.025**   |
# | Accuracy                         | 0.79     | 0.81     | 0.80      | **0.80**     | **0.01**    |
# | Charge Drop (Galaxy S10)         | 27 mAh   | 23 mAh   | 19.79 mAh | –            | –           |

# Observations
# Training Time (Without Compilation)

# Very consistent across all runs (std dev: 0.22s), showing stable training overhead from the Android client.

# This confirms the reliability of training_time as a metric for computation performance, excluding server overhead.

# Training Time (With Compilation)

# High variability (std dev: 27.36s), likely due to external usage of the laptop (e.g., background load during model recompilation).

# Suggests compilation overhead should be interpreted with caution when the server environment is not idle.

# Model Performance

# F1 Score has moderate fluctuation (std dev: 0.025) but still reasonably consistent for early-stage FL experiments.

# Accuracy is more stable (std dev: 0.01), reinforcing the model’s robustness across runs.

# Energy Consumption

# The charge drop varies, but without measurements from multiple devices or across more repetitions, it’s difficult to draw meaningful conclusions.

# Suggest excluding it from performance evaluation for now unless dedicated energy experiments are run.

# (The charge drop varied even though we did not interfere with the devices during the training time.)

In [28]:
# Load the three repeated runs (tries 1-3) of dataset 1.
dfs = []
for exp_name in (
    "dataset_1_iid_bal_5cl_20r_1",
    "dataset_1_iid_bal_5cl_20r_2",
    "dataset_1_iid_bal_5cl_20r_3",
):
    dfs.append(
        stats_utils.parse_experiments_statistics_to_df(
            path_to_exp_statistics, exp_name, csv_filename="logs.csv"
        )
    )

# Keep the individually named frames available alongside the list.
df1_try_1, df1_try_2, df1_try_3 = dfs

In [33]:
# Compare the three repeated runs of dataset 1 in a single accuracy figure.
filename = "dataset_1_variation"
# The plotting helper defined above takes one legend label per frame; the
# frames in `dfs` are tries 1-3 of dataset 1, so they are labelled as such.
plot_acc_loss_from_dfs(
    dfs,
    stats_utils.get_accuracy_loss_values_for_dfs,
    path_to_exp_images,
    should_save=True,
    filename=filename,
    label_names=["Try 1", "Try 2", "Try 3"],
)

In [None]:
# metric | dataset 1 | dataset 2 | dataset 3 | dataset 4 | dataset 5 | dataset 6

# Training time (with compilation) | 334.4 +- 27.36 | 268.06 | 410.23 | 260.39 | 922.90 | 333.29
# Training time (no compilation) | 69.44 sec +- 0.22 | 71.16 | 143.42 | 76.65 | 673.62 | 116.55
# F1 score (macro Avg) | 0.72 +- 0.025 | 0.88 | 0.99 | 0.86 | 0.52 | 0.5
# accuracy | 0.79 +- 0.01 | 0.88 | 0.99 | 0.87 | 0.89 | 0.89

# Here I present my observations.
# These first comments concern dataset 1, which was run 3 times to get more statistically
# significant results.
# Observations
# Training Time (Without Compilation)

# Very consistent across all runs (std dev: 0.22s), showing stable training overhead from the Android client.

# This confirms the reliability of training_time as a metric for computation performance, excluding server overhead.

# Training Time (With Compilation)

# High variability (std dev: 27.36s), likely due to external usage of the laptop (e.g., background load during model recompilation).

# Suggests compilation overhead should be interpreted with caution when the server environment is not idle.

# Model Performance

# F1 Score has moderate fluctuation (std dev: 0.025) but still reasonably consistent for early-stage FL experiments.

# Accuracy is more stable (std dev: 0.01), reinforcing the model’s robustness across runs.

# Energy Consumption


# The charge drop varied across runs, although we did not interfere with the devices during the training time.

# These are the comments on the performance of all the datasets as a whole.
# All 6 datasets reach reasonably good accuracy values,
# and the accuracy plot shows the curves stabilizing by around federated round 5.
# However, although datasets 5 and 6 have high accuracy scores, this is not true for their F1 scores.
# (Give the reason the F1 score is a better measure than accuracy, especially for
# imbalanced datasets. F1 is useful when you care about false positives and false negatives, especially in imbalanced datasets or when the cost of misclassification is high.
# Accuracy is useful when the classes are balanced (roughly equal number of examples in each class).
# Limitation: in imbalanced datasets, accuracy can be misleading.)
# We can see this more clearly in their confusion matrices, where in both cases
# the model is heavily biased towards the 'Walking' label:
# on dataset 5 the labels {ascending stairs, descending stairs, clapping} are misclassified as walking,
# and on dataset 6 the labels {downstairs, upstairs, jogging} are also misclassified as walking.