# HAR Dataset Creation for Performance Experiments
This notebook focuses on creating and preparing Human Activity Recognition (HAR) datasets for evaluating model performance under various conditions, such as data variations, IID/non-IID distributions, and scaling. Key tasks include:
- Loading HAR datasets (e.g., Dataset 1 and Dataset 6) using a custom `dataloading` module.
- Performing train-test splits with options for scaling (e.g., standard scaler) and label mapping.
- Downsampling datasets to specific fractions for experimentation.
- Partitioning data into client-specific subsets for federated learning (e.g., 5 clients with shared test sets).
- Grouping subject data across clients and shuffling for random assignments.
- Saving client data partitions and labels for use in performance tests.
The goal is to generate controlled dataset variations to assess classifier performance in scenarios like class imbalance, data scaling, and federated setups.

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import dataloading
import os
from sklearn.decomposition import PCA
from sklearn import model_selection, preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from collections import Counter
import random

## 1. Performance Under Dataset Variation

### 1.1 Dataset 1

In [18]:
# Load dataset 1 through the project-local dataloading module and preview the first rows.
df1 = dataloading.load_data1()
df1.head()

Unnamed: 0,AG-X,AG-Y,AG-Z,Acc-X,Acc-Y,Acc-Z,Gravity-X,Gravity-Y,Gravity-Z,RR-X,RR-Y,RR-Z,RV-X,RV-Y,RV-Z,cos,activity
0,-9.494311,5.423363,0.196757,-1.684995,0.401508,1.334339,-8.855526,4.199663,-0.335983,-0.762366,-0.750471,1.10906,0.232662,0.672025,0.101603,0.695649,Running
1,-8.890651,2.520043,0.96331,-0.895032,-1.936677,1.436671,-8.399648,5.06081,0.067106,-0.613229,-0.121966,1.906942,0.245168,0.66066,0.131849,0.697164,Running
2,-5.6328,1.581015,0.230293,2.350318,-3.642859,-1.169414,-7.911231,5.790931,0.21893,-0.142383,-0.352063,1.759936,0.264845,0.648175,0.163016,0.695091,Running
3,-6.322698,6.228244,-0.756644,0.812825,-0.722664,-0.334024,-7.523226,6.280006,0.36466,-0.148775,-0.382955,1.208129,0.29955,0.624335,0.19508,0.694564,Running
4,-7.640211,4.862821,1.71549,0.34967,-0.468074,-0.37878,-6.921238,6.909286,0.727059,-1.047857,0.379773,0.828895,0.310003,0.607825,0.220415,0.697039,Running


In [19]:
# Dataset 1, run 1 (seed 42): split with test_size=0.3, scaler_type="standard"
# and label mapping enabled.
random_seed = 42
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df1,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# No downsampling in this run: the full split is handed to the client partitioner.

# data_frac stays None here (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "exp_1_1_1", labels)

X_train: (65938, 16), y_train: (65938,), X_test: (28260, 16), y_test: (28260,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/exp_1_1_1/labels.txt


In [4]:
# Dataset 1, run 2 (seed 1337): same split settings as run 1, then downsampling.
random_seed = 1337
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df1,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# Downsample train and test with frac=0.4 before partitioning.
data_frac = 0.4
X_train, y_train, X_test, y_test = dataloading.downsample_train_test_split(
    X_train, y_train, X_test, y_test, frac=data_frac, random_seed=random_seed
)
print(f"X_train_ds: {X_train.shape}, y_train_ds: {y_train.shape}, X_test_ds: {X_test.shape}, y_test_ds: {y_test.shape}")

# data_frac is reset to None for to_client (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "dataset_1_iid_bal_5cl_2", labels)

X_train: (65938, 16), y_train: (65938,), X_test: (28260, 16), y_test: (28260,)
X_train_ds: (26375, 16), y_train_ds: (26375,), X_test_ds: (11304, 16), y_test_ds: (11304,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/dataset_1_iid_bal_5cl_2/labels.txt


In [7]:
# Dataset 1, run 3 (seed 2025): same pipeline as run 2 with a different seed.
random_seed = 2025
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df1,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# Downsample train and test with frac=0.4 before partitioning.
data_frac = 0.4
X_train, y_train, X_test, y_test = dataloading.downsample_train_test_split(
    X_train, y_train, X_test, y_test, frac=data_frac, random_seed=random_seed
)
print(f"X_train_ds: {X_train.shape}, y_train_ds: {y_train.shape}, X_test_ds: {X_test.shape}, y_test_ds: {y_test.shape}")

# data_frac is reset to None for to_client (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "dataset_1_iid_bal_5cl_3", labels)

X_train: (65938, 16), y_train: (65938,), X_test: (28260, 16), y_test: (28260,)
X_train_ds: (26375, 16), y_train_ds: (26375,), X_test_ds: (11304, 16), y_test_ds: (11304,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/dataset_1_iid_bal_5cl_3/labels.txt


### 1.2 Dataset 2

In [2]:
# Load dataset 2, report its shape, and preview the first rows.
df2 = dataloading.load_data2()
print(df2.shape)
df2.head()

(10299, 562)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,552,553,554,555,556,557,558,559,560,activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,standing
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,standing
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,standing
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.11729,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,standing
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.351471,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,standing


In [3]:
# Too many features: reduce them with PCA (12 components for the time being,
# roughly 80% of explained variance).
# NOTE(review): PCA is fit on every row before the train/test split, so test rows
# influence the projection -- confirm this is acceptable for the experiments.
pca = PCA(n_components=12, svd_solver="full")
# Every column except the last one ('activity') is treated as a feature.
data2 = pca.fit_transform(df2.iloc[:, :-1].to_numpy())

In [4]:
# Total fraction of variance captured by the fitted PCA components.
pca.explained_variance_ratio_.sum()

0.812423389522117

In [5]:
# Rebuild a frame from the PCA output (cast to float32) and re-attach the
# 'activity' labels; the label index is reset so both parts align on 0..n-1.
df2_pca = pd.concat(
    [
        pd.DataFrame(data2).astype("float32"),
        df2["activity"].reset_index(drop=True).to_frame(),
    ],
    axis=1,
)
df2_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,activity
0,-5.532306,-0.71308,-1.425739,1.761952,-0.842171,-0.42872,0.279745,0.839325,-1.333017,-0.168956,-0.272959,0.399628,standing
1,-5.558568,-0.57611,-1.842062,0.816206,-0.554598,1.003947,-0.125675,-0.732936,-0.942399,0.586729,0.12554,-0.124013,standing
2,-5.493743,-0.290346,-2.15385,0.598862,0.04366,-0.030991,0.047537,0.089552,-0.213861,0.214159,0.154228,0.22756,standing
3,-5.695271,0.35893,-2.156196,0.351608,-0.769463,1.031169,0.486559,-0.442267,0.395172,-0.257603,0.462661,-0.645857,standing
4,-5.764235,0.564788,-2.369786,0.345952,-0.419203,0.369863,0.184831,-0.033604,0.429315,-0.268201,0.56168,-0.239655,standing


In [6]:
# Dataset 2 (PCA features), run 1 (seed 42): split with test_size=0.3,
# scaler_type="standard" and label mapping enabled.
random_seed = 42
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df2_pca,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# No downsampling in this run: the full split is handed to the client partitioner.

# data_frac stays None here (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "exp_1_2_1", labels)

X_train: (7209, 12), y_train: (7209,), X_test: (3090, 12), y_test: (3090,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/exp_1_2_1/labels.txt


In [8]:
# Dataset 2 (PCA features), run 2 (seed 1337): split, then downsampling.
random_seed = 1337
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df2_pca,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# Downsample train and test with frac=0.4 before partitioning.
data_frac = 0.4
X_train, y_train, X_test, y_test = dataloading.downsample_train_test_split(
    X_train, y_train, X_test, y_test, frac=data_frac, random_seed=random_seed
)
print(f"X_train_ds: {X_train.shape}, y_train_ds: {y_train.shape}, X_test_ds: {X_test.shape}, y_test_ds: {y_test.shape}")

# data_frac is reset to None for to_client (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "dataset_2_iid_bal_5cl_2", labels)

X_train: (7209, 12), y_train: (7209,), X_test: (3090, 12), y_test: (3090,)
X_train_ds: (2883, 12), y_train_ds: (2883,), X_test_ds: (1236, 12), y_test_ds: (1236,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/dataset_2_iid_bal_5cl_2/labels.txt


### 1.3 Dataset 3

In [2]:
# Load dataset 3 with n_subjects=4 (see dataloading.load_data3 for the exact
# meaning of that argument) and preview the first rows.
df3 = dataloading.load_data3(n_subjects=4)
df3.head()

Unnamed: 0,activity,0,1,2,3,4,5,6,7,8,...,41,42,43,44,45,46,47,48,49,50
0,lying,30.375,2.2153,8.27915,5.58753,2.24689,8.55387,5.77143,-0.00475,0.037579,...,0.002908,-0.027714,0.001752,-61.1081,-36.8636,-58.3696,1.0,0.0,0.0,0.0
1,lying,30.375,2.29196,7.67288,5.74467,2.27373,8.14592,5.78739,-0.17171,0.025479,...,0.020882,0.000945,0.006007,-60.8916,-36.3197,-58.3656,1.0,0.0,0.0,0.0
2,lying,30.375,2.2909,7.1424,5.82342,2.26966,7.66268,5.78846,-0.238241,0.011214,...,-0.035392,-0.052422,-0.004882,-60.3407,-35.7842,-58.6119,1.0,0.0,0.0,0.0
3,lying,30.375,2.218,7.14365,5.8993,2.22177,7.25535,5.88,-0.192912,0.019053,...,-0.032514,-0.018844,0.02695,-60.7646,-37.1028,-57.8799,1.0,0.0,0.0,0.0
4,lying,30.375,2.30106,7.25857,6.09259,2.2072,7.24042,5.95555,-0.069961,-0.018328,...,0.001351,-0.048878,-0.006328,-60.204,-37.1225,-57.8847,1.0,0.0,0.0,0.0


In [3]:
# Remove the zero-information features: 13,14,15,16,30,31,32,33,47,48,49,50.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the columns are gone no longer raises KeyError.
df3 = df3.drop(
    columns=[13, 14, 15, 16, 30, 31, 32, 33, 47, 48, 49, 50],
    errors="ignore",
)

In [4]:
# Dataset 3, run 1 (seed 42): split with test_size=0.3, scaler_type="standard"
# and label mapping enabled. No downsampling is applied in this run; the
# partitioning into clients and the save happen in the following cell.
random_seed = 42
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df3,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (635643, 39), y_train: (635643,), X_test: (272419, 39), y_test: (272419,)


In [5]:
# Partition the dataset 3 split across 5 clients and save it.
# data_frac stays None here (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "exp1_3_1", labels)

Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/exp1_3_1/labels.txt


In [4]:
# Dataset 3, run 2 (seed 1337): split, downsample, partition, save.
random_seed = 1337
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df3,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# Downsample train and test with frac=0.4 before partitioning.
data_frac = 0.4
X_train, y_train, X_test, y_test = dataloading.downsample_train_test_split(
    X_train, y_train, X_test, y_test, frac=data_frac, random_seed=random_seed
)
print(f"X_train_ds: {X_train.shape}, y_train_ds: {y_train.shape}, X_test_ds: {X_test.shape}, y_test_ds: {y_test.shape}")

# data_frac is reset to None for to_client (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "dataset_3_iid_bal_5cl_2", labels)

X_train: (173045, 39), y_train: (173045,), X_test: (74163, 39), y_test: (74163,)
X_train_ds: (69218, 39), y_train_ds: (69218,), X_test_ds: (29665, 39), y_test_ds: (29665,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/dataset_3_iid_bal_5cl_2/labels.txt


### 1.4 Dataset 4

In [2]:
# Load dataset 4 through the project-local dataloading module and preview the first rows.
df4 = dataloading.load_data4()
df4.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,activity
6656,-9.7788,0.5569,1.1975,0.008373,-0.03349,2.6493,-9.4517,0.37683,-0.20965,-0.88931,...,-2.8439,-9.0618,1.8177,-0.058824,-0.93429,-0.34483,0.35537,-0.37003,-0.3502,standing_still
6657,-9.7733,0.2788,0.73036,-0.025118,-0.025118,2.4157,-9.5306,0.40179,-0.20965,-0.88931,...,-2.9935,-9.2048,1.5189,-0.058824,-0.93429,-0.34483,0.71991,0.17803,0.37363,standing_still
6658,-9.8609,0.11561,0.79988,0.025118,0.016745,2.3865,-9.5991,0.48141,-0.20037,-0.86867,...,-2.8846,-9.1945,1.5507,-0.058824,-0.93429,-0.34483,0.35537,-0.37003,-0.3502,standing_still
6659,-9.7409,0.17652,0.88957,0.18001,0.12977,2.3758,-9.5997,0.42919,-0.20037,-0.86867,...,-2.9245,-9.1746,1.5413,-0.078431,-0.93429,-0.34052,0.35718,-0.18858,-0.35198,standing_still
6660,-9.7821,0.21637,0.90368,0.092098,0.046049,2.3239,-9.5406,0.40038,-0.20037,-0.86867,...,-2.8963,-9.2039,1.6127,-0.078431,-0.93429,-0.34052,-0.001887,-0.18867,-0.72017,standing_still


In [3]:
# List the distinct activity labels present in dataset 4 (np.unique returns them sorted).
np.unique(df4['activity'])

array(['climbing_stairs', 'cycling', 'frontal_elevation_of_arms',
       'jogging', 'jummp_front_and_back', 'knees_bending', 'lying_down',
       'running', 'sitting_and_relaxing', 'standing_still',
       'waist_bend_forward', 'walking'], dtype=object)

In [6]:
# Dataset 4, run 1 (seed 42): split, downsample, partition, save.
random_seed = 42
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df4,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# Downsample train and test with frac=0.4 before partitioning.
data_frac = 0.4
X_train, y_train, X_test, y_test = dataloading.downsample_train_test_split(
    X_train, y_train, X_test, y_test, frac=data_frac, random_seed=random_seed
)
print(f"X_train_ds: {X_train.shape}, y_train_ds: {y_train.shape}, X_test_ds: {X_test.shape}, y_test_ds: {y_test.shape}")

# data_frac is reset to None for to_client (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "dataset_4_iid_bal_5cl_1", labels)

X_train: (274556, 23), y_train: (274556,), X_test: (68639, 23), y_test: (68639,)
X_train: (240236, 23), y_train: (240236,), X_test: (102959, 23), y_test: (102959,)
X_train_ds: (96094, 23), y_train_ds: (96094,), X_test_ds: (41183, 23), y_test_ds: (41183,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/dataset_4_iid_bal_5cl_1/labels.txt


In [4]:
# Dataset 4, run 2 (seed 1337): same pipeline as run 1 with a different seed.
random_seed = 1337
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df4,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# Downsample train and test with frac=0.4 before partitioning.
data_frac = 0.4
X_train, y_train, X_test, y_test = dataloading.downsample_train_test_split(
    X_train, y_train, X_test, y_test, frac=data_frac, random_seed=random_seed
)
print(f"X_train_ds: {X_train.shape}, y_train_ds: {y_train.shape}, X_test_ds: {X_test.shape}, y_test_ds: {y_test.shape}")

# data_frac is reset to None for to_client (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "dataset_4_iid_bal_5cl_2", labels)

X_train: (240236, 23), y_train: (240236,), X_test: (102959, 23), y_test: (102959,)
X_train_ds: (96094, 23), y_train_ds: (96094,), X_test_ds: (41183, 23), y_test_ds: (41183,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/dataset_4_iid_bal_5cl_2/labels.txt


### 1.5 Dataset 5

In [2]:
# Load dataset 5 through the project-local dataloading module and preview the first rows.
df5 = dataloading.load_data5()
df5.head()

Unnamed: 0,activity,subject,lw_x,lw_y,lw_z,lh_x,lh_y,lh_z,la_x,la_y,la_z,ra_x,ra_y,ra_z
35216,clapping,0,0.094,0.703,-0.703,0.293,-0.926,-0.25,-0.152,-0.953,0.117,-0.156,0.992,0.082
35217,clapping,0,0.172,0.699,-0.73,0.293,-0.922,-0.25,-0.16,-0.953,0.113,-0.156,0.992,0.086
35218,clapping,0,0.258,0.695,-0.762,0.293,-0.922,-0.254,-0.16,-0.953,0.113,-0.156,1.0,0.078
35219,clapping,0,0.348,0.711,-0.793,0.289,-0.918,-0.254,-0.156,-0.957,0.113,-0.152,1.004,0.074
35220,clapping,0,0.438,0.734,-0.797,0.289,-0.922,-0.254,-0.156,-0.957,0.117,-0.156,1.008,0.078


In [3]:
# Drop the subject column so only sensor features and 'activity' remain.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
df5 = df5.drop(columns=["subject"], errors="ignore")

In [5]:
# Dataset 5 (seed 42): split, downsample with frac=0.25, partition, save.
random_seed = 42
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df5,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# Downsample train and test with frac=0.25 before partitioning.
data_frac = 0.25
X_train, y_train, X_test, y_test = dataloading.downsample_train_test_split(
    X_train, y_train, X_test, y_test, frac=data_frac, random_seed=random_seed
)
print(f"X_train_ds: {X_train.shape}, y_train_ds: {y_train.shape}, X_test_ds: {X_test.shape}, y_test_ds: {y_test.shape}")

# data_frac is reset to None for to_client (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "dataset_5_iid_bal_5cl_3", labels)

X_train: (4191814, 12), y_train: (4191814,), X_test: (1796492, 12), y_test: (1796492,)
X_train_ds: (1047953, 12), y_train_ds: (1047953,), X_test_ds: (449123, 12), y_test_ds: (449123,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/dataset_5_iid_bal_5cl_3/labels.txt


In [4]:
# Dataset 5 (seed 1337): split, downsample with frac=0.15, partition, save.
random_seed = 1337
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df5,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# Downsample train and test with frac=0.15 before partitioning.
data_frac = 0.15
X_train, y_train, X_test, y_test = dataloading.downsample_train_test_split(
    X_train, y_train, X_test, y_test, frac=data_frac, random_seed=random_seed
)
print(f"X_train_ds: {X_train.shape}, y_train_ds: {y_train.shape}, X_test_ds: {X_test.shape}, y_test_ds: {y_test.shape}")

# data_frac is reset to None for to_client (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "dataset_5_iid_bal_5cl_2", labels)

X_train: (4191814, 12), y_train: (4191814,), X_test: (1796492, 12), y_test: (1796492,)
X_train_ds: (628772, 12), y_train_ds: (628772,), X_test_ds: (269473, 12), y_test_ds: (269473,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/dataset_5_iid_bal_5cl_2/labels.txt


### 1.6 Dataset 6

In [2]:
# Load dataset 6; the loader prints the sensor column groups and progress messages.
df6 = dataloading.load_data6()

[['attitude.roll', 'attitude.pitch', 'attitude.yaw'], ['gravity.x', 'gravity.y', 'gravity.z'], ['rotationRate.x', 'rotationRate.y', 'rotationRate.z'], ['userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z']]
[INFO] -- Data subjects' information is imported.
[INFO] -- Creating Time-Series


In [3]:
# Report the shape of dataset 6 and preview the first rows.
print(df6.shape)
df6.head()

(1412865, 14)


Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,activity,subject
0,1.528132,-0.733896,0.696372,0.741895,0.669768,-0.031672,0.316738,0.77818,1.082764,0.294894,-0.184493,0.377542,downstairs,0.0
1,1.527992,-0.716987,0.677762,0.753099,0.657116,-0.032255,0.842032,0.424446,0.643574,0.219405,0.035846,0.114866,downstairs,0.0
2,1.527765,-0.706999,0.670951,0.759611,0.649555,-0.032707,-0.138143,-0.040741,0.343563,0.010714,0.134701,-0.167808,downstairs,0.0
3,1.516768,-0.704678,0.675735,0.760709,0.647788,-0.04114,-0.025005,-1.048717,0.03586,-0.008389,0.136788,0.094958,downstairs,0.0
4,1.493941,-0.703918,0.672994,0.760062,0.64721,-0.05853,0.114253,-0.91289,0.047341,0.199441,0.353996,-0.044299,downstairs,0.0


In [4]:
# Drop the subject column so only sensor features and 'activity' remain.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
df6 = df6.drop(columns="subject", errors="ignore")

In [5]:
# Dataset 6, run 1 (seed 42): split, downsample with frac=0.15, partition, save.
random_seed = 42
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df6,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# Downsample train and test with frac=0.15 before partitioning.
data_frac = 0.15
X_train, y_train, X_test, y_test = dataloading.downsample_train_test_split(
    X_train, y_train, X_test, y_test, frac=data_frac, random_seed=random_seed
)
print(f"X_train_ds: {X_train.shape}, y_train_ds: {y_train.shape}, X_test_ds: {X_test.shape}, y_test_ds: {y_test.shape}")

# data_frac is reset to None for to_client (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "dataset_6_iid_bal_5cl_1", labels)

X_train: (989005, 12), y_train: (989005,), X_test: (423860, 12), y_test: (423860,)
X_train_ds: (148350, 12), y_train_ds: (148350,), X_test_ds: (63579, 12), y_test_ds: (63579,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/dataset_6_iid_bal_5cl_1/labels.txt


In [6]:
# Dataset 6, run 2 (seed 1337): same pipeline as run 1 with a different seed.
random_seed = 1337
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df6,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# Downsample train and test with frac=0.15 before partitioning.
data_frac = 0.15
X_train, y_train, X_test, y_test = dataloading.downsample_train_test_split(
    X_train, y_train, X_test, y_test, frac=data_frac, random_seed=random_seed
)
print(f"X_train_ds: {X_train.shape}, y_train_ds: {y_train.shape}, X_test_ds: {X_test.shape}, y_test_ds: {y_test.shape}")

# data_frac is reset to None for to_client (alternative kept for reference: list(np.ones(5) * 0.5)).
data_frac = None
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
    only_one_test_data=True,
    data_frac=data_frac,
    random_seed=random_seed,
)
dataloading.save_client_data(client_data, "dataset_6_iid_bal_5cl_2", labels)

X_train: (989005, 12), y_train: (989005,), X_test: (423860, 12), y_test: (423860,)
X_train_ds: (148350, 12), y_train_ds: (148350,), X_test_ds: (63579, 12), y_test_ds: (63579,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/dataset_6_iid_bal_5cl_2/labels.txt


## 2. Synthetic non-IID: Label Skew

### Try it with dataset 6

In [2]:
# Reload dataset 6 for the label-skew experiments.
df6 = dataloading.load_data6()

[['attitude.roll', 'attitude.pitch', 'attitude.yaw'], ['gravity.x', 'gravity.y', 'gravity.z'], ['rotationRate.x', 'rotationRate.y', 'rotationRate.z'], ['userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z']]
[INFO] -- Data subjects' information is imported.
[INFO] -- Creating Time-Series


In [3]:
# Drop the subject column so only sensor features and 'activity' remain.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
df6 = df6.drop(columns="subject", errors="ignore")

In [12]:
# Label-skew base data (dataset 6, seed 42): split, then downsample with frac=0.15.
random_seed = 42
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df6,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

data_frac = 0.15
X_train, y_train, X_test, y_test = dataloading.downsample_train_test_split(
    X_train, y_train, X_test, y_test, frac=data_frac, random_seed=random_seed
)
print(f"X_train_ds: {X_train.shape}, y_train_ds: {y_train.shape}, X_test_ds: {X_test.shape}, y_test_ds: {y_test.shape}")

X_train: (989005, 12), y_train: (989005,), X_test: (423860, 12), y_test: (423860,)
X_train_ds: (148350, 12), y_train_ds: (148350,), X_test_ds: (63579, 12), y_test_ds: (63579,)


In [13]:
# Partition the split across 5 clients, then balance each client's data and
# re-imbalance it according to a per-client class ratio.
# NOTE(review): unlike the IID cells, no random_seed is passed to to_client
# here -- confirm whether its default is deterministic.
client_data = dataloading.to_client(data=(X_train, y_train, X_test, y_test), max_clients=5)

# One row per client, one entry per class (6 classes). A ratio of 0.0 removes
# that class from the client's training data; 1.0 presumably keeps all of it
# (see dataloading.class_imbalance). Each client here is missing one class.
# For a run without label skew, set every entry to 1.0.
class_ratio_list = [
    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    [1.0, 0.0, 1.0, 1.0, 1.0, 1.0],
    [1.0, 1.0, 0.0, 1.0, 1.0, 1.0],
    [1.0, 1.0, 1.0, 0.0, 1.0, 1.0],
    [1.0, 1.0, 1.0, 1.0, 0.0, 1.0],
]

client_data_im = []
# The loop uses client-local names (c_*) on purpose: unpacking into
# X_train/y_train/X_test/y_test would rebind the globals that to_client() reads
# at the top of this cell, so re-running the cell would silently partition the
# last client's subset instead of the full split.
for idx, (c_X_train, c_y_train, c_X_test, c_y_test) in enumerate(client_data):
    class_ratio = class_ratio_list[idx]
    X_train_im, y_train_im, X_test_im, y_test_im = dataloading.class_imbalance(
        (c_X_train, c_y_train, c_X_test, c_y_test), class_ratio, balance=True
    )
    dataloading.print_balance(y_train_im, y_test_im)
    client_data_im.append((X_train_im, y_train_im, X_test_im, y_test_im))

y_train counts: [7235 7235    0 7235 7235 7235] ratio: [0.2 0.2 0.  0.2 0.2 0.2]
y_test counts: [15550 15550 15550 15550 15550 15550] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [7162    0 7162 7162 7162 7162] ratio: [0.2 0.  0.2 0.2 0.2 0.2]
y_test counts: [15550 15550 15550 15550 15550 15550] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [7153 7153    0 7153 7153 7153] ratio: [0.2 0.2 0.  0.2 0.2 0.2]
y_test counts: [15550 15550 15550 15550 15550 15550] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [7175 7175 7175    0 7175 7175] ratio: [0.2 0.2 0.2 0.  0.2 0.2]
y_test counts: [15550 15550 15550 15550 15550 15550] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [7150 7150 7150 7150    0 7150] ratio: [0.2 0.2 0.2 0.2 0.  0.2]
y_test counts: [15550 15550 15550 15550 15550 15550] ratio: [0.16666667 0.16666667 

In [14]:
# Save the label-skewed client partitions and the label list under the name "test_2".
dataloading.save_client_data(client_data_im,"test_2", labels)

Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/test_2/labels.txt


### Try it with dataset 1: it may be a better candidate

In [2]:
# Reload dataset 1 for the label-skew experiments and preview the first rows.
df1 = dataloading.load_data1()
df1.head()

Unnamed: 0,AG-X,AG-Y,AG-Z,Acc-X,Acc-Y,Acc-Z,Gravity-X,Gravity-Y,Gravity-Z,RR-X,RR-Y,RR-Z,RV-X,RV-Y,RV-Z,cos,activity
0,-9.494311,5.423363,0.196757,-1.684995,0.401508,1.334339,-8.855526,4.199663,-0.335983,-0.762366,-0.750471,1.10906,0.232662,0.672025,0.101603,0.695649,Running
1,-8.890651,2.520043,0.96331,-0.895032,-1.936677,1.436671,-8.399648,5.06081,0.067106,-0.613229,-0.121966,1.906942,0.245168,0.66066,0.131849,0.697164,Running
2,-5.6328,1.581015,0.230293,2.350318,-3.642859,-1.169414,-7.911231,5.790931,0.21893,-0.142383,-0.352063,1.759936,0.264845,0.648175,0.163016,0.695091,Running
3,-6.322698,6.228244,-0.756644,0.812825,-0.722664,-0.334024,-7.523226,6.280006,0.36466,-0.148775,-0.382955,1.208129,0.29955,0.624335,0.19508,0.694564,Running
4,-7.640211,4.862821,1.71549,0.34967,-0.468074,-0.37878,-6.921238,6.909286,0.727059,-1.047857,0.379773,0.828895,0.310003,0.607825,0.220415,0.697039,Running


In [5]:
# Label-skew base data (dataset 1, seed 42): split, then partition across 5 clients.
random_seed = 42
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df1,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=5,
)

X_train: (65938, 16), y_train: (65938,), X_test: (28260, 16), y_test: (28260,)


In [6]:
# Balance each client's data, then re-imbalance it with a per-client class ratio.
# Each variant holds one row per client and one entry per class (6 classes);
# a ratio of 0.0 removes that class from the client's training data.
CLASS_RATIO_VARIANTS = {
    # 1/6 missing: every client lacks exactly one class.
    "1_of_6_missing": [
        [0.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        [1.0, 0.0, 1.0, 1.0, 1.0, 1.0],
        [1.0, 1.0, 0.0, 1.0, 1.0, 1.0],
        [1.0, 1.0, 1.0, 0.0, 1.0, 1.0],
        [1.0, 1.0, 1.0, 1.0, 0.0, 1.0],
    ],
    # 3/6 missing: clients alternate between the odd and the even classes.
    "3_of_6_missing": [
        [0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
        [1.0, 0.0, 1.0, 0.0, 1.0, 0.0],
        [0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
        [1.0, 0.0, 1.0, 0.0, 1.0, 0.0],
        [0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
    ],
    # 5/6 missing: one class per client. With 6 classes and 5 clients the last
    # client keeps two classes (4 and 5) so that every class is still present.
    "5_of_6_missing": [
        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 1.0, 1.0],
    ],
}

# Select the variant to generate; swap the key instead of commenting code in/out.
class_ratio_list = CLASS_RATIO_VARIANTS["5_of_6_missing"]

client_data_im = []
# Client-local names (c_*) keep the loop from rebinding the global
# X_train/y_train/X_test/y_test produced by the split cell.
for idx, (c_X_train, c_y_train, c_X_test, c_y_test) in enumerate(client_data):
    class_ratio = class_ratio_list[idx]
    X_train_im, y_train_im, X_test_im, y_test_im = dataloading.class_imbalance(
        (c_X_train, c_y_train, c_X_test, c_y_test), class_ratio, balance=True
    )
    dataloading.print_balance_2(y_train_im, y_test_im)
    client_data_im.append((X_train_im, y_train_im, X_test_im, y_test_im))

Train label distribution: 0.0: 3593 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 
Train label distribution: 1.0: 3571 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 
Train label distribution: 2.0: 3485 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 
Train label distribution: 3.0: 3516 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 
Train label distribution: 4.0: 3611  5.0: 3611 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 


In [30]:
# Save the label-skewed client partitions and the label list under the name "exp_2_2_1".
dataloading.save_client_data(client_data_im,"exp_2_2_1", labels)

Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/exp_2_2_1/labels.txt


## 3. Quantity Skew

In [10]:
# Reload dataset 1 for the quantity-skew experiments and preview the first rows.
df1 = dataloading.load_data1()
df1.head()

Unnamed: 0,AG-X,AG-Y,AG-Z,Acc-X,Acc-Y,Acc-Z,Gravity-X,Gravity-Y,Gravity-Z,RR-X,RR-Y,RR-Z,RV-X,RV-Y,RV-Z,cos,activity
0,-9.494311,5.423363,0.196757,-1.684995,0.401508,1.334339,-8.855526,4.199663,-0.335983,-0.762366,-0.750471,1.10906,0.232662,0.672025,0.101603,0.695649,Running
1,-8.890651,2.520043,0.96331,-0.895032,-1.936677,1.436671,-8.399648,5.06081,0.067106,-0.613229,-0.121966,1.906942,0.245168,0.66066,0.131849,0.697164,Running
2,-5.6328,1.581015,0.230293,2.350318,-3.642859,-1.169414,-7.911231,5.790931,0.21893,-0.142383,-0.352063,1.759936,0.264845,0.648175,0.163016,0.695091,Running
3,-6.322698,6.228244,-0.756644,0.812825,-0.722664,-0.334024,-7.523226,6.280006,0.36466,-0.148775,-0.382955,1.208129,0.29955,0.624335,0.19508,0.694564,Running
4,-7.640211,4.862821,1.71549,0.34967,-0.468074,-0.37878,-6.921238,6.909286,0.727059,-1.047857,0.379773,0.828895,0.310003,0.607825,0.220415,0.697039,Running


In [11]:
# Quantity-skew base data (dataset 1, seed 42): split with test_size=0.3,
# scaler_type="standard" and label mapping enabled.
random_seed = 42
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df1,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (65938, 16), y_train: (65938,), X_test: (28260, 16), y_test: (28260,)


In [12]:
# Quantity skew: split the training data across the clients, then keep the same
# fraction of every class on every client (balance first, then subsample).
n_clients = 5
n_classes = 6

# Fraction of each class kept per client; the experiment variants use 0.6, 0.3 and 0.1.
data_fraction = 0.1

client_data = dataloading.to_client(data=(X_train,y_train,X_test,y_test),max_clients=n_clients)

# One row per client, one ratio per class.
class_ratio_list = [[data_fraction] * n_classes for _ in range(n_clients)]

client_data_im = []
# The loop variables are named differently from the global X_train/y_train/X_test/y_test
# on purpose: reusing those names would overwrite the full split consumed by to_client
# above and make this cell give different results when it is re-run.
for idx, (X_tr, y_tr, X_te, y_te) in enumerate(client_data):
    class_ratio = class_ratio_list[idx]
    X_train_im, y_train_im, X_test_im, y_test_im = dataloading.class_imbalance((X_tr, y_tr, X_te, y_te), class_ratio, balance=True)
    dataloading.print_balance_2(y_train_im, y_test_im)
    client_data_im.append((X_train_im, y_train_im, X_test_im, y_test_im))

Train label distribution: 0.0: 359  1.0: 359  2.0: 359  3.0: 359  4.0: 359  5.0: 359 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 
Train label distribution: 0.0: 357  1.0: 357  2.0: 357  3.0: 357  4.0: 357  5.0: 357 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 
Train label distribution: 0.0: 348  1.0: 348  2.0: 348  3.0: 348  4.0: 348  5.0: 348 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 
Train label distribution: 0.0: 352  1.0: 352  2.0: 352  3.0: 352  4.0: 352  5.0: 352 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 
Train label distribution: 0.0: 361  1.0: 361  2.0: 361  3.0: 361  4.0: 361  5.0: 361 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 


In [40]:
# Persist the quantity-skew client partitions and the label names under "exp_3_3_1".
dataloading.save_client_data(client_data_im,"exp_3_3_1", labels)

Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/exp_3_3_1/labels.txt


### 4. Mix Skew (Label And Quantity)

In [2]:
# Load Dataset 1 and preview its first rows (sensor features plus an `activity` label column).
df1 = dataloading.load_data1()
df1.head()

Unnamed: 0,AG-X,AG-Y,AG-Z,Acc-X,Acc-Y,Acc-Z,Gravity-X,Gravity-Y,Gravity-Z,RR-X,RR-Y,RR-Z,RV-X,RV-Y,RV-Z,cos,activity
0,-9.494311,5.423363,0.196757,-1.684995,0.401508,1.334339,-8.855526,4.199663,-0.335983,-0.762366,-0.750471,1.10906,0.232662,0.672025,0.101603,0.695649,Running
1,-8.890651,2.520043,0.96331,-0.895032,-1.936677,1.436671,-8.399648,5.06081,0.067106,-0.613229,-0.121966,1.906942,0.245168,0.66066,0.131849,0.697164,Running
2,-5.6328,1.581015,0.230293,2.350318,-3.642859,-1.169414,-7.911231,5.790931,0.21893,-0.142383,-0.352063,1.759936,0.264845,0.648175,0.163016,0.695091,Running
3,-6.322698,6.228244,-0.756644,0.812825,-0.722664,-0.334024,-7.523226,6.280006,0.36466,-0.148775,-0.382955,1.208129,0.29955,0.624335,0.19508,0.694564,Running
4,-7.640211,4.862821,1.71549,0.34967,-0.468074,-0.37878,-6.921238,6.909286,0.727059,-1.047857,0.379773,0.828895,0.310003,0.607825,0.220415,0.697039,Running


In [5]:
# Fixed seed so the 70/30 split below is reproducible.
random_seed = 42

# Standard-scaled train/test split of Dataset 1 with labels mapped by the loader.
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df1,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (65938, 16), y_train: (65938,), X_test: (28260, 16), y_test: (28260,)


In [6]:
# Mixed skew: split the training data across 5 clients, then give each client only a
# subset of the classes AND only a fraction of the samples of those classes.
client_data = dataloading.to_client(data=(X_train,y_train,X_test,y_test),max_clients=5)

# Alternative variant (kept for reference): roughly half of the labels per client, 30% of the data
# class_ratio_list = [
#     [0.3, 0.0, 0.3, 0.0, 0.3, 0.3],
#     [0.0, 0.3, 0.0, 0.3, 0.0, 0.3],
#     [0.3, 0.0, 0.3, 0.0, 0.3, 0.0],
#     [0.0, 0.3, 0.0, 0.3, 0.0, 0.3],
#     [0.3, 0.0, 0.3, 0.0, 0.3, 0.0]
# ]

# Active variant: one label per client at 10% of the data. Rows are clients, columns are
# classes 0..5; the last client holds classes 4 and 5 so that all six classes are covered.
class_ratio_list = [
    [0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.1, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.1, 0.1]
]

client_data_im = []
# The loop variables are named differently from the global X_train/y_train/X_test/y_test
# on purpose: reusing those names would overwrite the full split consumed by to_client
# above and make this cell give different results when it is re-run.
for idx, (X_tr, y_tr, X_te, y_te) in enumerate(client_data):
    class_ratio = class_ratio_list[idx]
    X_train_im, y_train_im, X_test_im, y_test_im = dataloading.class_imbalance((X_tr, y_tr, X_te, y_te), class_ratio, balance=True)
    dataloading.print_balance_2(y_train_im, y_test_im)
    client_data_im.append((X_train_im, y_train_im, X_test_im, y_test_im))

Train label distribution: 0.0: 359 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 
Train label distribution: 1.0: 357 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 
Train label distribution: 2.0: 348 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 
Train label distribution: 3.0: 352 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 
Train label distribution: 4.0: 361  5.0: 361 
Test label distribution: 0.0: 7490  1.0: 7490  2.0: 7490  3.0: 7490  4.0: 7490  5.0: 7490 


In [11]:
# Persist the mixed-skew client partitions and the label names under "exp_4_2_1".
dataloading.save_client_data(client_data_im,"exp_4_2_1", labels)

Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/exp_4_2_1/labels.txt


### 5. Client Scaling with Real non-IID data

### Dataset 1: splitting into clients by subject ID

In [2]:
def load_harsense_subjects(base_dir, n_subjects=12):
    """
    Read the per-subject HARSense CSV files into memory.

    Files are expected to be named sub1.csv, sub2.csv, ... inside `base_dir`
    and are read in increasing subject order.

    Parameters:
        base_dir (str): Folder that holds the sub<i>.csv files.
        n_subjects (int): How many subjects to read, starting at subject 1 (default 12).

    Returns:
        dict: subject ID (1..n_subjects) -> pandas DataFrame with that subject's rows.

    Raises:
        FileNotFoundError: as soon as an expected sub<i>.csv is not a regular file.
    """
    frames_by_subject = {}
    for subject_id in range(1, n_subjects + 1):
        csv_path = os.path.join(base_dir, f"sub{subject_id}.csv")
        if os.path.isfile(csv_path):
            frames_by_subject[subject_id] = pd.read_csv(csv_path)
            continue
        # Fail fast: a missing subject file would silently shrink the client set.
        raise FileNotFoundError(f"File not found: {csv_path}")
    return frames_by_subject

In [3]:
def normalize_subject_labels(subject_dfs, canonical_labels, label_col="activity"):
    """
    Map each subject's activity labels onto a canonical spelling, case-insensitively.

    Rows whose label does not match any canonical label (ignoring case), or whose
    label is missing, are reported on stdout and dropped. The input DataFrames are
    not modified.

    Parameters:
        subject_dfs (dict): subject ID -> DataFrame containing `label_col`.
        canonical_labels (iterable of str): The spellings to normalize to.
        label_col (str): Name of the label column (default "activity").

    Returns:
        dict: subject ID -> new DataFrame with `label_col` replaced by the canonical
        spelling and unmatched rows removed.
    """
    # Build a lowercase→canonical mapping
    mapping = {lbl.lower(): lbl for lbl in canonical_labels}

    cleaned = {}
    for sid, df in subject_dfs.items():
        # Keep a handle on the untouched labels so the report can show what was
        # actually in the data; after mapping, every unknown label is just NaN.
        raw_labels = df[label_col]

        # Lowercase first, then map to canonical (NaN for unknowns)
        normalized = raw_labels.str.lower().map(mapping)
        unknown = normalized.isna()

        df = df.copy()
        df[label_col] = normalized

        # Report and drop any rows that failed to map
        if unknown.any():
            bad_vals = raw_labels[unknown].unique()
            print(f"bad values: {bad_vals}")
            print(f"Subject {sid}: dropping {int(unknown.sum())} rows with unknown labels {bad_vals}")
            df = df[~unknown]

        cleaned[sid] = df

    return cleaned

In [4]:
# Folder holding the per-subject HARSense CSV files (sub1.csv ... sub12.csv).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
base_dir = "/Users/admin/Desktop/thesis/dataset/Data_1_HARSense_Statistical_Human_Activity_Recognition/HARSense"
raw_subjects = load_harsense_subjects(base_dir)
# Canonical label spellings; raw labels are matched against these case-insensitively.
# NOTE(review): this cell's output reports rows dropped for subjects 11 and 12 —
# check whether those files spell a label differently (e.g. the stairs activities).
canonical = ['Running','Sitting','Standing','Walking','downstaires','upstaires']
subject_dfs = normalize_subject_labels(raw_subjects, canonical)

bad values: [nan]
Subject 11: dropping 977 rows with unknown labels [nan]
bad values: [nan]
Subject 12: dropping 1400 rows with unknown labels [nan]


In [5]:
def prepare_federated_data(subject_dfs, test_size=0.2, scaler_type="standard", random_seed=42):
    """
    Build one training partition per subject plus a single shared test set.

    All subjects are pooled, labels are integer-encoded, and one stratified
    train/test split is made on the pooled rows. The scaler is fitted on the
    pooled training features only and then applied to the test set and to each
    subject's share of the training rows.

    Parameters:
        subject_dfs (dict): subject ID -> DataFrame with an "activity" column;
            every other column is treated as a feature.
        test_size: Passed to sklearn's train_test_split (default 0.2).
        scaler_type (str): "minmax" selects MinMaxScaler; any other value
            selects StandardScaler.
        random_seed (int): Random state of the split (default 42).

    Returns:
        tuple: (client_data, X_test, y_test, le, scaler) where client_data is a
        list of (X_train, y_train) pairs ordered by sorted subject ID, features
        are float32, labels are int64, `le` is the fitted LabelEncoder and
        `scaler` the fitted scaler.
    """
    non_feature_cols = ("activity", "activity_enc", "subject_id")

    # Pool every subject into one frame, tagging rows with their subject ID.
    tagged = [frame.assign(subject_id=sid) for sid, frame in subject_dfs.items()]
    big_df = pd.concat(tagged, ignore_index=True)

    # Integer-encode the activity names over the pooled data.
    le = preprocessing.LabelEncoder()
    big_df["activity_enc"] = le.fit_transform(big_df["activity"].values)

    # Single split over all subjects, stratified by activity.
    train_df, test_df = model_selection.train_test_split(
        big_df,
        test_size=test_size,
        random_state=random_seed,
        shuffle=True,
        stratify=big_df["activity_enc"],
    )

    # Scaler statistics come from the pooled training rows only.
    feature_cols = [col for col in big_df.columns if col not in non_feature_cols]
    scaler = MinMaxScaler() if scaler_type == "minmax" else StandardScaler()
    scaler.fit(train_df[feature_cols].values)

    def _to_arrays(frame):
        # Scale the features and pull out the encoded labels of one frame.
        features = scaler.transform(frame[feature_cols].values).astype(np.float32)
        targets = frame["activity_enc"].values.astype(np.int64)
        return features, targets

    X_test, y_test = _to_arrays(test_df)

    # One (X, y) pair per subject, taken from that subject's training rows.
    client_data = [
        _to_arrays(train_df[train_df["subject_id"] == sid])
        for sid in sorted(subject_dfs.keys())
    ]

    return client_data, X_test, y_test, le, scaler

In [6]:
# Per-subject HARSense training partitions plus one shared, standard-scaled
# test set holding 30% of the pooled rows.
train_data, X_test, y_test, le, scaler = prepare_federated_data(
    subject_dfs=subject_dfs,
    test_size=0.3,
    scaler_type="standard",
    random_seed=42
)

In [7]:
# Pair every subject's training arrays with the shared test set:
# one (X_train, y_train, X_test, y_test) tuple per client.
client_data = [(X_tr, y_tr, X_test, y_test) for X_tr, y_tr in train_data]

In [8]:
# Class names held by the fitted LabelEncoder (position i names encoded label i).
labels = le.classes_
# Persist the per-subject client partitions and the label names under "exp_5_1_1".
dataloading.save_client_data(client_data,"exp_5_1_1", labels)
print(labels)

Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/exp_5_1_1/labels.txt


### Dataset 1: splitting into clients without respecting subject ID

In [5]:
# Load Dataset 1 and preview its first rows (sensor features plus an `activity` label column).
df1 = dataloading.load_data1()
df1.head()

Unnamed: 0,AG-X,AG-Y,AG-Z,Acc-X,Acc-Y,Acc-Z,Gravity-X,Gravity-Y,Gravity-Z,RR-X,RR-Y,RR-Z,RV-X,RV-Y,RV-Z,cos,activity
0,-9.494311,5.423363,0.196757,-1.684995,0.401508,1.334339,-8.855526,4.199663,-0.335983,-0.762366,-0.750471,1.10906,0.232662,0.672025,0.101603,0.695649,Running
1,-8.890651,2.520043,0.96331,-0.895032,-1.936677,1.436671,-8.399648,5.06081,0.067106,-0.613229,-0.121966,1.906942,0.245168,0.66066,0.131849,0.697164,Running
2,-5.6328,1.581015,0.230293,2.350318,-3.642859,-1.169414,-7.911231,5.790931,0.21893,-0.142383,-0.352063,1.759936,0.264845,0.648175,0.163016,0.695091,Running
3,-6.322698,6.228244,-0.756644,0.812825,-0.722664,-0.334024,-7.523226,6.280006,0.36466,-0.148775,-0.382955,1.208129,0.29955,0.624335,0.19508,0.694564,Running
4,-7.640211,4.862821,1.71549,0.34967,-0.468074,-0.37878,-6.921238,6.909286,0.727059,-1.047857,0.379773,0.828895,0.310003,0.607825,0.220415,0.697039,Running


In [6]:
# try1
# Fixed seed, reused for both the train/test split and the client partitioning.
random_seed = 42
# 70/30 standard-scaled split of Dataset 1.
X_train,y_train,X_test,y_test,labels = dataloading.train_test_split(df1,test_size=0.3, scaler_type="standard", should_map_labels=True, random_seed=random_seed)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# data_frac is passed through as None; presumably "use all training data" — confirm in dataloading.to_client.
data_frac = None
# Partition into 12 clients (the same count as the HARSense subjects) with one test set requested.
client_data = dataloading.to_client(data=(X_train,y_train,X_test,y_test),max_clients=12, only_one_test_data=True, data_frac=data_frac, random_seed=random_seed)
# NOTE(review): "exp_5_1_1" is also the name used when saving the subject-wise split earlier in
# this section, so running both cells writes to the same experiment folder — confirm which one is intended.
dataloading.save_client_data(client_data,"exp_5_1_1", labels)

X_train: (65938, 16), y_train: (65938,), X_test: (28260, 16), y_test: (28260,)
Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/exp_5_1_1/labels.txt


### Dataset 6

In [6]:
# Load Dataset 6 (12 sensor features plus `activity` and `subject` columns, per the preview below).
df6 = dataloading.load_data6()

[['attitude.roll', 'attitude.pitch', 'attitude.yaw'], ['gravity.x', 'gravity.y', 'gravity.z'], ['rotationRate.x', 'rotationRate.y', 'rotationRate.z'], ['userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z']]
[INFO] -- Data subjects' information is imported.
[INFO] -- Creating Time-Series


In [3]:
# Preview the first rows of Dataset 6.
df6.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,activity,subject
0,1.528132,-0.733896,0.696372,0.741895,0.669768,-0.031672,0.316738,0.77818,1.082764,0.294894,-0.184493,0.377542,downstairs,0.0
1,1.527992,-0.716987,0.677762,0.753099,0.657116,-0.032255,0.842032,0.424446,0.643574,0.219405,0.035846,0.114866,downstairs,0.0
2,1.527765,-0.706999,0.670951,0.759611,0.649555,-0.032707,-0.138143,-0.040741,0.343563,0.010714,0.134701,-0.167808,downstairs,0.0
3,1.516768,-0.704678,0.675735,0.760709,0.647788,-0.04114,-0.025005,-1.048717,0.03586,-0.008389,0.136788,0.094958,downstairs,0.0
4,1.493941,-0.703918,0.672994,0.760062,0.64721,-0.05853,0.114253,-0.91289,0.047341,0.199441,0.353996,-0.044299,downstairs,0.0


In [8]:
def prepare_subject_client_data(
    df,
    subject_col="subject",
    label_col="activity",
    test_size=0.2,
    scaler_type="standard",
    random_seed=42
):
    """
    Build one training partition per subject plus a single shared test set.

    Labels are integer-encoded, one stratified train/test split is made over
    all rows, and the scaler is fitted on the training features only. The
    caller's DataFrame is left untouched.

    Parameters:
        df (pandas.DataFrame): Rows with `subject_col`, `label_col` and feature
            columns; every other column is treated as a feature.
        subject_col (str): Column identifying the subject (default "subject").
        label_col (str): Column holding the activity label (default "activity").
        test_size: Passed to sklearn's train_test_split (default 0.2).
        scaler_type (str): "minmax" selects MinMaxScaler; any other value
            selects StandardScaler.
        random_seed (int): Random state of the split (default 42).

    Returns:
        tuple: (client_data, X_test, y_test, le, scaler) where client_data is a
        list of (X_train, y_train) pairs, one per subject value present in the
        training split in sorted order; features and labels are float32, `le`
        is the fitted LabelEncoder and `scaler` the fitted scaler.
    """
    # 1) Encode labels on a new frame. Assigning the column onto `df` itself would
    #    leave a "label_enc" column behind in the caller's DataFrame, where later
    #    code could pick it up as a feature.
    le = preprocessing.LabelEncoder()
    df = df.assign(label_enc=le.fit_transform(df[label_col].values))

    # 2) Global train/test split
    train_df, test_df = model_selection.train_test_split(
        df,
        test_size=test_size,
        random_state=random_seed,
        shuffle=True,
        stratify=df["label_enc"]
    )

    # 3) Fit scaler on global train features
    feature_cols = [c for c in df.columns if c not in (subject_col, label_col, "label_enc")]
    if scaler_type == "minmax":
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()
    scaler.fit(train_df[feature_cols].values)

    # 4) Build global test set (labels are stored as float32)
    X_test = scaler.transform(test_df[feature_cols].values).astype(np.float32)
    y_test = test_df["label_enc"].values.astype(np.float32)

    # 5) Partition train by subject
    client_data = []
    for sid in sorted(train_df[subject_col].unique()):
        df_sub = train_df[train_df[subject_col] == sid]
        X_tr = scaler.transform(df_sub[feature_cols].values).astype(np.float32)
        y_tr = df_sub["label_enc"].values.astype(np.float32)
        client_data.append((X_tr, y_tr))

    return client_data, X_test, y_test, le, scaler

In [9]:
# Per-subject training partitions for Dataset 6 plus one shared, standard-scaled
# test set holding 5% of the rows.
train_data, X_test, y_test, le, scaler = prepare_subject_client_data(
    df6,
    subject_col="subject",
    label_col="activity",
    test_size=0.05,
    scaler_type="standard",
    random_seed=42
)

In [10]:
# Put the data into the layout handed to save_client_data: one
# (X_train, y_train, X_test, y_test) tuple per subject, all sharing the same test set.
client_data = [(X_tr, y_tr, X_test, y_test) for X_tr, y_tr in train_data]

# Class names held by the fitted LabelEncoder.
labels = le.classes_
print(labels)

['downstairs' 'jogging' 'sitting' 'standing' 'upstairs' 'walking']


In [11]:
# Persist the per-subject Dataset 6 partitions and the label names under "exp_5".
dataloading.save_client_data(client_data,"exp_5", labels)

Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/exp_5/labels.txt


In [16]:
# Report the training-set size and per-class sample counts of every client.
# Only the training labels are read from each tuple; unpacking into the global
# names X_train/y_train/X_test/y_test would overwrite the arrays other cells use.
for client in client_data:
    client_labels = client[1]
    print("-------------------------------------")
    print(client_labels.shape)
    unique, counts = np.unique(client_labels, return_counts=True)
    class_counts = dict(zip(unique, counts))
    print(class_counts)

-------------------------------------
(59146,)
{0.0: 4857, 1.0: 6109, 2.0: 19562, 3.0: 12109, 4.0: 5981, 5.0: 10528}
-------------------------------------
(59144,)
{0.0: 5579, 1.0: 6097, 2.0: 12263, 3.0: 9765, 4.0: 6862, 5.0: 18578}
-------------------------------------
(59483,)
{0.0: 6019, 1.0: 5668, 2.0: 13534, 3.0: 13608, 4.0: 6223, 5.0: 14431}
-------------------------------------
(53235,)
{0.0: 4828, 1.0: 5887, 2.0: 11548, 3.0: 10024, 4.0: 6143, 5.0: 14805}
-------------------------------------
(49667,)
{0.0: 4747, 1.0: 4128, 2.0: 13227, 3.0: 11793, 4.0: 5303, 5.0: 10469}
-------------------------------------
(54470,)
{0.0: 5457, 1.0: 5014, 2.0: 11267, 3.0: 13588, 4.0: 5142, 5.0: 14002}
-------------------------------------
(58176,)
{0.0: 4767, 1.0: 5612, 2.0: 15492, 3.0: 11262, 4.0: 7072, 5.0: 13971}
-------------------------------------
(57758,)
{0.0: 5529, 1.0: 5884, 2.0: 13657, 3.0: 13011, 4.0: 6764, 5.0: 12913}
-------------------------------------
(54324,)
{0.0: 5225, 1.0: 4

In [17]:
# Size and per-class sample counts of the shared test set.
print(y_test.shape)
unique, counts = np.unique(y_test, return_counts=True)
class_counts = dict(zip(unique, counts))
print(class_counts)

(70644,)
{0.0: 6593, 1.0: 6712, 2.0: 16939, 3.0: 15321, 4.0: 7864, 5.0: 17215}


### Consolidating the data of the 24 subjects into 5 clients

In [8]:
# Assume client_data is a list of 24 tuples: (X_train, y_train, X_test, y_test)
# and that X_test, y_test are identical for all entries.

# Extract the common test set from the first tuple
X_test, y_test = client_data[0][2], client_data[0][3]

# 1) Shuffle subject indices for random assignment
indices = list(range(len(client_data)))
random.seed(42)
random.shuffle(indices)

# 2) Define new group sizes: four groups of 5, one group of 4
group_sizes = [5, 5, 5, 5, 4]

# Guard against running this cell on a client_data from another section: a shorter
# list yields empty groups, a longer one silently leaves subjects out.
if sum(group_sizes) != len(client_data):
    raise ValueError(
        f"group_sizes covers {sum(group_sizes)} subjects but client_data has {len(client_data)} entries"
    )

# 3) Concatenate only the train splits into 5 new clients
new_client_train = []
start = 0
for size in group_sizes:
    group_idxs = indices[start:start + size]
    start += size

    # Concatenate the X_train and y_train of the subjects in this group
    X_group = np.concatenate([client_data[i][0] for i in group_idxs], axis=0)
    y_group = np.concatenate([client_data[i][1] for i in group_idxs], axis=0)

    # Shuffle the training data (same permutation for features and labels)
    perm = np.random.RandomState(42).permutation(len(X_group))
    new_client_train.append((X_group[perm], y_group[perm]))

# new_client_train now contains 5 (X_train, y_train) tuples
# X_test, y_test remain common for evaluation

# Inspect shapes
for idx, (X_tr, y_tr) in enumerate(new_client_train, 1):
    print(f"Client {idx}: X_train={X_tr.shape}, y_train={y_tr.shape}")

print(f"Global test set: X_test={X_test.shape}, y_test={y_test.shape}")

Client 1: X_train=(242191, 12), y_train=(242191,)
Client 2: X_train=(232488, 12), y_train=(232488,)
Client 3: X_train=(232506, 12), y_train=(232506,)
Client 4: X_train=(228150, 12), y_train=(228150,)
Client 5: X_train=(194957, 12), y_train=(194957,)
Global test set: X_test=(282573, 12), y_test=(282573,)


In [9]:
# Pair each consolidated client's training arrays with the shared test set.
client_data_2 = [(X_tr, y_tr, X_test, y_test) for X_tr, y_tr in new_client_train]

In [17]:
# Save the 5 consolidated clients (client_data_2). client_data still holds the
# per-subject partitions, which are already saved separately.
dataloading.save_client_data(client_data_2,"exp_5_3_1", labels)

Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/exp_5_3_1/labels.txt


### 5. Scaling Data IID case

In [8]:
# Reload Dataset 6 and drop the `subject` column in place; subject identity is not
# used in the IID case.
df6 = dataloading.load_data6()
df6.drop('subject', axis=1, inplace=True)

[['attitude.roll', 'attitude.pitch', 'attitude.yaw'], ['gravity.x', 'gravity.y', 'gravity.z'], ['rotationRate.x', 'rotationRate.y', 'rotationRate.z'], ['userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z']]
[INFO] -- Data subjects' information is imported.
[INFO] -- Creating Time-Series


In [11]:
# try1
# Fixed seed so the 80/20 split below is reproducible.
random_seed = 42
# Standard-scaled train/test split of Dataset 6 with labels mapped by the loader.
X_train,y_train,X_test,y_test,labels = dataloading.train_test_split(df6,test_size=0.2, scaler_type="standard", should_map_labels=True, random_seed=random_seed)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (1130292, 12), y_train: (1130292,), X_test: (282573, 12), y_test: (282573,)


In [10]:
# data_frac is passed through as None; presumably "use all training data" — confirm in dataloading.to_client.
data_frac = None
# Partition the split into 5 clients with one test set requested, then persist it under "exp_5_iid_case".
client_data = dataloading.to_client(data=(X_train,y_train,X_test,y_test),max_clients=5, only_one_test_data=True, data_frac=data_frac, random_seed=random_seed)
dataloading.save_client_data(client_data,"exp_5_iid_case", labels)

Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/exp_5_iid_case/labels.txt
