# HAR Non-IID Class Imbalance Experiments
This notebook explores the impact of non-IID data distributions and class imbalance on training classifiers for Human Activity Recognition (HAR). The dataset is split into multiple clients (2, 6, or 35) with varying class ratios to create challenging learning scenarios.

The goal is to analyze how non-IID distributions and aggressive class imbalances affect model performance, making learning more difficult for a classifier.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import dataloading
import os
from sklearn.decomposition import PCA
import torch

In [2]:
# Load the HAR dataset through the project-local `dataloading` helper
# and preview its first rows as the cell output.
df1 = dataloading.load_data1()
df1.head()

Unnamed: 0,AG-X,AG-Y,AG-Z,Acc-X,Acc-Y,Acc-Z,Gravity-X,Gravity-Y,Gravity-Z,RR-X,RR-Y,RR-Z,RV-X,RV-Y,RV-Z,cos,activity
0,-9.494311,5.423363,0.196757,-1.684995,0.401508,1.334339,-8.855526,4.199663,-0.335983,-0.762366,-0.750471,1.10906,0.232662,0.672025,0.101603,0.695649,Running
1,-8.890651,2.520043,0.96331,-0.895032,-1.936677,1.436671,-8.399648,5.06081,0.067106,-0.613229,-0.121966,1.906942,0.245168,0.66066,0.131849,0.697164,Running
2,-5.6328,1.581015,0.230293,2.350318,-3.642859,-1.169414,-7.911231,5.790931,0.21893,-0.142383,-0.352063,1.759936,0.264845,0.648175,0.163016,0.695091,Running
3,-6.322698,6.228244,-0.756644,0.812825,-0.722664,-0.334024,-7.523226,6.280006,0.36466,-0.148775,-0.382955,1.208129,0.29955,0.624335,0.19508,0.694564,Running
4,-7.640211,4.862821,1.71549,0.34967,-0.468074,-0.37878,-6.921238,6.909286,0.727059,-1.047857,0.379773,0.828895,0.310003,0.607825,0.220415,0.697039,Running


In [3]:
#client_data = dataloading.to_client(data=(X_train,y_train,X_test,y_test),max_clients=1)
#dataloading.save_client_data(client_data,"dataset_1_experiment_3")
# let's see the class imbalance across the clients' data
# for X_train, y_train, X_test, y_test in client_data:
#     print("=================")
#     dataloading.print_balance(y_train,y_test)

### experiment 2: introduce class imbalance

In [3]:
# Split the full frame into train/test partitions with test_size=0.2.
# NOTE(review): no seed is passed here — confirm whether the helper fixes one internally.
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df1,
    test_size=0.2,
)
# Report the resulting shapes so the split sizes are visible in the cell output.
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (75358, 16), y_train: (75358,), X_test: (18840, 16), y_test: (18840,)


In [4]:
# Distribute the train/test split across clients (max_clients=2).
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=2,
)
# Saving the un-modified client split is currently disabled:
# dataloading.save_client_data(client_data,"dataset_1_experiment_1")

In [85]:
# We balance the dataset first and then imbalance it as we want.
# One row of ratios per client, one entry per class.
class_ratio_list = [
    [0.1, 0.1, 0.1, 0.8, 0.1, 0.8],
    [0.8, 0.1, 0.2, 0.3, 0.8, 0.8],
]

client_data_im = []
# NOTE(review): the loop targets rebind the notebook-level X_train / y_train /
# X_test / y_test to the last client's split; re-running the `to_client` cell
# after this one would therefore start from that client's data.
for idx, (X_train, y_train, X_test, y_test) in enumerate(client_data):
    # Row `idx` of the ratio table belongs to client `idx`.
    class_ratio = class_ratio_list[idx]
    X_train_im, y_train_im, X_test_im, y_test_im = dataloading.class_imbalance(
        (X_train, y_train, X_test, y_test),
        class_ratio,
        balance=True,
    )
    # Print the per-class counts/ratios of the resulting train and test labels.
    dataloading.print_balance(y_train_im, y_test_im)
    client_data_im.append((X_train_im, y_train_im, X_test_im, y_test_im))

y_train counts: [ 84 168 253 674 674 674] ratio: [0.033241   0.06648199 0.10011872 0.26671943 0.26671943 0.26671943]
y_test counts: [3369 3369 3369 3369 3369 3369] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [718  90 180 269 718 718] ratio: [0.26661716 0.03341998 0.06683996 0.0998886  0.26661716 0.26661716]
y_test counts: [3350 3350 3350 3350 3350 3350] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [659 659  82 165 247 659] ratio: [0.26669365 0.26669365 0.03318495 0.06677459 0.09995953 0.26669365]
y_test counts: [3401 3401 3401 3401 3401 3401] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [718 718 718  90 180 269] ratio: [0.26661716 0.26661716 0.26661716 0.03341998 0.06683996 0.0998886 ]
y_test counts: [3435 3435 3435 3435 3435 3435] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [262 700 700 700  88 175] ra

In [86]:
# Persist the imbalanced per-client splits under the name "dataset_1_experiment_4".
# NOTE(review): the surrounding section is titled "experiment 2" while the saved
# name says experiment_4 — confirm the intended numbering.
dataloading.save_client_data(client_data_im, "dataset_1_experiment_4")

### experiment 3: more aggressive class imbalance

In [3]:
# Experiment 3: split with test_size=0.8, distribute over 6 clients, give each
# client one dominant class, then save the result.
# NOTE(review): no seed is passed to the split — confirm whether the helper fixes one internally.
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(df1, test_size=0.8)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# split data to N = 6 clients
client_data = dataloading.to_client(data=(X_train, y_train, X_test, y_test), max_clients=6)

# We balance the dataset first and then imbalance it as we want.
# One row per client: ratio 0.08 for that client's dominant class, 0.01 for the others.
class_ratio_list = [
    [0.01, 0.01, 0.01, 0.08, 0.01, 0.01],
    [0.01, 0.01, 0.01, 0.01, 0.08, 0.01],
    [0.01, 0.01, 0.01, 0.01, 0.01, 0.08],
    [0.08, 0.01, 0.01, 0.01, 0.01, 0.01],
    [0.01, 0.08, 0.01, 0.01, 0.01, 0.01],
    [0.01, 0.01, 0.08, 0.01, 0.01, 0.01],
    # alternative row left disabled by the author: [0.1, 1, 1, 1, 1, 1]
]

client_data_im = []
for idx, (X_train, y_train, X_test, y_test) in enumerate(client_data):
    # Row `idx` of the ratio table belongs to client `idx`.
    class_ratio = class_ratio_list[idx]
    X_train_im, y_train_im, X_test_im, y_test_im = dataloading.class_imbalance(
        (X_train, y_train, X_test, y_test),
        class_ratio,
        balance=True,
    )
    # Print the per-class counts/ratios of the resulting train and test labels.
    dataloading.print_balance(y_train_im, y_test_im)
    client_data_im.append((X_train_im, y_train_im, X_test_im, y_test_im))

# Persist the imbalanced per-client splits under the name "dataset_1_experiment_5".
dataloading.save_client_data(client_data_im, "dataset_1_experiment_5")

X_train: (18839, 16), y_train: (18839,), X_test: (75359, 16), y_test: (75359,)
y_train counts: [ 8  8  8 67  8  8] ratio: [0.07476636 0.07476636 0.07476636 0.62616822 0.07476636 0.07476636]
y_test counts: [3369 3369 3369 3369 3369 3369] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [ 9  9  9  9 72  9] ratio: [0.07692308 0.07692308 0.07692308 0.07692308 0.61538462 0.07692308]
y_test counts: [3350 3350 3350 3350 3350 3350] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [ 8  8  8  8  8 66] ratio: [0.0754717  0.0754717  0.0754717  0.0754717  0.0754717  0.62264151]
y_test counts: [3401 3401 3401 3401 3401 3401] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [72  9  9  9  9  9] ratio: [0.61538462 0.07692308 0.07692308 0.07692308 0.07692308 0.07692308]
y_test counts: [3435 3435 3435 3435 3435 3435] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0

### experiment 4: there is no motivation to use more than 2 clients; the problem is too easy. Make the class imbalance more severe

In [102]:
# Experiment 4: same pipeline as before (split with test_size=0.8, 6 clients),
# but with a much stronger per-client imbalance (0.8 vs 0.01).
# NOTE(review): no seed is passed to the split — confirm whether the helper fixes one internally.
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df1,
    test_size=0.8,
)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

# split data to N = 6 clients
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=6,
)

# We balance the dataset first and then imbalance it as we want.
# One row per client. The third row keeps every class at 0.8; the one-hot style
# row for class index 2 that would otherwise sit there is left disabled.
class_ratio_list = [
    [0.8, 0.01, 0.01, 0.01, 0.01, 0.01],
    [0.01, 0.8, 0.01, 0.01, 0.01, 0.01],
    # [0.01, 0.01, 0.8, 0.01, 0.01, 0.01],
    [0.8, 0.8, 0.8, 0.8, 0.8, 0.8],
    [0.01, 0.01, 0.01, 0.8, 0.01, 0.01],
    [0.01, 0.01, 0.01, 0.01, 0.8, 0.01],
    [0.01, 0.01, 0.01, 0.01, 0.01, 0.8],
]

client_data_im = []
for idx, (X_train, y_train, X_test, y_test) in enumerate(client_data):
    # Row `idx` of the ratio table belongs to client `idx`.
    class_ratio = class_ratio_list[idx]
    X_train_im, y_train_im, X_test_im, y_test_im = dataloading.class_imbalance(
        (X_train, y_train, X_test, y_test),
        class_ratio,
        balance=True,
    )
    # Print the per-class counts/ratios of the resulting train and test labels.
    dataloading.print_balance(y_train_im, y_test_im)
    client_data_im.append((X_train_im, y_train_im, X_test_im, y_test_im))

# Persist the imbalanced per-client splits under the name "dataset_1_experiment_6".
dataloading.save_client_data(client_data_im, "dataset_1_experiment_6")

X_train: (18839, 16), y_train: (18839,), X_test: (75359, 16), y_test: (75359,)
y_train counts: [674   8   8   8   8   8] ratio: [0.94397759 0.01120448 0.01120448 0.01120448 0.01120448 0.01120448]
y_test counts: [3369 3369 3369 3369 3369 3369] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [  9 718   9   9   9   9] ratio: [0.01179554 0.94102228 0.01179554 0.01179554 0.01179554 0.01179554]
y_test counts: [3350 3350 3350 3350 3350 3350] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [659 659 659 659 659 659] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_test counts: [3401 3401 3401 3401 3401 3401] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [  9   9   9 718   9   9] ratio: [0.01179554 0.01179554 0.01179554 0.94102228 0.01179554 0.01179554]
y_test counts: [3435 3435 3435 3435 3435 3435] ratio: [0.16666667 0.16666667 0.16666667

In [None]:
# experiment 7. Split the dataset to 35 clients and train only 1 client

In [3]:
# Experiment 8: 2 clients, half/half split of the samples; let it run.
# Reload the dataset and preview its first rows as the cell output.
df1 = dataloading.load_data1()
df1.head()

Unnamed: 0,AG-X,AG-Y,AG-Z,Acc-X,Acc-Y,Acc-Z,Gravity-X,Gravity-Y,Gravity-Z,RR-X,RR-Y,RR-Z,RV-X,RV-Y,RV-Z,cos,activity
0,-9.494311,5.423363,0.196757,-1.684995,0.401508,1.334339,-8.855526,4.199663,-0.335983,-0.762366,-0.750471,1.10906,0.232662,0.672025,0.101603,0.695649,Running
1,-8.890651,2.520043,0.96331,-0.895032,-1.936677,1.436671,-8.399648,5.06081,0.067106,-0.613229,-0.121966,1.906942,0.245168,0.66066,0.131849,0.697164,Running
2,-5.6328,1.581015,0.230293,2.350318,-3.642859,-1.169414,-7.911231,5.790931,0.21893,-0.142383,-0.352063,1.759936,0.264845,0.648175,0.163016,0.695091,Running
3,-6.322698,6.228244,-0.756644,0.812825,-0.722664,-0.334024,-7.523226,6.280006,0.36466,-0.148775,-0.382955,1.208129,0.29955,0.624335,0.19508,0.694564,Running
4,-7.640211,4.862821,1.71549,0.34967,-0.468074,-0.37878,-6.921238,6.909286,0.727059,-1.047857,0.379773,0.828895,0.310003,0.607825,0.220415,0.697039,Running


In [4]:
# Split the full frame into train/test partitions with test_size=0.2.
# NOTE(review): no seed is passed here — confirm whether the helper fixes one internally.
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df1,
    test_size=0.2,
)
# Report the resulting shapes so the split sizes are visible in the cell output.
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (75358, 16), y_train: (75358,), X_test: (18840, 16), y_test: (18840,)


In [5]:
# Distribute the train/test split across clients (max_clients=2).
client_data = dataloading.to_client(
    data=(X_train, y_train, X_test, y_test),
    max_clients=2,
)
# Saving the un-modified client split is currently disabled:
# dataloading.save_client_data(client_data,"dataset_1_experiment_1")

In [8]:
# We balance the dataset first and then imbalance it as we want.
# One row per client; the two rows are complementary (0.0 where the other has 1.0).
# NOTE(review): the output saved with this cell shows identical class counts for
# both clients, which does not match these complementary rows — re-run on a
# fresh kernel and confirm how `class_imbalance` interprets the ratios.
class_ratio_list = [
    [0.0, 0.0, 0.0, 1.0, 1.0, 1.0],
    [1.0, 1.0, 1.0, 0.0, 0.0, 0.0],
]

client_data_im = []
# NOTE(review): the loop targets rebind the notebook-level X_train / y_train /
# X_test / y_test to the last client's split; re-running the `to_client` cell
# after this one would therefore start from that client's data.
for idx, (X_train, y_train, X_test, y_test) in enumerate(client_data):
    # Row `idx` of the ratio table belongs to client `idx`.
    class_ratio = class_ratio_list[idx]
    X_train_im, y_train_im, X_test_im, y_test_im = dataloading.class_imbalance(
        (X_train, y_train, X_test, y_test),
        class_ratio,
        balance=True,
    )
    # Print the per-class counts/ratios of the resulting train and test labels.
    dataloading.print_balance(y_train_im, y_test_im)
    client_data_im.append((X_train_im, y_train_im, X_test_im, y_test_im))

# Persist the imbalanced per-client splits under the name "dataset_1_experiment_2".
# NOTE(review): this cell belongs to the run labelled "experiment 8" earlier in the
# notebook, while the saved name says experiment_2 — confirm the intended name.
dataloading.save_client_data(client_data_im, "dataset_1_experiment_2")

y_train counts: [10333     0     0 10333     0 10333] ratio: [0.33333333 0.         0.         0.33333333 0.         0.33333333]
y_test counts: [2489 2489 2489 2489 2489 2489] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [10014     0     0 10014     0 10014] ratio: [0.33333333 0.         0.         0.33333333 0.         0.33333333]
y_test counts: [2471 2471 2471 2471 2471 2471] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
