In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import dataloading
import os
from sklearn.decomposition import PCA
from sklearn import model_selection, preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from collections import Counter
import random

In [2]:
# Load the dataset via the project-local `dataloading` helper and preview
# the first rows (sensor feature columns plus an `activity` label column).
df1 = dataloading.load_data1()
df1.head()

Unnamed: 0,AG-X,AG-Y,AG-Z,Acc-X,Acc-Y,Acc-Z,Gravity-X,Gravity-Y,Gravity-Z,RR-X,RR-Y,RR-Z,RV-X,RV-Y,RV-Z,cos,activity
0,-9.494311,5.423363,0.196757,-1.684995,0.401508,1.334339,-8.855526,4.199663,-0.335983,-0.762366,-0.750471,1.10906,0.232662,0.672025,0.101603,0.695649,Running
1,-8.890651,2.520043,0.96331,-0.895032,-1.936677,1.436671,-8.399648,5.06081,0.067106,-0.613229,-0.121966,1.906942,0.245168,0.66066,0.131849,0.697164,Running
2,-5.6328,1.581015,0.230293,2.350318,-3.642859,-1.169414,-7.911231,5.790931,0.21893,-0.142383,-0.352063,1.759936,0.264845,0.648175,0.163016,0.695091,Running
3,-6.322698,6.228244,-0.756644,0.812825,-0.722664,-0.334024,-7.523226,6.280006,0.36466,-0.148775,-0.382955,1.208129,0.29955,0.624335,0.19508,0.694564,Running
4,-7.640211,4.862821,1.71549,0.34967,-0.468074,-0.37878,-6.921238,6.909286,0.727059,-1.047857,0.379773,0.828895,0.310003,0.607825,0.220415,0.697039,Running


In [3]:
# Fixed seed, forwarded to the split helper below.
random_seed = 42

# Hold out 30% of the rows as the test set.
# NOTE(review): scaler_type="standard" presumably applies standard scaling and
# should_map_labels=True presumably encodes activity names as integers, with
# `labels` holding the mapping -- confirm against dataloading.train_test_split.
X_train, y_train, X_test, y_test, labels = dataloading.train_test_split(
    df1,
    test_size=0.3,
    scaler_type="standard",
    should_map_labels=True,
    random_seed=random_seed,
)

# Report the shapes of the resulting arrays as a quick sanity check.
print(
    f"X_train: {X_train.shape}, y_train: {y_train.shape}, "
    f"X_test: {X_test.shape}, y_test: {y_test.shape}"
)

X_train: (65938, 16), y_train: (65938,), X_test: (28260, 16), y_test: (28260,)


In [4]:
# Partition the global split across clients, then balance each client's data
# and apply a per-client class ratio to make it imbalanced in a controlled way.
client_data = dataloading.to_client(data=(X_train,y_train,X_test,y_test),max_clients=5)

# One row per client, one column per class; 0.0 marks a class that the client
# should not have in its training data.
# NOTE(review): the recorded output for the first client shows class 2 empty
# rather than class 0 -- re-run and confirm how dataloading.class_imbalance
# interprets this row.

# 1/6 missing
class_ratio_list = [
    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    [1.0, 0.0, 1.0, 1.0, 1.0, 1.0],
    [1.0, 1.0, 0.0, 1.0, 1.0, 1.0],
    [1.0, 1.0, 1.0, 0.0, 1.0, 1.0],
    [1.0, 1.0, 1.0, 1.0, 0.0, 1.0]
]

# Alternative experiment presets: uncomment exactly one to replace the list above.

# 3/6 missing
# class_ratio_list = [
#     [0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
#     [1.0, 0.0, 1.0, 0.0, 1.0, 0.0],
#     [0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
#     [1.0, 0.0, 1.0, 0.0, 1.0, 0.0],
#     [0.0, 1.0, 0.0, 1.0, 0.0, 1.0]
# ]

# 5/6 missing (the last client keeps two classes so that all six classes
# are covered by the five clients)
# class_ratio_list = [
#     [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#     [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
#     [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
#     [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
#     [0.0, 0.0, 0.0, 0.0, 1.0, 1.0]
# ]

client_data_im = []
# Loop-local names are used on purpose: reusing X_train/y_train/X_test/y_test
# as loop targets would overwrite the global split that `to_client` consumes
# above, so re-running this cell would partition only the last client's data.
for idx,(client_X_train, client_y_train, client_X_test, client_y_test) in enumerate(client_data):
    class_ratio = class_ratio_list[idx]
    X_train_im, y_train_im, X_test_im, y_test_im = dataloading.class_imbalance(
        (client_X_train, client_y_train, client_X_test, client_y_test),
        class_ratio,
        balance=True,
    )
    # Print per-class counts and ratios so the applied imbalance can be checked.
    dataloading.print_balance(y_train_im,y_test_im)
    client_data_im.append((X_train_im, y_train_im, X_test_im, y_test_im))

y_train counts: [3593 3593    0 3593 3593 3593] ratio: [0.2 0.2 0.  0.2 0.2 0.2]
y_test counts: [7490 7490 7490 7490 7490 7490] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [3571    0 3571 3571 3571 3571] ratio: [0.2 0.  0.2 0.2 0.2 0.2]
y_test counts: [7490 7490 7490 7490 7490 7490] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [3485 3485    0 3485 3485 3485] ratio: [0.2 0.2 0.  0.2 0.2 0.2]
y_test counts: [7490 7490 7490 7490 7490 7490] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [3516 3516 3516    0 3516 3516] ratio: [0.2 0.2 0.2 0.  0.2 0.2]
y_test counts: [7490 7490 7490 7490 7490 7490] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
y_train counts: [3611 3611 3611 3611    0 3611] ratio: [0.2 0.2 0.2 0.2 0.  0.2]
y_test counts: [7490 7490 7490 7490 7490 7490] ratio: [0.16666667 0.16666667 0.16666667 0.16666667 0.166666

In [5]:
# Persist the per-client imbalanced splits together with the label list under
# the "energy_exp" experiment name (the helper reports where labels.txt is written).
# NOTE(review): the destination directory is chosen inside `dataloading` -- confirm
# it is configurable rather than tied to one machine's absolute path.
dataloading.save_client_data(client_data_im,"energy_exp", labels)

Labels saved in /Users/admin/Desktop/thesis/dataset/working_data/energy_exp/labels.txt
