Data preparation

In [3]:
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
file_path = r"C:\Users\giuli\Documents\Open_Campus\Sleep_Apnea\00_pre files\X_train.h5"

test_file_path = r"C:\Users\giuli\Documents\Open_Campus\Sleep_Apnea\00_pre files\X_test.h5"

with h5py.File(test_file_path, "r") as f:
    raw = f["data"][:]              # shape: (4400, 72002)

# 1) metadata
ids      = raw[:, 0].astype(int)    # window ID
subjects = raw[:, 1].astype(int)    # subject ID

print("First 10 IDs:     ", ids[:10])
print("First 10 subjects:", subjects[:10])

# 2) signal only (drop the first 2 columns)
X_flat = raw[:, 2:]                 # shape: (4400, 72000)

# 3) reshape to (N, 9000, 8)
N = X_flat.shape[0]
X = X_flat.reshape(N, 9000, 8)

print("X shape:", X.shape)          # should be (4400, 9000, 8)

First 10 IDs:      [4400 4401 4402 4403 4404 4405 4406 4407 4408 4409]
First 10 subjects: [22 22 22 22 22 22 22 22 22 22]
X shape: (4400, 9000, 8)


In [12]:
def normalize(X):
    mean = X.mean(axis=1, keepdims=True)
    std  = X.std(axis=1, keepdims=True) + 1e-8
    return (X - mean) / std

X = normalize(X)



In [13]:
channel_names = [
    "Abdominal belt", "Airflow", "PPG",
    "Thoracic belt", "Snoring", "SpO2",
    "EEG C4-A1", "EEG O2-A1"
]

stats = []

for i, name in enumerate(channel_names):
    ch = X[:, i, :].ravel()
    stats.append([name, np.mean(ch), np.std(ch), np.min(ch), np.max(ch)])

stats_df = pd.DataFrame(stats, columns=["Channel", "Mean", "Std", "Min", "Max"])
stats_df



Unnamed: 0,Channel,Mean,Std,Min,Max
0,Abdominal belt,-0.360888,0.186424,-2.187833,2.512332
1,Airflow,-0.360472,0.186525,-2.187833,2.512338
2,PPG,-0.360151,0.187018,-2.187833,2.512386
3,Thoracic belt,-0.360124,0.188626,-2.316642,2.540537
4,Snoring,-0.3604,0.193569,-2.303222,2.559511
5,SpO2,-0.359708,0.196316,-2.294819,2.670353
6,EEG C4-A1,-0.360099,0.189251,-2.294819,2.539213
7,EEG O2-A1,-0.359723,0.190467,-2.300352,2.539213


In [10]:
# Ensure y is aligned with X using IDs
y_df = y_df.set_index("ID")   # use ID as index

mask_cols = [c for c in y_df.columns if c.startswith("y_")]
y = y_df.loc[ids, mask_cols].to_numpy()   # y aligned with X


print("y shape:", y.shape)   # (4400, 90)

y shape: (4400, 90)


In [14]:
#export normalized data to H5 file
output_path = r"C:\Users\giuli\Documents\Open_Campus\Sleep_Apnea\00_pre files\X_test_normalized.h5"
with h5py.File(output_path, "w") as f:
    f.create_dataset("data", data=X)