In [9]:
import numpy as np
import pandas as pd
import xarray as xr
import seaborn as sns  
import matplotlib.pyplot as plt

import tqdm
from scipy.stats import pearsonr

In [10]:
# Open the merged dataset file; the cells below all read from `ds_hb`.
ds_hb = xr.open_dataset("../data/merged-20241227.nc")
# Bare trailing expression so the notebook renders the dataset summary.
ds_hb

In [11]:
# Distinct values of the `device` variable (displayed, not stored).
np.unique(ds_hb.device.to_numpy())

array([ 9, 11, 12, 13, 15], dtype=int32)

In [12]:
# Preview the entries along `id` whose `device` equals 9 (displayed, not stored).
ds_hb.sel(id=(ds_hb["device"] == 9))

In [13]:
# Pull plain NumPy arrays out of the dataset: `hb` is the target,
# `signal` (with any length-1 axes dropped) is the feature matrix.
y = ds_hb["hb"].to_numpy()
X = np.squeeze(ds_hb["signal"].to_numpy())

# Show both shapes side by side as a quick dimension check.
(X.shape, y.shape)

((605, 8800), (605,))

In [14]:
# Correlate the target with every column of X in one call: y is turned into
# a column vector so it broadcasts against X's columns.
r, p = pearsonr(X, y[:, np.newaxis])

# Label each coefficient with its column position in X, then order the
# series from the most negative to the most positive coefficient.
sr_corr = (
    pd.Series(r, index=range(X.shape[1]))
    .sort_values()
)
sr_corr

7291   -0.106012
7295   -0.105111
7299   -0.104919
7271   -0.104621
7283   -0.104153
          ...   
5799    0.052563
5767    0.052755
5783    0.053340
5731    0.053729
5755    0.054891
Length: 8800, dtype: float64

In [15]:
# Every distinct device value, plus the sentinel label "all" for the pooled data.
subsets = list(np.unique(ds_hb["device"].data)) + ["all"]

# For each subset, correlate `hb` with every column of `signal` and write the
# coefficients, ranked by magnitude, to `corr-<subset>.csv` in the working directory.
for subset in tqdm.tqdm(subsets):
    # Select the subset once and take both arrays from that single selection,
    # rather than filtering the whole dataset separately for `signal` and `hb`.
    if subset == "all":
        ds_subset = ds_hb
    else:
        ds_subset = ds_hb.sel(id=ds_hb["device"] == subset)

    X = ds_subset["signal"].to_numpy()
    y = ds_subset["hb"].to_numpy()

    # y is reshaped to a column so it broadcasts against the columns of X;
    # r is expected to hold one coefficient per column of X.
    r, p = pearsonr(X, y.reshape(-1, 1))

    df_corr = pd.DataFrame({
        "feat_num": range(X.shape[1]),
        "r": r,
        "r_abs": np.abs(r),
    })

    # Largest |r| first.
    df_corr = df_corr.sort_values("r_abs", ascending=False)
    # `index` is a boolean flag; pass False explicitly instead of a falsy None.
    df_corr.to_csv(f"corr-{subset}.csv", index=False)

100%|██████████| 6/6 [00:00<00:00, 10.65it/s]
