In [None]:
import config as cfg

In [None]:
import pandas as pd
import os 
from library.path_utils import get_file_path, to_absolute_path
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt 
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Directory holding the raw SYM Excel backups, and the two backup file names.
RAW_SYM_DIR = "./raw_data/SYM"
SYM1_file_name = "backup_SYM1.xlsx"
SYM2_file_name = "backup_SYM2.xlsx"

# Folder for intermediate CSV files (edit to match the actual location).
output_folder_name = "./_tmp/SYM"

# Resolve each Excel file through the project helper and keep the result as a
# plain string after normalising it with pathlib.
SYM_raw_paths = [
    str(Path(get_file_path(RAW_SYM_DIR, excel_name)))
    for excel_name in (SYM1_file_name, SYM2_file_name)
]
output_folder = to_absolute_path(output_folder_name)

In [None]:
# Load the raw step records; the CSV column 'foot' is exposed as 'step'.
input_path = os.path.join(output_folder, "foot.csv")
step = pd.read_csv(input_path).rename(columns={'foot': 'step'})

# Preprocessing: numeric step counts, plus a copy without the zero-count rows.
step['step'] = pd.to_numeric(step['step'])
step_nonzero = step.loc[step['step'] != 0].copy()

# Per (ID, date) statistics over the nonzero rows.
daily_nonzero = step_nonzero.groupby(['ID', 'date'])['step']
step_mean = daily_nonzero.mean().rename('step_mean').reset_index()
step_var = daily_nonzero.var().rename('step_var').reset_index()
step_max = daily_nonzero.max().rename('step_max').reset_index()

# step_hvar_mean: variance within each hour, averaged over the hours of a day.
step_nonzero['hour'] = pd.to_datetime(step_nonzero['time']).dt.hour
step_hvar = step_nonzero.groupby(['ID', 'date', 'hour'])['step'].var().reset_index()
step_hvar_mean = (
    step_hvar.groupby(['ID', 'date'])['step']
    .mean()
    .rename('step_hvar_mean')
    .reset_index()
)

# Total steps per (ID, date); this one is summed over all rows, zeros included.
daily_steps = step.groupby(['ID', 'date'])['step'].sum().rename('steps').reset_index()

# Attach every daily statistic to the sample-level rows.
step_statistics_merged = step
for daily_stat in (step_var, step_max, step_mean, step_hvar_mean):
    step_statistics_merged = pd.merge(
        left=step_statistics_merged, right=daily_stat, how="outer", on=['date', 'ID']
    )

# Combined "date time" string column.
step_statistics_merged['datetime'] = (
    step_statistics_merged['date'] + ' ' + step_statistics_merged['time']
)

output_path = os.path.join(output_folder, "step_stactistics.csv")
step_statistics_merged.to_csv(output_path, index=False)

# One row per (ID, date): daily total plus the daily statistics.
step_date = daily_steps
for daily_stat in (step_var, step_max, step_mean, step_hvar_mean):
    step_date = pd.merge(left=step_date, right=daily_stat, how="left", on=['date', 'ID'])

output_path = os.path.join(output_folder, "step_date.csv")
step_date.to_csv(output_path, index=False)

In [None]:
# Load the per-day step table written by the step statistics cell.
input_path = os.path.join(output_folder, "step_date.csv")
step = pd.read_csv(input_path)
step['date'] = pd.to_datetime(step['date'])
id_list = step['ID'].unique()

# Leading column order of the output file; any other column found in the
# input (e.g. step_var) is appended after these.
step_delta_columns = ['ID', 'date', 'steps', 'step_max', 'step_mean', 'step_hvar_mean', 'step_delta', 'step_max_delta',
                      'step_mean_delta', 'step_hvar_mean_delta', 'step_delta2', 'step_max_delta2',
                      'step_mean_delta2', 'step_hvar_mean_delta2']

# (source column, prefix of the derived delta columns)
step_delta_sources = (
    ('steps', 'step'),
    ('step_max', 'step_max'),
    ('step_mean', 'step_mean'),
    ('step_hvar_mean', 'step_hvar_mean'),
)

# Collect one frame per ID and concatenate once at the end instead of growing
# a DataFrame inside the loop.
per_id_frames = []
for subject_id in id_list:
    step_id = step.loc[step['ID'] == subject_id]
    # Put the ID on a gap-free daily calendar so that diff() compares
    # consecutive calendar days; days without data become NaN rows.
    calendar = pd.DataFrame(
        {'date': pd.date_range(step_id['date'].min(), step_id['date'].max(), freq='D')}
    )
    step_id = pd.merge(step_id, calendar, how='right', on='date')
    step_id['ID'] = subject_id
    for source_col, prefix in step_delta_sources:
        step_id[f'{prefix}_delta'] = step_id[source_col].diff()             # change vs. previous day
        step_id[f'{prefix}_delta2'] = step_id[source_col].diff(periods=2)   # change vs. two days before
    per_id_frames.append(step_id)

if per_id_frames:
    step_delta = pd.concat(per_id_frames, axis=0, ignore_index=True)
    extra_columns = [c for c in step_delta.columns if c not in step_delta_columns]
    step_delta = step_delta.reindex(columns=step_delta_columns + extra_columns)
    step_delta['date'] = step_delta['date'].dt.strftime('%Y-%m-%d')
else:
    # No IDs in the input: still produce a header-only file instead of failing
    # on the .dt accessor of an empty object column.
    step_delta = pd.DataFrame(columns=step_delta_columns)

# Drop rows where steps, step_delta, and step_delta2 are all zero
all_zero = (
    (step_delta['steps'] == 0)
    & (step_delta['step_delta'] == 0)
    & (step_delta['step_delta2'] == 0)
)
step_delta = step_delta[~all_zero]

output_path = os.path.join(output_folder, "step_delta.csv")
step_delta.to_csv(output_path, index=False)

In [None]:
# Load the raw heart-rate samples.
input_path = os.path.join(output_folder, "HR.csv")
HR = pd.read_csv(input_path)

# Preprocessing: numeric HR and the hour of day of every sample.
HR['HR'] = pd.to_numeric(HR['HR'])
HR['hour'] = pd.to_datetime(HR['time']).dt.hour
# Rows with HR == 0 are excluded from every statistic computed below.
HR_nonzero = HR[HR['HR'] != 0]

# Per (ID, date) statistics over the nonzero rows.
HR_mean = HR_nonzero.groupby(['ID', 'date'])['HR'].mean().reset_index()
HR_var = HR_nonzero.groupby(['ID', 'date'])['HR'].var().reset_index()
# Keep the variance only for dates that have at least 2 nonzero samples.
HR_count = HR_nonzero.groupby(['ID', 'date']).size().reset_index(name='count')
HR_var = HR_var.merge(HR_count, on=['ID', 'date'])
HR_var = HR_var[HR_var['count'] >= 2].drop('count', axis=1)
HR_min = HR_nonzero.groupby(['ID', 'date'])['HR'].min().reset_index()
HR_max = HR_nonzero.groupby(['ID', 'date'])['HR'].max().reset_index()

# HR_hvar_mean: variance within each hour, averaged over the hours of a day.
# The hourly variance uses the nonzero rows as well, so that zero readings do
# not inflate it and it stays consistent with the other daily statistics.
HR_hvar = HR_nonzero.groupby(['ID', 'date', 'hour'])['HR'].var().reset_index()
HR_hvar_mean = HR_hvar.groupby(['ID', 'date'])['HR'].mean().reset_index()
# Same condition as for HR_var: only dates with at least 2 nonzero HR samples.
HR_hvar_mean = HR_hvar_mean.merge(HR_count, on=['ID', 'date'], how='left')
HR_hvar_mean = HR_hvar_mean[HR_hvar_mean['count'] >= 2].drop('count', axis=1)

# Attach every daily statistic to the sample-level rows. Each statistic frame
# still calls its value column 'HR', so the right-hand suffix names it.
HR_statistics_merged = HR
for daily_stat, suffix in (
    (HR_var, '_var'),
    (HR_min, '_min'),
    (HR_max, '_max'),
    (HR_mean, '_mean'),
    (HR_hvar_mean, '_hvar_mean'),
):
    HR_statistics_merged = pd.merge(
        left=HR_statistics_merged, right=daily_stat, how="outer",
        on=['date', 'ID'], suffixes=('', suffix),
    )

# Combined "date time" string column; the helper 'hour' column is not exported.
HR_statistics_merged['datetime'] = HR_statistics_merged['date'] + ' ' + HR_statistics_merged['time']
HR_statistics_merged = HR_statistics_merged.drop('hour', axis=1)

output_path = os.path.join(output_folder, "hr_stactistics_fixed.csv")
HR_statistics_merged.to_csv(output_path, index=False)

# One row per (ID, date), restricted to the dates that have a variance.
HR_date = HR_var
for daily_stat, suffix in (
    (HR_max, '_max'),
    (HR_mean, '_mean'),
    (HR_hvar_mean, '_hvar_mean'),
):
    HR_date = pd.merge(
        left=HR_date, right=daily_stat, how="left",
        on=['date', 'ID'], suffixes=('', suffix),
    )
# The unsuffixed 'HR' column left over from HR_var is the daily variance.
HR_date = HR_date.rename(columns={'HR': 'HR_var'})

output_path = os.path.join(output_folder, "HR_date_fixed.csv")
HR_date.to_csv(output_path, index=False)

In [None]:
# Load the raw heart-rate samples.
input_path = os.path.join(output_folder, "HR.csv")
HR = pd.read_csv(input_path)
HR['HR'] = pd.to_numeric(HR['HR'])
id_list = HR['ID'].unique()

# Leading column order of the output file.
HR_interpolated_columns = ['index', 'ID', 'date', 'time', 'HR']

# Collect one frame per kept (ID, date) and concatenate once at the end.
interpolated_days = []
for subject_id in tqdm(id_list):
    temp_id = HR.loc[HR['ID'] == subject_id]
    for day in temp_id['date'].unique():
        # 'index' is the 0-based position of each sample within its day.
        temp_date = (
            temp_id.loc[temp_id['date'] == day]
            .reset_index(drop=True)
            .reset_index()
        )
        # Treat HR == 0 as missing. Only the HR column is touched: replacing 0
        # across the whole frame would also blank the first value of 'index'.
        temp_date['HR'] = temp_date['HR'].replace(0, np.nan)
        # Keep only days with more than 720 valid samples.
        if temp_date['HR'].count() > 720:
            temp_date['HR'] = temp_date['HR'].interpolate(method='values', limit_direction='both')
            interpolated_days.append(temp_date)

if interpolated_days:
    HR_interpolated = pd.concat(interpolated_days, axis=0, ignore_index=True)
    extra_columns = [c for c in HR_interpolated.columns if c not in HR_interpolated_columns]
    HR_interpolated = HR_interpolated.reindex(columns=HR_interpolated_columns + extra_columns)
else:
    # No day passed the threshold: write a header-only file.
    HR_interpolated = pd.DataFrame(columns=HR_interpolated_columns)

output_path = os.path.join(output_folder, "HR_interpolated_720.csv")
HR_interpolated.to_csv(output_path, index=False)


In [None]:
from utils_for_analysis import mesor, amplitude, acrophase
# NOTE(review): this reads the raw "HR.csv" although the frame is named
# HR_interpolated; confirm whether "HR_interpolated_720.csv" was intended.
input_path = os.path.join(output_folder, "HR.csv")
HR_interpolated = pd.read_csv(input_path)
HR_interpolated['HR'] = pd.to_numeric(HR_interpolated['HR'])

id_list = HR_interpolated['ID'].unique()
circadian_columns = ['ID', 'date', 'acr', 'amp', 'mesor']

# Collect one record per kept (ID, date) and build the DataFrame once, instead
# of concatenating a one-row frame on every iteration.
circadian_rows = []
for subject_id in id_list:
    temp_id = HR_interpolated.loc[HR_interpolated['ID'] == subject_id]
    for day in temp_id['date'].unique():
        # HR samples of this day, re-indexed 0..n-1 in file order.
        day_hr = temp_id.loc[temp_id['date'] == day, 'HR'].reset_index(drop=True)
        # Keep only days with more than 720 non-missing samples.
        if day_hr.count() > 720:
            # 0-based sample position within the day, passed as the x-axis to
            # the helpers from utils_for_analysis.
            sample_idx = pd.Series(day_hr.index, name='index')
            acr = acrophase(sample_idx, day_hr)
            amp = amplitude(sample_idx, day_hr)
            mes = mesor(sample_idx, day_hr)
            circadian_rows.append([subject_id, day, acr, amp, mes])
            print(subject_id, day, acr, amp, mes)

# With no kept day this is an empty frame that still carries the header.
circadian_data = pd.DataFrame(circadian_rows, columns=circadian_columns)

output_path = os.path.join(output_folder, "circadian_parameter_720.csv")
circadian_data.to_csv(output_path, index=False)

In [None]:
# Load the per-day circadian parameters.
input_path = os.path.join(output_folder, "circadian_parameter_720.csv")
circadian = pd.read_csv(input_path)
circadian['date'] = pd.to_datetime(circadian['date'])
id_list = circadian['ID'].unique()

# Leading column order of the output file.
circadian_delta_columns = ['ID', 'date', 'acr', 'amp', 'mesor', 'acr_delta', 'acr_delta2',
                           'amp_delta', 'amp_delta2', 'mesor_delta', 'mesor_delta2']

# Collect one frame per ID and concatenate once at the end instead of growing
# a DataFrame inside the loop.
per_id_frames = []
for subject_id in id_list:
    circadian_id = circadian.loc[circadian['ID'] == subject_id]
    # Put the ID on a gap-free daily calendar so that diff() compares
    # consecutive calendar days; days without data become NaN rows.
    calendar = pd.DataFrame(
        {'date': pd.date_range(circadian_id['date'].min(), circadian_id['date'].max(), freq='D')}
    )
    circadian_id = pd.merge(circadian_id, calendar, how='right', on='date')
    circadian_id['ID'] = subject_id
    for param in ('acr', 'amp', 'mesor'):
        circadian_id[f'{param}_delta'] = circadian_id[param].diff()             # change vs. previous day
        circadian_id[f'{param}_delta2'] = circadian_id[param].diff(periods=2)   # change vs. two days before
    per_id_frames.append(circadian_id)

if per_id_frames:
    circadian_delta = pd.concat(per_id_frames, axis=0, ignore_index=True)
    extra_columns = [c for c in circadian_delta.columns if c not in circadian_delta_columns]
    circadian_delta = circadian_delta.reindex(columns=circadian_delta_columns + extra_columns)
    circadian_delta['date'] = circadian_delta['date'].dt.strftime('%Y-%m-%d')
else:
    # No IDs in the input: still produce a header-only file instead of failing
    # on the .dt accessor of an empty object column.
    circadian_delta = pd.DataFrame(columns=circadian_delta_columns)

output_path = os.path.join(output_folder, "circadian_delta_720.csv")
circadian_delta.to_csv(output_path, index=False)

In [None]:
from utils_for_analysis import (
    check_bandpower_value_a,
    check_bandpower_value_b,
    check_bandpower_value_c,
    check_bandpower_value_d,
)
from joblib import Parallel, delayed
from tqdm.auto import tqdm
from tqdm_joblib import tqdm_joblib
input_path = os.path.join(output_folder, "HR_interpolated_720.csv")
# 1) Load the file & convert types
HR = pd.read_csv(input_path)
HR["HR"] = pd.to_numeric(HR["HR"], errors="coerce")
# The CSV keeps the calendar date and the time of day in separate columns.
# Merging a date-only key against a per-minute grid would match every sample
# of a day to the 00:00 slot, so build the full timestamp first.
# NOTE(review): assumes 'time' parses together with 'date' as "<date> <time>";
# timestamps are floored to the minute to line up with the grid — confirm.
HR["date"] = pd.to_datetime(
    HR["date"].astype(str) + " " + HR["time"].astype(str)
).dt.floor("min")

# ——— build per-minute DataFrame ———
# One frame per ID on a gap-free minute grid between its first and last
# sample; minutes without a sample become NaN rows. Frames are collected and
# concatenated once.
per_id_frames = []
for subject_id in HR['ID'].unique():
    df_id = HR[HR['ID'] == subject_id]
    minute_grid = pd.DataFrame(
        {'date': pd.date_range(df_id['date'].min(), df_id['date'].max(), freq='min')}
    )
    df_id = pd.merge(df_id, minute_grid, how='right', on='date')
    df_id['ID'] = subject_id
    per_id_frames.append(df_id)

if per_id_frames:
    df_per_min = pd.concat(per_id_frames, axis=0)
else:
    # Empty input: keep the schema (and the datetime dtype of 'date').
    df_per_min = HR.iloc[0:0].copy()

# Keep ID / HR / date as the leading columns, followed by whatever else the
# file contains.
leading_columns = ['ID', 'HR', 'date']
df_per_min = df_per_min[
    leading_columns + [c for c in df_per_min.columns if c not in leading_columns]
]

# Calendar day of every grid timestamp, used as the grouping key downstream.
df_per_min["day"] = df_per_min["date"].dt.date

# 4) Band-power computation for a single (ID, day) group
def compute_bandpower_for_group(group):
    """Compute the four band-power values for one (ID, day) group.

    Parameters
    ----------
    group : tuple
        ``((id_, day), sub)`` as yielded when iterating a groupby over two
        keys; ``sub`` must have an ``"HR"`` column.

    Returns
    -------
    dict or None
        ``None`` when ``sub`` has 720 or fewer non-missing HR values.
        Otherwise a dict with keys ``ID``, ``date`` (``day`` as a
        ``pd.Timestamp``) and ``bandpower_a`` .. ``bandpower_d``, each the
        result of the matching ``check_bandpower_value_*`` helper called with
        the 0-based sample positions and the HR array.
    """
    key, sub = group
    id_, day = key

    # Skip groups without enough valid samples.
    if sub['HR'].notna().sum() <= 720:
        return None

    idx = np.arange(len(sub))
    hr = sub["HR"].to_numpy()

    result = {"ID": id_, "date": pd.Timestamp(day)}
    result["bandpower_a"] = check_bandpower_value_a(idx, hr)
    result["bandpower_b"] = check_bandpower_value_b(idx, hr)
    result["bandpower_c"] = check_bandpower_value_c(idx, hr)
    result["bandpower_d"] = check_bandpower_value_d(idx, hr)
    return result

groups = df_per_min.groupby(["ID", "day"], sort=False)

# 5) Run in parallel, showing progress through tqdm_joblib.
# One task is submitted per (ID, day) group, so the bar total is the number of
# groups rather than the number of IDs.
with tqdm_joblib(tqdm(total=groups.ngroups, desc="ID-days")):
    results = Parallel(n_jobs=-1)(
        delayed(compute_bandpower_for_group)(grp)
        for grp in groups
    )

# 6) Drop the None results & build the DataFrame.
# The columns are given explicitly so that an empty result still produces a
# file with a header row.
bandpower_columns = ["ID", "date", "bandpower_a", "bandpower_b", "bandpower_c", "bandpower_d"]
records = [r for r in results if r is not None]
bandpower_df = pd.DataFrame(records, columns=bandpower_columns)

output_path = os.path.join(output_folder, "bandpower_fixed_720.csv")
bandpower_df.to_csv(output_path, index=False)