In [None]:
import config as cfg

In [None]:
# Folder that holds the raw PXPN study inputs.
RAW_PXPN_DIR = "./raw_data/PXPN"
# Base name (without extension) of the enrollment sheet -- edit to match the actual file.
# The ".csv" extension is appended where the path is built.
enroll_file_name = "1. 픽셀패닉 enroll 정보_250516"
# Name of the zip archive used as the source for the passive-sensing reads.
zip_file_name = "pixelpanic_raw_data.zip"
# Folder name passed to get_file_path to locate where intermediate CSVs are written.
output_folder_name = "./tmp/PXPN"

In [None]:
import pandas as pd
from path_utils import get_file_path

# Resolve the configured names into concrete paths via the project helper.
# NOTE(review): whether get_file_path creates missing folders is not visible here --
# confirm that `output_folder` exists before the first to_csv call.
enroll_path = get_file_path(RAW_PXPN_DIR, enroll_file_name + ".csv")
zip_path = get_file_path(RAW_PXPN_DIR, zip_file_name)
output_folder = get_file_path(output_folder_name)

In [4]:
from utils_for_preprocessing import read_all_data
import pandas as pd
import datetime as dt
import os
# Source handed to read_all_data for the passive-sensing records (the zip archive path).
BASE_PASSIVE_DIR = zip_path

# Read the 'Step' records through the project helper.
# NOTE(review): exclude_keywords presumably filters out sources whose name contains
# these words -- confirm against read_all_data.
step = read_all_data('Step', BASE_PASSIVE_DIR, exclude_keywords=['resting', 'variability'])

# Parse the interval start (started_at) and expose its calendar day and clock time as strings.
step['started_at'] = pd.to_datetime(step['started_at'])
step_started = step['started_at']
step['date'] = step_started.dt.date.astype(str)
step['time'] = step_started.dt.time.astype(str)

# The raw timestamp columns are no longer needed once date/time are split out.
raw_timestamp_columns = ['started_at', 'ended_at', 'obtained_at']
step = step.drop(columns=raw_timestamp_columns).reset_index(drop=True)

output_path = os.path.join(output_folder, "step.csv")
step.to_csv(output_path, index=False)

In [5]:

# --- Per-day step statistics -------------------------------------------------
# Inputs : `step` (one row per record with ID, date, time, steps), `output_folder`.
# Outputs: step_stactistics.csv (record level, with daily stats attached) and
#          step_date.csv (one row per ID/date).

# Make sure the step counts are numeric before aggregating.
step['steps'] = pd.to_numeric(step['steps'])

# Zero-step records are excluded from the mean/variance/max statistics.
# .copy() gives an independent frame, so adding the 'hour' column below does not
# write into a slice of `step` (avoids SettingWithCopyWarning / ambiguous writes).
step_nonzero = step[step.steps != 0].copy()

# Daily mean / variance / max of the non-zero records.
nonzero_daily_steps = step_nonzero.groupby(['ID', 'date'])['steps']
step_mean = nonzero_daily_steps.mean().reset_index().rename(columns={'steps': 'step_mean'})
step_var = nonzero_daily_steps.var().reset_index().rename(columns={'steps': 'step_var'})
step_max = nonzero_daily_steps.max().reset_index().rename(columns={'steps': 'step_max'})

# step_hvar_mean: variance of the non-zero records within each hour, averaged over the day.
step_nonzero['hour'] = pd.to_datetime(step_nonzero['time']).dt.hour
step_hvar = step_nonzero.groupby(['ID', 'date', 'hour'])['steps'].var().reset_index()
step_hvar_mean = (
    step_hvar.groupby(['ID', 'date'])['steps']
    .mean()
    .reset_index()
    .rename(columns={'steps': 'step_hvar_mean'})
)

# Total steps per day; this one is computed from all records, zeros included.
daily_steps = step.groupby(['ID', 'date'])['steps'].sum().reset_index()

# The daily statistics, in the order they are attached to both output tables.
daily_stat_frames = (step_var, step_max, step_mean, step_hvar_mean)

# Record-level table: every record carries the statistics of its ID/date.
step_statistics_merged = step
for daily_stat in daily_stat_frames:
    step_statistics_merged = pd.merge(
        left=step_statistics_merged, right=daily_stat, how="outer", on=['date', 'ID']
    )

step_statistics_merged['datetime'] = step_statistics_merged['date'] + ' ' + step_statistics_merged['time']

# The file name (including its spelling) is kept as-is because other code may read it.
output_path = os.path.join(output_folder, "step_stactistics.csv")
step_statistics_merged.to_csv(output_path, index=False)

# Day-level table: one row per ID/date, starting from the daily totals.
step_date = daily_steps
for daily_stat in daily_stat_frames:
    step_date = pd.merge(left=step_date, right=daily_stat, how="left", on=['date', 'ID'])

output_path = os.path.join(output_folder, "step_date.csv")
step_date.to_csv(output_path, index=False)

In [6]:

# --- Day-over-day deltas of the daily step features --------------------------
# For each ID the daily table is re-gridded onto every calendar day between its
# first and last observation, then lag-1 (`*_delta`) and lag-2 (`*_delta2`)
# differences are taken. Days without data stay NaN, so a delta is only defined
# when both days involved were observed.

# NOTE: `step` is rebound to the per-day table and aliases `step_date`; the
# datetime conversion below is therefore visible through both names.
step = step_date
step['date'] = pd.to_datetime(step['date'])
id_list = step['ID'].unique()

# Column order of the written CSV; columns not listed here are appended after these.
STEP_DELTA_COLUMNS = ['ID', 'date', 'steps', 'step_max', 'step_mean', 'step_hvar_mean', 'step_delta', 'step_max_delta',
                      'step_mean_delta', 'step_hvar_mean_delta', 'step_delta2', 'step_max_delta2',
                      'step_mean_delta2', 'step_hvar_mean_delta2']

# Source column -> prefix of its delta columns ('steps' yields 'step_delta', not 'steps_delta').
STEP_DELTA_PREFIX = {
    'steps': 'step',
    'step_max': 'step_max',
    'step_mean': 'step_mean',
    'step_hvar_mean': 'step_hvar_mean',
}

# Collect the per-ID frames and concatenate once: avoids quadratic re-copying and
# the dtype ambiguity of concatenating onto an empty object-dtype frame.
per_id_frames = []
for participant_id in id_list:
    step_id = step.loc[step.ID == participant_id]
    time_per_day = pd.date_range(step_id.date.min(), step_id.date.max(), freq='D')
    # Right-merge onto the full calendar so missing days appear as NaN rows.
    step_id = pd.merge(step_id, pd.DataFrame({'date': time_per_day}), how='right', on='date')
    step_id['ID'] = participant_id  # the inserted gap rows have no ID yet
    for source_col, prefix in STEP_DELTA_PREFIX.items():
        step_id[f'{prefix}_delta'] = step_id[source_col].diff()
        step_id[f'{prefix}_delta2'] = step_id[source_col].diff(periods=2)
    per_id_frames.append(step_id)

if per_id_frames:
    step_delta = pd.concat(per_id_frames, axis=0, ignore_index=True)
    extra_columns = [col for col in step_delta.columns if col not in STEP_DELTA_COLUMNS]
    step_delta = step_delta[STEP_DELTA_COLUMNS + extra_columns]
    step_delta['date'] = step_delta['date'].dt.strftime('%Y-%m-%d')
else:
    # No participants at all: still emit a well-formed, header-only table.
    step_delta = pd.DataFrame(columns=STEP_DELTA_COLUMNS)

# Drop rows where steps, step_delta, and step_delta2 are all zero
all_zero = (step_delta['steps'] == 0) & (step_delta['step_delta'] == 0) & (step_delta['step_delta2'] == 0)
step_delta = step_delta[~all_zero]

output_path = os.path.join(output_folder, "step_delta.csv")
step_delta.to_csv(output_path, index=False)

In [7]:
from utils_for_preprocessing import read_all_data
import pandas as pd

# Source handed to read_all_data for the passive-sensing records (the zip archive path).
BASE_PASSIVE_DIR = zip_path

sleep = read_all_data('Sleep', BASE_PASSIVE_DIR, exclude_keywords=['resting', 'variability'])

# Raw sleep-type label -> standardized SLT code. Labels not listed are left unchanged.
SLEEP_TYPE_TO_SLT = {
    'SLT1': 'SLT3',
    'SLT0': 'SLT2',
    'asleepCore': 'SLT4',
    'asleepDeep': 'SLT5',
    'asleepREM': 'SLT6',
    'asleepUnspecified': 'SLT2',
    'awake': 'SLT1'
}
sleep['type'] = sleep['type'].replace(SLEEP_TYPE_TO_SLT)

# Parse both interval bounds so the duration can be computed as a Timedelta.
for bound_col in ('started_at', 'ended_at'):
    sleep[bound_col] = pd.to_datetime(sleep[bound_col])
sleep['duration'] = sleep['ended_at'] - sleep['started_at']

# Each segment is attributed to the calendar day on which it started.
sleep['date'] = sleep['started_at'].dt.date

# One row per ID/date, one column per type, holding the summed duration.
sleep_summary = sleep.pivot_table(
    index=['ID', 'date'],
    columns='type',
    values='duration',
    aggfunc='sum',
    fill_value=pd.Timedelta(0)
)

# Guarantee that SLT1..SLT6 are all present, even if a type never occurs in the data.
slt_columns = ['SLT1', 'SLT2', 'SLT3', 'SLT4', 'SLT5', 'SLT6']
for slt in slt_columns:
    if slt not in sleep_summary.columns:
        sleep_summary[slt] = pd.Timedelta(0)

# Total over all six SLT buckets.
# NOTE(review): this includes SLT1, which is where 'awake' is mapped -- confirm
# that counting awake time in total_sleep is intended.
sleep_summary['total_sleep'] = sleep_summary[slt_columns].sum(axis=1)

# Express the SLT buckets and the total as float hours instead of Timedelta.
one_hour = pd.Timedelta(hours=1)
for col in slt_columns + ['total_sleep']:
    sleep_summary[col] = sleep_summary[col] / one_hour

# Bring ID/date back as ordinary columns; `sleep` now refers to the summary table.
sleep_summary = sleep_summary.reset_index()
sleep = sleep_summary

output_path = os.path.join(output_folder, "sleep_type.csv")
sleep.to_csv(output_path, index=False)


In [8]:
from utils_for_preprocessing import read_all_data
import pandas as pd

# Source handed to read_all_data for the passive-sensing records (the zip archive path).
BASE_PASSIVE_DIR = zip_path

# Re-read the raw sleep records (the previous `sleep` was replaced by the per-day
# summary) and keep everything except the type label.
sleep = read_all_data(
    'Sleep', BASE_PASSIVE_DIR, exclude_keywords=['resting', 'variability']
).drop(columns='type')

output_path = os.path.join(output_folder, "sleep_log.csv")
sleep.to_csv(output_path, index=False)



In [9]:
from utils_for_preprocessing import read_all_data
import pandas as pd

# Source handed to read_all_data for the passive-sensing records (the zip archive path).
BASE_PASSIVE_DIR = zip_path

# Read the 'HeartRate' records through the project helper.
heartrate = read_all_data('HeartRate', BASE_PASSIVE_DIR, exclude_keywords=['resting', 'variability'])

# Parse the measurement timestamp (obtained_at) and expose its calendar day and
# clock time as strings.
heartrate['obtained_at'] = pd.to_datetime(heartrate['obtained_at'])
hr_obtained = heartrate['obtained_at']
heartrate['date'] = hr_obtained.dt.date.astype(str)
heartrate['time'] = hr_obtained.dt.time.astype(str)

# The raw timestamp columns are no longer needed once date/time are split out.
raw_timestamp_columns = ['started_at', 'ended_at', 'obtained_at']
heartrate = heartrate.drop(columns=raw_timestamp_columns).reset_index(drop=True)

output_path = os.path.join(output_folder, "HR.csv")
heartrate.to_csv(output_path, index=False)

In [10]:
import pandas as pd

# --- Per-day heart-rate statistics -------------------------------------------
# Start from the in-memory heart-rate table, with the value column named 'HR'.
HR = heartrate.rename(columns={'heart_rate': 'HR'})
HR['HR'] = pd.to_numeric(HR['HR'])

# Zero readings are excluded from the daily mean/variance/min/max.
HR_nonzero = HR[HR.HR != 0]

# Daily statistics of the non-zero readings (value column is still called 'HR').
nonzero_daily_hr = HR_nonzero.groupby(['ID', 'date'])['HR']
HR_mean = nonzero_daily_hr.mean().reset_index()
HR_var = nonzero_daily_hr.var().reset_index()
HR_min = nonzero_daily_hr.min().reset_index()
HR_max = nonzero_daily_hr.max().reset_index()

# HR_hvar_mean: variance within each hour, averaged over the day.
# NOTE(review): this is computed from `HR` (zero readings included), whereas the
# other statistics use HR_nonzero -- confirm whether that difference is intended.
HR['hour'] = pd.to_datetime(HR['time']).dt.hour
HR_hvar = HR.groupby(['ID', 'date', 'hour'])['HR'].var().reset_index()
HR_hvar_mean = HR_hvar.groupby(['ID', 'date'])['HR'].mean().reset_index()

# Record-level table: attach each daily statistic to every reading. All statistic
# frames call their value column 'HR', so the right-hand suffix names the new column.
HR_statistics_merged = HR
for stat_frame, stat_suffix in (
    (HR_var, '_var'),
    (HR_min, '_min'),
    (HR_max, '_max'),
    (HR_mean, '_mean'),
    (HR_hvar_mean, '_hvar_mean'),
):
    HR_statistics_merged = pd.merge(
        left=HR_statistics_merged, right=stat_frame, how="outer", on=['date', 'ID'],
        suffixes=['', stat_suffix]
    )

HR_statistics_merged['datetime'] = HR_statistics_merged['date'] + ' ' + HR_statistics_merged['time']
# 'hour' was only a grouping helper and is not part of the exported table.
HR_statistics_merged = HR_statistics_merged.drop(columns='hour')

output_path = os.path.join(output_folder, "hr_stactistics_fixed.csv")
HR_statistics_merged.to_csv(output_path, index=False)

# Day-level table: starts from the variance frame, whose 'HR' column is renamed to
# 'HR_var' at the end.
# NOTE(review): HR_min is not attached here although it is computed above -- confirm.
HR_date = HR_var
for stat_frame, stat_suffix in (
    (HR_max, '_max'),
    (HR_mean, '_mean'),
    (HR_hvar_mean, '_hvar_mean'),
):
    HR_date = pd.merge(
        left=HR_date, right=stat_frame, how="left", on=['date', 'ID'],
        suffixes=['', stat_suffix]
    )
HR_date = HR_date.rename(columns={'HR': 'HR_var'})

output_path = os.path.join(output_folder, "HR_date_fixed.csv")
HR_date.to_csv(output_path, index=False)

In [12]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def main():
    """Resample each participant-day of heart rate onto a 1-minute grid and save it.

    Reads HR.csv from `output_folder`, builds a 1440-row (one per minute) frame for
    every ID/day that has at least one timestamped row, and writes the stacked
    result to HR_interpolated_720.csv. Days with more than 720 non-missing raw
    readings are averaged per minute and gap-filled by time interpolation (at most
    30 consecutive minutes, in both directions); all other days are written with
    HR set to NaN for every minute.
    """
    hr_raw = pd.read_csv(os.path.join(output_folder, "HR.csv"))
    if 'heart_rate' in hr_raw.columns:
        hr_raw = hr_raw.rename(columns={'heart_rate': 'HR'})
    hr_raw['HR'] = pd.to_numeric(hr_raw['HR'], errors='coerce')
    # NOTE(review): unparseable timestamps become NaT and are dropped below. If the
    # time strings mix formats (e.g. some with fractional seconds), confirm that
    # the installed pandas version does not coerce valid rows to NaT here.
    hr_raw['datetime'] = pd.to_datetime(hr_raw['date'] + ' ' + hr_raw['time'], errors='coerce')

    day_frames = []
    for pid, id_rows in tqdm(hr_raw.groupby('ID'), desc='Processing IDs'):
        timeline = id_rows.dropna(subset=['datetime']).set_index('datetime').sort_index()
        for day, day_rows in timeline.groupby(timeline.index.date):
            # Every minute of this calendar day, starting at midnight.
            minute_grid = pd.to_datetime(f"{day}") + pd.to_timedelta(np.arange(1440), unit='m')

            n_readings = day_rows['HR'].notna().sum()
            if n_readings > 720:
                hr_per_minute = (
                    day_rows['HR']
                    .resample('1min')
                    .mean()
                    .reindex(minute_grid)
                    .interpolate(method='time', limit=30, limit_direction='both')
                )
            else:
                # Too few readings: keep the day on the grid but leave it empty.
                hr_per_minute = np.nan

            day_frames.append(
                pd.DataFrame(
                    {
                        'HR': hr_per_minute,
                        'ID': pid,
                        'date': day,
                        'time': minute_grid.time.astype(str),
                    },
                    index=minute_grid,
                )
            )

    HR_interp = pd.concat(day_frames, ignore_index=True)
    HR_interp.to_csv(os.path.join(output_folder, "HR_interpolated_720.csv"), index=False)

if __name__ == '__main__':
    main()


Processing IDs: 100%|██████████| 18/18 [00:00<00:00, 19.42it/s]


In [14]:
import pandas as pd
from utils_for_preprocessing import mesor, amplitude, acrophase
# --- Circadian parameters (acrophase, amplitude, mesor) per ID/day -----------
# Reads the per-minute interpolated heart rate and, for every ID/day with enough
# valid minutes, passes the within-day row index and the HR values to the project
# helpers acrophase / amplitude / mesor.
output_path = os.path.join(output_folder, "HR_interpolated_720.csv")
HR_interpolated = pd.read_csv(output_path)

HR_interpolated['HR'] = pd.to_numeric(HR_interpolated['HR'])

id_list = HR_interpolated['ID'].unique()

CIRCADIAN_COLUMNS = ['ID', 'date', 'acr', 'amp', 'mesor']
# A day is analysed only if it has strictly more than this many non-missing HR values.
MIN_VALID_MINUTES = 720

# Collect plain records and build the DataFrame once at the end, instead of
# concatenating a one-row frame onto a growing table on every iteration.
circadian_records = []
for participant_id in id_list:
    hr_by_id = HR_interpolated.loc[HR_interpolated['ID'] == participant_id]
    # sort=False keeps the days in the order they appear in the file.
    for day, hr_by_day in hr_by_id.groupby('date', sort=False):
        if hr_by_day['HR'].count() <= MIN_VALID_MINUTES:
            continue
        # Renumber the day's rows 0..n-1 and expose that numbering as the 'index'
        # column, which the helpers receive as their first argument.
        temp_date = hr_by_day.reset_index(drop=True).reset_index()
        acr = acrophase(temp_date['index'], temp_date['HR'])
        amp = amplitude(temp_date['index'], temp_date['HR'])
        mes = mesor(temp_date['index'], temp_date['HR'])
        circadian_records.append([participant_id, day, acr, amp, mes])
        print(participant_id, day, acr, amp, mes)

# Passing `columns` keeps the header even when no day qualifies.
circadian_data = pd.DataFrame(circadian_records, columns=CIRCADIAN_COLUMNS)

output_path = os.path.join(output_folder, "circadian_parameter_720.csv")
circadian_data.to_csv(output_path, index=False)


PXPN_10006 2024-11-25 19.265145588436745 15.424105846332768 69.03950135486359
PXPN_10006 2024-11-26 17.994066748389073 11.548442258213504 70.02448313558153
PXPN_10007 2024-11-30 21.576007247313072 1.0231328086876441 81.67332569912041
PXPN_10008 2024-11-05 19.770565746121743 18.491975889228527 77.24021177435837
PXPN_10008 2024-11-12 19.614707422070083 16.678075562293504 79.32278832555278
PXPN_10008 2024-11-13 18.541739662094216 7.441858093046665 72.66930650655237
PXPN_10008 2024-11-21 21.014255048695478 17.61298673939339 77.96165404811236
PXPN_10008 2024-11-28 15.441927204364685 7.686289341123911 78.49288894876533
PXPN_10011 2024-11-21 1.0102779284418977 16.54302164609696 104.18441510054075
PXPN_10011 2024-11-25 1.6321218131114865 28.999639909969943 110.13771794332347
PXPN_10011 2024-11-26 12.12516858682338 19.52795850434224 85.28402832094925
PXPN_10011 2024-11-30 16.273908480290647 23.153342481335066 77.15986539844646
PXPN_10018 2024-11-22 12.838550456043496 12.629442879917468 95.11612

In [15]:
import pandas as pd
# --- Day-over-day deltas of the circadian parameters -------------------------
# For each ID the parameter table is re-gridded onto every calendar day between
# its first and last analysed day, then lag-1 (`*_delta`) and lag-2 (`*_delta2`)
# differences are taken. Days without parameters stay NaN, so a delta is only
# defined when both days involved were analysed.
output_path = os.path.join(output_folder, "circadian_parameter_720.csv")
circadian = pd.read_csv(output_path)
circadian['date'] = pd.to_datetime(circadian['date'])
id_list = circadian['ID'].unique()

# Column order of the written CSV; columns not listed here are appended after these.
CIRCADIAN_DELTA_COLUMNS = ['ID', 'date', 'acr', 'amp', 'mesor', 'acr_delta', 'acr_delta2',
                           'amp_delta', 'amp_delta2', 'mesor_delta', 'mesor_delta2']

# Collect the per-ID frames and concatenate once: avoids quadratic re-copying and
# the dtype ambiguity of concatenating onto an empty object-dtype frame.
per_id_frames = []
for participant_id in id_list:
    circadian_id = circadian.loc[circadian.ID == participant_id]
    time_per_day = pd.date_range(circadian_id.date.min(), circadian_id.date.max(), freq='D')
    # Right-merge onto the full calendar so missing days appear as NaN rows.
    circadian_id = pd.merge(circadian_id, pd.DataFrame({'date': time_per_day}), how='right', on='date')
    circadian_id['ID'] = participant_id  # the inserted gap rows have no ID yet
    for feature in ('acr', 'amp', 'mesor'):
        circadian_id[f'{feature}_delta'] = circadian_id[feature].diff()
        circadian_id[f'{feature}_delta2'] = circadian_id[feature].diff(periods=2)
    per_id_frames.append(circadian_id)

if per_id_frames:
    circadian_delta = pd.concat(per_id_frames, axis=0, ignore_index=True)
    extra_columns = [col for col in circadian_delta.columns if col not in CIRCADIAN_DELTA_COLUMNS]
    circadian_delta = circadian_delta[CIRCADIAN_DELTA_COLUMNS + extra_columns]
    circadian_delta['date'] = circadian_delta['date'].dt.strftime('%Y-%m-%d')
else:
    # No analysed days at all: still emit a well-formed, header-only table.
    circadian_delta = pd.DataFrame(columns=CIRCADIAN_DELTA_COLUMNS)

output_path = os.path.join(output_folder, "circadian_delta_720.csv")
circadian_delta.to_csv(output_path, index=False)

In [16]:
import pandas as pd
import numpy as np
from utils_for_preprocessing import (
    check_bandpower_value_a,
    check_bandpower_value_b,
    check_bandpower_value_c,
    check_bandpower_value_d,
)
from joblib import Parallel, delayed
from tqdm.auto import tqdm
from tqdm_joblib import tqdm_joblib
# --- Heart-rate band power per ID/day ----------------------------------------
output_path = os.path.join(output_folder, "HR_interpolated_720.csv")

# 1) Load the per-minute heart-rate table and convert types.
HR = pd.read_csv(output_path)
HR["HR"] = pd.to_numeric(HR["HR"], errors="coerce")
# The file keeps the calendar day ('date') and the clock time ('time') in separate
# columns. Combine them so `date` has minute resolution: it is the key that is
# matched against a per-minute range below, and a day-only value would put every
# sample of a day on the single 00:00 slot.
HR["date"] = pd.to_datetime(HR["date"].astype(str) + " " + HR["time"].astype(str))

# 2) Build a gap-free per-minute table: for every ID, one row per minute from the
#    start of its first day to the end of its last day (missing minutes -> NaN HR).
per_id_frames = []
for participant_id, df_id in HR.groupby("ID", sort=False):
    first_minute = df_id["date"].min().floor("D")
    last_minute = df_id["date"].max().floor("D") + pd.Timedelta(days=1) - pd.Timedelta(minutes=1)
    time_per_min = pd.date_range(first_minute, last_minute, freq="min")
    df_id = pd.merge(df_id, pd.DataFrame({"date": time_per_min}), how="right", on="date")
    df_id["ID"] = participant_id  # the inserted gap rows have no ID yet
    per_id_frames.append(df_id)

if per_id_frames:
    # Single concat instead of growing a frame inside the loop.
    df_per_min = pd.concat(per_id_frames, axis=0, ignore_index=True)
else:
    # Empty input: keep the expected columns, with a datetime dtype so `.dt` works.
    df_per_min = pd.DataFrame({
        "ID": pd.Series(dtype="object"),
        "HR": pd.Series(dtype="float64"),
        "date": pd.Series(dtype="datetime64[ns]"),
    })

df_per_min["day"] = df_per_min["date"].dt.date

# 3) Band-power computation for a single ID/day group.
def compute_bandpower_for_group(group, min_valid_samples=720):
    """Compute the four band-power values for one (ID, day) group.

    Parameters
    ----------
    group : tuple
        ``((ID, day), sub_frame)`` as yielded by iterating
        ``df_per_min.groupby(["ID", "day"])``.
    min_valid_samples : int, default 720
        The group is skipped unless it has strictly more non-missing HR values
        than this.

    Returns
    -------
    dict or None
        ``None`` for skipped groups; otherwise a record with ``ID``, ``date`` and
        ``bandpower_a`` .. ``bandpower_d`` as returned by the project helpers
        ``check_bandpower_value_a`` .. ``_d``, which receive the row positions
        ``0..len-1`` and the HR values of the group.
    """
    (id_, day), sub = group
    valid_count = sub["HR"].notna().sum()
    if valid_count <= min_valid_samples:
        return None
    idx = np.arange(len(sub))
    hr = sub["HR"].to_numpy()
    return {
        "ID":           id_,
        "date":         pd.Timestamp(day),
        "bandpower_a":  check_bandpower_value_a(idx, hr),
        "bandpower_b":  check_bandpower_value_b(idx, hr),
        "bandpower_c":  check_bandpower_value_c(idx, hr),
        "bandpower_d":  check_bandpower_value_d(idx, hr),
    }

groups = df_per_min.groupby(["ID", "day"], sort=False)

# 4) Run the groups in parallel with a progress bar. One task is one ID/day group,
#    so the bar's total is the number of groups (not the number of IDs).
with tqdm_joblib(tqdm(total=groups.ngroups, desc="ID/day groups")):
    results = Parallel(n_jobs=-1)(
        delayed(compute_bandpower_for_group)(grp)
        for grp in groups
    )

# 5) Drop skipped groups and build the result table; explicit columns keep the
#    header even when no group qualified.
records = [r for r in results if r is not None]
bandpower_df = pd.DataFrame(
    records,
    columns=["ID", "date", "bandpower_a", "bandpower_b", "bandpower_c", "bandpower_d"],
)

output_path = os.path.join(output_folder, "bandpower_720.csv")
bandpower_df.to_csv(output_path, index=False)

IDs:   0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]