# Panic Project (DHLAB) - Data Imputation

author:  `@cyshin971`  

date:    `2025-06-23`  

version: `2.1`

> version `2.1`: Questionnaire forward filling added (previously implemented in scraping stage)  
> version `2.0`: imputation separated from `data_analysis.ipynb` (version `1.3`)  

In [1]:
# Imputation version tag: stored in the features dict as 'imputation_version'
# and embedded in the names of the CSV files this notebook writes.
version = '2-1'

# 📚 | Import Libraries 

In [2]:
import config as cfg
import logging

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from library.pandas_utils import move_column, remove_columns, aggregate_by_column, create_empty_df, read_csv
from library.text_utils import save_as_csv
from library.json_utils import save_dict_to_file, load_dict_from_file
from library.path_utils import get_file_path

# ⚙️ | Settings

In [3]:
# Optional override for the scraped-data file stem (without the ".csv" extension).
# When not None, it replaces the scraped_data_filename read from panic_features_dict.
manual_scraped_data_filename = 'final_result_20250620_720' # Keep as None if you don't want to manually specify a file

# Strategy used to fill null values in the lifelog features.
lifelog_null_avg = False
# True: will replace null values with average per patient
# False: will replace null values with 0 in lifelog data 

# 📁 | Path Variables 

In [4]:
DATA_PATH = "./_data"
TMP_PATH = "./cys/_tmp"
OUT_PATH = "./cys/_output"

# The features dict is produced by data_analysis.ipynb; without it the column
# groups, the scraped file name and the preprocessing version are unknown.
try:
    features_dict = load_dict_from_file(OUT_PATH, 'panic_features_dict')
except FileNotFoundError as err:
    raise FileNotFoundError(f"File not found: {get_file_path(OUT_PATH, 'panic_features_dict')}. Please run data_analysis.ipynb first.") from err
print(f"Loaded features dict with {len(features_dict)} keys:")

# Both values are initialised up front so that a missing key is reported here
# with a clear message instead of surfacing later as a NameError.
scraped_data_filename = None
preproc_version = None
for k, v in features_dict.items():
    if k == 'scraped_data_filename':
        print(f"  {k}: {v}.csv")
        scraped_data_filename = v
    elif k == 'preproc_version':
        preproc_version = v
    else:
        print(f"  {k}: {v}")

if scraped_data_filename is None:
    raise ValueError("scraped_data_filename not found in features_dict")
if preproc_version is None:
    raise ValueError("preproc_version not found in features_dict")
if manual_scraped_data_filename is not None:
    logging.warning(f"Using manually specified scraped_data_filename: {manual_scraped_data_filename}. If this is not intended, please set it to None.")
    scraped_data_filename = manual_scraped_data_filename

# Record which imputation version produced the downstream files.
# NOTE(review): the manual override above is not written back to the dict, so the
# saved 'scraped_data_filename' may differ from the one used here — confirm intended.
features_dict['imputation_version'] = version
save_dict_to_file(features_dict, OUT_PATH, 'panic_features_dict')

# Input (preprocessed) and output (imputed) folders for the selected scraped file.
PREPROC_PATH = f"{OUT_PATH}/{scraped_data_filename}/preprocessed"
OUTPUT_PATH = f"{OUT_PATH}/{scraped_data_filename}/imputed"

DEBUG - (json_utils.py) load_dict_from_file: Dictionary loaded successfully from C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output\panic_features_dict.json
DEBUG - (json_utils.py) save_dict_to_file: Dictionary saved successfully to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output\panic_features_dict.json


Loaded features dict with 12 keys:
  scraped_data_filename: final_result_20250620_360.csv
  demography: ['gender', 'age', 'marriage', 'job', 'smkHx', 'drinkHx', 'suicideHx', 'suicide_need']
  dailylog: ['severity', 'exercise', 'alcohol', 'coffee', 'menstruation', 'smoking', 'positive_feeling', 'negative_feeling', 'positive_E', 'negative_E', 'anxiety', 'annoying']
  lifelog: ['HR_var', 'HR_max', 'HR_mean', 'HR_hvar_mean', 'HR_acrophase', 'HR_amplitude', 'HR_mesor', 'HR_acrophase_difference', 'HR_acrophase_difference_2d', 'HR_amplitude_difference', 'HR_amplitude_difference_2d', 'HR_mesor_difference', 'HR_mesor_difference_2d', 'bandpower(0.001-0.0005Hz)', 'bandpower(0.0005-0.0001Hz)', 'bandpower(0.0001-0.00005Hz)', 'bandpower(0.00005-0.00001Hz)', 'steps', 'SLT1', 'SLT2', 'SLT3', 'SLT4', 'SLT5', 'SLT6', 'total_sleep', 'steps_maximum', 'steps_mean', 'step_hvar_mean', 'step_delta', 'step_max_delta', 'step_mean_delta', 'step_hvar_mean_delta', 'step_delta2', 'step_max_delta2', 'step_mean_delta

# ⚒️ | Preprocessed Data

## Load Preprocessed Data

In [5]:
def _load_preprocessed(prefix):
    """Read one preprocessed CSV from PREPROC_PATH, preview its first five rows, and return it."""
    file_name = f'{prefix}_{preproc_version}({scraped_data_filename}).csv'
    frame = read_csv(get_file_path(PREPROC_PATH, file_name))
    display(frame.head(5))
    return frame


# Loaded (and previewed) in the same order as the tables are used below.
pre_data = _load_preprocessed('panic_pre_data')
metadata = _load_preprocessed('panic_metadata')
demography_data = _load_preprocessed('panic_demography_data')

Unnamed: 0,entry_id,dataset,ID,date,PHQ_9,STAI_X2,CSM,CTQ_1,CTQ_2,CTQ_3,...,SLT2,SLT3,SLT4,SLT5,SLT6,total_sleep,dbp,panic,severity,panic_label
0,PXPN_10006_2024-11-04,PXPN,PXPN_10006,2024-11-04,0.0,32.0,31.0,11.0,13.0,17.0,...,,,,,,,,0.0,,0
1,PXPN_10006_2024-11-05,PXPN,PXPN_10006,2024-11-05,0.0,32.0,31.0,11.0,13.0,17.0,...,4.47,3.62,4.67,0.65,1.85,15.26,,0.0,,0
2,PXPN_10006_2024-11-06,PXPN,PXPN_10006,2024-11-06,0.0,32.0,31.0,11.0,13.0,17.0,...,0.0,0.2,4.07,1.43,1.68,7.38,1.0,1.0,,0
3,PXPN_10006_2024-11-07,PXPN,PXPN_10006,2024-11-07,0.0,32.0,31.0,11.0,13.0,17.0,...,0.0,0.14,5.08,0.0,0.97,6.19,0.0,2.0,1.0,1
4,PXPN_10006_2024-11-08,PXPN,PXPN_10006,2024-11-08,0.0,32.0,31.0,11.0,13.0,17.0,...,,,,,,,,0.0,,0


Unnamed: 0,entry_id,ID,date,dataset,dailylog_data,lifelog_data,questionnaire_data,dtype_n,dbp,panic,n_prior_data,valid_entry_3,valid_entry_2,valid_entry_1,ref_event_id,panic_label,severity
0,SYM2-1-96_2021-08-04,SYM2-1-96,2021-08-04,SYM2,0,1,0,1,,0.0,2,0,1,1,,0,
1,SYM2-1-96_2021-08-03,SYM2-1-96,2021-08-03,SYM2,0,1,0,1,,0.0,1,0,0,1,,0,
2,SYM2-1-96_2021-08-02,SYM2-1-96,2021-08-02,SYM2,0,1,0,1,,0.0,0,0,0,0,,0,
3,SYM2-1-96_2021-07-30,SYM2-1-96,2021-07-30,SYM2,0,1,0,1,,0.0,2,0,1,1,,0,
4,SYM2-1-96_2021-07-29,SYM2-1-96,2021-07-29,SYM2,0,1,0,1,,0.0,1,0,0,1,,0,


Unnamed: 0,ID,gender,age,marriage,job,smkHx,drinkHx,suicideHx,suicide_need
0,PXPN_10006,0,32.0,0.0,1.0,1.0,1.0,0.0,0.0
1,PXPN_10007,1,38.0,1.0,1.0,0.0,0.0,0.0,0.0
2,PXPN_10008,0,38.0,1.0,0.0,0.0,1.0,0.0,0.0
3,PXPN_10009,1,28.0,0.0,0.0,1.0,0.0,1.0,0.0
4,PXPN_10010,1,21.0,0.0,0.0,1.0,1.0,0.0,0.0


# Data Imputation

## Questionnaire

- Within each patient (`ID`), null values were forward filled from the first available value in the scraped data
- Entries with null values prior to the first available value were backward filled
- Any remaining null values (patients with no value at all) were set to `0`

In [6]:
# Sort by patient and date so forward/backward filling follows each patient's
# timeline. sort_values returns a new frame, so pre_data itself is left untouched
# and the original index labels are kept.
pre_data_filled_questionnaire = pre_data.sort_values(by=['ID', 'date'])

questionnaire_cols = features_dict['questionnaire']

# transform() returns a result aligned on the frame's index, so the assignment
# does not depend on the row order produced by the groupby.
# Per patient: forward fill, then backward fill the leading gaps; anything still
# null (no value for that patient at all) becomes 0.
pre_data_filled_questionnaire[questionnaire_cols] = (
    pre_data_filled_questionnaire.groupby('ID')[questionnaire_cols]
        .transform(lambda s: s.ffill().bfill())
        .fillna(0)
)

# Spot check on a single patient.
disp_df = pre_data_filled_questionnaire[pre_data_filled_questionnaire['ID'] == 'SYM1-1-380'].copy()
display(disp_df[features_dict['id']+features_dict['questionnaire']].head(5))

Unnamed: 0,entry_id,ID,date,dataset,PHQ_9,STAI_X2,CSM,CTQ_1,CTQ_2,CTQ_3,...,CTQ_5,KRQ,MDQ,ACQ,APPQ_1,APPQ_2,APPQ_3,BSQ,GAD_7,BRIAN
9523,SYM1-1-380_2021-07-24,SYM1-1-380,2021-07-24,SYM1,7.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,18.0,20.0,14.0,14.0,41.0,1.0,34.0
9524,SYM1-1-380_2021-07-25,SYM1-1-380,2021-07-25,SYM1,7.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,18.0,20.0,14.0,14.0,41.0,1.0,34.0
9525,SYM1-1-380_2021-07-26,SYM1-1-380,2021-07-26,SYM1,7.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,18.0,20.0,14.0,14.0,41.0,1.0,34.0
9526,SYM1-1-380_2021-07-27,SYM1-1-380,2021-07-27,SYM1,7.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,18.0,20.0,14.0,14.0,41.0,1.0,34.0
9527,SYM1-1-380_2021-07-28,SYM1-1-380,2021-07-28,SYM1,7.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,18.0,20.0,14.0,14.0,41.0,1.0,34.0


## Daily Log

- Null values set to `0`

In [7]:
# Work on a copy so the questionnaire-filled frame stays available for re-runs.
pre_data_filled_dailylog = pre_data_filled_questionnaire.copy()

# Fill Null Values: every missing daily log value is set to 0.
pre_data_filled_dailylog.loc[:, features_dict['dailylog']] = pre_data_filled_dailylog.loc[:, features_dict['dailylog']].fillna(0)

## Life Log

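- The fill strategy is controlled by `lifelog_null_avg` (see Settings)
- `lifelog_null_avg = False`: null values set to `0`
- `lifelog_null_avg = True`: null values set to the average of existing entry values per patient
- For cases when a patient has no existing entry values, the average of the per-patient averages was used to set null values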

In [8]:
# Work on a copy so the dailylog-filled frame stays available for re-runs.
pre_data_filled_lifelog = pre_data_filled_dailylog.copy()
lifelog_cols = features_dict['lifelog']

if not lifelog_null_avg:
    logging.info("Filling null values in lifelog data with 0")
    # Fill Null Values with 0
    pre_data_filled_lifelog.loc[:, lifelog_cols] = pre_data_filled_lifelog.loc[:, lifelog_cols].fillna(0)
else:
    logging.info("Filling null values in lifelog data with average per patient")
    # Find the mean of each column for each ID
    agg_matrix = [(col, col, 'mean') for col in lifelog_cols]
    lifelog_agg = aggregate_by_column(pre_data_filled_lifelog, 'ID', agg_matrix)

    # Patients without any value for a column fall back to the mean of the
    # per-patient means of that column.
    for col in lifelog_cols:
        lifelog_agg[col] = lifelog_agg[col].fillna(lifelog_agg[col].mean())

    # Look up each row's patient mean once per column instead of masking the
    # frame for every (patient, column) pair. Keeping the first row per ID
    # mirrors the previous `.values[0]` lookup.
    patient_means = lifelog_agg.drop_duplicates(subset='ID').set_index('ID')
    for col in lifelog_cols:
        fill_values = pre_data_filled_lifelog['ID'].map(patient_means[col])
        pre_data_filled_lifelog[col] = pre_data_filled_lifelog[col].fillna(fill_values)

display(pre_data_filled_lifelog[lifelog_cols].head(5))

INFO - (1287973818.py) <module>: Filling null values in lifelog data with 0


Unnamed: 0,HR_var,HR_max,HR_mean,HR_hvar_mean,HR_acrophase,HR_amplitude,HR_mesor,HR_acrophase_difference,HR_acrophase_difference_2d,HR_amplitude_difference,...,step_hvar_mean,step_delta,step_max_delta,step_mean_delta,step_hvar_mean_delta,step_delta2,step_max_delta2,step_mean_delta2,step_hvar_mean_delta2,steps_variance
0,364.72,135.0,74.33,123.22,0.0,0.0,0.0,0.0,0.0,0.0,...,24734.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31382.71
1,92.21,84.0,54.81,29.4,0.0,0.0,0.0,0.0,0.0,0.0,...,776.67,772.0,-409.0,-88.71,-23957.39,0.0,0.0,0.0,0.0,1740.3
2,171.01,115.0,62.71,50.58,0.0,0.0,0.0,0.0,0.0,0.0,...,4892.6,577.0,260.0,22.7,4115.93,1349.0,-149.0,-66.01,-19841.46,8995.2
3,573.86,148.0,79.18,72.7,0.0,0.0,0.0,0.0,0.0,0.0,...,20119.74,7336.0,276.0,68.51,15227.13,7913.0,536.0,91.21,19343.06,30976.38
4,319.68,124.0,87.58,228.52,0.0,0.0,0.0,0.0,0.0,0.0,...,32949.61,-4014.0,208.0,36.32,12829.87,3322.0,484.0,104.83,28057.0,47028.76


## 💾 | Save Filled Data

In [9]:
# Final imputed table: questionnaire, daily log and life log nulls have been filled.
pre_data_filled = pre_data_filled_lifelog.copy()

# Written to OUTPUT_PATH; the name carries the imputation version and the scraped file stem.
save_as_csv(pre_data_filled, OUTPUT_PATH, f"panic_pre_data_filled_{version}({scraped_data_filename})")

DEBUG - (text_utils.py) save_as_csv: Saved panic_pre_data_filled_2-1(final_result_20250620_720).csv to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output\final_result_20250620_720\imputed


WindowsPath('C:/Users/cyshi/OneDrive/Documents/GitHub/Panic-Project-CYS/cys/_output/final_result_20250620_720/imputed/panic_pre_data_filled_2-1(final_result_20250620_720).csv')

## Filled Metadata

In [10]:
metadata_filled = metadata.copy()
if len(pre_data_filled) != len(metadata):
    raise ValueError(f"Length of pre_data_filled ({len(pre_data_filled)}) does not match length of metadata ({len(metadata)}). Please check the data consistency.")
if not pre_data_filled['entry_id'].is_unique:
    raise ValueError("entry_id values in pre_data_filled are not unique. Please check the data consistency.")

# metadata and pre_data_filled are not stored in the same row order, so the
# availability flags are matched on entry_id rather than on the row index.
filled_by_entry = pre_data_filled.set_index('entry_id')
flag_sources = (
    ('dailylog_data', 'dailylog'),
    ('lifelog_data', 'lifelog'),
    ('questionnaire_data', 'questionnaire'),
)
for flag_col, feature_key in flag_sources:
    # 1 when the entry has at least one non-null value in the feature group.
    has_data = filled_by_entry[features_dict[feature_key]].notnull().any(axis=1).astype(int)
    # Entries missing from pre_data_filled have no data and are flagged 0.
    metadata_filled[flag_col] = metadata_filled['entry_id'].map(has_data).fillna(0).astype(int)
# NOTE(review): pre_data_filled is already imputed, so these flags are expected to
# be 1 for every matched entry — confirm this is the intended meaning.

# TODO: Diary data is not used in the current analysis, but can be useful for future reference
metadata_filled['dtype_n'] = metadata_filled['dailylog_data'] + metadata_filled['lifelog_data'] + metadata_filled['questionnaire_data']

save_as_csv(metadata_filled, OUTPUT_PATH, f"panic_metadata_filled_{version}({scraped_data_filename})")
display(metadata_filled.head(5))

DEBUG - (text_utils.py) save_as_csv: Saved panic_metadata_filled_2-1(final_result_20250620_720).csv to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output\final_result_20250620_720\imputed


Unnamed: 0,entry_id,ID,date,dataset,dailylog_data,lifelog_data,questionnaire_data,dtype_n,dbp,panic,n_prior_data,valid_entry_3,valid_entry_2,valid_entry_1,ref_event_id,panic_label,severity
0,SYM2-1-96_2021-08-04,SYM2-1-96,2021-08-04,SYM2,1,1,1,3,,0.0,2,0,1,1,,0,
1,SYM2-1-96_2021-08-03,SYM2-1-96,2021-08-03,SYM2,1,1,1,3,,0.0,1,0,0,1,,0,
2,SYM2-1-96_2021-08-02,SYM2-1-96,2021-08-02,SYM2,1,1,1,3,,0.0,0,0,0,0,,0,
3,SYM2-1-96_2021-07-30,SYM2-1-96,2021-07-30,SYM2,1,1,1,3,,0.0,2,0,1,1,,0,
4,SYM2-1-96_2021-07-29,SYM2-1-96,2021-07-29,SYM2,1,1,1,3,,0.0,1,0,0,1,,0,


# 📒 | Reports

In [11]:
# Headline counts for the imputed table: patients, entries and panic-labelled entries.
unique_filled_ids = pre_data_filled['ID'].unique()
panic_entries = pre_data_filled.loc[pre_data_filled['panic_label'] == 1]

summary_counts = (
    ("unique IDs", len(unique_filled_ids)),
    ("entries", len(pre_data_filled)),
    ("panic entries", len(panic_entries)),
)
for description, count in summary_counts:
    print(f"Total number of {description} in pre_data_filled: {count}")

Total number of unique IDs in pre_data_filled: 273
Total number of entries in pre_data_filled: 23828
Total number of panic entries in pre_data_filled: 811


In [12]:
# Share of entries flagged as having each data type, relative to all unique entry ids.
filled_entry_ids = pre_data_filled['entry_id'].unique()
n_filled_entries = len(filled_entry_ids)

coverage_flags = (
    ("daily log", 'dailylog_data'),
    ("life log", 'lifelog_data'),
    ("questionnaire", 'questionnaire_data'),
)
for label, flag_col in coverage_flags:
    n_flagged = metadata_filled[metadata_filled[flag_col] == 1].shape[0]
    print(f"Total number of {label} entries: {n_flagged} / {n_filled_entries} ({n_flagged / n_filled_entries * 100:.2f}%)")
# print(f"Total number of panic diary entries: {metadata_filled[metadata_filled['diary_data'] == 1].shape[0]} / {len(filled_entry_ids)} ({metadata_filled[metadata_filled['diary_data'] == 1].shape[0] / len(filled_entry_ids) * 100:.2f}%)")

Total number of daily log entries: 23828 / 23828 (100.00%)
Total number of life log entries: 23828 / 23828 (100.00%)
Total number of questionnaire entries: 23828 / 23828 (100.00%)


In [13]:
# Patients with at least one entry labelled as a panic event.
is_panic_entry = metadata_filled['panic_label'] == 1
panic_patients = metadata_filled.loc[is_panic_entry, 'ID'].unique()
print(f"Total number of patients with panic events: {len(panic_patients)}")

Total number of patients with panic events: 105
