# Panic Project (DHLAB) - Data Imputation

author:  `@cyshin971`  

date:    `2025-07-xx`  

Instructions:
- Run `data_preprocessing.ipynb` first (this notebook loads its `panic_features_dict` and preprocessed CSV files)
- Output can be found in `./_output/{scraped_data_filename}/imputed`

version: `3.0`
 
> version `2.0`: imputation separated from `data_analysis.ipynb` (version `1.3`)  
> version `3.0`: Release Version

In [1]:
# Version tag of this imputation notebook (release 3.0, hyphenated); it is embedded in the output file names.
version = "3-0"

# 📚 | Import Libraries 

Required Packages:
- `python` (`3.10`)
- `pandas`  
- `numpy`
- `json`

In [2]:
import logging

import numpy as np

from library.pandas_utils import create_empty_df, read_csv
from library.text_utils import save_as_csv
from library.json_utils import save_dict_to_file, load_dict_from_file
from library.path_utils import get_file_path

# Report the NumPy version in use so a run can be matched to its environment.
print(f"Numpy version: {np.__version__}")

Numpy version: 1.26.4


# ⚙️ | Settings

In [3]:
# Optional override for the scraped-data file name.
# Keep as None if you don't want to manually specify a file.
manual_scraped_data_filename = None

# Per-patient averaging strategy (default: True).
#   True  -> growing average: the average of all previous entries for the patient
#   False -> fixed average: the average of all entries for that patient
use_growing_avg = True

# Fallback for null values with no existing entries for that patient (default: True).
#   True  -> replace them with 0
#   False -> replace them with the patient average or global average
null_default_zero = True

# 📁 | Path Variables 

In [4]:
TMP_PATH = "./_tmp"
OUT_PATH = "./_output"

# Load the feature dictionary shared between notebooks; it carries the scraped-data
# file name and the preprocessing version needed to locate the preprocessed CSV files.
try:
    features_dict = load_dict_from_file(OUT_PATH, 'panic_features_dict')
except FileNotFoundError as err:
    # Keep the original exception as the cause so the failing path stays visible in the traceback.
    raise FileNotFoundError(f"File not found: {get_file_path(OUT_PATH, 'panic_features_dict')}. Please run data_preprocessing.ipynb first.") from err

print(f"Loaded features dict with {len(features_dict)} keys:")
for k, v in features_dict.items():
    # Only the scraped-data file name is shown with its ".csv" extension.
    suffix = ".csv" if k == 'scraped_data_filename' else ""
    print(f"  {k}: {v}{suffix}")

# Both keys are required by the following cells; fail here with a clear message
# instead of hitting a NameError later when a key is absent.
scraped_data_filename = features_dict.get('scraped_data_filename')
preproc_version = features_dict.get('preproc_version')

if scraped_data_filename is None:
    raise ValueError("scraped_data_filename not found in features_dict. Please ensure that data_preprocessing.ipynb has been run successfully before running this notebook.")
if preproc_version is None:
    raise ValueError("preproc_version not found in features_dict. Please ensure that data_preprocessing.ipynb has been run successfully before running this notebook.")
if manual_scraped_data_filename is not None:
    logging.warning(f"Using manually specified scraped_data_filename: {manual_scraped_data_filename}. If this is not intended, please set it to None.")
    scraped_data_filename = manual_scraped_data_filename

# Record the imputation settings next to the preprocessing information.
features_dict['imputation_version'] = version
features_dict['use_growing_avg'] = use_growing_avg
features_dict['null_default_zero'] = null_default_zero
save_dict_to_file(features_dict, OUT_PATH, 'panic_features_dict')

# Input (preprocessed) and output (imputed) directories for the selected scraped-data file.
PREPROC_PATH = f"{OUT_PATH}/{scraped_data_filename}/preprocessed"
OUTPUT_PATH = f"{OUT_PATH}/{scraped_data_filename}/imputed"

DEBUG - (json_utils.py) load_dict_from_file: Dictionary loaded successfully from C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\_output\panic_features_dict.json
DEBUG - (json_utils.py) save_dict_to_file: Dictionary saved successfully to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\_output\panic_features_dict.json


Loaded features dict with 15 keys:
  scraped_data_filename: final_result_20250626_360_no_ffill.csv
  preproc_version: 3-0
  demography: ['gender', 'age', 'marriage', 'job', 'smkHx', 'drinkHx', 'suicideHx', 'suicide_need']
  dailylog: ['severity', 'exercise', 'alcohol', 'coffee', 'menstruation', 'smoking', 'positive_feeling', 'negative_feeling', 'positive_E', 'negative_E', 'anxiety', 'annoying']
  mood: ['positive_feeling', 'negative_feeling', 'positive_E', 'negative_E', 'anxiety', 'annoying']
  dailylog_life: ['exercise', 'alcohol', 'coffee', 'menstruation', 'smoking']
  lifelog: ['HR_var', 'HR_max', 'HR_mean', 'HR_hvar_mean', 'HR_acrophase', 'HR_amplitude', 'HR_mesor', 'HR_acrophase_difference', 'HR_acrophase_difference_2d', 'HR_amplitude_difference', 'HR_amplitude_difference_2d', 'HR_mesor_difference', 'HR_mesor_difference_2d', 'bandpower(0.001-0.0005Hz)', 'bandpower(0.0005-0.0001Hz)', 'bandpower(0.0001-0.00005Hz)', 'bandpower(0.00005-0.00001Hz)', 'steps', 'SLT1', 'SLT2', 'SLT3', 'SL

# ⚒️ | Preprocessed Data

## Load Preprocessed Data

In [5]:
# Each preprocessed table is read from PREPROC_PATH and previewed (first two rows) right after loading.

# Per-entry feature table
pre_data_csv = get_file_path(PREPROC_PATH, f'panic_pre_data_{preproc_version}({scraped_data_filename}).csv')
pre_data = read_csv(pre_data_csv)
display(pre_data.head(2))

# Per-entry metadata table
metadata_csv = get_file_path(PREPROC_PATH, f'panic_metadata_{preproc_version}({scraped_data_filename}).csv')
metadata = read_csv(metadata_csv)
display(metadata.head(2))

# Per-patient demography table
demography_csv = get_file_path(PREPROC_PATH, f'panic_demography_data_{preproc_version}({scraped_data_filename}).csv')
demography_data = read_csv(demography_csv)
display(demography_data.head(2))

Unnamed: 0,entry_id,dataset,ID,date,PHQ_9,STAI_X2,CSM,CTQ_1,CTQ_2,CTQ_3,...,SLT1,SLT2,SLT3,SLT4,SLT5,SLT6,total_sleep,panic,severity,panic_label
0,PXPN_10006_2024-11-04,PXPN,PXPN_10006,2024-11-04,0.0,32.0,31.0,11.0,13.0,17.0,...,,,,,,,,0.0,,0
1,PXPN_10006_2024-11-05,PXPN,PXPN_10006,2024-11-05,,,,,,,...,0.0,4.47,3.62,4.67,0.65,1.85,15.26,0.0,,0


Unnamed: 0,entry_id,ID,date,dataset,dailylog_data,lifelog_data,questionnaire_data,dtype_n,panic,n_prior_data,dbp,valid_entry_3,valid_entry_2,valid_entry_1,ref_event_id,panic_label,severity
0,SYM2-1-96_2021-08-04,SYM2-1-96,2021-08-04,SYM2,0,1,0,1,0.0,2,,0,1,1,,0,
1,SYM2-1-96_2021-08-03,SYM2-1-96,2021-08-03,SYM2,0,1,0,1,0.0,1,,0,0,1,,0,


Unnamed: 0,ID,gender,age,marriage,job,smkHx,drinkHx,suicideHx,suicide_need
0,PXPN_10006,0,32.0,0.0,1.0,1.0,1.0,0.0,0.0
1,PXPN_10007,1,38.0,1.0,1.0,0.0,0.0,0.0,0.0


# Data Imputation

## Questionnaire

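- Within each `ID` (sorted by `date`), null values are forward filled starting from the first recorded value in the scraped data
- Entries with null values prior to the first recorded value are backward filled from that first value
- Subsequent entries are forward filled from the most recent recorded value
- Any other null values (e.g. an `ID` with no recorded value for that questionnaire) are set to 0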

In [6]:
# Work on a copy sorted by patient and date so that forward/backward fills follow
# chronological order within each ID. sort_values returns a new frame, so pre_data is untouched.
pre_data_filled_questionnaire = pre_data.sort_values(by=['ID', 'date'])

for qcol in features_dict['questionnaire']:
    # transform() returns a result aligned on the frame's own index, so the assignment
    # does not depend on the row order produced by the groupby.
    pre_data_filled_questionnaire[qcol] = (
        pre_data_filled_questionnaire.groupby('ID')[qcol]
            .transform(lambda s: s.ffill().bfill())  # forward fill, then back fill the leading gap
            .fillna(0)                               # values still missing after both fills become 0
    )

# Preview one patient's ID and questionnaire columns as a sanity check.
disp_df = pre_data_filled_questionnaire[pre_data_filled_questionnaire['ID'] == 'SYM1-1-380'].copy()
display(disp_df[features_dict['id']+features_dict['questionnaire']].head(2))

Unnamed: 0,entry_id,ID,date,dataset,PHQ_9,STAI_X2,CSM,CTQ_1,CTQ_2,CTQ_3,...,CTQ_5,KRQ,MDQ,ACQ,APPQ_1,APPQ_2,APPQ_3,BSQ,GAD_7,BRIAN
7790,SYM1-1-380_2021-07-24,SYM1-1-380,2021-07-24,SYM1,7.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,18.0,20.0,14.0,14.0,41.0,1.0,34.0
7791,SYM1-1-380_2021-07-25,SYM1-1-380,2021-07-25,SYM1,7.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,18.0,20.0,14.0,14.0,41.0,1.0,34.0


## Growing Average Imputation

- Initial fill: All NaNs get 0.
- By ID: For each ID, process their entries in chronological order (date).
- Running average: Each time a valid value is encountered, add it to the running sum/count.
- For every NaN (already set to 0 by the initial fill, so it is detected with the original null mask), fill with the running average so far.
- No values for that ID: Remain at 0 (since running_count stays at 0).
- After each value: The running average is updated, so later gaps are imputed from all values seen up to that point.
- No backward fill: Only forward imputation based on past data.

In [7]:
from utils.imputation_utils import growing_average_impute

# Start from the questionnaire-filled frame; the earlier stage is left untouched.
pre_data_filled_avg = pre_data_filled_questionnaire.copy()

# Features imputed with a per-patient average.
avg_features = features_dict['lifelog'] + features_dict['mood']

if use_growing_avg:
    logging.info("Using growing average imputation.")
    pre_data_filled_avg = growing_average_impute(
        pre_data_filled_avg, 'ID', avg_features, default_fill_zero=null_default_zero
    )
else:
    logging.info("Using fixed average imputation.")
    for col in avg_features:
        # Fill each patient's gaps with that patient's own mean; patients without any
        # value for this feature stay NaN at this point.
        patient_filled = pre_data_filled_avg.groupby('ID')[col].transform(
            lambda x: x.fillna(x.mean())
        )
        # Remaining gaps fall back to 0 or to the mean of the patient-filled column.
        fallback_value = 0 if null_default_zero else patient_filled.mean()
        # Assign the result back explicitly: an in-place fillna on a selected column is
        # chained assignment and does not update the frame under pandas Copy-on-Write.
        pre_data_filled_avg[col] = patient_filled.fillna(fallback_value)

INFO - (4216903050.py) <module>: Using growing average imputation.


## Null Value 0

- Null values set to `0`

In [8]:
# Daily-log lifestyle features: a missing value is replaced with 0.
zero_null_features = features_dict['dailylog_life']

# Copy the previous stage so it stays available unchanged.
pre_data_filled_zero_null = pre_data_filled_avg.copy()

# Fill Null Values
pre_data_filled_zero_null.loc[:, zero_null_features] = pre_data_filled_zero_null.loc[:, zero_null_features].fillna(0)

## 💾 | Save Filled Data

In [9]:
# Final imputed frame, copied from the last imputation stage.
pre_data_filled = pre_data_filled_zero_null.copy()

# The file name records the notebook version, the source file and the imputation settings used.
filled_file_name = f"panic_pre_data_filled_{version}({scraped_data_filename})_{('grw' if use_growing_avg else 'avg')}_{('zero' if null_default_zero else 'global')}"

# Kept as the last expression so the value returned by save_as_csv is shown as the cell output.
save_as_csv(pre_data_filled, OUTPUT_PATH, filled_file_name)

DEBUG - (path_utils.py) make_dir: Created directory: C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\_output\final_result_20250626_360_no_ffill\imputed
DEBUG - (text_utils.py) save_as_csv: Saved panic_pre_data_filled_3-0(final_result_20250626_360_no_ffill)_grw_zero.csv to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\_output\final_result_20250626_360_no_ffill\imputed


WindowsPath('C:/Users/cyshi/OneDrive/Documents/GitHub/Panic-Project-CYS/_output/final_result_20250626_360_no_ffill/imputed/panic_pre_data_filled_3-0(final_result_20250626_360_no_ffill)_grw_zero.csv')

## Filled Metadata

In [10]:
# Start from the preprocessed metadata; only the data-availability flags are recomputed below.
metadata_filled = metadata.copy()
if len(pre_data_filled) != len(metadata):
    raise ValueError(f"Length of pre_data_filled ({len(pre_data_filled)}) does not match length of metadata ({len(metadata)}). Please check the data consistency.")

# metadata and pre_data_filled do not store their rows in the same order, so the flags
# must be matched on entry_id rather than on the row index.
if not pre_data_filled['entry_id'].is_unique:
    raise ValueError("Duplicate entry_id values found in pre_data_filled. Please check the data consistency.")
missing_entry_ids = set(metadata_filled['entry_id']) - set(pre_data_filled['entry_id'])
if missing_entry_ids:
    raise ValueError(f"{len(missing_entry_ids)} entry_id value(s) in metadata are missing from pre_data_filled. Please check the data consistency.")

filled_by_entry = pre_data_filled.set_index('entry_id')
# Flag column in the metadata -> key of the feature group in features_dict.
flag_sources = {
    'dailylog_data': 'dailylog',
    'lifelog_data': 'lifelog',
    'questionnaire_data': 'questionnaire',
}
for flag_col, feature_key in flag_sources.items():
    # 1 when at least one feature of the group is non-null for the entry, otherwise 0.
    has_data = filled_by_entry[features_dict[feature_key]].notnull().any(axis=1).astype(int)
    metadata_filled[flag_col] = metadata_filled['entry_id'].map(has_data)

# TODO: Diary data is not used in the current analysis, but can be useful for future reference
metadata_filled['dtype_n'] = metadata_filled['dailylog_data'] + metadata_filled['lifelog_data'] + metadata_filled['questionnaire_data']

save_as_csv(metadata_filled, OUTPUT_PATH, f"panic_metadata_filled_{version}({scraped_data_filename})_{('grw' if use_growing_avg else 'avg')}_{('zero' if null_default_zero else 'global')}")
display(metadata_filled.head(2))

DEBUG - (text_utils.py) save_as_csv: Saved panic_metadata_filled_3-0(final_result_20250626_360_no_ffill)_grw_zero.csv to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\_output\final_result_20250626_360_no_ffill\imputed


Unnamed: 0,entry_id,ID,date,dataset,dailylog_data,lifelog_data,questionnaire_data,dtype_n,panic,n_prior_data,dbp,valid_entry_3,valid_entry_2,valid_entry_1,ref_event_id,panic_label,severity
0,SYM2-1-96_2021-08-04,SYM2-1-96,2021-08-04,SYM2,1,1,1,3,0.0,2,,0,1,1,,0,
1,SYM2-1-96_2021-08-03,SYM2-1-96,2021-08-03,SYM2,1,1,1,3,0.0,1,,0,0,1,,0,


# 📒 | Reports

In [11]:
# Summary counts for the imputed dataset: patients, entries and entries labelled as panic.
unique_filled_ids = pre_data_filled['ID'].unique()
is_panic_entry = pre_data_filled['panic_label'] == 1
panic_entries = pre_data_filled[is_panic_entry]

print(f"Total number of unique IDs in pre_data_filled: {len(unique_filled_ids)}")
print(f"Total number of entries in pre_data_filled: {len(pre_data_filled)}")
print(f"Total number of panic entries in pre_data_filled: {len(panic_entries)}")

Total number of unique IDs in pre_data_filled: 273
Total number of entries in pre_data_filled: 19379
Total number of panic entries in pre_data_filled: 811


In [12]:
# Share of entries flagged as having each type of data, relative to the number of unique entry IDs.
filled_entry_ids = pre_data_filled['entry_id'].unique()
n_filled_entries = len(filled_entry_ids)

n_dailylog_entries = metadata_filled[metadata_filled['dailylog_data'] == 1].shape[0]
n_lifelog_entries = metadata_filled[metadata_filled['lifelog_data'] == 1].shape[0]
n_questionnaire_entries = metadata_filled[metadata_filled['questionnaire_data'] == 1].shape[0]

print(f"Total number of daily log entries: {n_dailylog_entries} / {n_filled_entries} ({n_dailylog_entries / n_filled_entries * 100:.2f}%)")
print(f"Total number of life log entries: {n_lifelog_entries} / {n_filled_entries} ({n_lifelog_entries / n_filled_entries * 100:.2f}%)")
print(f"Total number of questionnaire entries: {n_questionnaire_entries} / {n_filled_entries} ({n_questionnaire_entries / n_filled_entries * 100:.2f}%)")

Total number of daily log entries: 19379 / 19379 (100.00%)
Total number of life log entries: 19379 / 19379 (100.00%)
Total number of questionnaire entries: 19379 / 19379 (100.00%)


In [13]:
# Patients with at least one entry labelled as a panic event.
has_panic_label = metadata_filled['panic_label'] == 1
panic_patients = metadata_filled.loc[has_panic_label, 'ID'].unique()
print(f"Total number of patients with panic events: {len(panic_patients)}")

Total number of patients with panic events: 105
