# Panic Project (DHLAB) - Data Imputation

author:  `@cyshin971`  

date:    `2025-07-02`  

version: `2.3`
 
> version `2.0`: imputation separated from `data_analysis.ipynb` (version `1.3`)  
> version `2.1`: Questionnaire forward filling added (previously implemented in scraping stage) 

> version `2.2`: Replaced previous imputation methods with methods agreed on `20250625`  
>  - Questionnaire 'first come method' imputation entirely moved from scraping to this notebook (`data_imputation.ipynb`)
>  - Added "Growing Average Imputation" for `lifelog`, `mood` features
>  - Implemented zero fill imputation for `dailylog_life` features   

> version `2.3`: Added `use_growing_avg` (switches between growing-average and group-average imputation) and `null_default_zero` (switches between zero fill and average fill)

In [None]:
# Imputation version tag: stored in the features dict as 'imputation_version'
# and embedded in the names of the CSV files written by this notebook.
version = '2-3'

# 📚 | Import Libraries 

In [None]:
import config as cfg
import logging

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from library.pandas_utils import move_column, remove_columns, aggregate_by_column, create_empty_df, read_csv
from library.text_utils import save_as_csv
from library.json_utils import save_dict_to_file, load_dict_from_file
from library.path_utils import get_file_path

# ⚙️ | Settings

In [None]:
# Optional override for the scraped-data file name read from the features dict.
manual_scraped_data_filename = None # Keep as None if you don't want to manually specify a file

# Selects how the `lifelog` and `mood` features are imputed.
use_growing_avg = True
# True: will use a growing average for the patient, i.e. the average of all previous entries
# False: will use a fixed average for the patient, i.e. the average of all entries for that patient

# Selects the fallback used when a patient has no existing entries to average.
null_default_zero = True
# True: will replace null values with no existing entries for that patient with 0
# False: will replace null values with no existing entries for that patient with the patient average or global average

# NOTE: 20250625 consensus
# use_growing_avg = True
# null_default_zero = True

# 📁 | Path Variables 

In [None]:
DATA_PATH = "./_data"
TMP_PATH = "./cys/_tmp"
OUT_PATH = "./cys/_output"

# The features dict carries the file name and preprocessing version needed to
# locate the preprocessed CSV files; the notebook cannot continue without it.
try:
    features_dict = load_dict_from_file(OUT_PATH, 'panic_features_dict')
except FileNotFoundError as err:
    # Chain the original error so the underlying path problem stays visible.
    raise FileNotFoundError(f"File not found: {get_file_path(OUT_PATH, 'panic_features_dict')}. Please run data_analysis.ipynb first.") from err
print(f"Loaded features dict with {len(features_dict)} keys:")

# Reset both values so a stale value from an earlier kernel run is never reused.
scraped_data_filename = None
preproc_version = None
for k, v in features_dict.items():
    if k == 'scraped_data_filename':
        print(f"  {k}: {v}.csv")
        scraped_data_filename = v
    elif k == 'preproc_version':
        preproc_version = v
    else:
        print(f"  {k}: {v}")

if scraped_data_filename is None:
    raise ValueError("scraped_data_filename not found in features_dict")
# Fail here with a clear message instead of a NameError when the data is loaded.
if preproc_version is None:
    raise ValueError("preproc_version not found in features_dict")
if manual_scraped_data_filename is not None:
    logging.warning(f"Using manually specified scraped_data_filename: {manual_scraped_data_filename}. If this is not intended, please set it to None.")
    scraped_data_filename = manual_scraped_data_filename

# Record which imputation version was applied alongside the other metadata.
features_dict['imputation_version'] = version
save_dict_to_file(features_dict, OUT_PATH, 'panic_features_dict')

PREPROC_PATH = f"{OUT_PATH}/{scraped_data_filename}/preprocessed"
OUTPUT_PATH = f"{OUT_PATH}/{scraped_data_filename}/imputed"

# ⚒️ | Preprocessed Data

## Load Preprocessed Data

In [None]:
# Load the three preprocessed tables one after another and preview each.
pre_data_csv = get_file_path(PREPROC_PATH, f'panic_pre_data_{preproc_version}({scraped_data_filename}).csv')
pre_data = read_csv(pre_data_csv)
display(pre_data.head(5))

metadata_csv = get_file_path(PREPROC_PATH, f'panic_metadata_{preproc_version}({scraped_data_filename}).csv')
metadata = read_csv(metadata_csv)
display(metadata.head(5))

demography_csv = get_file_path(PREPROC_PATH, f'panic_demography_data_{preproc_version}({scraped_data_filename}).csv')
demography_data = read_csv(demography_csv)
display(demography_data.head(5))

# Data Imputation

## Questionnaire

- Null values are forward filled within each `ID`, starting from the first value in the scraped data
- Entries with null values prior to the first value are backward filled from that first value
- Subsequent entries are forward filled
- Any other null values (i.e. an `ID` with no value at all) are set to 0

In [None]:
pre_data_filled_questionnaire = create_empty_df()
pre_data_filled_questionnaire = pre_data.copy()

# Forward/backward filling is order dependent: rows must be chronological
# within each ID before the fill is applied.
pre_data_filled_questionnaire.sort_values(by=['ID', 'date'], inplace=True)

for qcol in features_dict['questionnaire']:
    # transform() returns a result aligned to the frame's own index, so the
    # assignment does not depend on the order or length of the grouped output
    # (a positional `.values` assignment breaks when a row has a missing ID).
    pre_data_filled_questionnaire[qcol] = (
        pre_data_filled_questionnaire.groupby('ID')[qcol]
            .transform(lambda s: s.ffill().bfill())
            # IDs without a single recorded value are still null here.
            .fillna(0)
    )

# Row order is unchanged by the column assignments above, so no re-sort is needed.
disp_df = pre_data_filled_questionnaire[pre_data_filled_questionnaire['ID'] == 'SYM1-1-380'].copy()
display(disp_df[features_dict['id']+features_dict['questionnaire']].head(5))

## Growing Average Imputation

- Initial fill: NaNs that occur before an ID's first valid value get the pre-fill value (0 when `null_default_zero` is `True`).
- By ID: For each ID, process their entries in chronological order (date).
- Running average: Each time a valid value is encountered, add it to the running sum/count.
- Every NaN (detected with the original NaN mask) is filled with the running average of the valid values seen so far.
- No values for that ID: Entries remain at the pre-fill value (since the running count stays at 0).
- After each valid value: The running average is updated, so later NaNs reflect every value seen up to that point.
- No backward fill: Only forward imputation based on past data.

In [None]:
def growing_average_impute(
    df,
    group_col,
    target_cols,
    default_fill_zero=True,
    prefill_value=None,
    date_col='date'
):
    """
    Impute NaN values in target columns by growing average within each group.

    Rows are processed per group in ascending ``date_col`` order. A NaN is
    replaced by the mean of the valid values seen earlier in the same group;
    imputed values never feed back into the average. NaNs that come before
    the group's first valid value receive a pre-fill value instead.

    Parameters:
        df (pd.DataFrame): DataFrame to impute. It is not modified.
        group_col (str): Grouping column (e.g., patient ID).
        target_cols (list): Columns to impute.
        default_fill_zero (bool): If True, use 0.0 as prefill before first value
            (``prefill_value`` is then ignored); else use prefill_value or the
            group mean.
        prefill_value (float, optional): Custom value to use as prefill (takes
            precedence over group mean if provided).
        date_col (str): Column that defines the chronological order inside a
            group. Defaults to 'date'.
    Returns:
        pd.DataFrame: Imputed copy, sorted by ``group_col`` then ``date_col``;
        the original index labels are kept.

    Notes:
        Rows whose group key is missing are skipped by ``groupby``. They are
        only filled (with the global column mean) when ``default_fill_zero``
        is False; otherwise they keep their NaNs.
    """
    target_cols = list(target_cols)
    # sort_values returns a new object, so the caller's frame is never mutated.
    df = df.sort_values(by=[group_col, date_col])

    # All statistics are taken from the observed data, before anything is imputed.
    global_means = df[target_cols].mean()
    group_means = None
    if not default_fill_zero:
        group_means = df.groupby(group_col)[target_cols].mean()

    # Group once and visit every target column per group, instead of
    # re-grouping the whole frame for each column.
    for pid, sub in df.groupby(group_col):
        for col in target_cols:
            # Work on a private float copy: the group's underlying array may be
            # a view of `df` or read-only, so it must not be edited in place.
            vals = sub[col].to_numpy(dtype=float, copy=True, na_value=np.nan)
            missing = np.isnan(vals)
            if not missing.any():
                # Nothing to impute; leaving the column alone keeps its dtype.
                continue

            # Determine fill value for "pre-first" NaNs
            if default_fill_zero:
                fill_val = 0.0
            elif prefill_value is not None:
                fill_val = prefill_value
            else:
                group_mean = group_means.loc[pid, col] if pid in group_means.index else np.nan
                # Fall back from group mean to global mean to 0.0.
                if not np.isnan(group_mean):
                    fill_val = group_mean
                elif not np.isnan(global_means[col]):
                    fill_val = global_means[col]
                else:
                    fill_val = 0.0

            # Running sum/count of the valid values up to each row. Missing
            # rows contribute 0 to both, so at a missing row these are exactly
            # the totals of the valid values that precede it.
            seen_sum = np.cumsum(np.where(missing, 0.0, vals))
            seen_count = np.cumsum(~missing)
            # Guard the division; rows with no history are overridden below.
            running_avg = seen_sum / np.maximum(seen_count, 1)
            vals[missing] = np.where(
                seen_count[missing] > 0, running_avg[missing], fill_val
            )

            # Write back to DataFrame
            df.loc[sub.index, col] = vals

    # Rows that belonged to no group are still NaN; fill them with the global
    # mean. Assign the result back: a chained `df[col].fillna(..., inplace=True)`
    # is deprecated and has no effect under pandas Copy-on-Write.
    if not default_fill_zero:
        for col in target_cols:
            df[col] = df[col].fillna(global_means[col])

    return df

## Average Value Imputation

In [None]:
pre_data_filled_avg = create_empty_df()
pre_data_filled_avg = pre_data_filled_questionnaire.copy()

# Features imputed with a (growing or fixed) per-patient average.
avg_features = features_dict['lifelog'] + features_dict['mood']

if use_growing_avg:
    logging.info("Using growing average imputation.")
    pre_data_filled_avg = growing_average_impute(
        pre_data_filled_avg, 'ID', avg_features, default_fill_zero=null_default_zero
    )
else:
    logging.info("Using fixed average imputation.")
    for col in avg_features:
        # First pass: fill each patient's gaps with that patient's own mean.
        pre_data_filled_avg[col] = pre_data_filled_avg.groupby('ID')[col].transform(
            lambda x: x.fillna(x.mean())
        )
        # Second pass: patients without any value are still null. The column
        # mean is taken after the first pass.
        fallback_value = 0 if null_default_zero else pre_data_filled_avg[col].mean()
        # Assign the result back: a chained `df[col].fillna(..., inplace=True)`
        # is deprecated and has no effect under pandas Copy-on-Write.
        pre_data_filled_avg[col] = pre_data_filled_avg[col].fillna(fallback_value)

## Null Value 0

- Null values set to `0`

In [None]:
# Features whose missing values are replaced by 0.
zero_null_features = features_dict['dailylog_life']

# Placeholder initialisation (file convention), replaced by the working copy.
pre_data_filled_zero_null = create_empty_df()
pre_data_filled_zero_null = pre_data_filled_avg.copy()

# Zero-fill only the selected columns and write them back in place.
zero_filled_columns = pre_data_filled_zero_null.loc[:, zero_null_features].fillna(0)
pre_data_filled_zero_null.loc[:, zero_null_features] = zero_filled_columns

## 💾 | Save Filled Data

In [None]:
# Placeholder initialisation (file convention), replaced by the final imputed frame.
pre_data_filled = create_empty_df()
pre_data_filled = pre_data_filled_zero_null.copy()

# The output name carries the imputation version and the source file name.
filled_data_name = f"panic_pre_data_filled_{version}({scraped_data_filename})"
save_as_csv(pre_data_filled, OUTPUT_PATH, filled_data_name)

## Filled Metadata

In [None]:
# Placeholder initialisation (file convention), replaced by a copy of the metadata.
metadata_filled = create_empty_df()
metadata_filled = metadata.copy()

# Both tables must describe the same number of entries.
n_filled_rows = len(pre_data_filled)
n_metadata_rows = len(metadata)
if n_filled_rows != n_metadata_rows:
    raise ValueError(f"Length of pre_data_filled ({n_filled_rows}) does not match length of metadata ({n_metadata_rows}). Please check the data consistency.")

# Reset the availability columns before they are recomputed.
none_columns = ['dailylog_data', 'lifelog_data', 'questionnaire_data', 'dtype_n']
for col in none_columns:
    metadata_filled[col] = None

# Flag (1/0) whether an entry holds at least one non-null value per data type.
# NOTE(review): the Series assignment aligns on index labels - assumes the
# metadata rows share the row labels of pre_data_filled; confirm.
# NOTE(review): flags are computed on the imputed frame, so values filled in
# earlier cells count as present - confirm this is intended.
flag_feature_keys = (
    ('dailylog_data', 'dailylog'),
    ('lifelog_data', 'lifelog'),
    ('questionnaire_data', 'questionnaire'),
)
for flag_col, feature_key in flag_feature_keys:
    has_any_value = pre_data_filled[features_dict[feature_key]].notnull().any(axis=1)
    metadata_filled[flag_col] = has_any_value.astype(int)

# TODO: Diary data is not used in the current analysis, but can be useful for future reference
# Number of data types available for each entry.
dtype_total = metadata_filled['dailylog_data']
for flag_col in ('lifelog_data', 'questionnaire_data'):
    dtype_total = dtype_total + metadata_filled[flag_col]
metadata_filled['dtype_n'] = dtype_total

metadata_name = f"panic_metadata_filled_{version}({scraped_data_filename})"
save_as_csv(metadata_filled, OUTPUT_PATH, metadata_name)
display(metadata_filled.head(5))

# 📒 | Reports

In [None]:
# Headline counts for the imputed data set.
unique_filled_ids = pre_data_filled['ID'].unique()
n_unique_ids = len(unique_filled_ids)
print(f"Total number of unique IDs in pre_data_filled: {n_unique_ids}")
n_filled_entries = len(pre_data_filled)
print(f"Total number of entries in pre_data_filled: {n_filled_entries}")

# Entries labelled as a panic event.
is_panic_entry = pre_data_filled['panic_label'] == 1
panic_entries = pre_data_filled[is_panic_entry]
n_panic_entries = len(panic_entries)
print(f"Total number of panic entries in pre_data_filled: {n_panic_entries}")

In [None]:
# Share of entries flagged as having each data type.
filled_entry_ids = pre_data_filled['entry_id'].unique()
n_entry_ids = len(filled_entry_ids)

# (label used in the report line, flag column in metadata_filled)
coverage_rows = (
    ("daily log", 'dailylog_data'),
    ("life log", 'lifelog_data'),
    ("questionnaire", 'questionnaire_data'),
)
for report_label, flag_col in coverage_rows:
    n_flagged = metadata_filled[metadata_filled[flag_col] == 1].shape[0]
    print(f"Total number of {report_label} entries: {n_flagged} / {n_entry_ids} ({n_flagged / n_entry_ids * 100:.2f}%)")
# print(f"Total number of panic diary entries: {metadata_filled[metadata_filled['diary_data'] == 1].shape[0]} / {len(filled_entry_ids)} ({metadata_filled[metadata_filled['diary_data'] == 1].shape[0] / len(filled_entry_ids) * 100:.2f}%)")

In [None]:
# Distinct patients with at least one entry labelled as a panic event.
has_panic_label = metadata_filled['panic_label'] == 1
panic_patients = metadata_filled[has_panic_label]['ID'].unique()
n_panic_patients = len(panic_patients)
print(f"Total number of patients with panic events: {n_panic_patients}")