# Panic Project (DHLAB) - Data Preprocessing

author:  `@cyshin971`  

date:    `2025-06-20`  

version: `2.0`

> version `1.0`: Derived from `data_analysis.ipynb` version `1.0`  
> version `2.0`: Updated to consensus on progress meeting `20250619`

In [1]:
# Preprocessing version tag; stored in features_dict["preproc_version"] and embedded in the output file names.
version = "2-0"

# 📚 | Import Libraries 

In [2]:
import config as cfg
import logging

import pandas as pd
import numpy as np
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from library.pandas_utils import move_column, remove_columns, aggregate_by_column, create_empty_df, read_csv
from library.text_utils import save_as_csv
from library.json_utils import save_dict_to_file
from library.path_utils import get_file_path

# ⚙️ | Settings

In [3]:
# Base name of the scraped data file; ".csv" is appended when the file is loaded, and the
# same name is reused for the output sub-directory and the output file names.
scraped_data_filename = "final_result_20250620_360" # Name of the scraped data file without extension (.csv)

# NOTE(review): the flag name is misspelled ("unnaccounted"); it is left unchanged here because
# cells outside this section may reference it.
save_unnaccounted_data = False  # Set to True (Default: False) if you want to save the unaccounted data to TMP_PATH

# 📁 | Path Variables 

In [4]:
# Directory that holds the scraped input CSV.
DATA_PATH = "./_data"
# Scratch directory for optional debugging dumps (duplicates, missing IDs, metadata checks).
TMP_PATH = "./cys/_tmp"
# Root directory for generated outputs (plain string: no placeholders, so no f-prefix needed).
OUT_PATH = "./cys/_output"
# Outputs of this notebook are grouped per scraped data file.
OUTPUT_PATH = f"{OUT_PATH}/{scraped_data_filename}/preprocessed"

# ⛏️ | Scraped Data

load preprocessed data (by `junyeol_lee`)
- Each entry holds the datapoints for a patient (`ID`) on a specific date (`date`)
- If there were multiple datapoints for a specific date (`date`) for a specific patient (`ID`), the values were statistically processed (`sum`, `avg`, etc.) to a representation for the day
- Questionnaire data was treated as a 'semi-trait' variable
  - The first entry to a questionnaire was forward filled until a second entry to the questionnaire.
  - All subsequent entries to the questionnaire were forward filled
- Diary contents were added (20250613)
  - `mood`, `contents`
- Certain columns were added back (20250613)
  - demography: `suicide_need` (`boolean`)
  - dailylog:
    - `steps_maximum`
    - `steps_mean`
    - `step_hvar_mean`
    - `step_delta`
    - `step_max_delta`
    - `step_mean_delta`
    - `step_hvar_mean_delta`
    - `step_delta2`
    - `step_max_delta2`
    - `step_mean_delta2`
    - `step_hvar_mean_delta2`
    - `steps_variance`

## Scraped Data Features

In [5]:
# Column groups of the scraped data, keyed by data source / role.
# Later cells extend the "id", "label" and "metadata" lists in place.
features_dict = {
    "scraped_data_filename": scraped_data_filename,
    "preproc_version": version,
    "demography": [
        'gender', 'age', 'marriage', 'job', 'smkHx', 'drinkHx', 'suicideHx', 'suicide_need',
    ],
    "dailylog": [
        'panic', 'severity',  # NOTE: Caution when constructing dataset as these values are typically labels
        'exercise', 'alcohol', 'coffee', 'menstruation',
        'smoking', 'positive_feeling', 'negative_feeling', 'positive_E', 'negative_E',
        'anxiety', 'annoying',
    ],
    "lifelog": [
        'HR_var', 'HR_max', 'HR_mean', 'HR_hvar_mean', 'HR_acrophase', 'HR_amplitude', 'HR_mesor',
        'HR_acrophase_difference', 'HR_acrophase_difference_2d', 'HR_amplitude_difference',
        'HR_amplitude_difference_2d', 'HR_mesor_difference', 'HR_mesor_difference_2d',
        'bandpower(0.001-0.0005Hz)', 'bandpower(0.0005-0.0001Hz)', 'bandpower(0.0001-0.00005Hz)', 'bandpower(0.00005-0.00001Hz)',
        'steps', 'SLT1', 'SLT2', 'SLT3', 'SLT4', 'SLT5', 'SLT6', 'total_sleep',
        'steps_maximum', 'steps_mean', 'step_hvar_mean', 'step_delta',
        'step_max_delta', 'step_mean_delta', 'step_hvar_mean_delta',
        'step_delta2', 'step_max_delta2', 'step_mean_delta2', 'step_hvar_mean_delta2', 'steps_variance',
    ],
    "questionnaire": [
        'PHQ_9', 'STAI_X2', 'CSM', 'CTQ_1', 'CTQ_2', 'CTQ_3', 'CTQ_4', 'CTQ_5', 'KRQ', 'MDQ',
        'ACQ', 'APPQ_1', 'APPQ_2', 'APPQ_3', 'BSQ', 'GAD_7', 'BRIAN',
    ],
    # "diary": [
    #     'mood', 'contents'
    # ],
    "excluded": [  # Dropped as variables were only in SYM dataset
        'SPAQ_1', 'SPAQ_2', 'BFNE', 'CES_D', 'KOSSSF', 'SADS', 'STAI_X1', 'medication_in_month',
        'Unnamed: 0',  # Placeholder column
    ],
    "id": [
        'ID', 'date',
    ],
    "label": [
        'panic', 'severity',
    ],
    "metadata": [],
}

# Shorthand references to the per-group column lists (these are the same list objects
# as in features_dict, not copies).
demo_vars, dailylog_vars, lifelog_vars, questionnaire_vars = (
    features_dict[group_key]
    for group_key in ("demography", "dailylog", "lifelog", "questionnaire")
)

# NOTE(review): the notebook text calls dailylog/lifelog/questionnaire the 3 `state` groups,
# while here `state_vars` holds the demographic columns and `trait_vars` the other three.
# The names look swapped; they are left unchanged because other cells may rely on them.
state_vars = demo_vars
trait_vars = dailylog_vars + lifelog_vars + questionnaire_vars
all_vars = state_vars + trait_vars
all_cols = [*features_dict["id"], *all_vars]  # + features_dict["diary"]

print(f'Number of variables: {len(all_vars)}')
for group_label, group_vars in (
    ('Demographic', state_vars),
    ('Daily log', dailylog_vars),
    ('Life log', lifelog_vars),
    ('Questionnaire', questionnaire_vars),
):
    print(f'   {group_label} variables: {len(group_vars)}')

Number of variables: 75
   Demographic variables: 8
   Daily log variables: 13
   Life log variables: 37
   Questionnaire variables: 17


## Load Scraped Data

In [6]:
# Load the scraped CSV; ".csv" is appended to the configured base name.
scraped_data = read_csv(get_file_path(DATA_PATH, scraped_data_filename + '.csv'))

# Check that every expected column is present. The identifier columns ('ID', 'date') are
# part of the check so that a missing one is reported before 'date' is parsed below.
present_cols = set(scraped_data.columns)
missing_cols = [col for col in all_cols if col not in present_cols]
if missing_cols:
    logging.warning(f"Missing columns in scraped_data: {missing_cols}")
else:
    logging.info("All expected columns are present in scraped_data.")

# Report columns that are neither expected nor explicitly excluded.
# The set of known columns is built once instead of once per column.
known_cols = set(all_cols) | set(features_dict["excluded"])
extra_cols = [col for col in scraped_data.columns if col not in known_cols]
if extra_cols:
    logging.warning(f"Extra columns in scraped_data: {extra_cols}")

# convert date column to datetime format
scraped_data['date'] = pd.to_datetime(scraped_data['date'], format='%Y-%m-%d')
remove_columns(scraped_data, ['Unnamed: 0'])

# remove any of the columns in features_dict["excluded"] if they exist (single drop, no inplace)
excluded_present = [col for col in features_dict["excluded"] if col in scraped_data.columns]
for col in excluded_present:
    logging.info(f"Removing excluded column: {col}")
scraped_data = scraped_data.drop(columns=excluded_present)

print(f"Number of rows: {scraped_data.shape[0]}")
print(f"Number of columns: {scraped_data.shape[1]}")
display(scraped_data.head(5))

INFO - (2246184437.py) <module>: All expected columns are present in scraped_data.


Number of rows: 29002
Number of columns: 77


Unnamed: 0,ID,date,panic,gender,PHQ_9,STAI_X2,CSM,CTQ_1,CTQ_2,CTQ_3,...,HR_mean,HR_hvar_mean,SLT1,SLT2,SLT3,SLT4,SLT5,SLT6,total_sleep,severity
0,PXPN_10006,2024-11-04,0.0,0,0.0,32.0,31.0,11.0,13.0,17.0,...,74.33,123.22,,,,,,,,
1,PXPN_10006,2024-11-05,0.0,0,0.0,32.0,31.0,11.0,13.0,17.0,...,54.81,29.4,0.0,4.47,3.62,4.67,0.65,1.85,15.26,
2,PXPN_10006,2024-11-06,1.0,0,0.0,32.0,31.0,11.0,13.0,17.0,...,62.71,50.58,0.0,0.0,0.2,4.07,1.43,1.68,7.38,
3,PXPN_10006,2024-11-07,2.0,0,0.0,32.0,31.0,11.0,13.0,17.0,...,79.18,72.7,0.0,0.0,0.14,5.08,0.0,0.97,6.19,1.0
4,PXPN_10006,2024-11-08,0.0,0,0.0,32.0,31.0,11.0,13.0,17.0,...,87.58,228.52,,,,,,,,


# ⚒️ | Data Preprocessing

Changes from scraped data:
- add `entry_id` to identify each entry: `'ID'_'date'`
- add `dataset` to identify source: `SYM1`, `SYM2`, `PXPN`
- convert `panic` to days before panic (`dbp`): `panic` = `2` (panic day) → `dbp` = `0`; `panic` = `1` → `dbp` = `1`; `panic` = `0` → `dbp` = `NaN`
- keep `panic` column instead of removing it (`20250617`)
- add `panic_label` : whether a panic occurred in the entry (`boolean`)
- demographic features were removed from preprocessed data (`data_pre`) and extracted
- diary features were removed from preprocessed data (`data_pre`) and extracted
- the data was filtered to remove entries with only demographic data (no `dailylog`, `lifelog`, `questionnaire`, or `diary` entries)

## Initialize Preprocessed Data

- add `entry_id` to identify each entry: `'ID'_'date'`
- add `dataset` to identify source: `SYM1`, `SYM2`, `PXPN`
- convert `panic` to days before panic (`dbp`): `panic` = `2` (panic day) → `dbp` = `0`; `panic` = `1` → `dbp` = `1`; `panic` = `0` → `dbp` = `NaN`
- add `panic_label` (boolean)
- keep `panic` column instead of removing it (`20250617`)
> If using `panic` column as a label this must be removed as a feature from final dataset

In [7]:
data_pre_init = create_empty_df()
data_pre_init = scraped_data.copy()

# Add 'entry_id' column: unique identifier for each row ('ID'_'date')
data_pre_init['entry_id'] = data_pre_init['ID'] + '_' + data_pre_init['date'].astype(str)
instance_id_unique = data_pre_init['entry_id'].unique()
move_column(data_pre_init, 'entry_id', 0)
print("Number of unique entry IDs:", len(instance_id_unique))
# Check if 'entry_id' is unique; dump the offending rows and stop if it is not
if data_pre_init['entry_id'].duplicated().any():
    duplicates = data_pre_init[data_pre_init['entry_id'].duplicated(keep=False)]
    display(duplicates.head(5))
    save_as_csv(duplicates, TMP_PATH, f"duplicates_{scraped_data_filename}")
    raise ValueError("Duplicate 'entry_id' found in the data. Please resolve this issue before proceeding.")

# Add 'dataset' column: source of data, i.e. the ID prefix before the first '_' and then before the first '-'
data_pre_init['dataset'] = data_pre_init['ID'].str.split('_').str[0]
data_pre_init['dataset'] = data_pre_init['dataset'].str.split('-').str[0]
move_column(data_pre_init, 'dataset', 1)

# Convert 'panic' column to Days Before Panic (dbp), vectorised:
#   panic == 2 -> 0, panic == 0 -> NaN, anything else (including NaN) is kept unchanged.
panic = data_pre_init['panic']
data_pre_init['dbp'] = panic.where(panic != 2, 0).mask(panic == 0)

# Add panic_label column: 1 when a panic occurred in the entry (panic == 2), else 0
data_pre_init['panic_label'] = panic.eq(2).astype('int64')

# Update the features_dict (each guard keeps the cell safe to re-run)
if 'entry_id' not in features_dict['id']:
    features_dict['id'].insert(0, 'entry_id')
if 'dataset' not in features_dict['id']:
    features_dict['id'].append('dataset')
# 'dbp' is a label: the guard must look at the list it inserts into, otherwise
# re-running this cell adds 'dbp' a second time.
if 'dbp' not in features_dict['label']:
    features_dict['label'].insert(0, 'dbp')
if 'panic_label' not in features_dict['label']:
    features_dict['label'].append('panic_label')
# Remove 'panic' from dailylog features (as it is a label) #NOTE: Need to remove as panic null values were filled with 0 in scraped_data
if 'panic' in features_dict['dailylog']:
    features_dict['dailylog'].remove('panic')

# print scraped_data shape
print(f"Scraped data shape: {scraped_data.shape}")
print(f"Initialized preprocessed data shape: {data_pre_init.shape}")

Number of unique entry IDs: 29002
Scraped data shape: (29002, 77)
Initialized preprocessed data shape: (29002, 81)


In [8]:
display(data_pre_init.head(5))

# Row masks per source dataset, reused for both the entry counts and the patient counts.
is_sym1 = data_pre_init['dataset'] == 'SYM1'
is_sym2 = data_pre_init['dataset'] == 'SYM2'
is_pxpn = data_pre_init['dataset'] == 'PXPN'

sym1_n = int(is_sym1.sum())
sym2_n = int(is_sym2.sum())
# Unique patient IDs per SYM dataset
sym1_ids = data_pre_init.loc[is_sym1, 'ID'].unique()
sym2_ids = data_pre_init.loc[is_sym2, 'ID'].unique()

print("Unique sources in metadata_ljy: ", data_pre_init['dataset'].unique())
print("Number of entries in metadata_ljy:", len(data_pre_init))
print("    SYM entries:", sym1_n + sym2_n)
print("    PXPN entries:", int(is_pxpn.sum()))
print("Number of unique IDs in metadata_ljy:", len(data_pre_init['ID'].unique()))
print("    SYM IDs: ", len(sym1_ids) + len(sym2_ids))
print("    PXPN IDs: ", len(data_pre_init.loc[is_pxpn, 'ID'].unique()))
print("Number of panic events (dbp=0):", int((data_pre_init['dbp'] == 0).sum()))

Unnamed: 0,entry_id,dataset,ID,date,panic,gender,PHQ_9,STAI_X2,CSM,CTQ_1,...,SLT1,SLT2,SLT3,SLT4,SLT5,SLT6,total_sleep,severity,dbp,panic_label
0,PXPN_10006_2024-11-04,PXPN,PXPN_10006,2024-11-04,0.0,0,0.0,32.0,31.0,11.0,...,,,,,,,,,,0
1,PXPN_10006_2024-11-05,PXPN,PXPN_10006,2024-11-05,0.0,0,0.0,32.0,31.0,11.0,...,0.0,4.47,3.62,4.67,0.65,1.85,15.26,,,0
2,PXPN_10006_2024-11-06,PXPN,PXPN_10006,2024-11-06,1.0,0,0.0,32.0,31.0,11.0,...,0.0,0.0,0.2,4.07,1.43,1.68,7.38,,1.0,0
3,PXPN_10006_2024-11-07,PXPN,PXPN_10006,2024-11-07,2.0,0,0.0,32.0,31.0,11.0,...,0.0,0.0,0.14,5.08,0.0,0.97,6.19,1.0,0.0,1
4,PXPN_10006_2024-11-08,PXPN,PXPN_10006,2024-11-08,0.0,0,0.0,32.0,31.0,11.0,...,,,,,,,,,,0


Unique sources in metadata_ljy:  ['PXPN' 'SYM1' 'SYM2']
Number of entries in metadata_ljy: 29002
    SYM entries: 28163
    PXPN entries: 839
Number of unique IDs in metadata_ljy: 429
    SYM IDs:  400
    PXPN IDs:  29
Number of panic events (dbp=0): 811


## Initialize Metadata

initialize `metadata` by adding
- `demography_data` : whether demography data exists in the entry (`boolean`)
- `dailylog_data`, `lifelog_data`, `questionnaire_data` : whether each data group exists in the entry (`boolean`)
- `dtype_n` : how many of the 3 `state` groups exist in the entry (`int`)
- `diary_data`: whether panic diary data group exists in the entry (`boolean`)

In [9]:
metadata_init = create_empty_df()
metadata_init = data_pre_init.copy()

# One 0/1 flag per data group: 1 when at least one column of the group is non-null in the entry.
# The 'diary' group is left out (its flag is part of the TODO below).
for data_group in ('demography', 'dailylog', 'lifelog', 'questionnaire'):
    group_values = metadata_init[features_dict[data_group]]
    metadata_init[f'{data_group}_data'] = group_values.notna().any(axis=1).astype(int)

# TODO: Diary data is not used in the current analysis, but can be useful for future reference
# dtype_n counts how many of the dailylog / lifelog / questionnaire groups are present.
metadata_init['dtype_n'] = (
    metadata_init['dailylog_data']
    + metadata_init['lifelog_data']
    + metadata_init['questionnaire_data']
)  # TODO: + metadata_init['diary_data']
move_column(metadata_init, 'dtype_n', 8)

# Register the new metadata columns; the membership guard avoids duplicates on re-run.
# 'demography_data' is intentionally not registered.
for meta_col in ('dailylog_data', 'lifelog_data', 'questionnaire_data', 'dtype_n'):  # , 'diary_data'
    if meta_col not in features_dict['metadata']:
        features_dict['metadata'].append(meta_col)

# Optional manual inspection: dump the entries whose flag for one group equals a given value.
check_metadata = False
if check_metadata:
    group_to_check = 'questionnaire'  # demography, dailylog, lifelog, questionnaire
    flag_value = 0
    subset = metadata_init[metadata_init[group_to_check + '_data'] == flag_value].copy()
    subset = subset[features_dict['id'] + features_dict['metadata'] + features_dict[group_to_check]]
    print(f"--------- TEST {subset.shape[0]} ENTRIES WITH {group_to_check} = {flag_value} ---------")
    display(subset.head(10))
    save_as_csv(subset, TMP_PATH, f"metadata_{group_to_check}_{flag_value}")
    print("------------------------------------------------------------------------")
    del subset, group_to_check, flag_value

display(metadata_init.head(5))

Unnamed: 0,entry_id,dataset,ID,date,panic,gender,PHQ_9,STAI_X2,dtype_n,CSM,...,SLT5,SLT6,total_sleep,severity,dbp,panic_label,demography_data,dailylog_data,lifelog_data,questionnaire_data
0,PXPN_10006_2024-11-04,PXPN,PXPN_10006,2024-11-04,0.0,0,0.0,32.0,3,31.0,...,,,,,,0,1,1,1,1
1,PXPN_10006_2024-11-05,PXPN,PXPN_10006,2024-11-05,0.0,0,0.0,32.0,3,31.0,...,0.65,1.85,15.26,,,0,1,1,1,1
2,PXPN_10006_2024-11-06,PXPN,PXPN_10006,2024-11-06,1.0,0,0.0,32.0,3,31.0,...,1.43,1.68,7.38,,1.0,0,1,1,1,1
3,PXPN_10006_2024-11-07,PXPN,PXPN_10006,2024-11-07,2.0,0,0.0,32.0,3,31.0,...,0.0,0.97,6.19,1.0,0.0,1,1,1,1,1
4,PXPN_10006_2024-11-08,PXPN,PXPN_10006,2024-11-08,0.0,0,0.0,32.0,2,31.0,...,,,,,,0,1,0,1,1


## Extract Demography Data

- All patients within the scraped data were confirmed to have demographic data (`demography_data` = `True`)
- as such demography_data will not be included in the `metadata`
- demographic features were removed from preprocessed data (`data_pre`)
- Demography data was extracted and saved as `demography.csv` to the `output` directory

In [10]:
# Build the aggregation spec from the demography column list so that the aggregation and the
# uniqueness check below always cover the same columns.
# Each tuple is (output column, source column, aggregation): first the per-ID number of
# distinct values ('<col>_n'), then the first value per ID.
demo_cols = features_dict['demography']
agg_matrix = (
    [(f'{col}_n', col, 'nunique') for col in demo_cols]
    + [(col, col, 'first') for col in demo_cols]
)
demo_data = create_empty_df()
demo_data = aggregate_by_column(metadata_init, 'ID', agg_matrix)

# A demographic column must not take more than one distinct value within an ID
non_unique_cols = [col for col in demo_cols if (demo_data[f'{col}_n'] > 1).any()]
if non_unique_cols:
    raise ValueError(f"Demographic columns {non_unique_cols} are not unique for each ID in demo_data.")
else:
    print("All demographic columns are unique for each ID in demo_data.")

# The '<col>_n' helper columns were only needed for the check above
remove_columns(demo_data, [f'{col}_n' for col in demo_cols])
print(f"Number of rows in demo_data: {demo_data.shape[0]}")
display(demo_data.head(5))

save_as_csv(demo_data, OUTPUT_PATH, f"panic_demography_data_{version}({scraped_data_filename})")
# Remove demographic features from data_pre_init (they now live in demo_data)
remove_columns(data_pre_init, features_dict['demography'])

All demographic columns are unique for each ID in demo_data.
Number of rows in demo_data: 429


Unnamed: 0,ID,gender,age,marriage,job,smkHx,drinkHx,suicideHx,suicide_need
0,PXPN_10006,0,32.0,0.0,1.0,1.0,1.0,0.0,0.0
1,PXPN_10007,1,38.0,1.0,1.0,0.0,0.0,0.0,0.0
2,PXPN_10008,0,38.0,1.0,0.0,0.0,1.0,0.0,0.0
3,PXPN_10009,1,28.0,0.0,0.0,1.0,0.0,1.0,0.0
4,PXPN_10010,1,21.0,0.0,0.0,1.0,1.0,0.0,0.0


DEBUG - (path_utils.py) make_dir: Created directory: C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output\final_result_20250620_360\preprocessed
DEBUG - (text_utils.py) save_as_csv: Saved panic_demography_data_2-0(final_result_20250620_360).csv to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output\final_result_20250620_360\preprocessed


## Extract Panic Diary Data

In [11]:
# NOTE: This whole cell is intentionally disabled. It depends on features_dict['diary'] and on
# the 'diary_data' metadata flag, both of which are commented out earlier in this notebook;
# re-enable them together with this cell.
# if all(col in data_pre_init.columns for col in features_dict['diary']):
# 	panic_diary_data = create_empty_df()
# 	panic_diary_data = data_pre_init[features_dict['id'] + features_dict['diary']].copy()

# 	panic_diary_entries = metadata_init[metadata_init['diary_data'] == 1]['entry_id'].unique()
# 	# Filter panic_diary_data to only include entries with diary data
# 	panic_diary_data = panic_diary_data[panic_diary_data['entry_id'].isin(panic_diary_entries)]

# 	print(f"Number of rows in panic_diary_data: {panic_diary_data.shape[0]}")
# 	print(f"Number of unique patients in panic_diary_data: {panic_diary_data['ID'].nunique()}")
# 	print(f"Unique datasets in panic_diary_data: {panic_diary_data['dataset'].unique()}")
# 	display(panic_diary_data.head(5))

# 	save_as_csv(panic_diary_data, OUTPUT_PATH, f"panic_diary_data_{version}({scraped_data_filename})")
# 	remove_columns(data_pre_init, features_dict['diary'])  # Remove diary columns from data_pre_init
# else:
# 	print("No diary data found in the scraped data. Skipping panic_diary_data creation.")

## Construct Intermediate Metadata
- the current `metadata` (`metadata_init`) was filtered to include only columns for identification, added columns for metadata, and labels
- the `metadata` was also filtered to get rid of all entries that only have demography data (`dtype_n` = 0)

In [12]:
metadata_int = create_empty_df()
metadata_int = metadata_init.copy()

# Keep only identification, metadata and label columns, with 'severity' and 'panic_label' last.
kept_cols = features_dict['id'] + features_dict['metadata'] + features_dict['label']
metadata_int = metadata_int[kept_cols]
for tail_col in ('severity', 'panic_label'):
    move_column(metadata_int, tail_col, -1)

# Drop entries without any dailylog/lifelog/questionnaire data (dtype_n == 0) and entries without a date.
has_group_data = metadata_int['dtype_n'] > 0
has_entry_date = metadata_int['date'].notna()
metadata_int = metadata_int[has_group_data & has_entry_date]
display(metadata_int.head(5))

Unnamed: 0,entry_id,ID,date,dataset,dailylog_data,lifelog_data,questionnaire_data,dtype_n,dbp,panic,severity,panic_label
0,PXPN_10006_2024-11-04,PXPN_10006,2024-11-04,PXPN,1,1,1,3,,0.0,,0
1,PXPN_10006_2024-11-05,PXPN_10006,2024-11-05,PXPN,1,1,1,3,,0.0,,0
2,PXPN_10006_2024-11-06,PXPN_10006,2024-11-06,PXPN,1,1,1,3,1.0,1.0,,0
3,PXPN_10006_2024-11-07,PXPN_10006,2024-11-07,PXPN,1,1,1,3,0.0,2.0,1.0,1
4,PXPN_10006_2024-11-08,PXPN_10006,2024-11-08,PXPN,0,1,1,2,,0.0,,0


## Filter Preprocessed Data

- the data was filtered to remove entries with only demographic data
- the removed IDs were checked to confirm that no relevant entries were discarded

In [13]:
data_pre = create_empty_df()
data_pre = data_pre_init.copy()

# Keep only the rows whose entry ID survived the metadata filter and that have a date.
metadata_int_unique_ids = metadata_int['entry_id'].unique()
in_metadata = data_pre['entry_id'].isin(metadata_int_unique_ids)
date_present = data_pre['date'].notna()
data_pre = data_pre[in_metadata & date_present]

# Move label columns to the end (in this order)
for label_col in ('dbp', 'panic', 'severity', 'panic_label'):
    move_column(data_pre, label_col, -1)

display(data_pre.head(5))

# Optional check: patients that are present before filtering but have no rows left afterwards.
check_missing_ids = False
if check_missing_ids:
    ids_before = data_pre_init['ID'].unique()
    ids_after = data_pre['ID'].unique()
    missing_ids = np.setdiff1d(ids_before, ids_after)
    missing_data = data_pre_init[data_pre_init['ID'].isin(missing_ids)]
    print(f"Number of IDs lost after filtering: {len(missing_ids)}")
    _ = save_as_csv(missing_data, TMP_PATH, f"missing_{scraped_data_filename}")

Unnamed: 0,entry_id,dataset,ID,date,PHQ_9,STAI_X2,CSM,CTQ_1,CTQ_2,CTQ_3,...,SLT2,SLT3,SLT4,SLT5,SLT6,total_sleep,dbp,panic,severity,panic_label
0,PXPN_10006_2024-11-04,PXPN,PXPN_10006,2024-11-04,0.0,32.0,31.0,11.0,13.0,17.0,...,,,,,,,,0.0,,0
1,PXPN_10006_2024-11-05,PXPN,PXPN_10006,2024-11-05,0.0,32.0,31.0,11.0,13.0,17.0,...,4.47,3.62,4.67,0.65,1.85,15.26,,0.0,,0
2,PXPN_10006_2024-11-06,PXPN,PXPN_10006,2024-11-06,0.0,32.0,31.0,11.0,13.0,17.0,...,0.0,0.2,4.07,1.43,1.68,7.38,1.0,1.0,,0
3,PXPN_10006_2024-11-07,PXPN,PXPN_10006,2024-11-07,0.0,32.0,31.0,11.0,13.0,17.0,...,0.0,0.14,5.08,0.0,0.97,6.19,0.0,2.0,1.0,1
4,PXPN_10006_2024-11-08,PXPN,PXPN_10006,2024-11-08,0.0,32.0,31.0,11.0,13.0,17.0,...,,,,,,,,0.0,,0


## 💾 | Save Preprocessed Data

In [14]:
def print_dataset_summary(df, label):
    """Print entry, patient and panic-event counts of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'dataset', 'ID' and 'dbp'.
    label : str
        Word used in the printed headings (e.g. "original", "filtered").

    SYM counts are the sum of the 'SYM1' and 'SYM2' counts; panic events are
    the rows with ``dbp == 0``.
    """
    def n_entries(dataset_name):
        # number of rows belonging to one source dataset
        return int((df['dataset'] == dataset_name).sum())

    def n_ids(dataset_name):
        # number of distinct patient IDs within one source dataset
        return len(df.loc[df['dataset'] == dataset_name, 'ID'].unique())

    print(f"Total entries in {label}: ", df.shape[0])
    print("    SYM entries:", n_entries('SYM1') + n_entries('SYM2'))
    print("    PXPN entries:", n_entries('PXPN'))
    print(f"Number of unique IDs in {label}:", len(df['ID'].unique()))
    print("    SYM IDs: ", n_ids('SYM1') + n_ids('SYM2'))
    print("    PXPN IDs: ", n_ids('PXPN'))
    print("Number of panic events (dbp=0):", int((df['dbp'] == 0).sum()))


# save data_pre to CSV
save_as_csv(data_pre, OUTPUT_PATH, f"panic_pre_data_{version}({scraped_data_filename})")

display(data_pre.head(3))
# Compare the data before and after filtering using the same summary for both frames.
print("--------------------------------------------------------")
print_dataset_summary(data_pre_init, "original")
print("--------------------------------------------------------")
print_dataset_summary(data_pre, "filtered")

DEBUG - (text_utils.py) save_as_csv: Saved panic_pre_data_2-0(final_result_20250620_360).csv to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output\final_result_20250620_360\preprocessed


Unnamed: 0,entry_id,dataset,ID,date,PHQ_9,STAI_X2,CSM,CTQ_1,CTQ_2,CTQ_3,...,SLT2,SLT3,SLT4,SLT5,SLT6,total_sleep,dbp,panic,severity,panic_label
0,PXPN_10006_2024-11-04,PXPN,PXPN_10006,2024-11-04,0.0,32.0,31.0,11.0,13.0,17.0,...,,,,,,,,0.0,,0
1,PXPN_10006_2024-11-05,PXPN,PXPN_10006,2024-11-05,0.0,32.0,31.0,11.0,13.0,17.0,...,4.47,3.62,4.67,0.65,1.85,15.26,,0.0,,0
2,PXPN_10006_2024-11-06,PXPN,PXPN_10006,2024-11-06,0.0,32.0,31.0,11.0,13.0,17.0,...,0.0,0.2,4.07,1.43,1.68,7.38,1.0,1.0,,0


--------------------------------------------------------
Total entries in original:  29002
    SYM entries: 28163
    PXPN entries: 839
Number of unique IDs in original: 429
    SYM IDs:  400
    PXPN IDs:  29
Number of panic events (dbp=0): 811
--------------------------------------------------------
Total entries in filtered:  23828
    SYM entries: 23014
    PXPN entries: 814
Number of unique IDs in filtered: 273
    SYM IDs:  244
    PXPN IDs:  29
Number of panic events (dbp=0): 811


# 📖 | Metadata

**Description**
- `entry_id`: ID for each entry `'ID'_'date'`
- `ID`: ID for each patient
- `date`: logging date of each entry
- `dataset`: source of entry (`SYM1`, `SYM2`, `PXPN`)
- `dailylog_data`: whether daily log data exists in the entry (`boolean`)
- `lifelog_data`: whether life log data exists in the entry (`boolean`)
- `questionnaire_data`: whether questionnaire data exists in the entry (`boolean`)
- `dtype_n`: how many of the 3 `state` groups exist in the entry (`int`)
- `diary_data`: whether panic diary data exists in the entry (`boolean`)
- `dbp`: number of consecutive days prior to panic. i.e. panic day = 0; 1 day prior = 1; etc. (up to 3)
- `n_prior_data`: number of existing consecutive prior (days) entries
- `ref_event_id`: the `entry_id` to which days before panic (`dbp`) is referencing
- `valid_entry_3`: whether the entry has 3 consecutive days of prior data (`n_prior_data`)
- `valid_entry_2`: whether the entry has 2 consecutive days of prior data (`n_prior_data`)
- `valid_entry_1`: whether the entry has 1 day of consecutive prior data (`n_prior_data`)
- `panic_label`: whether a panic occurred in the entry (`boolean`)
- `severity`: severity of the panic (1 ~ 5)

## Calculate Days Before Panic (``dbp``) and Prior Consecutive Days (``n_prior_data``)

- calculate the consecutive 'days before panic' (`dbp`):
  - day when panic occurred -> `dbp` = 0
  - 1 day before panic -> `dbp` = 1
  - 2 day before panic -> `dbp` = 2
  - 3 day before panic -> `dbp` = 3 (etc)
  - stop calculating at a set limit (`delta_days`) or if a panic occurred within the limit
- calculate the number of existing prior consecutive (days) entries (`n_prior_data`) (Default: 3)
  - stop calculating at a certain limit (`lookback_limit`) (Default: 7)

> May take ~ 1 to 2 min

In [15]:
from cys.utils import process_calculate_days_before_panic

# Work on an independent copy so the helper receives its own frame.
# (The former `create_empty_df()` placeholder was overwritten immediately and is dropped.)
metadata_calc = metadata_int.copy()

# Placeholder columns created before the helper is called.
metadata_calc['n_prior_data']    = None
metadata_calc['ref_event_id']    = None
move_column(metadata_calc, 'panic_label', -1)
move_column(metadata_calc, 'severity', -1)
# Newest entries first within each ID (descending ID, then descending date).
metadata_calc = metadata_calc.sort_values(by=['ID', 'date'], ascending=False)

d_days = 3      # passed as `delta_days`: limit for the days-before-panic (`dbp`) calculation
l_back_lim = 7  # passed as `lookback_limit`: limit for the `n_prior_data` calculation

metadata_int = process_calculate_days_before_panic(metadata_calc, delta_days=d_days, lookback_limit=l_back_lim)

# Register the new metadata columns in features_dict (idempotent on re-run).
for new_metadata_column in ('ref_event_id', 'n_prior_data'):
    if new_metadata_column not in features_dict['metadata']:
        features_dict['metadata'].append(new_metadata_column)

Processing: 100.00% complete

In [16]:
# Spot check: show the first five metadata rows of one patient.
p_id = 'SYM2-1-422'
is_selected_patient = metadata_int['ID'].eq(p_id)
disp_df = metadata_int.loc[is_selected_patient]
display(disp_df.head(5))
# Drop the temporaries so they do not linger in the notebook namespace.
del disp_df, p_id, is_selected_patient

Unnamed: 0,entry_id,ID,date,dataset,dailylog_data,lifelog_data,questionnaire_data,dtype_n,dbp,panic,n_prior_data,ref_event_id,panic_label,severity
27164,SYM2-1-422_2022-05-24,SYM2-1-422,2022-05-24,SYM2,0,1,0,1,,0.0,4,,0,
27163,SYM2-1-422_2022-05-23,SYM2-1-422,2022-05-23,SYM2,1,1,0,2,,0.0,3,,0,
27162,SYM2-1-422_2022-05-22,SYM2-1-422,2022-05-22,SYM2,1,1,0,2,,0.0,2,,0,
27161,SYM2-1-422_2022-05-21,SYM2-1-422,2022-05-21,SYM2,1,1,0,2,,0.0,1,,0,
27160,SYM2-1-422_2022-05-20,SYM2-1-422,2022-05-20,SYM2,1,1,0,2,,0.0,0,,0,


## Find Valid Entries
- add `valid_entry_3`: whether the entry has at least 3 consecutive days of prior data (`n_prior_data` >= 3)
- add `valid_entry_2`: whether the entry has at least 2 consecutive days of prior data (`n_prior_data` >= 2)
- add `valid_entry_1`: whether the entry has at least 1 consecutive day of prior data (`n_prior_data` >= 1)

In [17]:
# Flag entries that have at least k consecutive prior days of data (k = 3, 2, 1).
# A vectorized comparison replaces three row-wise `apply(axis=1)` passes; the columns
# are created in the same order (3, 2, 1) and hold 0/1 integers.
for min_prior_days in (3, 2, 1):
    metadata_int[f'valid_entry_{min_prior_days}'] = (
        metadata_int['n_prior_data'] >= min_prior_days
    ).astype('int64')

# Keep the reference / label columns at the end of the frame.
move_column(metadata_int, 'ref_event_id', -1)
move_column(metadata_int, 'panic_label', -1)
move_column(metadata_int, 'severity', -1)
display(metadata_int.head(5))

Unnamed: 0,entry_id,ID,date,dataset,dailylog_data,lifelog_data,questionnaire_data,dtype_n,dbp,panic,n_prior_data,valid_entry_3,valid_entry_2,valid_entry_1,ref_event_id,panic_label,severity
29001,SYM2-1-96_2021-08-04,SYM2-1-96,2021-08-04,SYM2,0,1,0,1,,0.0,2,0,1,1,,0,
29000,SYM2-1-96_2021-08-03,SYM2-1-96,2021-08-03,SYM2,0,1,0,1,,0.0,1,0,0,1,,0,
28999,SYM2-1-96_2021-08-02,SYM2-1-96,2021-08-02,SYM2,0,1,0,1,,0.0,0,0,0,0,,0,
28996,SYM2-1-96_2021-07-30,SYM2-1-96,2021-07-30,SYM2,0,1,0,1,,0.0,2,0,1,1,,0,
28995,SYM2-1-96_2021-07-29,SYM2-1-96,2021-07-29,SYM2,0,1,0,1,,0.0,1,0,0,1,,0,


In [18]:
# Consistency check between the `panic` code and `dbp`:
#   panic == 2 (panic event) -> dbp must be 0
#   panic == 1               -> dbp must be 1
has_panic_code_2 = metadata_int['panic'] == 2
has_panic_code_1 = metadata_int['panic'] == 1
inconsistent_event_ids = metadata_int.loc[
    has_panic_code_2 & (metadata_int['dbp'] != 0), 'entry_id'
].unique()
inconsistent_code_1_ids = metadata_int.loc[
    has_panic_code_1 & (metadata_int['dbp'] != 1), 'entry_id'
].unique()

if len(inconsistent_event_ids) > 0:
    raise ValueError("Entries found with dbp != 0 for panic events. Please check the data.")
if len(inconsistent_code_1_ids) > 0:
    raise ValueError("Entries found with dbp != 1 for panic = 1. Please check the data.")

# Clean up the temporaries once both checks have passed.
del has_panic_code_2, has_panic_code_1, inconsistent_event_ids, inconsistent_code_1_ids

## 💾 | Save Metadata

In [19]:
# Take an independent copy of the processed metadata as the final `metadata` frame.
# (The former `create_empty_df()` placeholder was overwritten immediately and is dropped.)
metadata = metadata_int.copy()

# The metadata CSV goes to the per-file OUTPUT_PATH; the features dict goes to OUT_PATH.
save_as_csv(metadata, OUTPUT_PATH, f"panic_metadata_{version}({scraped_data_filename})")
save_dict_to_file(features_dict, OUT_PATH, "panic_features_dict")

display(metadata.head(10))

DEBUG - (text_utils.py) save_as_csv: Saved panic_metadata_2-0(final_result_20250620_360).csv to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output\final_result_20250620_360\preprocessed
DEBUG - (json_utils.py) save_dict_to_file: Dictionary saved successfully to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output\panic_features_dict.json


Unnamed: 0,entry_id,ID,date,dataset,dailylog_data,lifelog_data,questionnaire_data,dtype_n,dbp,panic,n_prior_data,valid_entry_3,valid_entry_2,valid_entry_1,ref_event_id,panic_label,severity
29001,SYM2-1-96_2021-08-04,SYM2-1-96,2021-08-04,SYM2,0,1,0,1,,0.0,2,0,1,1,,0,
29000,SYM2-1-96_2021-08-03,SYM2-1-96,2021-08-03,SYM2,0,1,0,1,,0.0,1,0,0,1,,0,
28999,SYM2-1-96_2021-08-02,SYM2-1-96,2021-08-02,SYM2,0,1,0,1,,0.0,0,0,0,0,,0,
28996,SYM2-1-96_2021-07-30,SYM2-1-96,2021-07-30,SYM2,0,1,0,1,,0.0,2,0,1,1,,0,
28995,SYM2-1-96_2021-07-29,SYM2-1-96,2021-07-29,SYM2,0,1,0,1,,0.0,1,0,0,1,,0,
28994,SYM2-1-96_2021-07-28,SYM2-1-96,2021-07-28,SYM2,0,1,0,1,,0.0,0,0,0,0,,0,
28988,SYM2-1-96_2021-07-22,SYM2-1-96,2021-07-22,SYM2,0,1,0,1,,0.0,1,0,0,1,,0,
28987,SYM2-1-96_2021-07-21,SYM2-1-96,2021-07-21,SYM2,0,1,0,1,,0.0,0,0,0,0,,0,
28975,SYM2-1-96_2021-07-09,SYM2-1-96,2021-07-09,SYM2,0,1,0,1,,0.0,5,1,1,1,,0,
28974,SYM2-1-96_2021-07-08,SYM2-1-96,2021-07-08,SYM2,0,1,0,1,,0.0,4,1,1,1,,0,


# 🔍 | Data Analysis

## Overview

In [20]:
# --- Patients: unique IDs before vs. after preprocessing ---
scraped_unique_ids = scraped_data['ID'].unique()
data_pre_unique_ids = data_pre['ID'].unique()
print(f"Scraped Unique IDs: {len(scraped_unique_ids)} -> {len(data_pre_unique_ids)} after preprocessing. discarded {len(scraped_unique_ids) - len(data_pre_unique_ids)} IDs.")

# --- Entries: scraped rows vs. unique preprocessed entry_ids ---
scraped_data_n = scraped_data.shape[0]
data_pre_entry_ids = data_pre['entry_id'].unique()
print(f"Scraped Entries: {scraped_data_n} -> {len(data_pre_entry_ids)} after preprocessing. discarded {scraped_data_n - len(data_pre_entry_ids)} entries.")
# NOTE(review): the figure below subtracts a count of discarded *IDs* from a count of
# discarded *entries* -- confirm that this is the intended definition.
print(f"    Unnaccounted entry removal: {scraped_data_n - len(data_pre_entry_ids) - (len(scraped_unique_ids) - len(data_pre_unique_ids))} entries removed")

# --- Panic events: three ways of counting must agree after preprocessing ---
scraped_panic_events = int((scraped_data['panic'] == 2).sum())
data_pre_panic_events = int((data_pre['panic'] == 2).sum())
data_pre_dbp_panic_events = int((data_pre['dbp'] == 0).sum())
data_pre_label_panic_events = int((data_pre['panic_label'] == 1).sum())
if data_pre_dbp_panic_events != data_pre_panic_events:
    raise ValueError("Mismatch in panic events count: dbp panic events and panic events do not match.")
if data_pre_label_panic_events != data_pre_panic_events:
    raise ValueError("Mismatch in panic events count: label panic events and panic events do not match.")
print(f"Scraped Panic Events: {scraped_panic_events} -> {data_pre_panic_events} after preprocessing. discarded {scraped_panic_events - data_pre_panic_events} panic events.")

# --- Optional export of entries dropped between data_pre_init and data_pre ---
if save_unnaccounted_data:
    logging.info("Saving unaccounted data...")
    # entry_ids present in data_pre_init but absent from data_pre
    missing_entry_ids = set(data_pre_init['entry_id']).difference(data_pre['entry_id'])
    if missing_entry_ids:
        print(f"Missing entry IDs: {len(missing_entry_ids)}")
        is_missing_entry = data_pre_init['entry_id'].isin(missing_entry_ids)
        unnaccounted_data = data_pre_init[is_missing_entry]
        save_as_csv(unnaccounted_data, TMP_PATH, f"unnaccounted_data_{scraped_data_filename}")

Scraped Unique IDs: 429 -> 273 after preprocessing. discarded 156 IDs.
Scraped Entries: 29002 -> 23828 after preprocessing. discarded 5174 entries.
    Unnaccounted entry removal: 5018 entries removed
Scraped Panic Events: 811 -> 811 after preprocessing. discarded 0 panic events.


In [21]:
# Share of entries whose flag column equals 1, per data group.
# One loop replaces three copy-pasted prints that each filtered the frame twice;
# the printed text is unchanged.
data_pre_entry_ids = data_pre['entry_id'].unique()
n_total_entries = len(data_pre_entry_ids)
for group_label, flag_column in (
    ("daily log", 'dailylog_data'),
    ("life log", 'lifelog_data'),
    ("questionnaire", 'questionnaire_data'),
    # ("panic diary", 'diary_data'),  # kept disabled, as in the original cell
):
    n_flagged = int((metadata[flag_column] == 1).sum())
    print(f"Total number of {group_label} entries: {n_flagged} / {n_total_entries} ({n_flagged / n_total_entries * 100:.2f}%)")

Total number of daily log entries: 14854 / 23828 (62.34%)
Total number of life log entries: 12408 / 23828 (52.07%)
Total number of questionnaire entries: 17978 / 23828 (75.45%)


In [22]:
# Unique patient IDs that have at least one entry labelled as a panic event.
has_panic_label = metadata['panic_label'] == 1
panic_patients = metadata.loc[has_panic_label, 'ID'].unique()
print(f"Total number of patients with panic events: {len(panic_patients)}")

Total number of patients with panic events: 105


In [23]:
# Distribution of entries over `dbp` / `panic` values.
# Uses `data_pre_entry_ids` (unique preprocessed entry_ids) defined in a preceding cell.
n_total_entries = len(data_pre_entry_ids)

panic_1_entries = int((metadata['panic'] == 1).sum())
dbp_1_entries = int((metadata['dbp'] == 1).sum())
dbp_2_entries = int((metadata['dbp'] == 2).sum())
dbp_3_entries = int((metadata['dbp'] == 3).sum())
panic_0_entries = int((metadata['panic'] == 0).sum())

# Validate before reporting anything.
if panic_1_entries != dbp_1_entries:
    raise ValueError("Mismatch in panic entries count: panic entries and dbp entries do not match.")
if len(metadata) != len(data_pre):
    # Previously raised a bare "Error"; the message now states what mismatched.
    raise ValueError(
        f"Row count mismatch: metadata has {len(metadata)} rows but data_pre has {len(data_pre)} rows."
    )

# One loop replaces four copy-pasted prints; the printed text is unchanged.
for condition_label, n_matching in (
    ("dbp = 1 (panic = 1)", panic_1_entries),
    ("dbp = 2", dbp_2_entries),
    ("dbp = 3", dbp_3_entries),
    ("panic = 0", panic_0_entries),
):
    print(f"Total number of entries with {condition_label}: {n_matching} / {n_total_entries} ({n_matching / n_total_entries * 100:.2f}%)")

Total number of entries with dbp = 1 (panic = 1): 502 / 23828 (2.11%)
Total number of entries with dbp = 2: 384 / 23828 (1.61%)
Total number of entries with dbp = 3: 326 / 23828 (1.37%)
Total number of entries with panic = 0: 22515 / 23828 (94.49%)


## Data Displayed

In [24]:
# Print the shape and preview the first two rows of each stage of the pipeline.
for shape_caption, stage_frame in (
    ("Scraped data shape:", scraped_data),
    ("Data preprocessed shape:", data_pre),
    ("Metadata shape:", metadata),
):
    print(shape_caption, stage_frame.shape)
    display(stage_frame.head(2))

Scraped data shape: (29002, 77)


Unnamed: 0,ID,date,panic,gender,PHQ_9,STAI_X2,CSM,CTQ_1,CTQ_2,CTQ_3,...,HR_mean,HR_hvar_mean,SLT1,SLT2,SLT3,SLT4,SLT5,SLT6,total_sleep,severity
0,PXPN_10006,2024-11-04,0.0,0,0.0,32.0,31.0,11.0,13.0,17.0,...,74.33,123.22,,,,,,,,
1,PXPN_10006,2024-11-05,0.0,0,0.0,32.0,31.0,11.0,13.0,17.0,...,54.81,29.4,0.0,4.47,3.62,4.67,0.65,1.85,15.26,


Data preprocessed shape: (23828, 73)


Unnamed: 0,entry_id,dataset,ID,date,PHQ_9,STAI_X2,CSM,CTQ_1,CTQ_2,CTQ_3,...,SLT2,SLT3,SLT4,SLT5,SLT6,total_sleep,dbp,panic,severity,panic_label
0,PXPN_10006_2024-11-04,PXPN,PXPN_10006,2024-11-04,0.0,32.0,31.0,11.0,13.0,17.0,...,,,,,,,,0.0,,0
1,PXPN_10006_2024-11-05,PXPN,PXPN_10006,2024-11-05,0.0,32.0,31.0,11.0,13.0,17.0,...,4.47,3.62,4.67,0.65,1.85,15.26,,0.0,,0


Metadata shape: (23828, 17)


Unnamed: 0,entry_id,ID,date,dataset,dailylog_data,lifelog_data,questionnaire_data,dtype_n,dbp,panic,n_prior_data,valid_entry_3,valid_entry_2,valid_entry_1,ref_event_id,panic_label,severity
29001,SYM2-1-96_2021-08-04,SYM2-1-96,2021-08-04,SYM2,0,1,0,1,,0.0,2,0,1,1,,0,
29000,SYM2-1-96_2021-08-03,SYM2-1-96,2021-08-03,SYM2,0,1,0,1,,0.0,1,0,0,1,,0,
