# Panic Project (DHLAB) - Data Analysis

author:  `@cyshin971`  

date:    `2025-06-12`  

version: `1.0`

# 📚 | Import Libraries 

In [1]:
import config as cfg
import logging

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
logging.getLogger('matplotlib').setLevel(logging.WARNING)
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

from library.pandas_utils import move_column, remove_columns, aggregate_by_column, create_empty_df, find_unique_row
from library.text_utils import save_as_csv
# from library.path_utils import get_file_path
from library.matplotlib_utils import plot_histogram_of_counts
from library.json_utils import save_dict_to_file

# 📁 | Path Variables 

In [2]:
DATA_PATH = "../_data"
TMP_PATH = "./cys/_tmp"
OUTPUT_PATH = "./cys/_output"

# ⛏️ | Scraped Data

load preprocessed data (by `junyeol_lee`)
- Each entry are the datapoints for a patient (`ID`) on a specific date (`date`)
- If there were multiple datapoints for a specific date (`date`) for a specific patient (`ID`), the values were processed (`sum`, `avg`, etc.) to a representation for the day

**Demography**
| Feature     | Source |  Data Type |  Range         |  Description           |  Data Type |  Description      | Change         |
|:------------|:-------|:--------------|:------------------|:-------------------------|:------------------|:------------------------|:---------------|
|             |        | **raw**       |          |           | **scraped**       |         |     |
| `gender`    |        | `boolean`       |        | Male (?), Female (?)     | `boolean`           | Male (?), Female (?)    |   |
| `age`       |        | `int`           | 0–120?            | Age in years             | `int`               | Age in years            |      |
| `marriage`  |        |               |                   |                         |                   |                        |                |
| `smkHx`     |        |               |                   |                         |                   |                        |                |
| `drinkHx`   |        |               |                   |                         |                   |                        |                |
| `suicideHx` |        |               |                   |                         |                   |                        |                |

Daily Log:
- `panic`
- `severity`
- `exercise`
- `alcohol`
- `coffee`
- `menstruation`
- `smoking`: (`boolean`) **-> change to (``int``)**
- `positive_feeling`
- `negative_feeling`
- `positive_E`
- `negative_E`
- `anxiety`
- `annoying`

## Scraped Data Features

In [3]:
metadata_filename = "final_result_20250612"

features_dict = {
	"demography": [
		'gender', 'age', 'marriage', 'job', 'smkHx', 'drinkHx', 'suicideHx'
	],
	"dailylog": [
		'panic', 'severity', 'exercise', 'alcohol', 'coffee', 'menstruation',
		'smoking', 'positive_feeling', 'negative_feeling', 'positive_E', 'negative_E',
		'anxiety', 'annoying'
	],
	"lifelog": [
		'HR_var', 'HR_max', 'HR_mean', 'HR_hvar_mean', 'HR_acrophase', 'HR_amplitude',
		'HR_mesor','HR_acrophase_difference', 'HR_acrophase_difference_2d', 'HR_amplitude_difference',
		'HR_amplitude_difference_2d', 'HR_mesor_difference', 'HR_mesor_difference_2d',
		'bandpower(0.001-0.0005Hz)', 'bandpower(0.0005-0.0001Hz)', 'bandpower(0.0001-0.00005Hz)', 'bandpower(0.00005-0.00001Hz)',
		'steps', 'SLT1', 'SLT2', 'SLT3', 'SLT4', 'SLT5', 'SLT6', 'total_sleep'
	],
	"questionnaire": [
		'PHQ_9', 'STAI_X2', 'CSM', 'CTQ_1', 'CTQ_2', 'CTQ_3', 'CTQ_4', 'CTQ_5', 'KRQ', 'MDQ',
		'ACQ', 'APPQ_1', 'APPQ_2', 'APPQ_3', 'BSQ', 'GAD_7', 'BRIAN'
	],
    "id": [
        'ID', 'date'
    ],
    "label": [
        'severity'
    ],
    "metadata": []
}

demo_vars = features_dict["demography"]
dailylog_vars = features_dict["dailylog"]
lifelog_vars = features_dict["lifelog"]
questionnaire_vars = features_dict["questionnaire"]

state_vars = demo_vars
trait_vars = dailylog_vars + lifelog_vars + questionnaire_vars
all_vars = state_vars + dailylog_vars + lifelog_vars + questionnaire_vars

print(f'Number of variables: {len(all_vars)}')
print(f'   Demographic variables: {len(state_vars)}')
print(f'   Daily log variables: {len(dailylog_vars)}')
print(f'   Life log variables: {len(lifelog_vars)}')
print(f'   Questionnaire variables: {len(questionnaire_vars)}')

save_dict_to_file(features_dict, TMP_PATH, "scraped_features")

DEBUG - (json_utils.py) save_dict_to_file: Dictionary saved successfully to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_tmp\scraped_features.json


Number of variables: 62
   Demographic variables: 7
   Daily log variables: 13
   Life log variables: 25
   Questionnaire variables: 17


WindowsPath('C:/Users/cyshi/OneDrive/Documents/GitHub/Panic-Project-CYS/cys/_tmp/scraped_features.json')

## Load Scraped Data

In [4]:
scraped_data = pd.read_csv(os.path.join(DATA_PATH, f"{metadata_filename}.csv"))

# check if all columns are present
missing_cols = [col for col in all_vars if col not in scraped_data.columns]
if missing_cols:
    logging.warning(f"Missing columns in scraped_data: {missing_cols}")
else:
	logging.info("All expected columns are present in scraped_data.")
# convert date column to datetime format
scraped_data['date'] = pd.to_datetime(scraped_data['date'], format='%Y-%m-%d')
remove_columns(scraped_data, ['Unnamed: 0'])

print(f"Number of rows: {scraped_data.shape[0]}")
print(f"Number of columns: {scraped_data.shape[1]}")
display(scraped_data.head(5))

INFO - (1965066665.py) <module>: All expected columns are present in scraped_data.


Number of rows: 24370
Number of columns: 85


Unnamed: 0,ID,date,panic,gender,marriage,job,smkHx,drinkHx,suicideHx,suicide_need,...,STAI_X1,SLT1,SLT2,SLT3,SLT4,SLT5,SLT6,total_sleep,smoking,menstruation
0,SYM1-1-100,2021-03-02,2.0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,SYM1-1-100,2021-03-03,2.0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,SYM1-1-100,2021-03-19,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,8.0,0.0,0.0,0.0,0.0,8.0,,
3,SYM1-1-100,2021-08-12,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,,0.9,0.0,0.0,3.68,1.23,1.52,7.33,,
4,SYM1-1-102,NaT,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


# ⚒️ | Data Preprocessing

## Initialize Preprocessed Data

- add `entry_id` to identify each entry: `'ID'_'date'`
- add dataset to identify source: `SYM1`, `SYM2`, `PXPN`
- convert `panic` (`0`, `1`, `2` = panic) to days befor panic (`dbp`) (panic = `0`, `1`, `2`)
- convert `0` values in `exercise`, `alcohol`, `coffee`, `menstruation`, `smoking` to null values `np.nan` (`NaN`)

In [5]:
data_pre_init = create_empty_df()
data_pre_init = scraped_data.copy()

# Add 'entry_id' column: unique identifier for each row
data_pre_init['entry_id'] = data_pre_init['ID'] + '_' + data_pre_init['date'].astype(str)
instance_id_unique = data_pre_init['entry_id'].unique()
move_column(data_pre_init, 'entry_id', 0)
print("Number of unique entry IDs:", len(instance_id_unique))
# Check if 'entry_id' is unique
if data_pre_init['entry_id'].duplicated().any():
	# return the rows with duplicate 'entry_id'
	duplicates = data_pre_init[data_pre_init['entry_id'].duplicated(keep=False)]
	print(f"Duplicate entry_id found [{len(duplicates)}]:")
	display(duplicates.head(5))
	save_as_csv(duplicates, TMP_PATH, f"duplicates_{metadata_filename}")

# Add 'dataset' column: source of data
data_pre_init['dataset'] = data_pre_init['ID'].str.split('_').str[0]
data_pre_init['dataset'] = data_pre_init['dataset'].str.split('-').str[0]
move_column(data_pre_init, 'dataset', 1)

# Convert 'panic' column to Days Before Panic (dbp)
data_pre_init['dbp'] = data_pre_init.apply(
	lambda row: np.nan if row['panic'] == 0
 				else 0 if row['panic'] == 2 else row['panic'],
	axis=1
)
remove_columns(data_pre_init, ['panic'])

# Convert 'daily_log' variables = 0 to NaN
data_pre_init['exercise'] = data_pre_init['exercise'].replace(0, np.nan)
data_pre_init['alcohol'] = data_pre_init['alcohol'].replace(0, np.nan)
data_pre_init['coffee'] = data_pre_init['coffee'].replace(0, np.nan)
data_pre_init['menstruation'] = data_pre_init['menstruation'].replace(0, np.nan)
data_pre_init['smoking'] = data_pre_init['smoking'].replace(0, np.nan)

# Update the features_dict
features_dict['id'] = ['entry_id'] + features_dict['id'] + ['dataset']
features_dict['label'] = ['dbp'] + features_dict['label']
features_dict['dailylog'].remove('panic')

Number of unique entry IDs: 24370


In [6]:
display(data_pre_init.head(5))
print("Unique sources in metadata_ljy: ", data_pre_init['dataset'].unique())
print("Number of entries in metadata_ljy:", data_pre_init.shape[0])
sym1_n = data_pre_init[data_pre_init['dataset'] == 'SYM1'].shape[0]
sym2_n = data_pre_init[data_pre_init['dataset'] == 'SYM2'].shape[0]
print("    SYM entries:", sym1_n+sym2_n)
print("    PXPN entries:", data_pre_init[data_pre_init['dataset'] == 'PXPN'].shape[0])
print("Number of unique IDs in metadata_ljy:", len(data_pre_init['ID'].unique()))
# find the unique IDs for SYM1 and SYM2
sym1_ids = data_pre_init[data_pre_init['dataset'] == 'SYM1']['ID'].unique()
sym2_ids = data_pre_init[data_pre_init['dataset'] == 'SYM2']['ID'].unique()
print("    SYM IDs: ", len(sym1_ids)+len(sym2_ids))
print("    PXPN IDs: ", len(data_pre_init[data_pre_init['dataset'] == 'PXPN']['ID'].unique()))
print("Number of panic events (dbp=0):", data_pre_init[data_pre_init['dbp'] == 0].shape[0])

Unnamed: 0,entry_id,dataset,ID,date,gender,marriage,job,smkHx,drinkHx,suicideHx,...,SLT1,SLT2,SLT3,SLT4,SLT5,SLT6,total_sleep,smoking,menstruation,dbp
0,SYM1-1-100_2021-03-02,SYM1,SYM1-1-100,2021-03-02,1,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
1,SYM1-1-100_2021-03-03,SYM1,SYM1-1-100,2021-03-03,1,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
2,SYM1-1-100_2021-03-19,SYM1,SYM1-1-100,2021-03-19,1,0.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,8.0,,,
3,SYM1-1-100_2021-08-12,SYM1,SYM1-1-100,2021-08-12,1,0.0,0.0,0.0,0.0,0.0,...,0.9,0.0,0.0,3.68,1.23,1.52,7.33,,,
4,SYM1-1-102_NaT,SYM1,SYM1-1-102,NaT,0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


Unique sources in metadata_ljy:  ['SYM1' 'SYM2' 'PXPN']
Number of entries in metadata_ljy: 24370
    SYM entries: 23531
    PXPN entries: 839
Number of unique IDs in metadata_ljy: 429
    SYM IDs:  400
    PXPN IDs:  29
Number of panic events (dbp=0): 802


## Initialize Metadata

initialize `metadata` by adding
- `demography_data` : whether demography data exists in the entry (`boolean`)
- `dailylog_data`, `lifelog_data`, `questionnaire_data` : whether each data group exists in the entry (`boolean`)
- `dtype_n` : how many of the 3 `state` groups exists in the entry (`int`)
- `panic_label` : whether a panic occured in the entry (`boolean`)

In [7]:
metadata_init = create_empty_df()
metadata_init = data_pre_init.copy()

metadata_init['demography_data'] = metadata_init[features_dict['demography']].notnull().any(axis=1).astype(int)
metadata_init['dailylog_data'] = metadata_init[features_dict['dailylog']].notnull().any(axis=1).astype(int)
metadata_init['lifelog_data'] = metadata_init[features_dict['lifelog']].notnull().any(axis=1).astype(int)
metadata_init['questionnaire_data'] = metadata_init[features_dict['questionnaire']].notnull().any(axis=1).astype(int)
metadata_init['dtype_n'] = metadata_init['dailylog_data'] + metadata_init['lifelog_data'] + metadata_init['questionnaire_data']
move_column(metadata_init, 'dtype_n', 8)
metadata_init['panic_label'] = metadata_init['dbp'].apply(lambda x: 1 if x == 0 else 0)

add_list = ['dailylog_data', 'lifelog_data', 'questionnaire_data', 'dtype_n']
for item in add_list:
	if item not in features_dict['metadata']:
		features_dict['metadata'].append(item)
del add_list
if 'panic_label' not in features_dict['label']:
	features_dict['label'].append('panic_label')

check_metadata = False
if check_metadata:
    check_type = 'dailylog' # demography, dailylog, lifelog, questionnaire
    check_for = 1
    test = metadata_init[metadata_init[check_type+'_data'] == check_for].copy()
    test = test[features_dict['id']+features_dict['metadata']+features_dict[check_type]]
    print(f"--------- TEST {test.shape[0]} ENTRIES WITH {check_type} = {check_for} ---------")
    display(test.head(10))
    save_as_csv(test, TMP_PATH, f"metadata_{check_type}_{check_for}")
    print("------------------------------------------------------------------------")
    del test, check_type, check_for

#metadata_init = metadata_init[features_dict['id'] + features_dict['metadata'] + features_dict['demography'] + features_dict['label']]
display(metadata_init.head(5))

Unnamed: 0,entry_id,dataset,ID,date,gender,marriage,job,smkHx,dtype_n,drinkHx,...,SLT6,total_sleep,smoking,menstruation,dbp,demography_data,dailylog_data,lifelog_data,questionnaire_data,panic_label
0,SYM1-1-100_2021-03-02,SYM1,SYM1-1-100,2021-03-02,1,0.0,0.0,0.0,2,0.0,...,,,,,0.0,1,1,0,1,1
1,SYM1-1-100_2021-03-03,SYM1,SYM1-1-100,2021-03-03,1,0.0,0.0,0.0,2,0.0,...,,,,,0.0,1,1,0,1,1
2,SYM1-1-100_2021-03-19,SYM1,SYM1-1-100,2021-03-19,1,0.0,0.0,0.0,2,0.0,...,0.0,8.0,,,,1,0,1,1,0
3,SYM1-1-100_2021-08-12,SYM1,SYM1-1-100,2021-08-12,1,0.0,0.0,0.0,2,0.0,...,1.52,7.33,,,,1,0,1,1,0
4,SYM1-1-102_NaT,SYM1,SYM1-1-102,NaT,0,0.0,0.0,0.0,0,0.0,...,,,,,,1,0,0,0,0


## Extract Demography Data

- All patients within the scraped data were confirmed to have demographic data (`demography_data` = `True`)
- as such demography_data will not be included in the `metadata`
- Demography data was extracted and saved as `demography.csv` to the `output` directory

In [8]:
agg_matrix = [
	('gender_n', 'gender', 'nunique'),
	('age_n', 'age', 'nunique'),
	('marriage_n', 'marriage', 'nunique'),
	('job_n', 'job', 'nunique'),
	('smkHx_n', 'smkHx', 'nunique'),
	('drinkHx_n', 'drinkHx', 'nunique'),
	('suicideHx_n', 'suicideHx', 'nunique'),
    ('gender', 'gender', 'first'),
	('age', 'age', 'first'),
	('marriage', 'marriage', 'first'),
	('job', 'job', 'first'),
	('smkHx', 'smkHx', 'first'),
	('drinkHx', 'drinkHx', 'first'),
	('suicideHx', 'suicideHx', 'first')
]
demo_data = create_empty_df()
demo_data = aggregate_by_column(metadata_init, 'ID', agg_matrix)
# check if the length of each unique value is 1
non_unique_cols = []
for col in features_dict['demography']:
	if demo_data[col+'_n'].apply(lambda x: x > 1).any():
		non_unique_cols.append(col)
if non_unique_cols:
	raise ValueError(f"Demographic columns {non_unique_cols} are not unique for each ID in demo_data.")
else:
	print("All demographic columns are unique for each ID in demo_data.")

for col in features_dict['demography']:
	remove_columns(demo_data, [col+'_n'])
print(f"Number of rows in demo_data: {demo_data.shape[0]}")
display(demo_data.head(5))

save_as_csv(demo_data, OUTPUT_PATH, f"demography_({metadata_filename})")

All demographic columns are unique for each ID in demo_data.
Number of rows in demo_data: 429


Unnamed: 0,ID,gender,age,marriage,job,smkHx,drinkHx,suicideHx
0,PXPN_10006,0,32.0,0.0,1.0,1.0,1.0,0.0
1,PXPN_10007,1,38.0,1.0,1.0,0.0,0.0,0.0
2,PXPN_10008,0,38.0,1.0,0.0,0.0,1.0,0.0
3,PXPN_10009,1,28.0,0.0,0.0,1.0,0.0,1.0
4,PXPN_10010,1,21.0,0.0,0.0,1.0,1.0,0.0


DEBUG - (text_utils.py) save_as_csv: Saved demography_(final_result_20250612).csv to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output


WindowsPath('C:/Users/cyshi/OneDrive/Documents/GitHub/Panic-Project-CYS/cys/_output/demography_(final_result_20250612).csv')

## Construct Intermediate Metadata
- the current `metadata` (`metadata_init`) was filtered to include only columns for identification, added columns for metadata, and labels
- the `metadata` was also filtered to get rid of all entries that only have demography data (`dtype_n` = 0)

In [9]:
metadata_int = create_empty_df()
metadata_int = metadata_init.copy()

metadata_int = metadata_int[features_dict['id'] + features_dict['metadata'] + features_dict['label']]
move_column(metadata_int, 'severity', -1)
metadata_int = metadata_int[metadata_int['dtype_n'] > 0]
metadata_int.sort_values(by=['ID', 'date'], inplace=True)
display(metadata_int.head(5))

Unnamed: 0,entry_id,ID,date,dataset,dailylog_data,lifelog_data,questionnaire_data,dtype_n,dbp,panic_label,severity
23531,PXPN_10006_2024-11-04,PXPN_10006,2024-11-04,PXPN,1,1,1,3,,0,
23532,PXPN_10006_2024-11-05,PXPN_10006,2024-11-05,PXPN,1,1,1,3,,0,
23533,PXPN_10006_2024-11-06,PXPN_10006,2024-11-06,PXPN,1,1,1,3,1.0,0,
23534,PXPN_10006_2024-11-07,PXPN_10006,2024-11-07,PXPN,1,1,1,3,0.0,1,1.0
23535,PXPN_10006_2024-11-08,PXPN_10006,2024-11-08,PXPN,0,1,1,2,,0,


## Filter Preprocessed Data

- demographic features were removed from preprocessed data (`data_pre`)
- the data was filtered to remove entries with only demgraphic data
- the removed IDs were checked to see if no relevant entries were discarded

In [10]:
data_pre = create_empty_df()
data_pre = data_pre_init.copy()
# Remove demographic features from data_proc
remove_columns(data_pre, features_dict['demography'])
# Filter data_proc to keep only rows with entry IDs present in metadata_int
metadata_int_unique_ids = metadata_int['entry_id'].unique()
data_pre = data_pre[data_pre['entry_id'].isin(metadata_int_unique_ids)]

# remove rows with null dates
data_pre = data_pre[data_pre['date'].notnull()]

# Find IDs present in unfiltered_data but missing in filtered_data (i.e., lost after filtering)
check_missing_ids = False
if check_missing_ids:
	missing_ids = np.setdiff1d(data_pre_init['ID'].unique(), data_pre['ID'].unique())
	missing_data = data_pre_init[data_pre_init['ID'].isin(missing_ids)]
	print(f"Number of IDs lost after filtering: {len(missing_ids)}")
	_ = save_as_csv(missing_data, TMP_PATH, f"missing_{metadata_filename}")

## 💾 | Save Preprocessed Data

In [11]:
# save data_pre to CSV
save_as_csv(data_pre, OUTPUT_PATH, f"panic_pre_({metadata_filename})")

display(data_pre.head(3))
print("--------------------------------------------------------")
print("Total entries in original: ", data_pre_init.shape[0])
sym1_n = data_pre_init[data_pre_init['dataset'] == 'SYM1'].shape[0]
sym2_n = data_pre_init[data_pre_init['dataset'] == 'SYM2'].shape[0]
print("    SYM entries:", sym1_n+sym2_n)
print("    PXPN entries:", data_pre_init[data_pre_init['dataset'] == 'PXPN'].shape[0])
print("Number of unique IDs in original:", len(data_pre_init['ID'].unique()))
# find the unique IDs for SYM1 and SYM2
sym1_ids = data_pre_init[data_pre_init['dataset'] == 'SYM1']['ID'].unique()
sym2_ids = data_pre_init[data_pre_init['dataset'] == 'SYM2']['ID'].unique()
print("    SYM IDs: ", len(sym1_ids)+len(sym2_ids))
print("    PXPN IDs: ", len(data_pre_init[data_pre_init['dataset'] == 'PXPN']['ID'].unique()))
print("Number of panic events (dbp=0):", data_pre_init[data_pre_init['dbp'] == 0].shape[0])
print("--------------------------------------------------------")
print("Total entries in filtered: ", data_pre['dataset'].unique())
sym1_n = data_pre[data_pre['dataset'] == 'SYM1'].shape[0]
sym2_n = data_pre[data_pre['dataset'] == 'SYM2'].shape[0]
print("    SYM entries:", sym1_n+sym2_n)
print("    PXPN entries:", data_pre[data_pre['dataset'] == 'PXPN'].shape[0])
print("Number of unique IDs in filtered:", len(data_pre['ID'].unique()))
# find the unique IDs for SYM1 and SYM2
sym1_ids = data_pre[data_pre['dataset'] == 'SYM1']['ID'].unique()
sym2_ids = data_pre[data_pre['dataset'] == 'SYM2']['ID'].unique()
print("    SYM IDs: ", len(sym1_ids)+len(sym2_ids))
print("    PXPN IDs: ", len(data_pre[data_pre['dataset'] == 'PXPN']['ID'].unique()))
print("Number of panic events (dbp=0):", data_pre[data_pre['dbp'] == 0].shape[0])

DEBUG - (text_utils.py) save_as_csv: Saved panic_pre_(final_result_20250612).csv to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output


Unnamed: 0,entry_id,dataset,ID,date,suicide_need,medication_in_month,alcohol,bandpower(0.001-0.0005Hz),bandpower(0.0005-0.0001Hz),bandpower(0.0001-0.00005Hz),...,SLT1,SLT2,SLT3,SLT4,SLT5,SLT6,total_sleep,smoking,menstruation,dbp
0,SYM1-1-100_2021-03-02,SYM1,SYM1-1-100,2021-03-02,0.0,0.0,3.0,,,,...,,,,,,,,,,0.0
1,SYM1-1-100_2021-03-03,SYM1,SYM1-1-100,2021-03-03,0.0,0.0,3.0,,,,...,,,,,,,,,,0.0
2,SYM1-1-100_2021-03-19,SYM1,SYM1-1-100,2021-03-19,0.0,0.0,,,,,...,0.0,8.0,0.0,0.0,0.0,0.0,8.0,,,


--------------------------------------------------------
Total entries in original:  24370
    SYM entries: 23531
    PXPN entries: 839
Number of unique IDs in original: 429
    SYM IDs:  400
    PXPN IDs:  29
Number of panic events (dbp=0): 802
--------------------------------------------------------
Total entries in filtered:  ['SYM1' 'SYM2' 'PXPN']
    SYM entries: 23282
    PXPN entries: 814
Number of unique IDs in filtered: 274
    SYM IDs:  245
    PXPN IDs:  29
Number of panic events (dbp=0): 802


# 📖 | Metadata

### Calculate Days Before Panic Features for Metadata

In [12]:
# TODO: Not working

metadata_calc = create_empty_df()
metadata_calc = metadata_int.copy()

# Test
p_id = 'PXPN_10006'
metadata_calc = metadata_calc[metadata_calc['ID'] == p_id]

metadata_calc['n_prior_data']    = None
metadata_calc['ref_event_id']    = None
move_column(metadata_calc, 'panic_label', -1)
move_column(metadata_calc, 'severity', -1)
metadata_calc.sort_values(by=['ID', 'date'], ascending=False, inplace=True)

delta_days = 3
lookback_limit = 7

def calculate_days_before_panic(patient_id):
    patient_data = metadata_calc[metadata_calc['ID'] == patient_id].sort_values('date', ascending=False)
    idx_map = dict(zip(patient_data['date'], patient_data.index))  # <-- global indices

    entry_dates = patient_data['date'].values
    panic_dates = patient_data.loc[patient_data['dbp'] == 0, 'date'].values

    for panic_date in panic_dates:
        for j in range(1, delta_days + 1):
            prior_date = panic_date - pd.Timedelta(days=j)
            if prior_date in idx_map:
                idx = idx_map[prior_date]  # GLOBAL index
                metadata_calc.at[idx, 'dbp'] = j
                panic_row = patient_data[patient_data['date'] == panic_date]
                if not panic_row.empty:
                    metadata_calc.at[idx, 'ref_event_id'] = panic_row.iloc[0]['entry_id']

    date_set = set(entry_dates)
    for entry_date in entry_dates:
        idx = idx_map[entry_date]  # GLOBAL index
        for j in range(1, lookback_limit + 1):
            if j == lookback_limit:
                metadata_calc.at[idx, 'n_prior_data'] = j
                break
            prior_date = entry_date - pd.Timedelta(days=j)
            if prior_date not in date_set:
                break
            prior_idx = idx_map[prior_date]
            if patient_data.loc[prior_idx, 'panic_label'] == 1:
                break
            metadata_calc.at[idx, 'n_prior_data'] = j

# Test
calculate_days_before_panic(p_id)

# patient_ids = metadata_calc['ID'].unique()
# results = Parallel(n_jobs=-1, backend='loky')(
#     delayed(calculate_days_before_panic)(pid)
#     for pid in tqdm(patient_ids, desc="Calculating Days Before Panic")
# )

# for updates in results:
#     for col, updates_dict in updates.items():
#         for idx, value in updates_dict.items():
#             metadata_calc.at[idx, col] = value  # idx is already global

display(metadata_calc.head(50))

KeyError: numpy.datetime64('2024-12-02T00:00:00.000000000')

In [None]:
metadata_calc.sort_values(by=['ID', 'date'], inplace=True)
display(metadata_calc.head(50))

In [None]:
asdf

In [None]:
agg_matrix = [
	('n_entries', 'entry_id', 'count'),
	('n_panic_2', 'panic', lambda x: (x == 2).sum())
]

metadata_ljy_agg = aggregate_by_column(metadata, 'ID', agg_matrix)
display(metadata_ljy_agg.head(5))

In [None]:
# Find all IDs that ever had panic==2
panic_ids = metadata_ljy_agg.loc[
    metadata_ljy_agg['n_panic_2'] > 0, 'ID'
].unique()
print("Unique IDs with panic events (panic=2):", len(panic_ids))
print(f"Number of panic events (panic=2): {n_panic_2}")
print("--------------------------------------")
plot_histogram_of_counts(metadata_ljy_agg['n_panic_2'], title="Histogram of Panic Events per ID", xlabel="Number of Panic Events")

In [None]:
agg_matrix = [
	('n_entries', 'ref_event_id', 'count'),
	('n_dates', 'date', 'nunique')
]

metadata_agg = aggregate_by_column(metadata, 'ref_event_id', agg_matrix)
#display(metadata_agg.head(5))

check = metadata_agg[metadata_agg['n_entries'] != metadata_agg['n_dates']]
#print("Entries where n_entries != n_dates:")
#display(check)

In [None]:
# Filter down to only those rows we marked as panic events
panic_data = metadata[['ID', 'date', 'event_id', 'last_panic_days', 'n_prior_data', 'severity']].copy()
panic_data = panic_data[panic_data['event_id'].notnull()]

panic_data = panic_data[
    (panic_data['last_panic_days'] > delta_days) &
    (panic_data['n_prior_data'] >= delta_days) &
    (panic_data['severity'].notnull())
]
print(f"-------- last_panic_days > {delta_days} & n_prior_data ≥ {delta_days} --------")
print(f"Number of qualifying panic events: {panic_data.shape[0]} out of {n_panic_2} ({panic_data.shape[0] / n_panic_2:.2%})")
print(f"Unique IDs with panic events: {len(panic_data['ID'].unique())} out of {len(panic_ids)} ({len(panic_data['ID'].unique()) / len(panic_ids):.2%})")

display(panic_data.head(5))

In [None]:
filtered_metadata = metadata[(metadata['ref_event_id'].notnull()) | (metadata['event_id'].notnull())].copy()
qulifying_event_ids = panic_data['event_id'].unique()
qualifying_metadata = filtered_metadata[filtered_metadata['ref_event_id'].isin(qulifying_event_ids)]
display(qualifying_metadata.head(10))
print(f"Expected number of columns: {len(all_cols) * 3}")

disp_data = filtered_metadata[['ID', 'date', 'event_id', 'ref_event_id', 'last_panic_days', 'n_prior_data']].copy()
display(disp_data.head(50))
del disp_data

In [None]:
agg_matrix = [
	('n_entries', 'ref_event_id', 'count'),
	('n_dates', 'date', 'nunique')
]

qualifying_metadata_agg = aggregate_by_column(qualifying_metadata, 'ref_event_id', agg_matrix)
#display(filtered_metadata_agg.head(5))
plot_histogram_of_counts(qualifying_metadata_agg['n_entries'], title="Histogram of Entries per Ref Event ID", xlabel="Number of Entries", bins_step=1)
check = qualifying_metadata_agg[qualifying_metadata_agg['n_entries'] != qualifying_metadata_agg['n_dates']]
print("Entries where n_entries != n_dates:")
display(check)

In [None]:
unique_panic = metadata['severity'].unique()
print(f"\nUnique values in 'panic': {unique_panic}")