# Panic Project (DHLAB) - Data Analysis

# 📚 | Import Libraries 

In [None]:
import config as cfg
import logging

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from library.pandas_utils import move_column, remove_columns, aggregate_by_column, create_empty_df
from library.text_utils import save_as_csv
# from library.path_utils import get_file_path
from library.matplotlib_utils import plot_histogram_of_counts
from library.json_utils import save_dict_to_file

# 📁 | Path Variables 

In [None]:
# Directory that holds the input CSV read into `scraped_data`.
DATA_PATH = "../_data"
# Directory for intermediate/debug CSV dumps (duplicate rows, rows of IDs lost by filtering).
TMP_PATH = "./cys/tmp"
# Root directory for saved results (feature dictionary, demography table).
OUTPUT_PATH = "./cys/output"

# Base name of the input CSV, without extension; ".csv" is appended when the file is loaded.
metadata_filename = "final_result_20250609_02"

# ⛏️ | Scraped Data

Scraped data features

In [None]:
# Column names of the scraped dataset, grouped by the kind of data they hold.
# "id", "label" and "metadata" are bookkeeping groups that later cells extend.
features_dict = {
    "demography": [
        'gender', 'age', 'marriage', 'job', 'smkHx', 'drinkHx', 'suicideHx',
    ],
    "dailylog": [
        'panic', 'severity', 'exercise', 'alcohol', 'coffee', 'menstruation',
        'smoking', 'positive_feeling', 'negative_feeling', 'positive_E', 'negative_E',
        'anxiety', 'annoying',
    ],
    "lifelog": [
        'HR_var', 'HR_max', 'HR_mean', 'HR_hvar_mean', 'HR_acrophase', 'HR_amplitude',
        'HR_mesor', 'HR_acrophase_difference', 'HR_acrophase_difference_2d',
        'HR_amplitude_difference', 'HR_amplitude_difference_2d',
        'HR_mesor_difference', 'HR_mesor_difference_2d',
        'bandpower(0.001-0.0005Hz)', 'bandpower(0.0005-0.0001Hz)',
        'bandpower(0.0001-0.00005Hz)', 'bandpower(0.00005-0.00001Hz)',
        'steps', 'SLT1', 'SLT2', 'SLT3', 'SLT4', 'SLT5', 'SLT6', 'total_sleep',
    ],
    "questionnaire": [
        'PHQ_9', 'STAI_X2', 'CSM', 'CTQ_1', 'CTQ_2', 'CTQ_3', 'CTQ_4', 'CTQ_5', 'KRQ', 'MDQ',
        'ACQ', 'APPQ_1', 'APPQ_2', 'APPQ_3', 'BSQ', 'GAD_7', 'BRIAN',
    ],
    "id": ['ID', 'date'],
    "label": ['severity'],
    "metadata": [],
}

# Short aliases; each one is the very same list object stored in features_dict,
# so in-place changes to a features_dict group are visible through the alias too.
demo_vars, dailylog_vars, lifelog_vars, questionnaire_vars = (
    features_dict[group]
    for group in ("demography", "dailylog", "lifelog", "questionnaire")
)

# NOTE(review): demographics are bound to `state_vars` and the time-varying logs to
# `trait_vars`; the usual state/trait meaning would be the other way round — confirm.
state_vars = demo_vars
trait_vars = dailylog_vars + lifelog_vars + questionnaire_vars
all_vars = state_vars + trait_vars

# Report how many variables each group contributes.
print(f'Number of variables: {len(all_vars)}')
print('\n'.join(
    f'   {title} variables: {len(names)}'
    for title, names in (
        ('Demographic', state_vars),
        ('Daily log', dailylog_vars),
        ('Life log', lifelog_vars),
        ('Questionnaire', questionnaire_vars),
    )
))

# Persist the feature grouping as it stands before later cells modify it.
save_dict_to_file(features_dict, f"{OUTPUT_PATH}/scraped", "scraped_features")

## Load Scraped Data

Load the preprocessed data (`junyeol_lee`).

In [None]:
# Read the preprocessed CSV named by `metadata_filename` from DATA_PATH.
scraped_data = pd.read_csv(os.path.join(DATA_PATH, metadata_filename + ".csv"))

# Report every expected feature column that the file does not provide
# (order follows `all_vars`).
missing_cols = [name for name in all_vars if name not in scraped_data.columns]
if not missing_cols:
    logging.info("All expected columns are present in scraped_data.")
else:
    logging.warning(f"Missing columns in scraped_data: {missing_cols}")

# Parse 'date' as datetime (strict YYYY-MM-DD) and drop the 'Unnamed: 0' column.
scraped_data['date'] = pd.to_datetime(scraped_data['date'], format='%Y-%m-%d')
remove_columns(scraped_data, ['Unnamed: 0'])

print(f"Number of rows: {len(scraped_data)}")
print(f"Number of columns: {len(scraped_data.columns)}")
display(scraped_data.head())

Add more descriptors to the metadata:
- `entry_id`
- `dataset`

# 📖 | Data Analysis

## Initialize Processed Data

In [None]:
# Work on a copy so `scraped_data` stays untouched and this cell can be re-run.
data = scraped_data.copy()

# Add 'entry_id' column: unique identifier for each row, built as "<ID>_<date>"
data['entry_id'] = data['ID'] + '_' + data['date'].astype(str)
instance_id_unique = data['entry_id'].unique()
move_column(data, 'entry_id', 0)
print("Number of unique entry IDs:", len(instance_id_unique))

# Check if 'entry_id' is unique; if not, show and dump every colliding row
duplicate_mask = data['entry_id'].duplicated(keep=False)
if duplicate_mask.any():
    duplicates = data[duplicate_mask]
    print(f"Duplicate entry_id found [{len(duplicates)}]:")
    display(duplicates.head(5))
    save_as_csv(duplicates, TMP_PATH, f"duplicates_{metadata_filename}")

# Add 'dataset' column: source of data, i.e. the part of 'ID' before the first '_' or '-'
data['dataset'] = data['ID'].str.split('_').str[0].str.split('-').str[0]
move_column(data, 'dataset', 1)

# Convert 'panic' column to Days Before Panic (dbp):
#   panic == 0 -> NaN, panic == 2 -> 0, any other value is kept as is.
# Both masks are evaluated on the original codes, so the two rules cannot chain.
panic_code = data['panic']
data['dbp'] = panic_code.mask(panic_code == 0, np.nan).mask(panic_code == 2, 0)
remove_columns(data, ['panic'])

# Convert 'daily_log' variables = 0 to NaN
zero_as_missing = ['exercise', 'alcohol', 'coffee', 'menstruation', 'smoking']
data[zero_as_missing] = data[zero_as_missing].replace(0, np.nan)

# Update the features_dict. Every step is guarded so that re-running this cell
# neither duplicates entries nor fails on the already-removed 'panic'.
if 'entry_id' not in features_dict['id']:
    features_dict['id'] = ['entry_id'] + features_dict['id']
if 'dataset' not in features_dict['id']:
    features_dict['id'] = features_dict['id'] + ['dataset']
if 'dbp' not in features_dict['label']:
    features_dict['label'] = ['dbp'] + features_dict['label']
if 'panic' in features_dict['dailylog']:
    features_dict['dailylog'].remove('panic')

In [None]:
# Preview the processed frame and list the data sources found in it.
display(data.head())
print("Unique sources in metadata_ljy: ", data['dataset'].unique())

# Row counts per source; SYM1 and SYM2 are reported together as "SYM".
sym1_n = len(data.loc[data['dataset'] == 'SYM1'])
sym2_n = len(data.loc[data['dataset'] == 'SYM2'])
print("    SYM entries:", sym1_n+sym2_n)
print("    PXPN entries:", len(data.loc[data['dataset'] == 'PXPN']))

# Distinct participant IDs, overall and per source.
print("Number of unique IDs in metadata_ljy:", data['ID'].unique().size)
sym1_ids = data.loc[data['dataset'] == 'SYM1', 'ID'].unique()
sym2_ids = data.loc[data['dataset'] == 'SYM2', 'ID'].unique()
print("    SYM IDs: ", len(sym1_ids)+len(sym2_ids))
print("    PXPN IDs: ", data.loc[data['dataset'] == 'PXPN', 'ID'].unique().size)

# Rows whose days-before-panic value is 0.
print("Number of panic events (dbp=0):", len(data.loc[data['dbp'] == 0]))

## Initialize Metadata

In [None]:
# Start from the processed frame; presence flags and labels are added as new columns.
metadata_init = data.copy()

# '<group>_data' is 1 when the row has at least one non-null value in that feature group.
for group in ('demography', 'dailylog', 'lifelog', 'questionnaire'):
    metadata_init[f'{group}_data'] = (
        metadata_init[features_dict[group]].notnull().any(axis=1).astype(int)
    )

# Number of data types present in the row (demography is not counted).
metadata_init['dtype_n'] = (
    metadata_init['dailylog_data']
    + metadata_init['lifelog_data']
    + metadata_init['questionnaire_data']
)
move_column(metadata_init, 'dtype_n', 8)

# Binary label: 1 on rows with dbp == 0, otherwise 0 (NaN dbp also maps to 0).
metadata_init['panic_label'] = (metadata_init['dbp'] == 0).astype(int)

# Register the new columns in features_dict without creating duplicates on re-run.
for flag in ('dailylog_data', 'lifelog_data', 'questionnaire_data', 'dtype_n'):
    if flag not in features_dict['metadata']:
        features_dict['metadata'].append(flag)
if 'panic_label' not in features_dict['label']:
    features_dict['label'].append('panic_label')

# Optional manual inspection: set check_metadata to True to display and dump the
# rows whose '<check_type>_data' flag equals `check_for`.
check_metadata = False
if check_metadata:
    check_type = 'dailylog' # demography, dailylog, lifelog, questionnaire
    check_for = 1
    test = metadata_init[metadata_init[check_type+'_data'] == check_for].copy()
    test = test[features_dict['id']+features_dict['metadata']+features_dict[check_type]]
    print(f"--------- TEST {test.shape[0]} ENTRIES WITH {check_type} = {check_for} ---------")
    display(test.head(10))
    save_as_csv(test, TMP_PATH, f"metadata_{check_type}_{check_for}")
    print("------------------------------------------------------------------------")
    del test, check_type, check_for

#metadata_init = metadata_init[features_dict['id'] + features_dict['metadata'] + features_dict['demography'] + features_dict['label']]
display(metadata_init.head(5))

## Extract Demography Data

In [None]:
# For every demographic column, aggregate per ID both the number of distinct
# values ('<col>_n') and the first value ('<col>'). The spec is derived from
# features_dict so it always matches the columns checked below.
demography_cols = features_dict['demography']
agg_matrix = (
    [(f'{col}_n', col, 'nunique') for col in demography_cols]
    + [(col, col, 'first') for col in demography_cols]
)
demo_data = aggregate_by_column(metadata_init, 'ID', agg_matrix)

# A demographic column is inconsistent if any ID carries more than one distinct value.
non_unique_cols = [
    col for col in demography_cols
    if (demo_data[f'{col}_n'] > 1).any()
]
if non_unique_cols:
    raise ValueError(f"Demographic columns {non_unique_cols} are not unique for each ID in demo_data.")
print("All demographic columns are unique for each ID in demo_data.")

# The '<col>_n' helper columns were only needed for the check above.
for col in demography_cols:
    remove_columns(demo_data, [f'{col}_n'])
print(f"Number of rows in demo_data: {demo_data.shape[0]}")
display(demo_data.head(5))

save_as_csv(demo_data, OUTPUT_PATH+'/analysis', "demography")

## Construct Metadata

In [None]:
# Keep only the identifier, availability-flag and label columns. The subset is
# copied explicitly so the in-place column move below works on an independent frame.
metadata_cols = features_dict['id'] + features_dict['metadata'] + features_dict['label']
metadata = metadata_init[metadata_cols].copy()
move_column(metadata, 'severity', -1)

# Drop rows that have no daily-log, life-log or questionnaire data at all.
metadata = metadata[metadata['dtype_n'] > 0]
display(metadata.head(5))

## Filter Processed Data

In [None]:
def _print_dataset_summary(frame, tag):
    """Print entry, ID and panic-event counts for `frame`.

    Args:
        frame: DataFrame with 'dataset', 'ID' and 'dbp' columns.
        tag: Word inserted into the printed headings (e.g. "original").

    Returns:
        ((sym1_n, sym1_ids), (sym2_n, sym2_ids)): row count and unique IDs
        for the 'SYM1' and 'SYM2' datasets.
    """
    per_source = {
        source: frame.loc[frame['dataset'] == source]
        for source in ('SYM1', 'SYM2', 'PXPN')
    }
    n_rows = {source: subset.shape[0] for source, subset in per_source.items()}
    ids = {source: subset['ID'].unique() for source, subset in per_source.items()}

    print(f"Total entries in {tag}: ", frame.shape[0])
    print("    SYM entries:", n_rows['SYM1'] + n_rows['SYM2'])
    print("    PXPN entries:", n_rows['PXPN'])
    print(f"Number of unique IDs in {tag}:", len(frame['ID'].unique()))
    print("    SYM IDs: ", len(ids['SYM1']) + len(ids['SYM2']))
    print("    PXPN IDs: ", len(ids['PXPN']))
    print("Number of panic events (dbp=0):", frame[frame['dbp'] == 0].shape[0])
    return (n_rows['SYM1'], ids['SYM1']), (n_rows['SYM2'], ids['SYM2'])


# Snapshot of the data before filtering, used for the before/after comparison.
unfiltered_data = data.copy()
_print_dataset_summary(unfiltered_data, "original")

# remove rows with null dates
data = data[data['date'].notnull()]
# display(data.head(5))

# Same report for the filtered data; the heading now shows the row count.
(sym1_n, sym1_ids), (sym2_n, sym2_ids) = _print_dataset_summary(data, "filtered")

# Find IDs present in unfiltered_data but missing in filtered_data (i.e., lost after filtering)
missing_ids = np.setdiff1d(unfiltered_data['ID'].unique(), data['ID'].unique())
missing_data = unfiltered_data[unfiltered_data['ID'].isin(missing_ids)]
print(f"Number of IDs lost after filtering: {len(missing_ids)}")
_ = save_as_csv(missing_data, TMP_PATH, f"missing_{metadata_filename}")

### Calculate Days Before Panic Features

In [None]:
# delta_days = 3

# calc_metadata = metadata.copy()
# calc_metadata.sort_values(by=['ID', 'date'], ascending=False, inplace=True)

# calc_metadata['n_prior_data']    = None
# calc_metadata['ref_event_id']    = None
# move_column(calc_metadata, 'panic_label', -1)
# move_column(calc_metadata, 'severity', -1)

# display(calc_metadata.head(30))

In [None]:
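# NOTE(review): this cell is disabled, yet later cells read `delta_days` and the
# `last_panic_days` / `n_prior_data` / `ref_event_id` columns, which are only
# produced in commented-out code such as this (here on `calc_metadata`, not `metadata`).
# delta_days = 3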

# calc_metadata = metadata.copy()

# calc_metadata['last_panic_days'] = None
# calc_metadata['n_prior_data']    = None
# calc_metadata['ref_event_id']    = None
# move_column(calc_metadata, 'panic_label', -1)
# move_column(calc_metadata, 'severity', -1)

# for id in calc_metadata['ID'].unique():
#     id_data = calc_metadata[calc_metadata['ID'] == id].sort_values(by='date')
#     # Build a set of all observed dates for quick membership tests
#     date_set = set(id_data['date'])
    
#     last_panic_date = None
    
#     for idx, row in id_data.iterrows():
#         if row['panic_label'] == 1:
#             calc_metadata.loc[idx, 'ref_event_id'] = row['entry_id']
#             if last_panic_date is None:
#                 # This is the very first panic for this subject
#                 calc_metadata.loc[idx, 'last_panic_days'] = -1
#             else:
#                 days_diff = (row['date'] - last_panic_date).days
#                 calc_metadata.loc[idx, 'last_panic_days'] = days_diff
#             last_panic_date = row['date']
        
#         # Count how many consecutive prior days (up to 100) exist before this panic
#         # We look backward one day at a time. If we hit another panic, or a missing date, we break.
#         current_date = row['date']
#         found_n_prior = False

#         for j in range(1, 101):  # j = 1, 2, …, 100
#             look_date = current_date - pd.Timedelta(days=j)

#             if look_date not in date_set:
#                 # We encountered a missing day. That means only (j-1) consecutive prior days exist.
#                 calc_metadata.loc[idx, 'n_prior_data'] = j - 1
#                 found_n_prior = True
#                 break

#             # There *is* at least one row on look_date—grab it (or them)
#             rows_on_that_date = id_data[id_data['date'] == look_date]
#             if len(rows_on_that_date) > 1:
#                 raise ValueError(f"Warning: More than one row for date {look_date} for ID {row['ID']}")
#             # If ANY of those rows is a panic event (panic_label == 1), we stop and record (j-1):
#             if (rows_on_that_date['panic_label'] == 1).any():
#                 calc_metadata.loc[idx, 'n_prior_data'] = j - 1
#                 found_n_prior = True
#                 break

#             if j in range(1, delta_days + 1):
#                 # Set the 'ref_event_id' to event_id for all the rows for rows_on_that_date if we are within the delta_days
#                 calc_metadata.loc[rows_on_that_date.index, 'ref_event_id'] = row['entry_id']

#         if not found_n_prior:
#             # We never hit a prior panic or missing day within 100 days → cap at 100
#             calc_metadata.loc[idx, 'n_prior_data'] = 100

# display(calc_metadata.head(30))

In [None]:
# panic_ids = metadata[metadata['dbp'] == 0]['ID'].unique()

# for panic_id in panic_ids:
#     # Extract only rows for this subject, sorted by date ascending
#     df = metadata.loc[metadata['ID'] == panic_id].sort_values('date', ascending=True)

#     # Build a set of all observed dates for quick membership tests
#     date_set = set(df['date'])

#     last_panic_date = None

#     for idx, row in df.iterrows():

#         # Compute last_panic_days
#         if last_panic_date is None:
#             # This is the very first panic for this subject
#             metadata.loc[idx, 'last_panic_days'] = 100
#         else:
#             days_diff = (row['date'] - last_panic_date).days
#             metadata.loc[idx, 'last_panic_days'] = days_diff

#         # Count how many consecutive prior days (up to 100) exist before this panic
#         # We look backward one day at a time. If we hit another panic, or a missing date, we break.
#         current_date = row['date']
#         found_n_prior = False

#         for j in range(1, 101):  # j = 1, 2, …, 100
#             look_date = current_date - pd.Timedelta(days=j)

#             if look_date not in date_set:
#                 # We encountered a missing day. That means only (j-1) consecutive prior days exist.
#                 metadata.loc[idx, 'n_prior_data'] = j - 1
#                 found_n_prior = True
#                 break

#             # There *is* at least one row on look_date—grab it (or them)
#             rows_on_that_date = df[df['date'] == look_date]
#             if len(rows_on_that_date) > 1:
#                 print(f"Warning: More than one row for date {look_date} for ID {panic_id}")
#             # If ANY of those rows had panic==2, we stop and record (j-1):
#             if (rows_on_that_date['panic'] == 2).any():
#                 metadata.loc[idx, 'n_prior_data'] = j - 1
#                 found_n_prior = True
#                 break

#             if j in range(1, delta_days + 1):
#                 # Set the 'ref_event_id' to event_id for all the rows for rows_on_that_date if we are within the delta_days
#                 metadata.loc[rows_on_that_date.index, 'ref_event_id'] = event_id
  
#         if not found_n_prior:
#             # We never hit a prior panic or missing day within 100 days → cap at 100
#             metadata.loc[idx, 'n_prior_data'] = 100

#         # update last_panic_date so that the next panic (if any) calculates correctly
#         last_panic_date = row['date']

In [None]:
# Halt "Run All" here with an explicit message rather than a bare undefined name.
raise RuntimeError(
    "Stopping here: the cells below use columns ('event_id', 'ref_event_id', "
    "'last_panic_days', 'n_prior_data') and names (delta_days, all_cols) "
    "that no active cell in this notebook creates."
)

In [None]:
# Per-ID counts: number of entries and number of panic events.
# The raw 'panic' column is removed when 'dbp' is derived and is not part of
# `metadata`; panic events (originally panic == 2, i.e. dbp == 0) are therefore
# counted through 'panic_label' == 1.
agg_matrix = [
    ('n_entries', 'entry_id', 'count'),
    ('n_panic_2', 'panic_label', lambda x: (x == 1).sum())
]

metadata_ljy_agg = aggregate_by_column(metadata, 'ID', agg_matrix)
display(metadata_ljy_agg.head(5))

In [None]:
# Find all IDs that ever had panic==2 (at least one panic event in the per-ID counts)
panic_ids = metadata_ljy_agg.loc[
    metadata_ljy_agg['n_panic_2'] > 0, 'ID'
].unique()
# Total number of panic events over all IDs; also used by later cells.
n_panic_2 = int(metadata_ljy_agg['n_panic_2'].sum())
print("Unique IDs with panic events (panic=2):", len(panic_ids))
print(f"Number of panic events (panic=2): {n_panic_2}")
print("--------------------------------------")
plot_histogram_of_counts(metadata_ljy_agg['n_panic_2'], title="Histogram of Panic Events per ID", xlabel="Number of Panic Events")

In [None]:
# Per reference event: number of linked rows vs. number of distinct dates.
# NOTE(review): 'ref_event_id' is only assigned in commented-out cells of this
# notebook — confirm `metadata` actually carries that column before running.
agg_matrix = [
    ('n_entries', 'ref_event_id', 'count'),
    ('n_dates', 'date', 'nunique'),
]

metadata_agg = aggregate_by_column(metadata, 'ref_event_id', agg_matrix)
#display(metadata_agg.head(5))

# Reference events whose row count differs from their distinct-date count.
check = metadata_agg.loc[metadata_agg['n_entries'].ne(metadata_agg['n_dates'])]
#print("Entries where n_entries != n_dates:")
#display(check)

In [None]:
# Filter down to only those rows we marked as panic events (non-null 'event_id').
# NOTE(review): 'event_id', 'last_panic_days', 'n_prior_data' and `delta_days`
# are not created by any active cell of this notebook — confirm their source.
panic_data = metadata[['ID', 'date', 'event_id', 'last_panic_days', 'n_prior_data', 'severity']].copy()
panic_data = panic_data[panic_data['event_id'].notnull()]

# Keep events that are more than `delta_days` after the previous panic, have at
# least `delta_days` of prior data, and have a recorded severity.
panic_data = panic_data[
    (panic_data['last_panic_days'] > delta_days) &
    (panic_data['n_prior_data'] >= delta_days) &
    (panic_data['severity'].notnull())
]

# Shares are NaN instead of raising ZeroDivisionError when there is nothing to compare against.
n_qualifying_events = panic_data.shape[0]
n_qualifying_ids = len(panic_data['ID'].unique())
event_share = n_qualifying_events / n_panic_2 if n_panic_2 else float('nan')
id_share = n_qualifying_ids / len(panic_ids) if len(panic_ids) else float('nan')

print(f"-------- last_panic_days > {delta_days} & n_prior_data ≥ {delta_days} --------")
print(f"Number of qualifying panic events: {n_qualifying_events} out of {n_panic_2} ({event_share:.2%})")
print(f"Unique IDs with panic events: {n_qualifying_ids} out of {len(panic_ids)} ({id_share:.2%})")

display(panic_data.head(5))

In [None]:
# Rows that either are an event themselves or are linked to a reference event.
has_event_link = metadata['ref_event_id'].notnull() | metadata['event_id'].notnull()
filtered_metadata = metadata[has_event_link].copy()

# Of those, keep the rows whose reference event passed the panic_data filter.
qulifying_event_ids = panic_data['event_id'].unique()
qualifying_metadata = filtered_metadata.loc[
    filtered_metadata['ref_event_id'].isin(qulifying_event_ids)
]
display(qualifying_metadata.head(10))
# NOTE(review): `all_cols` is not defined in any visible cell — confirm where it comes from.
print(f"Expected number of columns: {len(all_cols) * 3}")

# Compact view of the event-linking columns for a visual check.
display(
    filtered_metadata[
        ['ID', 'date', 'event_id', 'ref_event_id', 'last_panic_days', 'n_prior_data']
    ].copy().head(50)
)

In [None]:
# Per qualifying reference event: number of linked rows vs. distinct dates.
agg_matrix = [
    ('n_entries', 'ref_event_id', 'count'),
    ('n_dates', 'date', 'nunique'),
]

qualifying_metadata_agg = aggregate_by_column(qualifying_metadata, 'ref_event_id', agg_matrix)
#display(filtered_metadata_agg.head(5))
plot_histogram_of_counts(
    qualifying_metadata_agg['n_entries'],
    title="Histogram of Entries per Ref Event ID",
    xlabel="Number of Entries",
    bins_step=1,
)

# Reference events whose row count differs from their distinct-date count.
check = qualifying_metadata_agg.loc[
    qualifying_metadata_agg['n_entries'].ne(qualifying_metadata_agg['n_dates'])
]
print("Entries where n_entries != n_dates:")
display(check)

In [None]:
# Distinct values of the 'severity' column; the printed label names the column actually read.
unique_panic = metadata['severity'].unique()
print(f"\nUnique values in 'severity': {unique_panic}")