In [None]:
import config as cfg
import logging

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Raise the 'matplotlib' logger threshold to WARNING so its DEBUG/INFO records are not emitted.
logging.getLogger('matplotlib').setLevel(logging.WARNING)

In [None]:
# Relative path of the CSV file that is read into `metadata_ljy`.
METADATA_LJY_PATH = "../_data/all_data_with_severity.csv"

In [None]:
# Names of the columns the metadata CSV is expected to provide, split into four groups.

# Demographic columns.
demo_cols = [
    'gender', 'age', 'marriage', 'job',
    'smkHx', 'drinkHx', 'suicideHx',
]

# Daily-log columns.
dailylog_cols = [
    'panic', 'severity',
    'exercise', 'alcohol', 'coffee', 'menstruation', 'smoking',
    'positive_feeling', 'negative', 'positive_E', 'negative_E',
    'anxiety', 'annoying',
]

# Life-log columns (HR_*, bandpower, steps and SLT*/sleep columns).
lifelog_cols = [
    'HR_var', 'HR_max', 'HR_mean', 'HR_hvar_mean',
    'HR_acrophase', 'HR_amplitude', 'HR_mesor',
    'HR_acrophase_difference', 'HR_acrophase_difference_2d',
    'HR_amplitude_difference', 'HR_amplitude_difference_2d',
    'HR_mesor_difference', 'HR_mesor_difference_2d',
    'bandpower(0.001-0.0005Hz)', 'bandpower(0.0005-0.0001Hz)',
    'bandpower(0.0001-0.00005Hz)', 'bandpower(0.00005-0.00001Hz)',
    'steps',
    'SLT1', 'SLT2', 'SLT3', 'SLT4', 'SLT5', 'SLT6',
    'total_sleep',
]

# Questionnaire columns.
questionnaire_cols = [
    'PHQ_9', 'STAI_X2', 'CSM',
    'CTQ_1', 'CTQ_2', 'CTQ_3', 'CTQ_4', 'CTQ_5',
    'KRQ', 'MDQ', 'ACQ',
    'APPQ_1', 'APPQ_2', 'APPQ_3',
    'BSQ', 'GAD_7', 'BRIAN',
]

# Every expected column, in group order.
all_cols = [*demo_cols, *dailylog_cols, *lifelog_cols, *questionnaire_cols]

In [None]:
metadata_ljy = pd.read_csv(METADATA_LJY_PATH)

# Report every expected column that the loaded CSV does not contain.
present_cols = set(metadata_ljy.columns)
missing_cols = list(filter(lambda name: name not in present_cols, all_cols))
if len(missing_cols) > 0:
    print(f"Missing columns in metadata_ljy: {missing_cols}")

# Parse the 'date' strings (YYYY-MM-DD) into datetime values.
metadata_ljy['date'] = pd.to_datetime(metadata_ljy['date'], format='%Y-%m-%d')

In [None]:
# Report the frame's dimensions, then preview its first five rows.
n_rows, n_columns = metadata_ljy.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_columns}")
display(metadata_ljy.head(5))

In [None]:
# Add 'dataset' column: source of data.
# The source is the part of the ID that precedes the first '_' or '-'.
id_prefix = metadata_ljy['ID'].str.split('_').str[0]
metadata_ljy['dataset'] = id_prefix.str.split('-').str[0]
unique_dataset = metadata_ljy['dataset'].unique()

separator = "--------------------------------------"
dataset_labels = metadata_ljy['dataset']

print("Unique sources in metadata_ljy:")
print(unique_dataset)
print(separator)

# Row counts overall and per source; SYM1 and SYM2 are reported together.
print("Total number of entries:", len(metadata_ljy))
pxpn_n = int((dataset_labels == 'PXPN').sum())
print("    PXPN dataset:", pxpn_n)
sym1_n = int((dataset_labels == 'SYM1').sum())
sym2_n = int((dataset_labels == 'SYM2').sum())
print("    SYM dataset:", sym1_n + sym2_n)
print(separator)

# Number of distinct IDs and number of rows flagged with panic == 2.
unique_ids = metadata_ljy['ID'].unique()
print("Number of unique IDs in metadata_ljy:", len(unique_ids))
n_panic_2 = int((metadata_ljy['panic'] == 2).sum())
print("Number of panic events (panic=2):", n_panic_2)

display(metadata_ljy.head(5))

In [None]:
from library.pandas_utils import aggregate_by_column

# Aggregation spec handed to `aggregate_by_column`.
# NOTE(review): each tuple presumably reads (output column, source column, aggregation);
# confirm against library.pandas_utils.aggregate_by_column.
agg_matrix = [
    ('n_entries', 'ID', 'count'),
    ('n_dates', 'date', 'nunique'),
    # Number of rows whose 'panic' value equals 2.
    ('n_panic_2', 'panic', lambda panic_values: (panic_values == 2).sum()),
]

# Aggregate by 'ID' and preview the first rows of the result.
metadata_ljy_agg = aggregate_by_column(metadata_ljy, 'ID', agg_matrix)
display(metadata_ljy_agg.head(5))

In [None]:
def plot_histogram_of_counts(column, title=None, xlabel=None, ylabel="Frequency", bins_step=5):
    """
    Plot a histogram of the strictly positive values in a pandas Series.

    Entries that are not greater than zero are dropped before plotting. If
    nothing remains, a message is printed and no figure is drawn. Bins are one
    unit wide and span the smallest to the largest remaining value (both
    truncated to int).

    Parameters
    ----------
    column : pandas.Series
        Values to plot; ``column.name`` supplies the default title and x label.
    title : str, optional
        Figure title. Defaults to ``'Histogram of <column.name>'``.
    xlabel : str, optional
        X-axis label. Defaults to ``column.name``.
    ylabel : str, optional
        Y-axis label.
    bins_step : int, optional
        Spacing between x-axis tick marks. Despite the name it does not affect
        the bin width, which is always 1.

    Raises
    ------
    ValueError
        If there is data to plot and ``bins_step`` is not positive.
    """
    data = column[column > 0]
    if data.empty:
        print("No nonzero values to plot.")
        return

    # A zero step makes np.arange fail and a negative one yields no ticks at all,
    # so reject both up front with an explicit message.
    if bins_step <= 0:
        raise ValueError(f"bins_step must be positive, got {bins_step!r}")

    min_val = int(data.min())
    max_val = int(data.max())
    # One bin per integer; the extra edge at max_val + 1 closes the last bin so max_val is counted.
    bins = np.arange(min_val, max_val + 2)

    # `plt` is the matplotlib.pyplot module imported in the notebook's first cell.
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.hist(data, bins=bins, color='blue', alpha=0.7)
    ax.set_title(title or f'Histogram of {column.name}')
    ax.set_xlabel(xlabel or column.name)
    ax.set_ylabel(ylabel)
    ax.grid(axis='y', alpha=0.75)
    # Ticks always start at 0, independent of the smallest plotted value.
    ax.set_xticks(np.arange(0, max_val + 1, bins_step))
    fig.tight_layout()
    plt.show()

In [None]:
# Recap the overall number of panic rows, then show how many each ID contributes.
print(f"Number of panic events (panic=2): {n_panic_2}")
plot_histogram_of_counts(
    metadata_ljy_agg['n_panic_2'],
    title="Histogram of Panic Events per ID",
    xlabel="Number of Panic Events",
)

In [None]:
from library.pandas_utils import find_unique_row

# Threshold (in days) used for both filters below: minimum gap since the previous
# panic event, and minimum number of consecutive days of data before an event.
delta_days = 3
# Gap value assigned to an ID's first panic event, which has no earlier event to measure from.
FIRST_EVENT_GAP_DAYS = 100
# Upper bound on how many consecutive prior days are counted for 'n_prior_data'.
MAX_PRIOR_DAYS = 100

panic_ids = metadata_ljy_agg[metadata_ljy_agg['n_panic_2'] > 0]['ID'].unique()
print("Unique IDs with panic events (panic=2):", len(panic_ids))
print("--------------------------------------")

# Work on a copy so the loaded frame is left untouched; the new columns stay
# None on every row that is not a panic event.
metadata = metadata_ljy.copy()
metadata['last_panic_days'] = None
metadata['n_prior_data'] = None
metadata['event_id'] = None
metadata['ref_event_id'] = None

one_day = pd.Timedelta(days=1)
for panic_id in panic_ids:
    df = metadata[metadata['ID'] == panic_id].sort_values(by='date', ascending=True)
    # Dates with a row for this ID, collected once for constant-time membership tests.
    recorded_dates = set(df['date'])
    last_panic_date = None
    for i, row in df.iterrows():
        if row['panic'] != 2:
            continue
        event_date = row['date']
        metadata.loc[i, 'event_id'] = row['ID'] + '_' + str(event_date.date())

        # Days elapsed since this ID's previous panic event.
        if last_panic_date is None:
            metadata.loc[i, 'last_panic_days'] = FIRST_EVENT_GAP_DAYS
        else:
            metadata.loc[i, 'last_panic_days'] = (event_date - last_panic_date).days
        last_panic_date = event_date

        # Count the consecutive days immediately before the event that have data.
        # The count is always stored, including when all MAX_PRIOR_DAYS days are
        # present; previously that case left the value as None and the event was
        # dropped by the n_prior_data filter below.
        n_prior = 0
        while n_prior < MAX_PRIOR_DAYS and (event_date - (n_prior + 1) * one_day) in recorded_dates:
            n_prior += 1
        metadata.loc[i, 'n_prior_data'] = n_prior

gap_label = f"last_panic_days > {delta_days}"
prior_label = f"n_prior_data >= {delta_days}"

# Stage 1: every annotated panic event.
panic_events = metadata[metadata['event_id'].notnull()]
print(f"Number of panic events: {panic_events.shape[0]}")
plot_histogram_of_counts(
    metadata['last_panic_days'],
    title="Histogram of Days Since Last Panic Event (All Data)",
    xlabel="Days Since Last Panic Event",
    bins_step=5,
)

# Stage 2: keep events that are more than `delta_days` after the previous one.
spaced_events = panic_events[panic_events['last_panic_days'] > delta_days]
print(f"Number of panic events with last_panic_days > {delta_days}:", spaced_events.shape[0])
plot_histogram_of_counts(
    spaced_events['last_panic_days'],
    title=f"Histogram of Days Since Last Panic Event ({gap_label})",
    xlabel="Days Since Last Panic Event",
    bins_step=5,
)

# Stage 3: additionally require at least `delta_days` consecutive prior days of data.
filtered_metadata = spaced_events[spaced_events['n_prior_data'] >= delta_days]
print(f"Number of panic events with n_prior_data >= {delta_days}:", filtered_metadata.shape[0])
plot_histogram_of_counts(
    filtered_metadata['last_panic_days'],
    title=f"Histogram of Days Since Last Panic Event ({gap_label} & {prior_label})",
    xlabel="Days Since Last Panic Event",
    bins_step=5,
)
print("--------------------------------------")
filtered_ids = filtered_metadata['ID'].unique()
print(f"Unique IDs with panic events ({gap_label} & {prior_label}):", len(filtered_ids))
print("--------------------------------------")

In [None]:
# Re-aggregate per ID using only the rows kept in `filtered_metadata`, then plot
# how many of those panic events each ID contributes.
filtered_metadata_agg = aggregate_by_column(filtered_metadata, 'ID', agg_matrix)
plot_histogram_of_counts(
    filtered_metadata_agg['n_panic_2'],
    title="Histogram of Panic Events per ID (last_panic_days > 3 & n_prior_data >= 3)",
    bins_step=1,
)

In [None]:
from library.pandas_utils import find_unique_row

In [None]:
# List the distinct values stored in the 'severity' column.
# The printed label now names the column that is actually read; the variable keeps
# the name `unique_panic` in case later cells reference it.
unique_panic = metadata_ljy['severity'].unique()
print(f"\nUnique values in 'severity': {unique_panic}")