# Panic Project (DHLAB) - Multiclass Classification PyCaret Model for Panic Severity Prediction

author:  `@cyshin971`  

date:    `2025-06-xx`  

version: `1-0`

In [None]:
# Notebook version tag; it is interpolated into the file name of the processed-data CSV saved further down.
version = "1-0"

# 📚 | Import Libraries 

In [None]:
import config as cfg
import logging

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from library.pandas_utils import move_column, remove_columns, create_empty_df, read_csv
from library.text_utils import save_as_csv
from library.json_utils import save_dict_to_file, load_dict_from_file
from library.path_utils import get_file_path

from pycaret.classification import *

# 📁 | Path Variables 

In [None]:
# Directory constants for this notebook.
DATA_PATH = "./_data"  # not referenced by the visible cells of this notebook
TMP_PATH = "./cys/_tmp"  # only referenced by a commented-out debug export
OUTPUT_PATH = "./cys/_output"  # preprocessed inputs are read from here and the processed dataset is written here

# 🌐 | Global Variables

In [None]:
class OUTPUT:
    """Label definitions for the three-class panic severity target.

    Holds the class names, the integer-label <-> name lookups, and the
    mapping that collapses the raw 1-5 severity score into a class name.
    """

    # Ordered class names; a name's position in the list is its integer label.
    class_names = ['Mild', 'Moderate', 'Severe']
    num_classes = 3

    # Integer label <-> class name lookups.
    label2name = {idx: name for idx, name in enumerate(class_names)}
    name2label = {name: idx for idx, name in enumerate(class_names)}

    # Raw severity score -> class name.
    output_dict = {
        1: 'Mild',
        2: 'Mild',
        3: 'Moderate',
        4: 'Severe',
        5: 'Severe',
    }

    # Class name -> raw score. Several scores share a name, so only the last
    # score listed for each name survives (Mild -> 2, Severe -> 5).
    output_dict_inv = dict(zip(output_dict.values(), output_dict.keys()))

    @staticmethod
    def get_label_name(label):
        """Return the class name for an integer label (KeyError if unknown)."""
        return OUTPUT.label2name[label]

    @staticmethod
    def get_label_from_name(name):
        """Return the integer label for a class name (KeyError if unknown)."""
        return OUTPUT.name2label[name]

# ⚒️ | Preprocessed Data

In [None]:
# Load the feature dictionary and the preprocessed tables this notebook consumes.
try:
    features_dict = load_dict_from_file(OUTPUT_PATH, 'panic_features_dict')
except FileNotFoundError as err:
    # Re-raise with a hint about which notebook produces the file, keeping the original cause attached.
    raise FileNotFoundError(f"File not found: {get_file_path(OUTPUT_PATH, 'panic_features_dict')}. Please run data_analysis.ipynb first.") from err
print(f"Loaded features dict with {len(features_dict)} keys:")

# Start all three from None so that a value left in the kernel by an earlier
# run cannot be picked up silently when the key is absent from the dict.
scraped_data_filename = None
preproc_version = None
analysis_version = None
for k, v in features_dict.items():
    if k == 'scraped_data_filename':
        print(f"  {k}: {v}.csv")
        scraped_data_filename = v
    elif k == 'preproc_version':
        preproc_version = v
    elif k == 'analysis_version':
        analysis_version = v
    else:
        print(f'{k}: {v}')

# All three values are interpolated into the file names read below, so fail
# early with a clear message if any of them is missing.
missing_keys = [
    name
    for name, value in (
        ('scraped_data_filename', scraped_data_filename),
        ('preproc_version', preproc_version),
        ('analysis_version', analysis_version),
    )
    if value is None
]
if missing_keys:
    raise ValueError(f"{', '.join(missing_keys)} not found in features_dict")

# File names embed the producing step's version and the scraped-data file name.
pre_data = read_csv(get_file_path(OUTPUT_PATH, f'panic_pre_data_filled_{analysis_version}({scraped_data_filename}).csv'))
display(pre_data.head(5))
metadata = read_csv(get_file_path(OUTPUT_PATH, f'panic_metadata_{preproc_version}({scraped_data_filename}).csv'))
display(metadata.head(5))
demography_data = read_csv(get_file_path(OUTPUT_PATH, f'panic_demography_data_{preproc_version}({scraped_data_filename}).csv'))
display(demography_data.head(5))
patient_data = read_csv(get_file_path(OUTPUT_PATH, f'panic_patient_analysis_{analysis_version}({scraped_data_filename}).csv'))
display(patient_data.head(5))

# 

# 🔄️ | Data Processing

In [None]:
# Look-back window: the number of days of data before each panic event that is used.
dbp_param = 3

# Reset this cell's outputs up front; each one is reassigned below.
# NOTE(review): presumably so that a failure part-way through cannot leave
# frames from an earlier run in the kernel - confirm.
filtered_metadata = create_empty_df()
filtered_pre_data = create_empty_df()
proc_data_init = create_empty_df()

# Panic entries, then the ids of those flagged valid for this look-back window.
filtered_panic_metadata = metadata[metadata['panic_label'] == 1].copy()
print(f"Found {len(filtered_panic_metadata)} entries with panic label.")
filtered_panic_metadata_entry_ids = filtered_panic_metadata[filtered_panic_metadata[f'valid_entry_{dbp_param}'] == 1]['entry_id'].unique()

# Metadata rows of the selected panic entries; these become the rows of the modelling table.
proc_data_init = metadata[metadata['entry_id'].isin(filtered_panic_metadata_entry_ids)].copy()

print(f"Found {len(filtered_panic_metadata_entry_ids)} entries with panic label and at least {dbp_param} days of prior data.")
# Metadata rows that point at a selected panic entry through 'ref_event_id'.
filtered_metadata = metadata[metadata['ref_event_id'].isin(filtered_panic_metadata_entry_ids)].copy()
print(f"Filtered metadata contains {len(filtered_metadata)} entries with panic label and at least {dbp_param} days of prior data.")
unique_dbp = filtered_metadata['dbp'].unique()
# The merge loop further down visits dbp = 1..dbp_param, so the number of
# distinct dbp values has to follow dbp_param rather than a fixed 3.
if len(unique_dbp) != dbp_param:
    raise ValueError(f"Expected {dbp_param} unique DBP values, found {len(unique_dbp)}: {unique_dbp}")
del filtered_panic_metadata, filtered_panic_metadata_entry_ids, unique_dbp

# Pull the matching rows from pre_data and check they line up one-to-one by count.
filtered_entry_ids = filtered_metadata['entry_id'].unique()
filtered_panic_entry_ids = filtered_metadata['ref_event_id'].unique()
filtered_pre_data = pre_data[pre_data['entry_id'].isin(filtered_entry_ids)].copy()
if len(filtered_pre_data) != len(filtered_metadata):
    raise ValueError(f"Filtered pre_data length {len(filtered_pre_data)} does not match filtered_metadata length {len(filtered_metadata)}")
print(f"Filtered data contains {len(filtered_panic_entry_ids)} unique panic events and {len(filtered_entry_ids)} unique entry IDs.")
print(f"Filtered pre_data contains {len(filtered_pre_data['ID'].unique())} unique IDs.")
del filtered_entry_ids

# Keep only the identifier and label columns; features are merged in by the next cell.
proc_data_init = proc_data_init[features_dict['id']+features_dict['label']].copy()
print(f"Initial processed data contains {len(proc_data_init)} entries with {len(proc_data_init.columns)} columns.")
display(proc_data_init.head(5))

In [None]:
proc_data_int = create_empty_df()
proc_data_int = proc_data_init.copy()

# 'severity' is the prediction target, so drop it from features_dict['dailylog']
features_dict['dailylog'] = [name for name in features_dict['dailylog'] if name != 'severity']

# Attach the demographic columns by patient ID (an ID can own several rows of proc_data_int)
proc_data_int = proc_data_int.merge(demography_data, on='ID', how='left')
print(f"Processed data after merging with demography data contains {len(proc_data_int)} entries with {len(proc_data_int.columns)} columns.")

for i in range(1, dbp_param + 1):
    # 'entry_id' -> 'ref_event_id' for the rows recorded i days before a panic event
    dbp_dict = filtered_metadata[filtered_metadata['dbp'] == i].set_index('entry_id')['ref_event_id'].to_dict()
    print(f"Processing data for {i} days before panic.")

    entry_ids = list(dbp_dict)
    filtered_pre_data_i = filtered_pre_data[filtered_pre_data['entry_id'].isin(entry_ids)].copy()
    if len(filtered_pre_data_i) != len(dbp_dict):
        raise ValueError(f"Filtered pre_data length {len(filtered_pre_data_i)} does not match filtered_metadata length {len(dbp_dict)} for {i} days before panic")
    # Re-key every row by the panic event it precedes so it can be joined onto that event's row
    filtered_pre_data_i['entry_id'] = filtered_pre_data_i['entry_id'].map(dbp_dict)

    # Daily-log and life-log features are taken for every day; questionnaire
    # features only for the day at the far end of the window (i == dbp_param)
    features_list = ['entry_id', *features_dict['dailylog'], *features_dict['lifelog']]
    if i == dbp_param:
        features_list.extend(features_dict['questionnaire'])
    filtered_pre_data_i = filtered_pre_data_i[features_list].copy()

    # Suffix every column except the join key with the day offset
    rename_map = {c: f"{c}_{i}" for c in filtered_pre_data_i.columns if c != 'entry_id'}
    filtered_pre_data_i = filtered_pre_data_i.rename(columns=rename_map)

    proc_data_int = proc_data_int.merge(filtered_pre_data_i, on='entry_id', how='left', suffixes=('', f'_{i}'))

# Collapse the raw severity score into its class name via OUTPUT.output_dict
proc_data_int['severity'] = proc_data_int['severity'].map(OUTPUT.output_dict)

# save_as_csv(proc_data_int, TMP_PATH, f'proc_data_{dbp_param}days', index=False)
# display(proc_data_int.head(5))

In [None]:
# Build the final modelling table from the merged frame and write it to OUTPUT_PATH.
proc_data = create_empty_df()
proc_data = proc_data_int.copy()

# Columns dropped before modelling.
r_cols = ['panic',
          'dbp',
          'panic_label']
# NOTE(review): the return values of remove_columns / move_column are discarded,
# so both helpers presumably modify proc_data in place - confirm in library.pandas_utils.
remove_columns(proc_data, r_cols)
# Position -1: presumably moves the target column to the end - confirm.
move_column(proc_data, 'severity', -1)
display(proc_data.head(5))
# The file name records the look-back window, the notebook version and the scraped-data file name.
save_as_csv(proc_data, OUTPUT_PATH, f'panic_severity_multi_proc_data_{dbp_param}days_{version}({scraped_data_filename})', index=False)

In [None]:
# Class-balance check: row counts per severity class for each value of the 'dataset' column, with 'Total' margins.
pd.crosstab(proc_data['severity'], proc_data['dataset'], margins=True, margins_name='Total')

# 🤖 | Modeling

In [None]:
# Modelling frame: a copy of proc_data without the identifier columns listed in features_dict['id'].
data = proc_data.copy()
# NOTE(review): return value discarded, so remove_columns presumably edits `data` in place - confirm.
remove_columns(data, features_dict['id'])
display(data.head(5))

In [None]:
# 2. Initialize the PyCaret experiment: defines the train/hold-out split, the
#    preprocessing steps and the cross-validation scheme used by every later step.
clf = setup(
    data=data,
    target='severity',           # class name produced by mapping through OUTPUT.output_dict
    session_id=123,              # for reproducibility
    normalize=True,              # scale numeric features
    transformation=False,        # turn off power transformation
    train_size=0.8,              # 80/20 split
    fold=5,                      # 5-fold cross-validation
    fold_strategy='stratifiedkfold',
    numeric_imputation='mean',
    remove_multicollinearity=True,   # for small datasets, this is often helpful
	multicollinearity_threshold=0.9, # threshold for removing multicollinear features
	# html=False,                # do not generate HTML report (use plain-text output)
    verbose=True
)

In [None]:
# 3. Compare baseline models and select the best by Accuracy
best_model = compare_models(sort='Accuracy')

# pull() returns the most recently produced score grid, here the compare_models leaderboard.
results = pull()  # Get the latest output table as a DataFrame
# Cross-Validation results
print("Cross-Validation Results:")
display(results)  # Jupyter display (can further style if you want)

In [None]:
# 4. Evaluate on hold-out set (20% test split)
holdout_results = predict_model(best_model)

# Strip the day-suffixed input features from the displayed table. Daily-log and
# life-log columns exist for every day offset; questionnaire columns only for
# the last one (i == dbp_param).
for i in range(1, dbp_param + 1):
    per_day_features = features_dict['dailylog'] + features_dict['lifelog']
    if i == dbp_param:
        per_day_features = per_day_features + features_dict['questionnaire']
    for col in per_day_features:
        remove_columns(holdout_results, [f"{col}_{i}"])

print("Hold-out set performance:")
display(holdout_results)

In [None]:
# 5. Train a LightGBM classifier explicitly, regardless of which estimator
#    compare_models ranked first.
model = create_model('lightgbm')

In [None]:
# 6. Tune the LightGBM hyperparameters, using Accuracy as the metric to optimize
tuned_model = tune_model(model, optimize='Accuracy')

In [None]:
# 7. Ensemble models (optional): blend the tuned LightGBM with the compare_models winner.
# NOTE(review): if best_model is itself a LightGBM, the blend holds two variants
# of the same learner - confirm this is intended.
blended_model = blend_models([tuned_model, best_model])

In [None]:
# 8. Finalize the model for deployment.
# NOTE(review): per the PyCaret docs, finalize_model refits on the full dataset
# including the hold-out rows, so hold-out metrics computed from final_model
# would no longer reflect unseen data - confirm against the installed version.
final_model = finalize_model(blended_model)

In [None]:
# 9. Use the finalized model on brand-new data
# new_data = pd.read_csv('new_data.csv')
# new_predictions = predict_model(final_model, data=new_data)

# 10. Save the finalized model for later use
# save_model(final_model, 'final_pycaret_multiclass_model')

# To load the saved model:
# loaded_model = load_model('final_pycaret_multiclass_model')

# 🚂 | Training

In [None]:
# s = setup(data, target = target_col)

# 📋 | Results

In [None]:
# best_model = compare_models()