# Panic Project (DHLAB) - Multiclass Classification PyCaret Model for Panic Severity Prediction

author:  `@cyshin971`  

date:    `2025-06-xx`  

version: `1-0`

In [1]:
# Version tag of this notebook; it is embedded in the file name of the
# processed-data CSV saved by a later cell.
version = "1-0"

# 📚 | Import Libraries 

In [2]:
import config as cfg
import logging

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from library.pandas_utils import move_column, remove_columns, create_empty_df, read_csv
from library.text_utils import save_as_csv
from library.json_utils import save_dict_to_file, load_dict_from_file
from library.path_utils import get_file_path

from pycaret.classification import *

# 📁 | Path Variables 

In [3]:
# Directory constants (relative paths, so they depend on the working directory).
DATA_PATH = "./_data"          # not referenced by any cell visible in this notebook
TMP_PATH = "./cys/_tmp"        # only referenced by a commented-out debug export
OUTPUT_PATH = "./cys/_output"  # preprocessing outputs are read from here; this notebook's CSV is written here

# 🌐 | Global Variables

In [4]:
class OUTPUT:
    """Namespace of label constants for the three-level panic-severity target.

    Holds the class names, the integer-label <-> class-name lookups, and the
    mapping that collapses raw severity scores 1-5 into the three class names.
    """

    class_names = ['Mild', 'Moderate', 'Severe']
    num_classes = len(class_names)

    # Integer label (position in class_names) -> class name, and the reverse.
    label2name = {idx: name for idx, name in enumerate(class_names)}
    name2label = {name: idx for idx, name in enumerate(class_names)}

    # Raw severity score -> class name.
    output_dict = {1: 'Mild', 2: 'Mild', 3: 'Moderate', 4: 'Severe', 5: 'Severe'}

    # Class name -> raw severity score. Several scores share one name, so only
    # the last score listed for each name is kept (Mild -> 2, Severe -> 5).
    output_dict_inv = dict((name, score) for score, name in output_dict.items())

    @staticmethod
    def get_label_name(label):
        """Return the class name for an integer label; raises KeyError if unknown."""
        return OUTPUT.label2name[label]

    @staticmethod
    def get_label_from_name(name):
        """Return the integer label for a class name; raises KeyError if unknown."""
        return OUTPUT.name2label[name]

# ⚒️ | Preprocessed Data

In [25]:
# Load the feature dictionary. Besides the per-group feature lists it carries
# the dataset name and the two version tags used to build the file names below.
try:
    features_dict = load_dict_from_file(OUTPUT_PATH, 'panic_features_dict')
except FileNotFoundError as err:
    # Chain the original error so the underlying details stay in the traceback.
    raise FileNotFoundError(f"File not found: {get_file_path(OUTPUT_PATH, 'panic_features_dict')}. Please run data_analysis.ipynb first.") from err
print(f"Loaded features dict with {len(features_dict)} keys:")

# Reset all three file-naming values up front: without this, a missing key
# would either raise a NameError further down or silently reuse a value left
# in the kernel by an earlier run of this cell.
scraped_data_filename = None
preproc_version = None
analysis_version = None
for k, v in features_dict.items():
    if k == 'scraped_data_filename':
        print(f"  {k}: {v}.csv")
        scraped_data_filename = v
    elif k == 'preproc_version':
        preproc_version = v
    elif k == 'analysis_version':
        analysis_version = v
    else:
        print(f'{k}: {v}')

# Fail early, with a message naming the missing key, before any file is read.
if scraped_data_filename is None:
    raise ValueError("scraped_data_filename not found in features_dict")
if preproc_version is None:
    raise ValueError("preproc_version not found in features_dict")
if analysis_version is None:
    raise ValueError("analysis_version not found in features_dict")

# Read the four input tables; file names combine a version tag with the dataset name.
pre_data = read_csv(get_file_path(OUTPUT_PATH, f'panic_pre_data_filled_{analysis_version}({scraped_data_filename}).csv'))
display(pre_data.head(5))
metadata = read_csv(get_file_path(OUTPUT_PATH, f'panic_metadata_{preproc_version}({scraped_data_filename}).csv'))
display(metadata.head(5))
demography_data = read_csv(get_file_path(OUTPUT_PATH, f'panic_demography_data_{preproc_version}({scraped_data_filename}).csv'))
display(demography_data.head(5))
patient_data = read_csv(get_file_path(OUTPUT_PATH, f'panic_patient_analysis_{analysis_version}({scraped_data_filename}).csv'))
display(patient_data.head(5))

# Size of each feature group (raises KeyError if a group is absent from the dict).
print(f"Number of Demographic Features: {len(features_dict['demography'])}")
print(f"Number of Daily Features: {len(features_dict['dailylog'])}")
print(f"Number of Life Log Features: {len(features_dict['lifelog'])}")
print(f"Number of Questionnaire Features: {len(features_dict['questionnaire'])}")

DEBUG - (json_utils.py) load_dict_from_file: Dictionary loaded successfully from C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output\panic_features_dict.json


Loaded features dict with 13 keys:
  scraped_data_filename: final_result_diary_20250617_03.csv
demography: ['gender', 'age', 'marriage', 'job', 'smkHx', 'drinkHx', 'suicideHx', 'suicide_need']
dailylog: ['severity', 'exercise', 'alcohol', 'coffee', 'menstruation', 'smoking', 'positive_feeling', 'negative_feeling', 'positive_E', 'negative_E', 'anxiety', 'annoying']
lifelog: ['HR_var', 'HR_max', 'HR_mean', 'HR_hvar_mean', 'HR_acrophase', 'HR_amplitude', 'HR_mesor', 'HR_acrophase_difference', 'HR_acrophase_difference_2d', 'HR_amplitude_difference', 'HR_amplitude_difference_2d', 'HR_mesor_difference', 'HR_mesor_difference_2d', 'bandpower(0.001-0.0005Hz)', 'bandpower(0.0005-0.0001Hz)', 'bandpower(0.0001-0.00005Hz)', 'bandpower(0.00005-0.00001Hz)', 'steps', 'SLT1', 'SLT2', 'SLT3', 'SLT4', 'SLT5', 'SLT6', 'total_sleep', 'steps_maximum', 'steps_mean', 'step_hvar_mean', 'step_delta', 'step_max_delta', 'step_mean_delta', 'step_hvar_mean_delta', 'step_delta2', 'step_max_delta2', 'step_mean_delta2

Unnamed: 0,entry_id,dataset,ID,date,panic,PHQ_9,STAI_X2,CSM,CTQ_1,CTQ_2,...,SLT1,SLT2,SLT3,SLT4,SLT5,SLT6,total_sleep,severity,dbp,panic_label
0,PXPN_10006_2024-11-04,PXPN,PXPN_10006,2024-11-04,0.0,0.0,32.0,31.0,11.0,13.0,...,,,,,,,6.95,,,0
1,PXPN_10006_2024-11-05,PXPN,PXPN_10006,2024-11-05,0.0,0.0,32.0,31.0,11.0,13.0,...,0.0,4.47,3.62,4.67,0.65,1.85,15.26,,,0
2,PXPN_10006_2024-11-06,PXPN,PXPN_10006,2024-11-06,1.0,0.0,32.0,31.0,11.0,13.0,...,0.0,0.0,0.2,4.07,1.43,1.68,7.38,,1.0,0
3,PXPN_10006_2024-11-07,PXPN,PXPN_10006,2024-11-07,2.0,0.0,32.0,31.0,11.0,13.0,...,0.0,0.0,0.14,5.08,0.0,0.97,6.19,1.0,0.0,1
4,PXPN_10006_2024-11-08,PXPN,PXPN_10006,2024-11-08,0.0,0.0,32.0,31.0,11.0,13.0,...,,,,,,,6.95,,,0


Unnamed: 0,entry_id,ID,date,dataset,coffee,smoking,total_sleep,dailylog_data,lifelog_data,questionnaire_data,...,diary_data,dbp,panic,n_prior_data,valid_entry_3,valid_entry_2,valid_entry_1,ref_event_id,panic_label,severity
0,SYM2-1-96_2021-08-04,SYM2-1-96,2021-08-04,SYM2,,,,0,1,1,...,0,,0.0,7,1,1,1,,0,
1,SYM2-1-96_2021-08-03,SYM2-1-96,2021-08-03,SYM2,,,,0,1,1,...,0,,0.0,7,1,1,1,,0,
2,SYM2-1-96_2021-08-02,SYM2-1-96,2021-08-02,SYM2,,,,0,1,1,...,0,,0.0,7,1,1,1,,0,
3,SYM2-1-96_2021-08-01,SYM2-1-96,2021-08-01,SYM2,,,,0,0,1,...,0,,0.0,7,1,1,1,,0,
4,SYM2-1-96_2021-07-31,SYM2-1-96,2021-07-31,SYM2,,,,0,0,1,...,0,,0.0,7,1,1,1,,0,


Unnamed: 0,ID,gender,age,marriage,job,smkHx,drinkHx,suicideHx,suicide_need
0,PXPN_10006,0,32.0,0.0,1.0,1.0,1.0,0.0,0.0
1,PXPN_10007,1,38.0,1.0,1.0,0.0,0.0,0.0,0.0
2,PXPN_10008,0,38.0,1.0,0.0,0.0,1.0,0.0,0.0
3,PXPN_10009,1,28.0,0.0,0.0,1.0,0.0,1.0,0.0
4,PXPN_10010,1,21.0,0.0,0.0,1.0,1.0,0.0,0.0


Unnamed: 0,ID,n_entries,n_valid_3_entries,n_valid_2_entries,n_valid_1_entries,n_panic,max_severity,min_severity,mean_severity,n_dailylog,n_lifelog,n_questionnaire,sum_dtype,mean_dtype,n_diary,coffee_mean,coffee_n,smoking_mean,total_sleep_mean
0,PXPN_10006,29,18,21,25,3,2.0,1.0,1.33,22,28,29,79,2.72,0,1.0,4,,6.95
1,PXPN_10007,29,16,20,24,4,2.0,1.0,1.25,20,28,29,77,2.66,0,1.53,17,,
2,PXPN_10008,29,18,21,25,3,2.0,1.0,1.67,20,27,29,76,2.62,0,1.0,1,,
3,PXPN_10009,29,18,20,23,5,4.0,1.0,2.6,28,28,29,85,2.93,0,1.17,12,5.0,
4,PXPN_10010,29,26,27,28,0,,,,23,28,29,80,2.76,0,1.37,19,8.57,


Number of Demographic Features: 8
Number of Daily Features: 12
Number of Life Log Features: 37
Number of Questionnaire Features: 17


# 

# 🔄️ | Data Processing

In [6]:
# Number of days before each panic event that must be available; it selects the
# 'valid_entry_<n>' flag below and the set of 'dbp' values expected afterwards.
dbp_param = 3

# NOTE(review): these placeholders are overwritten further down; presumably
# they exist so that a failed run cannot leave frames from an earlier
# execution behind - confirm.
filtered_metadata = create_empty_df()
filtered_pre_data = create_empty_df()
proc_data_init = create_empty_df()

# Panic-event rows, then the subset flagged as having dbp_param valid prior days.
filtered_panic_metadata = metadata[metadata['panic_label'] == 1].copy()
print(f"Found {len(filtered_panic_metadata)} entries with panic label.")
filtered_panic_metadata_entry_ids = filtered_panic_metadata[filtered_panic_metadata[f'valid_entry_{dbp_param}'] == 1]['entry_id'].unique()

# One row per retained panic event; this becomes the base of the modelling table.
proc_data_init = metadata[metadata['entry_id'].isin(filtered_panic_metadata_entry_ids)].copy()

print(f"Found {len(filtered_panic_metadata_entry_ids)} entries with panic label and at least {dbp_param} days of prior data.")
# Rows that reference one of the retained events through 'ref_event_id'.
filtered_metadata = metadata[metadata['ref_event_id'].isin(filtered_panic_metadata_entry_ids)].copy()
print(f"Filtered metadata contains {len(filtered_metadata)} entries with panic label and at least {dbp_param} days of prior data.")

# The next cell iterates dbp = 1..dbp_param, so the values themselves must
# match, not just their count: e.g. {0, 1, 2} or {1, 2, NaN} has the right
# size but would silently yield empty feature columns downstream.
unique_dbp = filtered_metadata['dbp'].unique()
expected_dbp = set(range(1, dbp_param + 1))
if set(unique_dbp) != expected_dbp:
    raise ValueError(f"Expected {dbp_param} unique DBP values ({sorted(expected_dbp)}), found {len(unique_dbp)}: {unique_dbp}")
del filtered_panic_metadata, filtered_panic_metadata_entry_ids, unique_dbp, expected_dbp

# Pull the feature rows for exactly the retained prior-day entries.
filtered_entry_ids = filtered_metadata['entry_id'].unique()
filtered_panic_entry_ids = filtered_metadata['ref_event_id'].unique()
filtered_pre_data = pre_data[pre_data['entry_id'].isin(filtered_entry_ids)].copy()
if len(filtered_pre_data) != len(filtered_metadata):
    raise ValueError(f"Filtered pre_data length {len(filtered_pre_data)} does not match filtered_metadata length {len(filtered_metadata)}")
print(f"Filtered data contains {len(filtered_panic_entry_ids)} unique panic events and {len(filtered_entry_ids)} unique entry IDs.")
print(f"Filtered pre_data contains {len(filtered_pre_data['ID'].unique())} unique IDs.")
del filtered_entry_ids

# Keep only the identifier and label columns on the per-event table.
proc_data_init = proc_data_init[features_dict['id']+features_dict['label']].copy()
print(f"Initial processed data contains {len(proc_data_init)} entries with {len(proc_data_init.columns)} columns.")
display(proc_data_init.head(5))

Found 811 entries with panic label.
Found 318 entries with panic label and at least 3 days of prior data.
Filtered metadata contains 954 entries with panic label and at least 3 days of prior data.
Filtered data contains 318 unique panic events and 954 unique entry IDs.
Filtered pre_data contains 76 unique IDs.
Initial processed data contains 318 entries with 8 columns.


Unnamed: 0,entry_id,ID,date,dataset,dbp,panic,severity,panic_label
155,SYM2-1-96_2021-03-02,SYM2-1-96,2021-03-02,SYM2,0.0,2.0,4.0,1
268,SYM2-1-476_2022-05-15,SYM2-1-476,2022-05-15,SYM2,0.0,2.0,5.0,1
278,SYM2-1-476_2022-05-05,SYM2-1-476,2022-05-05,SYM2,0.0,2.0,4.0,1
286,SYM2-1-476_2022-04-27,SYM2-1-476,2022-04-27,SYM2,0.0,2.0,3.0,1
1314,SYM2-1-422_2022-05-09,SYM2-1-422,2022-05-09,SYM2,0.0,2.0,3.0,1


In [7]:
# Build one wide row per panic event: demographics plus the features of each
# of the dbp_param preceding days, suffixed with the day offset.
proc_data_int = create_empty_df()
proc_data_int = proc_data_init.copy()

# Remove 'severity' from features_dict['dailylog'] so it is not used as a
# lagged daily feature ('severity' is the target column of this notebook).
# The shared features_dict is updated, so later cells see the reduced list.
features_dict['dailylog'] = [f for f in features_dict['dailylog'] if f != 'severity']

# Attach demographic features by patient ID (several events can share one ID).
# validate='many_to_one' makes pandas raise MergeError if an ID occurs more
# than once in demography_data, instead of silently duplicating event rows.
proc_data_int = pd.merge(proc_data_int, demography_data, on='ID', how='left', validate='many_to_one')
print(f"Processed data after merging with demography data contains {len(proc_data_int)} entries with {len(proc_data_int.columns)} columns.")

for i in range(1, dbp_param + 1):
    # entry_id of the day that is i days before a panic -> entry_id of that panic event
    dbp_dict = filtered_metadata[filtered_metadata['dbp'] == i].set_index('entry_id')['ref_event_id'].to_dict()
    print(f"Processing data for {i} days before panic.")

    entry_ids = dbp_dict.keys()
    filtered_pre_data_i = filtered_pre_data[filtered_pre_data['entry_id'].isin(entry_ids)].copy()
    if len(filtered_pre_data_i) != len(dbp_dict.keys()):
        raise ValueError(f"Filtered pre_data length {len(filtered_pre_data_i)} does not match filtered_metadata length {len(dbp_dict.keys())} for {i} days before panic")
    # Re-key each prior-day row by the panic event it belongs to, so it can be
    # joined onto the per-event table.
    filtered_pre_data_i['entry_id'] = filtered_pre_data_i['entry_id'].map(dbp_dict)

    # Daily-log and life-log features are taken for every day; questionnaire
    # features only for the day with offset dbp_param.
    features_list = ['entry_id'] + features_dict['dailylog'] + features_dict['lifelog']
    if i == dbp_param:
        features_list += features_dict['questionnaire']
    filtered_pre_data_i = filtered_pre_data_i[features_list].copy()

    # Suffix every feature column with the day offset; the join key keeps its name.
    rename_map = {c: f"{c}_{i}" for c in filtered_pre_data_i.columns if c != 'entry_id'}
    filtered_pre_data_i = filtered_pre_data_i.rename(columns=rename_map)

    # Each event must match at most one prior-day row for this offset;
    # a duplicate key on the right raises MergeError rather than adding rows.
    proc_data_int = pd.merge(proc_data_int, filtered_pre_data_i, on='entry_id', how='left', suffixes=('', f'_{i}'), validate='many_to_one')

# Use OUTPUT.output_dict to map raw severity scores to class names
# (scores that are not keys of the mapping become NaN).
proc_data_int['severity'] = proc_data_int['severity'].map(OUTPUT.output_dict)

# save_as_csv(proc_data_int, TMP_PATH, f'proc_data_{dbp_param}days', index=False)
# display(proc_data_int.head(5))

Processed data after merging with demography data contains 318 entries with 16 columns.
Processing data for 1 days before panic.
Processing data for 2 days before panic.
Processing data for 3 days before panic.


In [8]:
# Final processed table: a copy of the assembled features without the helper
# columns, with the target moved to the last position, then written to disk.
proc_data = create_empty_df()
proc_data = proc_data_int.copy()

remove_columns(proc_data, ['panic', 'dbp', 'panic_label'])
move_column(proc_data, 'severity', -1)
display(proc_data.head(5))

# Left as the cell's last expression so the returned path is shown as output.
save_as_csv(
    proc_data,
    OUTPUT_PATH,
    f'panic_severity_multi_proc_data_{dbp_param}days_{version}({scraped_data_filename})',
    index=False,
)

Unnamed: 0,entry_id,ID,date,dataset,gender,age,marriage,job,smkHx,drinkHx,...,KRQ_3,MDQ_3,ACQ_3,APPQ_1_3,APPQ_2_3,APPQ_3_3,BSQ_3,GAD_7_3,BRIAN_3,severity
0,SYM2-1-96_2021-03-02,SYM2-1-96,2021-03-02,SYM2,1,31.0,0.0,1.0,1.0,0.0,...,,,,,,,,,,Severe
1,SYM2-1-476_2022-05-15,SYM2-1-476,2022-05-15,SYM2,1,39.0,0.0,1.0,1.0,0.0,...,102.0,3.0,39.0,28.0,44.0,22.0,74.0,15.0,65.0,Severe
2,SYM2-1-476_2022-05-05,SYM2-1-476,2022-05-05,SYM2,1,39.0,0.0,1.0,1.0,0.0,...,102.0,3.0,39.0,28.0,52.0,36.0,74.0,13.0,65.0,Severe
3,SYM2-1-476_2022-04-27,SYM2-1-476,2022-04-27,SYM2,1,39.0,0.0,1.0,1.0,0.0,...,102.0,3.0,39.0,28.0,52.0,36.0,74.0,13.0,65.0,Moderate
4,SYM2-1-422_2022-05-09,SYM2-1-422,2022-05-09,SYM2,0,45.0,1.0,1.0,0.0,1.0,...,170.0,3.0,55.0,31.0,41.0,24.0,64.0,17.0,49.0,Moderate


DEBUG - (text_utils.py) save_as_csv: Saved panic_severity_multi_proc_data_3days_1-0(final_result_diary_20250617_03).csv to C:\Users\cyshi\OneDrive\Documents\GitHub\Panic-Project-CYS\cys\_output


WindowsPath('C:/Users/cyshi/OneDrive/Documents/GitHub/Panic-Project-CYS/cys/_output/panic_severity_multi_proc_data_3days_1-0(final_result_diary_20250617_03).csv')

In [9]:
# Count of events per severity class and source dataset, with row/column totals.
pd.crosstab(
    index=proc_data['severity'],
    columns=proc_data['dataset'],
    margins=True,
    margins_name='Total',
)

dataset,PXPN,SYM1,SYM2,Total
severity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mild,29,71,37,137
Moderate,6,52,39,97
Severe,4,47,33,84
Total,39,170,109,318


# 🤖 | Modeling

In [22]:
# Modelling table: the processed data without the identifier columns.
data = proc_data.copy()
remove_columns(data, features_dict['id'])

n_rows, n_cols = len(data), len(data.columns)
print(f"Processed data contains {n_rows} entries with {n_cols} columns after removing ID columns.")
display(data.head(5))

Processed data contains 318 entries with 170 columns after removing ID columns.


Unnamed: 0,gender,age,marriage,job,smkHx,drinkHx,suicideHx,suicide_need,exercise_1,alcohol_1,...,KRQ_3,MDQ_3,ACQ_3,APPQ_1_3,APPQ_2_3,APPQ_3_3,BSQ_3,GAD_7_3,BRIAN_3,severity
0,1,31.0,0.0,1.0,1.0,0.0,1.0,1.0,,,...,,,,,,,,,,Severe
1,1,39.0,0.0,1.0,1.0,0.0,0.0,0.0,180.0,,...,102.0,3.0,39.0,28.0,44.0,22.0,74.0,15.0,65.0,Severe
2,1,39.0,0.0,1.0,1.0,0.0,0.0,0.0,,,...,102.0,3.0,39.0,28.0,52.0,36.0,74.0,13.0,65.0,Severe
3,1,39.0,0.0,1.0,1.0,0.0,0.0,0.0,,,...,102.0,3.0,39.0,28.0,52.0,36.0,74.0,13.0,65.0,Moderate
4,0,45.0,1.0,1.0,0.0,1.0,1.0,1.0,60.0,,...,170.0,3.0,55.0,31.0,41.0,24.0,64.0,17.0,49.0,Moderate


In [None]:
# 2. Configure the PyCaret classification experiment on the modelling table.
# NOTE(review): the identifier columns were dropped before this call and the
# split is made per row, so events from the same patient can presumably land
# in both the training and the hold-out set - confirm whether that is intended.
clf = setup(
    data=data,
    target='severity',
    # Reproducibility
    session_id=123,
    # Hold-out split and cross-validation scheme
    train_size=0.8,
    fold_strategy='stratifiedkfold',
    fold=5,
    # Preprocessing: mean-impute, scale, no power transform, and drop
    # features that are correlated above the threshold
    numeric_imputation='mean',
    normalize=True,
    transformation=False,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    # html=False,                    # do not generate HTML report (use plain-text output)
    verbose=True,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,severity
2,Target type,Multiclass
3,Target mapping,"Mild: 0, Moderate: 1, Severe: 2"
4,Original data shape,"(318, 170)"
5,Transformed data shape,"(318, 160)"
6,Transformed train set shape,"(254, 160)"
7,Transformed test set shape,"(64, 160)"
8,Numeric features,169
9,Rows with missing values,100.0%


In [12]:
# 3. Compare baseline models and select the best by Accuracy
# (best_model is reused by the hold-out evaluation and blending cells below).
best_model = compare_models(sort='Accuracy')

# pull() returns the most recently displayed PyCaret table as a DataFrame,
# here the comparison grid produced by compare_models above.
results = pull()  # Get the latest output table as a DataFrame
# Cross-Validation results
print("Cross-Validation Results:")
display(results)  # Jupyter display (can further style if you want)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7718,0.8872,0.7718,0.7966,0.7625,0.6429,0.6555,0.196
et,Extra Trees Classifier,0.7601,0.8931,0.7601,0.7847,0.7536,0.625,0.6384,0.044
gbc,Gradient Boosting Classifier,0.7479,0.0,0.7479,0.7561,0.7446,0.6126,0.6182,0.176
rf,Random Forest Classifier,0.7363,0.8647,0.7363,0.7595,0.7274,0.5865,0.5996,0.332
lr,Logistic Regression,0.6771,0.0,0.6771,0.6937,0.6738,0.5058,0.5134,0.498
dt,Decision Tree Classifier,0.6658,0.7486,0.6658,0.6746,0.6585,0.4864,0.4948,0.33
ridge,Ridge Classifier,0.6537,0.0,0.6537,0.6656,0.6522,0.4688,0.4742,0.34
qda,Quadratic Discriminant Analysis,0.646,0.0,0.646,0.683,0.6333,0.4477,0.4681,0.022
svm,SVM - Linear Kernel,0.6455,0.0,0.6455,0.6583,0.6403,0.4571,0.4636,0.328
lda,Linear Discriminant Analysis,0.63,0.0,0.63,0.6428,0.6303,0.4414,0.4459,0.018


Cross-Validation Results:


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.77,0.89,0.77,0.8,0.76,0.64,0.66,0.2
et,Extra Trees Classifier,0.76,0.89,0.76,0.78,0.75,0.62,0.64,0.04
gbc,Gradient Boosting Classifier,0.75,0.0,0.75,0.76,0.74,0.61,0.62,0.18
rf,Random Forest Classifier,0.74,0.86,0.74,0.76,0.73,0.59,0.6,0.33
lr,Logistic Regression,0.68,0.0,0.68,0.69,0.67,0.51,0.51,0.5
dt,Decision Tree Classifier,0.67,0.75,0.67,0.67,0.66,0.49,0.49,0.33
ridge,Ridge Classifier,0.65,0.0,0.65,0.67,0.65,0.47,0.47,0.34
qda,Quadratic Discriminant Analysis,0.65,0.0,0.65,0.68,0.63,0.45,0.47,0.02
svm,SVM - Linear Kernel,0.65,0.0,0.65,0.66,0.64,0.46,0.46,0.33
lda,Linear Discriminant Analysis,0.63,0.0,0.63,0.64,0.63,0.44,0.45,0.02


In [13]:
# 4. Score the selected model on the hold-out split (20% of the rows).
holdout_results = predict_model(best_model)

# Names of the day-suffixed feature columns, in the order they are dropped:
# daily-log and life-log features for every day offset, questionnaire features
# only for the last offset.
lagged_columns = []
for day in range(1, dbp_param + 1):
    day_features = features_dict['dailylog'] + features_dict['lifelog']
    if day == dbp_param:
        day_features = day_features + features_dict['questionnaire']
    lagged_columns.extend(f"{feature}_{day}" for feature in day_features)

# Drop them one call at a time so that only the demographic columns, the
# target and the prediction columns remain in the displayed frame.
for column in lagged_columns:
    remove_columns(holdout_results, [column])

print("Hold-out set performance:")
display(holdout_results)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8281,0.8912,0.8281,0.8757,0.8342,0.7381,0.7553


Hold-out set performance:


Unnamed: 0,gender,age,marriage,job,smkHx,drinkHx,suicideHx,suicide_need,severity,prediction_label,prediction_score
0,1,41.0,1.0,1.0,0.0,0.0,0.0,0.0,Mild,Mild,0.73
1,0,50.0,1.0,0.0,0.0,0.0,0.0,0.0,Mild,Moderate,0.44
2,0,65.0,1.0,1.0,0.0,1.0,1.0,0.0,Mild,Mild,0.98
3,0,29.0,0.0,1.0,0.0,0.0,0.0,0.0,Mild,Mild,1.00
4,0,45.0,1.0,1.0,0.0,0.0,1.0,0.0,Moderate,Moderate,0.97
...,...,...,...,...,...,...,...,...,...,...,...
59,0,27.0,0.0,1.0,0.0,1.0,0.0,0.0,Moderate,Moderate,0.82
60,0,43.0,1.0,0.0,0.0,1.0,0.0,0.0,Mild,Mild,0.91
61,1,26.0,1.0,0.0,0.0,1.0,1.0,0.0,Mild,Mild,0.77
62,0,39.0,1.0,1.0,0.0,1.0,0.0,0.0,Severe,Severe,0.97


In [14]:
# 5. Create a specific model (e.g., LightGBM)
# 'lightgbm' is the ID of the top-ranked model in the recorded comparison
# table; it is hard-coded here rather than derived from best_model.
model = create_model('lightgbm')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6667,0.8161,0.6667,0.7487,0.636,0.4799,0.5091
1,0.902,0.9668,0.902,0.9049,0.901,0.8488,0.8508
2,0.7059,0.8384,0.7059,0.7332,0.6959,0.5304,0.5535
3,0.7843,0.9127,0.7843,0.7967,0.7818,0.6612,0.668
4,0.8,0.9021,0.8,0.7994,0.7976,0.6944,0.6961
Mean,0.7718,0.8872,0.7718,0.7966,0.7625,0.6429,0.6555
Std,0.0816,0.0541,0.0816,0.0601,0.0908,0.1301,0.1198


In [15]:
# 6. Tune the model hyperparameters
# NOTE(review): in the recorded run PyCaret reports that the original model
# beat the tuned candidate and was returned instead, so tuned_model is the
# untuned LightGBM and the displayed fold table belongs to the rejected candidate.
tuned_model = tune_model(model, optimize='Accuracy')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6275,0.7998,0.6275,0.6532,0.6211,0.4229,0.4326
1,0.8235,0.9427,0.8235,0.8252,0.8239,0.7292,0.7296
2,0.6667,0.8229,0.6667,0.6772,0.6649,0.4755,0.4813
3,0.8235,0.9059,0.8235,0.8389,0.8209,0.7218,0.7317
4,0.74,0.904,0.74,0.7476,0.7319,0.6005,0.6085
Mean,0.7362,0.8751,0.7362,0.7484,0.7325,0.59,0.5967
Std,0.0799,0.0543,0.0799,0.0751,0.0814,0.1248,0.1235


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [16]:
# 7. Ensemble models (optional)
# NOTE(review): in the recorded run both inputs are LightGBM models (tuning
# returned the original model and best_model is also LightGBM), and the
# blended fold metrics are identical to the single-model ones, so this
# blend adds nothing; consider blending distinct models.
blended_model = blend_models([tuned_model, best_model])

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6667,0.8161,0.6667,0.7487,0.636,0.4799,0.5091
1,0.902,0.9668,0.902,0.9049,0.901,0.8488,0.8508
2,0.7059,0.8384,0.7059,0.7332,0.6959,0.5304,0.5535
3,0.7843,0.9127,0.7843,0.7967,0.7818,0.6612,0.668
4,0.8,0.9021,0.8,0.7994,0.7976,0.6944,0.6961
Mean,0.7718,0.8872,0.7718,0.7966,0.7625,0.6429,0.6555
Std,0.0816,0.0541,0.0816,0.0601,0.0908,0.1301,0.1198


In [17]:
# 8. Finalize the model for deployment
# Per the PyCaret documentation, finalize_model refits the pipeline on the
# entire dataset including the hold-out rows, so final_model should not be
# re-scored on that hold-out split.
final_model = finalize_model(blended_model)

In [18]:
# 9. Use the finalized model on brand-new data
# new_data = pd.read_csv('new_data.csv')
# new_predictions = predict_model(final_model, data=new_data)

# 10. Save the finalized model for later use
# save_model(final_model, 'final_pycaret_multiclass_model')

# To load the saved model:
# loaded_model = load_model('final_pycaret_multiclass_model')

# 🚂 | Training

In [19]:
# s = setup(data, target = target_col)

# 📋 | Results

In [20]:
# best_model = compare_models()