# Setup

In [None]:
import json
import pathlib

from dhi.utils import get_logger

# Name of the experiment; used both as the logger name and as the top-level
# key under which this notebook's settings live in the config file.
SUBJECT = "DiabetesHealthIndicators"
# Absolute path of the config file, resolved against the current working dir.
CONFIG_PATH = pathlib.Path("config.json").resolve()

logger = get_logger(SUBJECT)

# Load the JSON configuration. Loading is best-effort: on failure the error is
# logged and an empty configuration is used, so every section below falls back
# to an empty dict.
logger.info(f"Loading configuration from {CONFIG_PATH}...")
try:
    # JSON text is UTF-8 by specification; do not rely on the platform's
    # default encoding (which differs on e.g. Windows).
    with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
        CONFIG = json.load(config_file)
except (OSError, ValueError) as e:
    # OSError: file missing or unreadable.
    # ValueError: malformed JSON (json.JSONDecodeError) or undecodable bytes
    # (UnicodeDecodeError) -- both are ValueError subclasses.
    logger.error(f"Failed to load configuration: {e}")
    CONFIG = {}
else:
    logger.info("Configuration loaded successfully.")

# Per-component keyword arguments; each defaults to {} when its key is absent.
subject_config = CONFIG.get(SUBJECT, {})
LOADER_CONFIG = subject_config.get("loader", {})
PREPROCESSOR_CONFIG = subject_config.get("preprocessor", {})

# Data Loading

In [None]:
from dhi.data.loader.dhi_loader import DHILoader

# Build the loader from the "loader" section of the configuration and read
# the dataset into `df`. A load failure is logged and then propagated, since
# none of the following cells can run without the data.
loader = DHILoader(**LOADER_CONFIG)
try:
    df = loader.load()
except Exception as e:
    logger.error(f"Failed to load data: {e}")
    # Bare `raise` re-raises the active exception with its traceback intact.
    raise
else:
    logger.info("Data loaded successfully.")

# Data Visualization

Before preprocessing.

In [None]:
# Show the summary produced by describe() for the data as loaded, before
# preprocessing (df is presumably a pandas DataFrame -- confirm in DHILoader).
df.describe()

In [None]:
# Show the output of info() for the data as loaded, before preprocessing
# (for a pandas DataFrame this lists columns, dtypes and non-null counts).
df.info()

In [None]:
# Preview the first rows of a shuffled copy of the raw data.
# sample() returns a new frame rather than shuffling in place, so head() must
# be chained onto that result; calling df.head() separately would always show
# the same leading rows of the original, unshuffled frame. `df` itself is
# left untouched.
df.sample(frac=1).reset_index(drop=True).head()

# Data Preprocessing

In [None]:
from dhi.data.preprocessor.dhi_preprocessor import DHIPreprocessor

# Build the preprocessor from the "preprocessor" section of the configuration
# and run it on the loaded data; the result is kept in `df_processed` while
# `df` remains bound to the frame returned by the loader.
# NOTE(review): whether preprocess() mutates `df` in place cannot be told
# from this notebook -- confirm in DHIPreprocessor.
preprocessor = DHIPreprocessor(**PREPROCESSOR_CONFIG)
df_processed = preprocessor.preprocess(df)

# Data Visualization

After preprocessing.

In [None]:
# Show the summary produced by describe() for the preprocessed data, for
# comparison with the same summary of the raw data.
df_processed.describe()

In [None]:
# Show the output of info() for the preprocessed data (for a pandas DataFrame
# this lists columns, dtypes and non-null counts).
df_processed.info()

In [None]:
# Preview the first rows of a shuffled copy of the preprocessed data.
# sample() returns a new frame rather than shuffling in place, so head() must
# be chained onto that result; calling df_processed.head() separately would
# always show the same leading rows of the unshuffled frame. `df_processed`
# itself is left untouched.
df_processed.sample(frac=1).reset_index(drop=True).head()

In [None]:
import dhi.data.visualizer.visualizer as vis

# Draw box plots, histograms and distribution plots via the project's
# visualizer module.
# NOTE(review): this cell sits under the "After preprocessing" heading but
# passes the raw `df`, not `df_processed` -- confirm which frame is intended.
vis.plot_boxplots(df)
vis.plot_histograms(df)
vis.plot_distplots(df)

# Statistics

In [None]:
import dhi.statistics.feature_selection as fs

# Column that the feature-selection helpers treat as the prediction target.
TARGET_COLUMN = "diagnosed_diabetes"

# Correlation matrix of the preprocessed data.
fs.correlation_matrix(df_processed)

# Chi-square independence test against the target.
# NOTE(review): this call uses the raw `df` while the other two calls use
# `df_processed` -- confirm that this is intentional.
fs.chi2_independence_test(
    df,
    label_columns=loader.label_columns,
    target_column=TARGET_COLUMN,
)

# Univariate feature selection on the preprocessed data; the returned
# selector is kept for later use.
univariate_selector = fs.univariate_feature_selection(
    df_processed,
    label_columns=loader.label_columns,
    target_column=TARGET_COLUMN,
)

# Consumes a lot of RAM for this specific shape of the dataset
# relief_selector = fs.relief_feature_selection(df_processed, label_columns=loader.label_columns, target_column="diagnosed_diabetes", n_features=5)