In [4]:
import os

import polars as pl
import torch
from sklearn.model_selection import train_test_split

from src.data.preprocessing.preprocessor import Preprocessor, RuleTransform
from src.data.torch_tabular_dataset import TorchTabularDataset
from src.experiments.utils import (
    make_experiment_binary_perceptrone,
    make_experiments_binary_perceptrone,
    polars_dict_mapper,
)

# Seed handed to the data split and to the experiment helpers below.
RANDOM_SEED = 0

# PyTorch's deterministic mode expects cuBLAS to run with a fixed workspace
# configuration; the variable is exported here, before any CUDA work is done
# in this notebook.
os.environ.update(CUBLAS_WORKSPACE_CONFIG=":16:8")

# Turn off cuDNN auto-tuning and ask PyTorch to use deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)

In [5]:
def prepare_heart_dataset(df: pl.DataFrame) -> pl.DataFrame:
    """Apply the ordinal label-to-rank mappings of the heart dataset.

    Builds a ``{column: {label: rank}}`` mapping for ``AgeCategory`` and
    ``GenHealth`` and hands it, together with ``df``, to
    ``polars_dict_mapper``. Ranks start at 0 and follow the order in which
    the labels are listed below.

    Args:
        df: Frame forwarded to ``polars_dict_mapper``. Presumably it holds
            string-valued ``AgeCategory`` and ``GenHealth`` columns
            (TODO: confirm against the CSV schema).

    Returns:
        The value returned by ``polars_dict_mapper`` for this frame and
        mapping (annotated as ``pl.DataFrame``).
    """
    # Labels listed from lowest to highest rank; position == encoded value.
    age_brackets = (
        "18-24",
        "25-29",
        "30-34",
        "35-39",
        "40-44",
        "45-49",
        "50-54",
        "55-59",
        "60-64",
        "65-69",
        "70-74",
        "75-79",
        "80 or older",
    )
    health_levels = ("Poor", "Fair", "Good", "Very good", "Excellent")

    ordinal_columns = {
        "AgeCategory": age_brackets,
        "GenHealth": health_levels,
    }
    map_dict = {
        column: {label: rank for rank, label in enumerate(labels)}
        for column, labels in ordinal_columns.items()
    }
    return polars_dict_mapper(df=df, map_dict=map_dict)

In [6]:
# One RuleTransform is created per (transformer name, columns) pair and the
# rules are passed to Preprocessor.from_rules positionally, in listed order.
preprocessor = Preprocessor.from_rules(
    *(
        RuleTransform(transformer=transformer_name, columns=column_names)
        for transformer_name, column_names in (
            ("BinaryTarget", ["HeartDisease"]),
            (
                "FeaturePowerTransformer",
                [
                    "BMI",
                    "PhysicalHealth",
                    "MentalHealth",
                    "AgeCategory",
                    "GenHealth",
                    "SleepTime",
                ],
            ),
            (
                "BinaryEncoder",
                [
                    "Smoking",
                    "AlcoholDrinking",
                    "Stroke",
                    "DiffWalking",
                    "Sex",
                    "Diabetic",
                    "PhysicalActivity",
                    "Asthma",
                    "KidneyDisease",
                    "SkinCancer",
                ],
            ),
            ("OneHotEncoder", ["Race"]),
        )
    )
)

In [7]:
# Load the raw table and hold out 20% of the rows as the test split.
df = pl.read_csv("../data/heart_2020_cleaned.csv")
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Both splits go through the same label-to-rank mapping (train first).
df_prepared_train, df_prepared_test = (
    prepare_heart_dataset(df=split) for split in (df_train, df_test)
)

# fit_transform is called on the training split only; the test split is
# passed through transform on the already-fitted preprocessor.
df_train_transformed = preprocessor.fit_transform(data=df_prepared_train)
df_test_transformed = preprocessor.transform(data=df_prepared_test)

# Wrap each transformed frame with the preprocessor's metadata and call
# .cuda() on the resulting dataset (train first, then test).
dataset_train, dataset_test = (
    TorchTabularDataset(df=frame, metadata=preprocessor.metadata).cuda()
    for frame in (df_train_transformed, df_test_transformed)
)

In [8]:
# Single experiment run. Judging by the recorded cell output, the helper
# returns a dict with a "bayesian" entry and one "classic_<weight_decay>"
# entry per value in weight_decays_classic, each holding train/test ROC AUC.
result = make_experiment_binary_perceptrone(
    # Hidden-layer settings (presumably layer width and layer count; confirm
    # against make_experiment_binary_perceptrone).
    dim_hidden=32,
    n_hidden=1,
    # Data.
    dataset_train=dataset_train,
    dataset_test=dataset_test,
    # Weight-decay grid for the classic runs.
    weight_decays_classic=[0, 0.0001, 0.001, 0.01, 0.1],
    # Inference settings for the Bayesian run.
    sample_size_inference_bayesian=1000,
    batch_size_inference_bayesian=2048,
    random_seed=RANDOM_SEED,
)

In [9]:
# Bare last expression: renders the metrics dict assigned to `result` above as the cell output.
result

{'bayesian': {'roc_auc_train': 0.8424682222178626,
  'roc_auc_test': 0.840219151184615},
 'classic_0': {'roc_auc_train': 0.8492225907709913,
  'roc_auc_test': 0.8405329316551737},
 'classic_0.0001': {'roc_auc_train': 0.8483359844089937,
  'roc_auc_test': 0.842814271756086},
 'classic_0.001': {'roc_auc_train': 0.8445668645889256,
  'roc_auc_test': 0.8417685204038678},
 'classic_0.01': {'roc_auc_train': 0.8383939530915163,
  'roc_auc_test': 0.836352470840519},
 'classic_0.1': {'roc_auc_train': 0.8103438090032554,
  'roc_auc_test': 0.8090572063811872}}

In [None]:
# Sweep over hidden sizes 1..128 (inclusive) with n_hidden=2; everything else
# matches the keyword values used for the single run. The helper receives
# "../experiments/heart" as path_to_save (presumably where it writes results;
# confirm against make_experiments_binary_perceptrone).
make_experiments_binary_perceptrone(
    path_to_save="../experiments/heart",
    # Hidden-layer settings.
    dims_hidden=list(range(1, 129)),
    n_hidden=2,
    # Data.
    dataset_train=dataset_train,
    dataset_test=dataset_test,
    # Weight-decay grid for the classic runs.
    weight_decays_classic=[0, 0.0001, 0.001, 0.01, 0.1],
    # Inference settings for the Bayesian run.
    sample_size_inference_bayesian=1000,
    batch_size_inference_bayesian=2048,
    random_seed=RANDOM_SEED,
)