In [1]:
import os

import polars as pl
import torch
from sklearn.model_selection import train_test_split

from src.data.preprocessing.preprocessor import Preprocessor, RuleTransform
from src.data.torch_tabular_dataset import TorchTabularDataset
from src.experiments.utils import (
    make_experiment_binary_perceptrone,
    polars_dict_mapper,
)

os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"

RANDOM_SEED = 0
torch.use_deterministic_algorithms(True)
torch.backends.cudnn.benchmark = False

In [2]:
def prepare_heart_dataset(df: pl.DataFrame) -> pl.DataFrame:
    map_dict = {
        "AgeCategory": {
            "18-24": 0,
            "25-29": 1,
            "30-34": 2,
            "35-39": 3,
            "40-44": 4,
            "45-49": 5,
            "50-54": 6,
            "55-59": 7,
            "60-64": 8,
            "65-69": 9,
            "70-74": 10,
            "75-79": 11,
            "80 or older": 12,
        },
        "GenHealth": {
            "Poor": 0,
            "Fair": 1,
            "Good": 2,
            "Very good": 3,
            "Excellent": 4,
        },
    }
    df = polars_dict_mapper(df=df, map_dict=map_dict)
    return df

In [3]:
preprocessor = Preprocessor.from_rules(
    RuleTransform(
        transformer="BinaryTarget",
        columns=[
            "HeartDisease",
        ],
    ),
    RuleTransform(
        transformer="FeaturePowerTransformer",
        columns=[
            "BMI",
            "PhysicalHealth",
            "MentalHealth",
            "AgeCategory",
            "GenHealth",
            "SleepTime",
        ],
    ),
    RuleTransform(
        transformer="BinaryEncoder",
        columns=[
            "Smoking",
            "AlcoholDrinking",
            "Stroke",
            "DiffWalking",
            "Sex",
            "Diabetic",
            "PhysicalActivity",
            "Asthma",
            "KidneyDisease",
            "SkinCancer",
        ],
    ),
    RuleTransform(
        transformer="OneHotEncoder",
        columns=[
            "Race",
        ],
    ),
)

In [4]:
df = pl.read_csv("../data/heart_2020_cleaned.csv")
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

df_prepared_train = prepare_heart_dataset(df=df_train)
df_prepared_test = prepare_heart_dataset(df=df_test)


df_train_transformed = preprocessor.fit_transform(data=df_prepared_train)
df_test_transformed = preprocessor.transform(data=df_prepared_test)

dataset_train = TorchTabularDataset(
    df=df_train_transformed,
    metadata=preprocessor.metadata,
).cuda()
dataset_test = TorchTabularDataset(
    df=df_test_transformed,
    metadata=preprocessor.metadata,
).cuda()

In [5]:
result = make_experiment_binary_perceptrone(
    dataset_train=dataset_train,
    dataset_test=dataset_test,
    dim_hidden=32,
    batch_size_inference=2048,
    sample_size_inference=1000,
    random_seed=RANDOM_SEED,
)

In [6]:
result

{'dim_hidden': 32,
 'gini_train': 0.6902852766897949,
 'gini_test': 0.6853529460740808,
 'loss_init': [141.64918518066406,
  117.54356384277344,
  106.79981994628906,
  92.35298156738281,
  91.18936920166016,
  72.89987182617188,
  70.20731353759766,
  63.82101058959961,
  51.00267028808594,
  45.361473083496094,
  40.888729095458984,
  33.394256591796875,
  29.326936721801758,
  25.81267547607422,
  24.105363845825195,
  20.196487426757812,
  17.247848510742188,
  16.698291778564453,
  14.807088851928711,
  13.63770866394043,
  11.607879638671875,
  10.393750190734863,
  9.331356048583984,
  8.703012466430664,
  7.432939529418945,
  8.298260688781738,
  6.716613292694092,
  6.578930377960205,
  5.869180679321289,
  4.911751747131348,
  4.935860633850098,
  4.892308712005615,
  4.1083173751831055,
  4.132664680480957,
  4.025542259216309,
  3.4883129596710205,
  3.5417840480804443,
  3.2394466400146484,
  2.8887736797332764,
  3.020054340362549,
  2.9687438011169434,
  2.68319606781005