In [1]:
from typing import Dict

import polars as pl
import torch

from src.data.preprocessing.preprocessor import Preprocessor, RuleTransform
from src.data.torch_tabular_dataset import TorchTabularDataset
from src.nn import BayesianBinaryPerceptrone

In [2]:
def polars_dict_mapper(df: pl.DataFrame, map_dict: Dict) -> pl.DataFrame:
    """Re-code values in selected columns using per-column lookup tables.

    Args:
        df: Frame whose columns should be re-coded.
        map_dict: ``{column_name: {old_value: new_value, ...}, ...}``.
            A column whose lookup table is empty is left untouched.

    Returns:
        The result of ``df.with_columns`` with one when/then expression per
        mapped column, each aliased back to the original column name.
        Values that have no entry in the column's lookup table become null,
        because the when/then chain has no ``otherwise`` branch.
    """
    rules = []
    for column, column_map in map_dict.items():
        if not column_map:
            # An empty lookup table used to leave the chain as ``None`` and
            # crash on ``None.alias``; there is nothing to replace, so skip.
            continue
        rule = None
        for key, value in column_map.items():
            condition = pl.col(column) == key
            # ``then`` parses a bare string as a column name; a string target
            # in a value mapping is meant as a literal, so wrap it explicitly.
            replacement = pl.lit(value) if isinstance(value, str) else value
            branch = pl.when(condition) if rule is None else rule.when(condition)
            rule = branch.then(replacement)
        rules.append(rule.alias(column))
    return df.with_columns(*rules)

In [3]:
def prepare_heart_dataset(df: pl.DataFrame) -> pl.DataFrame:
    """Build the ordinal code tables for the heart dataset and apply them.

    The labels of ``AgeCategory`` and ``GenHealth`` are listed in ascending
    order; each label is paired with its position in that list, and the
    resulting ``{column: {label: code}}`` table is handed to
    ``polars_dict_mapper`` together with ``df``.

    Args:
        df: Heart dataset frame containing the two columns named above.

    Returns:
        Whatever ``polars_dict_mapper`` returns for ``df`` and the code table.
    """
    age_labels = (
        "18-24",
        "25-29",
        "30-34",
        "35-39",
        "40-44",
        "45-49",
        "50-54",
        "55-59",
        "60-64",
        "65-69",
        "70-74",
        "75-79",
        "80 or older",
    )
    health_labels = ("Poor", "Fair", "Good", "Very good", "Excellent")

    ordinal_codes = {
        "AgeCategory": {label: code for code, label in enumerate(age_labels)},
        "GenHealth": {label: code for code, label in enumerate(health_labels)},
    }
    return polars_dict_mapper(df=df, map_dict=ordinal_codes)

In [4]:
# Columns grouped by the transformer that should process them; the rules are
# created in the order the groups are listed here.
_columns_by_transformer = {
    "BinaryTarget": ["HeartDisease"],
    "FeaturePowerTransformer": [
        "BMI",
        "PhysicalHealth",
        "MentalHealth",
        "AgeCategory",
        "GenHealth",
        "SleepTime",
    ],
    "BinaryEncoder": [
        "Smoking",
        "AlcoholDrinking",
        "Stroke",
        "DiffWalking",
        "Sex",
        "Diabetic",
        "PhysicalActivity",
        "Asthma",
        "KidneyDisease",
        "SkinCancer",
    ],
    "OneHotEncoder": ["Race"],
}

# One RuleTransform per group, passed positionally to `from_rules`.
preprocessor = Preprocessor.from_rules(
    *(
        RuleTransform(transformer=transformer_name, columns=column_names)
        for transformer_name, column_names in _columns_by_transformer.items()
    )
)

# Load the raw CSV, encode the ordinal string columns, then fit and apply the
# preprocessor on the encoded frame.
df = pl.read_csv("../data/heart_2020_cleaned.csv")
df_prepared = prepare_heart_dataset(df=df)
df_transformed = preprocessor.fit_transform(data=df_prepared)

In [5]:
# Wrap the transformed frame, together with the preprocessor's metadata, in
# the project's tabular dataset type.
dataset = TorchTabularDataset(
    df=df_transformed,
    metadata=preprocessor.metadata,
)
# Only call `.cuda()` when a CUDA device is actually available, so the cell
# does not raise on CPU-only machines.
# NOTE(review): `.cuda()` presumably moves the dataset's tensors to the GPU —
# confirm against TorchTabularDataset.
if torch.cuda.is_available():
    dataset = dataset.cuda()

In [6]:
# Build the classifier; its input width is read from the last axis of the
# dataset's numeric feature tensor, and it has one hidden layer size (32).
model = BayesianBinaryPerceptrone(
    name_in="features_numeric",
    name_out="p_heart_disease",
    name_target="HeartDisease_BinaryTarget_No_Yes",
    dim_in=dataset.data.features_numeric.shape[-1],
    dims_hidden=[
        32,
    ],
)
# Only call `.cuda()` when a CUDA device is actually available, so the cell
# does not raise on CPU-only machines.
if torch.cuda.is_available():
    model = model.cuda()

In [7]:
# Run the model's `init` routine on the full numeric feature tensor with
# `num_epoch=1000` and keep whatever it returns for inspection.
# NOTE(review): presumably a pre-training / initialisation phase — confirm
# against BayesianBinaryPerceptrone.init.
init_inputs = {"features_numeric": dataset.data.features_numeric}
metrics_init = model.init(x=init_inputs, num_epoch=1000)

In [8]:
# Display the value returned by `model.init` (stored in `metrics_init`).
metrics_init

[146.86752319335938,
 119.50065612792969,
 115.56416320800781,
 96.40100860595703,
 85.69386291503906,
 77.73672485351562,
 73.43080139160156,
 60.27134704589844,
 52.74482345581055,
 47.858890533447266,
 42.533966064453125,
 39.225013732910156,
 32.568634033203125,
 28.211759567260742,
 24.4359073638916,
 21.78456687927246,
 20.564821243286133,
 16.385242462158203,
 15.448797225952148,
 14.150790214538574,
 12.38117504119873,
 10.252955436706543,
 9.283121109008789,
 9.043756484985352,
 7.479655742645264,
 7.361969947814941,
 6.546566963195801,
 5.993212699890137,
 5.4588494300842285,
 5.576895713806152,
 5.49371337890625,
 4.296791076660156,
 3.960676431655884,
 3.926698684692383,
 3.8165090084075928,
 3.814373254776001,
 3.6300501823425293,
 3.599886178970337,
 3.1555280685424805,
 3.310462713241577,
 3.1842424869537354,
 2.7214858531951904,
 2.637197494506836,
 2.5581023693084717,
 2.5675013065338135,
 2.274082899093628,
 2.460087299346924,
 2.4986064434051514,
 2.316290855407715,


In [9]:
# --- Training setup -------------------------------------------------------
model.train()
metrics_loss = list()  # one scalar loss value per optimisation step
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0)

# --- Optimisation ---------------------------------------------------------
# NOTE(review): `batch_size=None` presumably yields the whole dataset as a
# single batch per epoch — confirm against TorchTabularDataset.to_bathes.
for batch in dataset.to_bathes(batch_size=None, shuffle=False, num_epochs=10_000):
    # Clear gradients left over from the previous step before the new pass.
    optimizer.zero_grad()

    # `train_size` is the full dataset length; presumably used by the loss to
    # scale one of its terms — TODO confirm in BayesianBinaryPerceptrone.loss.
    loss = model.loss(
        x={"features_numeric": batch.features_numeric},
        target=batch.target,
        train_size=len(dataset),
    )

    loss.backward()
    optimizer.step()

    # Store a plain Python float rather than the tensor itself.
    metrics_loss.append(loss.item())

In [10]:
# Display the per-step loss values collected by the training loop.
metrics_loss

[0.8608952760696411,
 0.8500964641571045,
 0.8424680829048157,
 0.8287389278411865,
 0.8167031407356262,
 0.8108478784561157,
 0.7993978261947632,
 0.7933558225631714,
 0.7838083505630493,
 0.7761068344116211,
 0.7664390802383423,
 0.7584642171859741,
 0.7531964778900146,
 0.7473591566085815,
 0.7375915050506592,
 0.7331262230873108,
 0.7245438098907471,
 0.7212719321250916,
 0.715273916721344,
 0.7059394121170044,
 0.701233983039856,
 0.6967839002609253,
 0.6907032132148743,
 0.686408519744873,
 0.6804814338684082,
 0.6738963723182678,
 0.6704106330871582,
 0.6674461960792542,
 0.6627012491226196,
 0.6582456827163696,
 0.6537171006202698,
 0.6484609246253967,
 0.6451821327209473,
 0.6407706141471863,
 0.6357287168502808,
 0.6335614919662476,
 0.6294302940368652,
 0.6262003779411316,
 0.6229793429374695,
 0.6181735992431641,
 0.6141824126243591,
 0.6110886931419373,
 0.607780933380127,
 0.6062249541282654,
 0.6019524931907654,
 0.5989144444465637,
 0.5957467555999756,
 0.59261322021484