In [1]:
import os
import random
from typing import Dict

import numpy as np
import polars as pl
import torch
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from src.data.preprocessing.preprocessor import Preprocessor, RuleTransform
from src.data.torch_tabular_dataset import TorchTabularDataset
from src.nn import BayesianBinaryPerceptrone

# Reproducibility setup: one seed shared by every RNG used in this notebook.
RANDOM_SEED = 0

# Must be set before the first cuBLAS call; PyTorch's deterministic mode
# requires a fixed cuBLAS workspace configuration on CUDA.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"

# Seed torch, the stdlib RNG and NumPy's legacy global RNG, in that order.
for _seed_fn in (torch.random.manual_seed, random.seed, np.random.seed):
    _seed_fn(RANDOM_SEED)

# Fail loudly on non-deterministic ops and disable cuDNN autotuning.
torch.use_deterministic_algorithms(True)
torch.backends.cudnn.benchmark = False

In [2]:
def polars_dict_mapper(df: pl.DataFrame, map_dict: Dict) -> pl.DataFrame:
    """Recode columns of ``df`` using per-column lookup tables.

    For every column named in ``map_dict`` a chained ``when/then`` expression
    is built that replaces each key with its mapped value. No ``otherwise``
    branch is added, so values missing from a lookup table become null.

    Args:
        df: Frame whose columns are recoded.
        map_dict: ``{column_name: {old_value: new_value}}``. A column whose
            lookup table is empty is left untouched.

    Returns:
        The result of ``df.with_columns`` applied to the recoding expressions.
    """
    rules = []
    for column, lookup in map_dict.items():
        if not lookup:
            # Nothing to recode: previously this fell through to
            # ``None.alias(...)`` and raised AttributeError.
            continue
        rule = None
        for key, value in lookup.items():
            if isinstance(value, str):
                # ``then("x")`` would be parsed as a reference to column "x";
                # wrap string replacements so they are treated as literals.
                value = pl.lit(value)
            chain_root = pl if rule is None else rule
            rule = chain_root.when(pl.col(column) == key).then(value)
        rules.append(rule.alias(column))
    return df.with_columns(*rules)


def gini_score(
    y_true,
    y_score,
) -> np.float64:
    """Return the Gini coefficient derived from ROC AUC.

    The AUC is shifted by 0.5 and doubled, so an AUC of 0.5 maps to 0 and an
    AUC of 1.0 maps to 1.

    Args:
        y_true: Ground-truth labels, forwarded to ``roc_auc_score``.
        y_score: Predicted scores, forwarded to ``roc_auc_score``.
    """
    auc = roc_auc_score(y_true=y_true, y_score=y_score)
    centered_auc = auc - 0.5
    return centered_auc * 2

In [3]:
def prepare_heart_dataset(df: pl.DataFrame) -> pl.DataFrame:
    """Replace the ordinal string columns with integer ranks.

    ``AgeCategory`` and ``GenHealth`` are recoded to 0-based integers that
    follow the order of the label tuples below (position == code).

    Args:
        df: Raw heart-disease frame containing both columns.

    Returns:
        The frame produced by ``polars_dict_mapper`` with both columns recoded.
    """
    age_brackets = (
        "18-24",
        "25-29",
        "30-34",
        "35-39",
        "40-44",
        "45-49",
        "50-54",
        "55-59",
        "60-64",
        "65-69",
        "70-74",
        "75-79",
        "80 or older",
    )
    health_levels = ("Poor", "Fair", "Good", "Very good", "Excellent")

    ordinal_codes = {
        "AgeCategory": {label: rank for rank, label in enumerate(age_brackets)},
        "GenHealth": {label: rank for rank, label in enumerate(health_levels)},
    }
    return polars_dict_mapper(df=df, map_dict=ordinal_codes)

In [4]:
# Preprocessing pipeline, declared as (transformer name, columns) pairs and
# expanded into RuleTransform objects in the listed order.
# NOTE(review): what each named transformer does is defined in
# src.data.preprocessing — confirm there before changing column assignments.
preprocessor = Preprocessor.from_rules(
    *(
        RuleTransform(transformer=transformer_name, columns=column_names)
        for transformer_name, column_names in (
            # Target column.
            ("BinaryTarget", ["HeartDisease"]),
            # Numeric features, including the two ordinal columns that
            # prepare_heart_dataset recodes to integers.
            (
                "FeaturePowerTransformer",
                [
                    "BMI",
                    "PhysicalHealth",
                    "MentalHealth",
                    "AgeCategory",
                    "GenHealth",
                    "SleepTime",
                ],
            ),
            (
                "BinaryEncoder",
                [
                    "Smoking",
                    "AlcoholDrinking",
                    "Stroke",
                    "DiffWalking",
                    "Sex",
                    "Diabetic",
                    "PhysicalActivity",
                    "Asthma",
                    "KidneyDisease",
                    "SkinCancer",
                ],
            ),
            ("OneHotEncoder", ["Race"]),
        )
    )
)

In [5]:
# Load the raw table and hold out 20% of the rows for evaluation.
df = pl.read_csv("../data/heart_2020_cleaned.csv")
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

# Ordinal recoding is a fixed mapping, so it is applied to each split as-is.
df_prepared_train, df_prepared_test = (
    prepare_heart_dataset(df=split) for split in (df_train, df_test)
)

# The preprocessor is fitted on the training split only; the test split goes
# through transform() with that fitted preprocessor.
df_train_transformed = preprocessor.fit_transform(data=df_prepared_train)
df_test_transformed = preprocessor.transform(data=df_prepared_test)

# Wrap both splits as torch datasets on the GPU (requires a CUDA device).
dataset_train, dataset_test = (
    TorchTabularDataset(df=frame, metadata=preprocessor.metadata).cuda()
    for frame in (df_train_transformed, df_test_transformed)
)

In [6]:
# Build the classifier and move it to the GPU (requires a CUDA device).
# - name_out is the key later used to read predictions from the model output.
# - name_target is the key later used to read labels from batch.target.
# - dim_in is the size of the last axis of the training numeric features.
# - dims_hidden holds a single entry, 32.
# NOTE(review): presumably dims_hidden lists hidden-layer widths — confirm in src.nn.
model = BayesianBinaryPerceptrone(
    name_out="p_heart_disease",
    name_target="HeartDisease_BinaryTarget_No_Yes",
    dim_in=dataset_train.data.features_numeric.shape[-1],
    dims_hidden=[
        32,
    ],
).cuda()

In [7]:
# Run the model's own init routine on the full training feature tensor for
# 1000 epochs and keep whatever it returns in metrics_init.
# NOTE(review): the captured warning below points at `metrics.append(kl.item())`,
# so this presumably records a KL value per step — confirm in src.nn.
metrics_init = model.init(
    features=dataset_train.data.features_numeric,
    num_epoch=1000,
)

Consider using tensor.detach() first. (Triggered internally at /pytorch/aten/src/ATen/native/Scalar.cpp:22.)
  metrics.append(kl.item())


In [8]:
# Training loop: 5000 passes over dataset_train, one optimizer step per
# yielded batch, with the loss of every step recorded in metrics_loss.
# NOTE(review): batch_size=None presumably yields the whole training set per
# step, which is why the loop index is treated as an epoch number — confirm.
metrics_loss = []

# Step index -> learning rate of the Adam optimizer created at that step.
# Each milestone builds a NEW optimizer, so Adam's running moment estimates
# are discarded at steps 1000 and 2000.
# NOTE(review): the rate rises again (0.001 -> 0.005) at step 2000; confirm
# this restart is intentional.
lr_milestones = {0: 0.01, 1_000: 0.001, 2_000: 0.005}

model.train()
for epoch_num, batch in enumerate(
    dataset_train.to_bathes(batch_size=None, shuffle=False, num_epochs=5_000)
):
    if epoch_num in lr_milestones:
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=lr_milestones[epoch_num],
            weight_decay=0,
        )

    optimizer.zero_grad()
    loss = model.loss(
        features=batch.features_numeric,
        target=batch.target,
        train_size=len(dataset_train),
    )
    loss.backward()
    optimizer.step()

    metrics_loss.append(loss.item())

In [9]:
# Score the held-out split in batches of 2048 rows and collect per-row
# predictions with their labels.
result = {"score": [], "target": []}

model.eval()
with torch.no_grad():
    for batch in dataset_test.to_bathes(batch_size=2048, shuffle=False, num_epochs=1):
        features = batch.features_numeric
        target = batch.target["HeartDisease_BinaryTarget_No_Yes"].value.int().tolist()

        # Add a leading axis of size 1000 (a broadcast view, no copy) so the
        # model is evaluated 1000 times per row in a single call.
        # NOTE(review): presumably each slice along that axis is an
        # independent stochastic forward pass — confirm in src.nn.
        features_replicated = features.view(1, *features.shape).expand(
            1000, *features.shape
        )
        predictions = model(features_replicated)["p_heart_disease"]

        # Average over the replicated axis, then flatten to one score per row.
        score = predictions.mean(0).view(-1).tolist()

        result["score"] += score
        result["target"] += target

df_result = pl.DataFrame(result)

In [10]:
# Gini coefficient of the averaged predictions on the held-out split; left as
# the last expression so the notebook displays the value.
gini_score(y_score=df_result["score"], y_true=df_result["target"])

np.float64(0.6850475100527473)