In [1]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

In [2]:
class TestDataset(Dataset):
    def __init__(self, X: torch.Tensor, y: torch.Tensor):
        self.X = X
        self.y = y
        self.n_samples = len(self.X)

    def __len__(self) -> int:
        return self.n_samples

    def __getitem__(self, idx) -> tuple:
        return self.X[idx], self.y[idx]


class Model(pl.LightningModule):
    def __init__(self, path: str, train_size: float = 0.8):
        super().__init__()
        data = pd.read_csv(path)

        # ordinal encode AgeCategory and GenHealth
        data['AgeCategory'] = OrdinalEncoder().fit_transform(data['AgeCategory'].values.reshape(-1, 1))
        data['GenHealth'] = data['GenHealth'].map({'Excellent': 0, 'Very good': 1, 'Good': 2, 'Fair': 3, 'Poor': 4})
        data['Diabetic'] = data['Diabetic'].replace({'Yes':2, 'No, borderline diabetes':1, 'Yes (during pregnancy)':0, 'No':0})
        # one-hot encode the other categorical variables
        cat_cols = [col for col in data.columns if data[col].dtype == 'object']
        self.data = pd.get_dummies(data, columns=cat_cols, drop_first=True)
        X = self.data.drop('HeartDisease_Yes', axis=1).values
        y = self.data['HeartDisease_Yes'].values
        # standardize the data
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        # split the data into train and test sets
        self.train_X, self.val_X, self.train_y, self.val_y = train_test_split(X, y, train_size=train_size)
        # oversample the minority class
        smote = SMOTE(sampling_strategy='minority')
        self.train_X, self.train_y = smote.fit_resample(self.train_X, self.train_y)
        
        # to tensor
        self.train_X = torch.from_numpy(self.train_X).float()
        self.train_y = torch.from_numpy(self.train_y).float().reshape(-1, 1)
        self.val_X = torch.from_numpy(self.val_X).float()
        self.val_y = torch.from_numpy(self.val_y).float().reshape(-1, 1)
        self.model = nn.Sequential(
            nn.LazyLinear(32),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.LazyLinear(16),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.LazyLinear(1),
            nn.Sigmoid()
        )

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.01)

    def forward(self, x):
        return self.model(x)

    def train_dataloader(self):
        return DataLoader(TestDataset(self.train_X, self.train_y), batch_size=16)

    def training_step(self, batch, batch_idx):
        X, y = batch
        y_hat = self.forward(X)
        loss = nn.BCELoss()(y_hat, y)
        return loss
    
    def dry_run(self):
        # passes a dummy input through the model to infer parameter shapes
        self.forward(self.train_X[0].reshape(1, -1))

In [3]:
model = Model('../data/heart_2020_cleaned.csv')
trainer = pl.Trainer(max_epochs=1, fast_dev_run=False)
model.dry_run()
trainer.fit(model)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 1.2 K 
-------------------------------------
1.2 K     Trainable params
0         Non-trainable params
1.2 K     Total params
0.005     Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0: 100%|██████████| 29236/29236 [03:01<00:00, 160.77it/s, loss=1.75e-08, v_num=12]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 29236/29236 [03:01<00:00, 160.76it/s, loss=1.75e-08, v_num=12]


In [4]:
td = TestDataset(model.val_X, model.val_y)
test_data = torch.tensor(td.X, dtype=torch.float)
preds = model(test_data)
# replace > 0.5 with 1 and < 0.5 with 0
preds = np.where(preds > 0.5, 1, 0)

true_labels = torch.tensor(td.y, dtype=torch.float).reshape(-1, 1)


print(classification_report(true_labels, preds))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00     58536
         1.0       0.08      1.00      0.16      5423

    accuracy                           0.08     63959
   macro avg       0.04      0.50      0.08     63959
weighted avg       0.01      0.08      0.01     63959



  test_data = torch.tensor(td.X, dtype=torch.float)
  true_labels = torch.tensor(td.y, dtype=torch.float).reshape(-1, 1)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
torch.save(model.state_dict(), '../models/model.pth')