In [27]:
import pandas as pd
import plotly.express as px
import lightning as L
from lightning.pytorch.loggers import CSVLogger
import torchmetrics as M
import torch
from sklearn.model_selection import train_test_split

In [28]:
# TODO
# 1. Add test (data and code). Done
# 2. Add good metrics. Done
# 3. Add flexibility
# 4. Add regularization

In [31]:
folder = "/srv/scratch/AMR/Reduced_genotype"
species = "Klebsiella_pneumoniae"
file = f"{folder}/{species}_reduced_genotype.tsv"
x = pd.read_csv(file, sep="\t", index_col=0)
y = pd.read_csv(
    f"/srv/scratch/AMR/IR_phenotype/{species}/phenotype.txt",
    sep="\t",
    index_col=0,
)
y = y.loc[x.index]

In [32]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_val, y_val, test_size=0.5, random_state=42)

In [33]:
train_dataloader = torch.utils.data.DataLoader(
    list(
        zip(
            torch.tensor(x_train.values, dtype=torch.float32),
            torch.tensor(y_train.values, dtype=torch.float32),
        )
    ),
    batch_size=32,
    shuffle=True,
)
val_dataloader = torch.utils.data.DataLoader(
    list(
        zip(
            torch.tensor(x_val.values, dtype=torch.float32),
            torch.tensor(y_val.values, dtype=torch.float32),
        )
    ),
    batch_size=32,
    shuffle=False,
)
test_dataloader = torch.utils.data.DataLoader(
    list(
        zip(
            torch.tensor(x_test.values, dtype=torch.float32),
            torch.tensor(y_test.values, dtype=torch.float32),
        )
    ),
    batch_size=32,
    shuffle=False,
)

In [39]:
class MyModel(L.LightningModule):
    def __init__(self, n_feats: int, dropout: float = 0.5):
        super().__init__()
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(n_feats, 64),  # layer 1
            torch.nn.ReLU(),  # activation function
            torch.nn.Dropout(dropout),  # dropout for regularization
            torch.nn.Linear(64, 32),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(32, 1),
        )

    def forward(self, x: torch.Tensor):
        return self.mlp(x)

    def shared_step(self, batch, step: str = "train"):
        x, y = batch
        y_hat = self.forward(x)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(y_hat, y)
        mcc = M.functional.matthews_corrcoef(
            y_hat.sigmoid() > 0.5,
            y.int(),
            num_classes=2,
            task="binary",
        )
        acc = M.functional.accuracy(
            y_hat.sigmoid() > 0.5,
            y.int(),
            num_classes=2,
            task="binary",
        )
        self.log(f"{step}_loss", loss, on_step=False, on_epoch=True)
        self.log(f"{step}_mcc", mcc, on_step=False, on_epoch=True)
        self.log(f"{step}_acc", acc, on_step=False, on_epoch=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self.shared_step(batch, step="train")

    def validation_step(self, batch, batch_idx):
        return self.shared_step(batch, step="val")

    def test_step(self, batch, batch_idx):
        return self.shared_step(batch, step="test")

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [40]:
model = MyModel(n_feats=x.shape[1])
csv_logger = CSVLogger("logs", name="my_model")
trainer = L.Trainer(max_epochs=50, accelerator="cpu", logger=csv_logger, enable_progress_bar=True)
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)
trainer.test(model, dataloaders=test_dataloader)


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.


  | Name | Type       | Params | Mode 
--------------------------------------------
0 | mlp  | Sequential | 93.5 K | train
--------------------------------------------
93.5 K    Trainable params
0         Non-trainable params
93.5 K    Total params
0.374     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


                                                                            


The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


The number of training batches (20) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Epoch 49: 100%|██████████| 20/20 [00:00<00:00, 127.24it/s, v_num=2]

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 20/20 [00:00<00:00, 122.53it/s, v_num=2]



The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.



Testing DataLoader 0: 100%|██████████| 3/3 [00:00<00:00, 202.89it/s]


[{'test_loss': 0.6855779886245728,
  'test_mcc': 0.709280788898468,
  'test_acc': 0.8607594966888428}]

In [None]:
import pandas as pd

metrics = pd.read_csv("logs/my_model/version_1/metrics.csv")
metrics = metrics.groupby("epoch").mean().reset_index()
metric = "mcc"
px.line(metrics, x="epoch", y=[f"train_{metric}", f"val_{metric}"]).show()