In [143]:
# This notebook was prepared by Devdoot

from pytorch_lightning import Trainer
from pytorch_lightning.demos.boring_classes import BoringModel, BoringDataModule
from pytorch_lightning.callbacks import Callback

In [144]:
import pandas as pd

In [145]:
from pytorch_lightning.utilities.types import STEP_OUTPUT
from torch import Tensor
from typing import Optional, Any
import collections

In [146]:
class model(BoringModel):
    def __init__(self) -> None:
        super().__init__()
        self.loss_history = []
        self.curr_loss = collections.defaultdict(lambda: 0.0)

    def on_train_epoch_start(self, *args):
        print("----")

    def training_step(self, batch: Tensor, batch_idx: int) -> STEP_OUTPUT:
        loss_dct = {"loss": self.step(batch)}
        self.log_dict(loss_dct, on_epoch=True)
        for k,v in loss_dct.items():
            self.curr_loss[f"{k}"] += v
        return loss_dct

    def validation_step(self, batch: Tensor, batch_idx: int) -> Optional[STEP_OUTPUT]:
        loss_dct = {"loss_val": self.step(batch)}
        self.log_dict(loss_dct, on_epoch=True)
        if not self.trainer.sanity_checking:
            for k,v in loss_dct.items():
                self.curr_loss[f"{k}"] += v
        return loss_dct

    def on_validation_epoch_end(self, *args) -> None:        
        print(self.current_epoch, "OVEE", self.trainer.callback_metrics, self.trainer.num_training_batches)

        if self.trainer.sanity_checking:
            return
        
        loss_dict = {}
        for k,v in self.curr_loss.items():
            if k.startswith("loss"):
                loss_dict[k] = float(v/self.trainer.num_training_batches)
            else:
                loss_dict[k] = float(v/self.trainer.num_val_batches[0])
            self.curr_loss[k] = 0.0
        
        self.loss_history.append(loss_dict)
    
    def get_loss_history(self):
        return pd.DataFrame.from_records(self.loss_history)

In [147]:
class CollectLossHistory(Callback):
    def __init__(self):
        self.data_list = []
        self._trainer = None


    def on_validation_epoch_end(self, trainer, *args):
        self._trainer = trainer
        if not trainer.sanity_checking:
            print("CB logging", trainer.callback_metrics)
            self.data_list.append(trainer.callback_metrics.copy())
        else:
            print("Sanity checking. skip")

    def get_history(self):
        print("Normalizing", self._trainer.num_training_batches, self._trainer.num_val_batches)
        df = pd.DataFrame.from_records(self.data_list)
        df["loss"] /= self._trainer.num_training_batches
        return df

In [148]:
lmodel = model()
dm = BoringDataModule()
dm.setup(stage="fit")

In [149]:
loss_history_cb = CollectLossHistory()
trainer = Trainer(max_epochs=5,accelerator="gpu",callbacks=[loss_history_cb], enable_progress_bar=False, num_sanity_val_steps=0, limit_train_batches=1, limit_val_batches=1)
trainer.fit(lmodel, dm, )
loss_history = loss_history_cb.get_history()
loss_history_2 = lmodel.get_loss_history()

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1)` was configured so 1 batch per epoch will be used.
`Trainer(limit_val_batches=1)` was configured so 1 batch will be used.

  | Name  | Type   | Params
---------------------------------
0 | layer | Linear | 66    
---------------------------------
66        Trainable params
0         Non-trainable params
66        Total params
0.000     Total estimated model params size (MB)
`Trainer.fit` stopped: `max_epochs=5` reached.


----
CB logging {'loss': tensor(1.2727, device='mps:0'), 'loss_step': tensor(1.2727, device='mps:0'), 'loss_val': tensor(0.4627, device='mps:0')}
0 OVEE {'loss': tensor(1.2727, device='mps:0'), 'loss_step': tensor(1.2727, device='mps:0'), 'loss_val': tensor(0.4627, device='mps:0')} 1
----
CB logging {'loss': tensor(3.3194, device='mps:0'), 'loss_step': tensor(3.3194, device='mps:0'), 'loss_val': tensor(0.3913, device='mps:0'), 'loss_epoch': tensor(1.2727, device='mps:0')}
1 OVEE {'loss': tensor(3.3194, device='mps:0'), 'loss_step': tensor(3.3194, device='mps:0'), 'loss_val': tensor(0.3913, device='mps:0'), 'loss_epoch': tensor(1.2727, device='mps:0')} 1
----
CB logging {'loss': tensor(1.8103, device='mps:0'), 'loss_step': tensor(1.8103, device='mps:0'), 'loss_val': tensor(0.3884, device='mps:0'), 'loss_epoch': tensor(3.3194, device='mps:0')}
2 OVEE {'loss': tensor(1.8103, device='mps:0'), 'loss_step': tensor(1.8103, device='mps:0'), 'loss_val': tensor(0.3884, device='mps:0'), 'loss_epo

In [150]:
loss_history

Unnamed: 0,loss,loss_step,loss_val,loss_epoch
0,"tensor(1.2727, device='mps:0')","tensor(1.2727, device='mps:0')","tensor(0.4627, device='mps:0')",
1,"tensor(3.3194, device='mps:0')","tensor(3.3194, device='mps:0')","tensor(0.3913, device='mps:0')","tensor(1.2727, device='mps:0')"
2,"tensor(1.8103, device='mps:0')","tensor(1.8103, device='mps:0')","tensor(0.3884, device='mps:0')","tensor(3.3194, device='mps:0')"
3,"tensor(1.7169, device='mps:0')","tensor(1.7169, device='mps:0')","tensor(0.3881, device='mps:0')","tensor(1.8103, device='mps:0')"
4,"tensor(1.7079, device='mps:0')","tensor(1.7079, device='mps:0')","tensor(0.3881, device='mps:0')","tensor(1.7169, device='mps:0')"


In [151]:
loss_history_2

Unnamed: 0,loss,loss_val
0,1.27269,0.462687
1,3.319377,0.391279
2,1.81034,0.388408
3,1.716898,0.388146
4,1.70793,0.38812
