facebookresearch · robertodessi · Jan 8, 2021 · Dec 10, 2020 · Jan 2, 2021 · Jan 6, 2021
diff --git a/egg/core/callbacks.py b/egg/core/callbacks.py
@@ -4,7 +4,10 @@
 # LICENSE file in the root directory of this source tree.
 
 import json
+import os
 import pathlib
+import re
+import sys
 import time
 from typing import Any, Dict, List, NamedTuple, Union
 
@@ -131,22 +134,24 @@ def __init__(
         checkpoint_path: Union[str, pathlib.Path],
         checkpoint_freq: int = 1,
         prefix: str = "",
+        max_checkpoints: int = sys.maxsize,
     ):
+        """Saves a checkpoint file for training.
+        :param checkpoint_path:  path to checkpoint directory, will be created if not present
+        :param checkpoint_freq:  Number of epochs for checkpoint saving
+        :param prefix: Name of checkpoint file, will be {prefix}{current_epoch}.tar
+        :param max_checkpoints: Max number of concurrent checkpoint files in the directory.
+        """
         self.checkpoint_path = pathlib.Path(checkpoint_path)
         self.checkpoint_freq = checkpoint_freq
         self.prefix = prefix
+        self.max_checkpoints = max_checkpoints
         self.epoch_counter = 0
 
     def on_epoch_end(self, loss: float, logs: Interaction, epoch: int):
+        """Record the finished epoch and save a periodic checkpoint.
+
+        A checkpoint is written when ``checkpoint_freq`` is positive and
+        ``epoch`` is a multiple of it. The file name passed on is
+        ``{prefix}_{epoch}`` when a prefix is set, otherwise just ``{epoch}``.
+        ``loss`` and ``logs`` are not used here.
+        """
         self.epoch_counter = epoch
-        if self.checkpoint_freq > 0 and (
-            self.epoch_counter % self.checkpoint_freq == 0
-        ):
-            filename = (
-                f"{self.prefix}_{self.epoch_counter}"
-                if self.prefix
-                else str(self.epoch_counter)
-            )
+        # A non-positive checkpoint_freq disables periodic saving altogether.
+        if self.checkpoint_freq > 0 and (epoch % self.checkpoint_freq == 0):
+            filename = f"{self.prefix}_{epoch}" if self.prefix else str(epoch)
             self.save_checkpoint(filename=filename)
 
     def on_train_end(self):
@@ -159,6 +164,8 @@ def save_checkpoint(self, filename: str):
         Saves the game, agents, and optimizer states to the checkpointing path under `<number_of_epochs>.tar` name
         """
         self.checkpoint_path.mkdir(exist_ok=True, parents=True)
+        if len(self.get_checkpoint_files()) > self.max_checkpoints:
+            self.remove_oldest_checkpoint()
         path = self.checkpoint_path / f"{filename}.tar"
         torch.save(self.get_checkpoint(), path)
 
@@ -169,6 +176,29 @@ def get_checkpoint(self):
             optimizer_state_dict=self.trainer.optimizer.state_dict(),
         )
 
+    def get_checkpoint_files(self):
+        """
+        Return the names of the checkpoint files in the checkpoint dir.
+
+        Only entries whose name ends with ``.tar`` are returned. A plain
+        substring test would also match names such as ``run.tar.gz`` or
+        ``5.tar.bak``, which would then be counted against the checkpoint
+        limit and could be picked for deletion.
+
+        :return: list of bare file names (not full paths), in the order
+            produced by ``os.listdir``
+        """
+        return [
+            name
+            for name in os.listdir(self.checkpoint_path)
+            if name.endswith(".tar")
+        ]
+
+    @staticmethod
+    def natural_sort(to_sort):
+        """
+        Sort a list of file names naturally (digit runs compare as numbers).
+        E.g. [file1,file4,file32,file2] -> [file1,file2,file4,file32]
+
+        Text parts are compared case-insensitively.
+
+        :param to_sort: iterable of file names (strings)
+        :return: a new sorted list; the input is not modified
+        """
+
+        def alphanum_key(name):
+            # Splitting on a capturing group alternates text, digits, text, ...
+            # so odd positions are exactly the ASCII digit runs. Using the
+            # position instead of str.isdigit() avoids int() failing on
+            # characters like a superscript two, and keeps every key with the
+            # same str/int layout so keys are always comparable.
+            pieces = re.split(r"([0-9]+)", name)
+            return [
+                int(piece) if index % 2 else piece.lower()
+                for index, piece in enumerate(pieces)
+            ]
+
+        return sorted(to_sort, key=alphanum_key)
+
+    def remove_oldest_checkpoint(self):
+        """
+        Remove the oldest checkpoint from the dir
+
+        "Oldest" is decided by name, not by timestamp: the checkpoint file
+        names are put in natural order and the first one is deleted.
+        Does nothing when the dir holds no checkpoint files.
+        """
+        checkpoints = self.natural_sort(self.get_checkpoint_files())
+        if not checkpoints:
+            # Nothing to prune; indexing an empty list would raise IndexError.
+            return
+        os.remove(os.path.join(self.checkpoint_path, checkpoints[0]))
+
 
 class InteractionSaver(Callback):
     def __init__(

diff --git a/tests/test_trainer.py b/tests/test_trainer.py
@@ -2,8 +2,6 @@
 
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
-
-
 import shutil
 import sys
 from pathlib import Path
@@ -111,6 +109,38 @@ def test_snapshoting():
     shutil.rmtree(CHECKPOINT_PATH)  # Clean-up
 
 
+def test_max_snapshoting():
+    """Check that CheckpointSaver prunes old checkpoints via max_checkpoints.
+
+    After 6 epochs with ``max_checkpoints=2`` the directory is expected to
+    hold exactly ``5.tar``, ``6.tar`` and ``final.tar``.
+    """
+    CHECKPOINT_PATH = Path("./test_checkpoints")
+    # Start from an empty directory: checkpoints left behind by an earlier
+    # (possibly failed) run would take part in pruning and in the file count.
+    shutil.rmtree(CHECKPOINT_PATH, ignore_errors=True)
+
+    core.init()
+    sender = core.GumbelSoftmaxWrapper(ToyAgent(), temperature=1)
+    receiver = Receiver()
+    loss = lambda sender_input, message, receiver_input, receiver_output, labels: (
+        F.cross_entropy(receiver_output, labels),
+        {},
+    )
+
+    game = core.SymbolGameGS(sender, receiver, loss)
+    optimizer = torch.optim.Adam(game.parameters())
+
+    data = Dataset()
+    trainer = core.Trainer(
+        game,
+        optimizer,
+        train_data=data,
+        validation_data=None,
+        callbacks=[
+            core.CheckpointSaver(checkpoint_path=CHECKPOINT_PATH, max_checkpoints=2)
+        ],
+    )
+    try:
+        trainer.train(n_epochs=6)
+        assert (CHECKPOINT_PATH / Path("5.tar")).exists()
+        assert (CHECKPOINT_PATH / Path("6.tar")).exists()
+        assert (CHECKPOINT_PATH / Path("final.tar")).exists()
+        assert len([x for x in CHECKPOINT_PATH.glob("**/*") if x.is_file()]) == 3
+    finally:
+        # Clean-up even when an assertion fails, so no files leak into other tests.
+        shutil.rmtree(CHECKPOINT_PATH, ignore_errors=True)
+    del trainer
+
+
 def test_early_stopping():
     game, data = MockGame(), Dataset()
     early_stopper = core.EarlyStopperAccuracy(threshold=0.9)