Skip to content

Commit

Permalink
Improve FSDP + QLora (#25) [release]
Browse files Browse the repository at this point in the history
* Fix simple issues

* Default eval_accumulation_steps

* Filter model warnings

* Support FSDP activation checkpointing

* Refactor training code

* Reduce CPU RAM usage by compute_metrics

* Fix tests

* Fix tests

* Update RAM

* Add test_StrWithSeed
  • Loading branch information
AjayP13 committed Apr 30, 2024
1 parent 7c4ab24 commit 1312423
Show file tree
Hide file tree
Showing 25 changed files with 1,235 additions and 1,059 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "DataDreamer"
version = "0.31.0"
version = "0.32.0"
description = "Prompt. Generate Synthetic Data. Train & Align Models."
license = "MIT"
authors= [
Expand Down
2 changes: 1 addition & 1 deletion scripts/.cluster/slurm/_sbatch_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#SBATCH --output=.cluster/slurm/.last_job/submission.out
#SBATCH --ntasks 1
#SBATCH --cpus-per-task 16
#SBATCH --mem=10G
#SBATCH --mem=30G
#SBATCH --gpus=2

# Source the user's bashrc
Expand Down
10 changes: 9 additions & 1 deletion src/_cachable/_cachable.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def _notify_adaptive_batch_sizing(model_logger: Logger, progress_state: dict[str
class _StrWithSeed(str):
seed: Any

def __new__(cls, value: str, seed: "Any | _StrWithSeed"):
def __new__(cls, value: str, seed: "Any | _StrWithSeed" = None):
    """Create the str instance and attach a ``seed`` attribute.

    The default of ``None`` lets unpickling reconstruct the string via
    ``str.__getnewargs__`` without a seed argument; the seed is then
    restored separately by ``__setstate__``.
    """
    instance = str.__new__(cls, value)
    if isinstance(seed, _StrWithSeed):
        # Copying from another _StrWithSeed: carry over its seed value.
        instance.seed = seed.seed
    else:
        instance.seed = seed
    return instance
Expand All @@ -75,6 +75,14 @@ def __eq__(self, __value: object) -> bool:
def __hash__(self):
    # Hash on (seed, text) so equal text with different seeds hashes
    # differently, matching the __eq__ contract.
    key = (self.seed, str(self))
    return hash(key)

def __getstate__(self):
    """Return picklable state: the raw text and the seed."""
    # The text also travels through str.__getnewargs__ during pickling;
    # it is kept here as well to preserve the existing state format.
    return {"str": str(self), "seed": self.seed}

def __setstate__(self, state):
    """Restore the seed after unpickling.

    Only ``seed`` is restored here; the string value itself is rebuilt
    by ``str.__new__`` during unpickling. NOTE(review): the ``"str"``
    entry written by ``__getstate__`` is unused here — confirm that is
    intentional.
    """
    self.seed = state["seed"]

@staticmethod
def total_per_input_seeds(inputs: list["str | _StrWithSeed"]) -> int:
return sum(
Expand Down
4 changes: 4 additions & 0 deletions src/embedders/sentence_transformers_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
)
from ..utils.hf_model_utils import (
convert_dtype,
filter_model_warnings,
get_model_max_context_length,
get_tokenizer,
)
Expand Down Expand Up @@ -122,6 +123,9 @@ def model(self) -> SentenceTransformer:
# torch._dynamo.config.suppress_errors = True
# model = torch.compile(model)

# Filter any warnings from the model
filter_model_warnings()

# Finished loading
log_if_timeout.stop(
partial(
Expand Down
13 changes: 4 additions & 9 deletions src/llms/hf_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
HF_TRANSFORMERS_CITATION,
PEFT_CITATION,
convert_dtype,
filter_model_warnings,
get_attn_implementation,
get_config,
get_model_max_context_length,
Expand Down Expand Up @@ -273,6 +274,9 @@ def model(self) -> PreTrainedModel:
torch._dynamo.config.suppress_errors = True
model = torch.compile(model)

# Filter any warnings from the model
filter_model_warnings()

# Finished loading
log_if_timeout.stop(
partial(
Expand Down Expand Up @@ -323,15 +327,6 @@ def count_tokens(self, value: str) -> int:
Returns:
The number of tokens in the string.
"""
pass
"""_summary_
Args:
value (_type_): _description_
Returns:
_type_: _description_
"""
return len(self.tokenizer.encode(value))

@torch.no_grad()
Expand Down
4 changes: 4 additions & 0 deletions src/llms/petals.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from ..utils.arg_utils import AUTO, Default
from ..utils.background_utils import RunIfTimeout
from ..utils.fs_utils import safe_fn
from ..utils.hf_model_utils import filter_model_warnings
from ..utils.import_utils import ignore_hivemind_warnings, ignore_transformers_warnings
from .hf_transformers import HFTransformers

Expand Down Expand Up @@ -161,6 +162,9 @@ def model(self) -> PreTrainedModel:
# torch._dynamo.config.suppress_errors = True
# model = torch.compile(model)

# Filter any warnings from the model
filter_model_warnings()

# Finished loading
log_if_timeout.stop(
partial(
Expand Down
4 changes: 4 additions & 0 deletions src/task_models/hf_classification_task_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
HF_TRANSFORMERS_CITATION,
PEFT_CITATION,
convert_dtype,
filter_model_warnings,
get_config,
get_model_max_context_length,
get_tokenizer,
Expand Down Expand Up @@ -152,6 +153,9 @@ def model(self) -> PreTrainedModel:
# torch._dynamo.config.suppress_errors = True
# model = torch.compile(model)

# Filter any warnings from the model
filter_model_warnings()

# Finished loading
log_if_timeout.stop(
partial(
Expand Down
22 changes: 21 additions & 1 deletion src/tests/llms/test_llms.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@
from time import sleep
from types import GeneratorType

import dill
import psutil
import pytest
import torch
from flaky import flaky
from sortedcontainers import SortedDict

from ... import DataDreamer
from ..._cachable._cachable import _is_primitive
from ..._cachable._cachable import _is_primitive, _StrWithSeed
from ...llms import (
AI21,
VLLM,
Expand Down Expand Up @@ -338,6 +339,25 @@ def test_is_primitive(self):
assert _is_primitive({"foo": 5})
assert not _is_primitive({"foo": object()})

def test_StrWithSeed(self):
    # Same text with different seeds must compare/hash as distinct;
    # same text with the same seed must hash equal, even across a
    # dill round-trip.
    one = _StrWithSeed("hello", seed=1)
    two = _StrWithSeed("hello", seed=2)
    one_again = _StrWithSeed("hello", seed=1)
    for tagged in (one, two, one_again):
        assert isinstance(tagged, str)
        assert str(tagged) == "hello"
    assert one.seed == 1
    assert two.seed == 2
    assert one_again.seed == 1
    assert hash(one) != hash(two)
    assert hash(one) == hash(one_again)
    # Pickling via dill must preserve the (seed, text) hash identity.
    assert hash(one) == hash(dill.loads(dill.dumps(one_again)))

def test_check_temperature_and_top_p(self):
assert _check_temperature_and_top_p(
temperature=0.3,
Expand Down
5 changes: 3 additions & 2 deletions src/tests/trainers/test_distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
TrainHFPPO,
TrainSentenceTransformer,
)
from ...trainers._train_hf_base import CustomDataCollatorWithPadding
from ...utils.arg_utils import AUTO
from ...utils.hf_model_utils import get_orig_model, is_bnb_quantized
from ...utils.hf_training_utils import CustomDataCollatorWithPadding
from ...utils.import_utils import ignore_transformers_warnings

with ignore_transformers_warnings():
Expand Down Expand Up @@ -422,12 +422,13 @@ def test_fsdp_peft(self, qlora, create_datadreamer, mocker):
validation_output=val_dataset.output["outputs"],
epochs=1,
batch_size=8,
gradient_checkpointing=qlora,
)
assert data_collator_spy.call_count == 0
trainer_path = cast(str, trainer._output_folder_path)
with open(os.path.join(trainer_path, "fingerprint.json"), "r") as f:
assert (
json.load(f) == "ce4179deefbddefd" if qlora else "6b385aca0ce684b3"
json.load(f) == "42a7bd193f804a4a" if qlora else "6b385aca0ce684b3"
)
assert train_result is trainer
assert (
Expand Down
2 changes: 1 addition & 1 deletion src/tests/trainers/test_trainers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
TrainSentenceTransformer,
TrainSetFitClassifier,
)
from ...trainers._train_hf_base import CustomDataCollatorWithPadding
from ...utils.fs_utils import clear_dir
from ...utils.hf_model_utils import get_orig_model, validate_peft_config
from ...utils.hf_training_utils import CustomDataCollatorWithPadding
from ...utils.import_utils import ignore_transformers_warnings

with ignore_transformers_warnings():
Expand Down
5 changes: 4 additions & 1 deletion src/tests/utils/test_device_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,7 @@ def test_get_device_env_variables(self):
get_device_env_variables([0, 2, 999999, 0, 1, -1, -1])
with pytest.raises(AssertionError):
get_device_env_variables([0, 2, 0, 1])
assert get_device_env_variables([0, 2, 1]) == {"CUDA_VISIBLE_DEVICES": "6,3,4"}
assert get_device_env_variables([0, 2, 1]) == {
"CUDA_VISIBLE_DEVICES": "6,3,4",
"NCCL_P2P_DISABLE": "1",
}

0 comments on commit 1312423

Please sign in to comment.