diff --git a/CHANGELOG.md b/CHANGELOG.md index d12af22..fa20b15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,7 +43,13 @@ Docs: Features: - Added Adagrad (without grad clipping) as 32-bit and 8-bit block-wise optimizer. - Added AdamW (copy of Adam with weight decay init 1e-2). #10 + - Introduced ModuleConfig overrides which can seamlessly be used at initialization time of a module. + - Added `bnb.nn.Embedding` layer which runs at 32-bit but without the layernorm. This works well if you need to fine-tune pretrained models that do not have an embedding layer norm. #19 Bug fixes: - Fixed a bug where weight decay was incorrectly applied to 32-bit Adam. #13 - Fixed an unsafe use of eval. #8 + - Fixed a bug where the StableEmbedding layer 32-bit optimizer override would not work without registering the whole model first (`bnb.optim.GlobalOptimManager.get_instance().register_parameters(model.parameters())`). #13 #15 + +Docs: + - Added instructions on how to solve "\_\_fatbinwrap_" errors. diff --git a/README.md b/README.md index 4a731b0..4b7db17 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,7 @@ For upcoming features and changes and full history see [Patch Notes](CHANGELOG.m ## Errors 1. RuntimeError: CUDA error: no kernel image is available for execution on the device. [Solution](errors_and_solutions.md#No-kernel-image-available) +2. __fatbinwrap_.. [Solution](errors_and_solutions.md#fatbinwrap) ## Compile from source diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 177540f..27ad6ca 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,4 +2,4 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-from .modules import StableEmbedding +from .modules import StableEmbedding, Embedding diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index ce2f3a4..dc0a171 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -18,8 +18,7 @@ def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: Optiona sparse: bool = False, _weight: Optional[Tensor] = None) -> None: super(StableEmbedding, self).__init__(num_embeddings, embedding_dim, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse, _weight) self.norm = torch.nn.LayerNorm(embedding_dim) - GlobalOptimManager.get_instance().register_parameters(self.weight) - GlobalOptimManager.get_instance().override_config(self.weight, 'optim_bits', 32) + GlobalOptimManager.get_instance().register_module_override(self, 'weight', {'optim_bits': 32}) def reset_parameters(self) -> None: torch.nn.init.xavier_uniform_(self.weight) @@ -42,3 +41,33 @@ def forward(self, input: Tensor) -> Tensor: self.norm_type, self.scale_grad_by_freq, self.sparse) return self.norm(emb) + + +class Embedding(torch.nn.Embedding): + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None, + max_norm: Optional[float] = None, norm_type: float = 2., scale_grad_by_freq: bool = False, + sparse: bool = False, _weight: Optional[Tensor] = None) -> None: + super(Embedding, self).__init__(num_embeddings, embedding_dim, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse, _weight) + GlobalOptimManager.get_instance().register_module_override(self, 'weight', {'optim_bits': 32}) + + def reset_parameters(self) -> None: + torch.nn.init.xavier_uniform_(self.weight) + self._fill_padding_idx_with_zero() + + ''' !!! This is a redefinition of _fill_padding_idx_with_zero in torch.nn.Embedding + to make the Layer compatible with Pytorch < 1.9. + This means that if this changes in future PyTorch releases this need to change too + which is cumbersome. 
However, with this we can ensure compatibility with previous + PyTorch releases. + ''' + def _fill_padding_idx_with_zero(self) -> None: + if self.padding_idx is not None: + with torch.no_grad(): + self.weight[self.padding_idx].fill_(0) + + def forward(self, input: Tensor) -> Tensor: + emb = F.embedding( + input, self.weight, self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.sparse) + + return emb diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index cfbd72e..5a5bb1e 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -26,6 +26,7 @@ def initialize(self): self.index2config = {} self.optimizer = None self.uses_config_override = False + self.module_weight_config_triple = [] @classmethod def get_instance(cls): @@ -77,12 +78,16 @@ def override_config(self, parameters, key=None, value=None, key_value_dict=None) if id(p) in self.pid2config:self.pid2config[id(p)].update(key_value_dict) else: self.pid2config[id(p)] = key_value_dict + def register_module_override(self, module, param_name, config): + self.module_weight_config_triple.append((module, param_name, config)) + + class Optimizer8bit(torch.optim.Optimizer): def __init__(self, params, defaults, optim_bits=32): super(Optimizer8bit, self).__init__(params, defaults) - self.checked_if_on_gpu = False + self.initialized = False self.name2qmap = {} self.mng = GlobalOptimManager.get_instance() @@ -172,7 +177,6 @@ def update_group(group, new_group): self.__setstate__({'state': state, 'param_groups': param_groups}) def to_gpu(self): - self.checked_if_on_gpu = True for gindex, group in enumerate(self.param_groups): for pindex, p in enumerate(group['params']): if p in self.state: @@ -181,6 +185,23 @@ def to_gpu(self): if isinstance(v, torch.Tensor): self.state[p][k] = v.to(p.device) + def check_overrides(self): + for module, attr, config in self.mng.module_weight_config_triple: + pmodule = getattr(module, attr) + assert pmodule is not 
None + assert isinstance(pmodule, torch.Tensor) or isinstance(pmodule, torch.Parameter) + found = False + for gindex, group in enumerate(self.param_groups): + if found: break + for pindex, p in enumerate(group['params']): + if found: break + if id(p) == id(pmodule): + # found the matching parameter + # init override + self.mng.pid2config[id(p)] = config + self.mng.index2config[(gindex, pindex)] = self.mng.pid2config[id(p)] + found = True + @torch.no_grad() def step(self, closure=None): """Performs a single optimization step. @@ -196,7 +217,11 @@ def step(self, closure=None): overflows = [] - if not self.checked_if_on_gpu: self.to_gpu() # needed for fairseq pure fp16 training + if not self.initialized: + self.check_overrides() + self.to_gpu() # needed for fairseq pure fp16 training + self.initialized = True + for gindex, group in enumerate(self.param_groups): for pindex, p in enumerate(group['params']): if p.grad is None: diff --git a/errors_and_solutions.md b/errors_and_solutions.md index dd99f7c..5e8b2d2 100644 --- a/errors_and_solutions.md +++ b/errors_and_solutions.md @@ -6,3 +6,16 @@ If you are feeling lucky, you can also try to compile the library from source. T __If you encounter any other error not listed here please create an issue. This will help resolve your problem and will help out others in the future. + + +# fatbinwrap + +This error occurs if there is a mismatch between CUDA versions in the C++ library and the CUDA part. Make sure you have the right CUDA version in your $PATH and $LD_LIBRARY_PATH variables. In the conda base environment you can find the library under: +```bash +ls $CONDA_PREFIX/lib/*cudart* +``` +Make sure this path is appended to the `LD_LIBRARY_PATH` so bnb can find the CUDA runtime environment library (cudart). + +If this does not fix the issue, please try [compilation from source](compile_from_source.md) next. 
+ +If this does not work, please open an issue and paste the printed environment if you call `make` and the associated error when running bnb. diff --git a/howto_config_override.md b/howto_config_override.md index 11e9d49..4680776 100644 --- a/howto_config_override.md +++ b/howto_config_override.md @@ -2,6 +2,7 @@ If you want to optimize some unstable parameters with 32-bit Adam and others with 8-bit Adam, you can use the `GlobalOptimManager`. With this, we can also configure specific hyperparameters for particular layers, such as embedding layers. To do that, we need two things: (1) register the parameter while they are still on the CPU, (2) override the config with the new desired hyperparameters (anytime, anywhere). See our [guide](howto_config_override.md) for more details +For global overrides in many different places in your code you can do: ```python import torch import bitsandbytes as bnb @@ -24,3 +25,16 @@ mng.override_config([model.special.weight, model.also_special.weight], key_value_dict ={'is_sparse': True, 'lr': 1e-5, 'betas'=(0.9, 0.98)}) ``` Possible options for the config override are: `betas, eps, weight_decay, lr, optim_bits, min_8bit_size, percentile_clipping, block_wise, max_unorm` + +For overrides for particular layers we recommend overriding locally in each module. 
You can do this by passing the module, the attribute name of the parameter, and the override config to the GlobalOptimManager: +```python +class MyModule(torch.nn.Module): + def __init__(self, din, dout): + super(MyModule, self).__init__() + self.linear = torch.nn.Linear(din, dout) + # optimization will happen in 32-bit and + # learning rate will be set to 0.0001 independent of the main learning rate + config = {'optim_bits': 32, 'lr' : 0.0001} + GlobalOptimManager.get_instance().register_module_override(self.linear, 'weight', config) + +``` diff --git a/tests/test_modules.py b/tests/test_modules.py new file mode 100644 index 0000000..6cbee7b --- /dev/null +++ b/tests/test_modules.py @@ -0,0 +1,46 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import pytest +import torch +import bitsandbytes as bnb + +from itertools import product + +from bitsandbytes import functional as F + + +@pytest.mark.parametrize("embcls", [bnb.nn.Embedding, bnb.nn.StableEmbedding], ids=['Embedding', 'StableEmbedding']) +def test_embeddings(embcls): + bnb.optim.GlobalOptimManager.get_instance().initialize() + emb1 = torch.nn.Embedding(100, 512).cuda() + emb2 = embcls(100, 512).cuda() + + adam1 = bnb.optim.Adam8bit(emb1.parameters()) + adam2 = bnb.optim.Adam8bit(emb2.parameters()) + + batches = torch.randint(1, 100, size=(100, 4, 32)).cuda() + + for i in range(100): + batch = batches[i] + + embedded1 = emb1(batch) + embedded2 = emb2(batch) + + l1 = embedded1.mean() + l2 = embedded2.mean() + + l1.backward() + l2.backward() + + adam1.step() + adam2.step() + + adam1.zero_grad() + adam2.zero_grad() + + assert adam1.state[emb1.weight]['state1'].dtype == torch.uint8 + assert adam2.state[emb2.weight]['state1'].dtype == torch.float32 + +