Training with learn.to_fp16() fails with PyTorch 1.9 / Cuda 11.4 #3438

Closed

svishnu88 opened this issue Jul 24, 2021 · 1 comment

@svishnu88

Hi, training with to_fp16 breaks in the latest version of PyTorch.

Versions

fastai: 2.4.1
PyTorch: 1.9.0
CUDA: 11.4

Describe the bug:

Unable to run learn.fine_tune with to_fp16() on PyTorch 1.9.0 using the code below.

from fastai.vision.all import *

path = untar_data(URLs.PETS)/'images'

# In the Pets dataset, cat images have filenames starting with an uppercase letter
def is_cat(x): return x[0].isupper()

dls = ImageDataLoaders.from_name_func(
    path, get_image_files(path), valid_pct=0.2, seed=42,
    label_func=is_cat, item_tfms=Resize(224))

learn = cnn_learner(dls, resnet34, metrics=error_rate)
learn = learn.to_fp16()  # enable mixed-precision training
learn.fine_tune(1)       # fails with the TypeError below

Error produced when running the above code:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_479/4101694703.py in <module>
      1 learn = learn.to_fp16()
----> 2 learn.fine_tune(1)

/opt/conda/lib/python3.8/site-packages/fastai/callback/schedule.py in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
    156     "Fine tune with `freeze` for `freeze_epochs` then with `unfreeze` from `epochs` using discriminative LR"
    157     self.freeze()
--> 158     self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
    159     base_lr /= 2
    160     self.unfreeze()

/opt/conda/lib/python3.8/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    111     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    112               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 113     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    114 
    115 # Cell

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    219             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    220             self.n_epoch = n_epoch
--> 221             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    222 
    223     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    161 
    162     def _with_events(self, f, event_type, ex, final=noop):
--> 163         try: self(f'before_{event_type}');  f()
    164         except ex: self(f'after_cancel_{event_type}')
    165         self(f'after_{event_type}');  final()

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _do_fit(self)
    210         for epoch in range(self.n_epoch):
    211             self.epoch=epoch
--> 212             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    213 
    214     def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    161 
    162     def _with_events(self, f, event_type, ex, final=noop):
--> 163         try: self(f'before_{event_type}');  f()
    164         except ex: self(f'after_cancel_{event_type}')
    165         self(f'after_{event_type}');  final()

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _do_epoch(self)
    204 
    205     def _do_epoch(self):
--> 206         self._do_epoch_train()
    207         self._do_epoch_validate()
    208 

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _do_epoch_train(self)
    196     def _do_epoch_train(self):
    197         self.dl = self.dls.train
--> 198         self._with_events(self.all_batches, 'train', CancelTrainException)
    199 
    200     def _do_epoch_validate(self, ds_idx=1, dl=None):

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    161 
    162     def _with_events(self, f, event_type, ex, final=noop):
--> 163         try: self(f'before_{event_type}');  f()
    164         except ex: self(f'after_cancel_{event_type}')
    165         self(f'after_{event_type}');  final()

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in all_batches(self)
    167     def all_batches(self):
    168         self.n_iter = len(self.dl)
--> 169         for o in enumerate(self.dl): self.one_batch(*o)
    170 
    171     def _do_one_batch(self):

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in one_batch(self, i, b)
    192         b = self._set_device(b)
    193         self._split(b)
--> 194         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    195 
    196     def _do_epoch_train(self):

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    161 
    162     def _with_events(self, f, event_type, ex, final=noop):
--> 163         try: self(f'before_{event_type}');  f()
    164         except ex: self(f'after_cancel_{event_type}')
    165         self(f'after_{event_type}');  final()

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _do_one_batch(self)
    175             self.loss_grad = self.loss_func(self.pred, *self.yb)
    176             self.loss = self.loss_grad.clone()
--> 177         self('after_loss')
    178         if not self.training or not len(self.yb): return
    179         self('before_backward')

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in __call__(self, event_name)
    139 
    140     def ordered_cbs(self, event): return [cb for cb in self.cbs.sorted('order') if hasattr(cb, event)]
--> 141     def __call__(self, event_name): L(event_name).map(self._call_one)
    142 
    143     def _call_one(self, event_name):

/opt/conda/lib/python3.8/site-packages/fastcore/foundation.py in map(self, f, gen, *args, **kwargs)
    152     def range(cls, a, b=None, step=None): return cls(range_of(a, b=b, step=step))
    153 
--> 154     def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))
    155     def argwhere(self, f, negate=False, **kwargs): return self._new(argwhere(self, f, negate, **kwargs))
    156     def filter(self, f=noop, negate=False, gen=False, **kwargs):

/opt/conda/lib/python3.8/site-packages/fastcore/basics.py in map_ex(iterable, f, gen, *args, **kwargs)
    664     res = map(g, iterable)
    665     if gen: return res
--> 666     return list(res)
    667 
    668 # Cell

/opt/conda/lib/python3.8/site-packages/fastcore/basics.py in __call__(self, *args, **kwargs)
    649             if isinstance(v,_Arg): kwargs[k] = args.pop(v.i)
    650         fargs = [args[x.i] if isinstance(x, _Arg) else x for x in self.pargs] + args[self.maxi+1:]
--> 651         return self.func(*fargs, **kwargs)
    652 
    653 # Cell

/opt/conda/lib/python3.8/site-packages/fastai/learner.py in _call_one(self, event_name)
    143     def _call_one(self, event_name):
    144         if not hasattr(event, event_name): raise Exception(f'missing {event_name}')
--> 145         for cb in self.cbs.sorted('order'): cb(event_name)
    146 
    147     def _bn_bias_state(self, with_bias): return norm_bias_params(self.model, with_bias).map(self.opt.state)

/opt/conda/lib/python3.8/site-packages/fastai/callback/core.py in __call__(self, event_name)
     43                (self.run_valid and not getattr(self, 'training', False)))
     44         res = None
---> 45         if self.run and _run: res = getattr(self, event_name, noop)()
     46         if event_name=='after_fit': self.run=True #Reset self.run to True at each end of fit
     47         return res

/opt/conda/lib/python3.8/site-packages/fastai/callback/fp16.py in after_loss(self)
     21     def after_pred(self):
     22         if listify(self.pred)[0].dtype==torch.float16: self.learn.pred = to_float(self.pred)
---> 23     def after_loss(self): self.autocast.__exit__()
     24     def before_backward(self): self.learn.loss_grad = self.scaler.scale(self.loss_grad)
     25     def before_step(self):

TypeError: __exit__() missing 3 required positional arguments: 'exc_type', 'exc_val', and 'exc_tb'
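
For context: the bare self.autocast.__exit__() call in fastai/callback/fp16.py is what fails. Python's context-manager protocol defines __exit__(exc_type, exc_val, exc_tb), and the traceback shows this PyTorch build declares all three parameters as required, so the argument-less call raises the TypeError. A minimal sketch of the protocol with torch.cuda.amp.autocast (nothing fastai-specific assumed):

import torch

# A context manager's __exit__ takes (exc_type, exc_val, exc_tb).
# Passing None for all three signals a clean exit, which is what a
# `with` statement does when no exception was raised.
autocast = torch.cuda.amp.autocast()
autocast.__enter__()
# ... the mixed-precision forward pass and loss computation go here ...
autocast.__exit__(None, None, None)  # explicit args; a bare __exit__() is what fails above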

@Dutta-SD

Hi, I tried reproducing your code in a Google Colab notebook and it seems to work fine. The CUDA version is different, however.

Could that be the source of the problem?
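
To compare the two environments, a quick version check (a sketch; note that torch.version.cuda reports the CUDA toolkit PyTorch was built against, which can differ from the system driver's CUDA version):

import torch
import fastai

# Versions relevant to this issue (2.4.1 / 1.9.0 / 11.4 in the failing setup)
print(fastai.__version__)  # fastai release
print(torch.__version__)   # PyTorch release
print(torch.version.cuda)  # CUDA version this PyTorch build was compiled with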

jph00 added the bug label on Aug 6, 2021
jph00 changed the title from "Training with learn.to_fp16() fails with PyTorch 1.9" to "Training with learn.to_fp16() fails with PyTorch 1.9 / Cuda 11.4" on Aug 6, 2021
jph00 closed this as completed in 7dc343b on Aug 6, 2021
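
The exact diff of 7dc343b isn't quoted here, but given the traceback, a workaround on affected versions presumably looks like the following sketch (an assumption, not the verbatim fix; MixedPrecision is the callback that the fp16.py traceback frames point at):

from fastai.callback.fp16 import MixedPrecision

# Hypothetical monkey-patch (assumption, not quoted from commit 7dc343b):
# pass the three context-manager protocol arguments explicitly so an
# autocast.__exit__ without default values accepts the call.
def after_loss(self): self.autocast.__exit__(None, None, None)
MixedPrecision.after_loss = after_loss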