RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED Chapter 1 #9

Closed

rbavery opened this issue Feb 29, 2020 · 1 comment

@rbavery commented Feb 29, 2020

When I run the first text classifier example:

#id training2
#caption Training loop in a text application
from fastai2.text.all import *

dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
learn.fine_tune(4, 1e-2)

The network trains on my 1080 Ti for a few minutes with the GPU heavily utilized; midway through the second epoch it fails:


epoch | train_loss | valid_loss | accuracy | time
-- | -- | -- | -- | --
0 | 0.597115 | 0.438984 | 0.804160 | 01:34


epoch | train_loss | valid_loss | accuracy | time
-- | -- | -- | -- | --
0 | 0.394730 |  |  | 00:40

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-15-772d141f2ac2> in <module>
      5 dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')
      6 learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
----> 7 learn.fine_tune(4, 1e-2)

~/fastai2/fastai2/callback/schedule.py in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
    157     self.fit_one_cycle(freeze_epochs, slice(base_lr*2), pct_start=0.99, **kwargs)
    158     self.unfreeze()
--> 159     self.fit_one_cycle(epochs, slice(base_lr/lr_mult, base_lr), pct_start=pct_start, div=div, **kwargs)
    160 
    161 # Cell

~/fastai2/fastai2/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    110     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    111               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 112     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    113 
    114 # Cell

~/fastai2/fastai2/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    174                     try:
    175                         self.epoch=epoch;          self('begin_epoch')
--> 176                         self._do_epoch_train()
    177                         self._do_epoch_validate()
    178                     except CancelEpochException:   self('after_cancel_epoch')

~/fastai2/fastai2/learner.py in _do_epoch_train(self)
    147         try:
    148             self.dl = self.dls.train;                        self('begin_train')
--> 149             self.all_batches()
    150         except CancelTrainException:                         self('after_cancel_train')
    151         finally:                                             self('after_train')

~/fastai2/fastai2/learner.py in all_batches(self)
    125     def all_batches(self):
    126         self.n_iter = len(self.dl)
--> 127         for o in enumerate(self.dl): self.one_batch(*o)
    128 
    129     def one_batch(self, i, b):

~/fastai2/fastai2/learner.py in one_batch(self, i, b)
    135             self.loss = self.loss_func(self.pred, *self.yb); self('after_loss')
    136             if not self.training: return
--> 137             self.loss.backward();                            self('after_backward')
    138             self.opt.step();                                 self('after_step')
    139             self.opt.zero_grad()

~/miniconda3/envs/data-science-stack-2.1.0/lib/python3.7/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    193                 products. Defaults to ``False``.
    194         """
--> 195         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    196 
    197     def register_hook(self, hook):

~/miniconda3/envs/data-science-stack-2.1.0/lib/python3.7/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     97     Variable._execution_engine.run_backward(
     98         tensors, grad_tensors, retain_graph, create_graph,
---> 99         allow_unreachable=True)  # allow_unreachable flag
    100 
    101 

RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

If I try to rerun the cell, it fails immediately with:


epoch | train_loss | valid_loss | accuracy | time
-- | -- | -- | -- | --
0 | 0.975069 |  |  | 00:01

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-16-772d141f2ac2> in <module>
      5 dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')
      6 learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
----> 7 learn.fine_tune(4, 1e-2)

~/fastai2/fastai2/callback/schedule.py in fine_tune(self, epochs, base_lr, freeze_epochs, lr_mult, pct_start, div, **kwargs)
    155     "Fine tune with `freeze` for `freeze_epochs` then with `unfreeze` from `epochs` using discriminative LR"
    156     self.freeze()
--> 157     self.fit_one_cycle(freeze_epochs, slice(base_lr*2), pct_start=0.99, **kwargs)
    158     self.unfreeze()
    159     self.fit_one_cycle(epochs, slice(base_lr/lr_mult, base_lr), pct_start=pct_start, div=div, **kwargs)

~/fastai2/fastai2/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    110     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    111               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 112     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    113 
    114 # Cell

~/fastai2/fastai2/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    174                     try:
    175                         self.epoch=epoch;          self('begin_epoch')
--> 176                         self._do_epoch_train()
    177                         self._do_epoch_validate()
    178                     except CancelEpochException:   self('after_cancel_epoch')

~/fastai2/fastai2/learner.py in _do_epoch_train(self)
    147         try:
    148             self.dl = self.dls.train;                        self('begin_train')
--> 149             self.all_batches()
    150         except CancelTrainException:                         self('after_cancel_train')
    151         finally:                                             self('after_train')

~/fastai2/fastai2/learner.py in all_batches(self)
    125     def all_batches(self):
    126         self.n_iter = len(self.dl)
--> 127         for o in enumerate(self.dl): self.one_batch(*o)
    128 
    129     def one_batch(self, i, b):

~/fastai2/fastai2/learner.py in one_batch(self, i, b)
    135             self.loss = self.loss_func(self.pred, *self.yb); self('after_loss')
    136             if not self.training: return
--> 137             self.loss.backward();                            self('after_backward')
    138             self.opt.step();                                 self('after_step')
    139             self.opt.zero_grad()

~/miniconda3/envs/data-science-stack-2.1.0/lib/python3.7/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    193                 products. Defaults to ``False``.
    194         """
--> 195         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    196 
    197     def register_hook(self, hook):

~/miniconda3/envs/data-science-stack-2.1.0/lib/python3.7/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     97     Variable._execution_engine.run_backward(
     98         tensors, grad_tensors, retain_graph, create_graph,
---> 99         allow_unreachable=True)  # allow_unreachable flag
    100 
    101 

RuntimeError: cuda runtime error (400) : invalid resource handle at /opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/generic/THCTensorMath.cu:35
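
In case it helps with debugging: I understand CUDA kernel launches are asynchronous, so the Python frame these tracebacks point at isn't necessarily where the kernel actually failed, and once a CUDA error fires the context stays poisoned until the process restarts (which might explain the different error on the immediate rerun). A sketch of what I'd try to get a more accurate trace (I haven't captured that output yet):

# Force synchronous CUDA kernel launches so the traceback points at the
# op that actually failed. Must be set before CUDA is initialized.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

from fastai2.text.all import *
dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
learn.fine_tune(4, 1e-2)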

The strange thing is that I ran this cell yesterday with the same conda environment and everything worked.

I'm using CUDA 10.1.

→ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243
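
nvcc reports the system toolkit, which can differ from what PyTorch was built against, so for completeness this is how I'd check the versions PyTorch itself sees (standard torch APIs; I haven't pasted the output here):

import torch
print(torch.__version__)               # PyTorch build
print(torch.version.cuda)              # CUDA version PyTorch was compiled against
print(torch.backends.cudnn.version())  # cuDNN version actually in use
print(torch.cuda.get_device_name(0))   # should report the GTX 1080 Ti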

This is the output of nvidia-smi during training, right before the failure:

Sat Feb 29 11:17:50 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.59       Driver Version: 440.59       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 108...  Off  | 00000000:01:00.0  On |                  N/A |
|  0%   49C    P2   198W / 280W |  11138MiB / 11177MiB |     96%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      1222      G   /usr/lib/xorg/Xorg                          1690MiB |
|    0      1904      G   /usr/bin/compiz                               41MiB |
|    0      3764      G   ...uest-channel-token=10439615552343491592    42MiB |
|    0      4391      C   ...nvs/data-science-stack-2.1.0/bin/python  9351MiB |
+-----------------------------------------------------------------------------+
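
One thing that stands out: the card is almost full (11138MiB of 11177MiB), with Xorg and compiz holding roughly 1.8GiB, so the training process only gets about 9.3GiB. If the cuDNN failure is really an out-of-memory condition surfacing indirectly, reducing the batch size should make it go away. A sketch of what I'd try next (bs=32 is a guess; as far as I know the bs keyword is forwarded through to the DataLoaders):

# Untested: reduce the batch size to lower peak GPU memory.
dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test', bs=32)

# Check how much memory PyTorch tensors are actually holding.
import torch
print(torch.cuda.memory_allocated() / 2**20, 'MiB currently allocated')
print(torch.cuda.max_memory_allocated() / 2**20, 'MiB peak since start')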

Thanks for making this prerelease available! Happy to provide more info if it helps.

@jph00 (Member) commented Feb 29, 2020

For help running notebooks, please ask on the forum: https://forums.fast.ai/c/fastai-users/fastai-v2

jph00 closed this as completed on Feb 29, 2020