Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OSError: [Errno 12] Cannot allocate memory #98

Closed
qfgaohao opened this issue May 23, 2019 · 3 comments
Closed

OSError: [Errno 12] Cannot allocate memory #98

qfgaohao opened this issue May 23, 2019 · 3 comments

Comments

@qfgaohao
Copy link

I am running the notebook in this tutorial https://ax.dev/tutorials/tune_cnn.html .
I get the error shown in the title when I run the following code block.

best_parameters, values, experiment, model = optimize( parameters=[ {"name": "lr", "type": "range", "bounds": [1e-6, 0.4], "log_scale": True}, {"name": "momentum", "type": "range", "bounds": [0.0, 1.0]}, ], evaluation_function=train_evaluate, objective_name='accuracy', )

Changing 'total_trials' or the device type did not help.

The error stack is as follows. Thanks in advance!

[INFO 05-24 11:51:58] ax.service.utils.dispatch: Using Bayesian Optimization generation strategy. Iterations after 5 will take longer to generate due to model-fitting.
[INFO 05-24 11:51:58] ax.service.managed_loop: Started full optimization with 20 steps.
[INFO 05-24 11:51:58] ax.service.managed_loop: Running optimization trial 1...

OSError Traceback (most recent call last)
in
5 ],
6 evaluation_function=train_evaluate,
----> 7 objective_name='accuracy',
8 )

~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/service/managed_loop.py in optimize(parameters, evaluation_function, experiment_name, objective_name, minimize, parameter_constraints, outcome_constraints, total_trials, arms_per_trial, wait_time)
204 wait_time=wait_time,
205 )
--> 206 loop.full_run()
207 parameterization, values = loop.get_best_point()
208 return parameterization, values, loop.experiment, loop.get_current_model()

~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/service/managed_loop.py in full_run(self)
148 logger.info(f"Started full optimization with {num_steps} steps.")
149 for _ in range(num_steps):
--> 150 self.run_trial()
151 return self
152

~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/service/managed_loop.py in run_trial(self)
139 else: # pragma: no cover
140 raise ValueError(f"Invalid number of arms per trial: {arms_per_trial}")
--> 141 trial.fetch_data()
142 self.current_trial += 1
143

~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/core/base_trial.py in fetch_data(self, metrics, **kwargs)
257 """
258 return self.experiment._fetch_trial_data(
--> 259 trial_index=self.index, metrics=metrics, **kwargs
260 )
261

~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/core/simple_experiment.py in _fetch_trial_data(self, trial_index, metrics, **kwargs)
203 self, trial_index: int, metrics: Optional[List[Metric]] = None, **kwargs: Any
204 ) -> Data:
--> 205 return self.eval_trial(self.trials[trial_index])
206
207 @copy_doc(Experiment.add_tracking_metric)

~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/core/simple_experiment.py in eval_trial(self, trial)
117 trial.mark_running()
118 evaluations[not_none(trial.arm).name] = self.evaluation_function_outer(
--> 119 not_none(trial.arm).parameters, None
120 )
121 elif isinstance(trial, BatchTrial):

~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/core/simple_experiment.py in evaluation_function_outer(self, parameterization, weight)
174 if num_evaluation_function_params == 1:
175 # pyre-fixme[20]: Anonymous call expects argument $1.
--> 176 evaluation = self._evaluation_function(parameterization)
177 elif num_evaluation_function_params == 2:
178 evaluation = self._evaluation_function(parameterization, weight)

in train_evaluate(parameterization)
1 def train_evaluate(parameterization):
----> 2 net = train(train_loader=train_loader, parameters=parameterization, dtype=dtype, device=device)
3 return evaluate(
4 net=net,
5 data_loader=valid_loader,

~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/utils/tutorials/cnn_utils.py in train(train_loader, parameters, dtype, device)
126
127 # Train Network
--> 128 for inputs, labels in train_loader:
129 # move data to proper dtype and device
130 inputs = inputs.to(device=device)

~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/torch/utils/data/dataloader.py in iter(self)
191
192 def iter(self):
--> 193 return _DataLoaderIter(self)
194
195 def len(self):

~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/torch/utils/data/dataloader.py in init(self, loader)
467 # before it starts, and del tries to join but will get:
468 # AssertionError: can only join a started process.
--> 469 w.start()
470 self.index_queues.append(index_queue)
471 self.workers.append(w)

~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/process.py in start(self)
110 'daemonic processes are not allowed to have children'
111 _cleanup()
--> 112 self._popen = self._Popen(self)
113 self._sentinel = self._popen.sentinel
114 # Avoid a refcycle if the target function holds an indirect

~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/context.py in _Popen(process_obj)
221 @staticmethod
222 def _Popen(process_obj):
--> 223 return _default_context.get_context().Process._Popen(process_obj)
224
225 class DefaultContext(BaseContext):

~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/context.py in _Popen(process_obj)
275 def _Popen(process_obj):
276 from .popen_fork import Popen
--> 277 return Popen(process_obj)
278
279 class SpawnProcess(process.BaseProcess):

~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/popen_fork.py in init(self, process_obj)
18 self.returncode = None
19 self.finalizer = None
---> 20 self._launch(process_obj)
21
22 def duplicate_for_child(self, fd):

~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/popen_fork.py in _launch(self, process_obj)
68 code = 1
69 parent_r, child_w = os.pipe()
---> 70 self.pid = os.fork()
71 if self.pid == 0:
72 try:

OSError: [Errno 12] Cannot allocate memory

@lena-kashtelyan
Copy link
Contributor

lena-kashtelyan commented May 24, 2019

Hey, @qfgaohao! Have you, by any chance, trained a PyTorch CNN in a notebook on your machine before without errors? The error seems to be coming from the torch stack, so would be helpful to know if just the training works.

@qfgaohao
Copy link
Author

Not in this environment or with PyTorch 1.1. I will check it out. Thanks @lena-kashtelyan.

@qfgaohao
Copy link
Author

You are right, @lena-kashtelyan . Fixed. Thanks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants