You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I am running the notebook in this tutorial https://ax.dev/tutorials/tune_cnn.html .
I get the error shown in the title when I run the following code block.
Changing 'total_trials' or the device type does not help.
The error stack is as follows. Thanks in advance!
[INFO 05-24 11:51:58] ax.service.utils.dispatch: Using Bayesian Optimization generation strategy. Iterations after 5 will take longer to generate due to model-fitting.
[INFO 05-24 11:51:58] ax.service.managed_loop: Started full optimization with 20 steps.
[INFO 05-24 11:51:58] ax.service.managed_loop: Running optimization trial 1...
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/service/managed_loop.py in full_run(self)
148 logger.info(f"Started full optimization with {num_steps} steps.")
149 for _ in range(num_steps):
--> 150 self.run_trial()
151 return self
152
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/service/managed_loop.py in run_trial(self)
139 else: # pragma: no cover
140 raise ValueError(f"Invalid number of arms per trial: {arms_per_trial}")
--> 141 trial.fetch_data()
142 self.current_trial += 1
143
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __init__(self, loader)
467 # before it starts, and __del__ tries to join but will get:
468 # AssertionError: can only join a started process.
--> 469 w.start()
470 self.index_queues.append(index_queue)
471 self.workers.append(w)
~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/process.py in start(self)
110 'daemonic processes are not allowed to have children'
111 _cleanup()
--> 112 self._popen = self._Popen(self)
113 self._sentinel = self._popen.sentinel
114 # Avoid a refcycle if the target function holds an indirect
~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/context.py in _Popen(process_obj)
221 @staticmethod
222 def _Popen(process_obj):
--> 223 return _default_context.get_context().Process._Popen(process_obj)
224
225 class DefaultContext(BaseContext):
~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/context.py in _Popen(process_obj)
275 def _Popen(process_obj):
276 from .popen_fork import Popen
--> 277 return Popen(process_obj)
278
279 class SpawnProcess(process.BaseProcess):
Hey, @qfgaohao! Have you, by any chance, trained a PyTorch CNN in a notebook on your machine before without errors? The error seems to be coming from the torch stack, so would be helpful to know if just the training works.
I am running the notebook in this tutorial https://ax.dev/tutorials/tune_cnn.html .
I get the error shown in the title when I run the following code block.
best_parameters, values, experiment, model = optimize( parameters=[ {"name": "lr", "type": "range", "bounds": [1e-6, 0.4], "log_scale": True}, {"name": "momentum", "type": "range", "bounds": [0.0, 1.0]}, ], evaluation_function=train_evaluate, objective_name='accuracy', )
Changing 'total_trials' or the device type does not help.
The error stack is as follows. Thanks in advance!
[INFO 05-24 11:51:58] ax.service.utils.dispatch: Using Bayesian Optimization generation strategy. Iterations after 5 will take longer to generate due to model-fitting.
[INFO 05-24 11:51:58] ax.service.managed_loop: Started full optimization with 20 steps.
[INFO 05-24 11:51:58] ax.service.managed_loop: Running optimization trial 1...
OSError Traceback (most recent call last)
in
5 ],
6 evaluation_function=train_evaluate,
----> 7 objective_name='accuracy',
8 )
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/service/managed_loop.py in optimize(parameters, evaluation_function, experiment_name, objective_name, minimize, parameter_constraints, outcome_constraints, total_trials, arms_per_trial, wait_time)
204 wait_time=wait_time,
205 )
--> 206 loop.full_run()
207 parameterization, values = loop.get_best_point()
208 return parameterization, values, loop.experiment, loop.get_current_model()
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/service/managed_loop.py in full_run(self)
148 logger.info(f"Started full optimization with {num_steps} steps.")
149 for _ in range(num_steps):
--> 150 self.run_trial()
151 return self
152
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/service/managed_loop.py in run_trial(self)
139 else: # pragma: no cover
140 raise ValueError(f"Invalid number of arms per trial: {arms_per_trial}")
--> 141 trial.fetch_data()
142 self.current_trial += 1
143
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/core/base_trial.py in fetch_data(self, metrics, **kwargs)
257 """
258 return self.experiment._fetch_trial_data(
--> 259 trial_index=self.index, metrics=metrics, **kwargs
260 )
261
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/core/simple_experiment.py in _fetch_trial_data(self, trial_index, metrics, **kwargs)
203 self, trial_index: int, metrics: Optional[List[Metric]] = None, **kwargs: Any
204 ) -> Data:
--> 205 return self.eval_trial(self.trials[trial_index])
206
207 @copy_doc(Experiment.add_tracking_metric)
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/core/simple_experiment.py in eval_trial(self, trial)
117 trial.mark_running()
118 evaluations[not_none(trial.arm).name] = self.evaluation_function_outer(
--> 119 not_none(trial.arm).parameters, None
120 )
121 elif isinstance(trial, BatchTrial):
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/core/simple_experiment.py in evaluation_function_outer(self, parameterization, weight)
174 if num_evaluation_function_params == 1:
175 # pyre-fixme[20]: Anonymous call expects argument `$1`.
--> 176 evaluation = self._evaluation_function(parameterization)
177 elif num_evaluation_function_params == 2:
178 evaluation = self._evaluation_function(parameterization, weight)
in train_evaluate(parameterization)
1 def train_evaluate(parameterization):
----> 2 net = train(train_loader=train_loader, parameters=parameterization, dtype=dtype, device=device)
3 return evaluate(
4 net=net,
5 data_loader=valid_loader,
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/ax/utils/tutorials/cnn_utils.py in train(train_loader, parameters, dtype, device)
126
127 # Train Network
--> 128 for inputs, labels in train_loader:
129 # move data to proper dtype and device
130 inputs = inputs.to(device=device)
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __iter__(self)
191
192 def __iter__(self):
--> 193 return _DataLoaderIter(self)
194
195 def __len__(self):
~/.local/anaconda3/envs/ax/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __init__(self, loader)
467 # before it starts, and __del__ tries to join but will get:
468 # AssertionError: can only join a started process.
--> 469 w.start()
470 self.index_queues.append(index_queue)
471 self.workers.append(w)
~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/process.py in start(self)
110 'daemonic processes are not allowed to have children'
111 _cleanup()
--> 112 self._popen = self._Popen(self)
113 self._sentinel = self._popen.sentinel
114 # Avoid a refcycle if the target function holds an indirect
~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/context.py in _Popen(process_obj)
221 @staticmethod
222 def _Popen(process_obj):
--> 223 return _default_context.get_context().Process._Popen(process_obj)
224
225 class DefaultContext(BaseContext):
~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/context.py in _Popen(process_obj)
275 def _Popen(process_obj):
276 from .popen_fork import Popen
--> 277 return Popen(process_obj)
278
279 class SpawnProcess(process.BaseProcess):
~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/popen_fork.py in __init__(self, process_obj)
18 self.returncode = None
19 self.finalizer = None
---> 20 self._launch(process_obj)
21
22 def duplicate_for_child(self, fd):
~/.local/anaconda3/envs/ax/lib/python3.7/multiprocessing/popen_fork.py in _launch(self, process_obj)
68 code = 1
69 parent_r, child_w = os.pipe()
---> 70 self.pid = os.fork()
71 if self.pid == 0:
72 try:
OSError: [Errno 12] Cannot allocate memory
The text was updated successfully, but these errors were encountered: