Skip to content

Commit

Permalink
Merge pull request #143 from datmo/gpu_support
Browse files Browse the repository at this point in the history
GPU Support (fix for windows build)
  • Loading branch information
asampat3090 committed May 18, 2018
2 parents b7e1e71 + bd5e967 commit a7d0861
Show file tree
Hide file tree
Showing 9 changed files with 93 additions and 17 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,5 @@ ENV/
# Rope project settings
.ropeproject
foo

.pytest_cache
1 change: 1 addition & 0 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ install:
- "%PYTHON%/python.exe C:/get-pip.py"
- "%PYTHON%/python.exe -m pip install pip==9.0.3"
- "%PYTHON%/Scripts/pip.exe install pytest"
- "%PYTHON%/Scripts/pip.exe install pypiwin32"
- "%PYTHON%/python.exe setup.py install"
test_script:
- "%PYTHON%/Scripts/pip.exe --version"
Expand Down
58 changes: 45 additions & 13 deletions datmo/core/controller/environment/driver/dockerenv.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
PathDoesNotExist, EnvironmentDoesNotExist, EnvironmentInitFailed,
EnvironmentExecutionException, FileAlreadyExistsException,
EnvironmentRequirementsCreateException, EnvironmentImageNotFound,
EnvironmentContainerNotFound)
EnvironmentContainerNotFound, GPUSupportNotEnabled)
from datmo.core.controller.environment.driver import EnvironmentDriver


Expand Down Expand Up @@ -152,12 +152,22 @@ def build(self, name, path):

# running daemon needed
def run(self, name, options, log_filepath):
    """Run a container from the image `name` and collect its logs.

    Parameters
    ----------
    name : str
        image name, forwarded to `run_container` as `image_name`
    options : dict
        keyword arguments for `run_container`. An optional "gpu" key is
        consumed here and never forwarded; when it is True the container
        is started with the `nvidia` runtime. The dict passed in is not
        modified.
    log_filepath : str
        forwarded to `log_container` as `filepath`

    Returns
    -------
    final_return_code
        `run_return_code and log_return_code`
    run_id
        second value returned by `run_container`
    logs
        second value returned by `log_container`

    Raises
    ------
    GPUSupportNotEnabled
        if options["gpu"] is True but `gpu_enabled()` returns False
    """
    # Work on a shallow copy so the caller's dict is left untouched.
    run_options = dict(options)
    gpu_requested = run_options.pop("gpu", None)

    # Probe only when GPU was actually requested: gpu_enabled() launches a
    # container itself and can raise, which must not affect non-GPU runs.
    if gpu_requested is True:
        if not self.gpu_enabled():
            raise GPUSupportNotEnabled('nvidia')
        run_options['runtime'] = 'nvidia'

    run_return_code, run_id = self.run_container(
        image_name=name, **run_options)

    log_return_code, logs = self.log_container(
        run_id, filepath=log_filepath)

    # NOTE(review): `and` yields run_return_code when it is falsy (0) and
    # log_return_code otherwise. If both are exit codes with 0 == success,
    # a failed run followed by a successful log read is reported as 0 --
    # confirm the intended semantics against log_container.
    final_return_code = run_return_code and log_return_code
    return final_return_code, run_id, logs

# running daemon needed
Expand All @@ -178,6 +188,32 @@ def remove(self, name, force=False):
return stop_and_remove_containers_result and \
remove_image_result

def gpu_enabled(self):
    """Check whether Docker can run a container with the `nvidia` runtime.

    Executes `docker run --runtime=nvidia --rm nvidia/cuda nvidia-smi` and
    inspects its exit status and stderr.

    Returns
    -------
    bool
        True if the probe command exits with status 0. False if Docker
        reports that the nvidia runtime is unknown or that the OCI runtime
        could not be created, or if the probe fails without any
        diagnostic output.

    Raises
    ------
    GPUSupportNotEnabled
        if the probe fails with any other stderr output; the output is
        passed as the exception argument
    """
    process = subprocess.Popen(
        [
            "docker",
            "run",
            "--runtime=nvidia",
            "--rm",
            "nvidia/cuda",
            "nvidia-smi",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    _, stderr = process.communicate()
    # Diagnostic text only: never let an undecodable byte crash the probe.
    stderr = stderr.decode("utf-8", "replace")

    # A zero exit status means nvidia-smi ran inside the container. Docker
    # may still have written notices to stderr (e.g. while pulling the
    # image), so stderr alone must not be treated as a failure.
    if process.returncode == 0:
        return True

    if "Unknown runtime specified nvidia" in stderr:
        return False
    if "OCI runtime create failed" in stderr:
        return False
    if len(stderr) > 2:
        raise GPUSupportNotEnabled(stderr)

    # Non-zero exit without a usable message: the probe did not succeed.
    return False

# running daemon needed
def get_tags_for_docker_repository(self, repo_name):
# TODO: Use more common CLI command (e.g. curl instead of wget)
Expand Down Expand Up @@ -332,12 +368,12 @@ def run_container(self,
ports=None,
name=None,
volumes=None,
runtime=None,
detach=False,
stdin_open=False,
tty=False,
api=False):
"""Run Docker container with parameters given as defined below
Parameters
----------
image_name : str
Expand All @@ -364,27 +400,19 @@ def run_container(self,
True to connect pseudo-terminal with stdin / stdout else False
api : bool, optional
True if Docker python client should be used else use subprocess
Returns
-------
if api=False:
return_code: int
integer success code of command
container_id: str
output container id
if api=True & if detach=True:
container_obj: Container
object from Docker python api with details about container
if api=True & if detach=False:
logs: str
output logs for the run function
Raises
------
EnvironmentExecutionException
Expand Down Expand Up @@ -420,6 +448,10 @@ def run_container(self,
docker_shell_cmd_list.append("--name")
docker_shell_cmd_list.append(name)

if runtime:
docker_shell_cmd_list.append("--runtime")
docker_shell_cmd_list.append(runtime)

if stdin_open:
docker_shell_cmd_list.append("-i")

Expand Down
16 changes: 16 additions & 0 deletions datmo/core/controller/environment/driver/tests/test_dockerenv.py
Original file line number Diff line number Diff line change
Expand Up @@ -797,3 +797,19 @@ def test_form_datmo_dockerfile(self):
assert result and \
os.path.isfile(output_dockerfile_path) and \
"datmo" in open(output_dockerfile_path, "r").read()

@pytest_docker_environment_failed_instantiation(test_datmo_dir)
def test_gpu_enabled(self):
    """Run `nvidia-smi` in the `nvidia/cuda` image when GPU support exists.

    When `gpu_enabled()` reports False the test only prints a notice and
    makes no assertion.
    """
    manager = self.docker_environment_manager

    # Nothing to verify on hosts without GPU support.
    if not manager.gpu_enabled():
        print("GPU not available")
        return

    log_filepath = os.path.join(manager.filepath, "test.log")
    run_options = {
        "command": ["nvidia-smi"],
        "name": str(uuid.uuid1()),
        "detach": True,
        "gpu": True
    }
    return_code, run_id, logs = manager.run("nvidia/cuda", run_options,
                                            log_filepath)
    assert return_code == 0
1 change: 1 addition & 0 deletions datmo/core/controller/environment/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ def run(self, environment_id, options, log_filepath):
which maps the running host port (right) to that of the environment (left)
name : str, optional
volumes : dict, optional
gpu : bool, default False
detach : bool, optional
stdin_open : bool, optional
tty : bool, optional
Expand Down
19 changes: 16 additions & 3 deletions datmo/core/controller/task.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import os
import traceback

from datetime import datetime

from datmo.core.controller.base import BaseController
Expand Down Expand Up @@ -108,17 +110,19 @@ def _run_helper(self, environment_id, options, log_filepath):
"ports": options.get('ports', None),
"name": options.get('name', None),
"volumes": options.get('volumes', None),
"gpu": options.get('gpu', False),
"detach": options.get('detach', False),
"stdin_open": options.get('stdin_open', False),
"tty": options.get('tty', False),
"api": False
"api": False,
}

# Build image for environment
self.environment.build(environment_id)

# Run container with environment
return_code, run_id, logs = \
self.environment.run(environment_id, run_options, log_filepath)
return_code, run_id, logs = self.environment.run(
environment_id, run_options, log_filepath)

return return_code, run_id, logs

Expand Down Expand Up @@ -222,6 +226,8 @@ def run(self, task_id, snapshot_dict=None, task_dict=None):
task_dict.get('before_snapshot_id', before_snapshot_obj.id),
"command":
task_dict.get('command', task_obj.command),
"gpu":
task_dict.get('gpu', False),
"interactive":
task_dict.get('interactive', task_obj.interactive),
"detach":
Expand All @@ -247,11 +253,13 @@ def run(self, task_id, snapshot_dict=None, task_dict=None):
os.path.join(self.home, task_obj.task_dirpath))

return_code, run_id, logs = 0, None, None

try:
# Set the parameters set in the task
if task_obj.detach and task_obj.interactive:
raise TaskInteractiveDetachException(
__("error", "controller.task.run.args.detach.interactive"))

environment_run_options = {
"command": task_obj.command,
"ports": [] if task_obj.ports is None else task_obj.ports,
Expand All @@ -266,6 +274,7 @@ def run(self, task_id, snapshot_dict=None, task_dict=None):
'mode': 'rw'
}
},
"gpu": task_obj.gpu,
"detach": task_obj.detach,
"stdin_open": task_obj.interactive,
"tty": task_obj.interactive,
Expand All @@ -277,6 +286,10 @@ def run(self, task_id, snapshot_dict=None, task_dict=None):
self._run_helper(before_snapshot_obj.environment_id,
environment_run_options,
os.path.join(self.home, task_obj.log_filepath))

except Exception as e:
return_code = 1
logs += "Error running task: %" % e.message
finally:
# Create the after snapshot after execution is completed with new filepaths
after_snapshot_dict = snapshot_dict.copy()
Expand Down
3 changes: 3 additions & 0 deletions datmo/core/entity/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ class Task():
boolean to signify if task should be run in interactive mode
detach : bool
boolean to signify if task should be run in detach mode
gpu : bool
boolean to signify gpu task
ports : list or None
list of string mappings from host system (left) to environment (right)
(e.g. ["9999:9999", "8888:8888"])
Expand Down Expand Up @@ -128,6 +130,7 @@ def __init__(self, dictionary):
self.task_dirpath = dictionary.get('task_dirpath', None)
self.log_filepath = dictionary.get('log_filepath', None)
self.start_time = dictionary.get('start_time', None)
self.gpu = dictionary.get('gpu', False)

# Post-Execution
self.after_snapshot_id = dictionary.get('after_snapshot_id', None)
Expand Down
4 changes: 4 additions & 0 deletions datmo/core/util/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,7 @@ def get_error_str(self):

class ValidationSchemaMissing(Exception):
    """Exception type for a missing validation schema.

    NOTE(review): meaning inferred from the class name; the raise sites are
    not visible here -- confirm against callers.
    """


class GPUSupportNotEnabled(Exception):
    """Raised when a GPU run is requested but GPU support is not usable.

    The Docker environment driver raises it with either the runtime name
    ('nvidia') or the stderr output of its GPU probe as the argument.
    """
6 changes: 5 additions & 1 deletion datmo/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def __eq__(self, other):
return self.id == other.id if other else False


def run(command, env=None, home=None):
def run(command, env=None, home=None, gpu=False):
"""Run the code or script inside
The project must be created before this is implemented. You can do that by using
Expand All @@ -121,6 +121,8 @@ def run(command, env=None, home=None):
home : str, optional
absolute home path of the project
(default is None, which will use the CWD as the project path)
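gpu : bool, optional
run the task with GPU support through the NVIDIA Docker runtime; the run
fails if GPU support is not available, there is no CPU fallback
(default is False)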
Returns
-------
Expand Down Expand Up @@ -163,6 +165,8 @@ def run(command, env=None, home=None):
elif isinstance(command, basestring):
task_dict["command"] = shlex.split(command)

task_dict["gpu"] = gpu

# Create the task object
core_task_obj = task_controller.create()

Expand Down

0 comments on commit a7d0861

Please sign in to comment.