Merge pull request #143 from datmo/gpu_support

GPU Support (fix for windows build)
datmo · May 18, 2018 · a7d0861 · a7d0861
2 parents b7e1e71 + bd5e967
commit a7d0861
Show file tree

Hide file tree

Showing 9 changed files with 93 additions and 17 deletions.
diff --git a/.gitignore b/.gitignore
@@ -98,3 +98,5 @@ ENV/
 # Rope project settings
 .ropeproject
 foo
+
+.pytest_cache
diff --git a/appveyor.yml b/appveyor.yml
@@ -45,6 +45,7 @@ install:
   - "%PYTHON%/python.exe C:/get-pip.py"
   - "%PYTHON%/python.exe -m pip install pip==9.0.3"
   - "%PYTHON%/Scripts/pip.exe install pytest"
+  - "%PYTHON%/Scripts/pip.exe install pypiwin32"
   - "%PYTHON%/python.exe setup.py install"
 test_script:
   - "%PYTHON%/Scripts/pip.exe --version"

diff --git a/datmo/core/controller/environment/driver/dockerenv.py b/datmo/core/controller/environment/driver/dockerenv.py
@@ -16,7 +16,7 @@
     PathDoesNotExist, EnvironmentDoesNotExist, EnvironmentInitFailed,
     EnvironmentExecutionException, FileAlreadyExistsException,
     EnvironmentRequirementsCreateException, EnvironmentImageNotFound,
-    EnvironmentContainerNotFound)
+    EnvironmentContainerNotFound, GPUSupportNotEnabled)
 from datmo.core.controller.environment.driver import EnvironmentDriver
 
 
@@ -152,12 +152,22 @@ def build(self, name, path):
 
     # running daemon needed
     def run(self, name, options, log_filepath):
-        run_return_code, run_id = \
-            self.run_container(image_name=name, **options)
+        # Copy so the caller's dict is untouched; "gpu" is not a run_container
+        # argument, so it is popped and translated into a docker runtime
+        options = dict(options)
+        if options.pop("gpu", False) is True:
+            # Probe docker only when a GPU run was actually requested
+            if not self.gpu_enabled():
+                raise GPUSupportNotEnabled('nvidia')
+            options['runtime'] = 'nvidia'
+
+        run_return_code, run_id = self.run_container(
+            image_name=name, **options)
+
         log_return_code, logs = self.log_container(
             run_id, filepath=log_filepath)
-        final_return_code = run_return_code and log_return_code
 
+        final_return_code = run_return_code and log_return_code
         return final_return_code, run_id, logs
 
     # running daemon needed
@@ -178,6 +188,32 @@ def remove(self, name, force=False):
         return stop_and_remove_containers_result and \
                remove_image_result
 
+    def gpu_enabled(self):
+        """Check whether docker can start containers with the nvidia runtime
+
+        Returns
+        -------
+        bool
+            True if the probe container exits with code 0; False if docker
+            rejects the nvidia runtime or the probe fails without a message
+
+        Raises
+        ------
+        GPUSupportNotEnabled
+            if the probe fails with any other message on stderr
+        """
+        probe_cmd = "docker run --runtime=nvidia --rm nvidia/cuda nvidia-smi"
+        process = subprocess.Popen(
+            probe_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stderr = process.communicate()[1].decode("utf-8", "replace").strip()
+        # Exit code first: stderr may hold non-error output (e.g. image pull)
+        if process.returncode == 0:
+            return True
+        if not stderr or "Unknown runtime specified nvidia" in stderr \
+                or "OCI runtime create failed" in stderr:
+            return False
+        raise GPUSupportNotEnabled(stderr)
+
     # running daemon needed
     def get_tags_for_docker_repository(self, repo_name):
         # TODO: Use more common CLI command (e.g. curl instead of wget)
@@ -332,12 +368,12 @@ def run_container(self,
                       ports=None,
                       name=None,
                       volumes=None,
+                      runtime=None,
                       detach=False,
                       stdin_open=False,
                       tty=False,
                       api=False):
         """Run Docker container with parameters given as defined below
-
         Parameters
         ----------
         image_name : str
@@ -364,27 +400,19 @@ def run_container(self,
             True to connect pseudo-terminal with stdin / stdout else False
         api : bool, optional
             True if Docker python client should be used else use subprocess
-
         Returns
         -------
         if api=False:
-
         return_code: int
             integer success code of command
         container_id: str
             output container id
-
-
         if api=True & if detach=True:
-
         container_obj: Container
             object from Docker python api with details about container
-
         if api=True & if detach=False:
-
         logs: str
             output logs for the run function
-
         Raises
         ------
         EnvironmentExecutionException
@@ -420,6 +448,10 @@ def run_container(self,
                     docker_shell_cmd_list.append("--name")
                     docker_shell_cmd_list.append(name)
 
+                if runtime:
+                    docker_shell_cmd_list.append("--runtime")
+                    docker_shell_cmd_list.append(runtime)
+
                 if stdin_open:
                     docker_shell_cmd_list.append("-i")
 

diff --git a/datmo/core/controller/environment/driver/tests/test_dockerenv.py b/datmo/core/controller/environment/driver/tests/test_dockerenv.py
@@ -797,3 +797,19 @@ def test_form_datmo_dockerfile(self):
         assert result and \
             os.path.isfile(output_dockerfile_path) and \
             "datmo" in open(output_dockerfile_path, "r").read()
+
+    @pytest_docker_environment_failed_instantiation(test_datmo_dir)
+    def test_gpu_enabled(self):
+        """Run nvidia-smi in a GPU container when the nvidia runtime works"""
+        manager = self.docker_environment_manager
+        if not manager.gpu_enabled():
+            print("GPU not available")
+            return
+        run_options = {
+            "command": ["nvidia-smi"], "name": str(uuid.uuid1()),
+            "detach": True, "gpu": True
+        }
+        log_filepath = os.path.join(manager.filepath, "test.log")
+        return_code, _, _ = manager.run(
+            "nvidia/cuda", run_options, log_filepath)
+        assert return_code == 0
diff --git a/datmo/core/controller/environment/environment.py b/datmo/core/controller/environment/environment.py
@@ -208,6 +208,7 @@ def run(self, environment_id, options, log_filepath):
                 which maps the running host port (right) to that of the environment (left)
             name : str, optional
             volumes : dict, optional
+            gpu : bool, default False
             detach : bool, optional
             stdin_open : bool, optional
             tty : bool, optional

diff --git a/datmo/core/controller/task.py b/datmo/core/controller/task.py
@@ -1,4 +1,6 @@
 import os
+import traceback
+
 from datetime import datetime
 
 from datmo.core.controller.base import BaseController
@@ -108,17 +110,19 @@ def _run_helper(self, environment_id, options, log_filepath):
             "ports": options.get('ports', None),
             "name": options.get('name', None),
             "volumes": options.get('volumes', None),
+            "gpu": options.get('gpu', False),
             "detach": options.get('detach', False),
             "stdin_open": options.get('stdin_open', False),
             "tty": options.get('tty', False),
-            "api": False
+            "api": False,
         }
 
         # Build image for environment
         self.environment.build(environment_id)
+
         # Run container with environment
-        return_code, run_id, logs = \
-            self.environment.run(environment_id, run_options, log_filepath)
+        return_code, run_id, logs = self.environment.run(
+            environment_id, run_options, log_filepath)
 
         return return_code, run_id, logs
 
@@ -222,6 +226,8 @@ def run(self, task_id, snapshot_dict=None, task_dict=None):
                 task_dict.get('before_snapshot_id', before_snapshot_obj.id),
             "command":
                 task_dict.get('command', task_obj.command),
+            "gpu":
+                task_dict.get('gpu', False),
             "interactive":
                 task_dict.get('interactive', task_obj.interactive),
             "detach":
@@ -247,11 +253,13 @@ def run(self, task_id, snapshot_dict=None, task_dict=None):
             os.path.join(self.home, task_obj.task_dirpath))
 
         return_code, run_id, logs = 0, None, None
+
         try:
             # Set the parameters set in the task
             if task_obj.detach and task_obj.interactive:
                 raise TaskInteractiveDetachException(
                     __("error", "controller.task.run.args.detach.interactive"))
+
             environment_run_options = {
                 "command": task_obj.command,
                 "ports": [] if task_obj.ports is None else task_obj.ports,
@@ -266,6 +274,7 @@ def run(self, task_id, snapshot_dict=None, task_dict=None):
                         'mode': 'rw'
                     }
                 },
+                "gpu": task_obj.gpu,
                 "detach": task_obj.detach,
                 "stdin_open": task_obj.interactive,
                 "tty": task_obj.interactive,
@@ -277,6 +286,10 @@ def run(self, task_id, snapshot_dict=None, task_dict=None):
                 self._run_helper(before_snapshot_obj.environment_id,
                                  environment_run_options,
                                  os.path.join(self.home, task_obj.log_filepath))
+
+        except Exception as e:  # logs may still be None (its initial value)
+            return_code = 1
+            logs = (logs or "") + "Error running task: %s" % str(e)
         finally:
             # Create the after snapshot after execution is completed with new filepaths
             after_snapshot_dict = snapshot_dict.copy()

diff --git a/datmo/core/entity/task.py b/datmo/core/entity/task.py
@@ -87,6 +87,8 @@ class Task():
         boolean to signify if task should be run in interactive mode
     detach : bool
         boolean to signify if task should be run in detach mode
+    gpu : bool
+        boolean to signify gpu task
     ports : list or None
         list of string mappings from host system (left) to environment (right)
         (e.g. ["9999:9999", "8888:8888"])
@@ -128,6 +130,7 @@ def __init__(self, dictionary):
         self.task_dirpath = dictionary.get('task_dirpath', None)
         self.log_filepath = dictionary.get('log_filepath', None)
         self.start_time = dictionary.get('start_time', None)
+        self.gpu = dictionary.get('gpu', False)
 
         # Post-Execution
         self.after_snapshot_id = dictionary.get('after_snapshot_id', None)

diff --git a/datmo/core/util/exceptions.py b/datmo/core/util/exceptions.py
@@ -181,3 +181,7 @@ def get_error_str(self):
 
 class ValidationSchemaMissing(Exception):
     pass
+
+
+class GPUSupportNotEnabled(Exception):
+    """Raised when GPU support (nvidia runtime) is not usable"""
diff --git a/datmo/task.py b/datmo/task.py
@@ -101,7 +101,7 @@ def __eq__(self, other):
         return self.id == other.id if other else False
 
 
-def run(command, env=None, home=None):
+def run(command, env=None, home=None, gpu=False):
     """Run the code or script inside
 
     The project must be created before this is implemented. You can do that by using
@@ -121,6 +121,8 @@ def run(command, env=None, home=None):
     home : str, optional
         absolute home path of the project
         (default is None, which will use the CWD as the project path)
+    gpu : bool, optional
+        True to run the task with GPU support (default is False)
 
     Returns
     -------
@@ -163,6 +165,8 @@ def run(command, env=None, home=None):
         elif isinstance(command, basestring):
             task_dict["command"] = shlex.split(command)
 
+    task_dict["gpu"] = gpu
+
     # Create the task object
     core_task_obj = task_controller.create()