Skip to content

Commit

Permalink
Address PR comments.
Browse files Browse the repository at this point in the history
  • Loading branch information
jjotero committed Oct 13, 2020
1 parent 84398ea commit dcff63c
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 80 deletions.
84 changes: 34 additions & 50 deletions cscs-checks/prgenv/cuda/cuda_aware_mpi.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
import os
import reframe as rfm
import reframe.utility.sanity as sn
import reframe.utility.os_ext as osx


@rfm.simple_test
class CudaAwareMPICheck(rfm.CompileOnlyRegressionTest):
def __init__(self):
super().__init__()
self.descr = 'Cuda-aware MPI test from the NVIDIA repo.'
self.sourcesdir = ('https://github.com/NVIDIA-developer-blog/'
'code-samples.git')
Expand All @@ -32,79 +32,84 @@ def __init__(self):

self.sanity_patterns = sn.assert_found(r'Finished building'
r' CUDA samples', self.stdout)
self.num_tasks = 2
if self.current_system.name in ['arolla', 'tsa', 'ault']:
self.exclusive_access = True
nvidia_sm = '70'
else:
nvidia_sm = '60'

if self.current_system.name in ['daint']:
self.prebuild_cmds = ['export CUDA_HOME=$CUDATOOLKIT_HOME']
else:
self.prebuild_cmds = []

self.prebuild_cmds += ['cd posts/cuda-aware-mpi-example/src']
gcd_flgs = '-gencode arch=compute_{0},code=sm_{0}'.format(nvidia_sm)
self.build_system = 'Make'
self.build_system.options = ['CUDA_INSTALL_PATH=$CUDA_HOME',
'MPI_HOME=$CRAY_MPICH_PREFIX',
'GENCODE_FLAGS="%s"' % (gcd_flgs)]

self.postbuild_cmds= ['ls ../bin']
self.sanity_patterns= sn.assert_found(r'jacobi_cuda_aware_mpi',
self.postbuild_cmds = ['ls ../bin']
self.sanity_patterns = sn.assert_found(r'jacobi_cuda_aware_mpi',
self.stdout)
self.maintainers=['JO']
self.tags={'production', 'external_resources'}
self.maintainers = ['JO']
self.tags = {'production', 'external_resources'}

# Pre-compile hook: when the current system is 'daint' or 'dom' and
# osx.cray_cdt_version() returns '20.08', append a command that exports
# CUDA_HOME from $CUDATOOLKIT_HOME to the pre-build commands.
@rfm.run_before('compile')
def set_cuda_home(self):
if self.current_system.name in [
'daint', 'dom'] and osx.cray_cdt_version() == '20.08':
self.prebuild_cmds += ['export CUDA_HOME=$CUDATOOLKIT_HOME']

# Pre-compile hook: sets the C flags when the current environment is
# PrgEnv-pgi and appends the MPICC / MPILD make options.
# NOTE(review): this is a rendered diff without +/- markers; adjacent
# near-duplicate lines appear to be the removed and added versions of the
# same statement -- confirm against the repository before relying on either.
@ rfm.run_before('compile')
@rfm.run_before('compile')
def set_compilers(self):
if self.current_environ.name == 'PrgEnv-pgi':
# NOTE(review): the leading space in ' -O3' looks unintentional -- confirm.
self.build_system.cflags=['-std=c99', ' -O3']
self.build_system.cflags = ['-std=c99', ' -O3']

self.build_system.options += [
# Variant that queries the build system's private _cc/_cxx helpers.
'MPICC="%s"' % self.build_system._cc(self.current_environ),
'MPILD="%s"' % self.build_system._cxx(self.current_environ)
# Variant that reads the compilers directly from the current environment.
'MPICC="%s"' % self.current_environ.cc,
'MPILD="%s"' % self.current_environ.cxx
]


# Base class for the run-only CUDA-aware MPI checks. It declares a dependency
# on CudaAwareMPICheck and, via set_executable, runs the jacobi_cuda_aware_mpi
# binary found under that dependency's stage directory.
# NOTE(review): this is a rendered diff without +/- markers; adjacent
# near-duplicate lines appear to be the removed and added versions of the
# same statement -- confirm against the repository.
class CudaAwareMPIRuns(rfm.RunOnlyRegressionTest):
def __init__(self):
super().__init__()
self.depends_on('CudaAwareMPICheck')
self.valid_systems = ['daint:gpu', 'dom:gpu', 'tiger:gpu',
'arolla:cn', 'tsa:cn',
'ault:amdv100', 'ault:intelv100']
self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-gnu']
# The second variant of this condition additionally lists 'dom'.
if self.current_system.name in ['arolla', 'tsa', 'daint']:
if self.current_system.name in ['arolla', 'tsa', 'daint', 'dom']:
self.valid_prog_environs += ['PrgEnv-pgi']
elif self.current_system.name in ['ault']:
self.valid_prog_environs = ['PrgEnv-gnu']

# On arolla/tsa/ault only PrgEnv-gnu is kept; every other system loads the
# craype-accel-nvidia60 module instead.
if self.current_system.name in ['arolla', 'tsa','ault']:
if self.current_system.name in ['arolla', 'tsa', 'ault']:
self.valid_prog_environs = ['PrgEnv-gnu']
else:
self.modules = ['craype-accel-nvidia60']

self.executable = '../bin/jacobi_cuda_aware_mpi'
self.prerun_cmds = ['export MPICH_RDMA_ENABLED_CUDA=1']
self.depends_on('CudaAwareMPICheck')
# The run passes when stdout reports the expected iteration count and residue.
self.sanity_patterns = sn.assert_found(r'Stopped after 1000 iterations'
r' with residue 0.00024',
r' with residue 0.00024',
self.stdout)


# Resolve the executable path from the stage directory of the
# CudaAwareMPICheck dependency.
@rfm.require_deps
def set_executable(self, CudaAwareMPICheck):
self.executable = os.path.join(
CudaAwareMPICheck().stagedir,
'posts', 'cuda-aware-mpi-example',
'bin', 'jacobi_cuda_aware_mpi'
)


@rfm.simple_test
class CudaAwareMPIOneNodeCheck(CudaAwareMPIRuns):
def __init__(self):
super().__init__()
super().__init__()
self.partition_num_gpus_per_node = {
'daint:gpu': 1,
'dom:gpu': 1,
'kesh:cn': 2,
'tiger:gpu': 2,
'arolla:cn': 2,
'tsa:cn': 2,
'ault:amdv100': 2,
'ault:amdv100': 2,
'ault:intelv100': 4
}

Expand All @@ -113,39 +118,18 @@ def __init__(self):
@rfm.run_before('run')
def set_num_gpus_per_node(self):
cp = self.current_partition.fullname
if cp in self.partition_num_gpus_per_node:
self.num_gpus_per_node = self.partition_num_gpus_per_node.get(cp)
else:
self.num_gpus_per_node = 1

self.num_gpus_per_node = self.partition_num_gpus_per_node.get(cp, 1)
self.num_tasks = 2 * self.num_gpus_per_node
self.num_tasks_per_node = self.num_tasks
self.executable_opts = ['-t %d %d' % (self.num_tasks/2, 2)]

@rfm.require_deps
def set_executable(self, CudaAwareMPICheck):
self.executable = os.path.join(
CudaAwareMPICheck().stagedir,
'posts','cuda-aware-mpi-example',
'bin', 'jacobi_cuda_aware_mpi'
)
self.num_tasks_per_node = self.num_tasks
self.executable_opts = [f'-t {self.num_tasks/2} 2']


# Variant of CudaAwareMPIRuns that runs two tasks with one task and one GPU
# per node.
# NOTE(review): this is a rendered diff without +/- markers; the repeated
# super().__init__() line and the set_executable method below appear to be
# the removed/added sides of the change -- confirm against the repository.
@rfm.simple_test
class CudaAwareMPITwoNodesCheck(CudaAwareMPIRuns):
def __init__(self):
super().__init__()
super().__init__()
# Run the case across two nodes
self.num_tasks = 2
self.num_tasks_per_node = 1
self.num_gpus_per_node = 1
# The number of tasks is passed as the first value of the '-t' option.
self.executable_opts = ['-t %d 1' % self.num_tasks]

# Resolve the executable path from the stage directory of the
# CudaAwareMPICheck dependency.
@rfm.require_deps
def set_executable(self, CudaAwareMPICheck):
self.executable = os.path.join(
CudaAwareMPICheck().stagedir,
'posts','cuda-aware-mpi-example',
'bin', 'jacobi_cuda_aware_mpi'
)

44 changes: 14 additions & 30 deletions cscs-checks/prgenv/cuda/cuda_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,12 @@ def __init__(self):
else:
self.prebuild_cmds = []

self.build_system.options = [
'SMS="%s"' %
self.nvidia_sm,
'CUDA_PATH=$CUDA_HOME']
self.build_system.options = [f'SMS="{self.nvidia_sm}"',
f'CUDA_PATH=$CUDA_HOME']
self.maintainers = ['JO']
self.tags = {'production', 'external_resosurces'}


@rfm.required_version('>=2.14')
@rfm.simple_test
class CudaDeviceQueryCheck(CudaSamples):
def __init__(self):
Expand All @@ -66,21 +63,13 @@ def __init__(self):
self.stdout)


# Intermediate base class: a CudaSamples test that declares a dependency on
# CudaDeviceQueryCheck in its constructor.
class DependentCudaSamples(CudaSamples):
def __init__(self):
super().__init__()
self.depends_on('CudaDeviceQueryCheck')


@rfm.required_version('>=2.14')
@rfm.simple_test
class CudaConcurrentKernelsCheck(DependentCudaSamples):
class CudaConcurrentKernelsCheck(CudaSamples):
def __init__(self):
super().__init__()
self.depends_on('CudaDeviceQueryCheck')
self.descr = 'CUDA concurrentKernels test'
self.sanity_patterns = sn.assert_found(
r'Test passed',
self.stdout)
self.sanity_patterns = sn.assert_found(r'Test passed', self.stdout)

@rfm.require_deps
def set_prebuild_cmds(self, CudaDeviceQueryCheck):
Expand All @@ -95,15 +84,13 @@ def set_executable(self, CudaDeviceQueryCheck):
'Samples', 'concurrentKernels', 'concurrentKernels')


@rfm.required_version('>=2.14')
@rfm.simple_test
class CudaMatrixMultCublasCheck(DependentCudaSamples):
class CudaMatrixMultCublasCheck(CudaSamples):
def __init__(self):
super().__init__()
self.depends_on('CudaDeviceQueryCheck')
self.descr = 'CUDA simpleCUBLAS test'
self.sanity_patterns = sn.assert_found(
r'test passed',
self.stdout)
self.sanity_patterns = sn.assert_found(r'test passed', self.stdout)

@rfm.require_deps
def set_prebuild_cmds(self, CudaDeviceQueryCheck):
Expand All @@ -118,15 +105,13 @@ def set_executable(self, CudaDeviceQueryCheck):
'Samples', 'simpleCUBLAS', 'simpleCUBLAS')


@rfm.required_version('>=2.14')
@rfm.simple_test
class CudaBandwidthCheck(DependentCudaSamples):
class CudaBandwidthCheck(CudaSamples):
def __init__(self):
super().__init__()
self.depends_on('CudaDeviceQueryCheck')
self.descr = 'CUDA simpleCUBLAS test'
self.sanity_patterns = sn.assert_found(
r'Result = PASS',
self.stdout)
self.sanity_patterns = sn.assert_found(r'Result = PASS', self.stdout)

@rfm.require_deps
def set_prebuild_cmds(self, CudaDeviceQueryCheck):
Expand All @@ -141,15 +126,14 @@ def set_executable(self, CudaDeviceQueryCheck):
'Samples', 'bandwidthTest', 'bandwidthTest')


@rfm.required_version('>=2.14')
@rfm.simple_test
class CudaGraphsCGCheck(DependentCudaSamples):
class CudaGraphsCGCheck(CudaSamples):
def __init__(self):
super().__init__()
self.depends_on('CudaDeviceQueryCheck')
self.descr = 'CUDA simpleCUBLAS test'
self.sanity_patterns = sn.assert_found(
r'Test Summary: Error amount = 0.00000',
self.stdout)
r'Test Summary: Error amount = 0.00000', self.stdout)

@rfm.require_deps
def set_prebuild_cmds(self, CudaDeviceQueryCheck):
Expand Down

0 comments on commit dcff63c

Please sign in to comment.