Skip to content

Commit c2546dd

Browse files
committed
Merge branch 'bug11893' into 'master'
Bug11893: Better documentation about Orion's GPU in OAR. See merge request grid5000/reference-repository!405
2 parents badab0f + 75d0d73 commit c2546dd

File tree

3 files changed

+57
-37
lines changed

3 files changed

+57
-37
lines changed

lib/refrepo/gen/oar-properties.rb

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -423,19 +423,19 @@ def get_ref_node_properties_internal(cluster_uid, cluster, node_uid, node)
423423
h['memcpu'] = node['main_memory']['ram_size'] / node['architecture']['nb_procs']/MiB
424424
h['memnode'] = node['main_memory']['ram_size'] / MiB
425425

426-
if node.key?('gpu_devices') \
427-
and h['cluster'] != 'orion'
428-
# Do not generate GPU ppty for orion, cf #10785
426+
h['gpu_model'] = ''
427+
h['gpu_count'] = 0
429428

430-
models = node['gpu_devices'].values.map { |g| g['model'] }.uniq
429+
if node.key?('gpu_devices')
430+
models = node['gpu_devices'].map { |_, g| g['model'] }.uniq
431431
if models.length > 1
432432
raise "Node #{h['uid']} has more than one model of GPU"
433433
end
434-
h['gpu_model'] = models.first
435-
h['gpu_count'] = node['gpu_devices'].length
436-
else
437-
h['gpu_model'] = ''
438-
h['gpu_count'] = 0
434+
device = node['gpu_devices'].first[1]
435+
if GPURef.is_gpu_supported?(device)
436+
h['gpu_model'] = device['model']
437+
h['gpu_count'] = node['gpu_devices'].length
438+
end
439439
end
440440

441441
if node.key?('exotic')

lib/refrepo/gen/wiki/generators/site_hardware.rb

Lines changed: 34 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -11,22 +11,32 @@ def initialize(page_name, site)
1111

1212
def generate_content(_options)
1313
has_reservable_disks = false
14+
has_unsupported_gpu = false
1415
G5K::get_global_hash['sites'][@site]['clusters'].each do |_,c|
1516
c['nodes'].each do |_,n|
1617
n['storage_devices'].each do |d|
1718
has_reservable_disks ||= d['reservation']
1819
end
20+
21+
# The "**" footnote is needed as soon as one GPU of one node is rejected by
# GPURef.is_gpu_supported?. That predicate reads the 'vendor' and 'model'
# entries of a device, so it must receive the whole device hash rather than
# the model name, and its answer must be negated to mean "unsupported".
unless n['gpu_devices'].nil?
  has_unsupported_gpu ||= n['gpu_devices'].values.any? { |gpu| !GPURef.is_gpu_supported?(gpu) }
end
1925
end
2026
end
2127

28+
asterisks = []
29+
asterisks << "''*: disk is [[Disk_reservation|reservable]]''" if has_reservable_disks
30+
asterisks << "''**: crossed GPUs are not supported by Grid'5000 default environments''" if has_unsupported_gpu
31+
2232
@generated_content = "__NOTOC__\n__NOEDITSECTION__\n" +
2333
"{{Portal|User}}\n" +
2434
"<div class=\"sitelink\">Hardware: [[Hardware|Global]] | " + G5K::SITES.map { |e| "[[#{e.capitalize}:Hardware|#{e.capitalize}]]" }.join(" | ") + "</div>\n" +
2535
"'''See also:''' [[#{@site.capitalize}:Network|Network topology for #{@site.capitalize}]]\n" +
2636
"#{SiteHardwareGenerator.generate_header_summary({@site => G5K::get_global_hash['sites'][@site]})}\n" +
2737
"= Clusters =\n" +
2838
self.class.generate_summary(@site, false) +
29-
(has_reservable_disks ? "''*: disk is [[Disk_reservation|reservable]]''" : '') +
39+
asterisks.join("\n\n") +
3040
self.class.generate_description(@site) +
3141
MW.italic(MW.small(generated_date_string)) +
3242
MW::LINE_FEED
@@ -302,39 +312,35 @@ def sort_data(data, key)
302312

303313
# Builds the GPU text for one node: a ", "-joined list with one entry per GPU
# group, each entry prefixed with "N&nbsp;x&nbsp;" when the group holds more
# than one device. Yields "" when node_hash has no 'gpu_devices' entry.
#
# This span is a rendered diff: a number followed by "-" marks a line of the
# removed implementation, a number followed by "+" marks a line of its
# replacement, and a doubled number marks a line kept unchanged.
#
# Removed version: computed memgib once from the first device and counted
# identical description strings in bymodel.
# Replacement: groups the devices by 'model' and delegates the text of each
# group to gpu_model_description, called with the first device of the group.
#
# node_hash  - node description; only its 'gpu_devices' entry is read here.
# long_names - forwarded to gpu_model_description (the removed version used it
#              to choose between the full model name and
#              GPURef.model2shortname, and to append the compute capability).
def gpu_description(node_hash, long_names)
304314
lgpu = node_hash['gpu_devices']
315+
res = []
305316
if lgpu
306-
bymodel = {}
307-
memgib = (node_hash['gpu_devices'].first()[1]['memory'].to_f/2**30).round(0)
308-
lgpu.each { |g|
309-
d = g[1]
310-
vendor = d['vendor']
311-
if long_names
312-
model = d['model']
313-
else
314-
model = GPURef.model2shortname(d['model'])
315-
end
316-
vm = vendor.to_s + ' ' + model.to_s.gsub(' ', '&nbsp;') + "&nbsp;(#{memgib}&nbsp;GiB)"
317-
if long_names
318-
cc = GPURef.get_compute_capability(d['model'])
319-
vm += "<br>Compute&nbsp;capability:&nbsp;#{cc}" if cc
320-
end
317+
gpu_types = lgpu.values.group_by{|device_hash| device_hash['model']}.map do |model_name, device_hashes|
318+
description = gpu_model_description(device_hashes.first, long_names)
319+
[model_name, {number: device_hashes.length, description: description}]
320+
end.to_h
321321

322-
if bymodel[vm]
323-
bymodel[vm] += 1
324-
else
325-
bymodel[vm] = 1
326-
end
327-
}
328-
res = []
329-
bymodel.each { |model,count|
330-
res << (count == 1 ? '' : count.to_s + '&nbsp;x&nbsp;') + model
322+
gpu_types.each{|_model, hash|
323+
res << (hash[:number] == 1 ? '' : hash[:number].to_s + '&nbsp;x&nbsp;') + hash[:description]
331324
}
332-
else
333-
res = []
325+
334326
end
335-
res.join(", ")
327+
return res.join(", ")
336328
end
337329

330+
# Renders the wiki text describing a single GPU device.
#
# device_hash - GPU device entry; its 'model', 'memory' and 'vendor' values
#               are read, and the whole hash is handed to
#               GPURef.is_gpu_supported?.
# long_name   - truthy: keep the model name as stored and append the compute
#               capability when GPURef.get_compute_capability returns one;
#               falsy: use GPURef.model2shortname and no capability line.
#
# Returns "<vendor> <model>&nbsp;(<n>&nbsp;GiB)", where n is 'memory' divided
# by 2**30 and rounded. A device rejected by GPURef.is_gpu_supported? is
# wrapped in <s>...</s> and followed by an explanatory sentence (long form)
# or by "**" (short form).
def gpu_model_description(device_hash, long_name)
  stored_model = device_hash['model']
  shown_model = long_name ? stored_model : GPURef.model2shortname(stored_model)
  gib = (device_hash['memory'].to_f / 2**30).round(0)
  text = "#{device_hash['vendor']} #{shown_model.to_s.gsub(' ', '&nbsp;')}&nbsp;(#{gib}&nbsp;GiB)"

  unless long_name
    return GPURef.is_gpu_supported?(device_hash) ? text : "<s>#{text}</s>**"
  end

  # The capability is looked up before the support check, as a separate line.
  capability = GPURef.get_compute_capability(shown_model)
  text += "<br>Compute&nbsp;capability:&nbsp;#{capability}" if capability
  return text if GPURef.is_gpu_supported?(device_hash)

  "<s>#{text}</s><br>''not supported by Grid'5000 default environments''"
end
338344

339345
def get_hardware(sites)
340346
global_hash = G5K::get_global_hash

lib/refrepo/gpu_ref.rb

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# coding: utf-8
22

3+
# Value of a GPU device's 'vendor' entry for which support is decided from the
# compute capability. Frozen so the shared constant cannot be mutated in place.
NVIDIA = 'Nvidia'.freeze
# Threshold used by the support check: a model's compute capability,
# converted to a float, must be greater than or equal to this value.
MINIMAL_COMPUTE_CAPABILITY_SUPPORTED = 3.0
5+
36
class GPURef
47
@@gpus = {
58
'GeForce RTX 2080 Ti' => {
@@ -93,6 +96,7 @@ class GPURef
9396
},
9497
}
9598

99+
96100
def self.getNumberOfCoresFor(model)
97101
if @@gpus[model]
98102
return @@gpus[model]['cores']
@@ -126,4 +130,14 @@ def self.get_all_aliases
126130

127131
aliases
128132
end
133+
134+
# Tells whether a GPU device counts as supported.
#
# device - GPU device entry; its 'vendor' and 'model' values are read.
#
# Returns true for every device whose 'vendor' differs from NVIDIA. For an
# NVIDIA device, returns the answer of is_cc_supported? for its 'model'.
def self.is_gpu_supported?(device)
  return true unless device['vendor'] == NVIDIA

  is_cc_supported?(device['model'])
end
138+
139+
# Tells whether a GPU model reaches the minimal supported compute capability.
#
# model - GPU model name, used as a key into the @@gpus table.
#
# Returns true when the model's 'compute_capability', converted with to_f, is
# greater than or equal to MINIMAL_COMPUTE_CAPABILITY_SUPPORTED. An entry
# without a 'compute_capability' converts to 0.0 and is compared as such.
# Raises ArgumentError when the model has no entry in @@gpus, instead of
# failing with a NoMethodError on nil that does not name the model.
def self.is_cc_supported?(model)
  gpu = @@gpus.fetch(model) do
    raise ArgumentError, "Unknown GPU model #{model.inspect}: cannot determine its compute capability"
  end
  gpu['compute_capability'].to_f >= MINIMAL_COMPUTE_CAPABILITY_SUPPORTED
end
129143
end

0 commit comments

Comments
 (0)