/
g5k-campaign-tutorial.rb
277 lines (238 loc) · 10.9 KB
/
g5k-campaign-tutorial.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
#!/usr/bin/env ruby
# This tutorial will show you how to use the [g5k-campaign](http://g5k-campaign.gforge.inria.fr)
# tool to easily build a whole experiment workflow.
#
# This tool is built on the [Restfully](http://github.com/grid5000/restfully) library,
# and provides a higher level of abstraction, with support for parallel
# execution of reservations, deployments, and resource configuration.
#
# You can download the source file for this tutorial from here:
# <https://github.com/grid5000/tutorials/blob/master/api/2.0/g5k-campaign-tutorial.rb>.
#
# Prerequisites
# ---------------------------
# You need to install the `g5k-campaign` library. Assuming you have `Ruby` and
# `rubygems` (1.3.5+) already installed on your system, this can be done with:
#
# gem install g5k-campaign \
# --source http://g5k-campaign.gforge.inria.fr/pkg
#
# The library comes with an executable.
# You can get a better feel of what it can do by displaying the usage help:
#
# g5k-campaign -h
#
# If you do not provide any option, it will launch a campaign using the default parameters, which are:
#
# * submit a 1-hour job on one node of the `rennes` site, and
# * deploy the `lenny-x64-base` environment on the reserved node.
#
# Note that you must have an SSH key installed on your machine, with the
# public part of that key in the authorized keys of the
# `access.grid5000.fr` machine.
# If this is not the case, follow the first steps as described at <http://pkeck.myweb.uga.edu/ssh/>, or in the [Grid'5000 wiki](https://www.grid5000.fr/mediawiki/index.php/SSH).
# Building your own engine
# ---------------------------
# You can play a little with the various options of the default engine, but
# it is probably more useful to create your own engine that will execute
# specific actions before, after or at any state of your experiment.
# Let's start with a simple engine:
class SimpleCustomEngine < Grid5000::Campaign::Engine
  # Engine-level defaults, overriding some of the stock parameters.
  # The complete list of options can be found [here](http://g5k-campaign.gforge.inria.fr/Grid5000/Campaign/Engine.html).
  # Any parameter given on the command-line always takes precedence over the
  # values set here.
  set :environment, "lenny-x64-base"
  set :resources, "nodes=2"
  set :walltime, 7200
  set :site, "nancy"

  # By default, all the reserved nodes are released when the engine
  # terminates. Setting `:no_cleanup` keeps them available after the end of
  # the workflow, so that we can still use them.
  set :no_cleanup, true

  # Hook executed before resources are reserved.
  # Every hook must return `env`; otherwise the hook is considered to have
  # failed.
  before :reserve! do |env, *_args|
    logger.info "Executed before reservation!"
    env
  end

  # Hook executed after resources have been deployed: logs the deployed nodes.
  after :deployment! do |env, *_args|
    logger.info "[#{env[:site]}] Nodes have been deployed: #{env[:nodes].inspect}"
    env
  end

  # Hook for the installation phase (i.e. after reservation and deployment
  # are done).
  # We SSH to each node in turn to install additional software. This is a
  # naive, sequential approach that only works for small numbers of nodes.
  on :install! do |env, *_args|
    logger.info "[#{env[:site]}] Installing additional software on the nodes..."
    env[:nodes].each do |hostname|
      ssh(hostname, "root", :timeout => 10) do |session|
        install_log = session.exec!("apt-get update && apt-get install -y ganglia-monitor bonnie++")
        logger.debug install_log
        # Files can easily be copied to the nodes. As an example, we send a
        # file (named after the job uid) containing the list of reserved
        # nodes, one per line.
        remote_path = "/tmp/#{env[:job]['uid']}"
        uploader = session.scp
        uploader.upload!(StringIO.new(env[:nodes].join("\n")), remote_path)
      end
    end
    env
  end

  # Hook for the execution phase, run once the nodes have been reserved,
  # deployed, and installed.
  # The `:multi` option runs SSH commands in parallel on all nodes. This is
  # better than sequentially SSHing to nodes, but for a large number of nodes
  # you should probably connect to the frontend and launch a
  # [`taktuk`](http://taktuk.gforge.inria.fr/) process for efficient execution.
  on :execute! do |env, *_args|
    ssh(env[:nodes], "root", :multi => true, :timeout => 10) do |session|
      # Run the `bonnie++` benchmark on each node, and publish a custom
      # metric every 20 seconds (the `sleep 20` below). In a real experiment
      # you would want to send something other than $RANDOM.
      benchmark_cmd = %Q{nohup sh -c '(while true; do gmetric --name custom_metric_#{env[:user]} --type uint16 --value $RANDOM; sleep 20; done &) && bonnie++ -u root -d /tmp 1>/dev/null 2>&1' >/dev/null &}
      logger.info "[#{env[:site]}] Executing command: #{benchmark_cmd}"
      session.exec(benchmark_cmd)
      # In the multi version, we must explicitly tell when the commands are
      # ready to be launched in parallel on all nodes.
      # See <http://net-ssh.github.com/multi/v1/api/index.html> for more info.
      session.loop
    end
    env
  end

  # Example of what can be done once everything is set up and running:
  # poll the values of two timeseries (our custom metric and `cpu_idle`)
  # ten times, pausing 15 seconds between rounds.
  # We use the [Metrology API](https://api.grid5000.fr/2.0/metrics/help/index.html) for that.
  after :execute! do |env, *_args|
    window_start = env[:job]['submitted_at']
    resolution = 15
    10.times do
      # The upper bound of the query window trails "now" by one resolution step.
      window_end = Time.now.to_i - resolution
      metric_names = ["custom_metric_#{env[:user]}", "cpu_idle"]
      metric_names.each do |metric_name|
        begin
          logger.info "[#{env[:site]}] Fetching timeseries for #{metric_name} metric..."
          # In all engines you have access to a `connection` handler, which is
          # a `Restfully::Session` object that you can use to access the
          # Grid'5000 API.
          #
          # See the Restfully tutorial for more details.
          site_resource = connection.root.sites[env[:site].to_sym]
          metric_resource = site_resource.metrics[metric_name.to_sym]
          query = {
            :only => env[:nodes].join(","),
            :resolution => resolution,
            :from => window_start,
            :to => window_end
          }
          metric_resource.timeseries(:query => query).reload.each do |series|
            logger.info [env[:site], series['uid'], metric_name].join(" - ")
            logger.info series['values'].inspect
          end
        rescue => error
          # A failed fetch is only logged, so the polling loop keeps going.
          logger.warn "[#{env[:site]}] Error when fetching #{metric_name} metric: #{error.inspect}"
        end
      end
      sleep 15
    end
    env
  end
end
# Running your engines
# ---------------------------
# To execute this engine, you have two solutions.
# Either you copy the [source file](#section-2) to your machine, and then launch it as follows (please replace `login` with your login):
#
# g5k-campaign -i path/to/file --gateway access.grid5000.fr \
# -u `login` \
# SimpleCustomEngine
#
# or you directly pass the source file URI to `g5k-campaign` (but you can't make changes):
#
# g5k-campaign -i https://github.com/grid5000/tutorials\
# /raw/master/api/2.0/g5k-campaign-tutorial.rb \
# --gateway access.grid5000.fr \
# -u `login` \
# SimpleCustomEngine
#
# As a side-note, if you are in the process of developing or modifying an
# engine, the `--dev` option of `g5k-campaign` is very useful ;-)
#
# Advanced example
# ---------------------------
# One of the interesting features of `g5k-campaign` is that you can reuse
# existing engines by creating a subclass. Let's say you'd like to execute the
# previous workflow on more than one site, here is a way to do it.
# Note how we inherit from `SimpleCustomEngine`:
class Grid < SimpleCustomEngine
  # Minimum number of available nodes (as reported by `how_many?`) a site
  # must have to be included in the reservation. Sites below this threshold
  # are skipped.
  MIN_AVAILABLE_NODES = 5

  set :site, :all # :all or :rennes or [:rennes, :nancy] or...

  # Hook executed before the reservation phase: logs the requested site(s).
  # As for every hook, `env` must be returned.
  before :reserve! do |env, *args|
    logger.info "Reserving nodes on #{env[:site]} sites..."
    env
  end

  # Here we change what is done by the default `:reserve!` hook, so that we
  # can launch the reservation process on more than one site at a time.
  #
  # `env` is the engine environment; `block` is forwarded untouched to
  # `reserve!` for each selected site.
  on :reserve! do |env, block|
    # We make use of the `how_many?` helper function which returns the number of available nodes on each site (see <http://g5k-campaign.gforge.inria.fr/Grid5000/Campaign/Engine.html#how_many%3F-instance_method>).
    status = how_many?
    logger.info "Status=#{status.inspect}"
    case env[:site].to_s
    when "all"
      sites = status.keys
    when "any"
      # If any site will do, take the one with the most nodes available.
      # `status` may be empty, in which case no site is selected.
      best = status.sort_by { |_uid, count| count }.last
      sites = best ? [best[0]] : []
    else
      # NOTE(review): the site uids given here are looked up in `status`
      # below; confirm they use the same key type (String vs Symbol) as the
      # keys returned by `how_many?`.
      sites = [env[:site]].flatten
    end
    # `g5k-campaign` comes with helper methods for parallel execution (see <http://g5k-campaign.gforge.inria.fr/Grid5000/Campaign/Parallel.html>), whose usage is demonstrated here.
    # The parallel object is stored in `env` before the per-site copies are
    # made, so that every copy carries a reference to it (it is used again in
    # the `before :execute!` hook).
    env[:parallel_reserve!] = parallel(:ignore_thread_exceptions => true)
    envs = []
    sites.each do |uid|
      available = status[uid]
      if available.nil? || available < MIN_AVAILABLE_NODES
        logger.info "Skipped #{uid} since it has only #{available} nodes that match our requirements."
      else
        site_env = env.merge(:site => uid)
        # The block parameter deliberately has its own name: reusing `env`
        # would shadow (and, on Ruby 1.8, overwrite) the hook's `env`.
        env[:parallel_reserve!].add(site_env) do |thread_env|
          reserve!(thread_env, &block)
        end
        envs.push(site_env)
      end
    end
    # Master thread must wait for all other threads termination:
    env[:parallel_reserve!].loop!
    # At the end of the whole workflow, automatically display the URL at which
    # a graphical view of the metrics can be seen.
    # Skip the sites where the reservation failed (no `:job` in their env).
    reserved = envs.reject { |site_env| site_env[:job].nil? }
    if reserved.empty?
      logger.warn "No reservation succeeded: there are no metrics to display."
    else
      metrics_query = reserved.map do |site_env|
        [site_env[:site], site_env[:job]['uid']].join(":")
      end.join(",")
      logger.info "You can get a graph of your metrics at https://api.grid5000.fr/sid/ui/metrics.html?jobs=#{metrics_query}"
    end
    env
  end

  # Hook executed before the execution phase.
  before :execute! do |env, *args|
    # Call `#wait!` on the parallel object if you want to synchronize all
    # threads at some point. In this example, the execution phase will be
    # launched only after all the other threads have arrived here.
    logger.info "[#{env[:site]}] Waiting for other deployments to finish..."
    env[:parallel_reserve!].wait!
    env
  end

  # Hook executed after the execution phase: logs completion for the site.
  after :execute! do |env, *args|
    logger.info "[#{env[:site]}] Done!"
    env
  end
end
# Running your engines (with inheritance)
# ---------------------------
# If all your engines are declared in the same file, just use the same command
# as before but replace `SimpleCustomEngine` with `Grid` in the command (please replace `login` with your login):
#
# g5k-campaign -i https://github.com/grid5000/tutorials\
# /raw/master/api/2.0/g5k-campaign-tutorial.rb \
# --gateway access.grid5000.fr \
# -u `login` \
# Grid
#
# If you have engines declared in more than one file, just use multiple `-i`
# flags to include them all.
# Conclusion
# ---------------------------
# This concludes our tutorial. Please see the [documentation](http://g5k-campaign.gforge.inria.fr/) and
# [examples](https://gforge.inria.fr/plugins/scmgit/cgi-bin/gitweb.cgi?p=g5k-campaign/g5k-campaign.git;a=tree;f=examples;hb=HEAD) for more advanced usages,
# including grid reservation, multiple deployments, notifications, etc.