# argo_poet.yaml
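#
# Argo Workflow that runs the CMS PhysObjectExtractorTool (POET) over a CERN
# Open Data record: it scatters the input files across parallel cmsRun jobs,
# merges the outputs with hadd, and fills some control histograms.
#
# A typical submission, overriding the default parameters declared under
# spec.arguments (the values here are illustrative):
#   argo submit argo_poet.yaml -p recid=24119 -p nJobs=4 -p nEvents=10000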
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: poet-process-
spec:
  entrypoint: cms-od-example
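  # Shared storage used to pass files between the steps; replace <NUMBER>
  # with the suffix of the PersistentVolumeClaim available in your namespace.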
  volumes:
    - name: task-pv-storage
      persistentVolumeClaim:
        claimName: nfs-<NUMBER>
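  # Default top-level parameters (override with "argo submit -p name=value"):
  #   startFile - index of the first input file to read from the file list
  #   nEvents   - total number of events to process, split across the jobs
  #   recid     - CERN Open Data record id of the dataset
  #   nJobs     - number of parallel POET jobs in the scatter step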
  arguments:
    parameters:
      - name: startFile
        value: 1
      - name: nEvents
        value: 10000
      - name: recid
        value: 24119
      - name: nJobs
        value: 4
  templates:
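    # Top-level DAG: prepare -> get-metadata -> joblist -> runpoet (fanned out
    # over the joblist JSON via "withParam") -> merge-step -> analysis-step.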
    - name: cms-od-example
      inputs:
        parameters:
          - name: startFile
          - name: nEvents
          - name: recid
          - name: nJobs
      dag:
        tasks:
          - name: prepare
            template: prepare-template
          - name: get-metadata
            dependencies: [prepare]
            template: get-metadata-template
            arguments:
              parameters:
                - name: recid
                  value: "{{inputs.parameters.recid}}"
          - name: joblist
            dependencies: [get-metadata]
            template: joblist-template
            arguments:
              parameters:
                - name: startFile
                  value: "{{inputs.parameters.startFile}}"
                - name: nJobs
                  value: "{{inputs.parameters.nJobs}}"
                - name: nEvents
                  value: "{{inputs.parameters.nEvents}}"
                - name: totFiles
                  value: "{{tasks.get-metadata.outputs.result}}"
          - name: runpoet
            dependencies: [joblist]
            template: runpoet-template
            arguments:
              parameters:
                - name: recid
                  value: "{{inputs.parameters.recid}}"
                - name: dataType
                  value: "{{tasks.get-metadata.outputs.parameters.dataType}}"
                - name: it
                  value: "{{item.it}}"
                - name: firstFile
                  value: "{{item.firstfile}}"
                - name: lastFile
                  value: "{{item.lastfile}}"
                - name: eventsInJob
                  value: "{{item.eventsinjob}}"
            withParam: "{{tasks.joblist.outputs.result}}"
          - name: merge-step
            dependencies: [runpoet]
            template: merge-step-template
          - name: analysis-step
            dependencies: [merge-step]
            template: analysis-step-template
    # Prepare the data directories needed in the workflow steps.
    - name: prepare-template
      script:
        image: ubuntu:latest
        command: [bash]
        source: |
          mkdir -p /mnt/vol/scatter
          chmod -R 777 /mnt/vol
        volumeMounts:
          - name: task-pv-storage
            mountPath: /mnt/vol
    # Get the metadata of the dataset.
    # Incidentally, this shows three different ways of passing parameters/files between the steps:
    # - the full list of files: written to a file on the common disk mounted on /mnt/vol
    # - the type of data: written to the step's output parameter "{{tasks.get-metadata.outputs.parameters.dataType}}" (through a temporary file /tmp/type.txt)
    # - the total number of files: the stdout of this step, which goes to {{tasks.get-metadata.outputs.result}}
    #
    - name: get-metadata-template
      inputs:
        parameters:
          - name: recid
      outputs:
        parameters:
          - name: dataType
            valueFrom:
              default: "default"
              path: /tmp/type.txt
      script:
        image: cernopendata/cernopendata-client
        command: [bash]
        source: |
          cernopendata-client get-file-locations --recid "{{inputs.parameters.recid}}" --protocol xrootd > /mnt/vol/files_{{inputs.parameters.recid}}.txt
          cernopendata-client get-metadata --recid "{{inputs.parameters.recid}}" --output-value type.secondary > /tmp/type.txt
          cernopendata-client get-metadata --recid "{{inputs.parameters.recid}}" --output-value distribution.number_files
        volumeMounts:
          - name: task-pv-storage
            mountPath: /mnt/vol
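    # Example: for a collision-data record the type.secondary metadata contains
    # "Collision"; the runpoet step greps for that string to set isData=True,
    # while simulated datasets fall through to isData=False.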
    # Generate the iterator list for the scatter step.
    # Compute the number of events and files for each job.
    # Write out the list with the first and last file numbers and the number of events,
    # to be taken as the input of the following steps
    # (see {{tasks.joblist.outputs.result}} used as "withParam" in the runpoet task).
    - name: joblist-template
      inputs:
        parameters:
          - name: nJobs
          - name: nEvents
          - name: startFile
          - name: totFiles
      script:
        image: python:alpine3.6
        command: [python]
        source: |
          import json
          import sys
          start = {{inputs.parameters.startFile}}
          nJobs = {{inputs.parameters.nJobs}}
          nEvents = {{inputs.parameters.nEvents}}
          totFiles = {{inputs.parameters.totFiles}}
          # Split files and events evenly; any remainder goes to the last job.
          filesInJob = int(totFiles / nJobs)
          modFiles = totFiles % nJobs
          eventsInJob = int(nEvents / nJobs)
          modEvents = nEvents % nJobs
          dictlist = []
          for i in range(1, nJobs + 1):
              first = start + (i - 1) * filesInJob
              last = first + filesInJob - 1
              adict = {"it": i,
                       "firstfile": first,
                       "lastfile": last,
                       "eventsinjob": eventsInJob}
              if i == nJobs:
                  # The last job takes the leftover files and events.
                  adict = {"it": i,
                           "firstfile": first,
                           "lastfile": last + modFiles,
                           "eventsinjob": eventsInJob + modEvents}
              dictlist.append(adict)
          json.dump(dictlist, sys.stdout)
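    # Worked example (illustrative numbers): with the defaults startFile=1,
    # nEvents=10000, nJobs=4 and, say, totFiles=10, this prints
    #   [{"it": 1, "firstfile": 1, "lastfile": 2, "eventsinjob": 2500},
    #    {"it": 2, "firstfile": 3, "lastfile": 4, "eventsinjob": 2500},
    #    {"it": 3, "firstfile": 5, "lastfile": 6, "eventsinjob": 2500},
    #    {"it": 4, "firstfile": 7, "lastfile": 10, "eventsinjob": 2500}]
    # i.e. the last job absorbs the remainder of files and events.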
    # Run the CMSSW step.
    # This template runs once per entry of the list it receives through "withParam".
    - name: runpoet-template
      inputs:
        parameters:
          - name: it
          - name: firstFile
          - name: lastFile
          - name: recid
          - name: dataType
          - name: eventsInJob
      script:
        image: gitlab-registry.cern.ch/cms-cloud/cmssw-docker-opendata/cmssw_7_6_7-slc6_amd64_gcc493
        command: [bash]
        source: |
          sudo chown $USER /mnt/vol
          source /opt/cms/entrypoint.sh
          git clone -b odws2023 https://github.com/cms-opendata-analyses/PhysObjectExtractorTool.git
          cd PhysObjectExtractorTool/PhysObjectExtractor
          scram b
          it="{{inputs.parameters.it}}"
          eventsInJob="{{inputs.parameters.eventsInJob}}"
          dataType="{{inputs.parameters.dataType}}"
          echo Datatype $dataType
          # Collision datasets are real data; everything else is treated as simulation.
          isData=False
          if echo $dataType | grep Collision; then isData=True; fi
          firstFile="{{inputs.parameters.firstFile}}"
          lastFile="{{inputs.parameters.lastFile}}"
          echo firstFile $firstFile
          echo lastFile $lastFile
          # Process files firstFile..lastFile from the file list written by the
          # get-metadata step, for eventsInJob events.
          cmsRun python/poet_cfg_cloud.py $isData $firstFile $lastFile '"/mnt/vol/files_{{inputs.parameters.recid}}.txt"' $eventsInJob
          mv myoutput.root /mnt/vol/scatter/poetoutput$it.root
        volumeMounts:
          - name: task-pv-storage
            mountPath: /mnt/vol
        resources:
          requests:
            cpu: 750m
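    # The modest CPU request (no limit set) helps the scheduler fit the
    # parallel scatter pods onto a small cluster; tune it to your nodes.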
    # Merge the files from the scatter steps into a single file.
    - name: merge-step-template
      script:
        image: rootproject/root:latest
        command: [bash]
        source: |
          rm -f /mnt/vol/poetoutput.root
          hadd -f /mnt/vol/poetoutput.root /mnt/vol/scatter/poetoutput*.root
          rm -rf /mnt/vol/scatter/ # this contains the separated output files, now merged
        volumeMounts:
          - name: task-pv-storage
            mountPath: /mnt/vol
    # Prepare some histograms to check the merged output file.
    - name: analysis-step-template
      script:
        image: rootproject/root:latest
        command: [bash]
        source: |
          cp /mnt/vol/poetoutput.root .
          curl -LO https://raw.githubusercontent.com/cms-opendata-analyses/PhysObjectExtractorTool/odws2023/PhysObjectExtractor/cloud/analysis.C
          root -l -b -q analysis.C
          mv analysis_output.root /mnt/vol/analysis_output.root
          mv *.png /mnt/vol
        volumeMounts:
          - name: task-pv-storage
            mountPath: /mnt/vol
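# After a successful run, per the steps above, /mnt/vol holds
# files_<recid>.txt, poetoutput.root, analysis_output.root and the PNG
# histograms produced by analysis.C.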