-
Notifications
You must be signed in to change notification settings - Fork 21
/
run_spark
executable file
·184 lines (170 loc) · 7.31 KB
/
run_spark
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/bin/bash
# Author: Valentin Kuznetsov <vkuznet AT gmail [DOT] com>
# A wrapper script to submit a spark job with a CMSSpark python script.
#
# Usage: run_spark <cmsspark_script> <options>
#
# The interpreter is bash rather than plain sh because the script relies on
# bash-only constructs ([[ ... =~ ... ]], ${@:2}, the `source` builtin and
# `==` inside [ ]), which a strict POSIX /bin/sh does not provide.

# on vocms092 spark-submit resides in a non-standard location
export PATH="$PATH:/usr/hdp/spark/bin"

# the first argument (the CMSSpark script) is mandatory
if [ "$#" -eq 0 ]; then
  echo "Usage: run_spark <cmsspark_script> <options>"
  exit 1
fi
# find out where CMSSpark is installed on a system: ask python for the
# directory that holds the CMSSpark package
droot=$(python -c "import CMSSpark; print('/'.join(CMSSpark.__file__.split('/')[:-1]))")

# Use the first argument as is when it names an existing file, otherwise look
# the script up inside the CMSSpark installation directory.
# "$1" is quoted on purpose: an unquoted empty argument collapses the test to
# `[ -f ]`, which is always true, and a path with blanks breaks the test.
if [ -f "$1" ]; then
  cmsspark=$1
else
  cmsspark=$droot/$1
fi
# enable simple secret to run in non-yarn mode
# Every host except vocms092 gets the authentication secret together with
# fixed driver / block-manager / UI ports; on vocms092 the CMSMonitoring.zip
# archive is added through --py-files instead.
conf=""
hostname=$(hostname -s)
case "$hostname" in
  vocms092)
    conf="$conf --py-files /data/cms/CMSMonitoring/src/python/CMSMonitoring.zip"
    ;;
  *)
    # earlier variant, which used spark.yarn.security.tokens.hive.enabled:
    #conf="--conf spark.authenticate.secret=cmsspark --conf spark.yarn.security.tokens.hive.enabled=false --conf spark.driver.port=5001 --conf spark.blockManager.port=5101 --conf spark.ui.port=5201"
    conf="--conf spark.authenticate.secret=cmsspark --conf spark.yarn.security.credentials.hive.enabled=false --conf spark.driver.port=5001 --conf spark.blockManager.port=5101 --conf spark.ui.port=5201"
    ;;
esac
# look if we requested to show full log output, to disable spark output
# client should setup his/her own log4j.properties via the LOG4J_CONF
# environment variable
#
# When --no-log4j appears anywhere among the given arguments, the Spark console
# progress bar is switched off and, if LOG4J_CONF names an existing file, the
# driver JVM is pointed at that log4j configuration.
#
# Globals:   conf (appended to), LOG4J_CONF (read)
# Arguments: the command-line arguments of the script
apply_no_log4j_conf() {
  case "$*" in
    *--no-log4j*) ;;
    *) return 0 ;;
  esac
  # append instead of overwriting, so the options collected so far survive
  conf="$conf --conf spark.ui.showConsoleProgress=false "
  if [ -n "$LOG4J_CONF" ] && [ -f "$LOG4J_CONF" ]; then
    conf="$conf --conf spark.driver.extraJavaOptions='-Dlog4j.configuration=file:$LOG4J_CONF'"
  fi
}
apply_no_log4j_conf "$@"
# read spark options from the configuration file named by SPARK_OPTIONS
# the SPARK_OPTIONS file should provide <option=value> line for every option
# spark.network.timeout=120s
# spark.rpc.numRetries=3
#
# Print "--conf <line> " for every non-blank line of the given file. Blank
# lines are skipped because they would otherwise produce a dangling "--conf"
# that swallows the next command-line word.
# Arguments: $1 - path to the options file
# Outputs:   all --conf options on one line, each followed by a single space
spark_options_conf() {
  # the file is fed through stdin so awk never parses the path as an operand
  awk 'NF { printf "--conf %s ", $0 }' < "$1"
}
if [ -n "$SPARK_OPTIONS" ]; then
  conf="$conf $(spark_options_conf "$SPARK_OPTIONS")"
fi
# from https://cern.service-now.com/service-portal/view-request.do?n=RQF0876659
# jar files can be specified as follows (before Spark 2.0):
#    --packages com.databricks:spark-csv_2.11:1.6.0

# check if we're on lxplus7: the release file has to mention a 7 and the short
# host name has to start with "lxplus". When both CVMFS setup scripts are
# present they are sourced (LCG_94 view, analytix hadoop configuration) and
# lxplus7_setup is set to "true"; otherwise it stays empty.
lxplus7_setup=""
lxplus7=$(cat /etc/redhat-release | grep 7)
if [ -n "$lxplus7" ] \
  && hostname -s | grep -q ^lxplus \
  && [ -f /cvmfs/sft.cern.ch/lcg/views/LCG_94/x86_64-centos7-gcc7-opt/setup.sh ] \
  && [ -f /cvmfs/sft.cern.ch/lcg/etc/hadoop-confext/hadoop-setconf.sh ]; then
  source /cvmfs/sft.cern.ch/lcg/views/LCG_94/x86_64-centos7-gcc7-opt/setup.sh
  source /cvmfs/sft.cern.ch/lcg/etc/hadoop-confext/hadoop-setconf.sh analytix
  lxplus7_setup="true"
fi
# set avro jars
# Choose the jar files (and, for Spark 2.X, the avro package) that match the
# Spark major version. The 2.X branch is taken when the version banner of
# spark-submit contains "version 2" or when the lxplus7 environment was set up
# (lxplus7_setup is non-empty). The test reads lxplus7_setup, the variable the
# script actually sets; testing an undefined name with an unquoted `[ -f ]`
# is always true and would make the Spark 1.X branch unreachable.
# NOTE(review): the grep matches any banner line containing "version 2";
# presumably this can also hit a "Scala version 2.x" line -- confirm.
spark2=$(spark-submit --version 2>&1 | grep "version 2")
if [ -n "$spark2" ] || [ -n "$lxplus7_setup" ]; then
  echo "Using spark 2.X"
  # newest spark-examples jar shipped with the local installation, if any
  sparkexjar=$(ls /usr/hdp/spark/jars/spark-examples* /usr/hdp/spark-2/examples/jars/spark-examples* 2> /dev/null | tail -1)
  if [ -n "$sparkexjar" ]; then
    jars="$sparkexjar"
  else
    jars="/afs/cern.ch/user/v/valya/public/spark/spark-examples-1.6.0-cdh5.15.1-hadoop2.6.0-cdh5.15.1.jar"
  fi
  conf="$conf --packages org.apache.spark:spark-avro_2.11:2.4.3"
else
  echo "Using spark 1.X"
  csvjar=/afs/cern.ch/user/v/valya/public/spark/spark-csv-assembly-1.4.0.jar
  avrojar=/afs/cern.ch/user/v/valya/public/spark/avro-mapred-1.7.6-cdh5.7.6.jar
  jars="$csvjar,$avrojar"
  # pick a spark-examples jar: fixed HDP location first, then any jar under
  # /usr/lib/spark, finally the copy kept on AFS
  if [ -f /usr/hdp/spark-2/examples/jars/spark-examples_2.11-2.3.2.jar ]; then
    sparkexjar=/usr/hdp/spark-2/examples/jars/spark-examples_2.11-2.3.2.jar
    jars="$jars,$sparkexjar"
  else
    sparkexjar=$(ls /usr/lib/spark/examples/lib/spark-examples* 2> /dev/null | tail -1)
    if [ -n "$sparkexjar" ]; then
      jars="$jars,$sparkexjar"
    else
      sparkexjar=/afs/cern.ch/user/v/valya/public/spark/spark-examples-1.6.0-cdh5.15.1-hadoop2.6.0-cdh5.15.1.jar
      jars="$jars,$sparkexjar"
    fi
  fi
  # hadoop mapreduce client jar, later put on the driver class path
  mapreduce=/eos/project/s/swan/public/hadoop-mapreduce-client-core-2.6.0-cdh5.7.6.jar
  if [ ! -f "$mapreduce" ]; then
    mapreduce=/usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core-2.6.0-cdh5.7.6.jar
  fi
fi
# turn the collected jar list into the spark-submit --jars switch
[ -z "$jars" ] || jars="--jars $jars"

# everything after the script name is handed over to the CMSSpark script
args="${@:2}"
echo "$cmsspark $args"

# non-empty when --yarn appears among the arguments
yarn=$(echo $args | grep -- "--yarn")
# determine if we can load CVMFS
# https://cern.service-now.com/service-portal/article.do?n=KB0005361
# The LCG_93 view is sourced only when the lxplus7 setup did not happen, the
# host is an lxplus node or --cvmfs was requested, and both setup scripts
# exist; cvmfs is "true" in that case and empty otherwise.
cvmfs=""
ctest=$(echo $args | grep -- "--cvmfs")
if [ -z "$lxplus7_setup" ] \
  && { [ -n "$(hostname -s | grep ^lxplus)" ] || [ -n "$ctest" ]; } \
  && [ -f /cvmfs/sft.cern.ch/lcg/views/LCG_93/x86_64-centos7-gcc62-opt/setup.sh ] \
  && [ -f /cvmfs/sft.cern.ch/lcg/etc/hadoop-confext/hadoop-setconf.sh ]; then
  echo "Enable CVMFS ..."
  source /cvmfs/sft.cern.ch/lcg/views/LCG_93/x86_64-centos7-gcc62-opt/setup.sh
  source /cvmfs/sft.cern.ch/lcg/etc/hadoop-confext/hadoop-setconf.sh analytix
  cvmfs="true"
fi

# source proper environment for it-hadoop-client machines
if hostname -s | grep -q ithdp; then
  source hadoop-setconf.sh analytix
fi
# when the first option after the script name is a help flag, show the help
# of the CMSSpark script itself and stop without submitting anything
case "$2" in
  -h|--help|-help)
    python $cmsspark --help
    exit 0
    ;;
esac
# Print the number of local Spark worker threads (the same number is used as
# the executor memory in GB): a quarter of the CPUs reported by nproc, but
# never less than one, so that hosts with fewer than four CPUs do not end up
# with "local[0]" and "0G".
# Outputs: the number on stdout
local_slots() {
  local slots
  slots=$(( $(nproc) / 4 ))
  if [ "$slots" -lt 1 ]; then
    slots=1
  fi
  echo "$slots"
}

echo "PYTHONPATH: $PYTHONPATH"
echo "cmsspark: $cmsspark $args"

# options placed between the jars and $conf on the spark-submit command line
submit_opts=()
if [ -n "$yarn" ]; then
  # to tune up these numbers:
  #  - executor-memory not more than 5G
  #  - num-executor can be increased (suggested not more than 10)
  #  - cores = 2/4/8
  # Temp solution to have a wrapper for python27 on spark cluster
  # once CERN IT will resolve python version we can remove PYSPARK_PYTHON
  echo "YARN execution: $conf"
  submit_opts+=(--master yarn --executor-memory 5g)
  if [ "$cvmfs" == "true" ]; then
    # the driver memory is raised only together with the CVMFS environment
    submit_opts+=(--driver-memory 4g)
  fi
  submit_opts+=(--num-executors 10)
else
  # submit spark job with our file, please note
  # that user may increase memory options if necessary
  # the executor and driver memory options can be given in human readable form
  # while spark yarn option should use memoryOverhead as KB value.
  # Modify with local[*] to use all the available cores in the node
  # optionally increase driver memory with --driver-memory 2G (default 1G)
  echo "LOCAL (NO-YARN) execution"
  slots=$(local_slots)
  # "local[N]" is quoted so the shell never treats it as a glob pattern
  submit_opts+=(--executor-memory "${slots}G" --master "local[${slots}]")
fi
echo "conf=$conf cmsspark=$cmsspark args=$args cvmfs=$cvmfs"

if [ "$cvmfs" == "true" ]; then
  # run with the python of the LCG_93 view and pass on its library path
  submit_opts+=(
    --conf spark.pyspark.python=/cvmfs/sft.cern.ch/lcg/views/LCG_93/x86_64-centos7-gcc62-opt/bin/python
    --conf "spark.driver.extraClassPath=$mapreduce"
    --conf "spark.executorEnv.LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
  )
fi

# $jars, $conf, $cmsspark and $args are deliberately left unquoted: they hold
# several space-separated command-line words that must be split again here
spark-submit $jars "${submit_opts[@]}" $conf $cmsspark $args