start_remote.sh (forked from ufoym/deepo)
#!/bin/bash
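# Build a per-developer development image on top of a shared ML base image and launch it
# as an interactive container with GPU access, X11 forwarding (via a socat proxy), camera
# and network-drive mounts, and Jupyter/TensorBoard ports exposed on the host.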
help(){
printf "script usage: $(basename $0) [-d name] [-b image name] [-s dir] [-p project] [-g IDs] [-m amount] [-e port] [-i init] [-v verbose] \n \
-d name : Name of developer(container username) (required) \n \
-b base image : Full name the base image (default: dayta/ml_development:latest ) \n \
-s dir : Root path of source code (default: ~/github) \n \
-p project : Project name (directory) (default: DaytaBase) \n \
-g IDs : IDs of GPUs which are exposed to container , seperated by comma (default: 0) \n \
-m amount : share memory size (default: 2g) \n \
-e exposed port : expose extra port, can be a port number or a range of port number \n \
-i init script : starting shell script (default: scripts/init.sh) \n \
-v verbose : enable docker build logs (default: False) \n \
" >&2
exit 1
}
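# find_next_available START [VALUES...]
# Finds the smallest integer >= START that does not appear in the (numeric) VALUES list,
# e.g. the next free container-name suffix or host port.
# The result is stored in the global variable find_next_available_ret.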
find_next_available(){
find_next_available_ret=$1
checked=false
shift
for item in $(printf '%s\n' "$@" | grep '^[0-9]\+$' | sort -n)
do
if [ "$checked" = true ]
then
if [ "$find_next_available_ret" = "$item" ]
then
((find_next_available_ret++))
else
break
fi
elif [ "$item" = "$find_next_available_ret" ]
then
checked=true
((find_next_available_ret++))
fi
done
}
# Get deepo path
SCRIPTPATH=$(dirname "$(realpath "$0")")
# Default values
BASE_IMAGE='dayta/ml_development:latest'
SOURCE_CODE_DIR=${HOME}/github
PROJECT=DaytaBase
GPU=0
SHM_SIZE=2g
EXPOSED_PORT=''
STARTUP_SCRIPT=scripts/init.sh
VERBOSE='-q'
# Default host ports (bumped to the next free port if already in use)
TENSORBOARD_PORT=16006
JUPYTER_PORT=8888
while getopts 'd:b:s:p:g:m:e:i:v' OPTION; do
case "$OPTION" in
d)
DEVELOPER_NAME=$OPTARG
;;
b)
BASE_IMAGE=$OPTARG
;;
s)
SOURCE_CODE_DIR=$OPTARG
;;
p)
PROJECT=$OPTARG
;;
g)
GPU=$OPTARG
;;
m)
SHM_SIZE=$OPTARG
;;
e)
EXPOSED_PORT=$OPTARG
;;
i)
STARTUP_SCRIPT=$OPTARG
;;
v)
VERBOSE=''
;;
?)
help
;;
esac
done
# Auto-clean failed containers and dangling (<none>) images
docker ps -a | grep "Exited (1" | cut -d ' ' -f 1 | xargs -r docker rm > /dev/null 2>&1
docker rmi $(docker images -f "dangling=true" -q) > /dev/null 2>&1
# Check if required argument is provided
if [ -z "$DEVELOPER_NAME" ]
then
echo "Missing argument -d name"
help
fi
# Check if project folder exists
if [ ! -d "${SOURCE_CODE_DIR}/${PROJECT}/" ]
then
printf "Project folder ${SOURCE_CODE_DIR}/${PROJECT}/ does not exist, please check the arguments \n"
help
fi
# Check if DISPLAY environment variable is set
if [ -z "$DISPLAY" ]
then
printf "\$DISPLAY environment variable is not set; check your ssh command or use start_local.sh if you don't need to forward X11 (application display) \n"
exit 1
fi
# Define the image name, container name, container username and container display number
ML_CONTAINER_DISPLAY="0"
ML_IMAGE_NAME=${DEVELOPER_NAME}_${PROJECT}_ENVIRONMENT
ML_CONTAINER_NAME=${DEVELOPER_NAME}_${PROJECT}
ML_CONTAINER_USERNAME=$DEVELOPER_NAME
# Docker image names must be lowercase
ML_IMAGE_NAME=${ML_IMAGE_NAME,,}
# Add a numeric suffix to the container name to avoid collisions
find_next_available 0 $(docker ps -a --format '{{.Names}}' | \
grep "^$ML_CONTAINER_NAME" | \
sed -e "s/^$ML_CONTAINER_NAME\_//")
CURRENT_INDEX=$find_next_available_ret
ML_CONTAINER_NAME=$ML_CONTAINER_NAME\_$CURRENT_INDEX
# Get the DISPLAY slot
DISPLAY_NUMBER=$(echo $DISPLAY | cut -d. -f1 | cut -d: -f2)
# Remove and create a directory for the x11 socket
rm -rf .display_${DISPLAY_NUMBER}
mkdir -p .display_${DISPLAY_NUMBER}/socket
touch .display_${DISPLAY_NUMBER}/Xauthority
# Copy the project's requirements.txt into the build context; if it does not exist, create an empty one
if [ -f "${SOURCE_CODE_DIR}/${PROJECT}/requirements.txt" ]
then
cp ${SOURCE_CODE_DIR}/${PROJECT}/requirements.txt ./requirements.txt
else
touch ./requirements.txt
fi
# Build the image
echo "Pulling docker base image, please wait..."
docker pull ${VERBOSE} ${BASE_IMAGE}
echo "Building docker image, please wait..."
docker tag ${BASE_IMAGE} deepo_tmp
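# The base image is re-tagged as deepo_tmp so the accompanying Dockerfile (not shown here)
# can reference a fixed base name. The build args below are presumably consumed by that
# Dockerfile: USERNAME/USER_ID/GROUP_ID create a container user matching the host user,
# REQUIREMENTS installs the project's pip dependencies, and INIT_SCRIPT is the startup script.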
docker build \
${VERBOSE} \
-t ${ML_IMAGE_NAME} \
--build-arg USERNAME=${ML_CONTAINER_USERNAME} \
--build-arg USER_ID=$(id -u) \
--build-arg GROUP_ID=$(id -g) \
--build-arg REQUIREMENTS=requirements.txt \
--build-arg INIT_SCRIPT=${STARTUP_SCRIPT} .
rm ./requirements.txt
# Extract current authentication cookie
AUTH_COOKIE=$(xauth list | grep "^$(hostname):${DISPLAY_NUMBER} " | awk '{print $3}')
if [ -z "$AUTH_COOKIE" ]
then
AUTH_COOKIE=$(xauth list | grep "^$(hostname)/unix:${DISPLAY_NUMBER} " | awk '{print $3}')
fi
# Create the new X Authority file
xauth -v -f .display_${DISPLAY_NUMBER}/Xauthority add ${ML_CONTAINER_NAME}/unix:${ML_CONTAINER_DISPLAY} MIT-MAGIC-COOKIE-1 ${AUTH_COOKIE}
# Proxy the container's :0 UNIX socket to the host's (forwarded) X display on TCP port 6000+N;
# socat exits when the client disconnects, so keep re-launching it in a background loop
while true; do socat -d -d TCP4:localhost:$((6000 + DISPLAY_NUMBER)) UNIX-LISTEN:.display_${DISPLAY_NUMBER}/socket/X${ML_CONTAINER_DISPLAY} > .display_${DISPLAY_NUMBER}/socat.log 2>&1; sleep 1; done &
# Determine docker run gpu arguments by docker version
chmod u+x scripts/version_compare
DOCKER_VERSION=$(docker --version | sed 's/^Docker version //; s/,.*//')
if [ $(./scripts/version_compare $DOCKER_VERSION 19.03) -gt 0 ]
then
# Expose specified gpus to container (Docker version >= 19.03)
GPU_ARG='--gpus "device='${GPU}'"'
else
# Expose all gpus to the container and assign the specified gpus via an environment variable (Docker version < 19.03)
GPU_ARG="--runtime=nvidia -e CUDA_VISIBLE_DEVICES=${GPU}"
fi
# Get all cameras
CAMS=""
for CAM in /dev/video*
do
[ -e "$CAM" ] || continue
CAMS="$CAMS --device $CAM:$CAM"
done
echo "Available cameras: $CAMS"
# Get network drives
N_DRIVES=""
for N_DRIVE in /mnt/network_drive*
do
[ -e "$N_DRIVE" ] || continue
N_DRIVES="$N_DRIVES -v $N_DRIVE:$N_DRIVE"
done
# Create Jupyter directory
mkdir -p ${SCRIPTPATH}/User_${ML_CONTAINER_USERNAME}/.jupyter
# Create pytorch cache directory
mkdir -p ${HOME}/.cache/torch/checkpoints/
# Create mxnet cache directory
mkdir -p ${HOME}/.mxnet/
# Get jupyter port
find_next_available $JUPYTER_PORT $(docker ps --format {{.Ports}} | \
tr , '\n' | \
awk '{$1=$1};1'| \
sed "s/\(.*\)->\(.*\)\/tcp/\2/" )
JUPYTER_PORT=$find_next_available_ret
echo "The Jupyter Notebook is running at: http://localhost:${JUPYTER_PORT}?token=daytaai"
# Get tensorboard port
find_next_available $TENSORBOARD_PORT $(docker ps --format {{.Ports}} | \
tr , '\n' | \
awk '{$1=$1};1'| \
sed "s/\(.*\)->\(.*\)\/tcp/\2/" )
TENSORBOARD_PORT=$find_next_available_ret
# Expose the extra port, if requested
if [ -n "$EXPOSED_PORT" ]
then
EXPOSED_PORT="-p $EXPOSED_PORT:$EXPOSED_PORT"
fi
# Launch the container
docker run -it --rm \
--name ${ML_CONTAINER_NAME} \
--hostname ${ML_CONTAINER_NAME} \
--shm-size=${SHM_SIZE} \
${GPU_ARG} \
${CAMS} \
-h ${ML_CONTAINER_NAME} \
-e QT_X11_NO_MITSHM=1 \
-e DISPLAY=:${ML_CONTAINER_DISPLAY} \
-e TENSORBOARD_PORT=${TENSORBOARD_PORT} \
-e JUPYTER_PORT=${JUPYTER_PORT} \
-e NVIDIA_DRIVER_CAPABILITIES=compute,video,utility,graphics \
-v ${HOME}/.aws:/home/${ML_CONTAINER_USERNAME}/.aws \
-v ${SOURCE_CODE_DIR}:/home/${ML_CONTAINER_USERNAME}/workspace \
-v ${HOME}/.cache/torch/checkpoints:/home/${ML_CONTAINER_USERNAME}/.cache/torch/checkpoints \
-v ${HOME}/.mxnet:/home/${ML_CONTAINER_USERNAME}/.mxnet \
-v ${PWD}/.display_${DISPLAY_NUMBER}/socket:/tmp/.X11-unix \
-v ${PWD}/.display_${DISPLAY_NUMBER}/Xauthority:/home/${ML_CONTAINER_USERNAME}/.Xauthority \
-v ${SCRIPTPATH}/User_${ML_CONTAINER_USERNAME}/.jupyter:/home/${ML_CONTAINER_USERNAME}/.jupyter \
${N_DRIVES} \
-p ${TENSORBOARD_PORT}:${TENSORBOARD_PORT} \
-p ${JUPYTER_PORT}:${JUPYTER_PORT} \
${EXPOSED_PORT} \
-w /home/${ML_CONTAINER_USERNAME}/workspace/${PROJECT} \
${ML_IMAGE_NAME} /home/${ML_CONTAINER_USERNAME}/init.sh
# Kill the background socat proxy loop
kill -9 $!