merge nested warden into master #27
Changes from all commits
c8446d7
e932f25
b1cb5f3
6e1842c
e833b9b
be58d88
9712451
f664755
105df82
0781da5
0fa1f98
2169405
4766176
02d6a29
1d7587f
bb37857
b06c343
7bea55d
eaeb910
ec6f157
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,8 @@ server: | |
quota: | ||
disk_quota_enabled: true | ||
|
||
allow_nested_warden: false | ||
|
||
health_check_server: | ||
port: 2345 | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -82,6 +82,7 @@ def env | |
"network_netmask" => self.class.network_pool.pooled_netmask.to_human, | ||
"user_uid" => uid, | ||
"rootfs_path" => container_rootfs_path, | ||
"allow_nested_warden" => Server.config.allow_nested_warden?.to_s, | ||
"container_iface_mtu" => container_iface_mtu, | ||
} | ||
env | ||
|
@@ -194,38 +195,53 @@ def perform_rsync(src_path, dst_path) | |
sh *args | ||
end | ||
|
||
def write_bind_mount_commands(request) | ||
return if request.bind_mounts.nil? || request.bind_mounts.empty? | ||
def add_bind_mount(file, src_path, dst_path, mode) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
dst_path = File.join(container_path, "mnt", dst_path[1..-1]) | ||
|
||
file.puts "mkdir -p #{dst_path}" | ||
file.puts "mount -n --bind #{src_path} #{dst_path}" | ||
file.puts "mount -n --bind -o remount,#{mode} #{src_path} #{dst_path}" | ||
end | ||
|
||
File.open(File.join(container_path, "lib", "hook-parent-before-clone.sh"), "a") do |file| | ||
def write_bind_mount_commands(request) | ||
File.open(File.join(container_path, "lib", "hook-child-before-pivot.sh"), "a") do |file| | ||
file.puts | ||
file.puts | ||
|
||
request.bind_mounts.each do |bind_mount| | ||
src_path = bind_mount.src_path | ||
dst_path = bind_mount.dst_path | ||
|
||
# Check that the source path exists | ||
stat = File.stat(src_path) rescue nil | ||
if stat.nil? | ||
raise WardenError.new("Source path for bind mount does not exist: #{src_path}") | ||
if request.bind_mounts.respond_to?(:each) | ||
request.bind_mounts.each do |bind_mount| | ||
src_path = bind_mount.src_path | ||
dst_path = bind_mount.dst_path | ||
|
||
# Check that the source path exists | ||
stat = File.stat(src_path) rescue nil | ||
raise WardenError.new("Source path for bind mount does not exist: #{src_path}") if stat.nil? | ||
|
||
mode = case bind_mount.mode | ||
when Protocol::CreateRequest::BindMount::Mode::RO | ||
"ro" | ||
when Protocol::CreateRequest::BindMount::Mode::RW | ||
"rw" | ||
else | ||
raise "Unknown mode" | ||
end | ||
|
||
add_bind_mount(file, src_path, dst_path, mode) | ||
end | ||
end | ||
|
||
# Fix up destination path to be an absolute path inside the union | ||
dst_path = File.join(container_path, "mnt", dst_path[1..-1]) | ||
# for nested warden, we share the host's cgroup fs to containers using bind mount | ||
if Server.config.allow_nested_warden? | ||
tmp_warden_cgroup = File.join(container_path, "tmp", "warden", "cgroup") | ||
FileUtils.mkdir_p(tmp_warden_cgroup) | ||
|
||
mode = case bind_mount.mode | ||
when Protocol::CreateRequest::BindMount::Mode::RO | ||
"ro" | ||
when Protocol::CreateRequest::BindMount::Mode::RW | ||
"rw" | ||
else | ||
raise "Unknown mode" | ||
end | ||
# Bind-mount cgroups | ||
add_bind_mount(file, tmp_warden_cgroup, "/tmp/warden/cgroup", "rw") | ||
|
||
file.puts "mkdir -p #{dst_path}" | ||
file.puts "mount -n --bind #{src_path} #{dst_path}" | ||
file.puts "mount -n --bind -o remount,#{mode} #{src_path} #{dst_path}" | ||
# for each subsystem, only pass the group of the current container instead of all groups, so that the nested warden server will set up groups directly under the parent container's hierarchy | ||
%w(cpu cpuacct devices memory).each do |subsystem| | ||
add_bind_mount(file, cgroup_path(subsystem), "/tmp/warden/cgroup/#{subsystem}", "rw") | ||
end | ||
end | ||
end | ||
end | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,30 +20,27 @@ fi | |
|
||
cgroup_path=/tmp/warden/cgroup | ||
|
||
mkdir -p $cgroup_path | ||
|
||
if grep "${cgroup_path} " /proc/mounts | cut -d' ' -f3 | grep -q cgroup | ||
if [ ! -d $cgroup_path ] | ||
then | ||
find $cgroup_path -mindepth 1 -type d | sort | tac | xargs rmdir | ||
umount $cgroup_path | ||
fi | ||
|
||
# Mount tmpfs | ||
if ! grep "${cgroup_path} " /proc/mounts | cut -d' ' -f3 | grep -q tmpfs | ||
then | ||
mount -t tmpfs none $cgroup_path | ||
fi | ||
mkdir -p $cgroup_path | ||
|
||
# Mount cgroup subsystems individually | ||
for subsystem in cpu cpuacct devices memory | ||
do | ||
mkdir -p $cgroup_path/$subsystem | ||
|
||
if ! grep -q "${cgroup_path}/$subsystem " /proc/mounts | ||
# Mount tmpfs | ||
if ! grep "${cgroup_path} " /proc/mounts | cut -d' ' -f3 | grep -q tmpfs | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Writing on the parent cgroup FS is exactly the solution for nested warden: we share the cgroup fs since we share one kernel among the host & containers. It is possible to set up a separate cgroup fs for each warden server (parent and child), but their content would be the same, so there is NO point in doing so. And, like nested resource groups in vCenter, child containers' cgroups should be under the parent containers' groups, like this: /tmp/warden/cgroup/cpu/parent_container/child_container. |
||
then | ||
mount -t cgroup -o $subsystem none $cgroup_path/$subsystem | ||
mount -t tmpfs none $cgroup_path | ||
fi | ||
done | ||
|
||
# Mount cgroup subsystems individually | ||
for subsystem in cpu cpuacct devices memory | ||
do | ||
mkdir -p $cgroup_path/$subsystem | ||
|
||
if ! grep -q "${cgroup_path}/$subsystem " /proc/mounts | ||
then | ||
mount -t cgroup -o $subsystem none $cgroup_path/$subsystem | ||
fi | ||
done | ||
fi | ||
|
||
./net.sh setup | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,22 +24,28 @@ do | |
|
||
if [ $(basename $system_path) == "devices" ] | ||
then | ||
# disallow everything, allow explicitly | ||
echo a > $instance_path/devices.deny | ||
# /dev/null | ||
echo "c 1:3 rw" > $instance_path/devices.allow | ||
# /dev/zero | ||
echo "c 1:5 rw" > $instance_path/devices.allow | ||
# /dev/random | ||
echo "c 1:8 rw" > $instance_path/devices.allow | ||
# /dev/urandom | ||
echo "c 1:9 rw" > $instance_path/devices.allow | ||
# /dev/tty | ||
echo "c 5:0 rw" > $instance_path/devices.allow | ||
# /dev/ptmx | ||
echo "c 5:2 rw" > $instance_path/devices.allow | ||
# /dev/pts/* | ||
echo "c 136:* rw" > $instance_path/devices.allow | ||
if [ $allow_nested_warden == "true" ] | ||
then | ||
# Allow everything | ||
echo "a *:* rw" > $instance_path/devices.allow | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At least one thing fails, as I observed: when untarring the rootfs package for the dea job, a lot of device nodes need mknod. And since warden-cpi is the only consumer of nested warden, we see no requirement to restrict device access for it right now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
else | ||
# Deny everything, allow explicitly | ||
echo a > $instance_path/devices.deny | ||
# /dev/null | ||
echo "c 1:3 rw" > $instance_path/devices.allow | ||
# /dev/zero | ||
echo "c 1:5 rw" > $instance_path/devices.allow | ||
# /dev/random | ||
echo "c 1:8 rw" > $instance_path/devices.allow | ||
# /dev/urandom | ||
echo "c 1:9 rw" > $instance_path/devices.allow | ||
# /dev/tty | ||
echo "c 5:0 rw" > $instance_path/devices.allow | ||
# /dev/ptmx | ||
echo "c 5:2 rw" > $instance_path/devices.allow | ||
# /dev/pts/* | ||
echo "c 136:* rw" > $instance_path/devices.allow | ||
fi | ||
fi | ||
|
||
echo $PID > $instance_path/tasks | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,7 +38,7 @@ function setup_filter() { | |
--goto ${filter_default_chain} | ||
|
||
# Bind instance chain to forward chain | ||
iptables -I ${filter_forward_chain} \ | ||
iptables -I ${filter_forward_chain} 2 \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
--in-interface ${network_host_iface} \ | ||
--goto ${filter_instance_chain} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
--- | ||
server: | ||
container_klass: Warden::Container::Linux | ||
|
||
# Wait this long before destroying a container, after the last client | ||
# referencing it disconnected. The timer is cancelled when during this | ||
# period, another client references the container. | ||
# | ||
# Clients can be forced to specify this setting by setting the | ||
# server-wide variable to an invalid value: | ||
# container_grace_time: invalid | ||
# | ||
# The grace time can be disabled by setting it to nil: | ||
# container_grace_time: ~ | ||
# | ||
container_grace_time: 30 | ||
|
||
unix_domain_permissions: 0777 | ||
|
||
# Specifies the path to the base chroot used as the read-only root | ||
# filesystem | ||
container_rootfs_path: /tmp/warden/rootfs | ||
|
||
# Specifies the path to the parent directory under which all containers | ||
# will live. | ||
container_depot_path: /tmp/warden/containers | ||
|
||
# See getrlimit(2) for details. Integer values are passed verbatim. | ||
container_rlimits: | ||
as: 4294967296 | ||
nofile: 8192 | ||
nproc: 512 | ||
|
||
# Specifies the output limit of a job (stdout/stderr combined). | ||
job_output_limit: 10485760 | ||
|
||
quota: | ||
disk_quota_enabled: false | ||
|
||
allow_nested_warden: false | ||
|
||
health_check_server: | ||
port: 2345 | ||
|
||
logging: | ||
level: debug2 | ||
file: /tmp/warden.log | ||
|
||
network: | ||
# Use this /30 network as offset for the network pool. | ||
pool_start_address: 10.254.0.0 | ||
|
||
# Pool this many /30 networks. | ||
pool_size: 256 | ||
|
||
# Interface MTU size | ||
# (for OpenStack use 1454 to avoid problems with rubygems with GRE tunneling) | ||
mtu: 1500 | ||
|
||
user: | ||
pool_start_uid: 20000 | ||
pool_size: 25 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this required for nested warden functionality?
@mariash @ryantang
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In newer kernels, /proc/sys/net/ipv4/ip_local_port_range is no longer exported inside the container,
so we should check whether it exists before providing some default values.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, makes sense.
@vito & @mariash