This repository has been archived by the owner on Jan 25, 2022. It is now read-only.

merge nested warden into master #27

Merged
20 commits merged on Sep 12, 2013
2 changes: 2 additions & 0 deletions warden/config/linux.yml
@@ -37,6 +37,8 @@ server:
quota:
disk_quota_enabled: true

allow_nested_warden: false

health_check_server:
port: 2345

15 changes: 14 additions & 1 deletion warden/lib/warden/config.rb
@@ -14,6 +14,7 @@ def self.server_defaults
"quota" => {
"disk_quota_enabled" => true,
},
"allow_nested_warden" => false,
}
end

@@ -59,6 +60,8 @@ def self.server_schema
"quota" => {
optional("disk_quota_enabled") => bool,
},

"allow_nested_warden" => bool,
}
end
end
@@ -97,6 +100,7 @@ def self.network_schema
# Present for Backwards compatibility
optional("pool_start_address") => String,
optional("pool_size") => Integer,
optional("release_delay") => Integer,
optional("mtu") => Integer,

"deny_networks" => [String],
@@ -106,7 +110,12 @@
end

def self.ip_local_port_range
File.read("/proc/sys/net/ipv4/ip_local_port_range").split.map(&:to_i)
# if ip_local_port_range is not available, fall back to the kernel default range
if File.exist?("/proc/sys/net/ipv4/ip_local_port_range")
File.read("/proc/sys/net/ipv4/ip_local_port_range").split.map(&:to_i)
else
return 32768, 61000

Why is this required for nested warden functionality?

@mariash @ryantang

Contributor

In newer kernels, /proc/sys/net/ipv4/ip_local_port_range is no longer exported inside the container, so we should check whether it exists before providing some default values.

Contributor

OK, makes sense.

@vito & @mariash

end
end

def self.port_defaults
Expand Down Expand Up @@ -205,6 +214,10 @@ def rlimits
@server["container_rlimits"] || {}
end

def allow_nested_warden?
!!@server["allow_nested_warden"]
end

def to_hash
{
"server" => server,
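
As a side note on the ip_local_port_range fallback added above: 32768 and 61000 match the long-standing Linux defaults, and the same check can be done by hand with a small shell sketch (runnable on the host or inside a container):

if [ -r /proc/sys/net/ipv4/ip_local_port_range ]; then
  # Newer setups export the range, so read it directly.
  cat /proc/sys/net/ipv4/ip_local_port_range
else
  # Inside containers that do not export the file, assume the kernel default,
  # exactly as the Ruby fallback above does.
  echo "32768 61000"
fi
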
64 changes: 40 additions & 24 deletions warden/lib/warden/container/linux.rb
@@ -82,6 +82,7 @@ def env
"network_netmask" => self.class.network_pool.pooled_netmask.to_human,
"user_uid" => uid,
"rootfs_path" => container_rootfs_path,
"allow_nested_warden" => Server.config.allow_nested_warden?.to_s,
"container_iface_mtu" => container_iface_mtu,
}
env
@@ -194,38 +195,53 @@ def perform_rsync(src_path, dst_path)
sh *args
end

def write_bind_mount_commands(request)
return if request.bind_mounts.nil? || request.bind_mounts.empty?
def add_bind_mount(file, src_path, dst_path, mode)
Contributor

Will this blow up if src doesn't exist? Previously we used to File.stat it and raise a Warden error if it didn't exist.

@vito & @mariash

dst_path = File.join(container_path, "mnt", dst_path[1..-1])

file.puts "mkdir -p #{dst_path}"
file.puts "mount -n --bind #{src_path} #{dst_path}"
file.puts "mount -n --bind -o remount,#{mode} #{src_path} #{dst_path}"
end

File.open(File.join(container_path, "lib", "hook-parent-before-clone.sh"), "a") do |file|
def write_bind_mount_commands(request)
File.open(File.join(container_path, "lib", "hook-child-before-pivot.sh"), "a") do |file|
file.puts
file.puts

request.bind_mounts.each do |bind_mount|
src_path = bind_mount.src_path
dst_path = bind_mount.dst_path

# Check that the source path exists
stat = File.stat(src_path) rescue nil
if stat.nil?
raise WardenError.new("Source path for bind mount does not exist: #{src_path}")
if request.bind_mounts.respond_to?(:each)
request.bind_mounts.each do |bind_mount|
src_path = bind_mount.src_path
dst_path = bind_mount.dst_path

# Check that the source path exists
stat = File.stat(src_path) rescue nil
raise WardenError.new("Source path for bind mount does not exist: #{src_path}") if stat.nil?

mode = case bind_mount.mode
when Protocol::CreateRequest::BindMount::Mode::RO
"ro"
when Protocol::CreateRequest::BindMount::Mode::RW
"rw"
else
raise "Unknown mode"
end

add_bind_mount(file, src_path, dst_path, mode)
end
end

# Fix up destination path to be an absolute path inside the union
dst_path = File.join(container_path, "mnt", dst_path[1..-1])
# for nested warden, we share the host's cgroup fs with containers using bind mounts
if Server.config.allow_nested_warden?
tmp_warden_cgroup = File.join(container_path, "tmp", "warden", "cgroup")
FileUtils.mkdir_p(tmp_warden_cgroup)

mode = case bind_mount.mode
when Protocol::CreateRequest::BindMount::Mode::RO
"ro"
when Protocol::CreateRequest::BindMount::Mode::RW
"rw"
else
raise "Unknown mode"
end
# Bind-mount cgroups
add_bind_mount(file, tmp_warden_cgroup, "/tmp/warden/cgroup", "rw")

file.puts "mkdir -p #{dst_path}"
file.puts "mount -n --bind #{src_path} #{dst_path}"
file.puts "mount -n --bind -o remount,#{mode} #{src_path} #{dst_path}"
# for each subsystem, only pass the group of the current container instead of all groups, so that the nested warden server sets up its groups directly under the parent container's hierarchy
%w(cpu cpuacct devices memory).each do |subsystem|
add_bind_mount(file, cgroup_path(subsystem), "/tmp/warden/cgroup/#{subsystem}", "rw")
end
end
end
end
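
To make the nested-warden branch above more concrete, the lines that add_bind_mount appends to hook-child-before-pivot.sh for one subsystem look roughly like the sketch below; the two paths are hypothetical stand-ins for what container_path and cgroup_path(subsystem) evaluate to at runtime.

# Sketch of the generated hook commands for the "cpu" subsystem (paths are illustrative only).
container_path=/tmp/warden/containers/instance-1      # hypothetical container directory
cpu_cgroup_path=/tmp/warden/cgroup/cpu/instance-1     # hypothetical per-container cgroup
mkdir -p $container_path/mnt/tmp/warden/cgroup/cpu
mount -n --bind $cpu_cgroup_path $container_path/mnt/tmp/warden/cgroup/cpu
mount -n --bind -o remount,rw $cpu_cgroup_path $container_path/mnt/tmp/warden/cgroup/cpu

The separate remount step is what actually applies the ro/rw flag, since mount options are ignored on the initial bind.
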
2 changes: 1 addition & 1 deletion warden/lib/warden/server.rb
@@ -189,7 +189,7 @@ def self.setup_logging
end

def self.setup_network
network_pool = Pool::Network.new(config.network["pool_network"])
network_pool = Pool::Network.new(config.network["pool_network"], :release_delay => config.network["release_delay"])
container_klass.network_pool = network_pool
end

5 changes: 5 additions & 0 deletions warden/root/linux/net.sh
@@ -103,7 +103,12 @@ function setup_filter() {
iptables -A ${filter_default_chain} --destination "$n" --jump DROP
done

# Forward outbound traffic via ${filter_forward_chain}
iptables -A FORWARD -i w-+ --jump ${filter_forward_chain}

# Forward inbound traffic immediately
default_interface=$(ip route show | grep default | cut -d' ' -f5 | head -1)
iptables -I ${filter_forward_chain} -i $default_interface --jump ACCEPT
}

function teardown_nat() {
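
For context on the two rules above: w-+ is an iptables interface wildcard, so the first rule sends traffic arriving on any warden host-side interface (names starting with w-) through the warden forward chain, while the second accepts traffic coming in on the host's default interface at the top of that chain. A rough way to inspect the result, assuming a stand-in chain name:

filter_forward_chain=warden-forward   # assumption: substitute whatever ${filter_forward_chain} expands to
iptables -L FORWARD -n --line-numbers                  # shows the jump taken for interfaces matching w-+
iptables -L "$filter_forward_chain" -n --line-numbers  # rule 1 should be the ACCEPT on the default interface
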
37 changes: 17 additions & 20 deletions warden/root/linux/setup.sh
@@ -20,30 +20,27 @@ fi

cgroup_path=/tmp/warden/cgroup

mkdir -p $cgroup_path

if grep "${cgroup_path} " /proc/mounts | cut -d' ' -f3 | grep -q cgroup
if [ ! -d $cgroup_path ]
then
find $cgroup_path -mindepth 1 -type d | sort | tac | xargs rmdir
umount $cgroup_path
fi

# Mount tmpfs
if ! grep "${cgroup_path} " /proc/mounts | cut -d' ' -f3 | grep -q tmpfs
then
mount -t tmpfs none $cgroup_path
fi
mkdir -p $cgroup_path

# Mount cgroup subsystems individually
for subsystem in cpu cpuacct devices memory
do
mkdir -p $cgroup_path/$subsystem

if ! grep -q "${cgroup_path}/$subsystem " /proc/mounts
# Mount tmpfs
if ! grep "${cgroup_path} " /proc/mounts | cut -d' ' -f3 | grep -q tmpfs

This block of code is never executed if the $cgroup_path directory existed prior to line 12. What if the $cgroup_path directory exists but is not mounted as tmpfs? In that case, we'll be writing on the parent FS.

@mariash @ryantang

Contributor Author

Writing on the parent cgroup FS is exactly the solution for nested warden: we share the cgroup fs because host and containers share one kernel. It is possible to set up a separate cgroup fs for each warden server (parent and child), but their content would be the same, so there is no point in doing so.

And, like nested resource groups in vCenter, child containers' cgroups should live under the parent container's groups, like this: /tmp/warden/cgroup/cpu/parent_container/child_container.

then
mount -t cgroup -o $subsystem none $cgroup_path/$subsystem
mount -t tmpfs none $cgroup_path
fi
done

# Mount cgroup subsystems individually
for subsystem in cpu cpuacct devices memory
do
mkdir -p $cgroup_path/$subsystem

if ! grep -q "${cgroup_path}/$subsystem " /proc/mounts
then
mount -t cgroup -o $subsystem none $cgroup_path/$subsystem
fi
done
fi

./net.sh setup

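
Given the discussion above about deliberately reusing the parent's cgroup fs, a quick way to see which case a given machine or container ended up in is to list what is mounted under /tmp/warden/cgroup (a sketch):

grep " /tmp/warden/cgroup" /proc/mounts
# Standalone host: roughly one tmpfs entry plus one cgroup entry per subsystem
# (cpu, cpuacct, devices, memory), all mounted by this script.
# Nested case: the same paths exist, but they are bind mounts of the parent
# warden's cgroup hierarchy, set up by hook-child-before-pivot.sh instead.
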
1 change: 1 addition & 0 deletions warden/root/linux/skeleton/lib/common.sh
@@ -44,6 +44,7 @@ function setup_fs_other() {
overlay_directory_in_rootfs /etc rw
overlay_directory_in_rootfs /home rw
overlay_directory_in_rootfs /sbin rw
overlay_directory_in_rootfs /var rw

mkdir -p tmp/rootfs/tmp
chmod 777 tmp/rootfs/tmp
38 changes: 22 additions & 16 deletions warden/root/linux/skeleton/lib/hook-parent-after-clone.sh
@@ -24,22 +24,28 @@ do

if [ $(basename $system_path) == "devices" ]
then
# disallow everything, allow explicitly
echo a > $instance_path/devices.deny
# /dev/null
echo "c 1:3 rw" > $instance_path/devices.allow
# /dev/zero
echo "c 1:5 rw" > $instance_path/devices.allow
# /dev/random
echo "c 1:8 rw" > $instance_path/devices.allow
# /dev/urandom
echo "c 1:9 rw" > $instance_path/devices.allow
# /dev/tty
echo "c 5:0 rw" > $instance_path/devices.allow
# /dev/ptmx
echo "c 5:2 rw" > $instance_path/devices.allow
# /dev/pts/*
echo "c 136:* rw" > $instance_path/devices.allow
if [ $allow_nested_warden == "true" ]
then
# Allow everything
echo "a *:* rw" > $instance_path/devices.allow

Can you explain why you need everything? It seems like the right thing to do is to just add the specific devices you need.

@mariash @ryantang

Contributor

At least one thing fails, as I observed: when untarring the rootfs package for the dea job, a lot of device nodes need mknod, otherwise we fail with "Operation not permitted". (And we want to share the same package from cf-release master.)

And since warden-cpi is the only consumer of nested warden, we see no need to restrict device access for it right now.

Contributor

OK, makes sense.

else
# Deny everything, allow explicitly
echo a > $instance_path/devices.deny
# /dev/null
echo "c 1:3 rw" > $instance_path/devices.allow
# /dev/zero
echo "c 1:5 rw" > $instance_path/devices.allow
# /dev/random
echo "c 1:8 rw" > $instance_path/devices.allow
# /dev/urandom
echo "c 1:9 rw" > $instance_path/devices.allow
# /dev/tty
echo "c 5:0 rw" > $instance_path/devices.allow
# /dev/ptmx
echo "c 5:2 rw" > $instance_path/devices.allow
# /dev/pts/*
echo "c 136:* rw" > $instance_path/devices.allow
fi
fi

echo $PID > $instance_path/tasks
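
A quick way to confirm which of the two branches above applied to an instance is to read the effective device whitelist back out of the cgroup; the path below is a hypothetical example of the per-instance devices cgroup directory the hook writes to.

instance_path=/tmp/warden/cgroup/devices/instance-1   # hypothetical per-instance devices cgroup
cat $instance_path/devices.list
# Nested mode: a single allow-all entry.
# Default mode: only the explicitly whitelisted character devices
# (1:3, 1:5, 1:8, 1:9, 5:0, 5:2, 136:*).
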
2 changes: 1 addition & 1 deletion warden/root/linux/skeleton/net.sh
@@ -38,7 +38,7 @@ function setup_filter() {
--goto ${filter_default_chain}

# Bind instance chain to forward chain
iptables -I ${filter_forward_chain} \
iptables -I ${filter_forward_chain} 2 \

Why do you need this change? Can you add a test to showcase the desired behavior?

@mariash @ryantang

Contributor

Was this added by you guys or Pieter & David?

@vito & @mariash

--in-interface ${network_host_iface} \
--goto ${filter_instance_chain}
}
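
Regarding the question above about inserting at position 2: the host-side net.sh change in this PR puts an ACCEPT for the default interface at the head of the same forward chain, so binding the instance chain at position 2 presumably keeps that inbound ACCEPT first rather than shadowing it. A rough sketch of the expected ordering, again with a stand-in chain name:

filter_forward_chain=warden-forward   # assumption: substitute the actual chain name
iptables -L "$filter_forward_chain" -n --line-numbers
# 1  ACCEPT            -i <default interface>     (inserted by root/linux/net.sh)
# 2  <instance chain>  -i <host-side interface>   (inserted at position 2 by this script)
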
2 changes: 2 additions & 0 deletions warden/root/linux/skeleton/setup.sh
@@ -18,6 +18,7 @@ network_container_ip=${network_container_ip:-10.0.0.2}
network_container_iface="w-${id}-1"
user_uid=${user_uid:-10000}
rootfs_path=$(readlink -f $rootfs_path)
allow_nested_warden=${allow_nested_warden:-false}

# Write configuration
cat > etc/config <<-EOS
@@ -29,6 +30,7 @@ network_container_ip=$network_container_ip
network_container_iface=$network_container_iface
user_uid=$user_uid
rootfs_path=$rootfs_path
allow_nested_warden=$allow_nested_warden
EOS

setup_fs
62 changes: 62 additions & 0 deletions warden/spec/assets/config/child-linux.yml
@@ -0,0 +1,62 @@
---
server:
container_klass: Warden::Container::Linux

# Wait this long before destroying a container, after the last client
# referencing it disconnected. The timer is cancelled when, during this
# period, another client references the container.
#
# Clients can be forced to specify this setting by setting the
# server-wide variable to an invalid value:
# container_grace_time: invalid
#
# The grace time can be disabled by setting it to nil:
# container_grace_time: ~
#
container_grace_time: 30

unix_domain_permissions: 0777

# Specifies the path to the base chroot used as the read-only root
# filesystem
container_rootfs_path: /tmp/warden/rootfs

# Specifies the path to the parent directory under which all containers
# will live.
container_depot_path: /tmp/warden/containers

# See getrlimit(2) for details. Integer values are passed verbatim.
container_rlimits:
as: 4294967296
nofile: 8192
nproc: 512

# Specifies the output limit of a job (stdout/stderr combined).
job_output_limit: 10485760

quota:
disk_quota_enabled: false

allow_nested_warden: false

health_check_server:
port: 2345

logging:
level: debug2
file: /tmp/warden.log

network:
# Use this /30 network as offset for the network pool.
pool_start_address: 10.254.0.0

# Pool this many /30 networks.
pool_size: 256

# Interface MTU size
# (for OpenStack use 1454 to avoid problems with rubygems with GRE tunneling)
mtu: 1500

user:
pool_start_uid: 20000
pool_size: 25