Skip to content

Commit e1aab16

Browse files
glommer authored and davem330 committed
socket: initial cgroup code.
The goal of this work is to move the memory pressure tcp controls to a cgroup, instead of just relying on global conditions. To avoid excessive overhead in the network fast paths, the code that accounts allocated memory to a cgroup is hidden inside a static_branch(). This branch is patched out until the first non-root cgroup is created. So when nobody is using cgroups, even if it is mounted, no significant performance penalty should be seen. This patch handles the generic part of the code, and has nothing tcp-specific. Signed-off-by: Glauber Costa <glommer@parallels.com> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujtsu.com> CC: Kirill A. Shutemov <kirill@shutemov.name> CC: David S. Miller <davem@davemloft.net> CC: Eric W. Biederman <ebiederm@xmission.com> CC: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 180d8cd commit e1aab16

File tree

5 files changed

+235
-17
lines changed

5 files changed

+235
-17
lines changed

Documentation/cgroups/memory.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,9 @@ to trigger slab reclaim when those limits are reached.
289289

290290
2.7.1 Current Kernel Memory resources accounted
291291

292-
None
292+
* sockets memory pressure: some sockets protocols have memory pressure
293+
thresholds. The Memory Controller allows them to be controlled individually
294+
per cgroup, instead of globally.
293295

294296
3. User Interface
295297

include/linux/memcontrol.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
8585
extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
8686
extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm);
8787

88+
extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
89+
8890
static inline
8991
int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
9092
{
@@ -381,5 +383,25 @@ mem_cgroup_print_bad_page(struct page *page)
381383
}
382384
#endif
383385

386+
#ifdef CONFIG_INET
387+
enum {
388+
UNDER_LIMIT,
389+
SOFT_LIMIT,
390+
OVER_LIMIT,
391+
};
392+
393+
struct sock;
394+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
395+
void sock_update_memcg(struct sock *sk);
396+
void sock_release_memcg(struct sock *sk);
397+
#else
398+
static inline void sock_update_memcg(struct sock *sk)
399+
{
400+
}
401+
static inline void sock_release_memcg(struct sock *sk)
402+
{
403+
}
404+
#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
405+
#endif /* CONFIG_INET */
384406
#endif /* _LINUX_MEMCONTROL_H */
385407

include/net/sock.h

Lines changed: 150 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
#include <linux/slab.h>
5555
#include <linux/uaccess.h>
5656
#include <linux/memcontrol.h>
57+
#include <linux/res_counter.h>
5758

5859
#include <linux/filter.h>
5960
#include <linux/rculist_nulls.h>
@@ -168,6 +169,7 @@ struct sock_common {
168169
/* public: */
169170
};
170171

172+
struct cg_proto;
171173
/**
172174
* struct sock - network layer representation of sockets
173175
* @__sk_common: shared layout with inet_timewait_sock
@@ -228,6 +230,7 @@ struct sock_common {
228230
* @sk_security: used by security modules
229231
* @sk_mark: generic packet mark
230232
* @sk_classid: this socket's cgroup classid
233+
* @sk_cgrp: this socket's cgroup-specific proto data
231234
* @sk_write_pending: a write to stream socket waits to start
232235
* @sk_state_change: callback to indicate change in the state of the sock
233236
* @sk_data_ready: callback to indicate there is data to be processed
@@ -342,6 +345,7 @@ struct sock {
342345
#endif
343346
__u32 sk_mark;
344347
u32 sk_classid;
348+
struct cg_proto *sk_cgrp;
345349
void (*sk_state_change)(struct sock *sk);
346350
void (*sk_data_ready)(struct sock *sk, int bytes);
347351
void (*sk_write_space)(struct sock *sk);
@@ -838,6 +842,37 @@ struct proto {
838842
#ifdef SOCK_REFCNT_DEBUG
839843
atomic_t socks;
840844
#endif
845+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
846+
/*
847+
* cgroup specific init/deinit functions. Called once for all
848+
* protocols that implement it, from cgroups populate function.
849+
* This function has to setup any files the protocol want to
850+
* appear in the kmem cgroup filesystem.
851+
*/
852+
int (*init_cgroup)(struct cgroup *cgrp,
853+
struct cgroup_subsys *ss);
854+
void (*destroy_cgroup)(struct cgroup *cgrp,
855+
struct cgroup_subsys *ss);
856+
struct cg_proto *(*proto_cgroup)(struct mem_cgroup *memcg);
857+
#endif
858+
};
859+
860+
struct cg_proto {
861+
void (*enter_memory_pressure)(struct sock *sk);
862+
struct res_counter *memory_allocated; /* Current allocated memory. */
863+
struct percpu_counter *sockets_allocated; /* Current number of sockets. */
864+
int *memory_pressure;
865+
long *sysctl_mem;
866+
/*
867+
* memcg field is used to find which memcg we belong directly
868+
* Each memcg struct can hold more than one cg_proto, so container_of
869+
* won't really cut.
870+
*
871+
* The elegant solution would be having an inverse function to
872+
* proto_cgroup in struct proto, but that means polluting the structure
873+
* for everybody, instead of just for memcg users.
874+
*/
875+
struct mem_cgroup *memcg;
841876
};
842877

843878
extern int proto_register(struct proto *prot, int alloc_slab);
@@ -856,7 +891,7 @@ static inline void sk_refcnt_debug_dec(struct sock *sk)
856891
sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks));
857892
}
858893

859-
static inline void sk_refcnt_debug_release(const struct sock *sk)
894+
inline void sk_refcnt_debug_release(const struct sock *sk)
860895
{
861896
if (atomic_read(&sk->sk_refcnt) != 1)
862897
printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n",
@@ -868,6 +903,24 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
868903
#define sk_refcnt_debug_release(sk) do { } while (0)
869904
#endif /* SOCK_REFCNT_DEBUG */
870905

906+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
907+
extern struct jump_label_key memcg_socket_limit_enabled;
908+
static inline struct cg_proto *parent_cg_proto(struct proto *proto,
909+
struct cg_proto *cg_proto)
910+
{
911+
return proto->proto_cgroup(parent_mem_cgroup(cg_proto->memcg));
912+
}
913+
#define mem_cgroup_sockets_enabled static_branch(&memcg_socket_limit_enabled)
914+
#else
915+
#define mem_cgroup_sockets_enabled 0
916+
static inline struct cg_proto *parent_cg_proto(struct proto *proto,
917+
struct cg_proto *cg_proto)
918+
{
919+
return NULL;
920+
}
921+
#endif
922+
923+
871924
static inline bool sk_has_memory_pressure(const struct sock *sk)
872925
{
873926
return sk->sk_prot->memory_pressure != NULL;
@@ -877,59 +930,147 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
877930
{
878931
if (!sk->sk_prot->memory_pressure)
879932
return false;
933+
934+
if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
935+
return !!*sk->sk_cgrp->memory_pressure;
936+
880937
return !!*sk->sk_prot->memory_pressure;
881938
}
882939

883940
static inline void sk_leave_memory_pressure(struct sock *sk)
884941
{
885942
int *memory_pressure = sk->sk_prot->memory_pressure;
886943

887-
if (memory_pressure && *memory_pressure)
944+
if (!memory_pressure)
945+
return;
946+
947+
if (*memory_pressure)
888948
*memory_pressure = 0;
949+
950+
if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
951+
struct cg_proto *cg_proto = sk->sk_cgrp;
952+
struct proto *prot = sk->sk_prot;
953+
954+
for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
955+
if (*cg_proto->memory_pressure)
956+
*cg_proto->memory_pressure = 0;
957+
}
958+
889959
}
890960

891961
static inline void sk_enter_memory_pressure(struct sock *sk)
892962
{
893-
if (sk->sk_prot->enter_memory_pressure)
894-
sk->sk_prot->enter_memory_pressure(sk);
963+
if (!sk->sk_prot->enter_memory_pressure)
964+
return;
965+
966+
if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
967+
struct cg_proto *cg_proto = sk->sk_cgrp;
968+
struct proto *prot = sk->sk_prot;
969+
970+
for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
971+
cg_proto->enter_memory_pressure(sk);
972+
}
973+
974+
sk->sk_prot->enter_memory_pressure(sk);
895975
}
896976

897977
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
898978
{
899979
long *prot = sk->sk_prot->sysctl_mem;
980+
if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
981+
prot = sk->sk_cgrp->sysctl_mem;
900982
return prot[index];
901983
}
902984

985+
static inline void memcg_memory_allocated_add(struct cg_proto *prot,
986+
unsigned long amt,
987+
int *parent_status)
988+
{
989+
struct res_counter *fail;
990+
int ret;
991+
992+
ret = res_counter_charge(prot->memory_allocated,
993+
amt << PAGE_SHIFT, &fail);
994+
995+
if (ret < 0)
996+
*parent_status = OVER_LIMIT;
997+
}
998+
999+
static inline void memcg_memory_allocated_sub(struct cg_proto *prot,
1000+
unsigned long amt)
1001+
{
1002+
res_counter_uncharge(prot->memory_allocated, amt << PAGE_SHIFT);
1003+
}
1004+
1005+
static inline u64 memcg_memory_allocated_read(struct cg_proto *prot)
1006+
{
1007+
u64 ret;
1008+
ret = res_counter_read_u64(prot->memory_allocated, RES_USAGE);
1009+
return ret >> PAGE_SHIFT;
1010+
}
1011+
9031012
static inline long
9041013
sk_memory_allocated(const struct sock *sk)
9051014
{
9061015
struct proto *prot = sk->sk_prot;
1016+
if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1017+
return memcg_memory_allocated_read(sk->sk_cgrp);
1018+
9071019
return atomic_long_read(prot->memory_allocated);
9081020
}
9091021

9101022
static inline long
911-
sk_memory_allocated_add(struct sock *sk, int amt)
1023+
sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status)
9121024
{
9131025
struct proto *prot = sk->sk_prot;
1026+
1027+
if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
1028+
memcg_memory_allocated_add(sk->sk_cgrp, amt, parent_status);
1029+
/* update the root cgroup regardless */
1030+
atomic_long_add_return(amt, prot->memory_allocated);
1031+
return memcg_memory_allocated_read(sk->sk_cgrp);
1032+
}
1033+
9141034
return atomic_long_add_return(amt, prot->memory_allocated);
9151035
}
9161036

9171037
static inline void
918-
sk_memory_allocated_sub(struct sock *sk, int amt)
1038+
sk_memory_allocated_sub(struct sock *sk, int amt, int parent_status)
9191039
{
9201040
struct proto *prot = sk->sk_prot;
1041+
1042+
if (mem_cgroup_sockets_enabled && sk->sk_cgrp &&
1043+
parent_status != OVER_LIMIT) /* Otherwise was uncharged already */
1044+
memcg_memory_allocated_sub(sk->sk_cgrp, amt);
1045+
9211046
atomic_long_sub(amt, prot->memory_allocated);
9221047
}
9231048

9241049
static inline void sk_sockets_allocated_dec(struct sock *sk)
9251050
{
9261051
struct proto *prot = sk->sk_prot;
1052+
1053+
if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
1054+
struct cg_proto *cg_proto = sk->sk_cgrp;
1055+
1056+
for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
1057+
percpu_counter_dec(cg_proto->sockets_allocated);
1058+
}
1059+
9271060
percpu_counter_dec(prot->sockets_allocated);
9281061
}
9291062

9301063
static inline void sk_sockets_allocated_inc(struct sock *sk)
9311064
{
9321065
struct proto *prot = sk->sk_prot;
1066+
1067+
if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
1068+
struct cg_proto *cg_proto = sk->sk_cgrp;
1069+
1070+
for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
1071+
percpu_counter_inc(cg_proto->sockets_allocated);
1072+
}
1073+
9331074
percpu_counter_inc(prot->sockets_allocated);
9341075
}
9351076

@@ -938,6 +1079,9 @@ sk_sockets_allocated_read_positive(struct sock *sk)
9381079
{
9391080
struct proto *prot = sk->sk_prot;
9401081

1082+
if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1083+
return percpu_counter_sum_positive(sk->sk_cgrp->sockets_allocated);
1084+
9411085
return percpu_counter_sum_positive(prot->sockets_allocated);
9421086
}
9431087

mm/memcontrol.c

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,48 @@ enum mem_type {
379379

380380
static void mem_cgroup_get(struct mem_cgroup *memcg);
381381
static void mem_cgroup_put(struct mem_cgroup *memcg);
382-
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
382+
383+
/* Writing them here to avoid exposing memcg's inner layout */
384+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
385+
#ifdef CONFIG_INET
386+
#include <net/sock.h>
387+
388+
static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
389+
void sock_update_memcg(struct sock *sk)
390+
{
391+
/* A socket spends its whole life in the same cgroup */
392+
if (sk->sk_cgrp) {
393+
WARN_ON(1);
394+
return;
395+
}
396+
if (static_branch(&memcg_socket_limit_enabled)) {
397+
struct mem_cgroup *memcg;
398+
399+
BUG_ON(!sk->sk_prot->proto_cgroup);
400+
401+
rcu_read_lock();
402+
memcg = mem_cgroup_from_task(current);
403+
if (!mem_cgroup_is_root(memcg)) {
404+
mem_cgroup_get(memcg);
405+
sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
406+
}
407+
rcu_read_unlock();
408+
}
409+
}
410+
EXPORT_SYMBOL(sock_update_memcg);
411+
412+
void sock_release_memcg(struct sock *sk)
413+
{
414+
if (static_branch(&memcg_socket_limit_enabled) && sk->sk_cgrp) {
415+
struct mem_cgroup *memcg;
416+
WARN_ON(!sk->sk_cgrp->memcg);
417+
memcg = sk->sk_cgrp->memcg;
418+
mem_cgroup_put(memcg);
419+
}
420+
}
421+
#endif /* CONFIG_INET */
422+
#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
423+
383424
static void drain_all_stock_async(struct mem_cgroup *memcg);
384425

385426
static struct mem_cgroup_per_zone *
@@ -4932,12 +4973,13 @@ static void mem_cgroup_put(struct mem_cgroup *memcg)
49324973
/*
49334974
* Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
49344975
*/
4935-
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4976+
struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
49364977
{
49374978
if (!memcg->res.parent)
49384979
return NULL;
49394980
return mem_cgroup_from_res_counter(memcg->res.parent, res);
49404981
}
4982+
EXPORT_SYMBOL(parent_mem_cgroup);
49414983

49424984
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
49434985
static void __init enable_swap_cgroup(void)

0 commit comments

Comments (0)