From 7d944b3c65868bc09e111a020835fcc39ae66c00 Mon Sep 17 00:00:00 2001 From: arter97 Date: Wed, 24 Jul 2013 13:00:20 +0900 Subject: [PATCH] backport SLUB from Linux 3.3 (thanks to faux123) Conflicts: include/linux/page-flags.h Change-Id: I185253455ab5cb45014be09ee0058d056ceb0d47 --- include/linux/mm_types.h | 109 +-- include/linux/page-flags.h | 5 - include/linux/slub_def.h | 20 + mm/slab.c | 2 +- mm/slub.c | 1324 +++++++++++++++++++++++++----------- 5 files changed, 1024 insertions(+), 436 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0f230ce730a6..cfd7ab17038b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -32,35 +32,65 @@ struct address_space; * who is mapping it. */ struct page { + /* First double word block */ unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ - atomic_t _count; /* Usage count, see below. */ + struct address_space *mapping; /* If low bit clear, points to + * inode address_space, or NULL. + * If page mapped as anonymous + * memory, low bit is set, and + * it points to anon_vma object: + * see PAGE_MAPPING_ANON below. + */ + /* Second double word */ + struct { + union { + pgoff_t index; /* Our offset within mapping. */ + void *freelist; /* slub first free object */ + }; + + union { + /* Used for cmpxchg_double in slub */ + unsigned long counters; + + struct { + + union { + atomic_t _mapcount; /* Count of ptes mapped in mms, + * to show when page is mapped + * & limit reverse map searches. + */ + + struct { + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; + atomic_t _count; /* Usage count, see below. */ + }; + }; + }; + + /* Third double word block */ union { - /* - * Count of ptes mapped in - * mms, to show when page is - * mapped & limit reverse map - * searches. - * - * Used also for tail pages - * refcounting instead of - * _count. Tail pages cannot - * be mapped and keeping the - * tail page _count zero at - * all times guarantees - * get_page_unless_zero() will - * never succeed on tail - * pages. - */ - atomic_t _mapcount; - - struct { /* SLUB */ - u16 inuse; - u16 objects; + struct list_head lru; /* Pageout list, eg. active_list + * protected by zone->lru_lock ! + */ + struct { /* slub per cpu partial pages */ + struct page *next; /* Next partial slab */ +#ifdef CONFIG_64BIT + int pages; /* Nr of partial slabs left */ + int pobjects; /* Approximate # of objects */ +#else + short int pages; + short int pobjects; +#endif }; }; + + /* Remainder is not double word aligned */ union { - struct { unsigned long private; /* Mapping-private opaque data: * usually used for buffer_heads * if PagePrivate set; used for @@ -68,27 +98,13 @@ struct page { * indicates order in the buddy * system if PG_buddy is set. */ - struct address_space *mapping; /* If low bit clear, points to - * inode address_space, or NULL. - * If page mapped as anonymous - * memory, low bit is set, and - * it points to anon_vma object: - * see PAGE_MAPPING_ANON below. - */ - }; #if USE_SPLIT_PTLOCKS - spinlock_t ptl; + spinlock_t ptl; #endif - struct kmem_cache *slab; /* SLUB: Pointer to slab */ - struct page *first_page; /* Compound tail pages */ + struct kmem_cache *slab; /* SLUB: Pointer to slab */ + struct page *first_page; /* Compound tail pages */ }; - union { - pgoff_t index; /* Our offset within mapping. */ - void *freelist; /* SLUB: freelist req. slab lock */ - }; - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone->lru_lock ! - */ + /* * On machines where all RAM is mapped into kernel address space, * we can simply calculate the virtual address. On machines with @@ -114,7 +130,16 @@ struct page { */ void *shadow; #endif -}; +} +/* + * If another subsystem starts using the double word pairing for atomic + * operations on struct page then it must change the #if to ensure + * proper alignment of the page struct. + */ +#if defined(CONFIG_SLUB) && defined(CONFIG_CMPXCHG_LOCAL) + __attribute__((__aligned__(2*sizeof(unsigned long)))) +#endif +; typedef unsigned long __nocast vm_flags_t; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cfd77026db69..59e0b93ef56a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -124,9 +124,6 @@ enum pageflags { /* SLOB */ PG_slob_free = PG_private, - - /* SLUB */ - PG_slub_frozen = PG_active, }; #ifndef __GENERATING_BOUNDS_H @@ -212,8 +209,6 @@ PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) __PAGEFLAG(SlobFree, slob_free) -__PAGEFLAG(SlubFrozen, slub_frozen) - /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index c8668d161dd8..76c1fd120fbc 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -24,6 +24,7 @@ enum stat_item { ALLOC_FROM_PARTIAL, /* Cpu slab acquired from partial list */ ALLOC_SLAB, /* Cpu slab acquired from page allocator */ ALLOC_REFILL, /* Refill cpu slab from slab freelist */ + ALLOC_NODE_MISMATCH, /* Switching cpu slab */ FREE_SLAB, /* Slab freed to the page allocator */ CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ @@ -31,14 +32,19 @@ enum stat_item { DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ + DEACTIVATE_BYPASS, /* Implicit deactivation */ ORDER_FALLBACK, /* Number of times fallback was necessary */ CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ + CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ + CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ + CPU_PARTIAL_FREE, /* USed cpu partial on free */ NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ struct page *page; /* The slab from which we are allocating */ + struct page *partial; /* Partially allocated frozen slabs */ int node; /* The node of the page (or -1 for debug) */ #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; @@ -76,6 +82,7 @@ struct kmem_cache { int size; /* The size of an object including meta data */ int objsize; /* The size of an object without meta data */ int offset; /* Free pointer offset. */ + int cpu_partial; /* Number of per cpu partial objects to keep around */ struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ @@ -228,6 +235,19 @@ kmalloc_order(size_t size, gfp_t flags, unsigned int order) return ret; } +/** + * Calling this on allocated memory will check that the memory + * is expected to be in use, and print warnings if not. + */ +#ifdef CONFIG_SLUB_DEBUG +extern bool verify_mem_not_deleted(const void *x); +#else +static inline bool verify_mem_not_deleted(const void *x) +{ + return true; +} +#endif + #ifdef CONFIG_TRACING extern void * kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size); diff --git a/mm/slab.c b/mm/slab.c index a67f8121ce5a..7882dc174ded 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4535,7 +4535,7 @@ static const struct file_operations proc_slabstats_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); + proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); #endif diff --git a/mm/slub.c b/mm/slub.c index ae6e80ed1e5c..42e9a86517fe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2,10 +2,11 @@ * SLUB: A slab allocator that limits cache line use instead of queuing * objects in per cpu and per node lists. * - * The allocator synchronizes using per slab locks and only - * uses a centralized lock to manage a pool of partial slabs. + * The allocator synchronizes using per slab locks or atomic operatios + * and only uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter + * (C) 2011 Linux Foundation, Christoph Lameter */ #include @@ -27,20 +28,33 @@ #include #include #include +#include #include /* * Lock order: - * 1. slab_lock(page) - * 2. slab->list_lock + * 1. slub_lock (Global Semaphore) + * 2. node->list_lock + * 3. slab_lock(page) (Only on some arches and for debugging) * - * The slab_lock protects operations on the object of a particular - * slab and its metadata in the page struct. If the slab lock - * has been taken then no allocations nor frees can be performed - * on the objects in the slab nor can the slab be added or removed - * from the partial or full lists since this would mean modifying - * the page_struct of the slab. + * slub_lock + * + * The role of the slub_lock is to protect the list of all the slabs + * and to synchronize major metadata changes to slab cache structures. + * + * The slab_lock is only used for debugging and on arches that do not + * have the ability to do a cmpxchg_double. It only protects the second + * double word in the page struct. Meaning + * A. page->freelist -> List of object free in a page + * B. page->counters -> Counters of objects + * C. page->frozen -> frozen state + * + * If a slab is frozen then it is exempt from list management. It is not + * on any list. The processor that froze the slab is the one who can + * perform list operations on the page. Other processors may put objects + * onto the freelist but the processor that froze the slab is the only + * one that can retrieve the objects from the page's freelist. * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -53,20 +67,6 @@ * slabs, operations can continue without any centralized lock. F.e. * allocating a long series of objects that fill up slabs does not require * the list lock. - * - * The lock order is sometimes inverted when we are trying to get a slab - * off a list. We take the list_lock and then look for a page on the list - * to use. While we do that objects in the slabs may be freed. We can - * only operate on the slab if we have also taken the slab_lock. So we use - * a slab_trylock() on the slab. If trylock was successful then no frees - * can occur anymore and we can use the slab for allocations etc. If the - * slab_trylock() does not succeed then frees are in progress in the slab and - * we must stay away from it for a while since we may cause a bouncing - * cacheline if we try to acquire the lock. So go onto the next slab. - * If all pages are busy then we may allocate a new slab instead of reusing - * a partial slab. A new slab has no one operating on it and thus there is - * no danger of cacheline contention. - * * Interrupts are disabled during allocation and deallocation in order to * make the slab allocator safe to use in the context of an irq. In addition * interrupts are disabled to ensure that the processor does not change @@ -131,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST +/* Enable to log cmpxchg failures */ +#undef SLUB_DEBUG_CMPXCHG + /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -166,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) -#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ +#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000UL /* Poison object */ +#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ static int kmem_size = sizeof(struct kmem_cache); @@ -191,8 +195,12 @@ static LIST_HEAD(slab_caches); /* * Tracking user of a slab. */ +#define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ +#ifdef CONFIG_STACKTRACE + unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@ -338,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct page *page) +{ + __bit_spin_unlock(PG_locked, &page->flags); +} + +/* Interrupts must be disabled (for the fallback code to work right) */ +static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ + VM_BUG_ON(!irqs_disabled()); +#ifdef CONFIG_CMPXCHG_DOUBLE + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + slab_lock(page); + if (page->freelist == freelist_old && page->counters == counters_old) { + page->freelist = freelist_new; + page->counters = counters_new; + slab_unlock(page); + return 1; + } + slab_unlock(page); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + +static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ +#ifdef CONFIG_CMPXCHG_DOUBLE + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + unsigned long flags; + + local_irq_save(flags); + slab_lock(page); + if (page->freelist == freelist_old && page->counters == counters_old) { + page->freelist = freelist_new; + page->counters = counters_new; + slab_unlock(page); + local_irq_restore(flags); + return 1; + } + slab_unlock(page); + local_irq_restore(flags); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + #ifdef CONFIG_SLUB_DEBUG /* * Determine a map of object in use on a page. * - * Slab lock or node listlock must be held to guarantee that the page does + * Node listlock must be held to guarantee that the page does * not vanish from under us. */ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) @@ -371,34 +467,8 @@ static int disable_higher_order_debug; */ static void print_section(char *text, u8 *addr, unsigned int length) { - int i, offset; - int newline = 1; - char ascii[17]; - - ascii[16] = 0; - - for (i = 0; i < length; i++) { - if (newline) { - printk(KERN_ERR "%8s 0x%p: ", text, addr + i); - newline = 0; - } - printk(KERN_CONT " %02x", addr[i]); - offset = i % 16; - ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; - if (offset == 15) { - printk(KERN_CONT " %s\n", ascii); - newline = 1; - } - } - if (!newline) { - i %= 16; - while (i < 16) { - printk(KERN_CONT " "); - ascii[i] = ' '; - i++; - } - printk(KERN_CONT " %s\n", ascii); - } + print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, + length, 1); } static struct track *get_track(struct kmem_cache *s, void *object, @@ -420,6 +490,24 @@ static void set_track(struct kmem_cache *s, void *object, struct track *p = get_track(s, object, alloc); if (addr) { +#ifdef CONFIG_STACKTRACE + struct stack_trace trace; + int i; + + trace.nr_entries = 0; + trace.max_entries = TRACK_ADDRS_COUNT; + trace.entries = p->addrs; + trace.skip = 3; + save_stack_trace(&trace); + + /* See rant in lockdep.c */ + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries - 1] == ULONG_MAX) + trace.nr_entries--; + + for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) + p->addrs[i] = 0; +#endif p->addr = addr; p->cpu = smp_processor_id(); p->pid = current->pid; @@ -444,6 +532,16 @@ static void print_track(const char *s, struct track *t) printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); +#ifdef CONFIG_STACKTRACE + { + int i; + for (i = 0; i < TRACK_ADDRS_COUNT; i++) + if (t->addrs[i]) + printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); + else + break; + } +#endif } static void print_tracking(struct kmem_cache *s, void *object) @@ -472,7 +570,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) va_end(args); printk(KERN_ERR "========================================" "=====================================\n"); - printk(KERN_ERR "BUG %s: %s\n", s->name, buf); + printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); printk(KERN_ERR "----------------------------------------" "-------------------------------------\n\n"); } @@ -501,12 +599,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) p, p - addr, get_freepointer(s, p)); if (p > addr + 16) - print_section("Bytes b4", p - 16, 16); - - print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); + print_section("Bytes b4 ", p - 16, 16); + print_section("Object ", p, min_t(unsigned long, s->objsize, + PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone", p + s->objsize, + print_section("Redzone ", p + s->objsize, s->inuse - s->objsize); if (s->offset) @@ -519,7 +617,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (off != s->size) /* Beginning of the filler is the free pointer */ - print_section("Padding", p + off, s->size - off); + print_section("Padding ", p + off, s->size - off); dump_stack(); } @@ -557,17 +655,6 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) memset(p + s->objsize, val, s->inuse - s->objsize); } -static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) -{ - while (bytes) { - if (*start != (u8)value) - return start; - start++; - bytes--; - } - return NULL; -} - static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { @@ -582,7 +669,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, u8 *fault; u8 *end; - fault = check_bytes(start, value, bytes); + fault = memchr_inv(start, value, bytes); if (!fault) return 1; @@ -675,14 +762,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) if (!remainder) return 1; - fault = check_bytes(end - remainder, POISON_INUSE, remainder); + fault = memchr_inv(end - remainder, POISON_INUSE, remainder); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding", end - remainder, remainder); + print_section("Padding ", end - remainder, remainder); restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; @@ -773,10 +860,11 @@ static int check_slab(struct kmem_cache *s, struct page *page) static int on_freelist(struct kmem_cache *s, struct page *page, void *search) { int nr = 0; - void *fp = page->freelist; + void *fp; void *object = NULL; unsigned long max_objects; + fp = page->freelist; while (fp && nr <= page->objects) { if (fp == search) return 1; @@ -830,7 +918,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, page->freelist); if (!alloc) - print_section("Object", (void *)object, s->objsize); + print_section("Object ", (void *)object, s->objsize); dump_stack(); } @@ -881,26 +969,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) /* * Tracking of fully allocated slabs for debugging purposes. + * + * list_lock must be held. */ -static void add_full(struct kmem_cache_node *n, struct page *page) +static void add_full(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page) { - spin_lock(&n->list_lock); + if (!(s->flags & SLAB_STORE_USER)) + return; + list_add(&page->lru, &n->full); - spin_unlock(&n->list_lock); } +/* + * list_lock must be held. + */ static void remove_full(struct kmem_cache *s, struct page *page) { - struct kmem_cache_node *n; - if (!(s->flags & SLAB_STORE_USER)) return; - n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); list_del(&page->lru); - spin_unlock(&n->list_lock); } /* Tracking of the number of slabs for debugging purposes */ @@ -956,11 +1045,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa if (!check_slab(s, page)) goto bad; - if (!on_freelist(s, page, object)) { - object_err(s, page, object, "Object already allocated"); - goto bad; - } - if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); goto bad; @@ -993,6 +1077,12 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa static noinline int free_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { + unsigned long flags; + int rc = 0; + + local_irq_save(flags); + slab_lock(page); + if (!check_slab(s, page)) goto fail; @@ -1007,7 +1097,7 @@ static noinline int free_debug_processing(struct kmem_cache *s, } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) - return 0; + goto out; if (unlikely(s != page->slab)) { if (!PageSlab(page)) { @@ -1024,18 +1114,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, goto fail; } - /* Special debug activities for freeing objects */ - if (!PageSlubFrozen(page) && !page->freelist) - remove_full(s, page); if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); init_object(s, object, SLUB_RED_INACTIVE); - return 1; + rc = 1; +out: + slab_unlock(page); + local_irq_restore(flags); + return rc; fail: slab_fix(s, "Object at 0x%p not freed", object); - return 0; + goto out; } static int __init setup_slub_debug(char *str) @@ -1135,7 +1226,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } static inline int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { return 1; } -static inline void add_full(struct kmem_cache_node *n, struct page *page) {} +static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} +static inline void remove_full(struct kmem_cache *s, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, void (*ctor)(void *)) @@ -1187,6 +1280,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + flags &= gfp_allowed_mask; + + if (flags & __GFP_WAIT) + local_irq_enable(); + flags |= s->allocflags; /* @@ -1203,12 +1301,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Try a lower order alloc if possible */ page = alloc_slab_page(flags, node, oo); - if (!page) - return NULL; - stat(s, ORDER_FALLBACK); + if (page) + stat(s, ORDER_FALLBACK); } + if (flags & __GFP_WAIT) + local_irq_disable(); + + if (!page) + return NULL; + if (kmemcheck_enabled && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); @@ -1258,7 +1361,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; - page->flags |= 1 << PG_slab; + __SetPageSlab(page); start = page_address(page); @@ -1275,7 +1378,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, last, NULL); page->freelist = start; - page->inuse = 0; + page->inuse = page->objects; + page->frozen = 1; out: return page; } @@ -1353,79 +1457,80 @@ static void discard_slab(struct kmem_cache *s, struct page *page) } /* - * Per slab locking using the pagelock - */ -static __always_inline void slab_lock(struct page *page) -{ - bit_spin_lock(PG_locked, &page->flags); -} - -static __always_inline void slab_unlock(struct page *page) -{ - __bit_spin_unlock(PG_locked, &page->flags); -} - -static __always_inline int slab_trylock(struct page *page) -{ - int rc = 1; - - rc = bit_spin_trylock(PG_locked, &page->flags); - return rc; -} - -/* - * Management of partially allocated slabs + * Management of partially allocated slabs. + * + * list_lock must be held. */ -static void add_partial(struct kmem_cache_node *n, +static inline void add_partial(struct kmem_cache_node *n, struct page *page, int tail) { - spin_lock(&n->list_lock); n->nr_partial++; - if (tail) + if (tail == DEACTIVATE_TO_TAIL) list_add_tail(&page->lru, &n->partial); else list_add(&page->lru, &n->partial); - spin_unlock(&n->list_lock); } -static inline void __remove_partial(struct kmem_cache_node *n, +/* + * list_lock must be held. + */ +static inline void remove_partial(struct kmem_cache_node *n, struct page *page) { list_del(&page->lru); n->nr_partial--; } -static void remove_partial(struct kmem_cache *s, struct page *page) -{ - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); - __remove_partial(n, page); - spin_unlock(&n->list_lock); -} - /* - * Lock slab and remove from the partial list. + * Lock slab, remove from the partial list and put the object into the + * per cpu freelist. + * + * Returns a list of objects or NULL if it fails. * * Must hold list_lock. */ -static inline int lock_and_freeze_slab(struct kmem_cache_node *n, - struct page *page) +static inline void *acquire_slab(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page, + int mode) { - if (slab_trylock(page)) { - __remove_partial(n, page); - __SetPageSlubFrozen(page); - return 1; - } - return 0; + void *freelist; + unsigned long counters; + struct page new; + + /* + * Zap the freelist and set the frozen bit. + * The old freelist is the list of objects for the + * per cpu allocation list. + */ + do { + freelist = page->freelist; + counters = page->counters; + new.counters = counters; + if (mode) + new.inuse = page->objects; + + VM_BUG_ON(new.frozen); + new.frozen = 1; + + } while (!__cmpxchg_double_slab(s, page, + freelist, counters, + NULL, new.counters, + "lock and freeze")); + + remove_partial(n, page); + return freelist; } +static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); + /* * Try to allocate a partial slab from a specific node. */ -static struct page *get_partial_node(struct kmem_cache_node *n) +static void *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n, struct kmem_cache_cpu *c) { - struct page *page; + struct page *page, *page2; + void *object = NULL; /* * Racy check. If we mistakenly see no partial slabs then we @@ -1437,26 +1542,43 @@ static struct page *get_partial_node(struct kmem_cache_node *n) return NULL; spin_lock(&n->list_lock); - list_for_each_entry(page, &n->partial, lru) - if (lock_and_freeze_slab(n, page)) - goto out; - page = NULL; -out: + list_for_each_entry_safe(page, page2, &n->partial, lru) { + void *t = acquire_slab(s, n, page, object == NULL); + int available; + + if (!t) + break; + + if (!object) { + c->page = page; + c->node = page_to_nid(page); + stat(s, ALLOC_FROM_PARTIAL); + object = t; + available = page->objects - page->inuse; + } else { + page->freelist = t; + available = put_cpu_partial(s, page, 0); + } + if (kmem_cache_debug(s) || available > s->cpu_partial / 2) + break; + + } spin_unlock(&n->list_lock); - return page; + return object; } /* - * Get a page from somewhere. Search in increasing NUMA distances. + * Get a partial slab from somewhere. Search in increasing NUMA distances. */ -static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +static void *get_any_partial(struct kmem_cache *s, gfp_t flags, + struct kmem_cache_cpu *c) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); - struct page *page; + void *object; unsigned int cpuset_mems_cookie; /* @@ -1481,6 +1603,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; + do { cpuset_mems_cookie = get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); @@ -1512,65 +1635,19 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) } /* - * Get a partial page, lock it and return it. + * Get a partial slab, lock it and return it. */ -static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, + struct kmem_cache_cpu *c) { - struct page *page; + void *object; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - page = get_partial_node(get_node(s, searchnode)); - if (page || node != NUMA_NO_NODE) - return page; - - return get_any_partial(s, flags); -} - -/* - * Move a page back to the lists. - * - * Must be called with the slab lock held. - * - * On exit the slab lock will have been dropped. - */ -static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) - __releases(bitlock) -{ - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - __ClearPageSlubFrozen(page); - if (page->inuse) { + object = get_partial_node(s, get_node(s, searchnode), c); + if (object || node != NUMA_NO_NODE) + return object; - if (page->freelist) { - add_partial(n, page, tail); - stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); - } else { - stat(s, DEACTIVATE_FULL); - if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) - add_full(n, page); - } - slab_unlock(page); - } else { - stat(s, DEACTIVATE_EMPTY); - if (n->nr_partial < s->min_partial) { - /* - * Adding an empty slab to the partial slabs in order - * to avoid page allocator overhead. This slab needs - * to come after the other slabs with objects in - * so that the others get filled first. That way the - * size of the partial list stays small. - * - * kmem_cache_shrink can reclaim any empty slabs from - * the partial list. - */ - add_partial(n, page, 1); - slab_unlock(page); - } else { - slab_unlock(page); - stat(s, FREE_SLAB); - discard_slab(s, page); - } - } + return get_any_partial(s, flags, c); } #ifdef CONFIG_PREEMPT @@ -1639,45 +1716,270 @@ void init_kmem_cache_cpus(struct kmem_cache *s) for_each_possible_cpu(cpu) per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); } + +/* + * Remove the cpu slab + */ +static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) +{ + enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; + struct page *page = c->page; + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + int lock = 0; + enum slab_modes l = M_NONE, m = M_NONE; + void *freelist; + void *nextfree; + int tail = DEACTIVATE_TO_HEAD; + struct page new; + struct page old; + + if (page->freelist) { + stat(s, DEACTIVATE_REMOTE_FREES); + tail = DEACTIVATE_TO_TAIL; + } + + c->tid = next_tid(c->tid); + c->page = NULL; + freelist = c->freelist; + c->freelist = NULL; + + /* + * Stage one: Free all available per cpu objects back + * to the page freelist while it is still frozen. Leave the + * last one. + * + * There is no need to take the list->lock because the page + * is still frozen. + */ + while (freelist && (nextfree = get_freepointer(s, freelist))) { + void *prior; + unsigned long counters; + + do { + prior = page->freelist; + counters = page->counters; + set_freepointer(s, freelist, prior); + new.counters = counters; + new.inuse--; + VM_BUG_ON(!new.frozen); + + } while (!__cmpxchg_double_slab(s, page, + prior, counters, + freelist, new.counters, + "drain percpu freelist")); + + freelist = nextfree; + } + + /* + * Stage two: Ensure that the page is unfrozen while the + * list presence reflects the actual number of objects + * during unfreeze. + * + * We setup the list membership and then perform a cmpxchg + * with the count. If there is a mismatch then the page + * is not unfrozen but the page is on the wrong list. + * + * Then we restart the process which may have to remove + * the page from the list that we just put it on again + * because the number of objects in the slab may have + * changed. + */ +redo: + + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); + + /* Determine target state of the slab */ + new.counters = old.counters; + if (freelist) { + new.inuse--; + set_freepointer(s, freelist, old.freelist); + new.freelist = freelist; + } else + new.freelist = old.freelist; + + new.frozen = 0; + + if (!new.inuse && n->nr_partial > s->min_partial) + m = M_FREE; + else if (new.freelist) { + m = M_PARTIAL; + if (!lock) { + lock = 1; + /* + * Taking the spinlock removes the possiblity + * that acquire_slab() will see a slab page that + * is frozen + */ + spin_lock(&n->list_lock); + } + } else { + m = M_FULL; + if (kmem_cache_debug(s) && !lock) { + lock = 1; + /* + * This also ensures that the scanning of full + * slabs from diagnostic functions will not see + * any frozen slabs. + */ + spin_lock(&n->list_lock); + } + } + + if (l != m) { + + if (l == M_PARTIAL) + + remove_partial(n, page); + + else if (l == M_FULL) + + remove_full(s, page); + + if (m == M_PARTIAL) { + + add_partial(n, page, tail); + stat(s, tail); + + } else if (m == M_FULL) { + + stat(s, DEACTIVATE_FULL); + add_full(s, n, page); + + } + } + + l = m; + if (!__cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")) + goto redo; + + if (lock) + spin_unlock(&n->list_lock); + + if (m == M_FREE) { + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } +} + +/* Unfreeze all the cpu partial slabs */ +static void unfreeze_partials(struct kmem_cache *s) +{ + struct kmem_cache_node *n = NULL, *n2 = NULL; + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); + struct page *page, *discard_page = NULL; + + while ((page = c->partial)) { + struct page new; + struct page old; + + c->partial = page->next; + + n2 = get_node(s, page_to_nid(page)); + if (n != n2) { + if (n) + spin_unlock(&n->list_lock); + + n = n2; + spin_lock(&n->list_lock); + } + + do { + + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); + + new.counters = old.counters; + new.freelist = old.freelist; + + new.frozen = 0; + + } while (!__cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")); + + if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { + page->next = discard_page; + discard_page = page; + } else { + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } + + if (n) + spin_unlock(&n->list_lock); + + while (discard_page) { + page = discard_page; + discard_page = discard_page->next; + + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } +} + /* - * Remove the cpu slab + * Put a page that was just frozen (in __slab_free) into a partial page + * slot if available. This is done without interrupts disabled and without + * preemption disabled. The cmpxchg is racy and may put the partial page + * onto a random cpus partial slot. + * + * If we did not find a slot then simply move all the partials to the + * per node partial list. */ -static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) - __releases(bitlock) +int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) { - struct page *page = c->page; - int tail = 1; + struct page *oldpage; + int pages; + int pobjects; - if (page->freelist) - stat(s, DEACTIVATE_REMOTE_FREES); - /* - * Merge cpu freelist into slab freelist. Typically we get here - * because both freelists are empty. So this is unlikely - * to occur. - */ - while (unlikely(c->freelist)) { - void **object; + do { + pages = 0; + pobjects = 0; + oldpage = this_cpu_read(s->cpu_slab->partial); + + if (oldpage) { + pobjects = oldpage->pobjects; + pages = oldpage->pages; + if (drain && pobjects > s->cpu_partial) { + unsigned long flags; + /* + * partial array is full. Move the existing + * set to the per node partial list. + */ + local_irq_save(flags); + unfreeze_partials(s); + local_irq_restore(flags); + pobjects = 0; + pages = 0; + } + } - tail = 0; /* Hot objects. Put the slab first */ + pages++; + pobjects += page->objects - page->inuse; - /* Retrieve object from cpu_freelist */ - object = c->freelist; - c->freelist = get_freepointer(s, c->freelist); + page->pages = pages; + page->pobjects = pobjects; + page->next = oldpage; - /* And put onto the regular freelist */ - set_freepointer(s, object, page->freelist); - page->freelist = object; - page->inuse--; - } - c->page = NULL; - c->tid = next_tid(c->tid); - unfreeze_slab(s, page, tail); + } while (irqsafe_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + stat(s, CPU_PARTIAL_FREE); + return pobjects; } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - slab_lock(c->page); deactivate_slab(s, c); } @@ -1690,8 +1992,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - if (likely(c && c->page)) - flush_slab(s, c); + if (likely(c)) { + if (c->page) + flush_slab(s, c); + + unfreeze_partials(s); + } } static void flush_cpu_slab(void *d) @@ -1782,12 +2088,70 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) } } +static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, + int node, struct kmem_cache_cpu **pc) +{ + void *object; + struct kmem_cache_cpu *c; + struct page *page = new_slab(s, flags, node); + + if (page) { + c = __this_cpu_ptr(s->cpu_slab); + if (c->page) + flush_slab(s, c); + + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg + */ + object = page->freelist; + page->freelist = NULL; + + stat(s, ALLOC_SLAB); + c->node = page_to_nid(page); + c->page = page; + *pc = c; + } else + object = NULL; + + return object; +} + +/* + * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist + * or deactivate the page. + * + * The page is still frozen if the return value is not NULL. + * + * If this function returns NULL then the page has been unfrozen. + */ +static inline void *get_freelist(struct kmem_cache *s, struct page *page) +{ + struct page new; + unsigned long counters; + void *freelist; + + do { + freelist = page->freelist; + counters = page->counters; + new.counters = counters; + VM_BUG_ON(!new.frozen); + + new.inuse = page->objects; + new.frozen = freelist != NULL; + + } while (!__cmpxchg_double_slab(s, page, + freelist, counters, + NULL, new.counters, + "get_freelist")); + + return freelist; +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. * - * Interrupts are disabled. - * * Processing is still very fast if new objects have been freed to the * regular freelist. In that case we simply take over the regular freelist * as the lockless freelist and zap the regular freelist. @@ -1804,7 +2168,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void **object; - struct page *page; unsigned long flags; local_irq_save(flags); @@ -1817,87 +2180,74 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = this_cpu_ptr(s->cpu_slab); #endif - /* We handle __GFP_ZERO in the caller */ - gfpflags &= ~__GFP_ZERO; - - page = c->page; - if (!page) + if (!c->page) goto new_slab; - - slab_lock(page); - if (unlikely(!node_match(c, node))) - goto another_slab; +redo: + if (unlikely(!node_match(c, node))) { + stat(s, ALLOC_NODE_MISMATCH); + deactivate_slab(s, c); + goto new_slab; + } /* must check again c->freelist in case of cpu migration or IRQ */ object = c->freelist; if (object) - goto update_freelist; + goto load_freelist; + + stat(s, ALLOC_SLOWPATH); + + object = get_freelist(s, c->page); + + if (!object) { + c->page = NULL; + stat(s, DEACTIVATE_BYPASS); + goto new_slab; + } stat(s, ALLOC_REFILL); load_freelist: - object = page->freelist; - if (unlikely(!object)) - goto another_slab; - if (kmem_cache_debug(s)) - goto debug; - -update_freelist: c->freelist = get_freepointer(s, object); - page->inuse = page->objects; - page->freelist = NULL; - - slab_unlock(page); c->tid = next_tid(c->tid); local_irq_restore(flags); - stat(s, ALLOC_SLOWPATH); return object; -another_slab: - deactivate_slab(s, c); - new_slab: - page = get_partial(s, gfpflags, node); - if (page) { - stat(s, ALLOC_FROM_PARTIAL); - c->node = page_to_nid(page); - c->page = page; - goto load_freelist; + + if (c->partial) { + c->page = c->partial; + c->partial = c->page->next; + c->node = page_to_nid(c->page); + stat(s, CPU_PARTIAL_ALLOC); + c->freelist = NULL; + goto redo; } - gfpflags &= gfp_allowed_mask; - if (gfpflags & __GFP_WAIT) - local_irq_enable(); + /* Then do expensive stuff like retrieving pages from the partial lists */ + object = get_partial(s, gfpflags, node, c); - page = new_slab(s, gfpflags, node); + if (unlikely(!object)) { - if (gfpflags & __GFP_WAIT) - local_irq_disable(); + object = new_slab_objects(s, gfpflags, node, &c); - if (page) { - c = __this_cpu_ptr(s->cpu_slab); - stat(s, ALLOC_SLAB); - if (c->page) - flush_slab(s, c); + if (unlikely(!object)) { + if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(s, gfpflags, node); - slab_lock(page); - __SetPageSlubFrozen(page); - c->node = page_to_nid(page); - c->page = page; - goto load_freelist; + local_irq_restore(flags); + return NULL; + } } - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - local_irq_restore(flags); - return NULL; -debug: - if (!alloc_debug_processing(s, page, object, addr)) - goto another_slab; - page->inuse++; - page->freelist = get_freepointer(s, object); + if (likely(!kmem_cache_debug(s))) + goto load_freelist; + + /* Only entered in the debug case */ + if (!alloc_debug_processing(s, c->page, object, addr)) + goto new_slab; /* Slab failed checks. Next slab needed */ + + c->freelist = get_freepointer(s, object); deactivate_slab(s, c); - c->page = NULL; c->node = NUMA_NO_NODE; local_irq_restore(flags); return object; @@ -2047,52 +2397,110 @@ static void __slab_free(struct kmem_cache *s, struct page *page, { void *prior; void **object = (void *)x; - unsigned long flags; + int was_frozen; + int inuse; + struct page new; + unsigned long counters; + struct kmem_cache_node *n = NULL; + unsigned long uninitialized_var(flags); - local_irq_save(flags); - slab_lock(page); stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) - goto out_unlock; + return; - prior = page->freelist; - set_freepointer(s, object, prior); - page->freelist = object; - page->inuse--; + do { + prior = page->freelist; + counters = page->counters; + set_freepointer(s, object, prior); + new.counters = counters; + was_frozen = new.frozen; + new.inuse--; + if ((!new.inuse || !prior) && !was_frozen && !n) { - if (unlikely(PageSlubFrozen(page))) { - stat(s, FREE_FROZEN); - goto out_unlock; - } + if (!kmem_cache_debug(s) && !prior) + + /* + * Slab was on no list before and will be partially empty + * We can defer the list move and instead freeze it. + */ + new.frozen = 1; + + else { /* Needs to be taken off a list */ + + n = get_node(s, page_to_nid(page)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); + + } + } + inuse = new.inuse; - if (unlikely(!page->inuse)) - goto slab_empty; + } while (!cmpxchg_double_slab(s, page, + prior, counters, + object, new.counters, + "__slab_free")); + + if (likely(!n)) { + + /* + * If we just froze the page then put it onto the + * per cpu partial list. + */ + if (new.frozen && !was_frozen) + put_cpu_partial(s, page, 1); + + /* + * The list lock was not taken therefore no list + * activity can be necessary. + */ + if (was_frozen) + stat(s, FREE_FROZEN); + return; + } /* - * Objects left in the slab. If it was not on the partial list before - * then add it. + * was_frozen may have been set after we acquired the list_lock in + * an earlier loop. So we need to check it here again. */ - if (unlikely(!prior)) { - add_partial(get_node(s, page_to_nid(page)), page, 1); - stat(s, FREE_ADD_PARTIAL); - } + if (was_frozen) + stat(s, FREE_FROZEN); + else { + if (unlikely(!inuse && n->nr_partial > s->min_partial)) + goto slab_empty; -out_unlock: - slab_unlock(page); - local_irq_restore(flags); + /* + * Objects left in the slab. If it was not on the partial list before + * then add it. + */ + if (unlikely(!prior)) { + remove_full(s, page); + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } + spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: if (prior) { /* - * Slab still on the partial list. + * Slab on the partial list. */ - remove_partial(s, page); + remove_partial(n, page); stat(s, FREE_REMOVE_PARTIAL); - } - slab_unlock(page); - local_irq_restore(flags); + } else + /* Slab must be on the full list */ + remove_full(s, page); + + spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); } @@ -2118,7 +2526,6 @@ static __always_inline void slab_free(struct kmem_cache *s, slab_free_hook(s, x); redo: - /* * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since @@ -2319,7 +2726,7 @@ static unsigned long calculate_alignment(unsigned long flags, } static void -init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) +init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -2366,7 +2773,6 @@ static void early_kmem_cache_node_alloc(int node) { struct page *page; struct kmem_cache_node *n; - unsigned long flags; BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); @@ -2383,23 +2789,17 @@ static void early_kmem_cache_node_alloc(int node) n = page->freelist; BUG_ON(!n); page->freelist = get_freepointer(kmem_cache_node, n); - page->inuse++; + page->inuse = 1; + page->frozen = 0; kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - init_kmem_cache_node(n, kmem_cache_node); + init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); - /* - * lockdep requires consistent irq usage for each lock - * so even though there cannot be a race this early in - * the boot sequence, we still disable irqs. - */ - local_irq_save(flags); - add_partial(n, page, 0); - local_irq_restore(flags); + add_partial(n, page, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2436,7 +2836,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) } s->node[node] = n; - init_kmem_cache_node(n, s); + init_kmem_cache_node(n); } return 1; } @@ -2605,11 +3005,46 @@ static int kmem_cache_open(struct kmem_cache *s, } } +#ifdef CONFIG_CMPXCHG_DOUBLE + if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; +#endif + /* * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. */ - set_min_partial(s, ilog2(s->size)); + set_min_partial(s, ilog2(s->size) / 2); + + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * This setting also determines + * + * A) The number of objects from per cpu partial slabs dumped to the + * per node list when we reach the limit. + * B) The number of objects in cpu partial slabs to extract from the + * per node list when we run out of per cpu objects. We only fetch 50% + * to keep some capacity around for frees. + */ + if (kmem_cache_debug(s)) + s->cpu_partial = 0; + else if (s->size >= PAGE_SIZE) + s->cpu_partial = 2; + else if (s->size >= 1024) + s->cpu_partial = 6; + else if (s->size >= 256) + s->cpu_partial = 13; + else + s->cpu_partial = 30; + s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -2668,23 +3103,22 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, /* * Attempt to free all partial slabs on a node. + * This is called from kmem_cache_close(). We must be the last thread + * using the cache and therefore we do not need to lock anymore. */ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - unsigned long flags; struct page *page, *h; - spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - __remove_partial(n, page); + remove_partial(n, page); discard_slab(s, page); } else { list_slab_objects(s, page, "Objects remaining on kmem_cache_close()"); } } - spin_unlock_irqrestore(&n->list_lock, flags); } /* @@ -2718,6 +3152,7 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); + up_write(&slub_lock); if (kmem_cache_close(s)) { printk(KERN_ERR "SLUB %s: %s called for cache that " "still has objects.\n", s->name, __func__); @@ -2726,8 +3161,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); sysfs_slab_remove(s); - } - up_write(&slub_lock); + } else + up_write(&slub_lock); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2944,6 +3379,42 @@ size_t ksize(const void *object) } EXPORT_SYMBOL(ksize); +#ifdef CONFIG_SLUB_DEBUG +bool verify_mem_not_deleted(const void *x) +{ + struct page *page; + void *object = (void *)x; + unsigned long flags; + bool rv; + + if (unlikely(ZERO_OR_NULL_PTR(x))) + return false; + + local_irq_save(flags); + + page = virt_to_head_page(x); + if (unlikely(!PageSlab(page))) { + /* maybe it was from stack? */ + rv = true; + goto out_unlock; + } + + slab_lock(page); + if (on_freelist(page->slab, page, object)) { + object_err(page->slab, page, object, "Object is on free-list"); + rv = false; + } else { + rv = true; + } + slab_unlock(page); + +out_unlock: + local_irq_restore(flags); + return rv; +} +EXPORT_SYMBOL(verify_mem_not_deleted); +#endif + void kfree(const void *x) { struct page *page; @@ -3009,29 +3480,23 @@ int kmem_cache_shrink(struct kmem_cache *s) * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse && slab_trylock(page)) { - /* - * Must hold slab lock here because slab_free - * may have freed the last object and be - * waiting to release the slab. - */ - __remove_partial(n, page); - slab_unlock(page); - discard_slab(s, page); - } else { - list_move(&page->lru, - slabs_by_inuse + page->inuse); - } + list_move(&page->lru, slabs_by_inuse + page->inuse); + if (!page->inuse) + n->nr_partial--; } /* * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. */ - for (i = objects - 1; i >= 0; i--) + for (i = objects - 1; i > 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); spin_unlock_irqrestore(&n->list_lock, flags); + + /* Release empty slabs */ + list_for_each_entry_safe(page, t, slabs_by_inuse, lru) + discard_slab(s, page); } kfree(slabs_by_inuse); @@ -3119,7 +3584,7 @@ static int slab_mem_going_online_callback(void *arg) ret = -ENOMEM; goto out; } - init_kmem_cache_node(n, s); + init_kmem_cache_node(n); s->node[nid] = n; } out: @@ -3203,7 +3668,7 @@ void __init kmem_cache_init(void) /* Allocate two kmem_caches from the page allocator */ kmalloc_size = ALIGN(kmem_size, cache_line_size()); order = get_order(2 * kmalloc_size); - kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); + kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order); /* * Must first have the slab cache available for the allocations of the @@ -3605,12 +4070,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, static void validate_slab_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { - if (slab_trylock(page)) { - validate_slab(s, page, map); - slab_unlock(page); - } else - printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", - s->name, page); + slab_lock(page); + validate_slab(s, page, map); + slab_unlock(page); } static int validate_slab_node(struct kmem_cache *s, @@ -3991,22 +4453,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s, for_each_possible_cpu(cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + int node = ACCESS_ONCE(c->node); + struct page *page; - if (!c || c->node < 0) + if (node < 0) continue; - - if (c->page) { - if (flags & SO_TOTAL) - x = c->page->objects; + page = ACCESS_ONCE(c->page); + if (page) { + if (flags & SO_TOTAL) + x = page->objects; else if (flags & SO_OBJECTS) - x = c->page->inuse; + x = page->inuse; else x = 1; total += x; - nodes[c->node] += x; + nodes[node] += x; + } + page = c->partial; + + if (page) { + x = page->pobjects; + total += x; + nodes[node] += x; } - per_cpu[c->node]++; + per_cpu[node]++; } } @@ -4084,11 +4555,12 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = __ATTR_RO(_name) + static struct slab_attribute _name##_attr = \ + __ATTR(_name, 0400, _name##_show, NULL) #define SLAB_ATTR(_name) \ static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) + __ATTR(_name, 0600, _name##_show, _name##_store) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { @@ -4157,6 +4629,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, } SLAB_ATTR(min_partial); +static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%u\n", s->cpu_partial); +} + +static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long objects; + int err; + + err = strict_strtoul(buf, 10, &objects); + if (err) + return err; + + s->cpu_partial = objects; + flush_all(s); + return length; +} +SLAB_ATTR(cpu_partial); + static ssize_t ctor_show(struct kmem_cache *s, char *buf) { if (!s->ctor) @@ -4195,6 +4688,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objects_partial); +static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) +{ + int objects = 0; + int pages = 0; + int cpu; + int len; + + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + + if (page) { + pages += page->pages; + objects += page->pobjects; + } + } + + len = sprintf(buf, "%d(%d)", objects, pages); + +#ifdef CONFIG_SMP + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + + if (page && len < PAGE_SIZE - 20) + len += sprintf(buf + len, " C%d=%d(%d)", cpu, + page->pobjects, page->pages); + } +#endif + return len + sprintf(buf + len, "\n"); +} +SLAB_ATTR_RO(slabs_cpu_partial); + static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); @@ -4258,8 +4782,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s, const char *buf, size_t length) { s->flags &= ~SLAB_DEBUG_FREE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_DEBUG_FREE; + } return length; } SLAB_ATTR(sanity_checks); @@ -4273,8 +4799,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, size_t length) { s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_TRACE; + } return length; } SLAB_ATTR(trace); @@ -4291,8 +4819,10 @@ static ssize_t red_zone_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_RED_ZONE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_RED_ZONE; + } calculate_sizes(s, -1); return length; } @@ -4310,8 +4840,10 @@ static ssize_t poison_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_POISON; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_POISON; + } calculate_sizes(s, -1); return length; } @@ -4329,8 +4861,10 @@ static ssize_t store_user_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_STORE_USER; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_STORE_USER; + } calculate_sizes(s, -1); return length; } @@ -4495,6 +5029,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); STAT_ATTR(ALLOC_SLAB, alloc_slab); STAT_ATTR(ALLOC_REFILL, alloc_refill); +STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); STAT_ATTR(FREE_SLAB, free_slab); STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); STAT_ATTR(DEACTIVATE_FULL, deactivate_full); @@ -4502,7 +5037,12 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); +STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); +STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); +STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); +STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); +STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); #endif static struct attribute *slab_attrs[] = { @@ -4511,6 +5051,7 @@ static struct attribute *slab_attrs[] = { &objs_per_slab_attr.attr, &order_attr.attr, &min_partial_attr.attr, + &cpu_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, &partial_attr.attr, @@ -4523,6 +5064,7 @@ static struct attribute *slab_attrs[] = { &destroy_by_rcu_attr.attr, &shrink_attr.attr, &reserved_attr.attr, + &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, &slabs_attr.attr, @@ -4552,6 +5094,7 @@ static struct attribute *slab_attrs[] = { &alloc_from_partial_attr.attr, &alloc_slab_attr.attr, &alloc_refill_attr.attr, + &alloc_node_mismatch_attr.attr, &free_slab_attr.attr, &cpuslab_flush_attr.attr, &deactivate_full_attr.attr, @@ -4559,7 +5102,12 @@ static struct attribute *slab_attrs[] = { &deactivate_to_head_attr.attr, &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, + &deactivate_bypass_attr.attr, &order_fallback_attr.attr, + &cmpxchg_double_fail_attr.attr, + &cmpxchg_double_cpu_fail_attr.attr, + &cpu_partial_alloc_attr.attr, + &cpu_partial_free_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, @@ -4911,7 +5459,7 @@ static const struct file_operations proc_slabinfo_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); + proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); return 0; } module_init(slab_proc_init);