msm: kgsl: Optimize page_alloc allocations

User memory needs to be zeroed out before it is sent to the user.
To do this, the kernel maps the page, memsets it to zero and then
unmaps it.  Mapping the page in the kernel also forces us to flush the
dcache to ensure cache coherency between kernel and user mappings.
Originally, the page_alloc loop was using GFP_ZERO (which does a
map, memset, and unmap for each individual page) and then we were
additionally calling flush_dcache_page() for each page, which was
killing performance.  It is far more efficient, especially for large
allocations (> 1MB), to allocate the pages without GFP_ZERO and
then to vmap the entire allocation, memset it to zero, flush the
cache and then unmap. This process is slightly slower for very
small allocations, but only by a few microseconds, and is well
within the margin of acceptability. In all, the new scheme is
faster than the default for all sizes greater than 16k, and is
almost 4X faster for 2MB and 4MB allocations which are common for
textures and very large buffer objects.
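
For illustration only, here is a minimal standalone sketch of the
bulk-zeroing scheme described above; the authoritative code is in
the diff below. It assumes an ARM kernel of this generation, where
dmac_flush_range() and pgprot_writecombine() are available, and the
helper name zero_pages_bulk() is hypothetical.

#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <asm/cacheflush.h>

/* Hypothetical helper: zero and flush npages already-allocated pages */
static int zero_pages_bulk(struct page **pages, int npages)
{
        /* Map the entire allocation once instead of one page at a time */
        void *ptr = vmap(pages, npages, VM_IOREMAP,
                         pgprot_writecombine(PAGE_KERNEL));

        if (ptr == NULL)
                return -ENOMEM;

        /* One memset and one dcache flush cover the whole range */
        memset(ptr, 0, npages * PAGE_SIZE);
        dmac_flush_range(ptr, ptr + npages * PAGE_SIZE);
        vunmap(ptr);
        return 0;
}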

The downside is that if there isn't enough vmalloc room for the
allocation, we are forced to fall back to a slow page-by-page
memset/flush, but this should happen rarely (if at all) and the
fallback is only included for completeness.
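
As a companion to the sketch above, that page-by-page fallback could
look roughly like the following. It is likewise hypothetical and
simplified relative to the diff below; it uses the two-argument
kmap_atomic(page, KM_USER0) form that kernels of this vintage still
provide, and zero_pages_slow() is an illustrative name only.

#include <linux/highmem.h>
#include <linux/string.h>
#include <asm/cacheflush.h>

/* Hypothetical helper: per-page fallback for when vmap() fails */
static void zero_pages_slow(struct page **pages, int npages)
{
        int i;

        for (i = 0; i < npages; i++) {
                /* Old two-argument kmap_atomic() form used by this kernel */
                void *ptr = kmap_atomic(pages[i], KM_USER0);

                memset(ptr, 0, PAGE_SIZE);
                dmac_flush_range(ptr, ptr + PAGE_SIZE);
                kunmap_atomic(ptr, KM_USER0);
        }
}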

CRs-Fixed: 372638
Change-Id: Ic0dedbadf3e27dcddf0f068594a40c00d64b495e
Signed-off-by: Jordan Crouse <jcrouse@codeaurora.org>
commit 5a90ea6b5b2023a5daf055fa3397944aa3366ae9 1 parent c0cd1aa
Jordan Crouse authored; Grigori Goronzy committed
Showing 1 changed file with 71 additions and 7 deletions.
  1. +71 −7 drivers/gpu/msm/kgsl_sharedmem.c
drivers/gpu/msm/kgsl_sharedmem.c
@@ -15,6 +15,7 @@
#include <asm/cacheflush.h>
#include <linux/slab.h>
#include <linux/kmemleak.h>
+#include <linux/highmem.h>
#include "kgsl.h"
#include "kgsl_sharedmem.h"
@@ -467,9 +468,11 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc,
struct kgsl_pagetable *pagetable,
size_t size, unsigned int protflags)
{
- int order, ret = 0;
+ int i, order, ret = 0;
int sglen = PAGE_ALIGN(size) / PAGE_SIZE;
- int i;
+ struct page **pages = NULL;
+ pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL);
+ void *ptr;
memdesc->size = size;
memdesc->pagetable = pagetable;
@@ -485,22 +488,81 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc,
goto done;
}
+ /*
+ * Allocate space to store the list of pages to send to vmap.
+ * This is an array of pointers so we can track 1024 pages per page of
+ * allocation, which means we can handle up to an 8MB buffer request with
+ * two pages; well within the acceptable limits for using kmalloc.
+ */
+
+ pages = kmalloc(sglen * sizeof(struct page *), GFP_KERNEL);
+
+ if (pages == NULL) {
+ KGSL_CORE_ERR("kmalloc (%d) failed\n",
+ sglen * sizeof(struct page *));
+ ret = -ENOMEM;
+ goto done;
+ }
+
kmemleak_not_leak(memdesc->sg);
memdesc->sglen = sglen;
sg_init_table(memdesc->sg, sglen);
for (i = 0; i < memdesc->sglen; i++) {
- struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO |
- __GFP_HIGHMEM);
- if (!page) {
+
+ /*
+ * Don't use GFP_ZERO here because it is faster to memset the
+ * range ourselves (see below)
+ */
+
+ pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (pages[i] == NULL) {
ret = -ENOMEM;
memdesc->sglen = i;
goto done;
}
- flush_dcache_page(page);
- sg_set_page(&memdesc->sg[i], page, PAGE_SIZE, 0);
+ sg_set_page(&memdesc->sg[i], pages[i], PAGE_SIZE, 0);
}
+
+ /*
+ * All memory that goes to the user has to be zeroed out before it gets
+ * exposed to userspace. This means that the memory has to be mapped in
+ * the kernel, zeroed (memset) and then unmapped. This also means that
+ * the dcache has to be flushed to ensure coherency between the kernel
+ * and user pages. We used to pass __GFP_ZERO to alloc_page, which mapped,
+ * zeroed, and unmapped each individual page, and then we had to turn
+ * around and call flush_dcache_page() on that page to clear the caches.
+ * This was killing us for performance. Instead, we found it is much
+ * faster to allocate the pages without GFP_ZERO, map the entire range,
+ * memset it, flush the range and then unmap - this results in a factor
+ * of 4 improvement in speed for large buffers. There is a small
+ * decrease in speed for small buffers, but only on the order of a few
+ * microseconds at best. The only downside is that there needs to be
+ * enough temporary space in vmalloc to accommodate the map. This
+ * shouldn't be a problem, but if it happens, fall back to a much slower
+ * path
+ */
+
+ ptr = vmap(pages, i, VM_IOREMAP, page_prot);
+
+ if (ptr != NULL) {
+ memset(ptr, 0, memdesc->size);
+ dmac_flush_range(ptr, ptr + memdesc->size);
+ vunmap(ptr);
+ } else {
+ int j;
+
+ /* Very, very, very slow path */
+
+ for (j = 0; j < i; j++) {
+ ptr = kmap_atomic(pages[j], KM_USER0);
+ memset(ptr, 0, PAGE_SIZE);
+ dmac_flush_range(ptr, ptr + PAGE_SIZE);
+ kunmap_atomic(ptr, KM_USER0);
+ }
+ }
+
outer_cache_range_op_sg(memdesc->sg, memdesc->sglen,
KGSL_CACHE_OP_FLUSH);
@@ -518,6 +580,8 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc,
kgsl_driver.stats.histogram[order]++;
done:
+ kfree(pages);
+
if (ret)
kgsl_sharedmem_free(memdesc);