/*
* Copyright (C) 2013-2015 Kay Sievers
* Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
* Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
* Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
* Copyright (C) 2013-2015 Linux Foundation
* Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
*
* kdbus is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the
* Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*/
#include <linux/aio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/shmem_fs.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/uio.h>
#include <linux/version.h>
#include "pool.h"
#include "util.h"
/**
* struct kdbus_pool - the receiver's buffer
* @f: The backing shmem file
* @size: The size of the file
* @accounted_size: Currently accounted memory in bytes
* @lock: Pool data lock
* @slices: All slices sorted by address
* @slices_busy: Tree of allocated slices
* @slices_free: Tree of free slices
*
* The receiver's buffer, managed as a pool of allocated and free
* slices containing the queued messages.
*
 * Messages sent with KDBUS_CMD_SEND are copied directly by the
* sending process into the receiver's pool.
*
* Messages received with KDBUS_CMD_RECV just return the offset
* to the data placed in the pool.
*
* The internally allocated memory needs to be returned by the receiver
* with KDBUS_CMD_FREE.
*/
struct kdbus_pool {
struct file *f;
size_t size;
size_t accounted_size;
struct mutex lock;
struct list_head slices;
struct rb_root slices_busy;
struct rb_root slices_free;
};
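/*
 * Illustrative sketch, not part of the original sources: how a hypothetical
 * caller might drive the pool helpers defined below for a single message.
 * The pool size and the kvec describing the message are made-up inputs, and
 * error handling is omitted for brevity.
 *
 *	struct kdbus_pool *pool = kdbus_pool_new("peer-0", SZ_16M);
 *	struct kdbus_pool_slice *slice;
 *	u64 off, size;
 *
 *	slice = kdbus_pool_slice_alloc(pool, msg_size, true);
 *	kdbus_pool_slice_copy_kvec(slice, 0, kvec, kvec_len, msg_size);
 *
 *	publish the offset/size pair to the receiver (KDBUS_CMD_RECV) and
 *	drop the kernel reference:
 *
 *	kdbus_pool_slice_publish(slice, &off, &size);
 *	kdbus_pool_slice_release(slice);
 *
 *	later, when the receiver issues KDBUS_CMD_FREE on @off:
 *
 *	kdbus_pool_release_offset(pool, off);
 */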
/**
* struct kdbus_pool_slice - allocated element in kdbus_pool
* @pool: Pool this slice belongs to
* @off: Offset of slice in the shmem file
* @size: Size of slice
* @entry: Entry in "all slices" list
* @rb_node: Entry in free or busy list
* @free: Unused slice
* @accounted: Accounted as queue slice
* @ref_kernel: Kernel holds a reference
* @ref_user: Userspace holds a reference
*
* The pool has one or more slices, always spanning the entire size of the
* pool.
*
* Every slice is an element in a list sorted by the buffer address, to
* provide access to the next neighbor slice.
*
* Every slice is member in either the busy or the free tree. The free
* tree is organized by slice size, the busy tree organized by buffer
* offset.
*/
struct kdbus_pool_slice {
struct kdbus_pool *pool;
size_t off;
size_t size;
struct list_head entry;
struct rb_node rb_node;
bool free:1;
bool accounted:1;
bool ref_kernel:1;
bool ref_user:1;
};
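/*
 * Illustrative layout, added for clarity (not part of the original file):
 * a 64 KiB pool holding two busy slices and one free slice. The slices
 * always cover the whole pool; neighboring free slices are merged again in
 * __kdbus_pool_slice_release().
 *
 *	offset:  0        4K            20K                          64K
 *	         +--------+-------------+----------------------------+
 *	         |  busy  |    busy     |            free            |
 *	         +--------+-------------+----------------------------+
 *
 *	slices_busy: keyed by offset -> {0, 4K}
 *	slices_free: keyed by size   -> {44K}
 */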
static struct kdbus_pool_slice *kdbus_pool_slice_new(struct kdbus_pool *pool,
size_t off, size_t size)
{
struct kdbus_pool_slice *slice;
slice = kzalloc(sizeof(*slice), GFP_KERNEL);
if (!slice)
return NULL;
slice->pool = pool;
slice->off = off;
slice->size = size;
slice->free = true;
return slice;
}
/* insert a slice into the free tree */
static void kdbus_pool_add_free_slice(struct kdbus_pool *pool,
struct kdbus_pool_slice *slice)
{
struct rb_node **n;
struct rb_node *pn = NULL;
n = &pool->slices_free.rb_node;
while (*n) {
struct kdbus_pool_slice *pslice;
pn = *n;
pslice = rb_entry(pn, struct kdbus_pool_slice, rb_node);
if (slice->size < pslice->size)
n = &pn->rb_left;
else
n = &pn->rb_right;
}
rb_link_node(&slice->rb_node, pn, n);
rb_insert_color(&slice->rb_node, &pool->slices_free);
}
/* insert a slice into the busy tree */
static void kdbus_pool_add_busy_slice(struct kdbus_pool *pool,
struct kdbus_pool_slice *slice)
{
struct rb_node **n;
struct rb_node *pn = NULL;
n = &pool->slices_busy.rb_node;
while (*n) {
struct kdbus_pool_slice *pslice;
pn = *n;
pslice = rb_entry(pn, struct kdbus_pool_slice, rb_node);
if (slice->off < pslice->off)
n = &pn->rb_left;
else if (slice->off > pslice->off)
n = &pn->rb_right;
else
BUG();
}
rb_link_node(&slice->rb_node, pn, n);
rb_insert_color(&slice->rb_node, &pool->slices_busy);
}
static struct kdbus_pool_slice *kdbus_pool_find_slice(struct kdbus_pool *pool,
size_t off)
{
struct rb_node *n;
n = pool->slices_busy.rb_node;
while (n) {
struct kdbus_pool_slice *s;
s = rb_entry(n, struct kdbus_pool_slice, rb_node);
if (off < s->off)
n = n->rb_left;
else if (off > s->off)
n = n->rb_right;
else
return s;
}
return NULL;
}
/**
* kdbus_pool_slice_alloc() - allocate memory from a pool
* @pool: The receiver's pool
* @size: The number of bytes to allocate
* @accounted: Whether this slice should be accounted for
*
 * The returned slice is passed to kdbus_pool_slice_release() to free the
 * allocated memory again. Data can be copied into the slice afterwards via
 * kdbus_pool_slice_copy_kvec() or kdbus_pool_slice_copy_iovec().
*
* Return: the allocated slice on success, ERR_PTR on failure.
*/
struct kdbus_pool_slice *kdbus_pool_slice_alloc(struct kdbus_pool *pool,
size_t size, bool accounted)
{
size_t slice_size = KDBUS_ALIGN8(size);
struct rb_node *n, *found = NULL;
struct kdbus_pool_slice *s;
int ret = 0;
if (WARN_ON(!size))
return ERR_PTR(-EINVAL);
/* search for a free slice with the closest matching size */
mutex_lock(&pool->lock);
n = pool->slices_free.rb_node;
while (n) {
s = rb_entry(n, struct kdbus_pool_slice, rb_node);
if (slice_size < s->size) {
found = n;
n = n->rb_left;
} else if (slice_size > s->size) {
n = n->rb_right;
} else {
found = n;
break;
}
}
/* no slice with the minimum size found in the pool */
if (!found) {
ret = -EXFULL;
goto exit_unlock;
}
/* no exact match, use the closest one */
if (!n) {
struct kdbus_pool_slice *s_new;
s = rb_entry(found, struct kdbus_pool_slice, rb_node);
/* split off the remainder of the size into its own slice */
s_new = kdbus_pool_slice_new(pool, s->off + slice_size,
s->size - slice_size);
if (!s_new) {
ret = -ENOMEM;
goto exit_unlock;
}
list_add(&s_new->entry, &s->entry);
kdbus_pool_add_free_slice(pool, s_new);
/* adjust our size now that we split off another slice */
s->size = slice_size;
}
/* move the slice from the free tree to the busy tree */
rb_erase(found, &pool->slices_free);
kdbus_pool_add_busy_slice(pool, s);
WARN_ON(s->ref_kernel || s->ref_user);
s->ref_kernel = true;
s->free = false;
s->accounted = accounted;
if (accounted)
pool->accounted_size += s->size;
mutex_unlock(&pool->lock);
return s;
exit_unlock:
mutex_unlock(&pool->lock);
return ERR_PTR(ret);
}
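/*
 * Hypothetical caller sketch (not part of the original file): the allocator
 * hands out ERR_PTR() values on failure, so a caller is expected to check
 * the result with IS_ERR() before using the slice; -EXFULL signals that no
 * free slice was large enough.
 *
 *	struct kdbus_pool_slice *slice;
 *
 *	slice = kdbus_pool_slice_alloc(pool, want_bytes, true);
 *	if (IS_ERR(slice))
 *		return PTR_ERR(slice);
 */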
static void __kdbus_pool_slice_release(struct kdbus_pool_slice *slice)
{
struct kdbus_pool *pool = slice->pool;
/* don't free the slice while the kernel or user-space holds a reference */
if (slice->ref_kernel || slice->ref_user)
return;
if (WARN_ON(slice->free))
return;
rb_erase(&slice->rb_node, &pool->slices_busy);
/* merge with the next free slice */
if (!list_is_last(&slice->entry, &pool->slices)) {
struct kdbus_pool_slice *s;
s = list_entry(slice->entry.next,
struct kdbus_pool_slice, entry);
if (s->free) {
rb_erase(&s->rb_node, &pool->slices_free);
list_del(&s->entry);
slice->size += s->size;
kfree(s);
}
}
/* merge with previous free slice */
if (pool->slices.next != &slice->entry) {
struct kdbus_pool_slice *s;
s = list_entry(slice->entry.prev,
struct kdbus_pool_slice, entry);
if (s->free) {
rb_erase(&s->rb_node, &pool->slices_free);
list_del(&slice->entry);
s->size += slice->size;
kfree(slice);
slice = s;
}
}
slice->free = true;
kdbus_pool_add_free_slice(pool, slice);
}
/**
* kdbus_pool_slice_release() - drop kernel-reference on allocated slice
* @slice: Slice allocated from the pool
*
* This releases the kernel-reference on the given slice. If the
* kernel-reference and the user-reference on a slice are dropped, the slice is
* returned to the pool.
*
 * So far, we do not implement full ref-counting on slices. Each of the
 * kernel and user-space can hold exactly one reference to a slice. Once
 * both references have been dropped, the slice is released.
*/
void kdbus_pool_slice_release(struct kdbus_pool_slice *slice)
{
struct kdbus_pool *pool;
if (!slice)
return;
/* @slice may be freed, so keep local ptr to @pool */
pool = slice->pool;
mutex_lock(&pool->lock);
/* kernel must own a ref to @slice to drop it */
WARN_ON(!slice->ref_kernel);
slice->ref_kernel = false;
/* no longer kernel-owned, de-account slice */
if (slice->accounted && !WARN_ON(pool->accounted_size < slice->size))
pool->accounted_size -= slice->size;
__kdbus_pool_slice_release(slice);
mutex_unlock(&pool->lock);
}
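/*
 * Reference states of a busy slice, spelled out for illustration (not part
 * of the original file); the helpers above and below only toggle these two
 * flags:
 *
 *	ref_kernel  ref_user   state of the slice
 *	true        false      allocated, owned by the kernel only
 *	true        true       published via kdbus_pool_slice_publish()
 *	false       true       waiting for KDBUS_CMD_FREE from user-space
 *	false       false      merged back into the free tree
 */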
/**
* kdbus_pool_release_offset() - release a public offset
* @pool: pool to operate on
* @off: offset to release
*
* This should be called whenever user-space frees a slice given to them. It
* verifies the slice is available and public, and then drops it. It ensures
* correct locking and barriers against queues.
*
 * Return: 0 on success, -ENXIO if the offset is invalid or not public.
*/
int kdbus_pool_release_offset(struct kdbus_pool *pool, size_t off)
{
struct kdbus_pool_slice *slice;
int ret = 0;
/* 'pool->size' is used as a dummy offset for empty slices */
if (off == pool->size)
return 0;
mutex_lock(&pool->lock);
slice = kdbus_pool_find_slice(pool, off);
if (slice && slice->ref_user) {
slice->ref_user = false;
__kdbus_pool_slice_release(slice);
} else {
ret = -ENXIO;
}
mutex_unlock(&pool->lock);
return ret;
}
/**
* kdbus_pool_publish_empty() - publish empty slice to user-space
* @pool: pool to operate on
* @off: output storage for offset, or NULL
* @size: output storage for size, or NULL
*
* This is the same as kdbus_pool_slice_publish(), but uses a dummy slice with
 * size 0. The returned offset points to the end of the pool and is never
 * used for a real slice.
*/
void kdbus_pool_publish_empty(struct kdbus_pool *pool, u64 *off, u64 *size)
{
if (off)
*off = pool->size;
if (size)
*size = 0;
}
/**
* kdbus_pool_slice_publish() - publish slice to user-space
* @slice: The slice
* @out_offset: Output storage for offset, or NULL
* @out_size: Output storage for size, or NULL
*
* This prepares a slice to be published to user-space.
*
* This call combines the following operations:
* * the memory region is flushed so the user's memory view is consistent
* * the slice is marked as referenced by user-space, so user-space has to
* call KDBUS_CMD_FREE to release it
* * the offset and size of the slice are written to the given output
* arguments, if non-NULL
*/
void kdbus_pool_slice_publish(struct kdbus_pool_slice *slice,
u64 *out_offset, u64 *out_size)
{
mutex_lock(&slice->pool->lock);
/* kernel must own a ref to @slice to gain a user-space ref */
WARN_ON(!slice->ref_kernel);
slice->ref_user = true;
mutex_unlock(&slice->pool->lock);
if (out_offset)
*out_offset = slice->off;
if (out_size)
*out_size = slice->size;
}
/**
* kdbus_pool_slice_offset() - Get a slice's offset inside the pool
* @slice: Slice to return the offset of
*
 * Return: The internal offset of @slice inside the pool.
*/
off_t kdbus_pool_slice_offset(const struct kdbus_pool_slice *slice)
{
return slice->off;
}
/**
* kdbus_pool_slice_size() - get size of a pool slice
* @slice: slice to query
*
* Return: size of the given slice
*/
size_t kdbus_pool_slice_size(const struct kdbus_pool_slice *slice)
{
return slice->size;
}
/**
* kdbus_pool_new() - create a new pool
* @name: Name of the (deleted) file which shows up in
* /proc, used for debugging
* @size: Maximum size of the pool
*
* Return: a new kdbus_pool on success, ERR_PTR on failure.
*/
struct kdbus_pool *kdbus_pool_new(const char *name, size_t size)
{
struct kdbus_pool_slice *s;
struct kdbus_pool *p;
struct file *f;
char *n = NULL;
int ret;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
if (name) {
n = kasprintf(GFP_KERNEL, KBUILD_MODNAME "-conn:%s", name);
if (!n) {
ret = -ENOMEM;
goto exit_free;
}
}
f = shmem_file_setup(n ?: KBUILD_MODNAME "-conn", size, 0);
kfree(n);
if (IS_ERR(f)) {
ret = PTR_ERR(f);
goto exit_free;
}
ret = get_write_access(file_inode(f));
if (ret < 0)
goto exit_put_shmem;
/* allocate first slice spanning the entire pool */
s = kdbus_pool_slice_new(p, 0, size);
if (!s) {
ret = -ENOMEM;
goto exit_put_write;
}
p->f = f;
p->size = size;
p->slices_free = RB_ROOT;
p->slices_busy = RB_ROOT;
mutex_init(&p->lock);
INIT_LIST_HEAD(&p->slices);
list_add(&s->entry, &p->slices);
kdbus_pool_add_free_slice(p, s);
return p;
exit_put_write:
put_write_access(file_inode(f));
exit_put_shmem:
fput(f);
exit_free:
kfree(p);
return ERR_PTR(ret);
}
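/*
 * Hypothetical usage sketch (not part of the original file): creating a
 * receiver pool; @conn_name and @pool_size are made-up caller state. The
 * name only shows up in /proc for debugging, the size caps all later slice
 * allocations.
 *
 *	struct kdbus_pool *pool;
 *
 *	pool = kdbus_pool_new(conn_name, pool_size);
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 *	...
 *	kdbus_pool_free(pool);
 */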
/**
* kdbus_pool_free() - destroy pool
* @pool: The receiver's pool
*/
void kdbus_pool_free(struct kdbus_pool *pool)
{
struct kdbus_pool_slice *s, *tmp;
if (!pool)
return;
list_for_each_entry_safe(s, tmp, &pool->slices, entry) {
list_del(&s->entry);
kfree(s);
}
put_write_access(file_inode(pool->f));
fput(pool->f);
kfree(pool);
}
/**
* kdbus_pool_accounted() - retrieve accounting information
* @pool: pool to query
* @size: output for overall pool size
* @acc: output for currently accounted size
*
* This returns accounting information of the pool. Note that the data might
* change after the function returns, as the pool lock is dropped. You need to
* protect the data via other means, if you need reliable accounting.
*/
void kdbus_pool_accounted(struct kdbus_pool *pool, size_t *size, size_t *acc)
{
mutex_lock(&pool->lock);
if (size)
*size = pool->size;
if (acc)
*acc = pool->accounted_size;
mutex_unlock(&pool->lock);
}
/**
* kdbus_pool_slice_copy_iovec() - copy user memory to a slice
* @slice: The slice to write to
* @off: Offset in the slice to write to
* @iov: iovec array, pointing to data to copy
* @iov_len: Number of elements in @iov
* @total_len: Total number of bytes described in members of @iov
*
* User memory referenced by @iov will be copied into @slice at offset @off.
*
 * Return: the number of bytes copied, negative errno on failure.
*/
ssize_t
kdbus_pool_slice_copy_iovec(const struct kdbus_pool_slice *slice, loff_t off,
struct iovec *iov, size_t iov_len, size_t total_len)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0)
struct file *f = slice->pool->f;
struct iov_iter iter;
struct kiocb kiocb;
ssize_t len;
if (WARN_ON(off + total_len > slice->size))
return -EFAULT;
init_sync_kiocb(&kiocb, f);
kiocb.ki_pos = slice->off + off;
kiocb.ki_nbytes = total_len;
iov_iter_init(&iter, WRITE, iov, iov_len, total_len);
len = f->f_op->write_iter(&kiocb, &iter);
if (len < 0)
return len;
if (len != total_len)
return -EFAULT;
return len;
#else
struct iov_iter iter;
ssize_t len;
if (WARN_ON(off + total_len > slice->size))
return -EFAULT;
off += slice->off;
iov_iter_init(&iter, WRITE, iov, iov_len, total_len);
len = vfs_iter_write(slice->pool->f, &iter, &off);
return (len >= 0 && len != total_len) ? -EFAULT : len;
#endif
}
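/*
 * Hedged sketch (not part of the original file): copying a sender's iovec
 * array straight into a slice of the receiver's pool, as described for
 * KDBUS_CMD_SEND in the struct kdbus_pool comment above. @iov, @iov_len and
 * @payload_size stand in for whatever the caller parsed from the command.
 *
 *	ssize_t n;
 *
 *	n = kdbus_pool_slice_copy_iovec(slice, 0, iov, iov_len, payload_size);
 *	if (n < 0)
 *		return n;
 */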
/**
* kdbus_pool_slice_copy_kvec() - copy kernel memory to a slice
* @slice: The slice to write to
* @off: Offset in the slice to write to
* @kvec: kvec array, pointing to data to copy
* @kvec_len: Number of elements in @kvec
* @total_len: Total number of bytes described in members of @kvec
*
* Kernel memory referenced by @kvec will be copied into @slice at offset @off.
*
 * Return: the number of bytes copied, negative errno on failure.
*/
ssize_t kdbus_pool_slice_copy_kvec(const struct kdbus_pool_slice *slice,
loff_t off, struct kvec *kvec,
size_t kvec_len, size_t total_len)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0)
struct file *f = slice->pool->f;
struct iov_iter iter;
mm_segment_t old_fs;
struct kiocb kiocb;
ssize_t len;
if (WARN_ON(off + total_len > slice->size))
return -EFAULT;
old_fs = get_fs();
set_fs(get_ds());
init_sync_kiocb(&kiocb, f);
kiocb.ki_pos = slice->off + off;
kiocb.ki_nbytes = total_len;
iov_iter_kvec(&iter, WRITE | ITER_KVEC, kvec, kvec_len, total_len);
len = f->f_op->write_iter(&kiocb, &iter);
set_fs(old_fs);
if (len < 0)
return len;
if (len != total_len)
return -EFAULT;
return len;
#else
struct iov_iter iter;
mm_segment_t old_fs;
ssize_t len;
if (WARN_ON(off + total_len > slice->size))
return -EFAULT;
off += slice->off;
iov_iter_kvec(&iter, WRITE | ITER_KVEC, kvec, kvec_len, total_len);
old_fs = get_fs();
set_fs(get_ds());
len = vfs_iter_write(slice->pool->f, &iter, &off);
set_fs(old_fs);
return (len >= 0 && len != total_len) ? -EFAULT : len;
#endif
}
/**
* kdbus_pool_slice_copy() - copy data from one slice into another
* @slice_dst: destination slice
* @slice_src: source slice
*
* Return: 0 on success, negative error number on failure.
*/
int kdbus_pool_slice_copy(const struct kdbus_pool_slice *slice_dst,
const struct kdbus_pool_slice *slice_src)
{
struct file *f_src = slice_src->pool->f;
struct file *f_dst = slice_dst->pool->f;
struct inode *i_dst = file_inode(f_dst);
struct address_space *mapping_dst = f_dst->f_mapping;
const struct address_space_operations *aops = mapping_dst->a_ops;
unsigned long len = slice_src->size;
loff_t off_src = slice_src->off;
loff_t off_dst = slice_dst->off;
mm_segment_t old_fs;
int ret = 0;
if (WARN_ON(slice_src->size != slice_dst->size) ||
WARN_ON(slice_src->free || slice_dst->free))
return -EINVAL;
mutex_lock(&i_dst->i_mutex);
old_fs = get_fs();
set_fs(get_ds());
while (len > 0) {
unsigned long page_off;
unsigned long copy_len;
char __user *kaddr;
struct page *page;
ssize_t n_read;
void *fsdata;
long status;
page_off = off_dst & (PAGE_CACHE_SIZE - 1);
copy_len = min_t(unsigned long,
PAGE_CACHE_SIZE - page_off, len);
status = aops->write_begin(f_dst, mapping_dst, off_dst,
copy_len, 0, &page, &fsdata);
if (unlikely(status < 0)) {
ret = status;
break;
}
kaddr = (char __force __user *)kmap(page) + page_off;
n_read = f_src->f_op->read(f_src, kaddr, copy_len, &off_src);
kunmap(page);
mark_page_accessed(page);
flush_dcache_page(page);
if (unlikely(n_read != copy_len)) {
ret = -EFAULT;
break;
}
status = aops->write_end(f_dst, mapping_dst, off_dst,
copy_len, copy_len, page, fsdata);
if (unlikely(status != copy_len)) {
ret = -EFAULT;
break;
}
off_dst += copy_len;
len -= copy_len;
}
set_fs(old_fs);
mutex_unlock(&i_dst->i_mutex);
return ret;
}
/**
* kdbus_pool_mmap() - map the pool into the process
* @pool: The receiver's pool
* @vma: passed by mmap() syscall
*
* Return: the result of the mmap() call, negative errno on failure.
*/
int kdbus_pool_mmap(const struct kdbus_pool *pool, struct vm_area_struct *vma)
{
/* deny write access to the pool */
if (vma->vm_flags & VM_WRITE)
return -EPERM;
vma->vm_flags &= ~VM_MAYWRITE;
/* do not allow mapping more than the size of the file */
if ((vma->vm_end - vma->vm_start) > pool->size)
return -EFAULT;
/* replace the connection file with our shmem file */
if (vma->vm_file)
fput(vma->vm_file);
vma->vm_file = get_file(pool->f);
return pool->f->f_op->mmap(pool->f, vma);
}
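/*
 * Hypothetical sketch (not part of the original file): a connection file's
 * mmap handler could simply forward to kdbus_pool_mmap(), so the receiver
 * maps the backing shmem file read-only; "conn" stands in for whatever
 * object owns the pool.
 *
 *	static int conn_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct conn *conn = file->private_data;
 *
 *		return kdbus_pool_mmap(conn->pool, vma);
 *	}
 */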