
Add TRIM support for disk vdevs.

This adds TRIM (a.k.a. UNMAP, DISCARD, hole punching) support for disk
vdevs. The original patch is from Pawel Jakub Dawidek
<pjd@FreeBSD.org>, who wrote it for FreeBSD. Etienne Dechamps
<etienne.dechamps@ovh.net> ported it to ZFS On Linux.

The code builds a map of regions that were freed. On every write the
code consults the map and removes any ranges that were freed before but
are now being overwritten.

Freed blocks are not TRIMmed immediately. A tunable, trim_txg_limit
(64 by default), defines how many txgs to wait before TRIMming freed
blocks.
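
As a stand-alone sketch (not part of the commit), the age check
performed by trim_map_first()/trim_map_vdev_commit() in the new
trim_map.c reduces to the following; the helper name and txg values are
hypothetical:

/*
 * A freed segment becomes eligible for TRIM once it is at least
 * trim_txg_limit txgs older than the currently syncing txg. (The commit
 * additionally skips the pass while spa_syncing_txg <= trim_txg_limit.)
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t trim_txg_limit = 64;	/* module default in this commit */

static int
trim_eligible(uint64_t seg_txg, uint64_t syncing_txg)
{
	return (seg_txg <= syncing_txg - trim_txg_limit);
}

int
main(void)
{
	/* A block freed in txg 100 is TRIMmed once txg 164 is syncing. */
	printf("txg 163: %d\n", trim_eligible(100, 163));	/* prints 0 */
	printf("txg 164: %d\n", trim_eligible(100, 164));	/* prints 1 */
	return (0);
}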

There is a low-priority thread that TRIMs ranges when the time comes.
During TRIM we keep in-flight ranges on a list to detect colliding
writes: we have to delay writes that collide with in-flight TRIMs, in
case requests are reordered and the write reaches the disk before the
TRIM. We don't have to do the same for in-flight writes, as colliding
writes simply remove the overlapping ranges from the set of ranges to
TRIM.
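
The collision test itself relies on trim_map_seg_compare() (in the new
trim_map.c below) treating any two overlapping ranges as equal, so an
AVL lookup of a write's range finds an overlapping in-flight TRIM
segment. A stand-alone sketch, with a hypothetical type and made-up
range values:

#include <stdio.h>
#include <stdint.h>

typedef struct seg {
	uint64_t start;	/* inclusive */
	uint64_t end;	/* exclusive */
} seg_t;

/* Same comparison logic as trim_map_seg_compare(), minus the AVL plumbing. */
static int
seg_compare(const seg_t *s1, const seg_t *s2)
{
	if (s1->start < s2->start)
		return (s1->end > s2->start ? 0 : -1);
	if (s1->start > s2->start)
		return (s1->start < s2->end ? 0 : 1);
	return (0);
}

int
main(void)
{
	seg_t trim = { 4096, 8192 };		/* an in-flight TRIM */
	seg_t write1 = { 6144, 12288 };		/* overlaps the TRIM */
	seg_t write2 = { 8192, 12288 };		/* merely adjacent */

	/* Prints "0 1": write1 compares equal (collision), write2 does not. */
	printf("%d %d\n", seg_compare(&write1, &trim),
	    seg_compare(&write2, &trim));
	return (0);
}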

Most of the code stayed unchanged during the port to Linux. The only
big change is in the vdev disk module, since the FreeBSD and Linux
interfaces for issuing discards to block devices are obviously
different. On FreeBSD it seems that issuing a DELETE request of any
size is sufficient; on Linux we have to be careful not to exceed the
maximum discard limits. That's why we introduce a new
vdev_disk_io_trim() function, inspired by the Linux
blkdev_issue_discard() function and the pre-existing vdev_disk_physio()
function. The new function takes care of splitting discard requests
into smaller ones if necessary.
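
A stand-alone sketch (not part of the commit) of that splitting
arithmetic; the sizes below are hypothetical, and the real
vdev_disk_io_trim() further rounds the limit down to the discard
granularity:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t io_offset = 0;				/* bytes */
	uint64_t io_size = 5ULL << 20;			/* a 5 MiB discard */
	uint64_t max_discard_sectors = 2048;		/* 1 MiB queue limit */
	uint64_t sector = io_offset >> 9;		/* 512-byte sectors */
	uint64_t count = io_size >> 9;

	/* Emit one bio per chunk of at most max_discard_sectors sectors. */
	while (count > 0) {
		uint64_t chunk = (count > max_discard_sectors) ?
		    max_discard_sectors : count;
		printf("bio: sector %llu, %llu sectors\n",
		    (unsigned long long)sector, (unsigned long long)chunk);
		sector += chunk;
		count -= chunk;
	}
	return (0);
}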

In theory, the code should work for main pool disk vdevs, slog disk
vdevs, and L2ARC disk vdevs, and it supports mirror and raidz
configurations. File vdevs are not supported yet.

Note that the new feature is disabled by default (zfs_notrim=1). To use
it, you have to explicitly set the module parameter "zfs_notrim" to "0"
(e.g. by loading the module with modprobe zfs zfs_notrim=0). Be aware
that this code is largely untested and carries a serious risk of data
corruption. Use at your own risk and expect data loss.
1 parent 2b28613 commit cc6cd40ad71e1e611591929ad08184516357eaf5 @dechamps committed Aug 31, 2012
3 include/sys/spa_impl.h
@@ -217,6 +217,9 @@ struct spa {
spa_proc_state_t spa_proc_state; /* see definition */
proc_t *spa_proc; /* "zpool-poolname" process */
uint64_t spa_did; /* if procp != p0, did of t1 */
+ kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */
+ kmutex_t spa_trim_lock; /* protects spa_trim_cv */
+ kcondvar_t spa_trim_cv; /* used to notify TRIM thread */
boolean_t spa_autoreplace; /* autoreplace set in open */
int spa_vdev_locks; /* locks grabbed */
uint64_t spa_creation_version; /* version at pool creation */
51 include/sys/trim_map.h
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ */
+
+#ifndef _SYS_TRIM_MAP_H
+#define _SYS_TRIM_MAP_H
+
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void trim_map_create(vdev_t *vd);
+extern void trim_map_destroy(vdev_t *vd);
+extern void trim_map_free(zio_t *zio);
+extern boolean_t trim_map_write_start(zio_t *zio);
+extern void trim_map_write_done(zio_t *zio);
+
+extern void trim_thread_create(spa_t *spa);
+extern void trim_thread_destroy(spa_t *spa);
+extern void trim_thread_wakeup(spa_t *spa);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TRIM_MAP_H */
1 include/sys/vdev.h
@@ -45,6 +45,7 @@ typedef enum vdev_dtl_type {
} vdev_dtl_type_t;
extern int zfs_nocacheflush;
+extern int zfs_notrim;
extern int vdev_open(vdev_t *);
extern void vdev_open_children(vdev_t *);
2 include/sys/vdev_impl.h
@@ -186,6 +186,7 @@ struct vdev {
uint64_t vdev_unspare; /* unspare when resilvering done */
hrtime_t vdev_last_try; /* last reopen time */
boolean_t vdev_nowritecache; /* true if flushwritecache failed */
+ boolean_t vdev_notrim; /* true if trim failed */
boolean_t vdev_checkremove; /* temporary online test */
boolean_t vdev_forcefault; /* force online fault */
boolean_t vdev_splitting; /* split or repair in progress */
@@ -201,6 +202,7 @@ struct vdev {
spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
vdev_aux_t vdev_label_aux; /* on-disk aux state */
+ struct trim_map *vdev_trimmap;
/*
* For DTrace to work in userland (libzpool) context, these fields must
10 include/sys/zio.h
@@ -140,7 +140,8 @@ enum zio_compress {
#define ZIO_PRIORITY_RESILVER (zio_priority_table[9])
#define ZIO_PRIORITY_SCRUB (zio_priority_table[10])
#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11])
-#define ZIO_PRIORITY_TABLE_SIZE 12
+#define ZIO_PRIORITY_TRIM (zio_priority_table[12])
+#define ZIO_PRIORITY_TABLE_SIZE 13
#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
@@ -429,6 +430,9 @@ struct zio {
/* Taskq dispatching state */
taskq_ent_t io_tqent;
+
+ avl_node_t io_trim_node;
+ list_node_t io_trim_link;
};
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
@@ -459,7 +463,8 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, enum zio_flag flags);
+ uint64_t offset, uint64_t size, zio_done_func_t *done,
+ void *private, int priority, enum zio_flag flags);
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
@@ -478,6 +483,7 @@ extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern void zio_trim(zio_t *zio, vdev_t *vd, uint64_t offset, uint64_t size);
extern void zio_shrink(zio_t *zio, uint64_t size);
extern int zio_wait(zio_t *zio);
10 include/sys/zio_impl.h
@@ -60,9 +60,9 @@ enum zio_stage {
ZIO_STAGE_READY = 1 << 15, /* RWFCI */
- ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */
- ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */
- ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */
+ ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RWF-I */
+ ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RWF-- */
+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RWF-I */
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */
@@ -143,7 +143,9 @@ enum zio_stage {
#define ZIO_FREE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_STAGE_FREE_BP_INIT | \
- ZIO_STAGE_DVA_FREE)
+ ZIO_STAGE_DVA_FREE | \
+ ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
#define ZIO_DDT_FREE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
2 lib/libspl/include/sys/dkio.h
@@ -188,6 +188,8 @@ struct dk_geom {
*/
#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */
+#define DKIOCTRIM (DKIOC|35) /* TRIM a block */
+
struct dk_callback {
void (*dkc_callback)(void *dkc_cookie, int error);
void *dkc_cookie;
1 lib/libzpool/Makefile.am
@@ -59,6 +59,7 @@ libzpool_la_SOURCES = \
$(top_srcdir)/module/zfs/spa_history.c \
$(top_srcdir)/module/zfs/spa_misc.c \
$(top_srcdir)/module/zfs/space_map.c \
+ $(top_srcdir)/module/zfs/trim_map.c \
$(top_srcdir)/module/zfs/txg.c \
$(top_srcdir)/module/zfs/uberblock.c \
$(top_srcdir)/module/zfs/unique.c \
1 module/avl/avl.c
@@ -1052,6 +1052,7 @@ EXPORT_SYMBOL(avl_nearest);
EXPORT_SYMBOL(avl_add);
EXPORT_SYMBOL(avl_remove);
EXPORT_SYMBOL(avl_numnodes);
+EXPORT_SYMBOL(avl_is_empty);
EXPORT_SYMBOL(avl_destroy_nodes);
EXPORT_SYMBOL(avl_destroy);
#endif
1 module/zfs/Makefile.in
@@ -43,6 +43,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o
$(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/trim_map.o
$(MODULE)-objs += @top_srcdir@/module/zfs/txg.o
$(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o
$(MODULE)-objs += @top_srcdir@/module/zfs/unique.o
17 module/zfs/spa.c
@@ -63,6 +63,7 @@
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
+#include <sys/trim_map.h>
#ifdef _KERNEL
#include <sys/bootprops.h>
@@ -854,6 +855,11 @@ spa_activate(spa_t *spa, int mode)
spa_create_zio_taskqs(spa);
}
+ /*
+ * Start TRIM thread.
+ */
+ trim_thread_create(spa);
+
list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_config_dirty_node));
list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
@@ -884,6 +890,12 @@ spa_deactivate(spa_t *spa)
ASSERT(spa->spa_async_zio_root == NULL);
ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
+ /*
+ * Stop TRIM thread in case spa_unload() wasn't called before
+ * spa_deactivate().
+ */
+ trim_thread_destroy(spa);
+
txg_list_destroy(&spa->spa_vdev_txg_list);
list_destroy(&spa->spa_config_dirty_list);
@@ -999,6 +1011,11 @@ spa_unload(spa_t *spa)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
/*
+ * Stop TRIM thread.
+ */
+ trim_thread_destroy(spa);
+
+ /*
* Stop async tasks.
*/
spa_async_suspend(spa);
535 module/zfs/trim_map.c
@@ -0,0 +1,535 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/trim_map.h>
+
+typedef struct trim_map {
+ list_t tm_head; /* List of segments sorted by txg. */
+ avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */
+ avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */
+ avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */
+ list_t tm_pending_writes; /* Writes blocked on in-flight frees. */
+ kmutex_t tm_lock;
+} trim_map_t;
+
+typedef struct trim_seg {
+ avl_node_t ts_node; /* AVL node. */
+ list_node_t ts_next; /* List element. */
+ uint64_t ts_start; /* Starting offset of this segment. */
+ uint64_t ts_end; /* Ending offset (non-inclusive). */
+ uint64_t ts_txg; /* Segment creation txg. */
+} trim_seg_t;
+
+/*
+ * TRIM support. Disabled by default.
+ * HIGHLY EXPERIMENTAL, USE AT YOUR OWN RISK, EXPECT DATA LOSS
+ */
+int zfs_notrim = B_TRUE;
+int trim_txg_limit = 64;
+
+static int
+trim_map_seg_compare(const void *x1, const void *x2)
+{
+ const trim_seg_t *s1 = x1;
+ const trim_seg_t *s2 = x2;
+
+ if (s1->ts_start < s2->ts_start) {
+ if (s1->ts_end > s2->ts_start)
+ return (0);
+ return (-1);
+ }
+ if (s1->ts_start > s2->ts_start) {
+ if (s1->ts_start < s2->ts_end)
+ return (0);
+ return (1);
+ }
+ return (0);
+}
+
+static int
+trim_map_zio_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_offset < z2->io_offset) {
+ if (z1->io_offset + z1->io_size > z2->io_offset)
+ return (0);
+ return (-1);
+ }
+ if (z1->io_offset > z2->io_offset) {
+ if (z1->io_offset < z2->io_offset + z2->io_size)
+ return (0);
+ return (1);
+ }
+ return (0);
+}
+
+void
+trim_map_create(vdev_t *vd)
+{
+ trim_map_t *tm;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if (zfs_notrim)
+ return;
+
+ tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
+ mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&tm->tm_head, sizeof (trim_seg_t),
+ offsetof(trim_seg_t, ts_next));
+ list_create(&tm->tm_pending_writes, sizeof (zio_t),
+ offsetof(zio_t, io_trim_link));
+ avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
+ sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
+ avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
+ sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
+ avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
+ sizeof (zio_t), offsetof(zio_t, io_trim_node));
+ vd->vdev_trimmap = tm;
+}
+
+void
+trim_map_destroy(vdev_t *vd)
+{
+ trim_map_t *tm;
+ trim_seg_t *ts;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if (zfs_notrim)
+ return;
+
+ tm = vd->vdev_trimmap;
+ if (tm == NULL)
+ return;
+
+ mutex_enter(&tm->tm_lock);
+ while ((ts = list_head(&tm->tm_head)) != NULL) {
+ avl_remove(&tm->tm_queued_frees, ts);
+ list_remove(&tm->tm_head, ts);
+ kmem_free(ts, sizeof (*ts));
+ }
+ mutex_exit(&tm->tm_lock);
+
+ avl_destroy(&tm->tm_queued_frees);
+ avl_destroy(&tm->tm_inflight_frees);
+ avl_destroy(&tm->tm_inflight_writes);
+ list_destroy(&tm->tm_pending_writes);
+ list_destroy(&tm->tm_head);
+ mutex_destroy(&tm->tm_lock);
+ kmem_free(tm, sizeof (*tm));
+ vd->vdev_trimmap = NULL;
+}
+
+static void
+trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
+{
+ avl_index_t where;
+ trim_seg_t tsearch, *ts_before, *ts_after, *ts;
+ boolean_t merge_before, merge_after;
+
+ ASSERT(MUTEX_HELD(&tm->tm_lock));
+ VERIFY(start < end);
+
+ tsearch.ts_start = start;
+ tsearch.ts_end = end;
+
+ ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
+ if (ts != NULL) {
+ if (start < ts->ts_start)
+ trim_map_segment_add(tm, start, ts->ts_start, txg);
+ if (end > ts->ts_end)
+ trim_map_segment_add(tm, ts->ts_end, end, txg);
+ return;
+ }
+
+ ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
+ ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
+
+ merge_before = (ts_before != NULL && ts_before->ts_end == start &&
+ ts_before->ts_txg == txg);
+ merge_after = (ts_after != NULL && ts_after->ts_start == end &&
+ ts_after->ts_txg == txg);
+
+ if (merge_before && merge_after) {
+ avl_remove(&tm->tm_queued_frees, ts_before);
+ list_remove(&tm->tm_head, ts_before);
+ ts_after->ts_start = ts_before->ts_start;
+ kmem_free(ts_before, sizeof (*ts_before));
+ } else if (merge_before) {
+ ts_before->ts_end = end;
+ } else if (merge_after) {
+ ts_after->ts_start = start;
+ } else {
+ ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
+ ts->ts_start = start;
+ ts->ts_end = end;
+ ts->ts_txg = txg;
+ avl_insert(&tm->tm_queued_frees, ts, where);
+ list_insert_tail(&tm->tm_head, ts);
+ }
+}
+
+static void
+trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
+ uint64_t end)
+{
+ trim_seg_t *nts;
+ boolean_t left_over, right_over;
+
+ ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+ left_over = (ts->ts_start < start);
+ right_over = (ts->ts_end > end);
+
+ if (left_over && right_over) {
+ nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
+ nts->ts_start = end;
+ nts->ts_end = ts->ts_end;
+ nts->ts_txg = ts->ts_txg;
+ ts->ts_end = start;
+ avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
+ list_insert_after(&tm->tm_head, ts, nts);
+ } else if (left_over) {
+ ts->ts_end = start;
+ } else if (right_over) {
+ ts->ts_start = end;
+ } else {
+ avl_remove(&tm->tm_queued_frees, ts);
+ list_remove(&tm->tm_head, ts);
+ kmem_free(ts, sizeof (*ts));
+ }
+}
+
+static void
+trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
+{
+ zio_t *zs;
+ /*
+ * Declaring a zio_t variable on the stack makes the frame size too
+ * large for the taste of GCC. We work around the issue by allocating
+ * just the fields we actually need.
+ */
+ char zsearch_buffer[offsetof(zio_t, io_offset) + sizeof(zs->io_offset)];
+ zio_t *zsearch = (zio_t *)zsearch_buffer;
+
+ ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+ zsearch->io_offset = start;
+ zsearch->io_size = end - start;
+
+ zs = avl_find(&tm->tm_inflight_writes, zsearch, NULL);
+ if (zs == NULL) {
+ trim_map_segment_add(tm, start, end, txg);
+ return;
+ }
+ if (start < zs->io_offset)
+ trim_map_free_locked(tm, start, zs->io_offset, txg);
+ if (zs->io_offset + zs->io_size < end)
+ trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
+}
+
+void
+trim_map_free(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ trim_map_t *tm = vd->vdev_trimmap;
+
+ if (zfs_notrim || vd->vdev_notrim || tm == NULL)
+ return;
+
+ mutex_enter(&tm->tm_lock);
+ trim_map_free_locked(tm, zio->io_offset, zio->io_offset + zio->io_size,
+ vd->vdev_spa->spa_syncing_txg);
+ mutex_exit(&tm->tm_lock);
+}
+
+boolean_t
+trim_map_write_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ trim_map_t *tm = vd->vdev_trimmap;
+ trim_seg_t tsearch, *ts;
+ uint64_t start, end;
+
+ if (zfs_notrim || vd->vdev_notrim || tm == NULL)
+ return (B_TRUE);
+
+ start = zio->io_offset;
+ end = start + zio->io_size;
+ tsearch.ts_start = start;
+ tsearch.ts_end = end;
+
+ mutex_enter(&tm->tm_lock);
+
+ /*
+ * Checking for colliding in-flight frees.
+ */
+ ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
+ if (ts != NULL) {
+ list_insert_tail(&tm->tm_pending_writes, zio);
+ mutex_exit(&tm->tm_lock);
+ return (B_FALSE);
+ }
+
+ ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
+ if (ts != NULL) {
+ /*
+ * Loop until all overlapping segments are removed.
+ */
+ do {
+ trim_map_segment_remove(tm, ts, start, end);
+ ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
+ } while (ts != NULL);
+ }
+ avl_add(&tm->tm_inflight_writes, zio);
+
+ mutex_exit(&tm->tm_lock);
+
+ return (B_TRUE);
+}
+
+void
+trim_map_write_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ trim_map_t *tm = vd->vdev_trimmap;
+
+ if (zfs_notrim || vd->vdev_notrim || tm == NULL)
+ return;
+
+ mutex_enter(&tm->tm_lock);
+ avl_remove(&tm->tm_inflight_writes, zio);
+ mutex_exit(&tm->tm_lock);
+}
+
+/*
+ * Return the oldest segment (the one with the lowest txg) or NULL if
+ * the list is empty or the first element's txg is greater than the txg
+ * given as the function argument.
+ */
+static trim_seg_t *
+trim_map_first(trim_map_t *tm, uint64_t txg)
+{
+ trim_seg_t *ts;
+
+ ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+ ts = list_head(&tm->tm_head);
+ if (ts != NULL && ts->ts_txg <= txg)
+ return (ts);
+ return (NULL);
+}
+
+static void
+trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
+{
+ trim_map_t *tm = vd->vdev_trimmap;
+ trim_seg_t *ts;
+ uint64_t txglimit;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if (tm == NULL)
+ return;
+
+ txglimit = spa->spa_syncing_txg - trim_txg_limit;
+
+ mutex_enter(&tm->tm_lock);
+ /*
+ * Loop until we send all frees up to the txglimit.
+ */
+ while ((ts = trim_map_first(tm, txglimit)) != NULL) {
+ list_remove(&tm->tm_head, ts);
+ avl_remove(&tm->tm_queued_frees, ts);
+ avl_add(&tm->tm_inflight_frees, ts);
+ zio_trim(zio, vd, ts->ts_start, ts->ts_end - ts->ts_start);
+ }
+ mutex_exit(&tm->tm_lock);
+}
+
+static void
+trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
+{
+ trim_map_t *tm = vd->vdev_trimmap;
+ trim_seg_t *ts;
+ list_t pending_writes;
+ zio_t *zio;
+ void *cookie;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if (tm == NULL)
+ return;
+
+ mutex_enter(&tm->tm_lock);
+ if (!avl_is_empty(&tm->tm_inflight_frees)) {
+ cookie = NULL;
+ while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
+ &cookie)) != NULL) {
+ kmem_free(ts, sizeof (*ts));
+ }
+ }
+ list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
+ io_trim_link));
+ list_move_tail(&pending_writes, &tm->tm_pending_writes);
+ mutex_exit(&tm->tm_lock);
+
+ while ((zio = list_remove_head(&pending_writes)) != NULL) {
+ zio_vdev_io_reissue(zio);
+ zio_execute(zio);
+ }
+ list_destroy(&pending_writes);
+}
+
+static void
+trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
+{
+ int c;
+
+ if (vd == NULL || spa->spa_syncing_txg <= trim_txg_limit)
+ return;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ trim_map_vdev_commit(spa, zio, vd);
+ } else {
+ for (c = 0; c < vd->vdev_children; c++)
+ trim_map_commit(spa, zio, vd->vdev_child[c]);
+ }
+}
+
+static void
+trim_map_commit_done(spa_t *spa, vdev_t *vd)
+{
+ int c;
+
+ if (vd == NULL)
+ return;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ trim_map_vdev_commit_done(spa, vd);
+ } else {
+ for (c = 0; c < vd->vdev_children; c++)
+ trim_map_commit_done(spa, vd->vdev_child[c]);
+ }
+}
+
+static void
+trim_thread(void *arg)
+{
+ spa_t *spa = arg;
+ zio_t *zio;
+
+ for (;;) {
+ mutex_enter(&spa->spa_trim_lock);
+ if (spa->spa_trim_thread == NULL) {
+ spa->spa_trim_thread = curthread;
+ cv_signal(&spa->spa_trim_cv);
+ mutex_exit(&spa->spa_trim_lock);
+ thread_exit();
+ }
+ cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
+ mutex_exit(&spa->spa_trim_lock);
+
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL);
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ trim_map_commit(spa, zio, spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ (void) zio_wait(zio);
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ trim_map_commit_done(spa, spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ }
+}
+
+void
+trim_thread_create(spa_t *spa)
+{
+
+ if (zfs_notrim)
+ return;
+
+ mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
+ mutex_enter(&spa->spa_trim_lock);
+ spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
+ TS_RUN, minclsyspri);
+ mutex_exit(&spa->spa_trim_lock);
+}
+
+void
+trim_thread_destroy(spa_t *spa)
+{
+
+ if (zfs_notrim)
+ return;
+ if (spa->spa_trim_thread == NULL)
+ return;
+
+ mutex_enter(&spa->spa_trim_lock);
+ /* Setting spa_trim_thread to NULL tells the thread to stop. */
+ spa->spa_trim_thread = NULL;
+ cv_signal(&spa->spa_trim_cv);
+ /* The thread will set it back to != NULL on exit. */
+ while (spa->spa_trim_thread == NULL)
+ cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
+ spa->spa_trim_thread = NULL;
+ mutex_exit(&spa->spa_trim_lock);
+
+ cv_destroy(&spa->spa_trim_cv);
+ mutex_destroy(&spa->spa_trim_lock);
+}
+
+void
+trim_thread_wakeup(spa_t *spa)
+{
+
+ if (zfs_notrim)
+ return;
+ if (spa->spa_trim_thread == NULL)
+ return;
+
+ mutex_enter(&spa->spa_trim_lock);
+ cv_signal(&spa->spa_trim_cv);
+ mutex_exit(&spa->spa_trim_lock);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_notrim, int, 0644);
+MODULE_PARM_DESC(zfs_notrim, "Disable TRIM (default is 1; 0 IS HIGHLY EXPERIMENTAL, USE AT YOUR OWN RISK, EXPECT DATA LOSS)");
+
+module_param(trim_txg_limit, int, 0644);
+MODULE_PARM_DESC(trim_txg_limit, "Delay TRIMs by that many TXGs.");
+#endif
+
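
The stop handshake between trim_thread() and trim_thread_destroy()
above may be easier to see in a stand-alone user-space analogue (not
part of the commit): pthreads stand in for the kernel thread and
condvar primitives, and the hypothetical trim_thread_ptr plays the role
of spa_trim_thread.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static void *trim_thread_ptr;		/* NULL means "please stop" */

static void *
worker(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	for (;;) {
		if (trim_thread_ptr == NULL) {
			/* Acknowledge the stop request, then exit. */
			trim_thread_ptr = &cv;	/* any non-NULL value */
			pthread_cond_signal(&cv);
			pthread_mutex_unlock(&lock);
			return (NULL);
		}
		/* Wait for work (or a stop request); real work elided. */
		pthread_cond_wait(&cv, &lock);
	}
}

int
main(void)
{
	pthread_t tid;

	trim_thread_ptr = &tid;			/* non-NULL: thread running */
	pthread_create(&tid, NULL, worker, NULL);

	/* The trim_thread_destroy() equivalent. */
	pthread_mutex_lock(&lock);
	trim_thread_ptr = NULL;			/* request the stop */
	pthread_cond_signal(&cv);
	while (trim_thread_ptr == NULL)		/* wait for acknowledgement */
		pthread_cond_wait(&cv, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(tid, NULL);
	printf("trim thread stopped\n");
	return (0);
}
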
9 module/zfs/vdev.c
@@ -42,6 +42,7 @@
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
+#include <sys/trim_map.h>
/*
* Virtual device management.
@@ -1213,6 +1214,11 @@ vdev_open(vdev_t *vd)
if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
return (0);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vd->vdev_notrim = B_FALSE;
+ trim_map_create(vd);
+ }
+
for (c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
@@ -1451,6 +1457,9 @@ vdev_close(vdev_t *vd)
vdev_cache_purge(vd);
+ if (vd->vdev_ops->vdev_op_leaf)
+ trim_map_destroy(vd);
+
/*
* We record the previous state before we close it, so that if we are
* doing a reopen(), we don't generate FMA ereports if we notice that
112 module/zfs/vdev_disk.c
@@ -291,6 +291,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
/* Based on the minimum sector size set the block size */
*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
+ v->vdev_notrim = !blk_queue_discard(bdev_get_queue(bdev));
+
/* Try to set the io scheduler elevator algorithm */
(void) vdev_elevator_switch(v, zfs_vdev_scheduler);
@@ -640,6 +642,101 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
#endif /* HAVE_BIO_EMPTY_BARRIER */
static int
+vdev_disk_io_trim(struct block_device *bdev, zio_t *zio)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+ unsigned int max_discard_sectors;
+ dio_request_t *dr;
+ sector_t bio_sector, bio_sector_count;
+ int bio_count = 16;
+ int i = 0;
+
+ if (!blk_queue_discard(q))
+ return ENOTSUP;
+
+ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+ if (!max_discard_sectors)
+ return ENOTSUP;
+ if (q->limits.discard_granularity)
+ max_discard_sectors &= ~((q->limits.discard_granularity >> 9) - 1);
+
+retry:
+ dr = vdev_disk_dio_alloc(bio_count);
+ if (dr == NULL)
+ return ENOMEM;
+
+ dr->dr_zio = zio;
+ dr->dr_rw = REQ_WRITE | REQ_DISCARD;
+
+ if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
+ bio_set_flags_failfast(bdev, &dr->dr_rw);
+
+ /*
+ * When the discard size exceeds the maximum discard size for the
+ * request queue we are forced to break the discard in multiple bio's
+ * and wait for them all to complete.
+ */
+ bio_sector = zio->io_offset >> 9;
+ bio_sector_count = zio->io_size >> 9;
+ for (i = 0; i <= dr->dr_bio_count; i++) {
+
+ /* Finished constructing bio's for given buffer */
+ if (bio_sector_count <= 0)
+ break;
+
+ /*
+ * By default only 'bio_count' bio's per dio are allowed.
+ * However, if we find ourselves in a situation where more
+ * are needed we allocate a larger dio and warn the user.
+ */
+ if (dr->dr_bio_count == i) {
+ vdev_disk_dio_free(dr);
+ bio_count *= 2;
+ printk("WARNING: Resized discard bio's/dio to %d\n",bio_count);
+ goto retry;
+ }
+
+ dr->dr_bio[i] = bio_alloc(GFP_NOIO, 1);
+ if (dr->dr_bio[i] == NULL) {
+ vdev_disk_dio_free(dr);
+ return ENOMEM;
+ }
+
+ /* Matching put called by vdev_disk_physio_completion */
+ vdev_disk_dio_get(dr);
+
+ dr->dr_bio[i]->bi_bdev = bdev;
+ dr->dr_bio[i]->bi_sector = bio_sector;
+ dr->dr_bio[i]->bi_rw = dr->dr_rw;
+ dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+ dr->dr_bio[i]->bi_private = dr;
+
+ if (bio_sector_count > max_discard_sectors) {
+ dr->dr_bio[i]->bi_size = max_discard_sectors << 9;
+ bio_sector_count -= max_discard_sectors;
+ bio_sector += max_discard_sectors;
+ } else {
+ dr->dr_bio[i]->bi_size = bio_sector_count << 9;
+ bio_sector_count = 0;
+ }
+ }
+
+ /* Extra reference to protect dio_request during submit_bio */
+ vdev_disk_dio_get(dr);
+ if (zio)
+ zio->io_delay = jiffies_64;
+
+ /* Submit all bio's associated with this dio */
+ for (i = 0; i < dr->dr_bio_count; i++)
+ if (dr->dr_bio[i])
+ submit_bio(dr->dr_rw, dr->dr_bio[i]);
+
+ (void)vdev_disk_dio_put(dr);
+
+ return (0);
+}
+
+static int
vdev_disk_io_start(zio_t *zio)
{
vdev_t *v = zio->io_vd;
@@ -654,6 +751,9 @@ vdev_disk_io_start(zio_t *zio)
return ZIO_PIPELINE_CONTINUE;
}
+ if (zio->io_cmd == DKIOCTRIM)
+ break;
+
switch (zio->io_cmd) {
case DKIOCFLUSHWRITECACHE:
@@ -694,8 +794,16 @@ vdev_disk_io_start(zio_t *zio)
return ZIO_PIPELINE_CONTINUE;
}
- error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
- zio->io_size, zio->io_offset, flags);
+ if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM)
+ {
+ error = vdev_disk_io_trim(vd->vd_bdev, zio);
+ if (error == ENOTSUP)
+ v->vdev_notrim = B_TRUE;
+ }
+ else
+ error = __vdev_disk_physio(vd->vd_bdev, zio,
+ zio->io_data, zio->io_size,
+ zio->io_offset, flags);
if (error) {
zio->io_error = error;
return ZIO_PIPELINE_CONTINUE;
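
One detail of vdev_disk_io_trim() above worth calling out: the queue's
max_discard_sectors is rounded down to a multiple of the discard
granularity with a mask, which assumes the granularity is a power of
two. A stand-alone sketch with hypothetical values:

#include <stdio.h>

int
main(void)
{
	unsigned int max_discard_sectors = 65535;	/* example queue limit */
	unsigned int granularity = 4096;		/* bytes, power of two */
	unsigned int gran_sectors = granularity >> 9;	/* 8 sectors */

	/* Same masking as vdev_disk_io_trim(): round down to a multiple. */
	max_discard_sectors &= ~(gran_sectors - 1);
	printf("%u\n", max_discard_sectors);		/* prints 65528 */
	return (0);
}
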
8 module/zfs/vdev_label.c
@@ -141,6 +141,7 @@
#include <sys/metaslab.h>
#include <sys/zio.h>
#include <sys/dsl_scan.h>
+#include <sys/trim_map.h>
#include <sys/fs/zfs.h>
/*
@@ -1221,5 +1222,10 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
* to disk to ensure that all odd-label updates are committed to
* stable storage before the next transaction group begins.
*/
- return (vdev_label_sync_list(spa, 1, txg, flags));
+ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0)
+ return (error);
+
+ trim_thread_wakeup(spa);
+
+ return (0);
}
7 module/zfs/vdev_mirror.c
@@ -290,10 +290,11 @@ vdev_mirror_io_start(zio_t *zio)
c = vdev_mirror_child_select(zio);
children = (c >= 0);
} else {
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_FREE);
/*
- * Writes go to all children.
+ * Writes and frees go to all children.
*/
c = 0;
children = mm->mm_children;
@@ -374,6 +375,8 @@ vdev_mirror_io_done(zio_t *zio)
zio->io_error = vdev_mirror_worst_error(mm);
}
return;
+ } else if (zio->io_type == ZIO_TYPE_FREE) {
+ return;
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
32 module/zfs/vdev_raidz.c
@@ -503,14 +503,20 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
ASSERT3U(rm->rm_nskip, <=, nparity);
- for (c = 0; c < rm->rm_firstdatacol; c++)
- rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+ if (zio->io_type != ZIO_TYPE_FREE) {
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ rm->rm_col[c].rc_data =
+ zio_buf_alloc(rm->rm_col[c].rc_size);
+ }
- rm->rm_col[c].rc_data = zio->io_data;
+ rm->rm_col[c].rc_data = zio->io_data;
- for (c = c + 1; c < acols; c++)
- rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
- rm->rm_col[c - 1].rc_size;
+ for (c = c + 1; c < acols; c++) {
+ rm->rm_col[c].rc_data =
+ (char *)rm->rm_col[c - 1].rc_data +
+ rm->rm_col[c - 1].rc_size;
+ }
+ }
/*
* If all data stored spans all columns, there's a danger that parity
@@ -1531,6 +1537,18 @@ vdev_raidz_io_start(zio_t *zio)
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
+ if (zio->io_type == ZIO_TYPE_FREE) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ }
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
if (zio->io_type == ZIO_TYPE_WRITE) {
vdev_raidz_generate_parity(rm);
@@ -1916,6 +1934,8 @@ vdev_raidz_io_done(zio_t *zio)
zio->io_error = vdev_raidz_worst_error(rm);
return;
+ } else if (zio->io_type == ZIO_TYPE_FREE) {
+ return;
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
52 module/zfs/zio.c
@@ -36,6 +36,7 @@
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
+#include <sys/trim_map.h>
/*
* ==========================================================================
@@ -55,6 +56,7 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
10, /* ZIO_PRIORITY_RESILVER */
20, /* ZIO_PRIORITY_SCRUB */
2, /* ZIO_PRIORITY_DDT_PREFETCH */
+ 30, /* ZIO_PRIORITY_TRIM */
};
/*
@@ -553,7 +555,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
{
zio_t *zio;
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
@@ -803,15 +805,16 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
}
zio_t *
-zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private, int priority,
+ enum zio_flag flags)
{
zio_t *zio;
int c;
if (vd->vdev_children == 0) {
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
- ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
+ zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
+ ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
zio->io_cmd = cmd;
@@ -820,7 +823,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
for (c = 0; c < vd->vdev_children; c++)
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
- done, private, priority, flags));
+ offset, size, done, private, priority, flags));
}
return (zio);
@@ -945,12 +948,23 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
void
zio_flush(zio_t *zio, vdev_t *vd)
{
- zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
+ zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
NULL, NULL, ZIO_PRIORITY_NOW,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
void
+zio_trim(zio_t *zio, vdev_t *vd, uint64_t offset, uint64_t size)
+{
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCTRIM, offset, size,
+ NULL, NULL, ZIO_PRIORITY_TRIM,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
+}
+
+void
zio_shrink(zio_t *zio, uint64_t size)
{
ASSERT(zio->io_executor == NULL);
@@ -2409,6 +2423,12 @@ zio_vdev_io_start(zio_t *zio)
return (vdev_mirror_ops.vdev_op_io_start(zio));
}
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
+ if (!BP_IS_GANG(zio->io_bp))
+ trim_map_free(zio);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
/*
* We keep track of time-sensitive I/Os so that the scan thread
* can quickly react to certain workloads. In particular, we care
@@ -2444,7 +2464,7 @@ zio_vdev_io_start(zio_t *zio)
ASSERT(P2PHASE(zio->io_offset, align) == 0);
ASSERT(P2PHASE(zio->io_size, align) == 0);
- VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+ VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
/*
* If this is a repair I/O, and there's no self-healing involved --
@@ -2484,6 +2504,11 @@ zio_vdev_io_start(zio_t *zio)
}
}
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE) {
+ if (!trim_map_write_start(zio))
+ return (ZIO_PIPELINE_STOP);
+ }
+
return (vd->vdev_ops->vdev_op_io_start(zio));
}
@@ -2497,9 +2522,16 @@ zio_vdev_io_done(zio_t *zio)
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
return (ZIO_PIPELINE_STOP);
- ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
+
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ zio->io_type == ZIO_TYPE_WRITE) {
+ trim_map_write_done(zio);
+ }
- if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
vdev_queue_io_done(zio);
