Skip to content
This repository has been archived by the owner on Jan 13, 2022. It is now read-only.

Commit

Permalink
Changes to add "writecache" mode - write only caching
Browse files Browse the repository at this point in the history
Summary: Commit contains 2 changes
1) Add a "writecache" mode, a variant of writeback caching where
only incoming writes are cached.
2) Improves disk write clustering for disk cleaning by reading
all the dirty blocks from flash in a chain and then issuing a single
large write for the entire chain (flashcache_kcopy.c).

Test Plan:

Reviewers: Herman Lee (facebook).

CC:

Task ID: #

Blame Rev:
  • Loading branch information
Mohan Srinivasan committed Aug 1, 2014
1 parent d359499 commit 40b8b41
Show file tree
Hide file tree
Showing 9 changed files with 880 additions and 169 deletions.
8 changes: 7 additions & 1 deletion doc/flashcache-sa-guide.txt
Expand Up @@ -29,6 +29,9 @@ are cached (tunable).
Writeback - fastest but less safe. Writes only go to the ssd initially, and
based on various policies are written to disk later. All disk reads are
cached (tunable).

Writeonly - variant of writeback caching (the "write cache" mode selected
with the -w option of flashcache_create). In this mode, only incoming writes
are cached. No reads are ever cached.

Cache Persistence :
=================
Expand All @@ -55,7 +58,7 @@ instead of using dmsetup.

flashcache_create : Create a new flashcache volume.

flashcache_create [-v] -p back|around|thru [-s cache size] [-b block size] cachedevname ssd_devname disk_devname
flashcache_create [-v] -p back|around|thru [-s cache size] [-w] [-b block size] cachedevname ssd_devname disk_devname
-v : verbose.
-p : cache mode (writeback/writethrough/writearound).
-s : cache size. Optional. If this is not specified, the entire ssd device
Expand All @@ -66,6 +69,9 @@ flashcache_create [-v] -p back|around|thru [-s cache size] [-b block size] cache
(A 4KB blocksize is the correct choice for the vast majority of
applications. But see the section "Cache Blocksize selection" below).
-f : force create. Bypass checks (e.g. for ssd sectorsize).
-w : write cache mode. Only writes are cached, not reads. Only valid
together with writeback mode (-p back).
-d : disk associativity, within each cache set, we store several contiguous
disk extents. Defaults to off.

Examples :
flashcache_create -p back -s 1g -b 4k cachedev /dev/sdc /dev/sdb
Expand Down
2 changes: 1 addition & 1 deletion src/Makefile
Expand Up @@ -25,7 +25,7 @@ ifneq "$(OPENVZ_VER)" ""
endif

obj-m += flashcache.o
flashcache-objs := flashcache_conf.o flashcache_main.o flashcache_subr.o flashcache_ioctl.o flashcache_procfs.o flashcache_reclaim.o
flashcache-objs := flashcache_conf.o flashcache_main.o flashcache_subr.o flashcache_ioctl.o flashcache_procfs.o flashcache_reclaim.o flashcache_kcopy.o

.PHONY: all
all: modules utils
Expand Down
57 changes: 55 additions & 2 deletions src/flashcache.h
Expand Up @@ -25,7 +25,7 @@
#ifndef FLASHCACHE_H
#define FLASHCACHE_H

#define FLASHCACHE_VERSION 3
#define FLASHCACHE_VERSION 4

#define DEV_PATHLEN 128

Expand Down Expand Up @@ -295,7 +295,8 @@ struct cache_c {
unsigned int assoc_shift; /* Consecutive blocks size in bits */
unsigned int num_sets; /* Number of cache sets */
int cache_mode;

int write_only_cache;

wait_queue_head_t destroyq; /* Wait queue for I/O completion */
/* XXX - Updates of nr_jobs should happen inside the lock. But doing it outside
is OK since the filesystem is unmounted at this point */
Expand Down Expand Up @@ -353,6 +354,9 @@ struct cache_c {
spinlock_t diskclean_list_lock;
struct diskclean_buf_ *diskclean_buf_head;

spinlock_t kcopy_job_alloc_lock;
struct flashcache_copy_job *kcopy_jobs_head;

struct cache_c *next_cache;

void *sysctl_handle;
Expand Down Expand Up @@ -396,6 +400,10 @@ struct cache_c {
struct sequential_io seq_recent_ios[SEQUENTIAL_TRACKER_QUEUE_DEPTH];
struct sequential_io *seq_io_head;
struct sequential_io *seq_io_tail;

#define FLASHCACHE_WRITE_CLUST_HIST_SIZE 128
unsigned long write_clust_hist[FLASHCACHE_WRITE_CLUST_HIST_SIZE];
unsigned long write_clust_hist_ovf;
};

/* kcached/pending job states */
Expand Down Expand Up @@ -435,6 +443,30 @@ struct pending_job {
int index;
struct pending_job *prev, *next;
};

/*
 * Bookkeeping for one clustered disk-cleaning copy. The commit that
 * introduced this structure describes the scheme as: read all the dirty
 * blocks of a chain from flash, then issue one large write to disk for the
 * whole chain (flashcache_kcopy.c).
 *
 * NOTE(review): the per-field notes below are inferred from field names and
 * types only; confirm against flashcache_kcopy.c before relying on them.
 */
struct flashcache_copy_job {
struct list_head list;
struct cache_c *dmc; /* cache this job belongs to */
int nr_writes; /* presumably the number of cache blocks in the chain — TODO confirm */
int reads_completed; /* presumably counts finished flash reads — TODO confirm */
int write_kickoff; /* presumably flags that the disk write was started — TODO confirm */
struct page_list *pl_base;
struct page_list *pl_list_head;
struct page **page_base;
struct kcached_job **job_base;
/*
 * I/O regions for the copy: a single disk region and a pointer to the
 * cache region(s). The region type is struct io_region when building
 * against kernels older than 2.6.26 and struct dm_io_region otherwise.
 */
struct job_io_regions_ {
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
struct io_region disk;
struct io_region *cache;
#else
struct dm_io_region disk;
struct dm_io_region *cache;
#endif
} job_io_regions;
int error;
spinlock_t copy_job_spinlock;
/* presumably links jobs on the cache's kcopy_jobs_head list — TODO confirm */
struct flashcache_copy_job *next;
};
#endif /* __KERNEL__ */

/* Cache Modes */
Expand Down Expand Up @@ -507,6 +539,7 @@ struct flash_superblock {
u_int32_t cache_version;
u_int32_t md_block_size;
u_int32_t disk_assoc;
u_int32_t write_only_cache;
};

/*
Expand Down Expand Up @@ -553,6 +586,10 @@ struct flash_cacheblock {
#define INDEX_TO_CACHE_ADDR(DMC, INDEX) \
(((sector_t)(INDEX) << (DMC)->block_shift) + (DMC)->md_blocks * MD_SECTORS_PER_BLOCK((DMC)))

/*
 * Inverse of INDEX_TO_CACHE_ADDR: subtract the metadata area
 * (md_blocks * MD_SECTORS_PER_BLOCK) from a cache sector address and shift
 * right by block_shift to recover the cache block index. The result is
 * cast to int.
 */
#define CACHE_ADDR_TO_INDEX(DMC, CACHE_ADDR) \
((int)(((CACHE_ADDR) - ((DMC)->md_blocks * MD_SECTORS_PER_BLOCK((DMC)))) >> (DMC)->block_shift))


#ifdef __KERNEL__

/* Cache persistence */
Expand Down Expand Up @@ -671,6 +708,9 @@ int flashcache_md_io_empty(void);
int flashcache_md_complete_empty(void);
void flashcache_md_write_done(struct kcached_job *job);
void flashcache_do_pending(struct kcached_job *job);
void flashcache_free_pending_jobs(struct cache_c *dmc, struct cacheblock *cacheblk,
int error);

void flashcache_md_write(struct kcached_job *job);
void flashcache_md_write_kickoff(struct kcached_job *job);
void flashcache_do_io(struct kcached_job *job);
Expand Down Expand Up @@ -741,6 +781,19 @@ int flashcache_diskclean_alloc(struct cache_c *dmc,
void flashcache_diskclean_free(struct cache_c *dmc, struct dbn_index_pair *buf1,
struct dbn_index_pair *buf2);

/*
 * Declarations added for clustered disk cleaning.
 * NOTE(review): the definitions are not visible here; the notes below are
 * limited to what the signatures and the call sites in flashcache_conf.c
 * show.
 */
unsigned long hash_block(struct cache_c *dmc, sector_t dbn);
/* Takes a cache set plus a list of nr_writes dbn/index pairs. */
void flashcache_copy_data(struct cache_c *dmc, struct cache_set *cache_set,
int nr_writes, struct dbn_index_pair *writes_list);

/* Each of these operates on a single struct flashcache_copy_job. */
void push_cleaning_read_complete(struct flashcache_copy_job *job);
void push_cleaning_write_complete(struct flashcache_copy_job *job);
void flashcache_clean_write_kickoff(struct flashcache_copy_job *job);
void flashcache_clean_md_write_kickoff(struct flashcache_copy_job *job);

/*
 * flashcache_kcopy_init() returns non-zero on failure; flashcache_ctr()
 * reports that as -ENOMEM. flashcache_kcopy_destroy() is called from
 * flashcache_dtr() and from flashcache_ctr()'s error unwinding.
 */
int flashcache_kcopy_init(struct cache_c *dmc);
void flashcache_kcopy_destroy(struct cache_c *dmc);


#endif /* __KERNEL__ */

#endif
84 changes: 71 additions & 13 deletions src/flashcache_conf.c
Expand Up @@ -302,6 +302,7 @@ flashcache_writeback_md_store(struct cache_c *dmc)
header->cache_devsize = to_sector(dmc->cache_dev->bdev->bd_inode->i_size);
header->disk_devsize = to_sector(dmc->disk_dev->bdev->bd_inode->i_size);
header->cache_version = dmc->on_ssd_version;
header->write_only_cache = dmc->write_only_cache;

DPRINTK("Store metadata to disk: block size(%u), md block size(%u), cache size(%llu)" \
"associativity(%u)",
Expand Down Expand Up @@ -554,6 +555,7 @@ flashcache_writeback_create(struct cache_c *dmc, int force)
header->cache_devsize = to_sector(dmc->cache_dev->bdev->bd_inode->i_size);
header->disk_devsize = to_sector(dmc->disk_dev->bdev->bd_inode->i_size);
dmc->on_ssd_version = header->cache_version = FLASHCACHE_VERSION;
header->write_only_cache = dmc->write_only_cache;
where.sector = 0;
where.count = dmc->md_block_size;

Expand Down Expand Up @@ -620,12 +622,18 @@ flashcache_writeback_load(struct cache_c *dmc)
return 1;
}
dmc->disk_assoc = header->disk_assoc;
dmc->write_only_cache = header->write_only_cache;

if (header->cache_version < 3)
/* Disk Assoc was introduced in On SSD version 3 */
dmc->disk_assoc = 0;
if (dmc->disk_assoc != 0)
dmc->disk_assoc_shift = ffs(dmc->disk_assoc) - 1;

if (header->cache_version < 4)
/* write_only_cache was introduced in On SSD version 4 */
dmc->write_only_cache = 0;

dmc->on_ssd_version = header->cache_version;

DPRINTK("Loaded cache conf: version(%d), block size(%u), md block size(%u), cache size(%llu), " \
Expand Down Expand Up @@ -1003,17 +1011,19 @@ flashcache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
dmc->assoc = DEFAULT_CACHE_ASSOC;
dmc->assoc_shift = ffs(dmc->assoc) - 1;

if (argc >= 8) {
if (argc >= 9) {
if (sscanf(argv[8], "%u", &dmc->disk_assoc) != 1) {
ti->error = "flashcache: Invalid disk associativity";
r = -EINVAL;
goto bad3;
}
if (!dmc->disk_assoc || (dmc->disk_assoc & (dmc->disk_assoc - 1)) ||
dmc->disk_assoc > FLASHCACHE_MAX_DISK_ASSOC ||
dmc->disk_assoc < FLASHCACHE_MIN_DISK_ASSOC ||
dmc->size < dmc->disk_assoc ||
(dmc->assoc * dmc->block_shift) < dmc->disk_assoc) {
/* disk_assoc of 0 is permitted value */
if ((dmc->disk_assoc > 0) &&
((!dmc->disk_assoc || (dmc->disk_assoc & (dmc->disk_assoc - 1)) ||
dmc->disk_assoc > FLASHCACHE_MAX_DISK_ASSOC ||
dmc->disk_assoc < FLASHCACHE_MIN_DISK_ASSOC ||
dmc->size < dmc->disk_assoc ||
(dmc->assoc * dmc->block_shift) < dmc->disk_assoc))) {
printk(KERN_ERR "Invalid Disk Assoc assoc %d disk_assoc %d size %ld\n",
dmc->assoc, dmc->disk_assoc, dmc->size);
ti->error = "flashcache: Invalid disk associativity";
Expand All @@ -1024,9 +1034,32 @@ flashcache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (dmc->disk_assoc != 0)
dmc->disk_assoc_shift = ffs(dmc->disk_assoc) - 1;

if (argc >= 10) {
if (sscanf(argv[9], "%u", &dmc->write_only_cache) != 1) {
ti->error = "flashcache: Invalid Write Cache setting";
r = -EINVAL;
goto bad3;
}
if ((dmc->write_only_cache == 1) &&
(dmc->cache_mode != FLASHCACHE_WRITE_BACK)) {
printk(KERN_ERR "Write Cache Setting only valid with WRITE_BACK %d\n",
dmc->write_only_cache);
ti->error = "flashcache: Invalid Write Cache Setting";
r = -EINVAL;
goto bad3;
}
if (dmc->write_only_cache < 0 || dmc->write_only_cache > 1) {
printk(KERN_ERR "Invalid Write Cache Setting %d\n",
dmc->write_only_cache);
ti->error = "flashcache: Invalid Write Cache Setting";
r = -EINVAL;
goto bad3;
}
}

if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) {
if (argc >= 9) {
if (sscanf(argv[9], "%u", &dmc->md_block_size) != 1) {
if (argc >= 11) {
if (sscanf(argv[10], "%u", &dmc->md_block_size) != 1) {
ti->error = "flashcache: Invalid metadata block size";
r = -EINVAL;
goto bad3;
Expand Down Expand Up @@ -1106,12 +1139,22 @@ flashcache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad3;
}

if (flashcache_kcopy_init(dmc)) {
ti->error = "Unable to allocate memory";
r = -ENOMEM;
flashcache_diskclean_destroy(dmc);
vfree((void *)dmc->cache);
vfree((void *)dmc->cache_sets);
goto bad3;
}

if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) {
order = (dmc->md_blocks - 1) * sizeof(struct cache_md_block_head);
dmc->md_blocks_buf = (struct cache_md_block_head *)vmalloc(order);
if (!dmc->md_blocks_buf) {
ti->error = "Unable to allocate memory";
r = -ENOMEM;
flashcache_kcopy_destroy(dmc);
flashcache_diskclean_destroy(dmc);
vfree((void *)dmc->cache);
vfree((void *)dmc->cache_sets);
Expand Down Expand Up @@ -1160,7 +1203,9 @@ flashcache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
dmc->sysctl_fast_remove = 0;
dmc->sysctl_cache_all = 1;
dmc->sysctl_fallow_clean_speed = FALLOW_CLEAN_SPEED;
dmc->sysctl_fallow_delay = FALLOW_DELAY;
if (dmc->write_only_cache == 0)
/* Don't bother with fallow cleaning for write only caching */
dmc->sysctl_fallow_delay = FALLOW_DELAY;
dmc->sysctl_skip_seq_thresh_kb = SKIP_SEQUENTIAL_THRESHOLD;
dmc->sysctl_clean_on_read_miss = 0;
dmc->sysctl_clean_on_write_miss = 0;
Expand Down Expand Up @@ -1382,6 +1427,7 @@ flashcache_dtr(struct dm_target *ti)
flashcache_dtr_stats_print(dmc);
flashcache_hash_destroy(dmc);
flashcache_diskclean_destroy(dmc);
flashcache_kcopy_destroy(dmc);
vfree((void *)dmc->cache);
vfree((void *)dmc->cache_sets);
if (dmc->cache_mode == FLASHCACHE_WRITE_BACK)
Expand Down Expand Up @@ -1535,9 +1581,12 @@ flashcache_status_table(struct cache_c *dmc, status_type_t type,
cache_pct = 0;
dirty_pct = 0;
}
if (dmc->cache_mode == FLASHCACHE_WRITE_BACK)
cache_mode = "WRITE_BACK";
else if (dmc->cache_mode == FLASHCACHE_WRITE_THROUGH)
if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) {
if (dmc->write_only_cache)
cache_mode = "WRITE_CACHE";
else
cache_mode = "WRITE_BACK";
} else if (dmc->cache_mode == FLASHCACHE_WRITE_THROUGH)
cache_mode = "WRITE_THROUGH";
else
cache_mode = "WRITE_AROUND";
Expand Down Expand Up @@ -1572,6 +1621,15 @@ flashcache_status_table(struct cache_c *dmc, status_type_t type,
if (size_hist[i] > 0)
DMEMIT("%d:%llu ", i*512, size_hist[i]);
}
#if 0
DMEMIT("\n");
DMEMIT("Write Clustering Hist: ");
for (i = 0 ; i < FLASHCACHE_WRITE_CLUST_HIST_SIZE ; i++) {
if (dmc->write_clust_hist[i] > 0)
DMEMIT("%d:%llu ", i, dmc->write_clust_hist[i]);
}
DMEMIT(">=128:%llu ", dmc->write_clust_hist_ovf);
#endif
}

/*
Expand Down Expand Up @@ -1792,7 +1850,7 @@ flashcache_init(void)
/*
* Destroy a cache target.
*/
void
void __exit
flashcache_exit(void)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
Expand Down

0 comments on commit 40b8b41

Please sign in to comment.