From 618e512d2b9a28ef11bee8e27fff08bb3aa95ccc Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Thu, 19 Feb 2015 18:26:43 +0530 Subject: [PATCH 1/6] rfc.sh: relax checkpatch.pl on RFC patches Change-Id: I747e1c4eef4f147977da40eafceb9fec5d869a1c Signed-off-by: Venky Shankar --- rfc.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/rfc.sh b/rfc.sh index 5528629c7fe..e7b45562d9e 100755 --- a/rfc.sh +++ b/rfc.sh @@ -129,14 +129,17 @@ main() return; fi - check_patches_for_coding_style; + bug=$(git show --format='%b' | grep -i '^BUG: ' | awk '{print $2}'); + + # relax checkpatch.pl on RFC patches + if [ !-z "$bug" ]; then + check_patches_for_coding_style; + fi rebase_changes; assert_diverge; - bug=$(git show --format='%b' | grep -i '^BUG: ' | awk '{print $2}'); - if [ "$DRY_RUN" = 1 ]; then drier='echo -e Please use the following command to send your commits to review:\n\n' else From eea693be445ead767e68ce252fd25155895b9778 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Tue, 3 Feb 2015 19:03:30 +0530 Subject: [PATCH 2/6] libglusterfs/rot-buffs: rotational buffers This patch introduces rotational buffers aiming at the classic multiple producer and multiple consumer problem. A fixed set of buffer list is allocated during initialization, where each list consist of a list of buffers. Each buffer is an iovec pointing to a memory region of fixed allocation size. Multiple producers write data the these buffers. A buffer list starts with a single buffer (iovec) and allocates more when required (although this can be preallocatd in multiples of k). rot-buffs allow multiple producers to write data parallely with a bit of extra cost of taking locks. Therefore, it's much suited for large writes. Multiple producers are allowed to write in the buffer parallely by "reserving" write space for selected number of bytes and returning pointer to the start of the reserved area. The write size is selected by the producer before it starts the write (which is often known). Therefore, the write itself need not be serialized -- just the space reservation needs to be done safely. The other part is when a consumer kicks in to consume what has been produced. At this point, a buffer list switch is performed. The "current" buffer list pointer is safely pointed to the next available buffer list. New writes are now directed to the just switched buffer list (the old buffer list is now considered out of rotation). Note that the old buffer still may have producers in progress (pending writes), so the consumer has to wait till the writers are drained. Currently this is the slow path for producers (write completion) and needs to be improved. Currently, there is special handling for cases where the number of consumers match (or exceed) the number of producers, which could result in writer starvation. In this scenario, when a consumers requests a buffer list for consumption, a check is performed for writer starvation and consumption is denied until at least another buffer list is ready of the producer for writes, i.e., one (or more) consumer(s) completed, thereby putting the buffer list back in rotation. [ NOTE: I've not performance tested this producer-consumer model yet but using it changelog for event notification. The list of buffers (iovecs) are directly passed to the RPC layer. Before performance tests are done, the slow path needs to be improved and some other safety checks needs to be added such as write sizes exceeding iovec allocation size. ] Change-Id: I88d235522b05ab82509aba861374a2312bff57f2 Signed-off-by: Venky Shankar --- libglusterfs/src/Makefile.am | 5 +- libglusterfs/src/list.h | 17 +- libglusterfs/src/mem-types.h | 3 + libglusterfs/src/rot-buffs.c | 349 +++++++++++++++++++++++++++++++++++ libglusterfs/src/rot-buffs.h | 134 ++++++++++++++ 5 files changed, 505 insertions(+), 3 deletions(-) create mode 100644 libglusterfs/src/rot-buffs.c create mode 100644 libglusterfs/src/rot-buffs.h diff --git a/libglusterfs/src/Makefile.am b/libglusterfs/src/Makefile.am index d117cc17605..637c5902ac0 100644 --- a/libglusterfs/src/Makefile.am +++ b/libglusterfs/src/Makefile.am @@ -28,7 +28,8 @@ libglusterfs_la_SOURCES = dict.c xlator.c logging.c \ event-history.c gidcache.c ctx.c client_t.c event-poll.c event-epoll.c \ $(CONTRIBDIR)/libgen/basename_r.c $(CONTRIBDIR)/libgen/dirname_r.c \ $(CONTRIBDIR)/stdlib/gf_mkostemp.c strfd.c \ - $(CONTRIBDIR)/mount/mntent.c $(CONTRIBDIR)/libexecinfo/execinfo.c + $(CONTRIBDIR)/mount/mntent.c $(CONTRIBDIR)/libexecinfo/execinfo.c \ + rot-buffs.c nodist_libglusterfs_la_SOURCES = y.tab.c graph.lex.c @@ -46,7 +47,7 @@ noinst_HEADERS = common-utils.h defaults.h dict.h glusterfs.h hashfn.h timespec. gidcache.h client_t.h glusterfs-acl.h glfs-message-id.h \ template-component-messages.h strfd.h \ $(CONTRIBDIR)/mount/mntent_compat.h lvm-defaults.h \ - $(CONTRIBDIR)/libexecinfo/execinfo_compat.h + $(CONTRIBDIR)/libexecinfo/execinfo_compat.h rot-buffs.h EXTRA_DIST = graph.l graph.y diff --git a/libglusterfs/src/list.h b/libglusterfs/src/list.h index 04b4047129f..c30a20e036a 100644 --- a/libglusterfs/src/list.h +++ b/libglusterfs/src/list.h @@ -179,15 +179,30 @@ list_append_init (struct list_head *list, struct list_head *head) INIT_LIST_HEAD (list); } +static inline int +list_is_last (struct list_head *list, struct list_head *head) +{ + return (list->next == head); +} #define list_entry(ptr, type, member) \ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) + +#define list_prev_entry(pos, member) \ + list_entry((pos)->member.prev, typeof(*(pos)), member) #define list_for_each(pos, head) \ for (pos = (head)->next; pos != (head); pos = pos->next) - #define list_for_each_entry(pos, head, member) \ for (pos = list_entry((head)->next, typeof(*pos), member); \ &pos->member != (head); \ diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h index 0db06f52c98..07fa46bd1e6 100644 --- a/libglusterfs/src/mem-types.h +++ b/libglusterfs/src/mem-types.h @@ -128,6 +128,9 @@ enum gf_common_mem_types_ { gf_common_mt_ereg = 112, gf_common_mt_wr = 113, gf_common_mt_rdma_arena_mr = 114, + gf_common_mt_rbuf_t = 115, + gf_common_mt_rlist_t = 116, + gf_common_mt_rvec_t = 117, gf_common_mt_end }; #endif diff --git a/libglusterfs/src/rot-buffs.c b/libglusterfs/src/rot-buffs.c new file mode 100644 index 00000000000..a24eebaa562 --- /dev/null +++ b/libglusterfs/src/rot-buffs.c @@ -0,0 +1,349 @@ +/* + Copyright (c) 2008-2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "mem-types.h" +#include "mem-pool.h" + +#include "rot-buffs.h" + +/** + * Producer-Consumer based on top of rotational buffers. + * + * This favours writers (producer) and keeps the critical section + * light weight. Buffer switch happens when a consumer wants to + * consume data. This is the slow path and waits for pending + * writes to finish. + * + * TODO: use bitmaps during lockless writer completion. + */ + +inline void * +rbuf_alloc_rvec () +{ + return GF_CALLOC (1, RLIST_IOV_MELDED_ALLOC_SIZE, gf_common_mt_rvec_t); +} + +inline void +rbuf_reset_rvec (rbuf_iovec_t *rvec) +{ + /* iov_base is _never_ modified */ + rvec->iov.iov_len = 0; +} + +/* TODO: alloc multiple rbuf_iovec_t */ +inline int +rlist_add_new_vec (rbuf_list_t *rlist) +{ + rbuf_iovec_t *rvec = NULL; + + rvec = (rbuf_iovec_t *) rbuf_alloc_rvec (); + if (!rvec) + return -1; + INIT_LIST_HEAD (&rvec->list); + rvec->iov.iov_base = ((char *)rvec) + RBUF_IOVEC_SIZE; + rvec->iov.iov_len = 0; + + list_add_tail (&rvec->list, &rlist->veclist); + + rlist->rvec = rvec; /* cache the latest */ + rlist->used++; + return 0; +} + +inline void +rlist_free_rvec (rbuf_iovec_t *rvec) +{ + if (!rvec) + return; + list_del (&rvec->list); + GF_FREE (rvec); +} + +inline void +rlist_purge_all_rvec (rbuf_list_t *rlist) +{ + rbuf_iovec_t *rvec = NULL; + + if (!rlist) + return; + while (list_empty (&rlist->veclist)) { + rvec = list_first_entry (&rlist->veclist, rbuf_iovec_t, list); + rlist_free_rvec (rvec); + } +} + +inline void +rbuf_purge_rlist (rbuf_t *rbuf) +{ + rbuf_list_t *rlist = NULL; + + while (list_empty (&rbuf->freelist)) { + rlist = list_first_entry (&rbuf->freelist, rbuf_list_t, list); + list_del (&rlist->list); + + rlist_purge_all_rvec (rlist); + + (void) pthread_mutex_destroy (&rlist->b_lock); + (void) pthread_cond_destroy (&rlist->b_cond); + + GF_FREE (rlist); + } +} + +rbuf_t * +rbuf_init (int bufcount) +{ + int j = 0; + int ret = 0; + rbuf_t *rbuf = NULL; + rbuf_list_t *rlist = NULL; + + if (bufcount <= 0) + bufcount = ROT_BUFF_DEFAULT_COUNT; + + rbuf = GF_CALLOC (1, sizeof (rbuf_t), gf_common_mt_rbuf_t); + if (!rbuf) + goto error_return; + + ret = pthread_spin_init (&rbuf->lock, 0); + if (ret != 0) + goto dealloc_rbuf; + INIT_LIST_HEAD (&rbuf->freelist); + + /* it could have been one big calloc() but this is just once.. */ + for (j = 0; j < bufcount; j++) { + rlist = GF_CALLOC (1, + sizeof (rbuf_list_t), gf_common_mt_rlist_t); + if (!rlist) { + ret = -1; + break; + } + + INIT_LIST_HEAD (&rlist->list); + INIT_LIST_HEAD (&rlist->veclist); + + rlist->pending = rlist->completed = 0; + + ret = rlist_add_new_vec (rlist); + if (ret) + break; + + rlist->awaiting = _gf_false; + ret = pthread_mutex_init (&rlist->b_lock, 0); + if (ret != 0) { + GF_FREE (rlist); + break; + } + + ret = pthread_cond_init (&rlist->b_cond, 0); + if (ret != 0) { + GF_FREE (rlist); + break; + } + + MARK_RLIST_ACTIVE (rlist); + list_add_tail (&rlist->list, &rbuf->freelist); + } + + if (ret != 0) + goto dealloc_rlist; + + /* cache currently used buffer: first in the list */ + rbuf->current = list_first_entry (&rbuf->freelist, rbuf_list_t, list); + return rbuf; + + dealloc_rlist: + rbuf_purge_rlist (rbuf); + dealloc_rbuf: + (void) pthread_spin_destroy (&rbuf->lock); + GF_FREE (rbuf); + error_return: + return NULL; +} + +void +rbuf_dtor (rbuf_t *rbuf) +{ + rbuf->current = NULL; + rbuf_purge_rlist (rbuf); + (void) pthread_spin_destroy (&rbuf->lock); + + GF_FREE (rbuf); +} + +inline char * +rbuf_adjust_write_area (struct iovec *iov, size_t bytes) +{ + char *wbuf = NULL; + + wbuf = iov->iov_base + iov->iov_len; + iov->iov_len += bytes; + return wbuf; +} + +inline char * +rbuf_alloc_write_area (rbuf_list_t *rlist, size_t bytes) +{ + int ret = 0; + struct iovec *iov = NULL; + + /* check for available space in _current_ IO buffer */ + iov = &rlist->rvec->iov; + if (iov->iov_len + bytes <= ROT_BUFF_ALLOC_SIZE) + return rbuf_adjust_write_area (iov, bytes); /* fast path */ + + /* not enough bytes, try next available buffers */ + if (list_is_last (&rlist->rvec->list, &rlist->veclist)) { + /* OH! consumed all vector buffers */ + ret = rlist_add_new_vec (rlist); + if (ret) + goto error_return; + } else { + /* not the end, have available rbuf_iovec's */ + rlist->rvec = list_next_entry (rlist->rvec, list); + rlist->used++; + rbuf_reset_rvec (rlist->rvec); + } + + iov = &rlist->rvec->iov; + return rbuf_adjust_write_area (iov, bytes); + + error_return: + return NULL; +} + +char * +rbuf_reserve_write_area (rbuf_t *rbuf, size_t bytes, void **opaque) +{ + char *wbuf = NULL; + rbuf_list_t *rlist = NULL; + + if (!rbuf || (bytes <= 0) || !opaque) + return NULL; + + pthread_spin_lock (&rbuf->lock); + { + rlist = RBUF_CURRENT_BUFFER (rbuf); + wbuf = rbuf_alloc_write_area (rlist, bytes); + if (!wbuf) + goto unlock; + rlist->pending++; + } + unlock: + pthread_spin_unlock (&rbuf->lock); + + if (wbuf) + *opaque = rlist; + return wbuf; +} + +int +rbuf_write_complete (void *opaque) +{ + rbuf_list_t *rlist = NULL; + + if (!opaque) + return -1; + + rlist = opaque; + + pthread_mutex_lock (&rlist->b_lock); + { + rlist->completed++; + if (rlist->awaiting) + pthread_cond_signal (&rlist->b_cond); + } + pthread_mutex_unlock (&rlist->b_lock); + + return 0; +} + +int +rbuf_get_buffer (rbuf_t *rbuf, + void **opaque, sequence_fn *seqfn, void *mydata) +{ + int retval = RBUF_CONSUMABLE; + rbuf_list_t *rlist = NULL; + + if (!rbuf || !opaque) + return -1; + + pthread_spin_lock (&rbuf->lock); + { + rlist = RBUF_CURRENT_BUFFER (rbuf); + if (!rlist->pending) { + retval = RBUF_EMPTY; + goto unlock; + } + + list_del_init (&rlist->list); + if (list_empty (&rbuf->freelist)) { + + /* removal would lead to writer starvation, disallow */ + list_add (&rlist->list, &rbuf->freelist); + retval = RBUF_WOULD_STARVE; + goto unlock; + } + + if (seqfn) + seqfn (rlist, mydata); + rbuf->current = + list_first_entry (&rbuf->freelist, rbuf_list_t, list); + } + unlock: + pthread_spin_unlock (&rbuf->lock); + + if (retval == RBUF_CONSUMABLE) { + *opaque = rlist; /* caller _owns_ the buffer */ + MARK_RLIST_WAITING (rlist); + } + return retval; +} + +/* TODO: shrink ->veclist using hueristics */ +int +rbuf_wait_for_completion (rbuf_t *rbuf, void *opaque, + void (*fn)(rbuf_list_t *, void *), void *arg) +{ + rbuf_list_t *rlist = NULL; + + if (!rbuf || !opaque) + return -1; + + rlist = opaque; + + pthread_mutex_lock (&rlist->b_lock); + { + while (rlist->completed != rlist->pending) { + rlist->awaiting = _gf_true; + pthread_cond_wait (&rlist->b_cond, &rlist->b_lock); + } + } + pthread_mutex_unlock (&rlist->b_lock); + + fn (rlist, arg); /* invoke handler */ + + rlist->awaiting = _gf_false; + MARK_RLIST_COMPLETE (rlist); + + rlist->rvec = list_first_entry (&rlist->veclist, rbuf_iovec_t, list); + rbuf_reset_rvec (rlist->rvec); + + rlist->used = 1; + rlist->pending = rlist->completed = 0; + + pthread_spin_lock (&rbuf->lock); + { + list_add_tail (&rlist->list, &rbuf->freelist); + } + pthread_spin_unlock (&rbuf->lock); + + return 0; +} diff --git a/libglusterfs/src/rot-buffs.h b/libglusterfs/src/rot-buffs.h new file mode 100644 index 00000000000..0be220f544e --- /dev/null +++ b/libglusterfs/src/rot-buffs.h @@ -0,0 +1,134 @@ +/* + Copyright (c) 2008-2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __ROT_BUFFS_H +#define __ROT_BUFFS_H + +#include + +#include "list.h" +#include "common-utils.h" + +#define ROT_BUFF_DEFAULT_COUNT 2 +#define ROT_BUFF_ALLOC_SIZE 128 * 1024 + +typedef struct rbuf_iovec { + struct iovec iov; + + struct list_head list; +} rbuf_iovec_t; +#define RBUF_IOVEC_SIZE sizeof (rbuf_iovec_t) + +typedef struct rbuf_list { + int status; /* list status - debugging for now */ + + pthread_mutex_t b_lock; /* protects this structure */ + pthread_cond_t b_cond; /* signal for writer completion */ + + gf_boolean_t awaiting; + + unsigned long long pending; /* pending writers */ + unsigned long long completed; /* completed writers */ + + rbuf_iovec_t *rvec; /* currently used IO vector */ + + struct list_head veclist; /* list of attached rbuf_iov */ + unsigned long long used; /* consumable entries + attached in ->veclist */ + + unsigned long seq[2]; /* if interested, this whould store + the start sequence number and the + range */ + + struct list_head list; /* attachment to rbuf_t */ +} rbuf_list_t; + +struct rlist_iter { + struct list_head veclist; + + unsigned long long iter; +}; + +/** + * Sequence number assigment routine is called during buffer + * switch under rbuff ->lock. + */ +typedef void (sequence_fn) (rbuf_list_t *, void *); + +#define RLIST_STORE_SEQ(rlist, start, range) \ + do { \ + rlist->seq[0] = start; \ + rlist->seq[1] = range; \ + } while (0) +#define RLIST_GET_SEQ(rlist, start, range) \ + do { \ + start = rlist->seq[0]; \ + range = rlist->seq[1]; \ + } while(0) + +#define RLIST_IOV_MELDED_ALLOC_SIZE RBUF_IOVEC_SIZE + ROT_BUFF_ALLOC_SIZE + +#define RLIST_ENTRY_COUNT(rlist) rlist->used + +#define rlist_iter_init(riter, rlist) \ + do { \ + (riter)->iter = rlist->used; \ + (riter)->veclist = rlist->veclist; \ + } while (0) + +#define rvec_for_each_entry(pos, riter) \ + for (pos = list_entry \ + ((riter)->veclist.next, typeof(*pos), list); \ + (riter)->iter > 0; \ + pos = list_entry \ + (pos->list.next, typeof(*pos), list), \ + --((riter)->iter)) + +typedef struct rbuf { + pthread_spinlock_t lock; + + rbuf_list_t *current; /* cached pointer to first free item */ + + struct list_head freelist; +} rbuf_t; + +#define RBUF_CURRENT_BUFFER(r) r->current + +#define RLIST_ACTIVE 1<<0 +#define RLIST_WAITING 1<<1 + +#define MARK_RLIST_ACTIVE(r) r->status |= RLIST_ACTIVE +#define MARK_RLIST_WAITING(r) r->status |= RLIST_WAITING +#define MARK_RLIST_COMPLETE(r) r->status &= ~RLIST_WAITING + +#define IS_RLIST_ACTIVE(r) r->status & RLIST_ACTIVE +#define IS_RLIST_WAITING(r) r->status & RLIST_WAITING + +typedef enum { + RBUF_CONSUMABLE = 1, + RBUF_BUSY, + RBUF_EMPTY, + RBUF_WOULD_STARVE, +} rlist_retval_t; + +/* Initialization/Destruction */ +rbuf_t *rbuf_init (int); +void rbuf_dtor (rbuf_t *); + +/* Producer API */ +char *rbuf_reserve_write_area (rbuf_t *, size_t, void **); +int rbuf_write_complete (void *); + +/* Consumer API */ +int rbuf_get_buffer (rbuf_t *, void **, sequence_fn *, void *); +int rbuf_wait_for_completion (rbuf_t *, void *, + void (*)(rbuf_list_t *, void *), void *); + +#endif From 39d37519447c178059e389fe546f710e3c1b8930 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Tue, 3 Feb 2015 19:05:28 +0530 Subject: [PATCH 3/6] contrib/timer-wheel: import linux kernel timer wheel This patch imports timer-wheel[1] algorithm from the linux kernel (~/kernel/time/timer.c) with some modifications. Timer-wheel is an efficent way to track millions of timers for expiry. This is a variant of the simple but RAM heavy approach of having a list (timer bucket) for every future second. Timer-wheel categorizes every future second into a logarithmic array of arrays. This is done by splitting the 32 bit "timeout" value into fixed "sliced" bits, thereby each category has a fixed size array to which buckets are assigned. A classic split would be 8+6+6+6 (used in this patch) which results in 256+64+64+64 == 512 buckets. Therefore, the entire 32 bit futuristic timeouts have been mapped into 512 buckets. [ NOTE: There are other possible splits, such as "8+8+8+8", but this patch sticks to the widely used and tested default. ] Therfore, the first category "holds" timers whose expiry range is between 1..256, the next cateogry holds 257..16384, third category 16385..1048576 and so on. When timers are added, unless it's in the first category, timers with different timeouts could end up in the same bucket. This means that the timers are "partially sorted" -- sorted in their highest bits. The expiry code walks the first array of buckets and exprires any pending timers (1..256). Next, at time value 257, timers in the first bucket of the second array is "cascaded" onto the first category and timers are placed into respective buckets according to the thier timeout values. Cascading "brings down" the timers timeout to the coorect bucket of their respective category. Therefore, timers are sorted by their highest bits of the timeout value and then by the lower bits too. [1] https://lwn.net/Articles/152436/ Change-Id: I1219abf69290961ae9a3d483e11c107c5f49c4e3 Signed-off-by: Venky Shankar --- contrib/timer-wheel/find_last_bit.c | 66 ++++++++ contrib/timer-wheel/timer-wheel.c | 250 ++++++++++++++++++++++++++++ contrib/timer-wheel/timer-wheel.h | 55 ++++++ libglusterfs/src/Makefile.am | 9 +- libglusterfs/src/list.h | 23 +++ 5 files changed, 399 insertions(+), 4 deletions(-) create mode 100644 contrib/timer-wheel/find_last_bit.c create mode 100644 contrib/timer-wheel/timer-wheel.c create mode 100644 contrib/timer-wheel/timer-wheel.h diff --git a/contrib/timer-wheel/find_last_bit.c b/contrib/timer-wheel/find_last_bit.c new file mode 100644 index 00000000000..c2c020a971e --- /dev/null +++ b/contrib/timer-wheel/find_last_bit.c @@ -0,0 +1,66 @@ +/** + * @find_last_bit + * optimized implementation of find last bit in + */ + +#ifndef BITS_PER_LONG +#define BITS_PER_LONG 64 +#endif + +static inline int fls(int x) +{ + int r = 32; + + if (!x) + return 0; + if (!(x & 0xffff0000u)) { + x <<= 16; + r -= 16; + } + if (!(x & 0xff000000u)) { + x <<= 8; + r -= 8; + } + if (!(x & 0xf0000000u)) { + x <<= 4; + r -= 4; + } + if (!(x & 0xc0000000u)) { + x <<= 2; + r -= 2; + } + if (!(x & 0x80000000u)) { + x <<= 1; + r -= 1; + } + return r; +} + + +unsigned long gf_tw_find_last_bit(const unsigned long *addr, unsigned long size) +{ + unsigned long words; + unsigned long tmp; + + /* Start at final word. */ + words = size / BITS_PER_LONG; + + /* Partial final word? */ + if (size & (BITS_PER_LONG-1)) { + tmp = (addr[words] & (~0UL >> (BITS_PER_LONG + - (size & (BITS_PER_LONG-1))))); + if (tmp) + goto found; + } + + while (words) { + tmp = addr[--words]; + if (tmp) { +found: + return words * BITS_PER_LONG + fls(tmp); + } + } + + /* Not found */ + return size; +} diff --git a/contrib/timer-wheel/timer-wheel.c b/contrib/timer-wheel/timer-wheel.c new file mode 100644 index 00000000000..e226cbc2d77 --- /dev/null +++ b/contrib/timer-wheel/timer-wheel.c @@ -0,0 +1,250 @@ +#include +#include +#include + +#include "timer-wheel.h" + +static inline void +__gf_tw_add_timer (struct tvec_base *base, struct gf_tw_timer_list *timer) +{ + int i; + unsigned long idx; + unsigned long expires; + struct list_head *vec; + + expires = timer->expires; + + idx = expires - base->timer_sec; + + if (idx < TVR_SIZE) { + i = expires & TVR_MASK; + vec = base->tv1.vec + i; + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { + i = (expires >> TVR_BITS) & TVN_MASK; + vec = base->tv2.vec + i; + } else if (idx < 1 << (TVR_BITS + 2*TVN_BITS)) { + i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; + vec = base->tv3.vec + i; + } else if (idx < 1 << (TVR_BITS + 3*TVN_BITS)) { + i = (expires >> (TVR_BITS + 2*TVN_BITS)) & TVN_MASK; + vec = base->tv4.vec + i; + } else if (idx < 0) { + vec = base->tv1.vec + (base->timer_sec & TVR_MASK); + } else { + i = (expires >> (TVR_BITS + 3*TVN_BITS)) & TVN_MASK; + vec = base->tv5.vec + i; + } + + list_add_tail (&timer->entry, vec); +} + +/* optimized find_last_bit() */ +unsigned long gf_tw_find_last_bit(const unsigned long *, unsigned long); + +static inline unsigned long +apply_slack(struct tvec_base *base, struct gf_tw_timer_list *timer) +{ + long delta; + unsigned long mask, expires, expires_limit; + + expires = timer->expires; + + delta = expires - base->timer_sec; + if (delta < 256) + return expires; + + expires_limit = expires + delta / 256; + mask = expires ^ expires_limit; + if (mask == 0) + return expires; + + int bit = gf_tw_find_last_bit (&mask, BITS_PER_LONG); + mask = (1UL << bit) - 1; + + expires_limit = expires_limit & ~(mask); + return expires_limit; +} + +static inline void +gf_tw_detach_timer (struct gf_tw_timer_list *timer) +{ + struct list_head *entry = &timer->entry; + + list_del (entry); +} + +static inline int +cascade (struct tvec_base *base, struct tvec *tv, int index) +{ + struct gf_tw_timer_list *timer, *tmp; + struct list_head tv_list; + + list_replace_init (tv->vec + index, &tv_list); + + list_for_each_entry_safe (timer, tmp, &tv_list, entry) { + __gf_tw_add_timer (base, timer); + } + + return index; +} + +#define INDEX(N) ((base->timer_sec >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK) + +/** + * run expired timers + */ +static inline void +run_timers (struct tvec_base *base) +{ + unsigned long index, call_time; + struct gf_tw_timer_list *timer; + + struct list_head work_list; + struct list_head *head = &work_list; + + pthread_spin_lock (&base->lock); + { + index = base->timer_sec & TVR_MASK; + + if (!index && + (!cascade (base, &base->tv2, INDEX(0))) && + (!cascade (base, &base->tv3, INDEX(1))) && + (!cascade (base, &base->tv4, INDEX(2)))) + cascade (base, &base->tv5, INDEX(3)); + + call_time = base->timer_sec++; + list_replace_init (base->tv1.vec + index, head); + while (!list_empty(head)) { + void (*fn)(struct gf_tw_timer_list *, void *, unsigned long); + void *data; + + timer = list_first_entry (head, struct gf_tw_timer_list, entry); + fn = timer->function; + data = timer->data; + + gf_tw_detach_timer (timer); + fn (timer, data, call_time); + } + } + pthread_spin_unlock (&base->lock); + +} + +void *runner (void *arg) +{ + struct timeval tv = {0,}; + struct tvec_base *base = arg; + + while (1) { + run_timers (base); + + tv.tv_sec = 1; + tv.tv_usec = 0; + (void) select (0, NULL, NULL, NULL, &tv); + } + + return NULL; + +} + +/* interface */ + +/** + * Add a timer in the timer wheel + */ +void gf_tw_add_timer (struct tvec_base *base, struct gf_tw_timer_list *timer) +{ + pthread_spin_lock (&base->lock); + { + timer->expires += base->timer_sec; + timer->expires = apply_slack (base, timer); + __gf_tw_add_timer (base, timer); + } + pthread_spin_unlock (&base->lock); +} + +/** + * Remove a timer from the timer wheel + */ +void gf_tw_del_timer (struct gf_tw_timer_list *timer) +{ + gf_tw_detach_timer (timer); +} + +int gf_tw_cleanup_timers (struct tvec_base *base) +{ + int ret = 0; + void *res = NULL; + + /* terminate runner */ + ret = pthread_cancel (base->runner); + if (ret != 0) + goto error_return; + ret = pthread_join (base->runner, &res); + if (ret != 0) + goto error_return; + if (res != PTHREAD_CANCELED) + goto error_return; + + /* destroy lock */ + ret = pthread_spin_destroy (&base->lock); + if (ret != 0) + goto error_return; + + /* deallocated timer base */ + free (base); + return 0; + + error_return: + return -1; +} + +/** + * Initialize various timer wheel lists and spawn a thread that + * invokes run_timers() + */ +struct tvec_base *gf_tw_init_timers () +{ + int j = 0; + int ret = 0; + struct timeval tv = {0,}; + struct tvec_base *base = NULL; + + base = malloc (sizeof (*base)); + if (!base) + goto error_return; + + ret = pthread_spin_init (&base->lock, 0); + if (ret != 0) + goto error_dealloc; + + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD (base->tv5.vec + j); + INIT_LIST_HEAD (base->tv4.vec + j); + INIT_LIST_HEAD (base->tv3.vec + j); + INIT_LIST_HEAD (base->tv2.vec + j); + } + + for (j = 0; j < TVR_SIZE; j++) { + INIT_LIST_HEAD (base->tv1.vec + j); + } + + ret = gettimeofday (&tv, 0); + if (ret < 0) + goto destroy_lock; + base->timer_sec = tv.tv_sec; + + ret = pthread_create (&base->runner, NULL, runner, base); + if (ret != 0) + goto destroy_lock; + return base; + + destroy_lock: + (void) pthread_spin_destroy (&base->lock); + error_dealloc: + free (base); + error_return: + return NULL; +} + + diff --git a/contrib/timer-wheel/timer-wheel.h b/contrib/timer-wheel/timer-wheel.h new file mode 100644 index 00000000000..fd282234567 --- /dev/null +++ b/contrib/timer-wheel/timer-wheel.h @@ -0,0 +1,55 @@ +#ifndef __TIMER_WHEEL_H +#define __TIMER_WHEEL_H + +#include + +#include "list.h" + +#define TVR_BITS 8 +#define TVN_BITS 6 +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_MASK (TVR_SIZE - 1) +#define TVN_MASK (TVN_SIZE - 1) + +#define BITS_PER_LONG 64 + +struct tvec { + struct list_head vec[TVN_SIZE]; +}; + +struct tvec_root { + struct list_head vec[TVR_SIZE]; +}; + +struct tvec_base { + pthread_spinlock_t lock; /* base lock */ + + pthread_t runner; /* run_timer() */ + + unsigned long timer_sec; /* time counter */ + + struct tvec_root tv1; + struct tvec tv2; + struct tvec tv3; + struct tvec tv4; + struct tvec tv5; +}; + +struct gf_tw_timer_list { + void *data; + unsigned long expires; + + /** callback routine */ + void (*function)(struct gf_tw_timer_list *, void *, unsigned long); + + struct list_head entry; +}; + +/** The API! */ +struct tvec_base *gf_tw_init_timers (); +int gf_tw_cleanup_timers (struct tvec_base *); +void gf_tw_add_timer (struct tvec_base *, struct gf_tw_timer_list *); +void gf_tw_del_timer (struct gf_tw_timer_list *); + +#endif diff --git a/libglusterfs/src/Makefile.am b/libglusterfs/src/Makefile.am index 637c5902ac0..f6978955ef3 100644 --- a/libglusterfs/src/Makefile.am +++ b/libglusterfs/src/Makefile.am @@ -27,9 +27,9 @@ libglusterfs_la_SOURCES = dict.c xlator.c logging.c \ graph-print.c trie.c run.c options.c fd-lk.c circ-buff.c \ event-history.c gidcache.c ctx.c client_t.c event-poll.c event-epoll.c \ $(CONTRIBDIR)/libgen/basename_r.c $(CONTRIBDIR)/libgen/dirname_r.c \ - $(CONTRIBDIR)/stdlib/gf_mkostemp.c strfd.c \ - $(CONTRIBDIR)/mount/mntent.c $(CONTRIBDIR)/libexecinfo/execinfo.c \ - rot-buffs.c + $(CONTRIBDIR)/stdlib/gf_mkostemp.c strfd.c $(CONTRIBDIR)/mount/mntent.c \ + $(CONTRIBDIR)/libexecinfo/execinfo.c rot-buffs.c \ + $(CONTRIBDIR)/timer-wheel/timer-wheel.c $(CONTRIBDIR)/timer-wheel/find_last_bit.c nodist_libglusterfs_la_SOURCES = y.tab.c graph.lex.c @@ -47,7 +47,8 @@ noinst_HEADERS = common-utils.h defaults.h dict.h glusterfs.h hashfn.h timespec. gidcache.h client_t.h glusterfs-acl.h glfs-message-id.h \ template-component-messages.h strfd.h \ $(CONTRIBDIR)/mount/mntent_compat.h lvm-defaults.h \ - $(CONTRIBDIR)/libexecinfo/execinfo_compat.h rot-buffs.h + $(CONTRIBDIR)/libexecinfo/execinfo_compat.h rot-buffs.h \ + $(CONTRIBDIR)/timer-wheel/timer-wheel.h EXTRA_DIST = graph.l graph.y diff --git a/libglusterfs/src/list.h b/libglusterfs/src/list.h index c30a20e036a..c68f581d894 100644 --- a/libglusterfs/src/list.h +++ b/libglusterfs/src/list.h @@ -185,6 +185,29 @@ list_is_last (struct list_head *list, struct list_head *head) return (list->next == head); } +/** + * list_replace - replace old entry by new one + * @old : the element to be replaced + * @new : the new element to insert + * + * If @old was empty, it will be overwritten. + */ +static inline void list_replace(struct list_head *old, + struct list_head *new) +{ + new->next = old->next; + new->next->prev = new; + new->prev = old->prev; + new->prev->next = new; +} + +static inline void list_replace_init(struct list_head *old, + struct list_head *new) +{ + list_replace(old, new); + INIT_LIST_HEAD(old); +} + #define list_entry(ptr, type, member) \ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) From 65b35778dbccf14a7a6cacc427deb995b5b4fa9d Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Tue, 3 Feb 2015 19:22:16 +0530 Subject: [PATCH 4/6] features/changelog: RPC'fy {libgf}changelog This patch introduces RPC based communication between the changelog translator and libgfchangelog. It replaces the old pathetic stream based interaction that existed earlier (due to time constraints :-/). Changelog, upon initialization starts a RPC server (rpcsvc) allowing clients to invoke a probe API as a bootup mechanism to request for event notifications. During probe, clients can choose an event filter specifying the type(s) of events they are interested in. As of now there is no way to change the event notification set once the probe RPC call is made, but that is easier to implement. The actual event notifications is done on a separate RPC session. The client (libgfchangelog) itself starts and RPC server which the changelog translator "connects back" during probe. Notifications are dispatched by a bunch of threads from the server (translator) and the client optionally orders them if ordered notifications are requried. FOPs fill in their respective event details in a buffer (rot-buffs to be particular) and a bunch of threads (consumers) swap the buffers out of roatation and dispatch them via RPC. To avoid writer starvation, then number of dispatcher threads is one less than the number of buffer list in rot-buffs.x libgfchangelog becomes purely callback based -- upon event notification from the server (and re-ordering them if required) invoke a callback routine specified by the consumer. A major part of the patch is also aimed at providing backward compatibility for geo-replication, which was one of the main consumer of the stream based API. Also, this patch does not\ "turn on" event notifications for all fops, just a bunch which is currently in requirement. Another pain point is that the server does not filter events before dispatching it to the clients. That load is taken up by the client itself (although it's done at the library layer rather than making it hard on the callback implementor). This needs improvement and care needs to be taken to not load the server up with expensive filtering mechanisms. Change-Id: Ibf60a432b68f2dfa60c6f9add2bcfd37a9c41395 Signed-off-by: Venky Shankar --- rpc/xdr/src/Makefile.am | 9 +- rpc/xdr/src/changelog-xdr.x | 27 + .../lib/examples/c/get-changes-multi.c | 84 ++ .../changelog/lib/examples/c/get-changes.c | 2 +- .../changelog/lib/examples/c/get-history.c | 6 +- .../features/changelog/lib/src/Makefile.am | 30 +- .../features/changelog/lib/src/changelog.h | 74 ++ .../changelog/lib/src/gf-changelog-api.c | 224 +++++ .../changelog/lib/src/gf-changelog-helpers.c | 32 + .../changelog/lib/src/gf-changelog-helpers.h | 194 ++-- ...ocess.c => gf-changelog-journal-handler.c} | 499 ++++++++-- .../changelog/lib/src/gf-changelog-journal.h | 114 +++ .../changelog/lib/src/gf-changelog-reborp.c | 380 ++++++++ .../changelog/lib/src/gf-changelog-rpc.c | 105 +++ .../src/gf-changelog-rpc.h} | 17 +- .../features/changelog/lib/src/gf-changelog.c | 857 ++++++++---------- .../changelog/lib/src/gf-history-changelog.c | 155 ++-- xlators/features/changelog/src/Makefile.am | 18 +- .../changelog/src/changelog-encoders.c | 2 +- .../changelog/src/changelog-ev-handle.c | 379 ++++++++ .../changelog/src/changelog-ev-handle.h | 140 +++ .../changelog/src/changelog-helpers.c | 227 ++++- .../changelog/src/changelog-helpers.h | 114 ++- .../changelog/src/changelog-mem-types.h | 25 +- .../features/changelog/src/changelog-misc.h | 14 + .../changelog/src/changelog-notifier.c | 314 ------- .../changelog/src/changelog-rpc-common.c | 334 +++++++ .../changelog/src/changelog-rpc-common.h | 84 ++ .../features/changelog/src/changelog-rpc.c | 300 ++++++ .../features/changelog/src/changelog-rpc.h | 29 + xlators/features/changelog/src/changelog.c | 505 +++++++---- 31 files changed, 4006 insertions(+), 1288 deletions(-) create mode 100644 rpc/xdr/src/changelog-xdr.x create mode 100644 xlators/features/changelog/lib/examples/c/get-changes-multi.c create mode 100644 xlators/features/changelog/lib/src/gf-changelog-api.c rename xlators/features/changelog/lib/src/{gf-changelog-process.c => gf-changelog-journal-handler.c} (59%) create mode 100644 xlators/features/changelog/lib/src/gf-changelog-journal.h create mode 100644 xlators/features/changelog/lib/src/gf-changelog-reborp.c create mode 100644 xlators/features/changelog/lib/src/gf-changelog-rpc.c rename xlators/features/changelog/{src/changelog-notifier.h => lib/src/gf-changelog-rpc.h} (50%) create mode 100644 xlators/features/changelog/src/changelog-ev-handle.c create mode 100644 xlators/features/changelog/src/changelog-ev-handle.h delete mode 100644 xlators/features/changelog/src/changelog-notifier.c create mode 100644 xlators/features/changelog/src/changelog-rpc-common.c create mode 100644 xlators/features/changelog/src/changelog-rpc-common.h create mode 100644 xlators/features/changelog/src/changelog-rpc.c create mode 100644 xlators/features/changelog/src/changelog-rpc.h diff --git a/rpc/xdr/src/Makefile.am b/rpc/xdr/src/Makefile.am index a1c5525c17e..e27528d0e01 100644 --- a/rpc/xdr/src/Makefile.am +++ b/rpc/xdr/src/Makefile.am @@ -1,5 +1,6 @@ XDRSOURCES = glusterfs3-xdr.c cli1-xdr.c nlm4-xdr.c nsm-xdr.c \ - rpc-common-xdr.c glusterd1-xdr.c acl3-xdr.c portmap-xdr.c mount3udp.c + rpc-common-xdr.c glusterd1-xdr.c acl3-xdr.c portmap-xdr.c \ + mount3udp.c changelog-xdr.c XDRHEADERS = $(XDRSOURCES:.c=.h) XDRGENFILES = $(XDRSOURCES:.c=.x) @@ -76,3 +77,9 @@ mount3udp.c: mount3udp.x mount3udp.h mount3udp.h: mount3udp.x $(top_srcdir)/build-aux/xdrgen header $(xdrsrc)/`basename ${@:.h=.x}` + +changelog-xdr.c: changelog-xdr.x changelog-xdr.h + $(top_srcdir)/build-aux/xdrgen source $(xdrsrc)/`basename ${@:.c=.x}` + +changelog-xdr.h: changelog-xdr.x + $(top_srcdir)/build-aux/xdrgen header $(xdrsrc)/`basename ${@:.h=.x}` diff --git a/rpc/xdr/src/changelog-xdr.x b/rpc/xdr/src/changelog-xdr.x new file mode 100644 index 00000000000..ba1ebd27836 --- /dev/null +++ b/rpc/xdr/src/changelog-xdr.x @@ -0,0 +1,27 @@ +/* XDR: libgfchangelog -> changelog */ + +struct changelog_probe_req { + unsigned int filter; + char sock[UNIX_PATH_MAX]; +}; + +struct changelog_probe_rsp { + int op_ret; +}; + +/* XDR: changelog -> libgfchangelog */ +struct changelog_event_req { + /* sequence number for the buffer */ + unsigned long seq; + + /* time of dispatch */ + unsigned long tv_sec; + unsigned long tv_usec; +}; + +struct changelog_event_rsp { + int op_ret; + + /* ack'd buffers sequence number */ + unsigned long seq; +}; diff --git a/xlators/features/changelog/lib/examples/c/get-changes-multi.c b/xlators/features/changelog/lib/examples/c/get-changes-multi.c new file mode 100644 index 00000000000..d2aa009273f --- /dev/null +++ b/xlators/features/changelog/lib/examples/c/get-changes-multi.c @@ -0,0 +1,84 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/** + * Compile it using: + * gcc -o getchanges-multi `pkg-config --cflags libgfchangelog` \ + * get-changes-multi.c `pkg-config --libs libgfchangelog` + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "changelog.h" + +void *brick_init (void *xl, struct gf_brick_spec *brick) +{ + return brick; +} + +void brick_fini (void *xl, char *brick, void *data) +{ + return; +} + +void brick_callback (void *xl, char *brick, + void *data, changelog_event_t *ev) +{ + printf ("->callback: (brick,type) [%s:%d] | (flags, mode) [%d:%ld]\n", + brick, ev->ev_type, ev->u.release.ordflags, ev->u.release.mode); +} + +void fill_brick_spec (struct gf_brick_spec *brick, char *path) +{ + brick->brick_path = strdup (path); + brick->filter = CHANGELOG_OP_TYPE_RELEASE; + + brick->init = brick_init; + brick->fini = brick_fini; + brick->callback = brick_callback; + brick->connected = NULL; + brick->disconnected = NULL; +} + +int +main (int argc, char ** argv) +{ + int ret = 0; + void *bricks = NULL; + struct gf_brick_spec *brick = NULL; + + bricks = calloc (2, sizeof (struct gf_brick_spec)); + if (!bricks) + goto error_return; + + brick = (struct gf_brick_spec *)bricks; + fill_brick_spec (brick, "/export/z1/zwoop"); + + brick++; + fill_brick_spec (brick, "/export/z2/zwoop"); + + ret = gf_changelog_register_generic ((struct gf_brick_spec *)bricks, 2, + 1, "/tmp/multi-changes.log", 9, NULL); + if (ret) + goto error_return; + + /* let callbacks do the job */ + select (0, NULL, NULL, NULL, NULL); + + error_return: + return -1; +} diff --git a/xlators/features/changelog/lib/examples/c/get-changes.c b/xlators/features/changelog/lib/examples/c/get-changes.c index 6d0d0357db9..0b2808c7e35 100644 --- a/xlators/features/changelog/lib/examples/c/get-changes.c +++ b/xlators/features/changelog/lib/examples/c/get-changes.c @@ -40,7 +40,7 @@ main (int argc, char ** argv) char fbuf[PATH_MAX] = {0,}; /* get changes for brick "/home/vshankar/export/yow/yow-1" */ - ret = gf_changelog_register ("/home/vshankar/exports/yow/yow-1", + ret = gf_changelog_register ("/export/z1/zwoop", "/tmp/scratch", "/tmp/change.log", 9, 5); if (ret) { handle_error ("register failed"); diff --git a/xlators/features/changelog/lib/examples/c/get-history.c b/xlators/features/changelog/lib/examples/c/get-history.c index 33eb8c32d4d..7a9b3a2564f 100644 --- a/xlators/features/changelog/lib/examples/c/get-history.c +++ b/xlators/features/changelog/lib/examples/c/get-history.c @@ -40,8 +40,8 @@ main (int argc, char ** argv) char fbuf[PATH_MAX] = {0,}; unsigned long end_ts = 0; - ret = gf_changelog_register ("/export1/v1/b1", - "/tmp/scratch_v1", "/tmp/scratch_v1/changes.log", + ret = gf_changelog_register ("/export/z1/zwoop", + "/tmp/scratch_v1", "/tmp/changes.log", 9, 5); if (ret) { handle_error ("register failed"); @@ -51,7 +51,7 @@ main (int argc, char ** argv) int a, b; printf ("give the two numbers start and end\t"); scanf ("%d%d", &a, &b); - ret = gf_history_changelog ("/export1/v1/b1/.glusterfs/changelogs",a, b, 3, &end_ts); + ret = gf_history_changelog ("/export/z1/zwoop/.glusterfs/changelogs",a, b, 3, &end_ts); if (ret == -1) { printf ("history failed"); goto out; diff --git a/xlators/features/changelog/lib/src/Makefile.am b/xlators/features/changelog/lib/src/Makefile.am index 1ae919bfb38..306306bd585 100644 --- a/xlators/features/changelog/lib/src/Makefile.am +++ b/xlators/features/changelog/lib/src/Makefile.am @@ -4,9 +4,13 @@ libgfchangelog_la_CFLAGS = -Wall $(GF_CFLAGS) $(GF_DARWIN_LIBGLUSTERFS_CFLAGS) \ libgfchangelog_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 -fpic \ -I../../../src/ -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/xlators/features/changelog/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/rpc/rpc-transport/socket/src \ -DDATADIR=\"$(localstatedir)\" -libgfchangelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +libgfchangelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/rpc/xdr/src/libgfxdr.la \ + $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la libgfchangelog_la_LDFLAGS = $(GF_LDFLAGS) -version-info $(LIBGFCHANGELOG_LT_VERSION) @@ -15,18 +19,18 @@ lib_LTLIBRARIES = libgfchangelog.la CONTRIB_BUILDDIR = $(top_builddir)/contrib -libgfchangelog_la_SOURCES = gf-changelog.c gf-changelog-process.c \ - gf-changelog-helpers.c gf-history-changelog.c \ - $(CONTRIBDIR)/uuid/clear.c \ - $(CONTRIBDIR)/uuid/copy.c $(CONTRIBDIR)/uuid/gen_uuid.c \ - $(CONTRIBDIR)/uuid/pack.c $(CONTRIBDIR)/uuid/parse.c \ - $(CONTRIBDIR)/uuid/unparse.c $(CONTRIBDIR)/uuid/uuid_time.c \ - $(CONTRIBDIR)/uuid/compare.c $(CONTRIBDIR)/uuid/isnull.c \ - $(CONTRIBDIR)/uuid/unpack.c - -noinst_HEADERS = gf-changelog-helpers.h $(CONTRIBDIR)/uuid/uuidd.h \ - $(CONTRIBDIR)/uuid/uuid.h $(CONTRIBDIR)/uuid/uuidP.h \ - $(CONTRIB_BUILDDIR)/uuid/uuid_types.h +libgfchangelog_la_SOURCES = gf-changelog.c gf-changelog-journal-handler.c gf-changelog-helpers.c \ + gf-changelog-api.c gf-history-changelog.c gf-changelog-rpc.c gf-changelog-reborp.c \ + $(top_srcdir)/xlators/features/changelog/src/changelog-rpc-common.c \ + $(CONTRIBDIR)/uuid/clear.c $(CONTRIBDIR)/uuid/copy.c \ + $(CONTRIBDIR)/uuid/gen_uuid.c $(CONTRIBDIR)/uuid/pack.c \ + $(CONTRIBDIR)/uuid/parse.c $(CONTRIBDIR)/uuid/unparse.c \ + $(CONTRIBDIR)/uuid/uuid_time.c $(CONTRIBDIR)/uuid/compare.c \ + $(CONTRIBDIR)/uuid/isnull.c $(CONTRIBDIR)/uuid/unpack.c + +noinst_HEADERS = gf-changelog-helpers.h gf-changelog-rpc.h gf-changelog-journal.h \ + $(CONTRIBDIR)/uuid/uuidd.h $(CONTRIBDIR)/uuid/uuid.h \ + $(CONTRIBDIR)/uuid/uuidP.h $(CONTRIB_BUILDDIR)/uuid/uuid_types.h libgfchangelog_HEADERS = changelog.h diff --git a/xlators/features/changelog/lib/src/changelog.h b/xlators/features/changelog/lib/src/changelog.h index 5cddfb5839c..9dcb7634f51 100644 --- a/xlators/features/changelog/lib/src/changelog.h +++ b/xlators/features/changelog/lib/src/changelog.h @@ -11,6 +11,75 @@ #ifndef _GF_CHANGELOG_H #define _GF_CHANGELOG_H +struct gf_brick_spec; + +/** + * Max bit shiter for event selection + */ +#define CHANGELOG_EV_SELECTION_RANGE 4 + +#define CHANGELOG_OP_TYPE_JOURNAL 1<<0 +#define CHANGELOG_OP_TYPE_OPEN 1<<1 +#define CHANGELOG_OP_TYPE_CREATE 1<<2 +#define CHANGELOG_OP_TYPE_RELEASE 1<<3 +#define CHANGELOG_OP_TYPE_MAX 1< + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "uuid.h" +#include "globals.h" +#include "glusterfs.h" + +#include "gf-changelog-helpers.h" +#include "gf-changelog-journal.h" +#include "changelog-mem-types.h" + +int +gf_changelog_done (char *file) +{ + int ret = -1; + char *buffer = NULL; + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + char to_path[PATH_MAX] = {0,}; + + errno = EINVAL; + + this = THIS; + if (!this) + goto out; + + jnl = (gf_changelog_journal_t *) GF_CHANGELOG_GET_API_PTR (this); + if (!jnl) + goto out; + + if (!file || !strlen (file)) + goto out; + + /* make sure 'file' is inside ->jnl_working_dir */ + buffer = realpath (file, NULL); + if (!buffer) + goto out; + + if (strncmp (jnl->jnl_working_dir, + buffer, strlen (jnl->jnl_working_dir))) + goto out; + + (void) snprintf (to_path, PATH_MAX, "%s%s", + jnl->jnl_processed_dir, basename (buffer)); + gf_log (this->name, GF_LOG_DEBUG, + "moving %s to processed directory", file); + ret = rename (buffer, to_path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "cannot move %s to %s (reason: %s)", + file, to_path, strerror (errno)); + goto out; + } + + ret = 0; + + out: + if (buffer) + free (buffer); /* allocated by realpath() */ + return ret; +} + +/** + * @API + * for a set of changelogs, start from the beginning + */ +int +gf_changelog_start_fresh () +{ + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + + this = THIS; + if (!this) + goto out; + + errno = EINVAL; + + jnl = (gf_changelog_journal_t *) GF_CHANGELOG_GET_API_PTR (this); + if (!jnl) + goto out; + + if (gf_ftruncate (jnl->jnl_fd, 0)) + goto out; + + return 0; + + out: + return -1; +} + +/** + * @API + * return the next changelog file entry. zero means all chanelogs + * consumed. + */ +ssize_t +gf_changelog_next_change (char *bufptr, size_t maxlen) +{ + ssize_t size = -1; + int tracker_fd = 0; + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + char buffer[PATH_MAX] = {0,}; + + errno = EINVAL; + + this = THIS; + if (!this) + goto out; + + jnl = (gf_changelog_journal_t *) GF_CHANGELOG_GET_API_PTR (this); + if (!jnl) + goto out; + + tracker_fd = jnl->jnl_fd; + + size = gf_readline (tracker_fd, buffer, maxlen); + if (size < 0) { + size = -1; + goto out; + } + + if (size == 0) + goto out; + + memcpy (bufptr, buffer, size - 1); + bufptr[size - 1] = '\0'; + +out: + return size; +} + +/** + * @API + * gf_changelog_scan() - scan and generate a list of change entries + * + * calling this api multiple times (without calling gf_changlog_done()) + * would result new changelogs(s) being refreshed in the tracker file. + * This call also acts as a cancellation point for the consumer. + */ +ssize_t +gf_changelog_scan () +{ + int ret = 0; + int tracker_fd = 0; + size_t len = 0; + size_t off = 0; + xlator_t *this = NULL; + size_t nr_entries = 0; + gf_changelog_journal_t *jnl = NULL; + struct dirent *entryp = NULL; + struct dirent *result = NULL; + char buffer[PATH_MAX] = {0,}; + + this = THIS; + if (!this) + goto out; + + jnl = (gf_changelog_journal_t *) GF_CHANGELOG_GET_API_PTR (this); + if (!jnl) + goto out; + if (JNL_IS_API_DISCONNECTED (jnl)) { + errno = ENOTCONN; + goto out; + } + + errno = EINVAL; + + tracker_fd = jnl->jnl_fd; + if (gf_ftruncate (tracker_fd, 0)) + goto out; + + len = offsetof(struct dirent, d_name) + + pathconf(jnl->jnl_processing_dir, _PC_NAME_MAX) + 1; + entryp = GF_CALLOC (1, len, + gf_changelog_mt_libgfchangelog_dirent_t); + if (!entryp) + goto out; + + rewinddir (jnl->jnl_dir); + while (1) { + ret = readdir_r (jnl->jnl_dir, entryp, &result); + if (ret || !result) + break; + + if ( !strcmp (basename (entryp->d_name), ".") + || !strcmp (basename (entryp->d_name), "..") ) + continue; + + nr_entries++; + + GF_CHANGELOG_FILL_BUFFER (jnl->jnl_processing_dir, + buffer, off, + strlen (jnl->jnl_processing_dir)); + GF_CHANGELOG_FILL_BUFFER (entryp->d_name, buffer, + off, strlen (entryp->d_name)); + GF_CHANGELOG_FILL_BUFFER ("\n", buffer, off, 1); + + if (gf_changelog_write (tracker_fd, buffer, off) != off) { + gf_log (this->name, GF_LOG_ERROR, + "error writing changelog filename" + " to tracker file"); + break; + } + off = 0; + } + + GF_FREE (entryp); + + if (!result) { + if (gf_lseek (tracker_fd, 0, SEEK_SET) != -1) + return nr_entries; + } + out: + return -1; +} diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.c b/xlators/features/changelog/lib/src/gf-changelog-helpers.c index f071b057d59..6bf709dc664 100644 --- a/xlators/features/changelog/lib/src/gf-changelog-helpers.c +++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.c @@ -178,3 +178,35 @@ gf_ftruncate (int fd, off_t length) return 0; } + +int +gf_thread_cleanup (xlator_t *this, pthread_t thread) +{ + int ret = 0; + void *res = NULL; + + ret = pthread_cancel (thread); + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, + "Failed to send cancellation to thread"); + goto error_return; + } + + ret = pthread_join (thread, &res); + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to join thread"); + goto error_return; + } + + if (res != PTHREAD_CANCELED) { + gf_log (this->name, GF_LOG_WARNING, + "Thread could not be cleaned up"); + goto error_return; + } + + return 0; + + error_return: + return -1; +} diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.h b/xlators/features/changelog/lib/src/gf-changelog-helpers.h index 9b875d45dcc..9eacf11065b 100644 --- a/xlators/features/changelog/lib/src/gf-changelog-helpers.h +++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.h @@ -18,6 +18,11 @@ #include +#include "changelog.h" + +#include "changelog-rpc-common.h" +#include "gf-changelog-journal.h" + #define GF_CHANGELOG_TRACKER "tracker" #define GF_CHANGELOG_CURRENT_DIR ".current" @@ -41,79 +46,143 @@ typedef struct read_line { char rl_buf[MAXLINE]; } read_line_t; +struct gf_changelog; + +/** + * Event list for ordered event notification + * + * ->next_seq holds the next _expected_ sequence number. + */ +struct gf_event_list { + pthread_mutex_t lock; /* protects this structure */ + pthread_cond_t cond; + + pthread_t invoker; + + unsigned long next_seq; /* next sequence number expected: + zero during bootstrap */ + + struct gf_changelog *entry; /* backpointer to it's brick + encapsulator (entry) */ + struct list_head events; /* list of events (ordered) */ +}; + +/** + * include a refcount if it's of use by additional layers + */ +struct gf_event { + int count; + + unsigned long seq; + + struct list_head list; + + struct iovec iov[0]; +}; +#define GF_EVENT_CALLOC_SIZE(cnt, len) \ + (sizeof (struct gf_event) + (cnt * sizeof (struct iovec)) + len) + +/** + * assign the base address of the IO vector to the correct memory + * area and set it's addressable length. + */ +#define GF_EVENT_ASSIGN_IOVEC(vec, event, len, pos) \ + do { \ + vec->iov_base = ((char *)event) + \ + sizeof (struct gf_event) + \ + (event->count * sizeof (struct iovec)) + pos; \ + vec->iov_len = len; \ + pos += len; \ + } while (0) + +/** + * An instance of this structure is allocated for each brick for which + * notifications are streamed. + */ typedef struct gf_changelog { xlator_t *this; - /* 'processing' directory stream */ - DIR *gfc_dir; - - /* fd to the tracker file */ - int gfc_fd; - - /* connection retries */ - int gfc_connretries; + struct list_head list; /* list of instances */ - char gfc_sockpath[UNIX_PATH_MAX]; + char brick[PATH_MAX]; /* brick path for this end-point */ - char gfc_brickpath[PATH_MAX]; + changelog_rpc_t grpc; /* rpc{-clnt,svc} for this brick */ +#define RPC_PROBER(ent) ent->grpc.rpc +#define RPC_REBORP(ent) ent->grpc.svc +#define RPC_SOCK(ent) ent->grpc.sock - /* socket for receiving notifications */ - int gfc_sockfd; + unsigned int notify; /* notification flag(s) */ - char *gfc_working_dir; + FINI *fini; /* destructor callback */ + CALLBACK *callback; /* event callback dispatcher */ + CONNECT *connected; /* connect callback */ + DISCONNECT *disconnected; /* disconnection callback */ - /* RFC 3986 string encoding */ - char rfc3986[256]; + void *ptr; /* owner specific private data */ + xlator_t *invokerxl; /* consumers _this_, if valid, + assigned to THIS before cbk is + invoked */ - char gfc_current_dir[PATH_MAX]; - char gfc_processed_dir[PATH_MAX]; - char gfc_processing_dir[PATH_MAX]; + gf_boolean_t ordered; - pthread_t gfc_changelog_processor; - - /* Holds gfc for History API */ - struct gf_changelog *hist_gfc; - - /* holds 0 done scanning, 1 keep scanning and -1 error */ - int hist_done; + struct gf_event_list event; } gf_changelog_t; -typedef struct gf_changelog_history_data { - int len; - - int htime_fd; - - /* parallelism count */ - int n_parallel; - - /* history from, to indexes */ - unsigned long from; - unsigned long to; -} gf_changelog_history_data_t; - -typedef struct gf_changelog_consume_data { - /** set of inputs */ - - /* fd to read from */ - int fd; - - /* from @offset */ - off_t offset; - - xlator_t *this; - gf_changelog_t *gfc; - - /** set of outputs */ +static inline int +gf_changelog_filter_check (gf_changelog_t *entry, changelog_event_t *event) +{ + if (event->ev_type & entry->notify) + return 1; + return 0; +} + +#define GF_NEED_ORDERED_EVENTS(ent) (ent->ordered == _gf_true) + +/** private structure */ +typedef struct gf_private { + gf_lock_t lock; /* protects ->connections */ + + void *api; /* pointer for API access */ + + pthread_t poller; /* event poller thread */ + + struct list_head connections; /* list of connections */ +} gf_private_t; + +#define GF_CHANGELOG_GET_API_PTR(this) ((gf_private_t *) this->private)->api + +/** + * upcall: invoke callback with _correct_ THIS + */ +#define GF_CHANGELOG_INVOKE_CBK(this, cbk, brick, args ...) \ + do { \ + xlator_t *old_this = NULL; \ + xlator_t *invokerxl = NULL; \ + \ + invokerxl = entry->invokerxl; \ + old_this = this; \ + \ + if (invokerxl) { \ + THIS = invokerxl; \ + } \ + \ + cbk (invokerxl, brick, args); \ + THIS = old_this; \ + \ + } while (0) - /* return value */ - int retval; +#define SAVE_THIS(xl) \ + do { \ + old_this = xl; \ + THIS = master; \ + } while (0) - /* journal processed */ - char changelog[PATH_MAX]; -} gf_changelog_consume_data_t; +#define RESTORE_THIS() \ + do { \ + THIS = old_this; \ + } while (0) -int -gf_changelog_notification_init (xlator_t *this, gf_changelog_t *gfc); +/** APIs and the rest */ void * gf_changelog_process (void *data); @@ -138,9 +207,14 @@ gf_lseek (int fd, off_t offset, int whence); int gf_changelog_consume (xlator_t *this, - gf_changelog_t *gfc, + gf_changelog_journal_t *jnl, char *from_path, gf_boolean_t no_publish); int -gf_changelog_publish (xlator_t *this, gf_changelog_t *gfc, char *from_path); +gf_changelog_publish (xlator_t *this, + gf_changelog_journal_t *jnl, char *from_path); +int +gf_thread_cleanup (xlator_t *this, pthread_t thread); +void * +gf_changelog_callback_invoker (void *arg); #endif diff --git a/xlators/features/changelog/lib/src/gf-changelog-process.c b/xlators/features/changelog/lib/src/gf-changelog-journal-handler.c similarity index 59% rename from xlators/features/changelog/lib/src/gf-changelog-process.c rename to xlators/features/changelog/lib/src/gf-changelog-journal-handler.c index 1a275e676fb..1005bd0ea39 100644 --- a/xlators/features/changelog/lib/src/gf-changelog-process.c +++ b/xlators/features/changelog/lib/src/gf-changelog-journal-handler.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2013 Red Hat, Inc. + Copyright (c) 2015 Red Hat, Inc. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,9 +8,6 @@ cases as published by the Free Software Foundation. */ -#include -#include - #include "uuid.h" #include "globals.h" #include "glusterfs.h" @@ -19,6 +16,9 @@ /* from the changelog translator */ #include "changelog-misc.h" +#include "changelog-mem-types.h" + +#include "gf-changelog-journal.h" extern int byebye; @@ -114,7 +114,8 @@ conv_noop (char *ptr) { return ptr; } static int gf_changelog_parse_binary (xlator_t *this, - gf_changelog_t *gfc, int from_fd, int to_fd, + gf_changelog_journal_t *jnl, + int from_fd, int to_fd, size_t start_offset, struct stat *stbuf) { @@ -218,7 +219,8 @@ gf_changelog_parse_binary (xlator_t *this, */ static int gf_changelog_parse_ascii (xlator_t *this, - gf_changelog_t *gfc, int from_fd, int to_fd, + gf_changelog_journal_t *jnl, + int from_fd, int to_fd, size_t start_offset, struct stat *stbuf) { int ng = 0; @@ -346,7 +348,7 @@ gf_changelog_parse_ascii (xlator_t *this, } gf_rfc3986_encode ((unsigned char *) ptr, - eptr, gfc->rfc3986); + eptr, jnl->rfc3986); FILL_AND_MOVE (eptr, ascii, off, mover, nleft, len); free (eptr); @@ -410,8 +412,8 @@ gf_changelog_copy (xlator_t *this, int from_fd, int to_fd) } static int -gf_changelog_decode (xlator_t *this, gf_changelog_t *gfc, int from_fd, - int to_fd, struct stat *stbuf, int *zerob) +gf_changelog_decode (xlator_t *this, gf_changelog_journal_t *jnl, + int from_fd, int to_fd, struct stat *stbuf, int *zerob) { int ret = -1; int encoding = -1; @@ -441,12 +443,12 @@ gf_changelog_decode (xlator_t *this, gf_changelog_t *gfc, int from_fd, * this ideally should have been a part of changelog-encoders.c * (ie. part of the changelog translator). */ - ret = gf_changelog_parse_binary (this, gfc, from_fd, + ret = gf_changelog_parse_binary (this, jnl, from_fd, to_fd, elen, stbuf); break; case CHANGELOG_ENCODE_ASCII: - ret = gf_changelog_parse_ascii (this, gfc, from_fd, + ret = gf_changelog_parse_ascii (this, jnl, from_fd, to_fd, elen, stbuf); break; default: @@ -458,7 +460,8 @@ gf_changelog_decode (xlator_t *this, gf_changelog_t *gfc, int from_fd, } int -gf_changelog_publish (xlator_t *this, gf_changelog_t *gfc, char *from_path) +gf_changelog_publish (xlator_t *this, + gf_changelog_journal_t *jnl, char *from_path) { int ret = 0; char dest[PATH_MAX] = {0,}; @@ -466,7 +469,7 @@ gf_changelog_publish (xlator_t *this, gf_changelog_t *gfc, char *from_path) struct stat stbuf = {0,}; (void) snprintf (to_path, PATH_MAX, "%s%s", - gfc->gfc_current_dir, basename (from_path)); + jnl->jnl_current_dir, basename (from_path)); /* handle zerob file that wont exist in current */ ret = stat (to_path, &stbuf); @@ -477,7 +480,7 @@ gf_changelog_publish (xlator_t *this, gf_changelog_t *gfc, char *from_path) } (void) snprintf (dest, PATH_MAX, "%s%s", - gfc->gfc_processing_dir, basename (from_path)); + jnl->jnl_processing_dir, basename (from_path)); ret = rename (to_path, dest); if (ret){ @@ -492,7 +495,7 @@ gf_changelog_publish (xlator_t *this, gf_changelog_t *gfc, char *from_path) int gf_changelog_consume (xlator_t *this, - gf_changelog_t *gfc, + gf_changelog_journal_t *jnl, char *from_path, gf_boolean_t no_publish) { int ret = -1; @@ -520,9 +523,9 @@ gf_changelog_consume (xlator_t *this, } (void) snprintf (to_path, PATH_MAX, "%s%s", - gfc->gfc_current_dir, basename (from_path)); + jnl->jnl_current_dir, basename (from_path)); (void) snprintf (dest, PATH_MAX, "%s%s", - gfc->gfc_processing_dir, basename (from_path)); + jnl->jnl_processing_dir, basename (from_path)); fd2 = open (to_path, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); @@ -532,7 +535,7 @@ gf_changelog_consume (xlator_t *this, to_path, strerror (errno)); goto close_fd; } else { - ret = gf_changelog_decode (this, gfc, fd1, + ret = gf_changelog_decode (this, jnl, fd1, fd2, &stbuf, &zerob); close (fd2); @@ -568,88 +571,412 @@ gf_changelog_consume (xlator_t *this, return ret; } -static char * -gf_changelog_ext_change (xlator_t *this, - gf_changelog_t *gfc, char *path, size_t readlen) +void * +gf_changelog_process (void *data) { - int alo = 0; - int ret = 0; - size_t len = 0; - char *buf = NULL; - - buf = path; - while (len < readlen) { - if (*buf == '\0') { - alo = 1; - gf_log (this->name, GF_LOG_DEBUG, - "processing changelog: %s", path); - ret = gf_changelog_consume (this, gfc, path, _gf_false); - } + int ret = 0; + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_entry_t *entry = NULL; + gf_changelog_processor_t *jnl_proc = NULL; - if (ret) - break; + this = THIS; - len++; buf++; - if (alo) { - alo = 0; - path = buf; + jnl = data; + jnl_proc = jnl->jnl_proc; + + while (1) { + pthread_mutex_lock (&jnl_proc->lock); + { + while (list_empty (&jnl_proc->entries)) { + jnl_proc->waiting = _gf_true; + pthread_cond_wait + (&jnl_proc->cond, &jnl_proc->lock); + } + + entry = list_first_entry (&jnl_proc->entries, + gf_changelog_entry_t, list); + list_del (&entry->list); + jnl_proc->waiting = _gf_false; } + pthread_mutex_unlock (&jnl_proc->lock); + + if (entry) { + ret = gf_changelog_consume (this, jnl, + entry->path, _gf_false); + GF_FREE (entry); + } + } + + return NULL; +} + +inline void +gf_changelog_queue_journal (gf_changelog_processor_t *jnl_proc, + changelog_event_t *event) +{ + size_t len = 0; + gf_changelog_entry_t *entry = NULL; + + entry = GF_CALLOC (1, sizeof (gf_changelog_entry_t), + gf_changelog_mt_libgfchangelog_entry_t); + if (!entry) + return; + INIT_LIST_HEAD (&entry->list); + + len = strlen (event->u.journal.path); + (void)memcpy (entry->path, event->u.journal.path, len+1); + + pthread_mutex_lock (&jnl_proc->lock); + { + list_add_tail (&entry->list, &jnl_proc->entries); + if (jnl_proc->waiting) + pthread_cond_signal (&jnl_proc->cond); + } + pthread_mutex_unlock (&jnl_proc->lock); + + return; +} + +void +gf_changelog_handle_journal (void *xl, char *brick, + void *cbkdata, changelog_event_t *event) +{ + int ret = 0; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_processor_t *jnl_proc = NULL; + + jnl = cbkdata; + jnl_proc = jnl->jnl_proc; + + gf_changelog_queue_journal (jnl_proc, event); +} + +void +gf_changelog_journal_disconnect (void *xl, char *brick, void *data) +{ + gf_changelog_journal_t *jnl = NULL; + + jnl = data; + + pthread_spin_lock (&jnl->lock); + { + JNL_SET_API_STATE (jnl, JNL_API_DISCONNECTED); + }; + pthread_spin_unlock (&jnl->lock); +} + +void +gf_changelog_journal_connect (void *xl, char *brick, void *data) +{ + gf_changelog_journal_t *jnl = NULL; + + jnl = data; + + pthread_spin_lock (&jnl->lock); + { + JNL_SET_API_STATE (jnl, JNL_API_CONNECTED); + }; + pthread_spin_unlock (&jnl->lock); + + return; +} + +void +gf_changelog_cleanup_processor (gf_changelog_journal_t *jnl) +{ + int ret = 0; + xlator_t *this = NULL; + gf_changelog_processor_t *jnl_proc = NULL; + + this = THIS; + if (!this || !jnl || !jnl->jnl_proc) + goto error_return; + + jnl_proc = jnl->jnl_proc; + + ret = gf_thread_cleanup (this, jnl_proc->processor); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to cleanup processor thread"); + goto error_return; + } + + (void)pthread_mutex_destroy (&jnl_proc->lock); + (void)pthread_cond_destroy (&jnl_proc->cond); + + GF_FREE (jnl_proc); + + error_return: + return; +} + +int +gf_changelog_init_processor (gf_changelog_journal_t *jnl) +{ + int ret = -1; + gf_changelog_processor_t *jnl_proc = NULL; + + jnl_proc = GF_CALLOC (1, sizeof (gf_changelog_processor_t), + gf_changelog_mt_libgfchangelog_t); + if (!jnl_proc) + goto error_return; + + ret = pthread_mutex_init (&jnl_proc->lock, NULL); + if (ret != 0) + goto free_jnl_proc; + ret = pthread_cond_init (&jnl_proc->cond, NULL); + if (ret != 0) + goto cleanup_mutex; + + INIT_LIST_HEAD (&jnl_proc->entries); + ret = pthread_create (&jnl_proc->processor, + NULL, gf_changelog_process, jnl); + if (ret != 0) + goto cleanup_cond; + jnl_proc->waiting = _gf_false; + + jnl->jnl_proc = jnl_proc; + return 0; + + cleanup_cond: + (void) pthread_cond_destroy (&jnl_proc->cond); + cleanup_mutex: + (void) pthread_mutex_destroy (&jnl_proc->lock); + free_jnl_proc: + GF_FREE (jnl_proc); + error_return: + return -1; +} + +static void +gf_changelog_cleanup_fds (gf_changelog_journal_t *jnl) +{ + /* tracker fd */ + if (jnl->jnl_fd != -1) + close (jnl->jnl_fd); + /* processing dir */ + if (jnl->jnl_dir) + closedir (jnl->jnl_dir); + + if (jnl->jnl_working_dir) + free (jnl->jnl_working_dir); /* allocated by realpath */ +} + +static int +gf_changelog_open_dirs (xlator_t *this, gf_changelog_journal_t *jnl) +{ + int ret = -1; + DIR *dir = NULL; + int tracker_fd = 0; + char tracker_path[PATH_MAX] = {0,}; + + /* .current */ + (void) snprintf (jnl->jnl_current_dir, PATH_MAX, + "%s/"GF_CHANGELOG_CURRENT_DIR"/", + jnl->jnl_working_dir); + ret = recursive_rmdir (jnl->jnl_current_dir); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to rmdir: %s, err: %s", + jnl->jnl_current_dir, strerror (errno)); + goto out; + } + ret = mkdir_p (jnl->jnl_current_dir, 0600, _gf_false); + if (ret) + goto out; + + /* .processed */ + (void) snprintf (jnl->jnl_processed_dir, PATH_MAX, + "%s/"GF_CHANGELOG_PROCESSED_DIR"/", + jnl->jnl_working_dir); + ret = mkdir_p (jnl->jnl_processed_dir, 0600, _gf_false); + if (ret) + goto out; + + /* .processing */ + (void) snprintf (jnl->jnl_processing_dir, PATH_MAX, + "%s/"GF_CHANGELOG_PROCESSING_DIR"/", + jnl->jnl_working_dir); + ret = recursive_rmdir (jnl->jnl_processing_dir); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to rmdir: %s, err: %s", + jnl->jnl_processing_dir, strerror (errno)); + goto out; + } + + ret = mkdir_p (jnl->jnl_processing_dir, 0600, _gf_false); + if (ret) + goto out; + + dir = opendir (jnl->jnl_processing_dir); + if (!dir) { + gf_log ("", GF_LOG_ERROR, + "opendir() error [reason: %s]", strerror (errno)); + goto out; } - return (ret) ? NULL : path; + jnl->jnl_dir = dir; + + (void) snprintf (tracker_path, PATH_MAX, + "%s/"GF_CHANGELOG_TRACKER, jnl->jnl_working_dir); + + tracker_fd = open (tracker_path, O_CREAT | O_APPEND | O_RDWR, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (tracker_fd < 0) { + closedir (jnl->jnl_dir); + ret = -1; + goto out; + } + + jnl->jnl_fd = tracker_fd; + ret = 0; + out: + return ret; +} + +int +gf_changelog_init_history (xlator_t *this, + gf_changelog_journal_t *jnl, + char *brick_path, char *scratch_dir) +{ + int i = 0; + int ret = 0; + char hist_scratch_dir[PATH_MAX] = {0,}; + + jnl->hist_jnl = GF_CALLOC (1, sizeof (*jnl), + gf_changelog_mt_libgfchangelog_t); + if (!jnl->hist_jnl) + goto error_return; + + jnl->hist_jnl->jnl_dir = NULL; + jnl->hist_jnl->jnl_fd = -1; + + (void) strncpy (hist_scratch_dir, scratch_dir, PATH_MAX); + (void) snprintf (hist_scratch_dir, PATH_MAX, + "%s/"GF_CHANGELOG_HISTORY_DIR"/", + jnl->jnl_working_dir); + + ret = mkdir_p (hist_scratch_dir, 0600, _gf_false); + if (ret) + goto dealloc_hist; + + jnl->hist_jnl->jnl_working_dir = realpath (hist_scratch_dir, NULL); + if (!jnl->hist_jnl->jnl_working_dir) + goto dealloc_hist; + + ret = gf_changelog_open_dirs (this, jnl->hist_jnl); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "could not create entries in history scratch dir"); + goto dealloc_hist; + } + + (void) strncpy (jnl->hist_jnl->jnl_brickpath, brick_path, PATH_MAX); + + for (i = 0; i < 256; i++) { + jnl->hist_jnl->rfc3986[i] = + (isalnum(i) || i == '~' || + i == '-' || i == '.' || i == '_') ? i : 0; + } + + return 0; + + dealloc_hist: + GF_FREE (jnl->hist_jnl); + jnl->hist_jnl = NULL; + error_return: + return -1; +} + +void +gf_changelog_journal_fini (void *xl, char *brick, void *data) +{ + int ret = 0; + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + + this = xl; + jnl = data; + + gf_changelog_cleanup_processor (jnl); + + gf_changelog_cleanup_fds (jnl); + if (jnl->hist_jnl) + gf_changelog_cleanup_fds (jnl->hist_jnl); + + GF_FREE (jnl); } void * -gf_changelog_process (void *data) +gf_changelog_journal_init (void *xl, struct gf_brick_spec *brick) { - ssize_t len = 0; - ssize_t offlen = 0; - xlator_t *this = NULL; - char *sbuf = NULL; - gf_changelog_t *gfc = NULL; - char from_path[PATH_MAX] = {0,}; - - gfc = (gf_changelog_t *) data; - this = gfc->this; - - pthread_detach (pthread_self()); - - for (;;) { - len = gf_changelog_read_path (gfc->gfc_sockfd, - from_path + offlen, - PATH_MAX - offlen); - if (len < 0) - continue; /* ignore it for now */ - - if (len == 0) { /* close() from the changelog translator */ - gf_log (this->name, GF_LOG_INFO, "close from changelog" - " notification translator."); - - if (gfc->gfc_connretries != 1) { - if (!gf_changelog_notification_init(this, gfc)) - continue; - } + int i = 0; + int ret = 0; + xlator_t *this = NULL; + struct stat buf = {0,}; + char *scratch_dir = NULL; + gf_changelog_journal_t *jnl = NULL; + + this = xl; + scratch_dir = (char *) brick->ptr; + + jnl = GF_CALLOC (1, sizeof (gf_changelog_journal_t), + gf_changelog_mt_libgfchangelog_t); + if (!jnl) + goto error_return; + + if (stat (scratch_dir, &buf) && errno == ENOENT) { + ret = mkdir_p (scratch_dir, 0600, _gf_true); + if (ret) + goto dealloc_private; + } - byebye = 1; - break; - } + jnl->jnl_working_dir = realpath (scratch_dir, NULL); + if (!jnl->jnl_working_dir) + goto dealloc_private; - len += offlen; - sbuf = gf_changelog_ext_change (this, gfc, from_path, len); - if (!sbuf) { - gf_log (this->name, GF_LOG_ERROR, - "could not extract changelog filename"); - continue; - } + ret = gf_changelog_open_dirs (this, jnl); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "could not create entries in scratch dir"); + goto dealloc_private; + } - offlen = 0; - if (sbuf != (from_path + len)) { - offlen = from_path + len - sbuf; - memmove (from_path, sbuf, offlen); - } + (void) strncpy (jnl->jnl_brickpath, brick->brick_path, PATH_MAX); + + /* RFC 3986 {de,en}coding */ + for (i = 0; i < 256; i++) { + jnl->rfc3986[i] = + (isalnum(i) || i == '~' || + i == '-' || i == '.' || i == '_') ? i : 0; } - gf_log (this->name, GF_LOG_DEBUG, - "byebye (%d) from processing thread...", byebye); + ret = gf_changelog_init_history (this, jnl, + brick->brick_path, scratch_dir); + if (ret) + goto cleanup_fds; + + /* initialize journal processor */ + ret = gf_changelog_init_processor (jnl); + if (ret) + goto cleanup_fds; + + JNL_SET_API_STATE (jnl, JNL_API_CONN_INPROGESS); + ret = pthread_spin_init (&jnl->lock, 0); + if (ret != 0) + goto cleanup_processor; + return jnl; + + cleanup_processor: + gf_changelog_cleanup_processor (jnl); + cleanup_fds: + gf_changelog_cleanup_fds (jnl); + if (jnl->hist_jnl) + gf_changelog_cleanup_fds (jnl->hist_jnl); + dealloc_private: + GF_FREE (jnl); + error_return: return NULL; } diff --git a/xlators/features/changelog/lib/src/gf-changelog-journal.h b/xlators/features/changelog/lib/src/gf-changelog-journal.h new file mode 100644 index 00000000000..2605ccda84e --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-journal.h @@ -0,0 +1,114 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __GF_CHANGELOG_JOURNAL_H +#define __GF_CHANGELOG_JOURNAL_H + +#include +#include + +#include "changelog.h" + +enum api_conn { + JNL_API_CONNECTED, + JNL_API_CONN_INPROGESS, + JNL_API_DISCONNECTED, +}; + +typedef struct gf_changelog_entry { + char path[PATH_MAX]; + + struct list_head list; +} gf_changelog_entry_t; + +typedef struct gf_changelog_processor { + pthread_mutex_t lock; /* protects ->entries */ + pthread_cond_t cond; /* waiter during empty list */ + gf_boolean_t waiting; + + pthread_t processor; /* thread-id of journal processing thread */ + + struct list_head entries; +} gf_changelog_processor_t; + +typedef struct gf_changelog_journal { + DIR *jnl_dir; /* 'processing' directory stream */ + + int jnl_fd; /* fd to the tracker file */ + + char jnl_brickpath[PATH_MAX]; /* brick path for this end-point */ + + gf_changelog_processor_t *jnl_proc; + + char *jnl_working_dir; /* scratch directory */ + + char jnl_current_dir[PATH_MAX]; + char jnl_processed_dir[PATH_MAX]; + char jnl_processing_dir[PATH_MAX]; + + char rfc3986[256]; /* RFC 3986 string encoding */ + + struct gf_changelog_journal *hist_jnl; + int hist_done; /* holds 0 done scanning, + 1 keep scanning and -1 error */ + + pthread_spinlock_t lock; + int connected; +} gf_changelog_journal_t; + +#define JNL_SET_API_STATE(jnl, state) jnl->connected = state +#define JNL_IS_API_DISCONNECTED(jnl) (jnl->connected == JNL_API_DISCONNECTED) + +/* History API */ +typedef struct gf_changelog_history_data { + int len; + + int htime_fd; + + /* parallelism count */ + int n_parallel; + + /* history from, to indexes */ + unsigned long from; + unsigned long to; +} gf_changelog_history_data_t; + +typedef struct gf_changelog_consume_data { + /** set of inputs */ + + /* fd to read from */ + int fd; + + /* from @offset */ + off_t offset; + + xlator_t *this; + + gf_changelog_journal_t *jnl; + + /** set of outputs */ + + /* return value */ + int retval; + + /* journal processed */ + char changelog[PATH_MAX]; +} gf_changelog_consume_data_t; + +/* event handler */ +CALLBACK gf_changelog_handle_journal; + +/* init, connect & disconnect handler */ +INIT gf_changelog_journal_init; +FINI gf_changelog_journal_fini; +CONNECT gf_changelog_journal_connect; +DISCONNECT gf_changelog_journal_disconnect; + +#endif diff --git a/xlators/features/changelog/lib/src/gf-changelog-reborp.c b/xlators/features/changelog/lib/src/gf-changelog-reborp.c new file mode 100644 index 00000000000..4805d979e58 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-reborp.c @@ -0,0 +1,380 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "changelog-misc.h" +#include "changelog-mem-types.h" + +#include "gf-changelog-helpers.h" +#include "changelog-rpc-common.h" + +/** + * Reverse socket: actual data transfer handler. Connection + * initiator is PROBER, data transfer is REBORP. + */ + +struct rpcsvc_program *gf_changelog_reborp_programs[]; + +/** + * On a reverse connection, unlink the socket file. + */ +int +gf_changelog_reborp_rpcsvc_notify (rpcsvc_t *rpc, void *mydata, + rpcsvc_event_t event, void *data) +{ + int ret = 0; + xlator_t *this = NULL; + gf_private_t *priv = NULL; + gf_changelog_t *entry = NULL; + char sock[UNIX_PATH_MAX] = {0,}; + + entry = mydata; + this = entry->this; + priv = this->private; + + switch (event) { + case RPCSVC_EVENT_ACCEPT: + ret = unlink (RPC_SOCK(entry)); + if (ret != 0) + gf_log (this->name, GF_LOG_WARNING, "failed to unlink" + " reverse socket file %s", RPC_SOCK (entry)); + if (entry->connected) + GF_CHANGELOG_INVOKE_CBK (this, entry->connected, + entry->brick, entry->ptr); + break; + case RPCSVC_EVENT_DISCONNECT: + LOCK (&priv->lock); + { + list_del (&entry->list); + } + UNLOCK (&priv->lock); + + if (entry->disconnected) + GF_CHANGELOG_INVOKE_CBK (this, entry->disconnected, + entry->brick, entry->ptr); + + GF_FREE (entry); + break; + default: + break; + } + + return 0; +} + +rpcsvc_t * +gf_changelog_reborp_init_rpc_listner (xlator_t *this, + char *path, char *sock, void *cbkdata) +{ + CHANGELOG_MAKE_TMP_SOCKET_PATH (path, sock, UNIX_PATH_MAX); + return changelog_rpc_server_init (this, sock, cbkdata, + gf_changelog_reborp_rpcsvc_notify, + gf_changelog_reborp_programs); +} + +/** + * This is dirty and painful as of now untill there is event filtering in the + * server. The entire event buffer is scanned and interested events are picked, + * whereas we _should_ be notified with the events we were interested in + * (selected at the time of probe). As of now this is complete BS and needs + * fixture ASAP. I just made it work, it needs to be better. + * + * @FIXME: cleanup this bugger once server filters events. + */ +inline void +gf_changelog_invoke_callback (gf_changelog_t *entry, + struct iovec **vec, int payloadcnt) +{ + int i = 0; + int evsize = 0; + xlator_t *this = NULL; + changelog_event_t *event = NULL; + + this = entry->this; + + for (; i < payloadcnt; i++) { + event = (changelog_event_t *)vec[i]->iov_base; + evsize = vec[i]->iov_len / CHANGELOG_EV_SIZE; + + for (; evsize > 0; evsize--, event++) { + if (gf_changelog_filter_check (entry, event)) { + GF_CHANGELOG_INVOKE_CBK (this, + entry->callback, + entry->brick, + entry->ptr, event); + } + } + } +} + +/** + * Ordered event handler is self-adaptive.. if the event sequence number + * is what's expected (->next_seq) there is no ordering list that's + * maintained. On out-of-order event notifications, event buffers are + * dynamically allocated and ordered. + */ + +inline int +__is_expected_sequence (struct gf_event_list *ev, struct gf_event *event) +{ + return (ev->next_seq == event->seq); +} + +inline int +__can_process_event (struct gf_event_list *ev, struct gf_event **event) +{ + *event = list_first_entry (&ev->events, struct gf_event, list); + + if (__is_expected_sequence (ev, *event)) { + list_del (&(*event)->list); + ev->next_seq++; + return 1; + } + + return 0; +} + +inline void +__process_event_list (struct gf_event_list *ev, struct gf_event **event) +{ + while ( list_empty (&ev->events) + || !__can_process_event (ev, event) ) + pthread_cond_wait (&ev->cond, &ev->lock); +} + +void * +gf_changelog_callback_invoker (void *arg) +{ + int ret = 0; + xlator_t *this = NULL; + gf_changelog_t *entry = NULL; + struct iovec *vec = NULL; + struct gf_event *event = NULL; + struct gf_event_list *ev = NULL; + + ev = arg; + entry = ev->entry; + this = entry->this; + + while (1) { + pthread_mutex_lock (&ev->lock); + { + __process_event_list (ev, &event); + } + pthread_mutex_unlock (&ev->lock); + + vec = (struct iovec *) &event->iov; + gf_changelog_invoke_callback (entry, &vec, event->count); + + GF_FREE (event); + } + + return NULL; +} + +static int +orderfn (struct list_head *pos1, struct list_head *pos2) +{ + struct gf_event *event1 = NULL; + struct gf_event *event2 = NULL; + + event1 = list_entry (pos1, struct gf_event, list); + event2 = list_entry (pos2, struct gf_event, list); + + if (event1->seq > event2->seq) + return 1; + return -1; +} + +int +gf_changelog_ordered_event_handler (rpcsvc_request_t *req, + xlator_t *this, gf_changelog_t *entry) +{ + int i = 0; + size_t payloadlen = 0; + ssize_t len = 0; + int payloadcnt = 0; + changelog_event_req rpc_req = {0,}; + changelog_event_rsp rpc_rsp = {0,}; + struct iovec *vec = NULL; + struct gf_event *event = NULL; + struct gf_event_list *ev = NULL; + + ev = &entry->event; + + len = xdr_to_generic (req->msg[0], + &rpc_req, (xdrproc_t)xdr_changelog_event_req); + if (len < 0) { + gf_log (this->name, GF_LOG_ERROR, "xdr decoding failed"); + req->rpc_err = GARBAGE_ARGS; + goto handle_xdr_error; + } + + if (len < req->msg[0].iov_len) { + payloadcnt = 1; + payloadlen = (req->msg[0].iov_len - len); + } + for (i = 1; i < req->count; i++) { + payloadcnt++; + payloadlen += req->msg[i].iov_len; + } + + event = GF_CALLOC (1, GF_EVENT_CALLOC_SIZE (payloadcnt, payloadlen), + gf_changelog_mt_libgfchangelog_event_t); + if (!event) + goto handle_xdr_error; + INIT_LIST_HEAD (&event->list); + + payloadlen = 0; + event->seq = rpc_req.seq; + event->count = payloadcnt; + + /* deep copy IO vectors */ + vec = &event->iov[0]; + GF_EVENT_ASSIGN_IOVEC (vec, event, + (req->msg[0].iov_len - len), payloadlen); + (void) memcpy (vec->iov_base, + req->msg[0].iov_base + len, vec->iov_len); + + for (i = 1; i < req->count; i++) { + vec = &event->iov[i]; + GF_EVENT_ASSIGN_IOVEC (vec, event, + req->msg[i].iov_len, payloadlen); + (void) memcpy (event->iov[i].iov_base, + req->msg[i].iov_base, req->msg[i].iov_len); + } + + gf_log (this->name, GF_LOG_DEBUG, + "seq: %lu [%s] (time: %lu.%lu), (vec: %d, len: %ld)", + rpc_req.seq, entry->brick, rpc_req.tv_sec, + rpc_req.tv_usec, payloadcnt, payloadlen); + + /* add it to the ordered event list and wake up listner(s) */ + pthread_mutex_lock (&ev->lock); + { + list_add_order (&event->list, &ev->events, orderfn); + if ( (!ev->next_seq && (ev->next_seq = event->seq)) + || (ev->next_seq == event->seq) ) + pthread_cond_signal (&ev->cond); + } + pthread_mutex_unlock (&ev->lock); + + /* ack sequence number */ + rpc_rsp.op_ret = 0; + rpc_rsp.seq = rpc_req.seq; + + goto submit_rpc; + + handle_xdr_error: + rpc_rsp.op_ret = -1; + rpc_rsp.seq = 0; /* invalid */ + submit_rpc: + return changelog_rpc_sumbit_reply (req, &rpc_rsp, NULL, 0, NULL, + (xdrproc_t)xdr_changelog_event_rsp); +} + +int +gf_changelog_unordered_event_handler (rpcsvc_request_t *req, + xlator_t *this, gf_changelog_t *entry) +{ + int i = 0; + int ret = 0; + ssize_t len = 0; + int payloadcnt = 0; + struct iovec vector[MAX_IOVEC] = {{0,}}; + changelog_event_req rpc_req = {0,}; + changelog_event_rsp rpc_rsp = {0,}; + + len = xdr_to_generic (req->msg[0], + &rpc_req, (xdrproc_t)xdr_changelog_event_req); + if (len < 0) { + gf_log (this->name, GF_LOG_ERROR, "xdr decoding failed"); + req->rpc_err = GARBAGE_ARGS; + goto handle_xdr_error; + } + + /* prepare payload */ + if (len < req->msg[0].iov_len) { + payloadcnt = 1; + vector[0].iov_base = (req->msg[0].iov_base + len); + vector[0].iov_len = (req->msg[0].iov_len - len); + } + + for (i = 1; i < req->count; i++) { + vector[payloadcnt++] = req->msg[i]; + } + + gf_log (this->name, GF_LOG_DEBUG, + "seq: %lu (time: %lu.%lu), (vec: %d)", + rpc_req.seq, rpc_req.tv_sec, rpc_req.tv_usec, payloadcnt); + + /* invoke callback */ + struct iovec *vec = (struct iovec *) &vector; + gf_changelog_invoke_callback (entry, &vec, payloadcnt); + + /* ack sequence number */ + rpc_rsp.op_ret = 0; + rpc_rsp.seq = rpc_req.seq; + + goto submit_rpc; + + handle_xdr_error: + rpc_rsp.op_ret = -1; + rpc_rsp.seq = 0; /* invalid */ + submit_rpc: + return changelog_rpc_sumbit_reply (req, &rpc_rsp, NULL, 0, NULL, + (xdrproc_t)xdr_changelog_event_rsp); +} + +int +gf_changelog_reborp_handle_event (rpcsvc_request_t *req) +{ + int ret = 0; + xlator_t *this = NULL; + rpcsvc_t *svc = NULL; + gf_changelog_t *entry = NULL; + + svc = rpcsvc_request_service (req); + entry = svc->mydata; + + this = THIS = entry->this; + + ret = GF_NEED_ORDERED_EVENTS (entry) + ? gf_changelog_ordered_event_handler (req, this, entry) + : gf_changelog_unordered_event_handler (req, this, entry); + + return ret; +} + +rpcsvc_actor_t gf_changelog_reborp_actors[CHANGELOG_REV_PROC_MAX] = { + [CHANGELOG_REV_PROC_EVENT] = { + "CHANGELOG EVENT HANDLER", CHANGELOG_REV_PROC_EVENT, + gf_changelog_reborp_handle_event, NULL, 0, DRC_NA + }, +}; + +/** + * Do not use synctask as the RPC layer dereferences ->mydata as THIS. + * In gf_changelog_setup_rpc(), @cbkdata is of type @gf_changelog_t, + * and that's required to invoke the callback with the appropriate + * brick path and it's private data. + */ +struct rpcsvc_program gf_changelog_reborp_prog = { + .progname = "LIBGFCHANGELOG REBORP", + .prognum = CHANGELOG_REV_RPC_PROCNUM, + .progver = CHANGELOG_REV_RPC_PROCVER, + .numactors = CHANGELOG_REV_PROC_MAX, + .actors = gf_changelog_reborp_actors, + .synctask = _gf_false, +}; + +struct rpcsvc_program *gf_changelog_reborp_programs[] = { + &gf_changelog_reborp_prog, + NULL, +}; diff --git a/xlators/features/changelog/lib/src/gf-changelog-rpc.c b/xlators/features/changelog/lib/src/gf-changelog-rpc.c new file mode 100644 index 00000000000..c2a4c044d23 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.c @@ -0,0 +1,105 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "gf-changelog-rpc.h" +#include "changelog-misc.h" +#include "changelog-mem-types.h" + +struct rpc_clnt_program gf_changelog_clnt; + +/* TODO: piggyback reconnect to called (upcall) */ +int +gf_changelog_rpc_notify (struct rpc_clnt *rpc, + void *mydata, rpc_clnt_event_t event, void *data) +{ + xlator_t *this = NULL; + + this = mydata; + + switch (event) { + case RPC_CLNT_CONNECT: + rpc_clnt_set_connected (&rpc->conn); + break; + case RPC_CLNT_DISCONNECT: + rpc_clnt_unset_connected (&rpc->conn); + break; + case RPC_CLNT_MSG: + case RPC_CLNT_DESTROY: + break; + } + + return 0; +} + +struct rpc_clnt * +gf_changelog_rpc_init (xlator_t *this, gf_changelog_t *entry) +{ + char sockfile[UNIX_PATH_MAX] = {0,}; + + CHANGELOG_MAKE_SOCKET_PATH (entry->brick, + sockfile, UNIX_PATH_MAX); + return changelog_rpc_client_init (this, entry, + sockfile, gf_changelog_rpc_notify); +} + +/** + * remote procedure calls declarations. + */ + +int +gf_probe_changelog_cbk (struct rpc_req *req, + struct iovec *iovec, int count, void *myframe) +{ + return 0; +} + +int +gf_probe_changelog_filter (call_frame_t *frame, xlator_t *this, void *data) +{ + int ret = 0; + char *sock = NULL; + gf_changelog_t *entry = NULL; + changelog_probe_req req = {0,}; + + entry = data; + sock = RPC_SOCK (entry); + + (void) memcpy (&req.sock, sock, strlen (sock)); + req.filter = entry->notify; + + /* invoke RPC */ + return changelog_rpc_sumbit_req (RPC_PROBER (entry), (void *) &req, + frame, &gf_changelog_clnt, + CHANGELOG_RPC_PROBE_FILTER, NULL, 0, + NULL, this, gf_probe_changelog_cbk, + (xdrproc_t) xdr_changelog_probe_req); +} + +int +gf_changelog_invoke_rpc (xlator_t *this, gf_changelog_t *entry, int procidx) +{ + return changelog_invoke_rpc (this, RPC_PROBER (entry), + &gf_changelog_clnt, procidx, entry); +} + +struct rpc_clnt_procedure gf_changelog_procs[CHANGELOG_RPC_PROC_MAX] = { + [CHANGELOG_RPC_PROC_NULL] = {"NULL", NULL}, + [CHANGELOG_RPC_PROBE_FILTER] = { + "PROBE FILTER", gf_probe_changelog_filter + }, +}; + +struct rpc_clnt_program gf_changelog_clnt = { + .progname = "LIBGFCHANGELOG", + .prognum = CHANGELOG_RPC_PROGNUM, + .progver = CHANGELOG_RPC_PROGVER, + .numproc = CHANGELOG_RPC_PROC_MAX, + .proctable = gf_changelog_procs, +}; diff --git a/xlators/features/changelog/src/changelog-notifier.h b/xlators/features/changelog/lib/src/gf-changelog-rpc.h similarity index 50% rename from xlators/features/changelog/src/changelog-notifier.h rename to xlators/features/changelog/lib/src/gf-changelog-rpc.h index 55e728356e6..f551fe69497 100644 --- a/xlators/features/changelog/src/changelog-notifier.h +++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.h @@ -8,12 +8,19 @@ cases as published by the Free Software Foundation. */ -#ifndef _CHANGELOG_NOTIFIER_H -#define _CHANGELOG_NOTIFIER_H +#ifndef __GF_CHANGELOG_RPC_H +#define __GF_CHANGELOG_RPC_H -#include "changelog-helpers.h" +#include "xlator.h" -void * -changelog_notifier (void *data); +#include "gf-changelog-helpers.h" +#include "changelog-rpc-common.h" + +struct rpc_clnt * gf_changelog_rpc_init (xlator_t *, gf_changelog_t *); + +int gf_changelog_invoke_rpc (xlator_t *, gf_changelog_t *, int); + +rpcsvc_t * +gf_changelog_reborp_init_rpc_listner (xlator_t *, char *, char *, void *); #endif diff --git a/xlators/features/changelog/lib/src/gf-changelog.c b/xlators/features/changelog/lib/src/gf-changelog.c index 18eeac3e1a4..8f33eb01013 100644 --- a/xlators/features/changelog/lib/src/gf-changelog.c +++ b/xlators/features/changelog/lib/src/gf-changelog.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #ifndef _GNU_SOURCE #define _GNU_SOURCE @@ -24,584 +26,523 @@ #include "glusterfs.h" #include "logging.h" #include "defaults.h" +#include "syncop.h" +#include "gf-changelog-rpc.h" #include "gf-changelog-helpers.h" /* from the changelog translator */ #include "changelog-misc.h" #include "changelog-mem-types.h" -int byebye = 0; +/** + * Global singleton xlator pointer for the library, initialized + * during library load. This should probably be hidden inside + * an initialized object which is an handle for the consumer. + * + * TODO: do away with the global.. + */ +xlator_t *master = NULL; -static void -gf_changelog_cleanup (gf_changelog_t *gfc) +static inline +gf_private_t *gf_changelog_alloc_priv () { - /* socket */ - if (gfc->gfc_sockfd != -1) - close (gfc->gfc_sockfd); - /* tracker fd */ - if (gfc->gfc_fd != -1) - close (gfc->gfc_fd); - /* processing dir */ - if (gfc->gfc_dir) - closedir (gfc->gfc_dir); - - if (gfc->gfc_working_dir) - free (gfc->gfc_working_dir); /* allocated by realpath */ -} + int ret = 0; + gf_private_t *priv = NULL; -void -__attribute__ ((constructor)) gf_changelog_ctor (void) -{ - glusterfs_ctx_t *ctx = NULL; + priv = calloc (1, sizeof (gf_private_t)); + if (!priv) + goto error_return; + INIT_LIST_HEAD (&priv->connections); - ctx = glusterfs_ctx_new (); - if (!ctx) - return; + ret = LOCK_INIT (&priv->lock); + if (ret != 0) + goto free_priv; + priv->api = NULL; - if (glusterfs_globals_init (ctx)) { - free (ctx); - ctx = NULL; - return; - } + return priv; - THIS->ctx = ctx; - if (xlator_mem_acct_init (THIS, gf_changelog_mt_end)) - return; + free_priv: + free (priv); + error_return: + return NULL; } -void -__attribute__ ((destructor)) gf_changelog_dtor (void) -{ - xlator_t *this = NULL; - glusterfs_ctx_t *ctx = NULL; - gf_changelog_t *gfc = NULL; +#define GF_CHANGELOG_EVENT_POOL_SIZE 16384 +#define GF_CHANGELOG_EVENT_THREAD_COUNT 4 - this = THIS; - if (!this) - return; - - ctx = this->ctx; - gfc = this->private; - - if (gfc) { - if (gfc->hist_gfc) { - gf_changelog_cleanup(gfc->hist_gfc); - GF_FREE (gfc->hist_gfc); - } - gf_changelog_cleanup (gfc); - GF_FREE (gfc); +static int +gf_changelog_ctx_defaults_init (glusterfs_ctx_t *ctx) +{ + cmd_args_t *cmd_args = NULL; + struct rlimit lim = {0, }; + call_pool_t *pool = NULL; + int ret = -1; + + ret = xlator_mem_acct_init (THIS, gf_changelog_mt_end); + if (ret != 0) { + return ret; } - if (ctx) { - pthread_mutex_destroy (&ctx->lock); - free (ctx); - ctx = NULL; - } -} + ctx->process_uuid = generate_glusterfs_ctx_id (); + if (!ctx->process_uuid) + return -1; + ctx->page_size = 128 * GF_UNIT_KB; -static int -gf_changelog_open_dirs (gf_changelog_t *gfc) -{ - int ret = -1; - DIR *dir = NULL; - int tracker_fd = 0; - char tracker_path[PATH_MAX] = {0,}; - xlator_t *this = NULL; + ctx->iobuf_pool = iobuf_pool_new (); + if (!ctx->iobuf_pool) + return -1; - this = THIS; - GF_ASSERT (this); + ctx->event_pool = event_pool_new (GF_CHANGELOG_EVENT_POOL_SIZE, + GF_CHANGELOG_EVENT_THREAD_COUNT); + if (!ctx->event_pool) + return -1; - (void) snprintf (gfc->gfc_current_dir, PATH_MAX, - "%s/"GF_CHANGELOG_CURRENT_DIR"/", - gfc->gfc_working_dir); + pool = GF_CALLOC (1, sizeof (call_pool_t), + gf_changelog_mt_libgfchangelog_call_pool_t); + if (!pool) + return -1; - ret = recursive_rmdir (gfc->gfc_current_dir); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to rmdir: %s, err: %s", - gfc->gfc_current_dir, strerror (errno)); - goto out; - } + /* frame_mem_pool size 112 * 64 */ + pool->frame_mem_pool = mem_pool_new (call_frame_t, 32); + if (!pool->frame_mem_pool) + return -1; - ret = mkdir_p (gfc->gfc_current_dir, 0600, _gf_false); - if (ret) - goto out; + /* stack_mem_pool size 256 * 128 */ + pool->stack_mem_pool = mem_pool_new (call_stack_t, 16); - (void) snprintf (gfc->gfc_processed_dir, PATH_MAX, - "%s/"GF_CHANGELOG_PROCESSED_DIR"/", - gfc->gfc_working_dir); + if (!pool->stack_mem_pool) + return -1; - ret = mkdir_p (gfc->gfc_processed_dir, 0600, _gf_false); - if (ret) - goto out; + ctx->stub_mem_pool = mem_pool_new (call_stub_t, 16); + if (!ctx->stub_mem_pool) + return -1; - (void) snprintf (gfc->gfc_processing_dir, PATH_MAX, - "%s/"GF_CHANGELOG_PROCESSING_DIR"/", - gfc->gfc_working_dir); + ctx->dict_pool = mem_pool_new (dict_t, 32); + if (!ctx->dict_pool) + return -1; - ret = recursive_rmdir (gfc->gfc_processing_dir); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to rmdir: %s, err: %s", - gfc->gfc_processing_dir, strerror (errno)); - goto out; - } + ctx->dict_pair_pool = mem_pool_new (data_pair_t, 512); + if (!ctx->dict_pair_pool) + return -1; - ret = mkdir_p (gfc->gfc_processing_dir, 0600, _gf_false); - if (ret) - goto out; + ctx->dict_data_pool = mem_pool_new (data_t, 512); + if (!ctx->dict_data_pool) + return -1; - dir = opendir (gfc->gfc_processing_dir); - if (!dir) { - gf_log ("", GF_LOG_ERROR, - "opendir() error [reason: %s]", strerror (errno)); - goto out; - } + INIT_LIST_HEAD (&pool->all_frames); + LOCK_INIT (&pool->lock); + ctx->pool = pool; - gfc->gfc_dir = dir; + pthread_mutex_init (&(ctx->lock), NULL); - (void) snprintf (tracker_path, PATH_MAX, - "%s/"GF_CHANGELOG_TRACKER, gfc->gfc_working_dir); + cmd_args = &ctx->cmd_args; - tracker_fd = open (tracker_path, O_CREAT | O_APPEND | O_RDWR, - S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); - if (tracker_fd < 0) { - closedir (gfc->gfc_dir); - ret = -1; - goto out; - } + INIT_LIST_HEAD (&cmd_args->xlator_options); + + lim.rlim_cur = RLIM_INFINITY; + lim.rlim_max = RLIM_INFINITY; + setrlimit (RLIMIT_CORE, &lim); - gfc->gfc_fd = tracker_fd; - ret = 0; - out: - return ret; + return 0; } -int -gf_changelog_notification_init (xlator_t *this, gf_changelog_t *gfc) +/* TODO: cleanup ctx defaults */ +static void +gf_changelog_cleanup_this (xlator_t *this) { - int ret = 0; - int len = 0; - int tries = 0; - int sockfd = 0; - struct sockaddr_un remote; - - this = gfc->this; + glusterfs_ctx_t *ctx = NULL; - if (gfc->gfc_sockfd != -1) { - gf_log (this->name, GF_LOG_INFO, - "Reconnecting..."); - close (gfc->gfc_sockfd); - } + if (!this) + return; - sockfd = socket (AF_UNIX, SOCK_STREAM, 0); - if (sockfd < 0) { - ret = -1; - goto out; - } + ctx = this->ctx; + syncenv_destroy (ctx->env); + free (ctx); - CHANGELOG_MAKE_SOCKET_PATH (gfc->gfc_brickpath, - gfc->gfc_sockpath, UNIX_PATH_MAX); - gf_log (this->name, GF_LOG_INFO, - "connecting to changelog socket: %s (brick: %s)", - gfc->gfc_sockpath, gfc->gfc_brickpath); + if (this->private) + free (this->private); - remote.sun_family = AF_UNIX; - strcpy (remote.sun_path, gfc->gfc_sockpath); + this->private = NULL; + this->ctx = NULL; +} - len = strlen (remote.sun_path) + sizeof (remote.sun_family); +static int +gf_changelog_init_this () +{ + glusterfs_ctx_t *ctx = NULL; - while (tries < gfc->gfc_connretries) { - gf_log (this->name, GF_LOG_WARNING, - "connection attempt %d/%d...", - tries + 1, gfc->gfc_connretries); + ctx = glusterfs_ctx_new (); + if (!ctx) + goto error_return; - /* initiate a connect */ - if (connect (sockfd, (struct sockaddr *) &remote, len) == 0) { - gfc->gfc_sockfd = sockfd; - break; - } + if (glusterfs_globals_init (ctx)) + goto free_ctx; - tries++; - sleep (2); - } + THIS->ctx = ctx; + if (gf_changelog_ctx_defaults_init (ctx)) + goto free_ctx; - if (tries == gfc->gfc_connretries) { - gf_log (this->name, GF_LOG_ERROR, - "could not connect to changelog socket!" - " bailing out..."); - close (sockfd); - ret = -1; - } else - gf_log (this->name, GF_LOG_INFO, - "connection successful"); + ctx->env = syncenv_new (0, 0, 0); + if (!ctx->env) + goto free_ctx; + return 0; - out: - return ret; + free_ctx: + free (ctx); + THIS->ctx = NULL; + error_return: + return -1; } -int -gf_changelog_done (char *file) +static int +gf_changelog_init_master () { - int ret = -1; - char *buffer = NULL; - xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; - char to_path[PATH_MAX] = {0,}; - - errno = EINVAL; - - this = THIS; - if (!this) - goto out; - - gfc = (gf_changelog_t *) this->private; - if (!gfc) - goto out; + int ret = 0; + gf_private_t *priv = NULL; + glusterfs_ctx_t *ctx = NULL; - if (!file || !strlen (file)) - goto out; + ret = gf_changelog_init_this (); + if (ret != 0) + goto error_return; + master = THIS; + + priv = gf_changelog_alloc_priv (); + if (!priv) + goto cleanup_master; + master->private = priv; + + /* poller thread */ + ret = pthread_create (&priv->poller, + NULL, changelog_rpc_poller, master); + if (ret != 0) { + gf_log (master->name, GF_LOG_ERROR, + "failed to spawn poller thread"); + goto cleanup_master; + } - /* make sure 'file' is inside ->gfc_working_dir */ - buffer = realpath (file, NULL); - if (!buffer) - goto out; + return 0; - if (strncmp (gfc->gfc_working_dir, - buffer, strlen (gfc->gfc_working_dir))) - goto out; + cleanup_master: + master->private = NULL; + gf_changelog_cleanup_this (master); + error_return: + return -1; +} - (void) snprintf (to_path, PATH_MAX, "%s%s", - gfc->gfc_processed_dir, basename (buffer)); - gf_log (this->name, GF_LOG_DEBUG, - "moving %s to processed directory", file); - ret = rename (buffer, to_path); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "cannot move %s to %s (reason: %s)", - file, to_path, strerror (errno)); - goto out; - } +/* ctor/dtor */ - ret = 0; +void +__attribute__ ((constructor)) gf_changelog_ctor (void) +{ + (void) gf_changelog_init_master (); +} - out: - if (buffer) - free (buffer); /* allocated by realpath() */ - return ret; +void +__attribute__ ((destructor)) gf_changelog_dtor (void) +{ + gf_changelog_cleanup_this (master); } -/** - * @API - * for a set of changelogs, start from the beginning - */ +/* TODO: cleanup clnt/svc on failure */ int -gf_changelog_start_fresh () +gf_changelog_setup_rpc (xlator_t *this, + gf_changelog_t *entry, int proc) { - xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; + int ret = 0; + rpcsvc_t *svc = NULL; + struct rpc_clnt *rpc = NULL; - this = THIS; - if (!this) - goto out; - - errno = EINVAL; + /** + * Initialize a connect back socket. A probe() RPC call to the server + * triggers a reverse connect. + */ + svc = gf_changelog_reborp_init_rpc_listner (this, entry->brick, + RPC_SOCK (entry), entry); + if (!svc) + goto error_return; + RPC_REBORP (entry) = svc; + + /* Initialize an RPC client */ + rpc = gf_changelog_rpc_init (this, entry); + if (!rpc) + goto error_return; + RPC_PROBER (entry) = rpc; - gfc = (gf_changelog_t *) this->private; - if (!gfc) - goto out; + /** + * @FIXME + * till we have connection state machine, let's delay the RPC call + * for now.. + */ + sleep (2); - if (gf_ftruncate (gfc->gfc_fd, 0)) - goto out; + /** + * Probe changelog translator for reverse connection. After a successful + * call, there's less use of the client and can be disconnected, but + * let's leave the connection active for any future RPC calls. + */ + ret = gf_changelog_invoke_rpc (this, entry, proc); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not initiate probe RPC, bailing out!!!"); + goto error_return; + } return 0; - out: + error_return: return -1; } -/** - * @API - * return the next changelog file entry. zero means all chanelogs - * consumed. - */ -ssize_t -gf_changelog_next_change (char *bufptr, size_t maxlen) +static void +gf_cleanup_event (gf_changelog_t *entry) { - ssize_t size = -1; - int tracker_fd = 0; - xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; - char buffer[PATH_MAX] = {0,}; + xlator_t *this = NULL; + struct gf_event_list *ev = NULL; - errno = EINVAL; + this = entry->this; + ev = &entry->event; - this = THIS; - if (!this) - goto out; + (void) gf_thread_cleanup (this, ev->invoker); - gfc = (gf_changelog_t *) this->private; - if (!gfc) - goto out; + (void) pthread_mutex_destroy (&ev->lock); + (void) pthread_cond_destroy (&ev->cond); - tracker_fd = gfc->gfc_fd; + ev->entry = NULL; +} - size = gf_readline (tracker_fd, buffer, maxlen); - if (size < 0) { - size = -1; - goto out; +static int +gf_init_event (gf_changelog_t *entry) +{ + int ret = 0; + struct gf_event_list *ev = NULL; + + ev = &entry->event; + ev->entry = entry; + + ret = pthread_mutex_init (&ev->lock, NULL); + if (ret != 0) + goto error_return; + ret = pthread_cond_init (&ev->cond, NULL); + if (ret != 0) + goto cleanup_mutex; + INIT_LIST_HEAD (&ev->events); + + ev->next_seq = 0; /* bootstrap sequencing */ + + if (entry->ordered) { + ret = pthread_create (&ev->invoker, NULL, + gf_changelog_callback_invoker, ev); + if (ret != 0) + goto cleanup_cond; } - if (size == 0) - goto out; - - memcpy (bufptr, buffer, size - 1); - bufptr[size - 1] = '\0'; + return 0; -out: - return size; + cleanup_cond: + (void) pthread_cond_destroy (&ev->cond); + cleanup_mutex: + (void) pthread_mutex_destroy (&ev->lock); + error_return: + return -1; } /** - * @API - * gf_changelog_scan() - scan and generate a list of change entries - * - * calling this api multiple times (without calling gf_changlog_done()) - * would result new changelogs(s) being refreshed in the tracker file. - * This call also acts as a cancellation point for the consumer. + * TODO: + * - cleanup invoker thread (if ordered mode) + * - cleanup event list + * - destroy rpc{-clnt, svc} */ -ssize_t -gf_changelog_scan () +int +gf_cleanup_brick_connection (xlator_t *this, gf_changelog_t *entry) { - int ret = 0; - int tracker_fd = 0; - size_t len = 0; - size_t off = 0; - xlator_t *this = NULL; - size_t nr_entries = 0; - gf_changelog_t *gfc = NULL; - struct dirent *entryp = NULL; - struct dirent *result = NULL; - char buffer[PATH_MAX] = {0,}; - - this = THIS; - if (!this) - goto out; + return 0; +} - gfc = (gf_changelog_t *) this->private; - if (!gfc) - goto out; +int +gf_cleanup_connections (xlator_t *this) +{ + return 0; +} - /** - * do we need to protect 'byebye' with locks? worst, the - * consumer would get notified during next scan(). - */ - if (byebye) { - errno = ECONNREFUSED; - goto out; - } +static int +gf_setup_brick_connection (xlator_t *this, + struct gf_brick_spec *brick, + gf_boolean_t ordered, void *xl) +{ + int ret = 0; + gf_private_t *priv = NULL; + gf_changelog_t *entry = NULL; - errno = EINVAL; + priv = this->private; - tracker_fd = gfc->gfc_fd; + if (!brick->callback || !brick->init || !brick->fini) + goto error_return; - if (gf_ftruncate (tracker_fd, 0)) - goto out; + entry = GF_CALLOC (1, sizeof (*entry), + gf_changelog_mt_libgfchangelog_t); + if (!entry) + goto error_return; + INIT_LIST_HEAD (&entry->list); - len = offsetof(struct dirent, d_name) - + pathconf(gfc->gfc_processing_dir, _PC_NAME_MAX) + 1; - entryp = GF_CALLOC (1, len, - gf_changelog_mt_libgfchangelog_dirent_t); - if (!entryp) - goto out; + entry->notify = brick->filter; + (void) strncpy (entry->brick, brick->brick_path, PATH_MAX); - rewinddir (gfc->gfc_dir); - while (1) { - ret = readdir_r (gfc->gfc_dir, entryp, &result); - if (ret || !result) - break; + entry->this = this; + entry->invokerxl = xl; - if ( !strcmp (basename (entryp->d_name), ".") - || !strcmp (basename (entryp->d_name), "..") ) - continue; + entry->ordered = ordered; + if (ordered) { + ret = gf_init_event (entry); + if (ret) + goto free_entry; + } - nr_entries++; + entry->fini = brick->fini; + entry->callback = brick->callback; + entry->connected = brick->connected; + entry->disconnected = brick->disconnected; - GF_CHANGELOG_FILL_BUFFER (gfc->gfc_processing_dir, - buffer, off, - strlen (gfc->gfc_processing_dir)); - GF_CHANGELOG_FILL_BUFFER (entryp->d_name, buffer, - off, strlen (entryp->d_name)); - GF_CHANGELOG_FILL_BUFFER ("\n", buffer, off, 1); + entry->ptr = brick->init (this, brick); + if (!entry->ptr) + goto cleanup_event; + priv->api = entry->ptr; /* pointer to API, if required */ - if (gf_changelog_write (tracker_fd, buffer, off) != off) { - gf_log (this->name, GF_LOG_ERROR, - "error writing changelog filename" - " to tracker file"); - break; - } - off = 0; + LOCK (&priv->lock); + { + list_add_tail (&entry->list, &priv->connections); } + UNLOCK (&priv->lock); - GF_FREE (entryp); + ret = gf_changelog_setup_rpc (this, entry, CHANGELOG_RPC_PROBE_FILTER); + if (ret) + goto cleanup_event; + return 0; - if (!result) { - if (gf_lseek (tracker_fd, 0, SEEK_SET) != -1) - return nr_entries; - } - out: + cleanup_event: + if (ordered) + gf_cleanup_event (entry); + free_entry: + list_del (&entry->list); /* FIXME: kludge for now */ + GF_FREE (entry); + error_return: return -1; } -/** - * @API - * gf_changelog_register() - register a client for updates. - */ int -gf_changelog_register (char *brick_path, char *scratch_dir, - char *log_file, int log_level, int max_reconnects) +gf_changelog_register_brick (xlator_t *this, + struct gf_brick_spec *brick, + gf_boolean_t ordered, void *xl) { - int i = 0; - int ret = -1; - int errn = 0; - xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; - char hist_scratch_dir[PATH_MAX] = {0,}; - struct stat buf = {0,}; - - this = THIS; - if (!this->ctx) - goto out; - - errno = ENOMEM; - - gfc = GF_CALLOC (1, sizeof (*gfc), - gf_changelog_mt_libgfchangelog_t); - if (!gfc) - goto out; - - gfc->this = this; - - gfc->gfc_dir = NULL; - gfc->gfc_fd = gfc->gfc_sockfd = -1; - - if (stat (scratch_dir, &buf) && errno == ENOENT) { - ret = mkdir_p (scratch_dir, 0600, _gf_true); - if (ret) { - errn = errno; - goto cleanup; - } - } - - gfc->gfc_working_dir = realpath (scratch_dir, NULL); - if (!gfc->gfc_working_dir) { - errn = errno; - goto cleanup; - } + return gf_setup_brick_connection (this, brick, ordered, xl); +} - /* Begin: Changes for History API */ - gfc->hist_gfc = NULL; +static int +gf_changelog_setup_logging (xlator_t *this, char *logfile, int loglevel) +{ + /* passing ident as NULL means to use default ident for syslog */ + if (gf_log_init (this->ctx, logfile, NULL)) + return -1; - gfc->hist_gfc = GF_CALLOC (1, sizeof (*gfc), - gf_changelog_mt_libgfchangelog_t); - if (!gfc->hist_gfc) - goto cleanup; + gf_log_set_loglevel ((loglevel == -1) ? GF_LOG_INFO : + loglevel); + return 0; +} - gfc->hist_gfc->gfc_dir = NULL; - gfc->hist_gfc->gfc_fd = gfc->hist_gfc->gfc_sockfd = -1; - gfc->hist_gfc->this = NULL; +int +gf_changelog_register_generic (struct gf_brick_spec *bricks, int count, + int ordered, char *logfile, int lvl, void *xl) +{ + int ret = 0; + xlator_t *this = NULL; + xlator_t *old_this = NULL; + struct gf_brick_spec *brick = NULL; + gf_boolean_t need_order = _gf_false; - (void) strncpy (hist_scratch_dir, scratch_dir, PATH_MAX); - (void) snprintf (hist_scratch_dir, PATH_MAX, - "%s/"GF_CHANGELOG_HISTORY_DIR"/", - gfc->gfc_working_dir); + SAVE_THIS (xl); - ret = mkdir_p (hist_scratch_dir, 0600, _gf_false); - if (ret) { - errn = errno; - goto cleanup; - } + this = THIS; + if (!this) + goto error_return; - gfc->hist_gfc->gfc_working_dir = realpath (hist_scratch_dir, NULL); - if (!gfc->hist_gfc->gfc_working_dir) { - errn = errno; - goto cleanup; - } + ret = gf_changelog_setup_logging (this, logfile, lvl); + if (ret) + goto error_return; - ret = gf_changelog_open_dirs (gfc->hist_gfc); - if (ret) { - errn = errno; - gf_log (this->name, GF_LOG_ERROR, - "could not create entries in history scratch dir"); - goto cleanup; - } + need_order = (ordered) ? _gf_true : _gf_false; - (void) strncpy (gfc->hist_gfc->gfc_brickpath, brick_path, PATH_MAX); + brick = bricks; + while (count--) { + gf_log (this->name, GF_LOG_INFO, + "Registering brick: %s [notify filter: %d]", + brick->brick_path, brick->filter); - for (i=0; i < 256; i++) { - gfc->hist_gfc->rfc3986[i] = - (isalnum(i) || i == '~' || - i == '-' || i == '.' || i == '_') ? i : 0; - } - /* End: Changes for History API*/ + ret = gf_changelog_register_brick (this, brick, need_order, xl); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Error registering with changelog xlator"); + break; + } - ret = gf_changelog_open_dirs (gfc); - if (ret) { - errn = errno; - gf_log (this->name, GF_LOG_ERROR, - "could not create entries in scratch dir"); - goto cleanup; + brick++; } - /* passing ident as NULL means to use default ident for syslog */ - if (gf_log_init (this->ctx, log_file, NULL)) - goto cleanup; - - gf_log_set_loglevel ((log_level == -1) ? GF_LOG_INFO : - log_level); + if (ret != 0) + goto cleanup_inited_bricks; - gfc->gfc_connretries = (max_reconnects <= 0) ? 1 : max_reconnects; - (void) strncpy (gfc->gfc_brickpath, brick_path, PATH_MAX); + RESTORE_THIS(); + return 0; - ret = gf_changelog_notification_init (this, gfc); - if (ret) { - errn = errno; - goto cleanup; - } + cleanup_inited_bricks: + gf_cleanup_connections (this); + error_return: + RESTORE_THIS(); + return -1; +} - ret = gf_thread_create (&gfc->gfc_changelog_processor, - NULL, gf_changelog_process, gfc); - if (ret) { - errn = errno; - gf_log (this->name, GF_LOG_ERROR, - "error creating changelog processor thread" - " new changes won't be recorded!!!"); - goto cleanup; - } +/** + * @API + * gf_changelog_register() + * + * This is _NOT_ a generic register API. It's a special API to handle + * updates at a journal granulality. This is used by consumers wanting + * to process persistent journal such as geo-replication via a set of + * APIs. All of this is required to maintain backward compatibility. + * Owner specific private data is stored in ->api (in gf_private_t), + * which is used by APIs to access it's private data. This limits + * the library access to a single brick, but that's how it used to + * be anyway. Furthermore, this API solely _owns_ "this", therefore + * callers already having a notion of "this" are expected to use the + * newer API. + * + * Newer applications wanting to use this library need not face this + * limitation and reply of the much more feature rich generic register + * API, which is purely callback based. + * + * NOTE: @max_reconnects is not used but required for backward compat. + * + * For generic API, refer gf_changelog_register_generic(). + */ +int +gf_changelog_register (char *brick_path, char *scratch_dir, + char *log_file, int log_level, int max_reconnects) +{ + struct gf_brick_spec brick = {0,}; - for (i=0; i < 256; i++) { - gfc->rfc3986[i] = - (isalnum(i) || i == '~' || - i == '-' || i == '.' || i == '_') ? i : 0; - } + THIS = master; - ret = 0; - this->private = gfc; + brick.brick_path = brick_path; + brick.filter = CHANGELOG_OP_TYPE_JOURNAL; - goto out; + brick.init = gf_changelog_journal_init; + brick.fini = gf_changelog_journal_fini; + brick.callback = gf_changelog_handle_journal; + brick.connected = gf_changelog_journal_connect; + brick.disconnected = gf_changelog_journal_disconnect; - cleanup: - if (gfc->hist_gfc) { - gf_changelog_cleanup (gfc->hist_gfc); - GF_FREE (gfc->hist_gfc); - } - gf_changelog_cleanup (gfc); - GF_FREE (gfc); - this->private = NULL; - errno = errn; + brick.ptr = scratch_dir; - out: - return ret; + return gf_changelog_register_generic (&brick, 1, 1, + log_file, log_level, NULL); } diff --git a/xlators/features/changelog/lib/src/gf-history-changelog.c b/xlators/features/changelog/lib/src/gf-history-changelog.c index 8a527dd6e4b..60704af011a 100644 --- a/xlators/features/changelog/lib/src/gf-history-changelog.c +++ b/xlators/features/changelog/lib/src/gf-history-changelog.c @@ -14,6 +14,7 @@ #include "syscall.h" #include "gf-changelog-helpers.h" +#include "gf-changelog-journal.h" /* from the changelog translator */ #include "changelog-misc.h" @@ -36,12 +37,12 @@ int gf_history_changelog_done (char *file) { - int ret = -1; - char *buffer = NULL; - xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; - gf_changelog_t *hist_gfc = NULL; - char to_path[PATH_MAX] = {0,}; + int ret = -1; + char *buffer = NULL; + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_journal_t *hist_jnl = NULL; + char to_path[PATH_MAX] = {0,}; errno = EINVAL; @@ -49,28 +50,28 @@ gf_history_changelog_done (char *file) if (!this) goto out; - gfc = (gf_changelog_t *) this->private; - if (!gfc) + jnl = (gf_changelog_journal_t *) GF_CHANGELOG_GET_API_PTR (this); + if (!jnl) goto out; - hist_gfc = gfc->hist_gfc; - if (!hist_gfc) + hist_jnl = jnl->hist_jnl; + if (!hist_jnl) goto out; if (!file || !strlen (file)) goto out; - /* make sure 'file' is inside ->gfc_working_dir */ + /* make sure 'file' is inside ->jnl_working_dir */ buffer = realpath (file, NULL); if (!buffer) goto out; - if (strncmp (hist_gfc->gfc_working_dir, - buffer, strlen (hist_gfc->gfc_working_dir))) + if (strncmp (hist_jnl->jnl_working_dir, + buffer, strlen (hist_jnl->jnl_working_dir))) goto out; (void) snprintf (to_path, PATH_MAX, "%s%s", - hist_gfc->gfc_processed_dir, basename (buffer)); + hist_jnl->jnl_processed_dir, basename (buffer)); gf_log (this->name, GF_LOG_DEBUG, "moving %s to processed directory", file); ret = rename (buffer, to_path); @@ -102,9 +103,9 @@ gf_history_changelog_done (char *file) int gf_history_changelog_start_fresh () { - xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; - gf_changelog_t *hist_gfc = NULL; + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_journal_t *hist_jnl = NULL; this = THIS; if (!this) @@ -112,15 +113,15 @@ gf_history_changelog_start_fresh () errno = EINVAL; - gfc = (gf_changelog_t *) this->private; - if (!gfc) + jnl = (gf_changelog_journal_t *) GF_CHANGELOG_GET_API_PTR (this); + if (!jnl) goto out; - hist_gfc = gfc->hist_gfc; - if (!hist_gfc) + hist_jnl = jnl->hist_jnl; + if (!hist_jnl) goto out; - if (gf_ftruncate (hist_gfc->gfc_fd, 0)) + if (gf_ftruncate (hist_jnl->jnl_fd, 0)) goto out; return 0; @@ -147,12 +148,12 @@ gf_history_changelog_start_fresh () ssize_t gf_history_changelog_next_change (char *bufptr, size_t maxlen) { - ssize_t size = -1; - int tracker_fd = 0; - xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; - gf_changelog_t *hist_gfc = NULL; - char buffer[PATH_MAX] = {0,}; + ssize_t size = -1; + int tracker_fd = 0; + xlator_t *this = NULL; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_journal_t *hist_jnl = NULL; + char buffer[PATH_MAX] = {0,}; errno = EINVAL; @@ -160,15 +161,15 @@ gf_history_changelog_next_change (char *bufptr, size_t maxlen) if (!this) goto out; - gfc = (gf_changelog_t *) this->private; - if (!gfc) + jnl = (gf_changelog_journal_t *) GF_CHANGELOG_GET_API_PTR (this); + if (!jnl) goto out; - hist_gfc = gfc->hist_gfc; - if (!hist_gfc) + hist_jnl = jnl->hist_jnl; + if (!hist_jnl) goto out; - tracker_fd = hist_gfc->gfc_fd; + tracker_fd = hist_jnl->jnl_fd; size = gf_readline (tracker_fd, buffer, maxlen); if (size < 0) { @@ -206,56 +207,60 @@ gf_history_changelog_next_change (char *bufptr, size_t maxlen) ssize_t gf_history_changelog_scan () { - int ret = 0; - int tracker_fd = 0; - size_t len = 0; - size_t off = 0; - xlator_t *this = NULL; - size_t nr_entries = 0; - gf_changelog_t *gfc = NULL; - gf_changelog_t *hist_gfc = NULL; - struct dirent *entryp = NULL; - struct dirent *result = NULL; - char buffer[PATH_MAX] = {0,}; - static int is_last_scan = 0; + int ret = 0; + int tracker_fd = 0; + size_t len = 0; + size_t off = 0; + xlator_t *this = NULL; + size_t nr_entries = 0; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_journal_t *hist_jnl = NULL; + struct dirent *entryp = NULL; + struct dirent *result = NULL; + char buffer[PATH_MAX] = {0,}; + static int is_last_scan = 0; this = THIS; if (!this) goto out; - gfc = (gf_changelog_t *) this->private; - if (!gfc) + jnl = (gf_changelog_journal_t *) GF_CHANGELOG_GET_API_PTR (this); + if (!jnl) goto out; + if (JNL_IS_API_DISCONNECTED (jnl)) { + errno = ENOTCONN; + goto out; + } - hist_gfc = gfc->hist_gfc; - if (!hist_gfc) + hist_jnl = jnl->hist_jnl; + if (!hist_jnl) goto out; retry: if (is_last_scan == 1) return 0; - if (hist_gfc->hist_done == 0) + if (hist_jnl->hist_done == 0) is_last_scan = 1; errno = EINVAL; - if (hist_gfc->hist_done == -1) + if (hist_jnl->hist_done == -1) goto out; - tracker_fd = hist_gfc->gfc_fd; + tracker_fd = hist_jnl->jnl_fd; if (gf_ftruncate (tracker_fd, 0)) goto out; len = offsetof (struct dirent, d_name) - + pathconf (hist_gfc->gfc_processing_dir, _PC_NAME_MAX) + 1; + + pathconf (hist_jnl->jnl_processing_dir, _PC_NAME_MAX) + 1; entryp = GF_CALLOC (1, len, gf_changelog_mt_libgfchangelog_dirent_t); if (!entryp) goto out; - rewinddir (hist_gfc->gfc_dir); + rewinddir (hist_jnl->jnl_dir); while (1) { - ret = readdir_r (hist_gfc->gfc_dir, entryp, &result); + ret = readdir_r (hist_jnl->jnl_dir, entryp, &result); if (ret || !result) break; @@ -265,9 +270,9 @@ gf_history_changelog_scan () nr_entries++; - GF_CHANGELOG_FILL_BUFFER (hist_gfc->gfc_processing_dir, + GF_CHANGELOG_FILL_BUFFER (hist_jnl->jnl_processing_dir, buffer, off, - strlen (hist_gfc->gfc_processing_dir)); + strlen (hist_jnl->jnl_processing_dir)); GF_CHANGELOG_FILL_BUFFER (entryp->d_name, buffer, off, strlen (entryp->d_name)); GF_CHANGELOG_FILL_BUFFER ("\n", buffer, off, 1); @@ -284,7 +289,7 @@ gf_history_changelog_scan () GF_FREE (entryp); gf_log (this->name, GF_LOG_DEBUG, - "hist_done %d, is_last_scan: %d", hist_gfc->hist_done, is_last_scan); + "hist_done %d, is_last_scan: %d", hist_jnl->hist_done, is_last_scan); if (!result) { if (gf_lseek (tracker_fd, 0, SEEK_SET) != -1) { @@ -490,7 +495,7 @@ gf_changelog_consume_wrap (void* data) /* TODO: handle short reads and EOF. */ ret = gf_changelog_consume (ccd->this, - ccd->gfc, ccd->changelog, _gf_true); + ccd->jnl, ccd->changelog, _gf_true); if (ret) { gf_log (this->name, GF_LOG_ERROR, "could not parse changelog: %s", ccd->changelog); @@ -515,8 +520,8 @@ void * gf_history_consume (void * data) { xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; - gf_changelog_t *hist_gfc = NULL; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_journal_t *hist_jnl = NULL; int ret = 0; int iter = 0; int fd = -1; @@ -549,14 +554,14 @@ gf_history_consume (void * data) goto out; } - gfc = (gf_changelog_t *) this->private; - if (!gfc) { + jnl = (gf_changelog_journal_t *) GF_CHANGELOG_GET_API_PTR (this); + if (!jnl) { ret = -1; goto out; } - hist_gfc = gfc->hist_gfc; - if (!hist_gfc) { + hist_jnl = jnl->hist_jnl; + if (!hist_jnl) { ret = -1; goto out; } @@ -568,7 +573,7 @@ gf_history_consume (void * data) curr = &ccd[iter]; curr->this = this; - curr->gfc = hist_gfc; + curr->jnl = hist_jnl; curr->fd = fd; curr->offset = from * (len + 1); @@ -613,7 +618,7 @@ gf_history_consume (void * data) } ret = gf_changelog_publish (curr->this, - curr->gfc, curr->changelog); + curr->jnl, curr->changelog); if (ret) { publish = _gf_false; gf_log (this->name, GF_LOG_ERROR, @@ -623,7 +628,7 @@ gf_history_consume (void * data) } /* informing "parsing done". */ - hist_gfc->hist_done = (publish == _gf_true) ? 0 : -1; + hist_jnl->hist_done = (publish == _gf_true) ? 0 : -1; out: if (fd != -1) @@ -740,8 +745,8 @@ gf_history_changelog (char* changelog_dir, unsigned long start, unsigned long from = 0; unsigned long total_changelog = 0; xlator_t *this = NULL; - gf_changelog_t *gfc = NULL; - gf_changelog_t *hist_gfc = NULL; + gf_changelog_journal_t *jnl = NULL; + gf_changelog_journal_t *hist_jnl = NULL; gf_changelog_history_data_t *hist_data = NULL; DIR *dirp = NULL; struct dirent *dp = NULL; @@ -762,14 +767,14 @@ gf_history_changelog (char* changelog_dir, unsigned long start, goto out; } - gfc = (gf_changelog_t *) this->private; - if (!gfc) { + jnl = (gf_changelog_journal_t *) GF_CHANGELOG_GET_API_PTR (this); + if (!jnl) { ret = -1; goto out; } - hist_gfc = (gf_changelog_t *) gfc->hist_gfc; - if (!hist_gfc) { + hist_jnl = (gf_changelog_journal_t *) jnl->hist_jnl; + if (!hist_jnl) { ret = -1; goto out; } @@ -917,7 +922,7 @@ gf_history_changelog (char* changelog_dir, unsigned long start, return ret; } - hist_gfc->hist_done = 1; + hist_jnl->hist_done = 1; *actual_end = ts2; return ret; diff --git a/xlators/features/changelog/src/Makefile.am b/xlators/features/changelog/src/Makefile.am index 18c41e7d7d1..8712b9d059f 100644 --- a/xlators/features/changelog/src/Makefile.am +++ b/xlators/features/changelog/src/Makefile.am @@ -3,16 +3,24 @@ xlator_LTLIBRARIES = changelog.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features noinst_HEADERS = changelog-helpers.h changelog-mem-types.h changelog-rt.h \ - changelog-misc.h changelog-encoders.h changelog-notifier.h + changelog-rpc-common.h changelog-misc.h changelog-encoders.h \ + changelog-rpc-common.h changelog-rpc.h changelog-ev-handle.h changelog_la_LDFLAGS = -module -avoid-version changelog_la_SOURCES = changelog.c changelog-rt.c changelog-helpers.c \ - changelog-encoders.c changelog-notifier.c changelog-barrier.c -changelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + changelog-encoders.c changelog-rpc.c changelog-barrier.c \ + changelog-rpc-common.c changelog-ev-handle.c +changelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/rpc/xdr/src/libgfxdr.la \ + $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -fPIC -D_FILE_OFFSET_BITS=64 \ - -D_GNU_SOURCE -D$(GF_HOST_OS) -DDATADIR=\"$(localstatedir)\" +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/rpc/rpc-transport/socket/src \ + -I$(top_srcdir)/xlators/features/changelog/lib/src/ \ + -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \ + -DDATADIR=\"$(localstatedir)\" AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/features/changelog/src/changelog-encoders.c b/xlators/features/changelog/src/changelog-encoders.c index 08626ee2f22..d2f5451e649 100644 --- a/xlators/features/changelog/src/changelog-encoders.c +++ b/xlators/features/changelog/src/changelog-encoders.c @@ -191,7 +191,7 @@ cb_encoder[] = { }; void -changelog_encode_change( changelog_priv_t * priv) +changelog_encode_change(changelog_priv_t * priv) { priv->ce = &cb_encoder[priv->encode_mode]; } diff --git a/xlators/features/changelog/src/changelog-ev-handle.c b/xlators/features/changelog/src/changelog-ev-handle.c new file mode 100644 index 00000000000..b526d38a872 --- /dev/null +++ b/xlators/features/changelog/src/changelog-ev-handle.c @@ -0,0 +1,379 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "changelog-ev-handle.h" +#include "changelog-rpc-common.h" +#include "changelog-helpers.h" + +struct rpc_clnt_program changelog_ev_program; + +#define NR_IOVEC (MAX_IOVEC - 3) +struct ev_rpc_vec { + int count; + struct iovec vector[NR_IOVEC]; + + /* sequence number */ + unsigned long seq; +}; + +struct ev_rpc { + rbuf_list_t *rlist; + struct rpc_clnt *rpc; + struct ev_rpc_vec vec; +}; + +/** + * As of now this just does the minimal (retval logging). Going further + * un-acknowledges sequence numbers can be retransmitted and other + * intelligence can be built into the server. + */ +int +changelog_event_dispatch_cbk (struct rpc_req *req, + struct iovec *iov, int count, void *myframe) +{ + return 0; +} + +/* dispatcher RPC */ +inline int +changelog_dispatch_vec (call_frame_t *frame, xlator_t *this, + struct rpc_clnt *rpc, struct ev_rpc_vec *vec) +{ + struct timeval tv = {0,}; + changelog_event_req req = {0,}; + + (void) gettimeofday (&tv, NULL); + + /** + * Event dispatch RPC header contains a sequence number for each + * dispatch. This allows the reciever to order the request before + * processing. + */ + req.seq = vec->seq; + req.tv_sec = tv.tv_sec; + req.tv_usec = tv.tv_usec; + + return changelog_rpc_sumbit_req (rpc, (void *)&req, + frame, &changelog_ev_program, + CHANGELOG_REV_PROC_EVENT, + vec->vector, vec->count, NULL, + this, changelog_event_dispatch_cbk, + (xdrproc_t) xdr_changelog_event_req); + } + + int + changelog_event_dispatch_rpc (call_frame_t *frame, xlator_t *this, void *data) + { + int idx = 0; + int count = 0; + int ret = 0; + unsigned long range = 0; + unsigned long sequence = 0; + rbuf_iovec_t *rvec = NULL; + struct ev_rpc *erpc = NULL; + struct rlist_iter riter = {{0,},}; + + /* dispatch NR_IOVEC IO vectors at a time. */ + + erpc = data; + RLIST_GET_SEQ (erpc->rlist, sequence, range); + + rlist_iter_init (&riter, erpc->rlist); + + rvec_for_each_entry (rvec, &riter) { + idx = count % NR_IOVEC; + if (++count == NR_IOVEC) { + erpc->vec.vector[idx] = rvec->iov; + erpc->vec.seq = sequence++; + erpc->vec.count = NR_IOVEC; + + ret = changelog_dispatch_vec (frame, this, + erpc->rpc, &erpc->vec); + if (ret) + break; + count = 0; + continue; + } + + erpc->vec.vector[idx] = rvec->iov; + } + + if (ret) + goto error_return; + + if ( (idx = count % NR_IOVEC) ) { + erpc->vec.seq = sequence; + erpc->vec.count = idx; + + ret = changelog_dispatch_vec (frame, this, + erpc->rpc, &erpc->vec); + } + + error_return: + return ret; +} + +int +changelog_rpc_notify (struct rpc_clnt *rpc, + void *mydata, rpc_clnt_event_t event, void *data) +{ + xlator_t *this = NULL; + changelog_rpc_clnt_t *crpc = NULL; + changelog_clnt_t *c_clnt = NULL; + changelog_priv_t *priv = NULL; + changelog_ev_selector_t *selection = NULL; + + crpc = mydata; + this = crpc->this; + c_clnt = crpc->c_clnt; + + priv = this->private; + + switch (event) { + case RPC_CLNT_CONNECT: + rpc_clnt_set_connected (&rpc->conn); + selection = &priv->ev_selection; + + LOCK (&c_clnt->wait_lock); + { + LOCK (&c_clnt->active_lock); + { + changelog_select_event (this, selection, + crpc->filter); + list_move_tail (&crpc->list, &c_clnt->active); + } + UNLOCK (&c_clnt->active_lock); + } + UNLOCK (&c_clnt->wait_lock); + + break; + case RPC_CLNT_DISCONNECT: + rpc_clnt_disable (crpc->rpc); + selection = &priv->ev_selection; + + LOCK (&crpc->lock); + { + changelog_deselect_event (this, selection, + crpc->filter); + changelog_set_disconnect_flag (crpc, _gf_true); + } + UNLOCK (&crpc->lock); + + break; + case RPC_CLNT_MSG: + case RPC_CLNT_DESTROY: + break; + } + + return 0; +} + +void * +changelog_ev_connector (void *data) +{ + xlator_t *this = NULL; + changelog_clnt_t *c_clnt = NULL; + changelog_rpc_clnt_t *crpc = NULL; + + c_clnt = data; + this = c_clnt->this; + + while (1) { + pthread_mutex_lock (&c_clnt->pending_lock); + { + while (list_empty (&c_clnt->pending)) + pthread_cond_wait (&c_clnt->pending_cond, + &c_clnt->pending_lock); + crpc = list_first_entry (&c_clnt->pending, + changelog_rpc_clnt_t, list); + crpc->rpc = + changelog_rpc_client_init (this, crpc, + crpc->sock, + changelog_rpc_notify); + if (!crpc->rpc) { + gf_log (this->name, GF_LOG_ERROR, "failed to " + "connect back.. <%s>", crpc->sock); + crpc->cleanup (crpc); + goto mutex_unlock; + } + + LOCK (&c_clnt->wait_lock); + { + list_move_tail (&crpc->list, &c_clnt->waitq); + } + UNLOCK (&c_clnt->wait_lock); + } + mutex_unlock: + pthread_mutex_unlock (&c_clnt->pending_lock); + } + + return NULL; +} + +void +changelog_ev_cleanup_connections (xlator_t *this, changelog_clnt_t *c_clnt) +{ + int ret = 0; + changelog_rpc_clnt_t *crpc = NULL; + + /* cleanup active connections */ + LOCK (&c_clnt->active_lock); + { + list_for_each_entry (crpc, &c_clnt->active, list) { + rpc_clnt_disable (crpc->rpc); + } + } + UNLOCK (&c_clnt->active_lock); +} + +/** + * TODO: granularize lock + * + * If we have multiple threads dispatching events, doing it this way is + * a performance bottleneck. + */ + +static inline changelog_rpc_clnt_t * +get_client (changelog_clnt_t *c_clnt, struct list_head **next) +{ + changelog_rpc_clnt_t *crpc = NULL; + + LOCK (&c_clnt->active_lock); + { + if (*next == &c_clnt->active) + goto unblock; + crpc = list_entry (*next, changelog_rpc_clnt_t, list); + changelog_rpc_clnt_ref (crpc); + *next = (*next)->next; + } + unblock: + UNLOCK (&c_clnt->active_lock); + + return crpc; +} + +static inline void +put_client (changelog_clnt_t *c_clnt, changelog_rpc_clnt_t *crpc) +{ + LOCK (&c_clnt->active_lock); + { + changelog_rpc_clnt_unref (crpc); + } + UNLOCK (&c_clnt->active_lock); +} + +void +_dispatcher (rbuf_list_t *rlist, void *arg) +{ + int ret = 0; + xlator_t *this = NULL; + changelog_clnt_t *c_clnt = NULL; + changelog_rpc_clnt_t *crpc = NULL; + changelog_rpc_clnt_t *tmp = NULL; + struct ev_rpc erpc = {0,}; + struct list_head *next = NULL; + + c_clnt = arg; + this = c_clnt->this; + + erpc.rlist = rlist; + next = c_clnt->active.next; + + while (1) { + crpc = get_client (c_clnt, &next); + if (!crpc) + break; + erpc.rpc = crpc->rpc; + ret = changelog_invoke_rpc (this, crpc->rpc, + &changelog_ev_program, + CHANGELOG_REV_PROC_EVENT, &erpc); + put_client (c_clnt, crpc); + } +} + +/** this is called under rotbuff's lock */ +void +sequencer (rbuf_list_t *rlist, void *mydata) +{ + unsigned long range = 0; + changelog_clnt_t *c_clnt = 0; + + c_clnt = mydata; + + range = (RLIST_ENTRY_COUNT (rlist)) / NR_IOVEC; + if ((RLIST_ENTRY_COUNT (rlist)) % NR_IOVEC) + range++; + RLIST_STORE_SEQ (rlist, c_clnt->sequence, range); + + c_clnt->sequence += range; +} + +void * +changelog_ev_dispatch (void *data) +{ + int ret = 0; + void *opaque = NULL; + xlator_t *this = NULL; + changelog_clnt_t *c_clnt = NULL; + struct timeval tv = {0,}; + + c_clnt = data; + this = c_clnt->this; + + while (1) { + /* TODO: change this to be pthread cond based.. later */ + tv.tv_sec = 1; + tv.tv_usec = 0; + select (0, NULL, NULL, NULL, &tv); + + ret = rbuf_get_buffer (c_clnt->rbuf, + &opaque, sequencer, c_clnt); + if (ret != RBUF_CONSUMABLE) { + /* TODO: handle failure */ + continue; + } + + ret = rbuf_wait_for_completion (c_clnt->rbuf, + opaque, _dispatcher, c_clnt); + if (ret) { + /* TODO: handle failure */ + continue; + } + } + + return NULL; +} + +void +changelog_ev_queue_connection (changelog_clnt_t *c_clnt, + changelog_rpc_clnt_t *crpc) +{ + pthread_mutex_lock (&c_clnt->pending_lock); + { + list_add_tail (&crpc->list, &c_clnt->pending); + pthread_cond_signal (&c_clnt->pending_cond); + } + pthread_mutex_unlock (&c_clnt->pending_lock); +} + +struct rpc_clnt_procedure changelog_ev_procs[CHANGELOG_REV_PROC_MAX] = { + [CHANGELOG_REV_PROC_NULL] = {"NULL", NULL}, + [CHANGELOG_REV_PROC_EVENT] = { + "EVENT DISPATCH", changelog_event_dispatch_rpc + }, +}; + +struct rpc_clnt_program changelog_ev_program = { + .progname = "CHANGELOG EVENT DISPATCHER", + .prognum = CHANGELOG_REV_RPC_PROCNUM, + .progver = CHANGELOG_REV_RPC_PROCVER, + .numproc = CHANGELOG_REV_PROC_MAX, + .proctable = changelog_ev_procs, +}; diff --git a/xlators/features/changelog/src/changelog-ev-handle.h b/xlators/features/changelog/src/changelog-ev-handle.h new file mode 100644 index 00000000000..eef0492a9ee --- /dev/null +++ b/xlators/features/changelog/src/changelog-ev-handle.h @@ -0,0 +1,140 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __CHANGELOG_EV_HANDLE_H +#define __CHANGELOG_EV_HANDLE_H + +#include "list.h" +#include "xlator.h" +#include "rpc-clnt.h" + +#include "rot-buffs.h" + +struct changelog_clnt; + +typedef struct changelog_rpc_clnt { + xlator_t *this; + + gf_lock_t lock; + + unsigned long ref; + gf_boolean_t disconnected; + + unsigned int filter; + char sock[UNIX_PATH_MAX]; + + struct changelog_clnt *c_clnt; /* back pointer to list holder */ + + struct rpc_clnt *rpc; /* RPC client endpoint */ + + struct list_head list; /* ->pending, ->waitq, ->active */ + + void (*cleanup) + (struct changelog_rpc_clnt *); /* cleanup handler */ +} changelog_rpc_clnt_t; + +static inline void +changelog_rpc_clnt_ref (changelog_rpc_clnt_t *crpc) +{ + LOCK (&crpc->lock); + { + ++crpc->ref; + } + UNLOCK (&crpc->lock); +} + +static inline void +changelog_set_disconnect_flag (changelog_rpc_clnt_t *crpc, gf_boolean_t flag) +{ + crpc->disconnected = flag; +} + +static inline int +changelog_rpc_clnt_is_disconnected (changelog_rpc_clnt_t *crpc) +{ + return (crpc->disconnected == _gf_true); +} + +static inline void +changelog_rpc_clnt_unref (changelog_rpc_clnt_t *crpc) +{ + gf_boolean_t gone = _gf_false; + + LOCK (&crpc->lock); + { + if (!(--crpc->ref) + && changelog_rpc_clnt_is_disconnected (crpc)) { + list_del (&crpc->list); + gone = _gf_true; + } + } + UNLOCK (&crpc->lock); + + if (gone) + crpc->cleanup (crpc); +} + +/** + * This structure holds pending and active clients. On probe RPC all + * an instance of the above structure (@changelog_rpc_clnt) is placed + * in ->pending and gets moved to ->active on a successful connect. + * + * locking rules: + * + * Manipulating ->pending + * ->pending_lock + * ->pending + * + * Manipulating ->active + * ->active_lock + * ->active + * + * Moving object from ->pending to ->active + * ->pending_lock + * ->active_lock + * + * Objects are _never_ moved from ->active to ->pending, i.e., during + * disconnection, the object is destroyed. Well, we could have tried + * to reconnect, but that's pure waste.. let the other end reconnect. + */ + +typedef struct changelog_clnt { + xlator_t *this; + + /* pending connections */ + pthread_mutex_t pending_lock; + pthread_cond_t pending_cond; + struct list_head pending; + + /* current active connections */ + gf_lock_t active_lock; + struct list_head active; + + gf_lock_t wait_lock; + struct list_head waitq; + + /* consumer part of rot-buffs */ + rbuf_t *rbuf; + unsigned long sequence; +} changelog_clnt_t; + +void *changelog_ev_connector (void *); + +void *changelog_ev_dispatch (void *); + +/* APIs */ +void +changelog_ev_queue_connection (changelog_clnt_t *, changelog_rpc_clnt_t *); + +void +changelog_ev_cleanup_connections (xlator_t *, changelog_clnt_t *); + +#endif + diff --git a/xlators/features/changelog/src/changelog-helpers.c b/xlators/features/changelog/src/changelog-helpers.c index 3af2938190d..ccc121c473d 100644 --- a/xlators/features/changelog/src/changelog-helpers.c +++ b/xlators/features/changelog/src/changelog-helpers.c @@ -24,6 +24,7 @@ #include "changelog-mem-types.h" #include "changelog-encoders.h" +#include "changelog-rpc-common.h" #include static inline void @@ -57,7 +58,7 @@ changelog_cleanup_free_mutex (void *arg_mutex) pthread_mutex_unlock(p_mutex); } -void +int changelog_thread_cleanup (xlator_t *this, pthread_t thr_id) { int ret = 0; @@ -65,7 +66,7 @@ changelog_thread_cleanup (xlator_t *this, pthread_t thr_id) /* send a cancel request to the thread */ ret = pthread_cancel (thr_id); - if (ret) { + if (ret != 0) { gf_log (this->name, GF_LOG_ERROR, "could not cancel thread (reason: %s)", strerror (errno)); @@ -73,14 +74,14 @@ changelog_thread_cleanup (xlator_t *this, pthread_t thr_id) } ret = pthread_join (thr_id, &retval); - if (ret || (retval != PTHREAD_CANCELED)) { + if ((ret != 0) || (retval != PTHREAD_CANCELED)) { gf_log (this->name, GF_LOG_ERROR, "cancel request not adhered as expected" " (reason: %s)", strerror (errno)); } out: - return; + return ret; } inline void * @@ -98,6 +99,142 @@ changelog_get_usable_buffer (changelog_local_t *local) return cld->cld_iobuf->ptr; } +static inline int +changelog_selector_index (unsigned int selector) +{ + return (ffs (selector) - 1); +} + +inline int +changelog_ev_selected (xlator_t *this, + changelog_ev_selector_t *selection, + unsigned int selector) +{ + int idx = 0; + + idx = changelog_selector_index (selector); + gf_log (this->name, GF_LOG_DEBUG, + "selector ref count for %d (idx: %d): %d", + selector, idx, selection->ref[idx]); + /* this can be lockless */ + return ( idx < CHANGELOG_EV_SELECTION_RANGE + && (selection->ref[idx] > 0) ); +} + +inline void +changelog_select_event (xlator_t *this, + changelog_ev_selector_t *selection, + unsigned int selector) +{ + int idx = 0; + + LOCK (&selection->reflock); + { + while (selector) { + idx = changelog_selector_index (selector); + if (idx < CHANGELOG_EV_SELECTION_RANGE) { + selection->ref[idx]++; + gf_log (this->name, GF_LOG_DEBUG, + "selecting event %d", idx); + } + selector &= ~(1 << idx); + } + } + UNLOCK (&selection->reflock); +} + +inline void +changelog_deselect_event (xlator_t *this, + changelog_ev_selector_t *selection, + unsigned int selector) +{ + int idx = 0; + + LOCK (&selection->reflock); + { + while (selector) { + idx = changelog_selector_index (selector); + if (idx < CHANGELOG_EV_SELECTION_RANGE) { + selection->ref[idx]--; + gf_log (this->name, GF_LOG_DEBUG, + "de-selecting event %d", idx); + } + selector &= ~(1 << idx); + } + } + UNLOCK (&selection->reflock); +} + +inline int +changelog_init_event_selection (xlator_t *this, + changelog_ev_selector_t *selection) +{ + int ret = 0; + int j = CHANGELOG_EV_SELECTION_RANGE; + + ret = LOCK_INIT (&selection->reflock); + if (ret != 0) + return -1; + + LOCK (&selection->reflock); + { + while (j--) { + selection->ref[j] = 0; + } + } + UNLOCK (&selection->reflock); + + return 0; +} + +inline int +changelog_cleanup_event_selection (xlator_t *this, + changelog_ev_selector_t *selection) +{ + int ret = 0; + int j = CHANGELOG_EV_SELECTION_RANGE; + + LOCK (&selection->reflock); + { + while (j--) { + if (selection->ref[j] > 0) + gf_log (this->name, GF_LOG_WARNING, + "changelog event selection cleaning up " + " on active references"); + } + } + UNLOCK (&selection->reflock); + + return LOCK_DESTROY (&selection->reflock); +} + +inline void +changelog_perform_dispatch (changelog_priv_t *priv, void *mem, size_t size) +{ + char *buf = NULL; + void *opaque = NULL; + + buf = rbuf_reserve_write_area (priv->rbuf, size, &opaque); + if (!buf) { + /* TODO: log failure */ + return; + } + memcpy (buf, mem, size); + rbuf_write_complete (opaque); +} + +inline void +changelog_dispatch_event (xlator_t *this, + changelog_priv_t *priv, changelog_event_t *ev) +{ + changelog_ev_selector_t *selection = NULL; + + selection = &priv->ev_selection; + if (changelog_ev_selected (this, selection, ev->ev_type)) { + changelog_perform_dispatch (priv, ev, CHANGELOG_EV_SIZE); + } +} + inline void changelog_set_usable_record_and_length (changelog_local_t *local, size_t len, int xr) @@ -206,9 +343,9 @@ changelog_rollover_changelog (xlator_t *this, { int ret = -1; int notify = 0; - char *bname = NULL; char ofile[PATH_MAX] = {0,}; char nfile[PATH_MAX] = {0,}; + changelog_event_t ev = {0,}; if (priv->changelog_fd != -1) { ret = fsync (priv->changelog_fd); @@ -252,40 +389,32 @@ changelog_rollover_changelog (xlator_t *this, } if (notify) { - bname = basename (nfile); - gf_log (this->name, GF_LOG_DEBUG, "notifying: %s", bname); - ret = changelog_write (priv->wfd, bname, strlen (bname) + 1); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to send file name to notify thread" - " (reason: %s)", strerror (errno)); - } else { - /* If this is explicit rollover initiated by snapshot, - * wakeup reconfigure thread waiting for changelog to - * rollover - */ - if (priv->explicit_rollover) { - priv->explicit_rollover = _gf_false; - ret = pthread_mutex_lock ( - &priv->bn.bnotify_mutex); - CHANGELOG_PTHREAD_ERROR_HANDLE_0 (ret, out); - { - priv->bn.bnotify = _gf_false; - ret = pthread_cond_signal ( - &priv->bn.bnotify_cond); - CHANGELOG_PTHREAD_ERROR_HANDLE_0 (ret, - out); - gf_log (this->name, GF_LOG_INFO, - "Changelog published: %s and" - " signalled bnotify", bname); - } - ret = pthread_mutex_unlock ( - &priv->bn.bnotify_mutex); + ev.ev_type = CHANGELOG_OP_TYPE_JOURNAL; + memcpy (ev.u.journal.path, nfile, strlen (nfile) + 1); + changelog_dispatch_event (this, priv, &ev); + + /* If this is explicit rollover initiated by snapshot, + * wakeup reconfigure thread waiting for changelog to + * rollover + */ + if (priv->explicit_rollover) { + priv->explicit_rollover = _gf_false; + + ret = pthread_mutex_lock (&priv->bn.bnotify_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_0 (ret, out); + { + priv->bn.bnotify = _gf_false; + ret = pthread_cond_signal + (&priv->bn.bnotify_cond); CHANGELOG_PTHREAD_ERROR_HANDLE_0 (ret, out); + gf_log (this->name, GF_LOG_INFO, + "Changelog published: %s signalled" + " bnotify", nfile); } + ret = pthread_mutex_unlock (&priv->bn.bnotify_mutex); + CHANGELOG_PTHREAD_ERROR_HANDLE_0 (ret, out); } } - out: return ret; } @@ -434,8 +563,8 @@ changelog_snap_logging_stop (xlator_t *this, } int -changelog_open (xlator_t *this, - changelog_priv_t *priv) +changelog_open_journal (xlator_t *this, + changelog_priv_t *priv) { int fd = 0; int ret = -1; @@ -490,7 +619,7 @@ changelog_start_next_change (xlator_t *this, ret = changelog_rollover_changelog (this, priv, ts); if (!ret && !finale) - ret = changelog_open (this, priv); + ret = changelog_open_journal (this, priv); return ret; } @@ -975,7 +1104,7 @@ __changelog_inode_ctx_set (xlator_t *this, * one shot routine to get the address and the value of a inode version * for a particular type. */ -static changelog_inode_ctx_t * +changelog_inode_ctx_t * __changelog_inode_ctx_get (xlator_t *this, inode_t *inode, unsigned long **iver, unsigned long *version, changelog_log_type type) @@ -1028,6 +1157,26 @@ changelog_inode_ctx_get (xlator_t *this, return ctx; } +int32_t +changelog_inode_mod_ctx_flags (xlator_t *this, inode_t *inode, int32_t flags) +{ + int32_t ret = -1; + changelog_inode_ctx_t *ctx = NULL; + + LOCK (&inode->lock); + { + ctx = __changelog_inode_ctx_get (this, inode, NULL, NULL, 0); + if (!ctx) + goto unblock; + ret = 0; + ctx->ordflags |= flags; + } + unblock: + UNLOCK (&inode->lock); + + return ret; +} + /** * This is the main update routine. Locking has been made granular so as to * maximize parallelism of fops - I'll try to explain it below using execution diff --git a/xlators/features/changelog/src/changelog-helpers.h b/xlators/features/changelog/src/changelog-helpers.h index 03a795369d1..f44a06f426a 100644 --- a/xlators/features/changelog/src/changelog-helpers.h +++ b/xlators/features/changelog/src/changelog-helpers.h @@ -15,10 +15,16 @@ #include "timer.h" #include "pthread.h" #include "iobuf.h" +#include "rot-buffs.h" #include "changelog-misc.h" #include "call-stub.h" +#include "rpcsvc.h" +#include "changelog-ev-handle.h" + +#include "changelog.h" + /** * the changelog entry */ @@ -120,29 +126,6 @@ typedef struct changelog_fsync { xlator_t *this; } changelog_fsync_t; -# define CHANGELOG_MAX_CLIENTS 5 -typedef struct changelog_notify { - /* reader end of the pipe */ - int rfd; - - /* notifier thread */ - pthread_t notify_th; - - /* unique socket path */ - char sockpath[UNIX_PATH_MAX]; - - int socket_fd; - - /** - * simple array of accept()'ed fds. Not scalable at all - * for large number of clients, but it's okay as we have - * a ahrd limit in this version (@CHANGELOG_MAX_CLIENTS). - */ - int client_fd[CHANGELOG_MAX_CLIENTS]; - - xlator_t *this; -} changelog_notify_t; - /* Draining during changelog rollover (for geo-rep snapshot dependency): * -------------------------------------------------------------------- * The introduction of draining of in-transit fops during changelog rollover @@ -162,14 +145,14 @@ typedef struct changelog_notify { typedef enum chlog_fop_color { FOP_COLOR_BLACK, FOP_COLOR_WHITE -}chlog_fop_color_t; +} chlog_fop_color_t; /* Barrier notify variable */ typedef struct barrier_notify { pthread_mutex_t bnotify_mutex; pthread_cond_t bnotify_cond; gf_boolean_t bnotify; -}barrier_notify_t; +} barrier_notify_t; /* Two separate mutex and conditional variable set is used * to drain white and black fops. */ @@ -185,15 +168,26 @@ typedef struct drain_mgmt { unsigned long white_fop_cnt; gf_boolean_t drain_wait_black; gf_boolean_t drain_wait_white; -}drain_mgmt_t; +} drain_mgmt_t; /* External barrier as a result of snap on/off indicating flag*/ typedef struct barrier_flags { gf_lock_t lock; gf_boolean_t barrier_ext; -}barrier_flags_t; +} barrier_flags_t; + +/* Event selection */ +typedef struct changelog_ev_selector { + gf_lock_t reflock; + + /** + * Array of references for each selection bit. + */ + unsigned int ref[CHANGELOG_EV_SELECTION_RANGE]; +} changelog_ev_selector_t; +/* changelog's private structure */ struct changelog_priv { gf_boolean_t active; @@ -223,9 +217,6 @@ struct changelog_priv { /* lock to synchronize CSNAP updation */ gf_lock_t c_snap_lock; - /* writen end of the pipe */ - int wfd; - /* rollover time */ int32_t rollover_time; @@ -247,9 +238,6 @@ struct changelog_priv { /* context of fsync thread */ changelog_fsync_t cf; - /* context of the notifier thread */ - changelog_notify_t cn; - /* operation mode */ changelog_mode_t op_mode; @@ -262,7 +250,9 @@ struct changelog_priv { /* encoder */ struct changelog_encoder *ce; - /* snapshot dependency changes */ + /** + * snapshot dependency changes + */ /* Draining of fops*/ drain_mgmt_t dm; @@ -289,6 +279,30 @@ struct changelog_priv { gf_timer_t *timer; struct timespec timeout; + /** + * buffers, RPC, event selection, notifications and other + * beasts. + */ + + /* epoll pthread */ + pthread_t poller; + + /* rotational buffer */ + rbuf_t *rbuf; + + /* changelog RPC server */ + rpcsvc_t *rpc; + + /* event selection */ + changelog_ev_selector_t ev_selection; + + /* client handling (reverse connection) */ + pthread_t connector; + + int nr_dispatchers; + pthread_t *ev_dispatcher; + + changelog_clnt_t connections; }; struct changelog_local { @@ -313,6 +327,7 @@ typedef struct changelog_local changelog_local_t; /* inode version is stored in inode ctx */ typedef struct changelog_inode_ctx { + int32_t ordflags; unsigned long iversion[CHANGELOG_MAX_TYPE]; } changelog_inode_ctx_t; @@ -367,9 +382,12 @@ typedef struct { * helpers routines */ -void +int changelog_thread_cleanup (xlator_t *this, pthread_t thr_id); +void +changelog_perform_dispatch (changelog_priv_t *priv, void *mem, size_t size); + void * changelog_get_usable_buffer (changelog_local_t *local); @@ -386,7 +404,7 @@ changelog_start_next_change (xlator_t *this, changelog_priv_t *priv, unsigned long ts, gf_boolean_t finale); int -changelog_open (xlator_t *this, changelog_priv_t *priv); +changelog_open_journal (xlator_t *this, changelog_priv_t *priv); int changelog_fill_rollover_data (changelog_log_data_t *cld, gf_boolean_t is_last); int @@ -449,6 +467,7 @@ changelog_snap_handle_ascii_change (xlator_t *this, changelog_log_data_t *cld); int changelog_snap_write_change (changelog_priv_t *priv, char *buffer, size_t len); + /* Changelog barrier routines */ void __chlog_barrier_enqueue (xlator_t *this, call_stub_t *stub); void __chlog_barrier_disable (xlator_t *this, struct list_head *queue); @@ -460,6 +479,27 @@ int32_t changelog_fill_entry_buf (call_frame_t *frame, xlator_t *this, loc_t *loc, changelog_local_t **local); +/* event selection routines */ +inline void changelog_select_event (xlator_t *, + changelog_ev_selector_t *, unsigned int); +inline void changelog_deselect_event (xlator_t *, + changelog_ev_selector_t *, unsigned int); +inline int changelog_init_event_selection (xlator_t *, + changelog_ev_selector_t *); +inline int changelog_cleanup_event_selection (xlator_t *, + changelog_ev_selector_t *); +inline int changelog_ev_selected (xlator_t *, + changelog_ev_selector_t *, unsigned int); +inline void +changelog_dispatch_event (xlator_t *, changelog_priv_t *, changelog_event_t *); + +int32_t +changelog_inode_mod_ctx_flags (xlator_t *, inode_t *, int32_t ); + +changelog_inode_ctx_t * +__changelog_inode_ctx_get (xlator_t *, inode_t *, unsigned long **, + unsigned long *, changelog_log_type ); + /* macros */ #define CHANGELOG_STACK_UNWIND(fop, frame, params ...) do { \ @@ -471,10 +511,10 @@ changelog_fill_entry_buf (call_frame_t *frame, xlator_t *this, frame->local = NULL; \ } \ STACK_UNWIND_STRICT (fop, frame, params); \ - changelog_local_cleanup (__xl, __local); \ if (__local && __local->prev_entry) \ changelog_local_cleanup (__xl, \ __local->prev_entry); \ + changelog_local_cleanup (__xl, __local); \ } while (0) #define CHANGELOG_IOBUF_REF(iobuf) do { \ diff --git a/xlators/features/changelog/src/changelog-mem-types.h b/xlators/features/changelog/src/changelog-mem-types.h index e1fa319a715..1618f722f6c 100644 --- a/xlators/features/changelog/src/changelog-mem-types.h +++ b/xlators/features/changelog/src/changelog-mem-types.h @@ -14,16 +14,21 @@ #include "mem-types.h" enum gf_changelog_mem_types { - gf_changelog_mt_priv_t = gf_common_mt_end + 1, - gf_changelog_mt_str_t = gf_common_mt_end + 2, - gf_changelog_mt_batch_t = gf_common_mt_end + 3, - gf_changelog_mt_rt_t = gf_common_mt_end + 4, - gf_changelog_mt_inode_ctx_t = gf_common_mt_end + 5, - gf_changelog_mt_libgfchangelog_t = gf_common_mt_end + 6, - gf_changelog_mt_libgfchangelog_rl_t = gf_common_mt_end + 7, - gf_changelog_mt_libgfchangelog_dirent_t = gf_common_mt_end + 8, - gf_changelog_mt_changelog_buffer_t = gf_common_mt_end + 9, - gf_changelog_mt_history_data_t = gf_common_mt_end + 10, + gf_changelog_mt_priv_t = gf_common_mt_end + 1, + gf_changelog_mt_str_t = gf_common_mt_end + 2, + gf_changelog_mt_batch_t = gf_common_mt_end + 3, + gf_changelog_mt_rt_t = gf_common_mt_end + 4, + gf_changelog_mt_inode_ctx_t = gf_common_mt_end + 5, + gf_changelog_mt_rpc_clnt_t = gf_common_mt_end + 6, + gf_changelog_mt_libgfchangelog_t = gf_common_mt_end + 7, + gf_changelog_mt_libgfchangelog_entry_t = gf_common_mt_end + 8, + gf_changelog_mt_libgfchangelog_rl_t = gf_common_mt_end + 9, + gf_changelog_mt_libgfchangelog_dirent_t = gf_common_mt_end + 10, + gf_changelog_mt_changelog_buffer_t = gf_common_mt_end + 11, + gf_changelog_mt_history_data_t = gf_common_mt_end + 12, + gf_changelog_mt_libgfchangelog_call_pool_t = gf_common_mt_end + 13, + gf_changelog_mt_libgfchangelog_event_t = gf_common_mt_end + 14, + gf_changelog_mt_ev_dispatcher_t = gf_common_mt_end + 15, gf_changelog_mt_end }; diff --git a/xlators/features/changelog/src/changelog-misc.h b/xlators/features/changelog/src/changelog-misc.h index 58b10961463..b45302ad099 100644 --- a/xlators/features/changelog/src/changelog-misc.h +++ b/xlators/features/changelog/src/changelog-misc.h @@ -25,6 +25,7 @@ #define CHANGELOG_VERSION_MINOR 1 #define CHANGELOG_UNIX_SOCK DEFAULT_VAR_RUN_DIRECTORY"/changelog-%s.sock" +#define CHANGELOG_TMP_UNIX_SOCK DEFAULT_VAR_RUN_DIRECTORY"/.%s%lu.sock" /** * header starts with the version and the format of the changelog. @@ -42,6 +43,19 @@ CHANGELOG_UNIX_SOCK, md5_sum); \ } while (0) +#define CHANGELOG_MAKE_TMP_SOCKET_PATH(brick_path, sockpath, len) do { \ + unsigned long pid = 0; \ + char md5_sum[MD5_DIGEST_LENGTH*2+1] = {0,}; \ + pid = (unsigned long) getpid (); \ + md5_wrapper((unsigned char *) brick_path, \ + strlen(brick_path), \ + md5_sum); \ + (void) snprintf (sockpath, \ + len, CHANGELOG_TMP_UNIX_SOCK, \ + md5_sum, pid); \ + } while (0) + + /** * ... used by libgfchangelog. */ diff --git a/xlators/features/changelog/src/changelog-notifier.c b/xlators/features/changelog/src/changelog-notifier.c deleted file mode 100644 index 5f3d063a8ad..00000000000 --- a/xlators/features/changelog/src/changelog-notifier.c +++ /dev/null @@ -1,314 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#include "changelog-notifier.h" - -#include - -inline static void -changelog_notify_clear_fd (changelog_notify_t *cn, int i) -{ - cn->client_fd[i] = -1; -} - -inline static void -changelog_notify_save_fd (changelog_notify_t *cn, int i, int fd) -{ - cn->client_fd[i] = fd; -} - -static int -changelog_notify_insert_fd (xlator_t *this, changelog_notify_t *cn, int fd) -{ - int i = 0; - int ret = 0; - - for (; i < CHANGELOG_MAX_CLIENTS; i++) { - if (cn->client_fd[i] == -1) - break; - } - - if (i == CHANGELOG_MAX_CLIENTS) { - /** - * this case should not be hit as listen() would limit - * the number of completely established connections. - */ - gf_log (this->name, GF_LOG_WARNING, - "hit max client limit (%d)", CHANGELOG_MAX_CLIENTS); - ret = -1; - } - else - changelog_notify_save_fd (cn, i, fd); - - return ret; -} - -static void -changelog_notify_fill_rset (changelog_notify_t *cn, fd_set *rset, int *maxfd) -{ - int i = 0; - - FD_ZERO (rset); - - FD_SET (cn->socket_fd, rset); - *maxfd = cn->socket_fd; - - FD_SET (cn->rfd, rset); - *maxfd = max (*maxfd, cn->rfd); - - for (; i < CHANGELOG_MAX_CLIENTS; i++) { - if (cn->client_fd[i] != -1) { - FD_SET (cn->client_fd[i], rset); - *maxfd = max (*maxfd, cn->client_fd[i]); - } - } - - *maxfd = *maxfd + 1; -} - -static int -changelog_notify_client (changelog_notify_t *cn, char *path, ssize_t len) -{ - int i = 0; - int ret = 0; - - for (; i < CHANGELOG_MAX_CLIENTS; i++) { - if (cn->client_fd[i] == -1) - continue; - - if (changelog_write (cn->client_fd[i], - path, len)) { - ret = -1; - - close (cn->client_fd[i]); - changelog_notify_clear_fd (cn, i); - } - } - - return ret; -} - -static void -changelog_notifier_init (changelog_notify_t *cn) -{ - int i = 0; - - cn->socket_fd = -1; - - for (; i < CHANGELOG_MAX_CLIENTS; i++) { - changelog_notify_clear_fd (cn, i); - } -} - -static void -changelog_close_client_conn (changelog_notify_t *cn) -{ - int i = 0; - - for (; i < CHANGELOG_MAX_CLIENTS; i++) { - if (cn->client_fd[i] == -1) - continue; - - close (cn->client_fd[i]); - changelog_notify_clear_fd (cn, i); - } -} - -static void -changelog_notifier_cleanup (void *arg) -{ - changelog_notify_t *cn = NULL; - - cn = (changelog_notify_t *) arg; - - changelog_close_client_conn (cn); - - if (cn->socket_fd != -1) - close (cn->socket_fd); - - if (cn->rfd) - close (cn->rfd); - - if (unlink (cn->sockpath)) - gf_log ("", GF_LOG_WARNING, - "could not unlink changelog socket file" - " %s (reason: %s", cn->sockpath, strerror (errno)); -} - -void * -changelog_notifier (void *data) -{ - int i = 0; - int fd = 0; - int max_fd = 0; - int len = 0; - ssize_t readlen = 0; - xlator_t *this = NULL; - changelog_priv_t *priv = NULL; - changelog_notify_t *cn = NULL; - struct sockaddr_un local = {0,}; - char path[PATH_MAX] = {0,}; - char abspath[PATH_MAX] = {0,}; - - char buffer; - fd_set rset; - - priv = (changelog_priv_t *) data; - - cn = &priv->cn; - this = cn->this; - - pthread_cleanup_push (changelog_notifier_cleanup, cn); - - changelog_notifier_init (cn); - - cn->socket_fd = socket (AF_UNIX, SOCK_STREAM, 0); - if (cn->socket_fd < 0) { - gf_log (this->name, GF_LOG_ERROR, - "changelog socket error (reason: %s)", - strerror (errno)); - goto out; - } - - CHANGELOG_MAKE_SOCKET_PATH (priv->changelog_brick, - cn->sockpath, UNIX_PATH_MAX); - if (unlink (cn->sockpath) < 0) { - if (errno != ENOENT) { - gf_log (this->name, GF_LOG_ERROR, - "Could not unlink changelog socket file (%s)" - " (reason: %s)", - CHANGELOG_UNIX_SOCK, strerror (errno)); - goto cleanup; - } - } - - local.sun_family = AF_UNIX; - strcpy (local.sun_path, cn->sockpath); - - len = strlen (local.sun_path) + sizeof (local.sun_family); - - /* bind to the unix domain socket */ - if (bind (cn->socket_fd, (struct sockaddr *) &local, len) < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not bind to changelog socket (reason: %s)", - strerror (errno)); - goto cleanup; - } - - /* listen for incoming connections */ - if (listen (cn->socket_fd, CHANGELOG_MAX_CLIENTS) < 0) { - gf_log (this->name, GF_LOG_ERROR, - "listen() error on changelog socket (reason: %s)", - strerror (errno)); - goto cleanup; - } - - /** - * simple select() on all to-be-read file descriptors. This method - * though old school works pretty well when you have a handfull of - * fd's to be watched (clients). - * - * Future TODO: move this to epoll based notification facility if - * number of clients increase. - */ - for (;;) { - changelog_notify_fill_rset (cn, &rset, &max_fd); - - if (select (max_fd, &rset, NULL, NULL, NULL) < 0) { - gf_log (this->name, GF_LOG_ERROR, - "select() returned -1 (reason: %s)", - strerror (errno)); - sleep (2); - continue; - } - - if (FD_ISSET (cn->socket_fd, &rset)) { - fd = accept (cn->socket_fd, NULL, NULL); - if (fd < 0) { - gf_log (this->name, GF_LOG_ERROR, - "accept error on changelog socket" - " (reason: %s)", strerror (errno)); - } else if (changelog_notify_insert_fd (this, cn, fd)) { - gf_log (this->name, GF_LOG_ERROR, - "hit max client limit"); - } - } - - if (FD_ISSET (cn->rfd, &rset)) { - /** - * read changelog filename and notify all connected - * clients. - */ - readlen = 0; - while (readlen < PATH_MAX) { - len = read (cn->rfd, &path[readlen++], 1); - if (len == -1) { - break; - } - - if (len == 0) { - gf_log (this->name, GF_LOG_ERROR, - "rollover thread sent EOF" - " on pipe - possibly a crash."); - /* be blunt and close all connections */ - pthread_exit(NULL); - } - - if (path[readlen - 1] == '\0') - break; - } - - /* should we close all client connections here too? */ - if (len < 0 || readlen == PATH_MAX) { - gf_log (this->name, GF_LOG_ERROR, - "Could not get pathname from rollover" - " thread or pathname too long"); - goto process_rest; - } - - (void) snprintf (abspath, PATH_MAX, - "%s/%s", priv->changelog_dir, path); - if (changelog_notify_client (cn, abspath, - strlen (abspath) + 1)) - gf_log (this->name, GF_LOG_ERROR, - "could not notify some clients with new" - " changelogs"); - } - - process_rest: - for (i = 0; i < CHANGELOG_MAX_CLIENTS; i++) { - if ( (fd = cn->client_fd[i]) == -1 ) - continue; - - if (FD_ISSET (fd, &rset)) { - /** - * the only data we accept from the client is a - * disconnect. Anything else is treated as bogus - * and is silently discarded (also warned!!!). - */ - if ( (readlen = read (fd, &buffer, 1)) <= 0 ) { - close (fd); - changelog_notify_clear_fd (cn, i); - } else { - /* silently discard data and log */ - gf_log (this->name, GF_LOG_WARNING, - "misbehaving changelog client"); - } - } - } - - } - - cleanup:; - pthread_cleanup_pop (1); - - out: - return NULL; -} diff --git a/xlators/features/changelog/src/changelog-rpc-common.c b/xlators/features/changelog/src/changelog-rpc-common.c new file mode 100644 index 00000000000..76db6696ae8 --- /dev/null +++ b/xlators/features/changelog/src/changelog-rpc-common.c @@ -0,0 +1,334 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "changelog-rpc-common.h" + +/** +***************************************************** + Client Interface +***************************************************** +*/ + +/** + * Initialize and return an RPC client object for a given unix + * domain socket. + */ + +void * +changelog_rpc_poller (void *arg) +{ + xlator_t *this = arg; + + (void) event_dispatch (this->ctx->event_pool); + return NULL; +} + +struct rpc_clnt * +changelog_rpc_client_init (xlator_t *this, void *cbkdata, + char *sockfile, rpc_clnt_notify_t fn) +{ + int ret = 0; + struct rpc_clnt *rpc = NULL; + dict_t *options = NULL; + + if (!cbkdata) + cbkdata = this; + + options = dict_new (); + if (!options) + goto error_return; + + ret = rpc_transport_unix_options_build (&options, sockfile, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to build rpc options"); + goto dealloc_dict; + } + + rpc = rpc_clnt_new (options, this->ctx, this->name, 16); + if (!rpc) + goto dealloc_dict; + + ret = rpc_clnt_register_notify (rpc, fn, cbkdata); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to register notify"); + goto dealloc_rpc_clnt; + } + + ret = rpc_clnt_start (rpc); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to start rpc"); + goto dealloc_rpc_clnt; + } + + return rpc; + + dealloc_rpc_clnt: + rpc_clnt_unref (rpc); + dealloc_dict: + dict_unref (options); + error_return: + return NULL; +} + +/** + * Generic RPC client routine to dispatch a request to an + * RPC server. + */ +int +changelog_rpc_sumbit_req (struct rpc_clnt *rpc, void *req, + call_frame_t *frame, rpc_clnt_prog_t *prog, + int procnum, struct iovec *payload, int payloadcnt, + struct iobref *iobref, xlator_t *this, + fop_cbk_fn_t cbkfn, xdrproc_t xdrproc) +{ + int ret = 0; + int count = 0; + struct iovec iov = {0, }; + struct iobuf *iobuf = NULL; + char new_iobref = 0; + ssize_t xdr_size = 0; + + GF_ASSERT (this); + + if (req) { + xdr_size = xdr_sizeof (xdrproc, req); + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, xdr_size); + if (!iobuf) { + goto out; + }; + + if (!iobref) { + iobref = iobref_new (); + if (!iobref) { + goto out; + } + + new_iobref = 1; + } + + iobref_add (iobref, iobuf); + + iov.iov_base = iobuf->ptr; + iov.iov_len = iobuf_size (iobuf); + + /* Create the xdr payload */ + ret = xdr_serialize_generic (iov, req, xdrproc); + if (ret == -1) { + goto out; + } + + iov.iov_len = ret; + count = 1; + } + + ret = rpc_clnt_submit (rpc, prog, procnum, cbkfn, &iov, count, + payload, payloadcnt, iobref, frame, NULL, + 0, NULL, 0, NULL); + + out: + if (new_iobref) + iobref_unref (iobref); + if (iobuf) + iobuf_unref (iobuf); + return ret; +} + +/** + * Entry point to perform a remote procedure call + */ +int +changelog_invoke_rpc (xlator_t *this, struct rpc_clnt *rpc, + rpc_clnt_prog_t *prog, int procidx, void *arg) +{ + int ret = 0; + call_frame_t *frame = NULL; + rpc_clnt_procedure_t *proc = NULL; + + if (!this || !prog) + goto error_return; + + frame = create_frame (this, this->ctx->pool); + if (!frame) { + gf_log (this->name, GF_LOG_ERROR, "failed to create frame"); + goto error_return; + } + + proc = &prog->proctable[procidx]; + if (proc->fn) + ret = proc->fn (frame, this, arg); + + STACK_DESTROY (frame->root); + return ret; + + error_return: + return -1; +} + +/** +***************************************************** + Server Interface +***************************************************** +*/ + +struct iobuf * +__changelog_rpc_serialize_reply (rpcsvc_request_t *req, void *arg, + struct iovec *outmsg, xdrproc_t xdrproc) +{ + struct iobuf *iob = NULL; + ssize_t retlen = 0; + ssize_t rsp_size = 0; + + rsp_size = xdr_sizeof (xdrproc, arg); + iob = iobuf_get2 (req->svc->ctx->iobuf_pool, rsp_size); + if (!iob) + goto error_return; + + iobuf_to_iovec (iob, outmsg); + + retlen = xdr_serialize_generic (*outmsg, arg, xdrproc); + if (retlen == -1) + goto unref_iob; + + outmsg->iov_len = retlen; + return iob; + + unref_iob: + iobuf_unref (iob); + error_return: + return NULL; +} + +int +changelog_rpc_sumbit_reply (rpcsvc_request_t *req, + void *arg, struct iovec *payload, int payloadcount, + struct iobref *iobref, xdrproc_t xdrproc) +{ + int ret = -1; + struct iobuf *iob = NULL; + struct iovec iov = {0,}; + char new_iobref = 0; + + if (!req) + goto return_ret; + + if (!iobref) { + iobref = iobref_new (); + if (!iobref) + goto return_ret; + new_iobref = 1; + } + + iob = __changelog_rpc_serialize_reply (req, arg, &iov, xdrproc); + if (!iob) + gf_log ("", GF_LOG_ERROR, "failed to serialize reply"); + else + iobref_add (iobref, iob); + + ret = rpcsvc_submit_generic (req, &iov, + 1, payload, payloadcount, iobref); + + if (new_iobref) + iobref_unref (iobref); + if (iob) + iobuf_unref (iob); + return_ret: + return ret; +} + +void +changelog_rpc_server_destroy (xlator_t *this, rpcsvc_t *rpc, char *sockfile, + rpcsvc_notify_t fn, struct rpcsvc_program **progs) +{ + rpcsvc_listener_t *listener = NULL; + rpcsvc_listener_t *next = NULL; + struct rpcsvc_program *prog = NULL; + + while (*progs) { + prog = *progs; + (void) rpcsvc_program_unregister (rpc, prog); + } + + list_for_each_entry_safe (listener, next, &rpc->listeners, list) { + rpcsvc_listener_destroy (listener); + } + + (void) rpcsvc_unregister_notify (rpc, fn, this); + unlink (sockfile); + + GF_FREE (rpc); +} + +rpcsvc_t * +changelog_rpc_server_init (xlator_t *this, char *sockfile, void *cbkdata, + rpcsvc_notify_t fn, struct rpcsvc_program **progs) +{ + int j = 0; + int ret = 0; + rpcsvc_t *rpc = NULL; + dict_t *options = NULL; + struct rpcsvc_program *prog = NULL; + + if (!cbkdata) + cbkdata = this; + + options = dict_new (); + if (!options) + goto error_return; + + ret = rpcsvc_transport_unix_options_build (&options, sockfile); + if (ret) + goto dealloc_dict; + + rpc = rpcsvc_init (this, this->ctx, options, 8); + if (rpc == NULL) { + gf_log (this->name, GF_LOG_ERROR, "failed to init rpc"); + goto dealloc_dict; + } + + ret = rpcsvc_register_notify (rpc, fn, cbkdata); + if (ret) { + gf_log (this->name, + GF_LOG_ERROR, "failed to register notify function"); + goto dealloc_rpc; + } + + ret = rpcsvc_create_listeners (rpc, options, this->name); + if (ret != 1) { + gf_log (this->name, + GF_LOG_DEBUG, "failed to create listeners"); + goto dealloc_rpc; + } + + while (*progs) { + prog = *progs; + ret = rpcsvc_program_register (rpc, prog); + if (ret) { + gf_log (this->name, + GF_LOG_ERROR, "cannot register program " + "(name: %s, prognum: %d, pogver: %d)", + prog->progname, prog->prognum, prog->progver); + goto dealloc_rpc; + } + + progs++; + } + + dict_unref (options); + return rpc; + + dealloc_rpc: + GF_FREE (rpc); + dealloc_dict: + dict_unref (options); + error_return: + return NULL; +} diff --git a/xlators/features/changelog/src/changelog-rpc-common.h b/xlators/features/changelog/src/changelog-rpc-common.h new file mode 100644 index 00000000000..b15590ef43c --- /dev/null +++ b/xlators/features/changelog/src/changelog-rpc-common.h @@ -0,0 +1,84 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __CHANGELOG_RPC_COMMON_H +#define __CHANGELOG_RPC_COMMON_H + +#include "rpcsvc.h" +#include "rpc-clnt.h" +#include "event.h" +#include "call-stub.h" + +#include "changelog-xdr.h" +#include "xdr-generic.h" + +#include "changelog.h" + +/** + * Let's keep this non-configurable for now. + */ +#define NR_ROTT_BUFFS 4 +#define NR_DISPATCHERS (NR_ROTT_BUFFS - 1) + +enum changelog_rpc_procnum { + CHANGELOG_RPC_PROC_NULL = 0, + CHANGELOG_RPC_PROBE_FILTER = 1, + CHANGELOG_RPC_PROC_MAX = 2, +}; + +#define CHANGELOG_RPC_PROGNUM 1885957735 +#define CHANGELOG_RPC_PROGVER 1 + +/** + * reverse connection: data xfer path + */ +enum changelog_reverse_rpc_procnum { + CHANGELOG_REV_PROC_NULL = 0, + CHANGELOG_REV_PROC_EVENT = 1, + CHANGELOG_REV_PROC_MAX = 2, +}; + +#define CHANGELOG_REV_RPC_PROCNUM 1886350951 +#define CHANGELOG_REV_RPC_PROCVER 1 + +typedef struct changelog_rpc { + rpcsvc_t *svc; + struct rpc_clnt *rpc; + char sock[UNIX_PATH_MAX]; /* tied to server */ +} changelog_rpc_t; + +/* event poller */ +void *changelog_rpc_poller (void *); + +/* CLIENT API */ +struct rpc_clnt * +changelog_rpc_client_init (xlator_t *, void *, char *, rpc_clnt_notify_t); + +int +changelog_rpc_sumbit_req (struct rpc_clnt *, void *, call_frame_t *, + rpc_clnt_prog_t *, int , struct iovec *, int, + struct iobref *, xlator_t *, fop_cbk_fn_t, xdrproc_t); + +int +changelog_invoke_rpc (xlator_t *, struct rpc_clnt *, + rpc_clnt_prog_t *, int , void *); + +/* SERVER API */ +int +changelog_rpc_sumbit_reply (rpcsvc_request_t *, void *, + struct iovec *, int,struct iobref *, xdrproc_t); +rpcsvc_t * +changelog_rpc_server_init (xlator_t *, char *, void*, + rpcsvc_notify_t, struct rpcsvc_program **); +void +changelog_rpc_server_destroy (xlator_t *, rpcsvc_t *, char *, + rpcsvc_notify_t, struct rpcsvc_program **); + +#endif diff --git a/xlators/features/changelog/src/changelog-rpc.c b/xlators/features/changelog/src/changelog-rpc.c new file mode 100644 index 00000000000..04326456d31 --- /dev/null +++ b/xlators/features/changelog/src/changelog-rpc.c @@ -0,0 +1,300 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "changelog-rpc.h" +#include "changelog-mem-types.h" +#include "changelog-ev-handle.h" + +struct rpcsvc_program *changelog_programs[]; + +static void +changelog_cleanup_dispatchers (xlator_t *this, + changelog_priv_t *priv, int count) +{ + for (; count >= 0; count--) { + (void) changelog_thread_cleanup + (this, priv->ev_dispatcher[count]); + } +} + +static int +changelog_cleanup_rpc_threads (xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + changelog_clnt_t *conn = NULL; + + conn = &priv->connections; + if (!conn) + return 0; + + /** terminate RPC thread(s) */ + ret = changelog_thread_cleanup (this, priv->connector); + if (ret != 0) + goto error_return; + /** terminate dispatcher thread(s) */ + changelog_cleanup_dispatchers (this, priv, priv->nr_dispatchers); + + /* TODO: what about pending and waiting connections? */ + changelog_ev_cleanup_connections (this, conn); + + /* destroy locks */ + ret = pthread_mutex_destroy (&conn->pending_lock); + if (ret != 0) + goto error_return; + ret = pthread_cond_destroy (&conn->pending_cond); + if (ret != 0) + goto error_return; + ret = LOCK_DESTROY (&conn->active_lock); + if (ret != 0) + goto error_return; + ret = LOCK_DESTROY (&conn->wait_lock); + if (ret != 0) + goto error_return; + return 0; + + error_return: + return -1; +} + +static int +changelog_init_rpc_threads (xlator_t *this, changelog_priv_t *priv, + rbuf_t *rbuf, int nr_dispatchers) +{ + int j = 0; + int ret = 0; + changelog_clnt_t *conn = NULL; + + conn = &priv->connections; + + conn->this = this; + conn->rbuf = rbuf; + conn->sequence = 1; /* start with sequence number one */ + + INIT_LIST_HEAD (&conn->pending); + INIT_LIST_HEAD (&conn->active); + INIT_LIST_HEAD (&conn->waitq); + + ret = pthread_mutex_init (&conn->pending_lock, NULL); + if (ret) + goto error_return; + ret = pthread_cond_init (&conn->pending_cond, NULL); + if (ret) + goto cleanup_pending_lock; + + ret = LOCK_INIT (&conn->active_lock); + if (ret) + goto cleanup_pending_cond; + ret = LOCK_INIT (&conn->wait_lock); + if (ret) + goto cleanup_active_lock; + + /* spawn reverse connection thread */ + ret = pthread_create (&priv->connector, + NULL, changelog_ev_connector, conn); + if (ret != 0) + goto cleanup_wait_lock; + + /* spawn dispatcher thread(s) */ + priv->ev_dispatcher = GF_CALLOC (nr_dispatchers, sizeof(pthread_t), + gf_changelog_mt_ev_dispatcher_t); + if (!priv->ev_dispatcher) + goto cleanup_connector; + + /* spawn dispatcher threads */ + for (; j < nr_dispatchers; j++) { + ret = pthread_create (&priv->ev_dispatcher[j], + NULL, changelog_ev_dispatch, conn); + if (ret != 0) { + changelog_cleanup_dispatchers (this, priv, --j); + break; + } + } + + if (ret != 0) + goto cleanup_connector; + + priv->nr_dispatchers = nr_dispatchers; + return 0; + + cleanup_connector: + (void) pthread_cancel (priv->connector); + cleanup_wait_lock: + (void) LOCK_DESTROY (&conn->wait_lock); + cleanup_active_lock: + (void) LOCK_DESTROY (&conn->active_lock); + cleanup_pending_cond: + (void) pthread_cond_destroy (&conn->pending_cond); + cleanup_pending_lock: + (void) pthread_mutex_destroy (&conn->pending_lock); + error_return: + return -1; +} + +int +changelog_rpcsvc_notify (rpcsvc_t *rpc, + void *xl, rpcsvc_event_t event, void *data) +{ + return 0; +} + +void +changelog_destroy_rpc_listner (xlator_t *this, changelog_priv_t *priv) +{ + char sockfile[UNIX_PATH_MAX] = {0,}; + + /* sockfile path could have been saved to avoid this */ + CHANGELOG_MAKE_SOCKET_PATH (priv->changelog_brick, + sockfile, UNIX_PATH_MAX); + changelog_rpc_server_destroy (this, + priv->rpc, sockfile, + changelog_rpcsvc_notify, + changelog_programs); + (void) changelog_cleanup_rpc_threads (this, priv); +} + +rpcsvc_t * +changelog_init_rpc_listner (xlator_t *this, changelog_priv_t *priv, + rbuf_t *rbuf, int nr_dispatchers) +{ + int ret = 0; + char sockfile[UNIX_PATH_MAX] = {0,}; + + ret = changelog_init_rpc_threads (this, priv, rbuf, nr_dispatchers); + if (ret) + return NULL; + + CHANGELOG_MAKE_SOCKET_PATH (priv->changelog_brick, + sockfile, UNIX_PATH_MAX); + return changelog_rpc_server_init (this, sockfile, NULL, + changelog_rpcsvc_notify, + changelog_programs); +} + +void +changelog_rpc_clnt_cleanup (changelog_rpc_clnt_t *crpc) +{ + if (!crpc) + return; + crpc->c_clnt = NULL; + (void) LOCK_DESTROY (&crpc->lock); + GF_FREE (crpc); +} + +inline changelog_rpc_clnt_t * +changelog_rpc_clnt_init (xlator_t *this, + changelog_probe_req *rpc_req, changelog_clnt_t *c_clnt) +{ + int ret = 0; + changelog_rpc_clnt_t *crpc = NULL; + + crpc = GF_CALLOC (1, sizeof (*crpc), gf_changelog_mt_rpc_clnt_t); + if (!crpc) + goto error_return; + INIT_LIST_HEAD (&crpc->list); + + crpc->ref = 0; + changelog_set_disconnect_flag (crpc, _gf_false); + + crpc->filter = rpc_req->filter; + (void) memcpy (crpc->sock, rpc_req->sock, strlen (rpc_req->sock)); + + crpc->this = this; + crpc->c_clnt = c_clnt; + crpc->cleanup = changelog_rpc_clnt_cleanup; + + ret = LOCK_INIT (&crpc->lock); + if (ret != 0) + goto dealloc_crpc; + return crpc; + + dealloc_crpc: + GF_FREE (crpc); + error_return: + return NULL; +} + +/** + * Actor declarations + */ + +/** + * @probe_handler + * A probe RPC call spawns a connect back to the caller. Caller also + * passes an hint which acts as a filter for selecting updates. + */ + +int +changelog_handle_probe (rpcsvc_request_t *req) +{ + int ret = 0; + xlator_t *this = NULL; + rpcsvc_t *svc = NULL; + changelog_priv_t *priv = NULL; + changelog_clnt_t *c_clnt = NULL; + changelog_rpc_clnt_t *crpc = NULL; + + changelog_probe_req rpc_req = {0,}; + changelog_probe_rsp rpc_rsp = {0,}; + + ret = xdr_to_generic (req->msg[0], + &rpc_req, (xdrproc_t)xdr_changelog_probe_req); + if (ret < 0) { + gf_log ("", GF_LOG_ERROR, "xdr decoding error"); + req->rpc_err = GARBAGE_ARGS; + goto handle_xdr_error; + } + + /* ->xl hidden in rpcsvc */ + svc = rpcsvc_request_service (req); + this = svc->mydata; + priv = this->private; + c_clnt = &priv->connections; + + crpc = changelog_rpc_clnt_init (this, &rpc_req, c_clnt); + if (!crpc) + goto handle_xdr_error; + + changelog_ev_queue_connection (c_clnt, crpc); + rpc_rsp.op_ret = 0; + + goto submit_rpc; + + handle_xdr_error: + rpc_rsp.op_ret = -1; + submit_rpc: + (void) changelog_rpc_sumbit_reply (req, &rpc_rsp, NULL, 0, NULL, + (xdrproc_t)xdr_changelog_probe_rsp); + return 0; +} + +/** + * RPC declarations + */ + +rpcsvc_actor_t changelog_svc_actors[CHANGELOG_RPC_PROC_MAX] = { + [CHANGELOG_RPC_PROBE_FILTER] = { + "CHANGELOG PROBE FILTER", CHANGELOG_RPC_PROBE_FILTER, + changelog_handle_probe, NULL, 0, DRC_NA + }, +}; + +struct rpcsvc_program changelog_svc_prog = { + .progname = CHANGELOG_RPC_PROGNAME, + .prognum = CHANGELOG_RPC_PROGNUM, + .progver = CHANGELOG_RPC_PROGVER, + .numactors = CHANGELOG_RPC_PROC_MAX, + .actors = changelog_svc_actors, + .synctask = _gf_true, +}; + +struct rpcsvc_program *changelog_programs[] = { + &changelog_svc_prog, + NULL, +}; diff --git a/xlators/features/changelog/src/changelog-rpc.h b/xlators/features/changelog/src/changelog-rpc.h new file mode 100644 index 00000000000..e2833f34dc3 --- /dev/null +++ b/xlators/features/changelog/src/changelog-rpc.h @@ -0,0 +1,29 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __CHANGELOG_RPC_H +#define __CHANGELOG_RPC_H + +#include "xlator.h" +#include "changelog-helpers.h" + +// one time +#include "socket.h" +#include "changelog-rpc-common.h" + +#define CHANGELOG_RPC_PROGNAME "GlusterFS Changelog" + +rpcsvc_t * +changelog_init_rpc_listner (xlator_t *, changelog_priv_t *, rbuf_t *, int); + +void +changelog_destroy_rpc_listner (xlator_t *, changelog_priv_t *); + +#endif diff --git a/xlators/features/changelog/src/changelog.c b/xlators/features/changelog/src/changelog.c index 4263a462ad7..ad1b78d7c65 100644 --- a/xlators/features/changelog/src/changelog.c +++ b/xlators/features/changelog/src/changelog.c @@ -19,14 +19,13 @@ #include "iobuf.h" #include "changelog-rt.h" -#include "changelog-helpers.h" #include "changelog-encoders.h" #include "changelog-mem-types.h" #include -#include "changelog-notifier.h" +#include "changelog-rpc.h" static struct changelog_bootstrap cb_bootstrap[] = { @@ -37,6 +36,28 @@ cb_bootstrap[] = { }, }; +#define NEED_RELEASE_CBK 0x1 + +static inline void +changelog_save_release_flags (xlator_t *this, fd_t *fd) +{ + int32_t ret = 0; + + ret = changelog_inode_mod_ctx_flags (this, fd->inode, fd->flags); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "could not set open flag in inode context"); + goto done; + } + + ret = fd_ctx_set (fd, this, (uint64_t)(long) NEED_RELEASE_CBK); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "could not set fd context (for relase callback)"); + done: + return; +} + /* Entry operations - TYPE III */ /** @@ -912,15 +933,24 @@ changelog_create_cbk (call_frame_t *frame, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { + int32_t ret = 0; changelog_priv_t *priv = NULL; changelog_local_t *local = NULL; + changelog_event_t ev = {0,}; priv = this->private; local = frame->local; CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + /* fill the event structure.. similar to open() */ + ev.ev_type = CHANGELOG_OP_TYPE_CREATE; + uuid_copy (ev.u.create.gfid, buf->ia_gfid); + ev.u.create.flags = fd->flags; + changelog_dispatch_event (this, priv, &ev); + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + changelog_save_release_flags (this, fd); unwind: changelog_dec_fop_cnt (this, priv, local); @@ -1631,6 +1661,109 @@ changelog_writev (call_frame_t *frame, return 0; } +/* }}} */ + +/* open, release and other beasts */ + +/* {{{ */ + + + +int +changelog_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + int ret = 0; + void *opaque = NULL; + char *buf = NULL; + ssize_t buflen = 0; + changelog_priv_t *priv = NULL; + changelog_event_t ev = {0,}; + gf_boolean_t logopen = _gf_false; + + priv = this->private; + if (frame->local) { + frame->local = NULL; + logopen = _gf_true; + } + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !logopen), unwind); + + /* fill the event structure */ + ev.ev_type = CHANGELOG_OP_TYPE_OPEN; + uuid_copy (ev.u.open.gfid, fd->inode->gfid); + ev.u.open.flags = fd->flags; + + changelog_dispatch_event (this, priv, &ev); + changelog_save_release_flags (this, fd); + + unwind: + CHANGELOG_STACK_UNWIND (open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +int +changelog_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int flags, fd_t *fd, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + frame->local = (void *)0x1; /* do not dereference in ->cbk */ + + wind: + STACK_WIND (frame, changelog_open_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->open, loc, flags, fd, xdata); + return 0; +} + +/* }}} */ + + +/* {{{ */ + +int32_t changelog_release (xlator_t *this, fd_t *fd) +{ + int32_t flags = -1; + ia_prot_t prot = {0,}; + inode_t *inode = NULL; + changelog_priv_t *priv = NULL; + changelog_event_t ev = {0,}; + changelog_inode_ctx_t *ctx = NULL; + + priv = this->private; + inode = fd->inode; + + LOCK (&inode->lock); + { + if (!list_empty (&inode->fd_list)) /* or MAW check ->fd_count */ + goto unblock; + ctx = __changelog_inode_ctx_get (this, inode, NULL, NULL, 0); + if (!ctx) + goto unblock; + flags = ctx->ordflags; + ctx->ordflags = 0; + } + unblock: + UNLOCK (&inode->lock); + + if (ctx) { + ev.ev_type = CHANGELOG_OP_TYPE_RELEASE; + + ev.u.release.mode = st_mode_from_ia (prot, fd->inode->ia_type); + ev.u.release.ordflags = flags; + uuid_copy (ev.u.release.gfid, fd->inode->gfid); + + changelog_dispatch_event (this, priv, &ev); + (void) fd_ctx_del (fd, this, NULL); + } + + return 0; +} + + /* }}} */ /** @@ -1679,7 +1812,7 @@ changelog_cleanup_helper_threads (xlator_t *this, changelog_priv_t *priv) int ret = 0; if (priv->cr.rollover_th) { - changelog_thread_cleanup (this, priv->cr.rollover_th); + (void) changelog_thread_cleanup (this, priv->cr.rollover_th); priv->cr.rollover_th = 0; ret = close (priv->cr_wfd); if (ret) @@ -1689,7 +1822,7 @@ changelog_cleanup_helper_threads (xlator_t *this, changelog_priv_t *priv) } if (priv->cf.fsync_th) { - changelog_thread_cleanup (this, priv->cf.fsync_th); + (void) changelog_thread_cleanup (this, priv->cf.fsync_th); priv->cf.fsync_th = 0; } } @@ -1754,67 +1887,6 @@ changelog_spawn_helper_threads (xlator_t *this, changelog_priv_t *priv) return ret; } -/* cleanup the notifier thread */ -static int -changelog_cleanup_notifier (xlator_t *this, changelog_priv_t *priv) -{ - int ret = 0; - - if (priv->cn.notify_th) { - changelog_thread_cleanup (this, priv->cn.notify_th); - priv->cn.notify_th = 0; - - ret = close (priv->wfd); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "error closing writer end of notifier pipe" - " (reason: %s)", strerror (errno)); - } - - return ret; -} - -/* spawn the notifier thread - nop if already running */ -static int -changelog_spawn_notifier (xlator_t *this, changelog_priv_t *priv) -{ - int ret = 0; - int flags = 0; - int pipe_fd[2] = {0, 0}; - - if (priv->cn.notify_th) - goto out; /* notifier thread already running */ - - ret = pipe (pipe_fd); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Cannot create pipe (reason: %s)", strerror (errno)); - goto out; - } - - /* writer is non-blocking */ - flags = fcntl (pipe_fd[1], F_GETFL); - flags |= O_NONBLOCK; - - ret = fcntl (pipe_fd[1], F_SETFL, flags); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "failed to set O_NONBLOCK flag"); - goto out; - } - - priv->wfd = pipe_fd[1]; - - priv->cn.this = this; - priv->cn.rfd = pipe_fd[0]; - - ret = gf_thread_create (&priv->cn.notify_th, - NULL, changelog_notifier, priv); - - out: - return ret; -} - int notify (xlator_t *this, int event, void *data, ...) { @@ -2054,11 +2126,6 @@ changelog_init (xlator_t *this, changelog_priv_t *priv) if (!priv->active) return ret; - /* spawn the notifier thread */ - ret = changelog_spawn_notifier (this, priv); - if (ret) - goto out; - /** * start with a fresh changelog file every time. this is done * in case there was an encoding change. so... things are kept @@ -2086,9 +2153,11 @@ changelog_init (xlator_t *this, changelog_priv_t *priv) return ret; } -/* Init all pthread condition variables and locks in changelog*/ +/** + * Init barrier related condition variables and locks + */ static int -changelog_pthread_init (xlator_t *this, changelog_priv_t *priv) +changelog_barrier_pthread_init (xlator_t *this, changelog_priv_t *priv) { gf_boolean_t bn_mutex_init = _gf_false; gf_boolean_t bn_cond_init = _gf_false; @@ -2165,9 +2234,9 @@ changelog_pthread_init (xlator_t *this, changelog_priv_t *priv) return ret; } -/* Destroy all pthread condition variables and locks in changelog */ +/* Destroy barrier related condition variables and locks */ static inline void -changelog_pthread_destroy (changelog_priv_t *priv) +changelog_barrier_pthread_destroy (changelog_priv_t *priv) { pthread_mutex_destroy (&priv->bn.bnotify_mutex); pthread_cond_destroy (&priv->bn.bnotify_cond); @@ -2284,17 +2353,13 @@ reconfigure (xlator_t *this, dict_t *options) } htime_open(this, priv, tv.tv_sec); } - ret = changelog_spawn_notifier (this, priv); - if (!ret) - ret = changelog_spawn_helper_threads (this, - priv); - } else - ret = changelog_cleanup_notifier (this, priv); + ret = changelog_spawn_helper_threads (this, priv); + } } out: if (ret) { - ret = changelog_cleanup_notifier (this, priv); + // TODO } else { gf_log (this->name, GF_LOG_DEBUG, "changelog reconfigured"); @@ -2305,67 +2370,40 @@ reconfigure (xlator_t *this, dict_t *options) return ret; } -int32_t -init (xlator_t *this) +static void +changelog_freeup_options (xlator_t *this, changelog_priv_t *priv) { - int ret = -1; - char *tmp = NULL; - changelog_priv_t *priv = NULL; - gf_boolean_t cond_lock_init = _gf_false; - char htime_dir[PATH_MAX] = {0,}; - char csnap_dir[PATH_MAX] = {0,}; - uint32_t timeout = 0; - - GF_VALIDATE_OR_GOTO ("changelog", this, out); - - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "translator needs a single subvolume"); - goto out; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_ERROR, - "dangling volume. please check volfile"); - goto out; - } - - priv = GF_CALLOC (1, sizeof (*priv), gf_changelog_mt_priv_t); - if (!priv) - goto out; + int ret = 0; - this->local_pool = mem_pool_new (changelog_local_t, 64); - if (!this->local_pool) { + ret = priv->cb->dtor (this, &priv->cd); + if (ret) gf_log (this->name, GF_LOG_ERROR, - "failed to create local memory pool"); - goto out; - } - - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->c_snap_lock); + "could not cleanup bootstrapper"); + GF_FREE (priv->changelog_brick); + GF_FREE (priv->changelog_dir); +} - GF_OPTION_INIT ("changelog-brick", tmp, str, out); - if (!tmp) { - gf_log (this->name, GF_LOG_ERROR, - "\"changelog-brick\" option is not set"); - goto out; - } +static int +changelog_init_options (xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + char *tmp = NULL; + uint32_t timeout = 0; + char htime_dir[PATH_MAX] = {0,}; + char csnap_dir[PATH_MAX] = {0,}; + GF_OPTION_INIT ("changelog-brick", tmp, str, error_return); priv->changelog_brick = gf_strdup (tmp); if (!priv->changelog_brick) - goto out; - tmp = NULL; + goto error_return; - GF_OPTION_INIT ("changelog-dir", tmp, str, out); - if (!tmp) { - gf_log (this->name, GF_LOG_ERROR, - "\"changelog-dir\" option is not set"); - goto out; - } + tmp = NULL; + GF_OPTION_INIT ("changelog-dir", tmp, str, dealloc_1); priv->changelog_dir = gf_strdup (tmp); if (!priv->changelog_dir) - goto out; + goto dealloc_1; + tmp = NULL; /** @@ -2375,35 +2413,38 @@ init (xlator_t *this) ret = mkdir_p (priv->changelog_dir, 0600, _gf_true); if (ret) - goto out; + goto dealloc_2; - CHANGELOG_FILL_HTIME_DIR(priv->changelog_dir, htime_dir); + CHANGELOG_FILL_HTIME_DIR (priv->changelog_dir, htime_dir); ret = mkdir_p (htime_dir, 0600, _gf_true); if (ret) - goto out; + goto dealloc_2; - CHANGELOG_FILL_CSNAP_DIR(priv->changelog_dir, csnap_dir); + CHANGELOG_FILL_CSNAP_DIR (priv->changelog_dir, csnap_dir); ret = mkdir_p (csnap_dir, 0600, _gf_true); if (ret) - goto out; + goto dealloc_2; - GF_OPTION_INIT ("changelog", priv->active, bool, out); + GF_OPTION_INIT ("changelog", priv->active, bool, dealloc_2); - GF_OPTION_INIT ("op-mode", tmp, str, out); + GF_OPTION_INIT ("op-mode", tmp, str, dealloc_2); changelog_assign_opmode (priv, tmp); tmp = NULL; - GF_OPTION_INIT ("encoding", tmp, str, out); + GF_OPTION_INIT ("encoding", tmp, str, dealloc_2); changelog_assign_encoding (priv, tmp); + changelog_encode_change (priv); - GF_OPTION_INIT ("rollover-time", priv->rollover_time, int32, out); + GF_OPTION_INIT ("rollover-time", + priv->rollover_time, int32, dealloc_2); - GF_OPTION_INIT ("fsync-interval", priv->fsync_interval, int32, out); - GF_OPTION_INIT ("changelog-barrier-timeout", timeout, time, out); - priv->timeout.tv_sec = timeout; + GF_OPTION_INIT ("fsync-interval", + priv->fsync_interval, int32, dealloc_2); - changelog_encode_change(priv); + GF_OPTION_INIT ("changelog-barrier-timeout", + timeout, time, dealloc_2); + changelog_assign_barrier_timeout (priv, timeout); GF_ASSERT (cb_bootstrap[priv->op_mode].mode == priv->op_mode); priv->cb = &cb_bootstrap[priv->op_mode]; @@ -2411,10 +2452,111 @@ init (xlator_t *this) /* ... now bootstrap the logger */ ret = priv->cb->ctor (this, &priv->cd); if (ret) - goto out; + goto dealloc_2; priv->changelog_fd = -1; + return 0; + + dealloc_2: + GF_FREE (priv->changelog_dir); + dealloc_1: + GF_FREE (priv->changelog_brick); + error_return: + return -1; +} + +static void +changelog_cleanup_rpc (xlator_t *this, changelog_priv_t *priv) +{ + /* terminate rpc server */ + changelog_destroy_rpc_listner (this, priv); + + /* cleanup rot buffs */ + rbuf_dtor (priv->rbuf); + + /* cleanup poller thread */ + (void) changelog_thread_cleanup (this, priv->poller); +} + +static int +changelog_init_rpc (xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + rpcsvc_t *rpc = NULL; + changelog_ev_selector_t *selection = NULL; + + selection = &priv->ev_selection; + + /* initialize event selection */ + changelog_init_event_selection (this, selection); + + ret = pthread_create (&priv->poller, NULL, changelog_rpc_poller, this); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to spawn poller thread"); + goto error_return; + } + + priv->rbuf = rbuf_init (NR_ROTT_BUFFS); + if (!priv->rbuf) + goto cleanup_thread; + + rpc = changelog_init_rpc_listner (this, priv, + priv->rbuf, NR_DISPATCHERS); + if (!rpc) + goto cleanup_rbuf; + priv->rpc = rpc; + + return 0; + + cleanup_rbuf: + rbuf_dtor (priv->rbuf); + cleanup_thread: + (void) changelog_thread_cleanup (this, priv->poller); + error_return: + return -1; +} + +int32_t +init (xlator_t *this) +{ + int ret = -1; + char *tmp = NULL; + changelog_priv_t *priv = NULL; + + GF_VALIDATE_OR_GOTO ("changelog", this, error_return); + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "translator needs a single subvolume"); + goto error_return; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_ERROR, + "dangling volume. please check volfile"); + goto error_return; + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_changelog_mt_priv_t); + if (!priv) + goto error_return; + + this->local_pool = mem_pool_new (changelog_local_t, 64); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create local memory pool"); + goto cleanup_priv; + } + + LOCK_INIT (&priv->lock); + LOCK_INIT (&priv->c_snap_lock); + + ret = changelog_init_options (this, priv); + if (ret) + goto cleanup_mempool; + /* snap dependency changes */ priv->dm.black_fop_cnt = 0; priv->dm.white_fop_cnt = 0; @@ -2422,67 +2564,68 @@ init (xlator_t *this) priv->dm.drain_wait_white = _gf_false; priv->current_color = FOP_COLOR_BLACK; priv->explicit_rollover = _gf_false; + /* Mutex is not needed as threads are not spawned yet */ priv->bn.bnotify = _gf_false; - ret = changelog_pthread_init (this, priv); + ret = changelog_barrier_pthread_init (this, priv); if (ret) - goto out; - + goto cleanup_options; LOCK_INIT (&priv->bflags.lock); - cond_lock_init = _gf_true; priv->bflags.barrier_ext = _gf_false; /* Changelog barrier init */ INIT_LIST_HEAD (&priv->queue); priv->barrier_enabled = _gf_false; - ret = changelog_init (this, priv); + /* RPC ball rolling.. */ + ret = changelog_init_rpc (this, priv); if (ret) - goto out; + goto cleanup_barrier; + ret = changelog_init (this, priv); + if (ret) + goto cleanup_rpc; gf_log (this->name, GF_LOG_DEBUG, "changelog translator loaded"); - out: - if (ret) { - if (this && this->local_pool) - mem_pool_destroy (this->local_pool); - if (priv) { - if (priv->cb) { - ret = priv->cb->dtor (this, &priv->cd); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "error in cleanup during init()"); - } - GF_FREE (priv->changelog_brick); - GF_FREE (priv->changelog_dir); - if (cond_lock_init) - changelog_pthread_destroy (priv); - GF_FREE (priv); - } - this->private = NULL; - } else - this->private = priv; + this->private = priv; + return 0; - return ret; + cleanup_rpc: + changelog_cleanup_rpc (this, priv); + cleanup_barrier: + changelog_barrier_pthread_destroy (priv); + cleanup_options: + changelog_freeup_options (this, priv); + cleanup_mempool: + mem_pool_destroy (this->local_pool); + cleanup_priv: + GF_FREE (priv); + error_return: + this->private = NULL; + return -1; } void fini (xlator_t *this) { - int ret = -1; changelog_priv_t *priv = NULL; priv = this->private; if (priv) { - ret = priv->cb->dtor (this, &priv->cd); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "error in fini"); + /* terminate RPC server/threads */ + changelog_cleanup_rpc (this, priv); + + /* cleanup barrier related objects */ + changelog_barrier_pthread_destroy (priv); + + /* cleanup allocated options */ + changelog_freeup_options (this, priv); + + /* deallocate mempool */ mem_pool_destroy (this->local_pool); - GF_FREE (priv->changelog_brick); - GF_FREE (priv->changelog_dir); - changelog_pthread_destroy (priv); + + /* finally, dealloac private variable */ GF_FREE (priv); } @@ -2492,6 +2635,7 @@ fini (xlator_t *this) } struct xlator_fops fops = { + .open = changelog_open, .mknod = changelog_mknod, .mkdir = changelog_mkdir, .create = changelog_create, @@ -2513,6 +2657,7 @@ struct xlator_fops fops = { struct xlator_cbks cbks = { .forget = changelog_forget, + .release = changelog_release, }; struct volume_options options[] = { From 69910835e1fb75309f77754ca0cc5ed80795e617 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Mon, 9 Feb 2015 18:28:21 +0530 Subject: [PATCH 5/6] features/bitrot: BitRot server stub Bitrot stub implements object versioning required for identifying signature freshness. More details about versioning is explained as a part of the "bitrot feature doc" patch. Change-Id: I2ad70d9eb109ba4a12148ab8d81336afda529ad9 Signed-off-by: Venky Shankar --- configure.ac | 3 + libglusterfs/src/common-utils.h | 1 + libglusterfs/src/glusterfs.h | 17 + xlators/features/Makefile.am | 3 +- xlators/features/bit-rot/Makefile.am | 1 + xlators/features/bit-rot/src/Makefile.am | 1 + xlators/features/bit-rot/src/stub/Makefile.am | 15 + .../bit-rot/src/stub/bit-rot-common.h | 128 ++ .../bit-rot/src/stub/bit-rot-stub-mem-types.h | 24 + .../features/bit-rot/src/stub/bit-rot-stub.c | 1285 +++++++++++++++++ .../features/bit-rot/src/stub/bit-rot-stub.h | 127 ++ .../features/changelog/lib/src/changelog.h | 1 + .../changelog/src/changelog-helpers.c | 27 +- .../changelog/src/changelog-helpers.h | 11 +- xlators/features/changelog/src/changelog.c | 73 +- xlators/mgmt/glusterd/src/glusterd-volgen.c | 8 + .../mgmt/glusterd/src/glusterd-volume-set.c | 6 + xlators/storage/posix/src/posix-helpers.c | 70 + xlators/storage/posix/src/posix.c | 23 + xlators/storage/posix/src/posix.h | 3 + 20 files changed, 1819 insertions(+), 8 deletions(-) create mode 100644 xlators/features/bit-rot/Makefile.am create mode 100644 xlators/features/bit-rot/src/Makefile.am create mode 100644 xlators/features/bit-rot/src/stub/Makefile.am create mode 100644 xlators/features/bit-rot/src/stub/bit-rot-common.h create mode 100644 xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h create mode 100644 xlators/features/bit-rot/src/stub/bit-rot-stub.c create mode 100644 xlators/features/bit-rot/src/stub/bit-rot-stub.h diff --git a/configure.ac b/configure.ac index 2d93ce847b6..68ea9ced0f4 100644 --- a/configure.ac +++ b/configure.ac @@ -148,6 +148,9 @@ AC_CONFIG_FILES([Makefile xlators/features/snapview-server/src/Makefile xlators/features/snapview-client/Makefile xlators/features/snapview-client/src/Makefile + xlators/features/bit-rot/Makefile + xlators/features/bit-rot/src/Makefile + xlators/features/bit-rot/src/stub/Makefile xlators/playground/Makefile xlators/playground/template/Makefile xlators/playground/template/src/Makefile diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h index 69b1e277861..8d7737dc418 100644 --- a/libglusterfs/src/common-utils.h +++ b/libglusterfs/src/common-utils.h @@ -117,6 +117,7 @@ enum _gf_client_pid GF_CLIENT_PID_QUOTA_MOUNT = -5, GF_CLIENT_PID_AFR_SELF_HEALD = -6, GF_CLIENT_PID_GLFS_HEAL = -7, + GF_CLIENT_PID_BITD = -8, }; typedef enum _gf_boolean gf_boolean_t; diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 8f993b2086a..e071e218232 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -116,6 +116,23 @@ #define GET_ANCESTRY_PATH_KEY "glusterfs.ancestry.path" #define GET_ANCESTRY_DENTRY_KEY "glusterfs.ancestry.dentry" +#define BITROT_DEFAULT_CURRENT_VERSION (unsigned long)1 +#define BITROT_DEFAULT_SIGNING_VERSION (unsigned long)0 + +/* on-disk object signature keys */ +#define BITROT_CURRENT_VERSION_KEY "trusted.glusterfs.bit-rot.version" +#define BITROT_SIGNING_VERSION_KEY "trusted.glusterfs.bit-rot.signature" + +/* GET/SET object signature */ +#define GLUSTERFS_GET_OBJECT_SIGNATURE "trusted.glusterfs.get-signature" +#define GLUSTERFS_SET_OBJECT_SIGNATURE "trusted.glusterfs.set-signature" + +/* operation needs to be durable on-disk */ +#define GLUSTERFS_DURABLE_OP "trusted.glusterfs.durable-op" + +/* key for version exchange b/w bitrot stub and changelog */ +#define GLUSTERFS_VERSION_XCHG_KEY "glusterfs.version.xchg" + #define GLUSTERFS_INTERNAL_FOP_KEY "glusterfs-internal-fop" #define ZR_FILE_CONTENT_STR "glusterfs.file." diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am index 8093441a043..f0d6a6edec9 100644 --- a/xlators/features/Makefile.am +++ b/xlators/features/Makefile.am @@ -1,4 +1,5 @@ SUBDIRS = locks quota read-only mac-compat quiesce marker index barrier \ - protect compress changelog gfid-access $(GLUPY_SUBDIR) qemu-block snapview-client snapview-server # trash path-converter # filter + protect compress changelog gfid-access $(GLUPY_SUBDIR) qemu-block snapview-client snapview-server \ + bit-rot # trash path-converter # filter CLEANFILES = diff --git a/xlators/features/bit-rot/Makefile.am b/xlators/features/bit-rot/Makefile.am new file mode 100644 index 00000000000..af437a64d6d --- /dev/null +++ b/xlators/features/bit-rot/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src diff --git a/xlators/features/bit-rot/src/Makefile.am b/xlators/features/bit-rot/src/Makefile.am new file mode 100644 index 00000000000..7581732c7d9 --- /dev/null +++ b/xlators/features/bit-rot/src/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = stub diff --git a/xlators/features/bit-rot/src/stub/Makefile.am b/xlators/features/bit-rot/src/stub/Makefile.am new file mode 100644 index 00000000000..5edd730aeeb --- /dev/null +++ b/xlators/features/bit-rot/src/stub/Makefile.am @@ -0,0 +1,15 @@ +xlator_LTLIBRARIES = bitrot-stub.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +bitrot_stub_la_LDFLAGS = -module -avoid-version + +bitrot_stub_la_SOURCES = bit-rot-stub.c +bitrot_stub_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = bit-rot-stub.h bit-rot-stub-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h new file mode 100644 index 00000000000..3c321aa3aeb --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h @@ -0,0 +1,128 @@ + /* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __BIT_ROT_COMMON_H__ +#define __BIT_ROT_COMMON_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" + +/** + * on-disk formats for ongoing version and object signature. + */ +typedef struct br_version { + unsigned long ongoingversion; + uint32_t timebuf[2]; +} br_version_t; + +typedef struct br_signature { + int8_t signaturetype; + + unsigned long signedversion; + unsigned long currentversion; + + char signature[0]; +} br_signature_t; + +/** + * in-memory representation of signature used by signer for object + * signing. + */ +typedef struct br_isignature_in { + int8_t signaturetype; /* signature type */ + + unsigned long signedversion; /* version against which the + object was signed */ + + char signature[0]; /* object signature */ +} br_isignature_t; + +/** + * in-memory representation of signature used by scrubber for object + * verification. + */ +typedef struct br_isignature_out { + char stale; /* stale signature? */ + + uint32_t time[2]; /* time when the object + got dirtied */ + + int8_t signaturetype; /* hash type */ + char signature[0]; /* signature (hash) */ +} br_isignature_out_t; + +typedef struct br_stub_init { + uint32_t timebuf[2]; + char export[PATH_MAX]; +} br_stub_init_t; + +typedef enum { + BR_SIGNATURE_TYPE_VOID = -1, /* object is not signed */ + BR_SIGNATURE_TYPE_ZERO = 0, /* min boundary */ + BR_SIGNATURE_TYPE_SHA256 = 1, /* signed with SHA256 */ + BR_SIGNATURE_TYPE_MAX = 2, /* max boundary */ +} br_signature_type; + +/* BitRot stub start time (virtual xattr) */ +#define GLUSTERFS_GET_BR_STUB_INIT_TIME "trusted.glusterfs.bit-rot.stub-init" + +static inline int +br_is_signature_type_valid (int8_t signaturetype) +{ + return ( (signaturetype > BR_SIGNATURE_TYPE_ZERO) + && (signaturetype < BR_SIGNATURE_TYPE_MAX) ); +} + +static inline void +br_set_default_ongoingversion (br_version_t *buf, uint32_t *tv) +{ + buf->ongoingversion = BITROT_DEFAULT_CURRENT_VERSION; + buf->timebuf[0] = tv[0]; + buf->timebuf[1] = tv[1]; +} + +static inline void +br_set_default_signature (br_signature_t *buf, size_t *size) +{ + buf->signaturetype = (int8_t) BR_SIGNATURE_TYPE_VOID; + buf->signedversion = BITROT_DEFAULT_SIGNING_VERSION; + buf->currentversion = BITROT_DEFAULT_CURRENT_VERSION; + + *size = sizeof (br_signature_t); /* no signature */ +} + +static inline void +br_set_ongoingversion (br_version_t *buf, + unsigned long version, uint32_t *tv) +{ + buf->ongoingversion = version; + buf->timebuf[0] = tv[0]; + buf->timebuf[1] = tv[1]; +} + +static inline void +br_set_signature (br_signature_t *buf, + br_isignature_t *sign, + unsigned long currentversion, + size_t signaturelen, size_t *size) +{ + buf->signaturetype = sign->signaturetype; + buf->signedversion = ntohl (sign->signedversion); + buf->currentversion = currentversion; + + memcpy (buf->signature, sign->signature, signaturelen); + *size = sizeof (br_signature_t) + signaturelen; +} + +#endif /* __BIT_ROT_COMMON_H__ */ diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h new file mode 100644 index 00000000000..64779923fd6 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _BR_MEM_TYPES_H +#define _BR_MEM_TYPES_H + +#include "mem-types.h" + +enum br_mem_types { + gf_br_stub_mt_private_t = gf_common_mt_end + 1, + gf_br_stub_mt_version_t = gf_common_mt_end + 2, + gf_br_stub_mt_inode_ctx_t = gf_common_mt_end + 3, + gf_br_stub_mt_signature_t = gf_common_mt_end + 4, + gf_br_stub_mt_end +}; + +#endif diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c new file mode 100644 index 00000000000..33b27fe0d60 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c @@ -0,0 +1,1285 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include +#include + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "logging.h" + +#include "bit-rot-stub.h" +#include "bit-rot-stub-mem-types.h" + +#include "bit-rot-common.h" + +int32_t +mem_acct_init (xlator_t *this) +{ + int32_t ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_br_stub_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, "Memory accounting" + " init failed"); + return ret; + } + + return ret; +} + +int32_t +init (xlator_t *this) +{ + char *tmp = NULL; + struct timeval tv = {0,}; + br_stub_private_t *priv = NULL; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, "FATAL: no children"); + goto error_return; + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_br_stub_mt_private_t); + if (!priv) + goto error_return; + + priv->local_pool = mem_pool_new (br_stub_local_t, 512); + if (!priv->local_pool) + goto free_priv; + + GF_OPTION_INIT ("bitrot", priv->go, bool, free_mempool); + + GF_OPTION_INIT ("export", tmp, str, free_mempool); + memcpy (priv->export, tmp, strlen (tmp) + 1); + + (void) gettimeofday (&tv, NULL); + + /* boot time is in network endian format */ + priv->boot[0] = htonl (tv.tv_sec); + priv->boot[1] = htonl (tv.tv_usec); + + gf_log (this->name, GF_LOG_DEBUG, "bit-rot stub loaded"); + this->private = priv; + return 0; + + free_mempool: + mem_pool_destroy (priv->local_pool); + free_priv: + GF_FREE (priv); + error_return: + return -1; +} + +void +fini (xlator_t *this) +{ + br_stub_private_t *priv = this->private; + + if (!priv) + return; + this->private = NULL; + GF_FREE (priv); + + return; +} + +static inline int +br_stub_alloc_versions (br_version_t **obuf, + br_signature_t **sbuf, size_t signaturelen) +{ + void *mem = NULL; + size_t size = 0; + + if (obuf) + size += sizeof (br_version_t); + if (sbuf) + size += sizeof (br_signature_t) + signaturelen; + + mem = GF_CALLOC (1, size, gf_br_stub_mt_version_t); + if (!mem) + goto error_return; + + if (obuf) { + *obuf = (br_version_t *)mem; + mem = ((char *)mem + sizeof (br_version_t)); + } + if (sbuf) { + *sbuf = (br_signature_t *)mem; + } + + return 0; + + error_return: + return -1; +} + +static inline void +br_stub_dealloc_versions (void *mem) +{ + GF_FREE (mem); +} + +static inline br_stub_local_t * +br_stub_alloc_local (xlator_t *this) +{ + br_stub_private_t *priv = this->private; + + return mem_get0 (priv->local_pool); +} + +static inline void +br_stub_dealloc_local (br_stub_local_t *ptr) +{ + mem_put (ptr); +} + +static inline int +br_stub_prepare_default_request (xlator_t *this, dict_t *dict, + br_version_t *obuf, br_signature_t *sbuf) +{ + int32_t ret = 0; + size_t size = 0; + br_stub_private_t *priv = NULL; + + priv = this->private; + + /** Prepare ongoing version */ + br_set_default_ongoingversion (obuf, priv->boot); + ret = dict_set_static_bin (dict, BITROT_CURRENT_VERSION_KEY, + (void *)obuf, sizeof (br_version_t)); + if (ret) + return -1; + + /** Prepare signature version */ + br_set_default_signature (sbuf, &size); + return dict_set_static_bin (dict, BITROT_SIGNING_VERSION_KEY, + (void *)sbuf, size); +} + +static inline int +br_stub_prepare_version_request (xlator_t *this, dict_t *dict, + br_version_t *obuf, unsigned long oversion) +{ + br_stub_private_t *priv = NULL; + + priv = this->private; + br_set_ongoingversion (obuf, oversion, priv->boot); + + return dict_set_static_bin (dict, BITROT_CURRENT_VERSION_KEY, + (void *)obuf, sizeof (br_version_t)); +} + +static inline int +br_stub_prepare_signing_request (dict_t *dict, + br_signature_t *sbuf, + unsigned long currentversion, + br_isignature_t *sign, size_t signaturelen) +{ + size_t size = 0; + + br_set_signature (sbuf, sign, currentversion, signaturelen, &size); + + return dict_set_static_bin (dict, BITROT_SIGNING_VERSION_KEY, + (void *)sbuf, size); +} + +/** + * initialize an inode context starting with a given ongoing version. + * a fresh lookup() or a first creat() call initializes the inode + * context, hence the inode is marked dirty. this routine also + * initializes the transient inode version. + */ +static inline int +br_stub_init_inode_versions (xlator_t *this, inode_t *inode, + unsigned long version, gf_boolean_t markdirty) +{ + int32_t ret = 0; + br_stub_inode_ctx_t *ctx = NULL; + + ctx = GF_CALLOC (1, sizeof (br_stub_inode_ctx_t), + gf_br_stub_mt_inode_ctx_t); + if (!ctx) + goto error_return; + + (markdirty) ? __br_stub_mark_inode_dirty (ctx) + : __br_stub_mark_inode_synced (ctx); + ctx->currentversion = version; + ctx->transientversion = version; + + ret = br_stub_set_inode_ctx (this, inode, ctx); + if (ret) + goto free_ctx; + return 0; + + free_ctx: + GF_FREE (ctx); + error_return: + return -1; +} + +static inline int +br_stub_get_ongoing_version (xlator_t *this, + inode_t *inode, unsigned long *version) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = 0; + + LOCK (&inode->lock); + { + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + goto unblock; + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + *version = ctx->currentversion; + } + unblock: + UNLOCK (&inode->lock); + + return ret; +} + +/** + * modify the ongoing version of an inode. + */ +static inline int +br_stub_mod_inode_versions (xlator_t *this, + inode_t *inode, unsigned long version) +{ + int32_t ret = -1; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = 0; + + LOCK (&inode->lock); + { + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + goto unblock; + + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + if (version > ctx->currentversion) { + ctx->currentversion = version; + __br_stub_mark_inode_synced (ctx); + } + } + unblock: + UNLOCK (&inode->lock); + + return ret; +} + +static inline void +br_stub_fill_local (br_stub_local_t *local, + call_stub_t *stub, inode_t *inode, uuid_t gfid, + int versioningtype, unsigned long memversion, int durable) +{ + local->fopstub = stub; + local->versioningtype = versioningtype; + local->u.context.version = memversion; + local->u.context.inode = inode_ref (inode); + uuid_copy (local->u.context.gfid, gfid); + + /* mark inode dirty/fresh according to durability */ + local->u.context.markdirty = (durable) ? _gf_false : _gf_true; +} + +static inline void +br_stub_cleanup_local (br_stub_local_t *local) +{ + local->fopstub = NULL; + local->versioningtype = 0; + local->u.context.version = 0; + inode_unref (local->u.context.inode); + local->u.context.markdirty = _gf_true; + memset (local->u.context.gfid, '\0', sizeof (uuid_t)); +} + +/** + * callback for inode/fd full versioning + */ +int +br_stub_inode_fullversioning_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + inode_t *inode = NULL; + unsigned long version = 0; + gf_boolean_t dirty = _gf_true; + br_stub_local_t *local = NULL; + + local = (br_stub_local_t *)frame->local; + if (op_ret < 0) + goto done; + inode = local->u.context.inode; + version = local->u.context.version; + dirty = local->u.context.markdirty; + + op_ret = br_stub_init_inode_versions (this, inode, version, dirty); + if (op_ret < 0) + op_errno = EINVAL; + + done: + frame->local = NULL; + if (op_ret < 0) + call_unwind_error (local->fopstub, op_ret, op_errno); + else + call_resume (local->fopstub); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + + return 0; +} + +int +br_stub_fd_incversioning_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + inode_t *inode = NULL; + unsigned long version = 0; + br_stub_local_t *local = NULL; + + local = (br_stub_local_t *)frame->local; + if (op_ret < 0) + goto done; + inode = local->u.context.inode; + version = local->u.context.version; + + op_ret = br_stub_mod_inode_versions (this, inode, version); + if (op_ret < 0) + op_errno = EINVAL; + + done: + frame->local = NULL; + if (op_ret < 0) + call_unwind_error (local->fopstub, -1, op_errno); + else + call_resume (local->fopstub); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + + return 0; +} + +/** + * Initial object versioning + * + * Version persists two (2) extended attributes as explained below: + * 1. Current (ongoing) version: This is incremented on an open() + * or creat() and is the running version for an object. + * 2. Signing version: This is the version against which an object + * was signed (checksummed). Alongside this, the _current_ + * ongoing version is persisted. + * + * During initial versioning, both ongoing and signing versions are + * set of one and zero respectively. An open()/creat() call increments + * the ongoing version as an indication of modification to the object. + * Additionally this needs to be persisted on disk and needs to be + * durable: fsync().. :-/ + * As an optimization only the first open()/creat() synchronizes the + * ongoing version to disk, subsequent calls just increment the in- + * memory ongoing version. + * + * c.f. br_stub_open_cbk() / br_stub_create_cbk() + */ + +/** + * perform full or incremental versioning on an inode pointd by an + * fd. incremental versioning is done when an inode is dirty and a + * writeback is trigerred. + */ + +int +br_stub_fd_versioning (xlator_t *this, call_frame_t *frame, call_stub_t *stub, + dict_t *dict, fd_t *fd, br_stub_version_cbk *callback, + unsigned long memversion, int versioningtype, int durable) +{ + int32_t ret = -1; + dict_t *xdata = NULL; + br_stub_local_t *local = NULL; + + if (durable) { + xdata = dict_new (); + if (!xdata) + goto done; + ret = dict_set_int32 (xdata, GLUSTERFS_DURABLE_OP, 0); + if (ret) + goto dealloc_xdata; + } + + local = br_stub_alloc_local (this); + if (!local) { + ret = -1; + goto dealloc_xdata; + } + + br_stub_fill_local (local, stub, fd->inode, fd->inode->gfid, + versioningtype, memversion, durable); + + frame->local = local; + STACK_WIND (frame, + callback, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsetxattr, fd, dict, 0, xdata); + + ret = 0; + + dealloc_xdata: + if (durable) + dict_unref (xdata); + done: + return ret; +} + +static inline int +br_stub_perform_fullversioning (xlator_t *this, call_frame_t *frame, + call_stub_t *stub, fd_t *fd) +{ + int32_t ret = -1; + dict_t *dict = NULL; + br_version_t *obuf = NULL; + br_signature_t *sbuf = NULL; + int op_errno = 0; + + op_errno = ENOMEM; + dict = dict_new (); + if (!dict) + goto done; + ret = br_stub_alloc_versions (&obuf, &sbuf, 0); + if (ret) + goto dealloc_dict; + + op_errno = EINVAL; + ret = br_stub_prepare_default_request (this, dict, obuf, sbuf); + if (ret) + goto dealloc_versions; + + /** + * Version extended attributes need not be durable at this point of + * time. If the objects (inode) data gets persisted on disk but the + * version extended attributes are lost due to a crash/power failure, + * a subsequent lookup marks the objects signature as stale.This way, + * dentry operation times do not shoot up. + */ + ret = br_stub_fd_versioning (this, frame, stub, dict, fd, + br_stub_inode_fullversioning_cbk, + BITROT_DEFAULT_CURRENT_VERSION, + BR_STUB_FULL_VERSIONING, !WRITEBACK_DURABLE); + + dealloc_versions: + br_stub_dealloc_versions (obuf); + dealloc_dict: + dict_unref (dict); + done: + if (ret) + call_unwind_error (stub, -1, op_errno); + return ret; +} + +static inline int +br_stub_perform_incversioning (xlator_t *this, + call_frame_t *frame, call_stub_t *stub, + fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + dict_t *dict = NULL; + inode_t *inode = NULL; + br_version_t *obuf = NULL; + gf_boolean_t writeback = _gf_false; + unsigned long writeback_version = 0; + int op_errno = 0; + + inode = fd->inode; + + LOCK (&inode->lock); + { + ctx->transientversion++; + + if (!__br_stub_is_inode_dirty (ctx)) { + ctx->currentversion = ctx->transientversion; + } else { + writeback = _gf_true; + writeback_version = ctx->transientversion; + } + } + UNLOCK (&inode->lock); + + if (!writeback) { + ret = 0; + goto done; + } + + /* inode requires writeback to disk */ + dict = dict_new (); + if (!dict) + goto done; + ret = br_stub_alloc_versions (&obuf, NULL, 0); + if (ret) + goto dealloc_dict; + ret = br_stub_prepare_version_request (this, dict, + obuf, writeback_version); + if (ret) + goto dealloc_versions; + + ret = br_stub_fd_versioning + (this, frame, stub, dict, + fd, br_stub_fd_incversioning_cbk, writeback_version, + BR_STUB_INCREMENTAL_VERSIONING, WRITEBACK_DURABLE); + + dealloc_versions: + br_stub_dealloc_versions (obuf); + dealloc_dict: + dict_unref (dict); + done: + if (!ret && !writeback) + call_resume (stub); + if (ret) + call_unwind_error (stub, -1, op_errno); + return ret; +} + +/** {{{ */ + +/* fsetxattr() */ + +static inline int +br_stub_prepare_signature (xlator_t *this, dict_t *dict, + inode_t *inode, br_isignature_t *sign) +{ + int32_t ret = 0; + size_t signaturelen = 0; + br_signature_t *sbuf = NULL; + unsigned long version = 0; + + if (!br_is_signature_type_valid (sign->signaturetype)) + goto error_return; + + signaturelen = strlen (sign->signature); + ret = br_stub_alloc_versions (NULL, &sbuf, signaturelen); + if (ret) + goto error_return; + ret = br_stub_get_ongoing_version (this, inode, &version); + if (ret) + goto dealloc_versions; + ret = br_stub_prepare_signing_request (dict, sbuf, + version, sign, signaturelen); + if (ret) + goto dealloc_versions; + return 0; + + dealloc_versions: + br_stub_dealloc_versions (sbuf); + error_return: + return -1; +} + +int +br_stub_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *dict, int flags, dict_t *xdata) +{ + int32_t ret = 0; + br_isignature_t *sign = NULL; + gf_boolean_t xref = _gf_false; + + if (!IA_ISREG (fd->inode->ia_type)) + goto wind; + ret = dict_get_bin (dict, GLUSTERFS_SET_OBJECT_SIGNATURE, + (void **) &sign); + if (ret < 0) + goto wind; + if (frame->root->pid != GF_CLIENT_PID_BITD) + goto unwind; + + ret = br_stub_prepare_signature (this, dict, fd->inode, sign); + if (ret) + goto unwind; + dict_del (dict, GLUSTERFS_SET_OBJECT_SIGNATURE); + + if (!xdata) { + xdata = dict_new (); + if (!xdata) + goto unwind; + } else { + dict_ref (xdata); + } + + xref = _gf_true; + ret = dict_set_int32 (xdata, GLUSTERFS_DURABLE_OP, 0); + if (ret) + goto unwind; + + wind: + STACK_WIND (frame, default_setxattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd, + dict, flags, xdata); + goto done; + + unwind: + STACK_UNWIND_STRICT (setxattr, frame, -1, EINVAL, NULL); + done: + if (xref) + dict_unref (xdata); + return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* {f}getxattr() */ + +int +br_stub_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + int32_t ret = 0; + ssize_t totallen = 0; + ssize_t signaturelen = 0; + br_version_t *obuf = NULL; + br_signature_t *sbuf = NULL; + unsigned long version = 0; + gf_boolean_t xrequested = _gf_false; + br_isignature_out_t *sign = NULL; + + if (frame->local) { + frame->local = NULL; + xrequested = _gf_true; + } + if (op_ret < 0) + goto unwind; + if (!xrequested) + goto unwind; + + op_ret = -1; + op_errno = EINVAL; + + ret = dict_get_bin (xattr, BITROT_CURRENT_VERSION_KEY, (void **)&obuf); + ret |= dict_get_bin (xattr, BITROT_SIGNING_VERSION_KEY, (void **)&sbuf); + if (ret) + goto unwind; + signaturelen = strlen (sbuf->signature); + totallen = signaturelen + sizeof (br_isignature_out_t); + + op_errno = ENOMEM; + sign = GF_CALLOC (1, totallen, gf_br_stub_mt_signature_t); + if (!sign) + goto unwind; + + version = br_stub_get_current_version (obuf, sbuf); + + sign->time[0] = obuf->timebuf[0]; + sign->time[1] = obuf->timebuf[1]; + + /* Object's dirty state */ + sign->stale = (version != sbuf->signedversion) ? 1 : 0; + + /* Object's signature */ + sign->signaturetype = sbuf->signaturetype; + (void) memcpy (sign->signature, sbuf->signature, signaturelen); + + op_errno = EINVAL; + ret = dict_set_bin (xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, + (void *)sign, totallen); + if (ret < 0) + goto unwind; + op_errno = 0; + op_ret = signaturelen; + + unwind: + dict_del (xattr, BITROT_CURRENT_VERSION_KEY); + dict_del (xattr, BITROT_SIGNING_VERSION_KEY); + + STACK_UNWIND (frame, op_ret, op_errno, xattr, xdata); + return 0; +} + +static inline void +br_stub_send_stub_init_time (call_frame_t *frame, xlator_t *this) +{ + int op_ret = 0; + int op_errno = 0; + dict_t *xattr = NULL; + br_stub_init_t stub = {{0,},}; + br_stub_private_t *priv = NULL; + + priv = this->private; + + xattr = dict_new (); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + stub.timebuf[0] = priv->boot[0]; + stub.timebuf[1] = priv->boot[1]; + memcpy (stub.export, priv->export, strlen (priv->export) + 1); + + op_ret = dict_set_static_bin (xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME, + (void *) &stub, sizeof (br_stub_init_t)); + if (op_ret < 0) { + op_errno = EINVAL; + goto unwind; + } + + op_ret = sizeof (br_stub_init_t); + + unwind: + STACK_UNWIND (frame, op_ret, op_errno, xattr, NULL); + + if (xattr) + dict_unref (xattr); +} + +int +br_stub_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + uuid_t rootgfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; + + if (br_stub_is_internal_xattr (name)) + goto wind; + + /** + * this special extended attribute is allowed only on root + */ + if ( name + && (strncmp (name, GLUSTERFS_GET_BR_STUB_INIT_TIME, + strlen (GLUSTERFS_GET_BR_STUB_INIT_TIME)) == 0) + && ( (uuid_compare (loc->gfid, rootgfid) == 0) + || (uuid_compare (loc->inode->gfid, rootgfid) == 0) ) ) { + br_stub_send_stub_init_time (frame, this); + return 0; + } + + if (!IA_ISREG (loc->inode->ia_type)) + goto wind; + + if (name && (strncmp (name, GLUSTERFS_GET_OBJECT_SIGNATURE, + strlen (GLUSTERFS_GET_OBJECT_SIGNATURE)) == 0)) { + frame->local = (void *) 0x1; + } + + wind: + STACK_WIND (frame, br_stub_getxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->getxattr, loc, name, xdata); + return 0; +} + +int +br_stub_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + uuid_t rootgfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; + + if (br_stub_is_internal_xattr (name)) + goto wind; + + /** + * this special extended attribute is allowed only on root + */ + if ( name + && (strncmp (name, GLUSTERFS_GET_BR_STUB_INIT_TIME, + strlen (GLUSTERFS_GET_BR_STUB_INIT_TIME)) == 0) + && (uuid_compare (fd->inode->gfid, rootgfid) == 0) ) { + br_stub_send_stub_init_time (frame, this); + return 0; + } + + if (!IA_ISREG (fd->inode->ia_type)) + goto wind; + + if (name && (strncmp (name, GLUSTERFS_GET_OBJECT_SIGNATURE, + strlen (GLUSTERFS_GET_OBJECT_SIGNATURE)) == 0)) { + frame->local = (void *) 0x1; + } + + wind: + STACK_WIND (frame, br_stub_getxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fgetxattr, fd, name, xdata); + return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* open() */ + +int +br_stub_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + gf_boolean_t vercheck = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + call_stub_t *stub = NULL; + + if (frame->local) { + frame->local = NULL; + vercheck = _gf_true; + } + if (op_ret < 0) + goto unwind; + if (!vercheck) + goto unwind; + + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); + if (ret < 0) + goto unwind; + + stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata); + if (!stub) { + op_ret = -1; + op_errno = EINVAL; + goto unwind; + } + + /** + * Ongoing version needs to be incremented. If the inode is not dirty, + * things are simple: increment the ongoing version safely and be done. + * If inode is dirty, a writeback to disk is required. This is tricky in + * case of multiple open()'s as ongoing version needs to be incremented + * on a successful writeback. It's probably safe to remember the ongoing + * version before writeback and *assigning* it in the callback, but that + * may lead to a trustable checksum to be treated as stale by scrubber + * (the case where the in-memory ongoing version is lesser than the + * on-disk version). To counter this, the ongoing version is modified + * only if the assigned version is greater than the current ongoing + * version. As far as the on-disk version is concerned, it's harmless + * as what's important is the inequality of ongoing and signing version + * (at the time of scrub or after crash recovery). On the other hand, + * in-memory ongoing version should reflect the maximum version amongst + * the instances as _this_ is eventually synced to disk. + */ + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + + unwind: + STACK_UNWIND_STRICT (open, frame, + op_ret, op_errno, fd, xdata); + return 0; +} + +int +br_stub_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) +{ + if (!flags) + goto wind; + frame->local = (void *)0x1; /* _do not_ dereference in ->cbk */ + + wind: + STACK_WIND (frame, br_stub_open_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->open, loc, flags, fd, xdata); + return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* creat() */ + +int +br_stub_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + call_stub_t *stub = NULL; + br_stub_inode_ctx_t *ctx = NULL; + + if (op_ret < 0) + goto unwind; + + stub = fop_create_cbk_stub (frame, NULL, op_ret, op_errno, fd, inode, + stbuf, preparent, postparent, xdata); + if (!stub) { + op_ret = -1; + op_errno = EINVAL; + goto unwind; + } + + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + + /* see comment in br_stub_open_cbk().. */ + return (ctx) + ? br_stub_perform_incversioning (this, frame, stub, fd, ctx) + : br_stub_perform_fullversioning (this, frame, stub, fd); + + unwind: + STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, + fd, inode, stbuf, preparent, postparent, xdata); + return 0; +} + +int +br_stub_create (call_frame_t *frame, + xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + STACK_WIND (frame, br_stub_create_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* lookup() */ + +int +br_stub_inode_fullversioning (xlator_t *this, call_frame_t *frame, + call_stub_t *stub, inode_t *inode, uuid_t gfid) + +{ + int32_t ret = -1; + loc_t loc = {0,}; + dict_t *dict = NULL; + br_version_t *obuf = NULL; + br_signature_t *sbuf = NULL; + br_stub_local_t *local = NULL; + int op_errno = 0; + + op_errno = ENOMEM; + dict = dict_new (); + if (!dict) + goto done; + + op_errno = EINVAL; + ret = br_stub_alloc_versions (&obuf, &sbuf, 0); + if (ret) + goto dealloc_dict; + ret = br_stub_prepare_default_request (this, dict, obuf, sbuf); + if (ret) + goto dealloc_versions; + + op_errno = ENOMEM; + local = br_stub_alloc_local (this); + if (!local) + goto dealloc_versions; + + br_stub_fill_local (local, + stub, inode, gfid, BR_STUB_FULL_VERSIONING, + BITROT_DEFAULT_CURRENT_VERSION, !WRITEBACK_DURABLE); + frame->local = local; + uuid_copy (loc.gfid, gfid); + + STACK_WIND (frame, + br_stub_inode_fullversioning_cbk, FIRST_CHILD (this), + FIRST_CHILD(this)->fops->setxattr, &loc, dict, 0, NULL); + ret = 0; + + dealloc_versions: + br_stub_dealloc_versions (obuf); + dealloc_dict: + dict_unref (dict); + done: + if (ret) + call_unwind_error (stub, -1, op_errno); + return ret; +} + +int +br_stub_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, struct iatt *postparent) +{ + int32_t ret = 0; + call_stub_t *stub = NULL; + br_version_t *obuf = NULL; + br_signature_t *sbuf = NULL; + unsigned long version = 0; + gf_boolean_t xrequested = _gf_false; + + if (frame->local) { + frame->local = NULL; + xrequested = _gf_true; + } + if (op_ret < 0) + goto unwind; + if (!IA_ISREG (stbuf->ia_type)) + goto unwind; + if (!xrequested) + goto unwind; + + /** + * versioning xattrs were requested from POSIX. if available, figure + * out the correct version to use in the inode context. On the other + * hand, try to perform versioning. At this point, versioning xattrs + * need not be durable on disk.. anyhow, scrubbing would skip this + * object on version mismatches or non-existance of xattrs. The + * protocol as of now performs non-durable versioning on lookup(). + * This can be optimized by initializing default versions in-memory + * when on-disk xattrs are missing (or are not fully consistent). + */ + ret = dict_get_bin (xattr, BITROT_CURRENT_VERSION_KEY, (void **)&obuf); + ret |= dict_get_bin (xattr, BITROT_SIGNING_VERSION_KEY, (void **)&sbuf); + + if (ret == 0) { + version = br_stub_get_current_version (obuf, sbuf); + op_ret = br_stub_init_inode_versions (this, inode, + version, _gf_true); + if (op_ret < 0) + op_errno = EINVAL; + } else { + stub = fop_lookup_cbk_stub (frame, NULL, op_ret, op_errno, + inode, stbuf, xattr, postparent); + if (!stub) { + op_ret = -1; + op_errno = EINVAL; + goto unwind; + } + + return br_stub_inode_fullversioning (this, frame, stub, + inode, stbuf->ia_gfid); + } + + unwind: + STACK_UNWIND_STRICT (lookup, frame, + op_ret, op_errno, inode, stbuf, xattr, postparent); + + return 0; +} + +int +br_stub_lookup (call_frame_t *frame, + xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int32_t ret = 0; + int op_errno = 0; + uint64_t ctx_addr = 0; + gf_boolean_t xref = _gf_false; + + ret = br_stub_get_inode_ctx (this, loc->inode, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) + goto wind; + + /** + * fresh lookup: request version keys from POSIX + */ + op_errno = -ENOMEM; + if (!xdata) { + xdata = dict_new (); + if (!xdata) + goto unwind; + } else { + xdata = dict_ref (xdata); + } + + xref = _gf_true; + + op_errno = -EINVAL; + ret = dict_set_uint32 (xdata, BITROT_CURRENT_VERSION_KEY, 0); + if (ret) + goto unwind; + ret = dict_set_uint32 (xdata, BITROT_SIGNING_VERSION_KEY, 0); + if (ret) + goto unwind; + frame->local = (void *) 0x1; /* _do not_ dereference in ->cbk */ + + wind: + STACK_WIND (frame, + br_stub_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + goto dealloc_dict; + + unwind: + STACK_UNWIND_STRICT (lookup, frame, + -1, op_errno, NULL, NULL, NULL, NULL); + dealloc_dict: + if (xref) + dict_unref (xdata); + return 0; +} + +/** }}} */ + +/** {{{ */ + +int32_t +br_stub_flush_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, dict_t *xdata) +{ + int32_t ret = -1; + fd_t *fd = NULL; + uint64_t ctx_addr = 0; + inode_t *inode = NULL; + br_stub_inode_ctx_t *ctx = NULL; + unsigned long releaseversion = 0; + + fd = (fd_t *)frame->local; + frame->local = NULL; + + if (op_ret < 0) + goto unwind; + inode = fd->inode; + + LOCK (&inode->lock); + { + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + goto unblock; + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + releaseversion = htonl (ctx->currentversion); + } + unblock: + UNLOCK (&inode->lock); + + if (!ctx) + goto unwind; + if (!xdata) { + xdata = dict_new (); + if (!xdata) { + ret = -1; + goto unwind; + } + } else { + xdata = dict_ref (xdata); + } + + /** + * TODO: refresh versions if the ongoingversion was modified since the + * last time a verison refresh was sent. + */ + ret = dict_set_uint64 + (xdata, GLUSTERFS_VERSION_XCHG_KEY, (uint64_t) releaseversion); + + unwind: + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Could not fetch current version from inode context. " + "Signature would most probably be invalid [Object: %s]", + uuid_utoa (inode->gfid)); + + STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata); + + if (xdata) + dict_unref (xdata); + + fd_unref (fd); /* NOTE: this would not trigger a release() */ + + return 0; +} + +/** + * Why is flush needed? + * + * Object's are candidates for signing upon release(). Object's signature + * is _tied_ to a particular version. This allows us to validate an object's + * signature by comparing the signed version with the current (ongoing) + * version. This exposes a hairy race between release() trying to fetch + * the version against which an object is deemed to be signed and open() + * trying to increment the ongoing version. If open() races ahead, release() + * would pick the version that currently has an open file desciptor (and + * possbily ongoing IO operations), which the bitrot daemon would happily + * sign. The signature would most probably be incorrect but _trustable_ + * as the signed version matches ongoing version. + * + * Therefore, the version to be signed against is maintained separately, + * which is _refreshed_ on every flush. Since a flush() is guaranteed + * before a release, this version closesly reflects the ongoing version. + * Another key point is that an object is signed after all file descriptors + * open()'d on it is close()'ed. This along with maintaining a side-copy of + * the ongoing version (release version) takes care of correct _attachment_ + * of a signature against it's version. + * + * Furthermore, the version needs to be _handed over_ to changelog translator, + * for client notification. As of now, this is done by using xdata (which is + * kind of ugly as that forces bit-rot stub to be places after changelog in + * the graph). This should probably be cleaned up (or even taken away if + * the object signer is made a part of the server graph itself). + */ +int32_t +br_stub_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + frame->local = fd_ref (fd); + + STACK_WIND (frame, br_stub_flush_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->flush, fd, xdata); + return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* forget() */ + +int +br_stub_forget (xlator_t *this, inode_t *inode) +{ + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + inode_ctx_del (inode, this, &ctx_addr); + if (!ctx_addr) + return 0; + + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + GF_FREE (ctx); + + return 0; +} + +/** }}} */ + + +struct xlator_fops fops = { + .lookup = br_stub_lookup, + .open = br_stub_open, + .flush = br_stub_flush, + .create = br_stub_create, + .getxattr = br_stub_getxattr, + .fgetxattr = br_stub_fgetxattr, + .fsetxattr = br_stub_fsetxattr, +}; + +struct xlator_cbks cbks = { + .forget = br_stub_forget, +}; + +struct volume_options options[] = { + { .key = {"bitrot"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "enable/disable bitrot stub" + }, + { .key = {"export"}, + .type = GF_OPTION_TYPE_PATH, + .description = "brick path for versioning" + }, + { .key = {NULL} }, +}; diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h new file mode 100644 index 00000000000..fd20fcc5f7a --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h @@ -0,0 +1,127 @@ + /* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __BIT_ROT_STUB_H__ +#define __BIT_ROT_STUB_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "defaults.h" +#include "call-stub.h" + +#include "bit-rot-common.h" + +typedef int (br_stub_version_cbk) (call_frame_t *, void *, + xlator_t *, int32_t, int32_t, dict_t *); + +typedef struct br_stub_inode_ctx { + int need_writeback; + unsigned long currentversion; + unsigned long transientversion; +} br_stub_inode_ctx_t; + +#define I_DIRTY (1<<0) /* inode needs writeback */ +#define WRITEBACK_DURABLE 1 /* writeback is durable */ + +/** + * This could just have been a plain struct without unions and all, + * but we may need additional things in the future. + */ +typedef struct br_stub_local { + call_stub_t *fopstub; /* stub for original fop */ + + int versioningtype; /* not much used atm */ + + union { + struct br_stub_ctx { + uuid_t gfid; + inode_t *inode; + unsigned long version; + gf_boolean_t markdirty; + } context; + } u; +} br_stub_local_t; + +#define BR_STUB_FULL_VERSIONING 1<<0 +#define BR_STUB_INCREMENTAL_VERSIONING 1<<1 + +typedef struct br_stub_private { + gf_boolean_t go; + + uint32_t boot[2]; + char export[PATH_MAX]; + + struct mem_pool *local_pool; +} br_stub_private_t; + +/* inode writeback helpers */ +static inline void +__br_stub_mark_inode_dirty (br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback |= I_DIRTY; +} + +static inline void +__br_stub_mark_inode_synced (br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback &= ~I_DIRTY; +} + +static inline int +__br_stub_is_inode_dirty (br_stub_inode_ctx_t *ctx) +{ + return (ctx->need_writeback & I_DIRTY); +} + +/* get/set inode context helpers */ + +static inline int +br_stub_get_inode_ctx (xlator_t *this, + inode_t *inode, uint64_t *ctx) +{ + return inode_ctx_get (inode, this, ctx); +} + +static inline int +br_stub_set_inode_ctx (xlator_t *this, + inode_t *inode, br_stub_inode_ctx_t *ctx) +{ + uint64_t ctx_addr = (uint64_t) ctx; + return inode_ctx_set (inode, this, &ctx_addr); +} + +static inline unsigned long +br_stub_get_current_version (br_version_t *obuf, br_signature_t *sbuf) +{ + if (obuf->ongoingversion > sbuf->currentversion) + return obuf->ongoingversion; + return sbuf->currentversion; +} + +/* filter for xattr fetch */ +static inline int +br_stub_is_internal_xattr (const char *name) +{ + if ( name + && ( (strncmp (name, BITROT_CURRENT_VERSION_KEY, + strlen (BITROT_CURRENT_VERSION_KEY)) == 0) + || (strncmp (name, BITROT_SIGNING_VERSION_KEY, + strlen (BITROT_SIGNING_VERSION_KEY)) == 0) ) ) + return 1; + return 0; +} + +#endif /* __BIT_ROT_STUB_H__ */ diff --git a/xlators/features/changelog/lib/src/changelog.h b/xlators/features/changelog/lib/src/changelog.h index 9dcb7634f51..f596191daee 100644 --- a/xlators/features/changelog/lib/src/changelog.h +++ b/xlators/features/changelog/lib/src/changelog.h @@ -38,6 +38,7 @@ struct ev_creat { struct ev_release { mode_t mode; int32_t ordflags; + unsigned long opaque; unsigned char gfid[16]; }; diff --git a/xlators/features/changelog/src/changelog-helpers.c b/xlators/features/changelog/src/changelog-helpers.c index ccc121c473d..a751b1b637c 100644 --- a/xlators/features/changelog/src/changelog-helpers.c +++ b/xlators/features/changelog/src/changelog-helpers.c @@ -1158,7 +1158,8 @@ changelog_inode_ctx_get (xlator_t *this, } int32_t -changelog_inode_mod_ctx_flags (xlator_t *this, inode_t *inode, int32_t flags) +changelog_inode_mod_releasectx_flags (xlator_t *this, + inode_t *inode, int32_t flags) { int32_t ret = -1; changelog_inode_ctx_t *ctx = NULL; @@ -1169,7 +1170,7 @@ changelog_inode_mod_ctx_flags (xlator_t *this, inode_t *inode, int32_t flags) if (!ctx) goto unblock; ret = 0; - ctx->ordflags |= flags; + ctx->releasectx.ordflags |= flags; } unblock: UNLOCK (&inode->lock); @@ -1177,6 +1178,28 @@ changelog_inode_mod_ctx_flags (xlator_t *this, inode_t *inode, int32_t flags) return ret; } +int32_t +changelog_inode_mod_releasectx_version (xlator_t *this, + inode_t *inode, unsigned long rversion) +{ + int32_t ret = -1; + changelog_inode_ctx_t *ctx = NULL; + + LOCK (&inode->lock); + { + ctx = __changelog_inode_ctx_get (this, inode, NULL, NULL, 0); + if (!ctx) + goto unblock; + ret = 0; + ctx->releasectx.releaseversion = rversion; + } + unblock: + UNLOCK (&inode->lock); + + return ret; + +} + /** * This is the main update routine. Locking has been made granular so as to * maximize parallelism of fops - I'll try to explain it below using execution diff --git a/xlators/features/changelog/src/changelog-helpers.h b/xlators/features/changelog/src/changelog-helpers.h index f44a06f426a..6511b230ba1 100644 --- a/xlators/features/changelog/src/changelog-helpers.h +++ b/xlators/features/changelog/src/changelog-helpers.h @@ -327,7 +327,11 @@ typedef struct changelog_local changelog_local_t; /* inode version is stored in inode ctx */ typedef struct changelog_inode_ctx { - int32_t ordflags; + struct releasectx { + int32_t ordflags; + unsigned long releaseversion; /* needed by bitrot */ + } releasectx; + unsigned long iversion[CHANGELOG_MAX_TYPE]; } changelog_inode_ctx_t; @@ -494,7 +498,10 @@ inline void changelog_dispatch_event (xlator_t *, changelog_priv_t *, changelog_event_t *); int32_t -changelog_inode_mod_ctx_flags (xlator_t *, inode_t *, int32_t ); +changelog_inode_mod_releasectx_flags (xlator_t *, inode_t *, int32_t ); + +int32_t +changelog_inode_mod_releasectx_version (xlator_t *, inode_t *, unsigned long); changelog_inode_ctx_t * __changelog_inode_ctx_get (xlator_t *, inode_t *, unsigned long **, diff --git a/xlators/features/changelog/src/changelog.c b/xlators/features/changelog/src/changelog.c index ad1b78d7c65..d83a8049ab2 100644 --- a/xlators/features/changelog/src/changelog.c +++ b/xlators/features/changelog/src/changelog.c @@ -43,7 +43,7 @@ changelog_save_release_flags (xlator_t *this, fd_t *fd) { int32_t ret = 0; - ret = changelog_inode_mod_ctx_flags (this, fd->inode, fd->flags); + ret = changelog_inode_mod_releasectx_flags (this, fd->inode, fd->flags); if (ret) { gf_log (this->name, GF_LOG_WARNING, "could not set open flag in inode context"); @@ -1719,6 +1719,67 @@ changelog_open (call_frame_t *frame, xlator_t *this, return 0; } +/* }}} */ + +/* {{{ */ + +int32_t +changelog_flush_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, dict_t *xdata) +{ + int32_t ret = 0; + fd_t *fd = NULL; + unsigned long xchgversion = 0; + changelog_priv_t *priv = NULL; + + priv = this->private; + + if (frame->local) { + fd = (fd_t *) frame->local; + frame->local = NULL; + } + + CHANGELOG_COND_GOTO (priv, (op_ret < 0), unwind); + + if (xdata && fd) { + ret = dict_get_uint64 (xdata, GLUSTERFS_VERSION_XCHG_KEY, + (uint64_t *) &xchgversion); + if (ret < 0) + goto unwind; + ret = changelog_inode_mod_releasectx_version + (this, fd->inode, xchgversion); + } + + unwind: + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "could not extract version exchange value from dict"); + + CHANGELOG_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata); + + if (fd) + fd_unref (fd); + return 0; +} + +int32_t +changelog_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + if (!priv->active) + goto wind; + + frame->local = fd_ref (fd); + + wind: + STACK_WIND (frame, changelog_flush_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->flush, fd, xdata); + return 0; +} + + /* }}} */ @@ -1727,6 +1788,7 @@ changelog_open (call_frame_t *frame, xlator_t *this, int32_t changelog_release (xlator_t *this, fd_t *fd) { int32_t flags = -1; + unsigned long rversion = 0; ia_prot_t prot = {0,}; inode_t *inode = NULL; changelog_priv_t *priv = NULL; @@ -1743,8 +1805,10 @@ int32_t changelog_release (xlator_t *this, fd_t *fd) ctx = __changelog_inode_ctx_get (this, inode, NULL, NULL, 0); if (!ctx) goto unblock; - flags = ctx->ordflags; - ctx->ordflags = 0; + flags = ctx->releasectx.ordflags; + rversion = ctx->releasectx.releaseversion; + ctx->releasectx.ordflags = 0; + ctx->releasectx.releaseversion = 0; } unblock: UNLOCK (&inode->lock); @@ -1754,6 +1818,7 @@ int32_t changelog_release (xlator_t *this, fd_t *fd) ev.u.release.mode = st_mode_from_ia (prot, fd->inode->ia_type); ev.u.release.ordflags = flags; + ev.u.release.opaque = rversion; uuid_copy (ev.u.release.gfid, fd->inode->gfid); changelog_dispatch_event (this, priv, &ev); @@ -1766,6 +1831,7 @@ int32_t changelog_release (xlator_t *this, fd_t *fd) /* }}} */ + /** * The * - @init () @@ -2638,6 +2704,7 @@ struct xlator_fops fops = { .open = changelog_open, .mknod = changelog_mknod, .mkdir = changelog_mkdir, + .flush = changelog_flush, .create = changelog_create, .symlink = changelog_symlink, .writev = changelog_writev, diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 0562af9364b..ec915dbaa8a 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -1619,6 +1619,14 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, } #endif + xl = volgen_graph_add (graph, "features/bitrot-stub", volname); + if (!xl) + return -1; + + ret = xlator_set_option (xl, "export", path); + if (ret) + return -1; + xl = volgen_graph_add (graph, "features/changelog", volname); if (!xl) return -1; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index e9473658176..dadd9503dcb 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1655,6 +1655,12 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_7_0, .validate_fn = validate_disperse_heal_enable_disable }, + { .key = "features.bitrot", + .voltype = "features/bitrot-stub", + .value = "disable", + .op_version = GD_OP_VERSION_3_7_0, + .type = NO_DOC, + }, { .key = NULL } }; diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index 2920f318709..cd0ae19c541 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -1776,3 +1776,73 @@ posix_fsyncer (void *d) } } } + +/** + * fetch on-disk ongoing version and object signature extended + * attribute. + */ +int32_t +posix_get_objectsignature (char *real_path, dict_t *xattr) +{ + int32_t op_ret = 0; + char *memptr = NULL; + ssize_t xattrsize = 0; + ssize_t allocsize = 0; + + op_ret = -EINVAL; + xattrsize = sys_lgetxattr (real_path, + BITROT_CURRENT_VERSION_KEY, NULL, 0); + if (xattrsize == -1) + goto error_return; + allocsize += xattrsize; + + xattrsize = sys_lgetxattr (real_path, + BITROT_SIGNING_VERSION_KEY, NULL, 0); + if (xattrsize == -1) + goto error_return; + allocsize += xattrsize; + + op_ret = -ENOMEM; + /* bulk alloc */ + memptr = GF_CALLOC (allocsize + 2, sizeof (char), gf_posix_mt_char); + if (!memptr) + goto error_return; + + op_ret = sys_lgetxattr (real_path, BITROT_CURRENT_VERSION_KEY, + memptr, allocsize - xattrsize); + if (op_ret == -1) { + op_ret = -errno; + goto dealloc_mem; + } + + xattrsize = op_ret; /* save for correct _in_ memory pointing */ + + op_ret = sys_lgetxattr (real_path, BITROT_SIGNING_VERSION_KEY, + (memptr + op_ret + 1), allocsize - op_ret); + if (op_ret == -1) { + op_ret = -errno; + goto dealloc_mem; + } + + /* this is a dynamic set */ + op_ret = dict_set_dynptr (xattr, BITROT_CURRENT_VERSION_KEY, + memptr, allocsize); + if (op_ret < 0) + goto dealloc_mem; + + /* rest all should be static */ + op_ret = dict_set_static_ptr (xattr, BITROT_SIGNING_VERSION_KEY, + memptr + xattrsize + 1); + if (op_ret < 0) + goto delkey; + + return allocsize; + + delkey: + dict_del (xattr, BITROT_CURRENT_VERSION_KEY); + dealloc_mem: + GF_FREE (memptr); + error_return: + return op_ret; + +} diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index ea9cb3def62..9f64f66bfc0 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -3797,6 +3797,18 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, goto done; } + if ( loc->inode && name + && (strncmp (name, GLUSTERFS_GET_OBJECT_SIGNATURE, + strlen (GLUSTERFS_GET_OBJECT_SIGNATURE)) == 0) ) { + op_ret = posix_get_objectsignature (real_path, dict); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + } + + goto done; + } + if (name) { strcpy (keybuffer, name); char *key = keybuffer; @@ -4231,6 +4243,17 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this, op_ret = -1; } + if (!ret && xdata && dict_get (xdata, GLUSTERFS_DURABLE_OP)) { + op_ret = fsync (_fd); + if (op_ret < 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "could not satisfy durability request: " + "reason (%s)", strerror (errno)); + } + } + out: SET_TO_OLD_FS_ID (); diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 870315d0ea2..c6cea4d4c85 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -236,4 +236,7 @@ posix_get_ancestry (xlator_t *this, inode_t *leaf_inode, void posix_gfid_unset (xlator_t *this, dict_t *xdata); +int32_t +posix_get_objectsignature (char *, dict_t *); + #endif /* _POSIX_H */ From d3d623f35eafe1a5d26e75291d0ab584a88b0175 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 7 Jan 2015 11:14:31 +0530 Subject: [PATCH 6/6] mgmt/glusterd: generate volfile for BitD * Implement the skeleton of bit-rot xlator Change-Id: If33218bdc694f5f09cb7b8097c4fdb74d7a23b2d Original-Author: Raghavendra Bhat Signed-off-by: Venky Shankar Signed-off-by: Gaurav Kumar Garg Signed-off-by: Anand nekkunti --- xlators/features/bit-rot/src/Makefile.am | 17 ++ .../features/bit-rot/src/bit-rot-mem-types.h | 24 ++ xlators/features/bit-rot/src/bit-rot.c | 89 +++++++ xlators/features/bit-rot/src/bit-rot.h | 33 +++ xlators/mgmt/glusterd/src/glusterd-bitd-svc.c | 117 ++++++++ xlators/mgmt/glusterd/src/glusterd-bitd-svc.h | 47 ++++ .../mgmt/glusterd/src/glusterd-handshake.c | 3 +- xlators/mgmt/glusterd/src/glusterd-store.c | 250 ++++++++++++++++++ .../mgmt/glusterd/src/glusterd-svc-helper.c | 12 + xlators/mgmt/glusterd/src/glusterd-utils.c | 48 ++++ xlators/mgmt/glusterd/src/glusterd-utils.h | 44 +++ xlators/mgmt/glusterd/src/glusterd-volgen.c | 104 ++++++++ xlators/mgmt/glusterd/src/glusterd-volgen.h | 6 + xlators/mgmt/glusterd/src/glusterd.c | 11 + xlators/mgmt/glusterd/src/glusterd.h | 6 + 15 files changed, 809 insertions(+), 2 deletions(-) create mode 100644 xlators/features/bit-rot/src/bit-rot-mem-types.h create mode 100644 xlators/features/bit-rot/src/bit-rot.c create mode 100644 xlators/features/bit-rot/src/bit-rot.h create mode 100644 xlators/mgmt/glusterd/src/glusterd-bitd-svc.c create mode 100644 xlators/mgmt/glusterd/src/glusterd-bitd-svc.h diff --git a/xlators/features/bit-rot/src/Makefile.am b/xlators/features/bit-rot/src/Makefile.am index 7581732c7d9..1f59a71ebea 100644 --- a/xlators/features/bit-rot/src/Makefile.am +++ b/xlators/features/bit-rot/src/Makefile.am @@ -1 +1,18 @@ + SUBDIRS = stub + +xlator_LTLIBRARIES = bit-rot.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +bit_rot_la_LDFLAGS = -module -avoid-version + +bit_rot_la_SOURCES = bit-rot.c +bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = bit-rot.h bit-rot-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/bit-rot/src/bit-rot-mem-types.h b/xlators/features/bit-rot/src/bit-rot-mem-types.h new file mode 100644 index 00000000000..19c2aca0f8a --- /dev/null +++ b/xlators/features/bit-rot/src/bit-rot-mem-types.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2014 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _BR_MEM_TYPES_H +#define _BR_MEM_TYPES_H + +#include "mem-types.h" + +enum br_mem_types { + gf_br_mt_br_private_t = gf_common_mt_end + 1, + gf_br_mt_br_local_t, + gf_br_mt_br_inode_t, + gf_br_mt_br_fd_t, + gf_br_mt_end +}; + +#endif diff --git a/xlators/features/bit-rot/src/bit-rot.c b/xlators/features/bit-rot/src/bit-rot.c new file mode 100644 index 00000000000..0ba8b80825b --- /dev/null +++ b/xlators/features/bit-rot/src/bit-rot.c @@ -0,0 +1,89 @@ +/* + Copyright (c) 2014 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include +#include + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "logging.h" + +#include "bit-rot.h" +#include "bit-rot-mem-types.h" + +int32_t +mem_acct_init (xlator_t *this) +{ + int32_t ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_br_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, "Memory accounting" + " init failed"); + return ret; + } + + return ret; +} + +int32_t +init (xlator_t *this) +{ + br_private_t *priv = NULL; + int32_t ret = -1; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: no children"); + goto out; + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_br_mt_br_private_t); + if (!priv) + goto out; + + this->private = priv; + + ret = 0; + +out: + gf_log (this->name, GF_LOG_DEBUG, "bit-rot xlator loaded"); + return ret; +} + +void +fini (xlator_t *this) +{ + br_private_t *priv = this->private; + + if (!priv) + return; + this->private = NULL; + GF_FREE (priv); + + return; +} + +struct xlator_fops fops; + +struct xlator_cbks cbks; + +struct volume_options options[] = { + { .key = {NULL} }, +}; diff --git a/xlators/features/bit-rot/src/bit-rot.h b/xlators/features/bit-rot/src/bit-rot.h new file mode 100644 index 00000000000..b275c0e9535 --- /dev/null +++ b/xlators/features/bit-rot/src/bit-rot.h @@ -0,0 +1,33 @@ + /* + Copyright (c) 2014 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __BIT_ROT_H__ +#define __BIT_ROT_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "defaults.h" +#include "bit-rot-mem-types.h" +#include "syncop.h" + +struct br_private { + xlator_t *xl; + gf_lock_t lock; +}; + +typedef struct br_private br_private_t; + +#endif /* __BIR_ROT_H__ */ diff --git a/xlators/mgmt/glusterd/src/glusterd-bitd-svc.c b/xlators/mgmt/glusterd/src/glusterd-bitd-svc.c new file mode 100644 index 00000000000..7e2f81a093c --- /dev/null +++ b/xlators/mgmt/glusterd/src/glusterd-bitd-svc.c @@ -0,0 +1,117 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "globals.h" +#include "run.h" +#include "glusterd.h" +#include "glusterd-utils.h" +#include "glusterd-volgen.h" +#include "glusterd-bitdsvc.h" + +char *bitd_svc_name = "bitd"; + +int +glusterd_bitdsvc_init (glusterd_svc_t *svc) +{ + return glusterd_svc_init (svc, bitd_svc_name, + glusterd_bitdsvc_manager, + glusterd_bitdsvc_start, + glusterd_bitdsvc_stop); +} + +static int +glusterd_bitdsvc_create_volfile () +{ + char filepath[PATH_MAX] = {0,}; + int ret = -1; + glusterd_conf_t *conf = NULL; + + conf = THIS->private; + GF_ASSERT (conf); + + + glusterd_svc_build_volfile_path (bitd_svc_name, conf->workdir, + filepath, sizeof (filepath)); + + ret = glusterd_create_global_volfile (build_bitd_graph, + filepath, NULL); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Failed to create volfile"); + goto out; + } +out: + gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); + + return ret; +} + +int +glusterd_bitdsvc_manager (glusterd_svc_t *svc, void *data, int flags) +{ + int ret = -1; + + if (glusterd_are_all_volumes_stopped ()) { + ret = svc->stop (svc, SIGKILL); + } else { + ret = glusterd_bitdsvc_create_volfile (); + if (ret) + goto out; + + ret = svc->stop (svc, SIGKILL); + if (ret) + goto out; + + ret = svc->start (svc, flags); + if (ret) + goto out; + + ret = glusterd_conn_connect (&(svc->conn)); + if (ret) + goto out; + } +out: + gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); + + return ret; +} + +int +glusterd_bitdsvc_start (glusterd_svc_t *svc, int flags) +{ + return glusterd_svc_start (svc, flags, NULL); +} + +int +glusterd_bitd_stop (glusterd_svc_t *svc, int sig) +{ + return glusterd_svc_stop (svc, sig); +} + +int +glusterd_bitdsvc_reconfigure () +{ + return glusterd_svc_reconfigure (glusterd_bitdsvc_create_volfile); +} + +void +glusterd_svc_build_bitd_volfile (glusterd_volinfo_t *volinfo, + char *path, int path_len) +{ + char workdir[PATH_MAX] = {0,}; + glusterd_conf_t *priv = NULL; + + priv = THIS->private; + GF_ASSERT (priv); + + GLUSTERD_GET_VOLUME_DIR (workdir, volinfo, priv); + + snprintf (path, path_len, "%s/%s-bitd.vol", workdir, + volinfo->volname); +} diff --git a/xlators/mgmt/glusterd/src/glusterd-bitd-svc.h b/xlators/mgmt/glusterd/src/glusterd-bitd-svc.h new file mode 100644 index 00000000000..0844e129969 --- /dev/null +++ b/xlators/mgmt/glusterd/src/glusterd-bitd-svc.h @@ -0,0 +1,47 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _GLUSTERD_BITD_SVC_H_ +#define _GLUSTERD_BITD_SVC_H_ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterd-svc-mgmt.h" + +typedef struct glusterd_bitdsvc_ glusterd_bitdsvc_t; + +struct glusterd_bitdsvc_{ + glusterd_svc_t svc; + gf_store_handle_t *handle; +}; + +int +glusterd_bitdsvc_init (glusterd_svc_t *svc); + +int +glusterd_bitdsvc_manager (glusterd_svc_t *svc, void *data, int flags); + +int +glusterd_bitdsvc_start (glusterd_svc_t *svc, int flags); + +int +glusterd_bitdsvc_stop (glusterd_svc_t *svc, int sig); + +int +glusterd_bitdsvc_reconfigure (); + +void +glusterd_bitdsvc_build_volfile_path (char *server, char *workdir, + char *volfile, size_t len); + +#endif diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c index 4f19d00a3d7..7e7f6523d36 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handshake.c +++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c @@ -26,6 +26,7 @@ #include "glusterd-svc-mgmt.h" #include "glusterd-snapd-svc-helper.h" #include "glusterd-quotad-svc.h" +#include "glusterd-bitd-svc.h" #include "glusterfs3.h" #include "protocol-common.h" @@ -195,8 +196,6 @@ build_volfile_path (char *volume_id, char *path, goto out; } - - volid_ptr = strstr (volume_id, "gluster/"); if (volid_ptr) { volid_ptr = strchr (volid_ptr, '/'); if (!volid_ptr) { diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index 70e3536af85..dd61f84e102 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -150,6 +150,31 @@ glusterd_store_snapd_path_set (glusterd_volinfo_t *volinfo, snprintf (snapd_path, len, "%s/snapd.info", volpath); } +/* + * As of now, bitd is per node, per volume. But in future if + * bitd is made per node only (i.e. one bitd on a peer will + * take care of all the bricks from all the volumes on that node), + * then the below things such as the location of the info file for + * the bitd has to change. + */ +static void +glusterd_store_bitd_path_set (glusterd_volinfo_t *volinfo, + char *bitd_path, size_t len) +{ + char volpath[PATH_MAX] = {0, }; + glusterd_conf_t *priv = NULL; + + GF_ASSERT (volinfo); + GF_ASSERT (len >= PATH_MAX); + + priv = THIS->private; + GF_ASSERT (priv); + + GLUSTERD_GET_VOLUME_DIR (volpath, volinfo, priv); + + snprintf (bitd_path, len, "%s/bitd.info", volpath); +} + gf_boolean_t glusterd_store_is_valid_brickpath (char *volname, char *brick) { @@ -283,6 +308,28 @@ glusterd_store_create_snapd_shandle_on_absence (glusterd_volinfo_t *volinfo) return ret; } +/* + * As of now, bitd is per node, per volume. But in future if + * bitd is made per node only (i.e. one bitd on a peer will + * take care of all the bricks from all the volumes on that node), + * then the below things such as the location of the info file for + * the bitd has to change. + */ +int32_t +glusterd_store_create_bitd_shandle_on_absence (glusterd_volinfo_t *volinfo) +{ + char bitd_path[PATH_MAX] = {0,}; + int32_t ret = 0; + + GF_ASSERT (volinfo); + + glusterd_store_bitd_path_set (volinfo, bitd_path, + sizeof (bitd_path)); + ret = gf_store_handle_create_on_absence (&volinfo->bitd.handle, + bitd_path); + return ret; +} + /* Store the bricks snapshot details only if required * * The snapshot details will be stored only if the cluster op-version is @@ -492,6 +539,49 @@ glusterd_store_perform_snapd_store (glusterd_volinfo_t *volinfo) return ret; } +/* + * As of now, bitd is per node, per volume. But in future if + * bitd is made per node only (i.e. one bitd on a peer will + * take care of all the bricks from all the volumes on that node), + * then the below things such as the location of the info file for + * the bitd has to change. + */ +int32_t +glusterd_store_perform_bitd_store (glusterd_volinfo_t *volinfo) +{ + int fd = -1; + int32_t ret = -1; + xlator_t *this = NULL; + + GF_ASSERT (volinfo); + + this = THIS; + GF_ASSERT (this); + + fd = gf_store_mkstemp (volinfo->bitd.handle); + if (fd <= 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to create the " + "temporary file for the bitd store handle of volume " + "%s", volinfo->volname); + goto out; + } + + /* ret = glusterd_store_snapd_write (fd, volinfo); */ + /* if (ret) { */ + /* gf_log (this->name, GF_LOG_ERROR, "failed to write snapd port " */ + /* "info to store handle (volume: %s", volinfo->volname); */ + /* goto out; */ + /* } */ + + ret = gf_store_rename_tmppath (volinfo->bitd.handle); + +out: + if (ret && (fd > 0)) + gf_store_unlink_tmppath (volinfo->bitd.handle); + gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + int32_t glusterd_store_brickinfo (glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *brickinfo, int32_t brick_count, @@ -553,6 +643,44 @@ glusterd_store_snapd_info (glusterd_volinfo_t *volinfo) return ret; } +/* + * As of now, bitd is per node, per volume. But in future if + * bitd is made per node only (i.e. one bitd on a peer will + * take care of all the bricks from all the volumes on that node), + * then the below things such as the location of the info file for + * the bitd has to change. + */ +int32_t +glusterd_store_bitd_info (glusterd_volinfo_t *volinfo) +{ + int32_t ret = -1; + xlator_t *this = NULL; + + GF_ASSERT (volinfo); + + this = THIS; + GF_ASSERT (this); + + ret = glusterd_store_create_bitd_shandle_on_absence (volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to create store " + "handle for snapd (volume: %s)", volinfo->volname); + goto out; + } + + ret = glusterd_store_perform_bitd_store (volinfo); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "failed to store snapd info " + "of the volume %s", volinfo->volname); + +out: + if (ret) + gf_store_unlink_tmppath (volinfo->bitd.handle); + + gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret); + return ret; +} + int32_t glusterd_store_delete_brick (glusterd_brickinfo_t *brickinfo, char *delete_path) { @@ -1495,6 +1623,15 @@ glusterd_store_volume_cleanup_tmp (glusterd_volinfo_t *volinfo) gf_store_unlink_tmppath (volinfo->node_state_shandle); gf_store_unlink_tmppath (volinfo->snapd.handle); + + /* + * As of now, bitd is per node, per volume. But in future if + * bitd is made per node only (i.e. one bitd on a peer will + * take care of all the bricks from all the volumes on that node), + * then the below things such as the location of the info file for + * the bitd has to change. + */ + gf_store_unlink_tmppath (volinfo->bitd.handle); } int32_t @@ -1644,6 +1781,17 @@ glusterd_store_volinfo (glusterd_volinfo_t *volinfo, glusterd_volinfo_ver_ac_t a if (ret) goto out; + /* + * As of now, bitd is per node, per volume. But in future if + * bitd is made per node only (i.e. one bitd on a peer will + * take care of all the bricks from all the volumes on that node), + * then the below things such as the location of the info file for + * the bitd has to change. + */ + ret = glusterd_store_bitd_info (volinfo); + if (ret) + goto out; + //checksum should be computed at the end ret = glusterd_compute_cksum (volinfo, _gf_false); if (ret) @@ -2209,6 +2357,104 @@ glusterd_store_retrieve_snapd (glusterd_volinfo_t *volinfo) return ret; } +/* + * As of now, bitd is per node, per volume. But in future if + * bitd is made per node only (i.e. one bitd on a peer will + * take care of all the bricks from all the volumes on that node), + * then the below things such as the location of the info file for + * the bitd has to change. + */ +int +glusterd_store_retrieve_bitd (glusterd_volinfo_t *volinfo) +{ + int ret = -1; + int exists = 0; + char *key = NULL; + char *value = NULL; + char volpath[PATH_MAX] = {0,}; + char path[PATH_MAX] = {0,}; + xlator_t *this = NULL; + glusterd_conf_t *conf = NULL; + gf_store_iter_t *iter = NULL; + gf_store_op_errno_t op_errno = GD_STORE_SUCCESS; + + this = THIS; + GF_ASSERT (this); + conf = THIS->private; + GF_ASSERT (volinfo); + + if (conf->op_version < GD_OP_VERSION_3_6_0) { + ret = 0; + goto out; + } + + /* + * This is needed for upgrade situations. Say a volume is created with + * older version of glusterfs and upgraded to a glusterfs version equal + * to or greater than GD_OP_VERSION_3_7_0. The older glusterd would not + * have created the snapd.info file related to snapshot daemon for user + * serviceable snapshots. So as part of upgrade when the new glusterd + * starts, as part of restore (restoring the volume to be precise), it + * tries to snapd related info from snapd.info file. But since there was + * no such file till now, the restore operation fails. Thus, to prevent + * it from happening check whether user serviceable snapshots features + * is enabled before restoring snapd. If its disbaled, then simply + * exit by returning success (without even checking for the snapd.info). + */ + + /* if (!dict_get_str_boolean (volinfo->dict, "features.bit-rot", _gf_false)) { */ + /* ret = 0; */ + /* goto out; */ + /* } */ + + GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, conf); + + snprintf (path, sizeof (path), "%s/%s", volpath, + GLUSTERD_VOLUME_BITD_INFO_FILE); + + ret = gf_store_handle_retrieve (path, &volinfo->bitd.handle); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "volinfo handle is NULL"); + goto out; + } + + ret = gf_store_iter_new (volinfo->bitd.handle, &iter); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get new store " + "iter"); + goto out; + } + + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); + if (ret) { + if (op_errno != GD_STORE_EOF) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get next store " + "iter"); + goto out; + } + } + + while (!ret) { + ret = gf_store_iter_get_next (iter, &key, &value, + &op_errno); + } + + if (op_errno != GD_STORE_EOF) + goto out; + + ret = gf_store_iter_destroy (iter); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to destroy store " + "iter"); + goto out; + } + + ret = 0; + +out: + return ret; +} + int32_t glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) { @@ -2879,6 +3125,10 @@ glusterd_store_retrieve_volume (char *volname, glusterd_snap_t *snap) if (ret) goto out; + ret = glusterd_store_retrieve_bitd (volinfo); + if (ret) + goto out; + ret = glusterd_compute_cksum (volinfo, _gf_false); if (ret) goto out; diff --git a/xlators/mgmt/glusterd/src/glusterd-svc-helper.c b/xlators/mgmt/glusterd/src/glusterd-svc-helper.c index f17f34c3530..d81ca974b49 100644 --- a/xlators/mgmt/glusterd/src/glusterd-svc-helper.c +++ b/xlators/mgmt/glusterd/src/glusterd-svc-helper.c @@ -17,6 +17,7 @@ #include "glusterd-shd-svc.h" #include "glusterd-quotad-svc.h" #include "glusterd-nfs-svc.h" +#include "glusterd-bitd-svc.h" int glusterd_svcs_reconfigure (glusterd_volinfo_t *volinfo) @@ -50,6 +51,10 @@ glusterd_svcs_reconfigure (glusterd_volinfo_t *volinfo) ret = glusterd_quotadsvc_reconfigure (); if (ret) goto out; + + ret = glusterd_bitdsvc_reconfigure (); + if (ret) + goto out; out: return ret; } @@ -118,6 +123,13 @@ glusterd_svcs_manager (glusterd_volinfo_t *volinfo) ret = 0; if (ret) goto out; + + ret = conf->bitd_svc.manager (&(conf->bitd_svc), volinfo, + PROC_START_NO_WAIT); + if (ret == -EINVAL) + ret = 0; + if (ret) + goto out; out: return ret; } diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 04fa67c6df1..3d8e0b0aaed 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -3034,6 +3034,37 @@ glusterd_spawn_daemons (void *opaque) return ret; } +/* + * As of now, bitd is per node, per volume. But in future if + * bitd is made per node only (i.e. one bitd on a peer will + * take care of all the bricks from all the volumes on that node), + * then the below things such as the location of the info file for + * the bitd has to change. + */ +int +glusterd_restart_bitds (glusterd_conf_t *priv) +{ + glusterd_volinfo_t *volinfo = NULL; + int ret = 0; + xlator_t *this = THIS; + + list_for_each_entry (volinfo, &priv->volumes, vol_list) { + if (volinfo->status == GLUSTERD_STATUS_STARTED) { + //&& glusterd_is_bitd_enabled (volinfo)) { + ret = glusterd_bitd_start (volinfo, + _gf_false); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Couldn't start snapd for " + "vol: %s", volinfo->volname); + goto out; + } + } + } +out: + return ret; +} + void glusterd_do_volume_quorum_action (xlator_t *this, glusterd_volinfo_t *volinfo, gf_boolean_t meets_quorum) @@ -6850,6 +6881,23 @@ glusterd_friend_remove_cleanup_vols (uuid_t uuid) return ret; } +int +glusterd_get_bitd_filepath (char *filepath, glusterd_volinfo_t *volinfo) +{ + int ret = 0; + char path[PATH_MAX] = {0,}; + glusterd_conf_t *priv = NULL; + + priv = THIS->private; + + GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv); + + snprintf (filepath, PATH_MAX, + "%s/%s-bitd.vol", path, volinfo->volname); + + return ret; +} + int glusterd_get_client_filepath (char *filepath, glusterd_volinfo_t *volinfo, gf_transport_type type) diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index 62e2da03ad3..fcd7d9e70d9 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -689,4 +689,48 @@ glusterd_nfs_pmap_deregister (); gf_boolean_t glusterd_is_volume_started (glusterd_volinfo_t *volinfo); +int +glusterd_get_bitd_filepath (char *filepath, glusterd_volinfo_t *volinfo); + +int +glusterd_is_bitd_enabled (glusterd_volinfo_t *volinfo); + +void +glusterd_get_bitd_rundir (glusterd_volinfo_t *volinfo, + char *path, int path_len); + +void +glusterd_get_bitd_volfile (glusterd_volinfo_t *volinfo, + char *path, int path_len); + +void +glusterd_get_bitd_pidfile (glusterd_volinfo_t *volinfo, + char *path, int path_len); + +void +glusterd_set_bitd_socket_filepath (glusterd_volinfo_t *volinfo, + char *path, int path_len); + +gf_boolean_t +glusterd_is_bitd_running (glusterd_volinfo_t *volinfo); + +int32_t +glusterd_bitd_start (glusterd_volinfo_t *volinfo, gf_boolean_t wait); + +int +glusterd_bitd_stop (glusterd_volinfo_t *volinfo); + +int +glusterd_handle_bitd_option (glusterd_volinfo_t *volinfo); + +gf_boolean_t +glusterd_is_bitd_online (glusterd_volinfo_t *volinfo); + +void +glusterd_bitd_set_online_status (glusterd_volinfo_t *volinfo, + gf_boolean_t status); + +int +glusterd_restart_bitds (glusterd_conf_t *priv); + #endif diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index ec915dbaa8a..cdab0653fe7 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -3742,7 +3742,10 @@ build_quotad_graph (volgen_graph_t *graph, dict_t *mod_dict) char *skey = NULL; this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); set_dict = dict_new (); if (!set_dict) { @@ -4143,6 +4146,107 @@ glusterd_snapdsvc_generate_volfile (volgen_graph_t *graph, return ret; } +int +build_bitd_graph (volgen_graph_t *graph, dict_t *mod_dict) +{ + volgen_graph_t cgraph = {0}; + glusterd_volinfo_t *voliter = NULL; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + dict_t *set_dict = NULL; + int ret = 0; + xlator_t *bitd_xl = NULL; + char *skey = NULL; + char transt[16] = {0,}; + + this = THIS; + GF_ASSERT (this); + + priv = this->private; + GF_ASSERT (priv); + + set_dict = dict_new (); + if (!set_dict) { + ret = -ENOMEM; + goto out; + } + + bitd_xl = volgen_graph_add_as (graph, "features/bit-rot", "bit-rot"); + + list_for_each_entry (voliter, &priv->volumes, vol_list) { + if (voliter->status != GLUSTERD_STATUS_STARTED) + continue; + + /*To do: check whether bitd is enable or not if not then + * continue; */ + + /* + * Since bitd is a service running within the trusted storage + * pool, it is treated as a trusted client. + */ + ret = dict_set_uint32 (set_dict, "trusted-client", + GF_CLIENT_TRUSTED); + if (ret) + goto out; + + get_transport_type (voliter, mod_dict, transt, _gf_false); + if (!strcmp (transt, "tcp,rdma")) + strcpy (transt, "tcp"); + + dict_copy (voliter->dict, set_dict); + if (mod_dict) + dict_copy (mod_dict, set_dict); + + ret = gf_asprintf(&skey, "%s.volume-id", voliter->volname); + if (ret == -1) { + gf_log("", GF_LOG_ERROR, "Out of memory"); + goto out; + } + ret = xlator_set_option(bitd_xl, skey, voliter->volname); + GF_FREE (skey); + if (ret) + goto out; + + memset (&cgraph, 0, sizeof (cgraph)); + ret = volgen_graph_build_clients (&cgraph, voliter, set_dict, + NULL); + if (ret) { + ret = -1; + goto out; + } + + if (mod_dict) { + dict_copy (mod_dict, set_dict); + ret = volgen_graph_set_options_generic (&cgraph, set_dict, + voliter, + basic_option_handler); + } else { + ret = volgen_graph_set_options_generic (&cgraph, + voliter->dict, + voliter, + basic_option_handler); + } + if (ret) + goto out; + + ret = volgen_graph_merge_sub (graph, &cgraph, 1); + if (ret) + goto out; + + ret = dict_reset (set_dict); + if (ret) + goto out; + } + +out: + if (set_dict) + dict_unref (set_dict); + + gf_log(this->name, GF_LOG_DEBUG, "Returning %d", ret); + + return ret; +} + int glusterd_snapdsvc_create_volfile (glusterd_volinfo_t *volinfo) { diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h index a3c093422a6..23f20ef093e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.h +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h @@ -162,6 +162,9 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict); int build_quotad_graph (volgen_graph_t *graph, dict_t *mod_dict); +int +build_bitd_graph (volgen_graph_t *graph, dict_t *mod_dict); + int glusterd_delete_volfile (glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *brickinfo); @@ -246,10 +249,13 @@ gd_is_xlator_option (char *key); gf_boolean_t gd_is_boolean_option (char *key); + char* volgen_get_shd_key (glusterd_volinfo_t *volinfo); int glusterd_volopt_validate (glusterd_volinfo_t *volinfo, dict_t *dict, char *key, char *value, char **op_errstr); + + #endif diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c index c977dc1c1c4..b061f3e8c58 100644 --- a/xlators/mgmt/glusterd/src/glusterd.c +++ b/xlators/mgmt/glusterd/src/glusterd.c @@ -41,6 +41,7 @@ #include "glusterd-svc-mgmt.h" #include "glusterd-shd-svc.h" #include "glusterd-nfs-svc.h" +#include "glusterd-bitd-svc.h" #include "glusterd-quotad-svc.h" #include "glusterd-snapd-svc.h" #include "common-utils.h" @@ -1218,6 +1219,16 @@ glusterd_svc_init_all () } gf_log (THIS->name, GF_LOG_DEBUG, "quotad service initialized"); + /* Init BitD svc */ + ret = glusterd_bitdsvc_init (&(priv->bitd_svc)); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Failed to initialized BitD " + "service"); + goto out; + } + gf_log (THIS->name, GF_LOG_DEBUG, "BitD service initialized"); + + out: return ret; } diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 941a587d175..18a71c882be 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -39,6 +39,7 @@ #include "rpcsvc.h" #include "glusterd-sm.h" #include "glusterd-snapd-svc.h" +#include "glusterd-bitd-svc.h" #include "glusterd1-xdr.h" #include "protocol-common.h" #include "glusterd-pmap.h" @@ -136,6 +137,7 @@ typedef struct { rpcsvc_t *rpc; glusterd_svc_t shd_svc; glusterd_svc_t nfs_svc; + glusterd_svc_t bitd_svc; glusterd_svc_t quotad_svc; struct pmap_registry *pmap; struct list_head volumes; @@ -377,6 +379,9 @@ struct glusterd_volinfo_ { gd_quorum_status_t quorum_status; glusterd_snapdsvc_t snapd; + + glusterd_bitdsvc_t bitd; + }; typedef enum gd_snap_status_ { @@ -476,6 +481,7 @@ typedef enum { #define GLUSTERD_PEER_DIR_PREFIX "peers" #define GLUSTERD_VOLUME_INFO_FILE "info" #define GLUSTERD_VOLUME_SNAPD_INFO_FILE "snapd.info" +#define GLUSTERD_VOLUME_BITD_INFO_FILE "bitd.info" #define GLUSTERD_SNAP_INFO_FILE "info" #define GLUSTERD_VOLUME_RBSTATE_FILE "rbstate" #define GLUSTERD_BRICK_INFO_DIR "bricks"