diff --git a/folly/Arena.h b/folly/Arena.h
index c119d8c16ee..9798dc81a91 100644
--- a/folly/Arena.h
+++ b/folly/Arena.h
@@ -18,12 +18,13 @@
 #define FOLLY_ARENA_H_
 
 #include
-#include
 #include
+#include
 #include
 
 #include "folly/Likely.h"
 #include "folly/Malloc.h"
+#include "folly/Memory.h"
 
 namespace folly {
@@ -52,8 +53,7 @@ namespace folly {
  * guaranteed to be rounded up to a multiple of the maximum alignment
  * required on your system; the returned value must be also.
  *
- * An implementation that uses malloc() / free() is defined below, see
- * SysAlloc / SysArena.
+ * An implementation that uses malloc() / free() is defined below, see SysArena.
  */
 template <class Alloc> struct ArenaAllocatorTraits;
 template <class Alloc>
@@ -197,6 +197,9 @@ class Arena {
   size_t sizeLimit_;
 };
 
+template <class Alloc>
+struct IsArenaAllocator<Arena<Alloc>> : std::true_type { };
+
 /**
  * By default, don't pad the given size.
  */
@@ -207,21 +210,6 @@ struct ArenaAllocatorTraits {
   }
 };
 
-/**
- * Arena-compatible allocator that calls malloc() and free(); see
- * goodMallocSize() in Malloc.h for goodSize().
- */
-class SysAlloc {
- public:
-  void* allocate(size_t size) {
-    return checkedMalloc(size);
-  }
-
-  void deallocate(void* p) {
-    free(p);
-  }
-};
-
 template <>
 struct ArenaAllocatorTraits<SysAlloc> {
   static size_t goodSize(const SysAlloc& alloc, size_t size) {
@@ -234,13 +222,15 @@ struct ArenaAllocatorTraits<SysAlloc> {
  */
 class SysArena : public Arena<SysAlloc> {
  public:
-  explicit SysArena(
-      size_t minBlockSize = kDefaultMinBlockSize,
-      size_t sizeLimit = 0)
-    : Arena<SysAlloc>(SysAlloc(), minBlockSize, sizeLimit) {
+  explicit SysArena(size_t minBlockSize = kDefaultMinBlockSize,
+                    size_t sizeLimit = 0)
+    : Arena<SysAlloc>(SysAlloc(), minBlockSize, sizeLimit) {
   }
 };
 
+template <>
+struct IsArenaAllocator<SysArena> : std::true_type { };
+
 }  // namespace folly
 
 #include "folly/Arena-inl.h"
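The new trait can be exercised directly. The following sketch is not part of the patch; it only illustrates, with compile-time checks, what the specializations added above assert:

#include "folly/Arena.h"

// Arena<Alloc> and SysArena advertise a no-op deallocate(); SysAlloc keeps
// the primary template's false value because its deallocate() really frees.
static_assert(folly::IsArenaAllocator<folly::SysArena>::value, "");
static_assert(folly::IsArenaAllocator<folly::Arena<folly::SysAlloc>>::value, "");
static_assert(!folly::IsArenaAllocator<folly::SysAlloc>::value, "");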
diff --git a/folly/ConcurrentSkipList-inl.h b/folly/ConcurrentSkipList-inl.h
index deb34737de9..c303ac03849 100644
--- a/folly/ConcurrentSkipList-inl.h
+++ b/folly/ConcurrentSkipList-inl.h
@@ -20,11 +20,19 @@
 #define FOLLY_CONCURRENTSKIPLIST_INL_H_
 
 #include
+#include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
 #include
-
+#include
 #include
+
+#include "folly/Memory.h"
 #include "folly/SmallLocks.h"
 #include "folly/ThreadLocal.h"
 
@@ -33,7 +41,7 @@ namespace folly { namespace detail {
 template<typename ValT, typename NodeT> class csl_iterator;
 
 template<typename T>
-class SkipListNode : boost::noncopyable {
+class SkipListNode : private boost::noncopyable {
   enum {
     IS_HEAD_NODE = 1,
     MARKED_FOR_REMOVAL = (1 << 1),
@@ -42,22 +50,30 @@ class SkipListNode : boost::noncopyable {
  public:
   typedef T value_type;
 
-  template<typename U,
-           typename=typename std::enable_if<std::is_convertible<U, T>::value>::type>
-  static SkipListNode* create(int height, U&& data, bool isHead = false) {
+  template<typename NodeAlloc, typename U,
+           typename=typename std::enable_if<std::is_convertible<U, T>::value>::type>
+  static SkipListNode* create(
+      NodeAlloc& alloc, int height, U&& data, bool isHead = false) {
     DCHECK(height >= 1 && height < 64) << height;
 
     size_t size = sizeof(SkipListNode) +
       height * sizeof(std::atomic<SkipListNode*>);
-    auto* node = static_cast<SkipListNode*>(malloc(size));
+    auto* node = static_cast<SkipListNode*>(alloc.allocate(size));
     // do placement new
     new (node) SkipListNode(height, std::forward<U>(data), isHead);
     return node;
   }
 
-  static void destroy(SkipListNode* node) {
+  template<typename NodeAlloc>
+  static void destroy(NodeAlloc& alloc, SkipListNode* node) {
     node->~SkipListNode();
-    free(node);
+    alloc.deallocate(node);
+  }
+
+  template<typename NodeAlloc>
+  static constexpr bool destroyIsNoOp() {
+    return IsArenaAllocator<NodeAlloc>::value &&
+           boost::has_trivial_destructor<std::atomic<SkipListNode*>>::value;
   }
 
   // copy the head node to a new head node assuming lock acquired
@@ -178,7 +194,6 @@ class SkipListRandomHeight {
   }
 
  private:
-
   SkipListRandomHeight() {
     initLookupTable();
   }
 
   void initLookupTable() {
@@ -211,6 +226,110 @@ class SkipListRandomHeight {
   size_t sizeLimitTable_[kMaxHeight];
 };
 
-}}
+template<typename NodeType, typename NodeAlloc, typename = void>
+class NodeRecycler;
+
+template<typename NodeType, typename NodeAlloc>
+class NodeRecycler<NodeType, NodeAlloc, typename std::enable_if<
+  !NodeType::template destroyIsNoOp<NodeAlloc>()>::type> {
+ public:
+  explicit NodeRecycler(const NodeAlloc& alloc)
+    : alloc_(alloc), refs_(0), dirty_(false) { lock_.init(); }
+
+  ~NodeRecycler() {
+    CHECK_EQ(refs(), 0);
+    if (nodes_) {
+      for (auto& node : *nodes_) {
+        NodeType::destroy(alloc_, node);
+      }
+    }
+  }
+
+  void add(NodeType* node) {
+    std::lock_guard<MicroSpinLock> g(lock_);
+    if (nodes_.get() == nullptr) {
+      nodes_.reset(new std::vector<NodeType*>(1, node));
+    } else {
+      nodes_->push_back(node);
+    }
+    DCHECK_GT(refs(), 0);
+    dirty_.store(true, std::memory_order_relaxed);
+  }
+
+  int addRef() {
+    return refs_.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  int releaseRef() {
+    // We don't expect to clean the recycler immediately everytime it is OK
+    // to do so. Here, it is possible that multiple accessors all release at
+    // the same time but nobody would clean the recycler here. If this
+    // happens, the recycler will usually still get cleaned when
+    // such a race doesn't happen. The worst case is the recycler will
+    // eventually get deleted along with the skiplist.
+    if (LIKELY(!dirty_.load(std::memory_order_relaxed) || refs() > 1)) {
+      return refs_.fetch_add(-1, std::memory_order_relaxed);
+    }
+
+    std::unique_ptr<std::vector<NodeType*>> newNodes;
+    {
+      std::lock_guard<MicroSpinLock> g(lock_);
+      if (nodes_.get() == nullptr || refs() > 1) {
+        return refs_.fetch_add(-1, std::memory_order_relaxed);
+      }
+      // once refs_ reaches 1 and there is no other accessor, it is safe to
+      // remove all the current nodes in the recycler, as we already acquired
+      // the lock here so no more new nodes can be added, even though new
+      // accessors may be added after that.
+      newNodes.swap(nodes_);
+      dirty_.store(false, std::memory_order_relaxed);
+    }
+
+    // TODO(xliu) should we spawn a thread to do this when there are large
+    // number of nodes in the recycler?
+    for (auto& node : *newNodes) {
+      NodeType::destroy(alloc_, node);
+    }
+
+    // decrease the ref count at the very end, to minimize the
+    // chance of other threads acquiring lock_ to clear the deleted
+    // nodes again.
+    return refs_.fetch_add(-1, std::memory_order_relaxed);
+  }
+
+  NodeAlloc& alloc() { return alloc_; }
+
+ private:
+  int refs() const {
+    return refs_.load(std::memory_order_relaxed);
+  }
+
+  NodeAlloc alloc_;
+  std::unique_ptr<std::vector<NodeType*>> nodes_;
+  std::atomic<int32_t> refs_; // current number of visitors to the list
+  std::atomic<bool> dirty_; // whether *nodes_ is non-empty
+  MicroSpinLock lock_; // protects access to *nodes_
+};
+
+// In case of arena allocator, no recycling is necessary, and it's possible
+// to save on ConcurrentSkipList size.
+template<typename NodeType, typename NodeAlloc>
+class NodeRecycler<NodeType, NodeAlloc, typename std::enable_if<
+  NodeType::template destroyIsNoOp<NodeAlloc>()>::type> {
+ public:
+  explicit NodeRecycler(const NodeAlloc& alloc) : alloc_(alloc) { }
+
+  void addRef() { }
+  void releaseRef() { }
+
+  void add(NodeType* node) { }
+
+  NodeAlloc& alloc() { return alloc_; }
+
+ private:
+  NodeAlloc alloc_;
+};
+
+}}  // namespaces
 
 #endif  // FOLLY_CONCURRENTSKIPLIST_INL_H_
diff --git a/folly/ConcurrentSkipList.h b/folly/ConcurrentSkipList.h
index 42642b4b6f4..265f5fc5e9c 100644
--- a/folly/ConcurrentSkipList.h
+++ b/folly/ConcurrentSkipList.h
@@ -122,21 +122,25 @@ Sample usage:
 
 #include
 #include
-#include
+#include
 #include
 #include
-#include
-#include
 #include
 #include
 
 #include "folly/ConcurrentSkipList-inl.h"
 #include "folly/Likely.h"
+#include "folly/Memory.h"
 #include "folly/SmallLocks.h"
 
 namespace folly {
 
-template<typename T, typename Comp=std::less<T>, int MAX_HEIGHT=24>
+template<typename T,
+         typename Comp = std::less<T>,
+         // All nodes are allocated using provided SimpleAllocator,
+         // it should be thread-safe.
+         typename NodeAlloc = SysAlloc,
+         int MAX_HEIGHT = 24>
 class ConcurrentSkipList {
   // MAX_HEIGHT needs to be at least 2 to suppress compiler
   // warnings/errors (Werror=uninitialized tiggered due to preds_[1]
@@ -144,7 +148,7 @@ class ConcurrentSkipList {
   static_assert(MAX_HEIGHT >= 2 && MAX_HEIGHT < 64,
       "MAX_HEIGHT can only be in the range of [2, 64)");
   typedef std::unique_lock<MicroSpinLock> ScopedLocker;
-  typedef ConcurrentSkipList<T, Comp, MAX_HEIGHT> SkipListType;
+  typedef ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT> SkipListType;
 
  public:
   typedef detail::SkipListNode<T> NodeType;
@@ -157,17 +161,20 @@ class ConcurrentSkipList {
 
   class Accessor;
   class Skipper;
 
-  explicit ConcurrentSkipList(int height)
-    : head_(NodeType::create(height, value_type(), true)), size_(0) { }
+  explicit ConcurrentSkipList(int height, const NodeAlloc& alloc = NodeAlloc())
+    : recycler_(alloc),
+      head_(NodeType::create(recycler_.alloc(), height, value_type(), true)),
+      size_(0) { }
 
   // Convenient function to get an Accessor to a new instance.
-  static Accessor create(int height = 1) {
-    return Accessor(createInstance(height));
+  static Accessor create(int height = 1, const NodeAlloc& alloc = NodeAlloc()) {
+    return Accessor(createInstance(height, alloc));
   }
 
   // Create a shared_ptr skiplist object with initial head height.
-  static std::shared_ptr<SkipListType> createInstance(int height = 1) {
-    return std::make_shared<ConcurrentSkipList>(height);
+  static std::shared_ptr<SkipListType> createInstance(
+      int height = 1, const NodeAlloc& alloc = NodeAlloc()) {
+    return std::make_shared<ConcurrentSkipList>(height, alloc);
   }
 
   //===================================================================
@@ -176,10 +183,13 @@ class ConcurrentSkipList {
   //===================================================================
 
   ~ConcurrentSkipList() {
-    CHECK_EQ(recycler_.refs(), 0);
+    /* static */ if (NodeType::template destroyIsNoOp<NodeAlloc>()) {
+      // Avoid traversing the list if using arena allocator.
+      return;
+    }
     for (NodeType* current = head_.load(std::memory_order_relaxed); current; ) {
       NodeType* tmp = current->skip(0);
-      NodeType::destroy(current);
+      NodeType::destroy(recycler_.alloc(), current);
       current = tmp;
     }
   }
@@ -219,84 +229,12 @@ class ConcurrentSkipList {
     return foundLayer;
   }
 
-  struct Recycler : private boost::noncopyable {
-    Recycler() : refs_(0), dirty_(false) { lock_.init(); }
-
-    ~Recycler() {
-      if (nodes_) {
-        for (auto& node : *nodes_) {
-          NodeType::destroy(node);
-        }
-      }
-    }
-
-    void add(NodeType* node) {
-      std::lock_guard<MicroSpinLock> g(lock_);
-      if (nodes_.get() == nullptr) {
-        nodes_.reset(new std::vector<NodeType*>(1, node));
-      } else {
-        nodes_->push_back(node);
-      }
-      DCHECK_GT(refs(), 0);
-      dirty_.store(true, std::memory_order_relaxed);
-    }
-
-    int refs() const {
-      return refs_.load(std::memory_order_relaxed);
-    }
-
-    int addRef() {
-      return refs_.fetch_add(1, std::memory_order_relaxed);
-    }
-
-    int release() {
-      // We don't expect to clean the recycler immediately everytime it is OK
-      // to do so. Here, it is possible that multiple accessors all release at
-      // the same time but nobody would clean the recycler here. If this
-      // happens, the recycler will usually still get cleaned when
-      // such a race doesn't happen. The worst case is the recycler will
-      // eventually get deleted along with the skiplist.
-      if (LIKELY(!dirty_.load(std::memory_order_relaxed) || refs() > 1)) {
-        return refs_.fetch_add(-1, std::memory_order_relaxed);
-      }
-
-      std::unique_ptr<std::vector<NodeType*>> newNodes;
-      {
-        std::lock_guard<MicroSpinLock> g(lock_);
-        if (nodes_.get() == nullptr || refs() > 1) {
-          return refs_.fetch_add(-1, std::memory_order_relaxed);
-        }
-        // once refs_ reaches 1 and there is no other accessor, it is safe to
-        // remove all the current nodes in the recycler, as we already acquired
-        // the lock here so no more new nodes can be added, even though new
-        // accessors may be added after that.
-        newNodes.swap(nodes_);
-        dirty_.store(false, std::memory_order_relaxed);
-      }
-
-      // TODO(xliu) should we spawn a thread to do this when there are large
-      // number of nodes in the recycler?
-      for (auto& node : *newNodes) {
-        NodeType::destroy(node);
-      }
-
-      // decrease the ref count at the very end, to minimize the
-      // chance of other threads acquiring lock_ to clear the deleted
-      // nodes again.
-      return refs_.fetch_add(-1, std::memory_order_relaxed);
-    }
-
-   private:
-    std::unique_ptr<std::vector<NodeType*>> nodes_;
-    std::atomic<int32_t> refs_; // current number of visitors to the list
-    std::atomic<bool> dirty_; // whether *nodes_ is non-empty
-    MicroSpinLock lock_; // protects access to *nodes_
-  };  // class ConcurrentSkipList::Recycler
-
   size_t size() const { return size_.load(std::memory_order_relaxed); }
+  int height() const { return head_.load(std::memory_order_consume)->height(); }
+  int maxLayer() const { return height() - 1; }
 
   size_t incrementSize(int delta) {
@@ -376,7 +314,8 @@ class ConcurrentSkipList {
     }
 
     // locks acquired and all valid, need to modify the links under the locks.
-    newNode = NodeType::create(nodeHeight, std::forward<U>(data));
+    newNode =
+      NodeType::create(recycler_.alloc(), nodeHeight, std::forward<U>(data));
     for (int layer = 0; layer < nodeHeight; ++layer) {
       newNode->setSkip(layer, succs[layer]);
       preds[layer]->setSkip(layer, newNode);
@@ -537,7 +476,8 @@ class ConcurrentSkipList {
       return;
     }
 
-    NodeType* newHead = NodeType::create(height, value_type(), true);
+    NodeType* newHead =
+      NodeType::create(recycler_.alloc(), height, value_type(), true);
 
     { // need to guard the head node in case others are adding/removing
       // nodes linked to the head.
@@ -547,7 +487,7 @@ class ConcurrentSkipList {
       if (!head_.compare_exchange_strong(expected, newHead,
           std::memory_order_release)) {
         // if someone has already done the swap, just return.
-        NodeType::destroy(newHead);
+        NodeType::destroy(recycler_.alloc(), newHead);
         return;
       }
       oldHead->setMarkedForRemoval();
@@ -559,15 +499,15 @@ class ConcurrentSkipList {
     recycler_.add(node);
   }
 
+  detail::NodeRecycler<NodeType, NodeAlloc> recycler_;
   std::atomic<NodeType*> head_;
-  Recycler recycler_;
   std::atomic<size_t> size_;
 };
 
-template<typename T, typename Comp, int MAX_HEIGHT>
-class ConcurrentSkipList<T, Comp, MAX_HEIGHT>::Accessor {
+template<typename T, typename Comp, typename NodeAlloc, int MAX_HEIGHT>
+class ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT>::Accessor {
   typedef detail::SkipListNode<T> NodeType;
-  typedef ConcurrentSkipList<T, Comp, MAX_HEIGHT> SkipListType;
+  typedef ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT> SkipListType;
  public:
   typedef T value_type;
   typedef T key_type;
@@ -607,7 +547,7 @@ class ConcurrentSkipList<T, Comp, MAX_HEIGHT>::Accessor {
   Accessor& operator=(const Accessor &accessor) {
     if (this != &accessor) {
       slHolder_ = accessor.slHolder_;
-      sl_->recycler_.release();
+      sl_->recycler_.releaseRef();
      sl_ = accessor.sl_;
       sl_->recycler_.addRef();
     }
@@ -615,7 +555,7 @@
   }
 
   ~Accessor() {
-    sl_->recycler_.release();
+    sl_->recycler_.releaseRef();
   }
 
   bool empty() const { return sl_->size() == 0; }
@@ -734,10 +674,10 @@ class detail::csl_iterator :
 };
 
 // Skipper interface
-template<typename T, typename Comp, int MAX_HEIGHT>
-class ConcurrentSkipList<T, Comp, MAX_HEIGHT>::Skipper {
+template<typename T, typename Comp, typename NodeAlloc, int MAX_HEIGHT>
+class ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT>::Skipper {
   typedef detail::SkipListNode<T> NodeType;
-  typedef ConcurrentSkipList<T, Comp, MAX_HEIGHT> SkipListType;
+  typedef ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT> SkipListType;
   typedef typename SkipListType::Accessor Accessor;
  public:
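To make the new NodeAlloc parameter concrete, here is a hypothetical usage sketch, not part of the patch. ArenaRefAlloc is an invented, copyable SimpleAllocator that borrows a long-lived SysArena; its IsArenaAllocator specialization lets the skip list pick the no-op NodeRecycler above (provided the node's internal atomics are detected as trivially destructible). The arena must outlive the skip list.

#include <cstddef>
#include <functional>
#include "folly/Arena.h"
#include "folly/ConcurrentSkipList.h"

// Hypothetical adapter, invented for this example: forwards allocations to a
// SysArena and never frees individual blocks; the arena reclaims everything
// at once when it is destroyed.
struct ArenaRefAlloc {
  explicit ArenaRefAlloc(folly::SysArena& arena) : arena_(&arena) { }
  void* allocate(size_t size) { return arena_->allocate(size); }
  void deallocate(void*) { }  // no-op by design
  folly::SysArena* arena_;
};

namespace folly {
// Declare the no-op deallocate() so the recycling-free paths can be used.
template <> struct IsArenaAllocator<ArenaRefAlloc> : std::true_type { };
}

int main() {
  folly::SysArena arena;
  typedef folly::ConcurrentSkipList<int, std::less<int>, ArenaRefAlloc>
      SkipListT;
  auto accessor = SkipListT::create(/* height = */ 10, ArenaRefAlloc(arena));
  accessor.insert(23);
  return accessor.contains(23) ? 0 : 1;
}

With the default SysAlloc, the same code would instead go through the locking NodeRecycler and free each node individually.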
diff --git a/folly/Memory.h b/folly/Memory.h
index c9948569ced..8394f6ad24c 100644
--- a/folly/Memory.h
+++ b/folly/Memory.h
@@ -19,13 +19,13 @@
 
 #include "folly/Traits.h"
 
-#include
-#include
-#include
+#include
+#include
 #include
+#include
+#include
 #include
-
-#include
 
 namespace folly {
 
@@ -42,12 +42,7 @@ std::unique_ptr<T> make_unique(Args&&... args) {
   return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
 }
 
-/*
- * StlAllocator wraps a SimpleAllocator into a STL-compliant
- * allocator, maintaining an instance pointer to the simple allocator
- * object. The underlying SimpleAllocator object must outlive all
- * instances of StlAllocator using it.
- *
+/**
  * A SimpleAllocator must provide two methods:
  *
  *    void* allocate(size_t size);
 *    void deallocate(void* p);
 *
@@ -58,26 +53,31 @@ std::unique_ptr<T> make_unique(Args&&... args) {
  * if the allocation can't be satisfied, and free a previously
  * allocated block.
  *
- * Note that the following allocator resembles the standard allocator
- * quite well:
- *
- *   class MallocAllocator {
- *    public:
- *     void* allocate(size_t size) {
- *       void* p = malloc(size);
- *       if (!p) throw std::bad_alloc();
- *       return p;
- *     }
- *     void deallocate(void* p) {
- *       free(p);
- *     }
- *   };
+ * SysAlloc resembles the standard allocator.
+ */
+class SysAlloc {
+ public:
+  void* allocate(size_t size) {
+    void* p = ::malloc(size);
+    if (!p) throw std::bad_alloc();
+    return p;
+  }
+  void deallocate(void* p) {
+    ::free(p);
+  }
+};
+
+/**
+ * StlAllocator wraps a SimpleAllocator into a STL-compliant
+ * allocator, maintaining an instance pointer to the simple allocator
+ * object. The underlying SimpleAllocator object must outlive all
+ * instances of StlAllocator using it.
  *
  * But note that if you pass StlAllocator to a
  * standard container it will be larger due to the contained state
  * pointer.
 *
- * author: Tudor Bosman
+ * @author: Tudor Bosman
  */
 
 // This would be so much simpler with std::allocator_traits, but gcc 4.6.2
@@ -341,6 +341,12 @@ std::shared_ptr<T> allocate_shared(Allocator&& allocator, Args&&... args) {
   );
 }
 
+/**
+ * IsArenaAllocator<T>::value describes whether SimpleAllocator has
+ * no-op deallocate().
+ */
+template <class T> struct IsArenaAllocator : std::false_type { };
+
 }  // namespace folly
 
 #endif /* FOLLY_MEMORY_H_ */
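As a small, hypothetical usage sketch (not part of the patch): a SimpleAllocator such as the relocated SysAlloc, or a SysArena, can be handed to standard containers through the StlAllocator wrapper described in the comment above. The <SimpleAllocator, T> parameter order and the pointer-taking constructor are assumptions about the existing StlAllocator, and the arena must outlive the container.

#include <vector>
#include "folly/Arena.h"
#include "folly/Memory.h"

int main() {
  folly::SysArena arena;
  // StlAllocator stores a pointer to the SimpleAllocator, so copies of the
  // allocator made by the container all share the same arena.
  typedef folly::StlAllocator<folly::SysArena, int> IntAlloc;
  IntAlloc alloc(&arena);
  std::vector<int, IntAlloc> vec(alloc);
  vec.push_back(42);
  // Nothing is freed per element; the arena releases all memory at once.
  return vec.size() == 1 ? 0 : 1;
}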
diff --git a/folly/test/ArenaTest.cpp b/folly/test/ArenaTest.cpp
index b2c2aec525f..f54c0892539 100644
--- a/folly/test/ArenaTest.cpp
+++ b/folly/test/ArenaTest.cpp
@@ -25,6 +25,8 @@
 
 using namespace folly;
 
+static_assert(IsArenaAllocator<SysArena>::value, "");
+
 TEST(Arena, SizeSanity) {
   std::set<size_t*> allocatedItems;