Skip to content
This repository
Browse code

Merge pull request #18 from daveschaefer/master

In-memory caching using only python
  • Loading branch information...
commit eb98fabf0b7cd0cd8d8f2864f18e43fbae02f9df 2 parents ed6d5ba + e4c3a53
Dave authored March 21, 2013
5  README
@@ -118,6 +118,11 @@ this type, we do not regularly test it.  Currently, the threaded_scanner.py will
118 118
 for SSL services, though work to have it scan for SSH services is pretty minor.
119 119
 
120 120
 
  121
+==== MORE INFO ====
  122
+
  123
+See doc/advanced_notary_configuration.txt for tips on improving notary performance.
  124
+
  125
+
121 126
 ==== CONTRIBUTING ====
122 127
 
123 128
 Please visit the github page to submit changes and suggest improvements:
9  doc/advanced_notary_configuration.txt
... ...
@@ -0,0 +1,9 @@
  1
+There are several options that can make your notary run even better.
  2
+
  3
+
  4
+1. Set up caching!
  5
+
  6
+Data caching will significantly increase your notary's performance.
  7
+
  8
+For best performance you may want to use a dedicated caching server such as memcached, memcachier, or redis. If you do not have access to or don't want to set up a dedicated caching server, use the built-in python caching with '--pycache'.
  9
+
8  notary_http.py
@@ -37,7 +37,7 @@ class NotaryHTTPServer:
37 37
 	Collect and share information on website certificates from around the internet.
38 38
 	"""
39 39
 
40  
-	VERSION = "3.1"
  40
+	VERSION = "pre3.2a"
41 41
 	DEFAULT_WEB_PORT=8080
42 42
 	ENV_PORT_KEY_NAME='PORT'
43 43
 	STATIC_DIR = "notary_static"
@@ -66,6 +66,10 @@ def __init__(self):
66 66
 			help="Use memcachier to cache observation data. " + cache.Memcachier.get_help())
67 67
 		cachegroup.add_argument('--redis', action='store_true', default=False,
68 68
 			help="Use redis to cache observation data. " + cache.Redis.get_help())
  69
+		cachegroup.add_argument('--pycache', default=False, const=cache.Pycache.CACHE_SIZE,
  70
+			nargs='?', metavar=cache.Pycache.get_metavar(),
  71
+			help="Use RAM to cache observation data on the local machine only.\
  72
+			If you don't use any other type of caching, use this! " + cache.Pycache.get_help())
69 73
 
70 74
 		args = parser.parse_args()
71 75
 
@@ -103,6 +107,8 @@ def __init__(self):
103 107
 			self.cache = cache.Memcachier()
104 108
 		elif (args.redis):
105 109
 			self.cache = cache.Redis()
  110
+		elif (args.pycache):
  111
+			self.cache = cache.Pycache(args.pycache)
106 112
 
107 113
 		self.active_threads = 0 
108 114
 		self.args = args
29  test/Network Notary Test Cases.txt
@@ -62,3 +62,32 @@ Failing Gracefully:
62 62
 	- For Machines, and Event Types (on startup) does it log an error, disable database metrics, and continue?
63 63
 	- For Metrics does it ignore the metric, log an error, and continue?
64 64
 - Are metrics throttled back if the server receives many requests in a short period of time? (e.g. 200 requests per second)
  65
+
  66
+
  67
+In-memory caching with pycache:
  68
+-------------------------------
  69
+- If the cache is below the memory limit, are new keys continually added upon request?
  70
+- If adding a new key would use too much memory, does the cache remove an entry and then store the key?
  71
+	- Is the least recently used entry removed?
  72
+	- If removing one entry doesn't clear enough RAM, does the cache remove multiple entries until it has enough space?
  73
+	- Do both the hash and the heap size go down?
  74
+- If a requested object is bigger than total RAM allowed, do we log a warning and not store it?
  75
+- When an existing entry is retrieved from the cache, is its 'last viewed' time updated?
  76
+
  77
+expiry:
  78
+- Are expired entries removed during get() calls and None returned instead?
  79
+- Are expired entries cleaned up as they are encountered when clearing new memory?
  80
+- Are negative expiry times rejected and cache entries not created?
  81
+
  82
+pycache threads:
  83
+- Do we only create a single cache and return the proper results regardless of how many threads the server uses?
  84
+- If multiple threads attempt to set a value for the same key is only one of them allowed to set and the rest return immediately?
  85
+- If multiple threads attempt to set a value for *different* keys, are they all allowed to do so?
  86
+- Is only one thread at a time allowed to adjust the current memory usage?
  87
+
  88
+pycache arguments:
  89
+- Are characters other than 0-9MGB rejected, throwing an error?
  90
+- Does it stop you from specifying MB *and* GB?
  91
+- Is it case insensitive?
  92
+- Does the cache have to be at least 1MB?
  93
+
86  util/cache.py
@@ -102,13 +102,11 @@ def __init__(self):
102 102
 			print >> sys.stderr, "ERROR: Could not connect to memcache server: '%s'. memcache is disabled." % (str(e))
103 103
 			self.pool = None
104 104
 
105  
-
106 105
 	def __del__(self):
107 106
 		"""Clean up resources"""
108 107
 		if (self.pool != None):
109 108
 			self.pool.relinquish()
110 109
 
111  
-
112 110
 	def get(self, key):
113 111
 		"""Retrieve the value for a given key, or None if no key exists."""
114 112
 		if (self.pool != None):
@@ -118,7 +116,6 @@ def get(self, key):
118 116
 			print >> sys.stderr, "Cache does not exist! Create it first"
119 117
 			return None
120 118
 
121  
-
122 119
 	def set(self, key, data, expiry=CacheBase.CACHE_EXPIRY):
123 120
 		"""Save the value to a given key name."""
124 121
 		if (self.pool != None):
@@ -210,3 +207,86 @@ def set(self, key, data, expiry=CacheBase.CACHE_EXPIRY):
210 207
 			self.redis.expire(key, expiry)
211 208
 		else:
212 209
 			print >> sys.stderr, "ERROR: Redis cache does not exist! Create it first"
  210
+
  211
+
  212
+class Pycache(CacheBase):
  213
+	"""
  214
+	Cache data using RAM.
  215
+	"""
  216
+
  217
+	CACHE_SIZE = "50" # megabytes
  218
+
  219
+	@classmethod
  220
+	def get_help(cls):
  221
+		"""Tell the user how they can use this type of cache."""
  222
+		# TODO: quantify how many observation records this can store
  223
+		return "Size can be specified in Megabytes (M/MB) or Gigabytes (G/GB). \
  224
+			Megabytes is assumed if no unit is given. \
  225
+			Default size: " + cls.CACHE_SIZE + "MB."
  226
+
  227
+	@classmethod
  228
+	def get_metavar(cls):
  229
+		"""
  230
+		Return the string that should be used for argparse's metavariable
  231
+		(i.e. the string that explains how to specify a cache size on the command line)
  232
+		"""
  233
+		return "CACHE_SIZE_INTEGER[M|MB|G|GB]"
  234
+
  235
+	def __init__(self, cache_size=CACHE_SIZE):
  236
+		"""Create a cache using RAM."""
  237
+		self.cache = None
  238
+
  239
+		import re
  240
+
  241
+		# let the user specify sizes with the characters 'MB' or 'GB'
  242
+		if (re.search("[^0-9MGBmgb]+", cache_size) != None):
  243
+			raise ValueError("Invalid Pycache cache size '%s': use '%s'." %
  244
+				(str(cache_size), self.get_metavar()))
  245
+
  246
+		if (re.search("[Mm]", cache_size) and re.search("[Gg]", cache_size)):
  247
+			raise ValueError("Invalid Pycache cache size '%s': " % (str(cache_size)) +
  248
+				"specify only one of MB and GB.")
  249
+
  250
+		multiplier = 1024 * 1024 # convert to bytes
  251
+
  252
+		if (re.search("[Gg]", cache_size)):
  253
+			multiplier *= 1024
  254
+
  255
+		# remove non-numeric characters
  256
+		cache_size = cache_size.translate(None, 'MGBmgb')
  257
+		cache_size = int(cache_size)
  258
+
  259
+		if (cache_size < 1):
  260
+			raise ValueError("Invalid Pycache cache size '%s': " % (str(cache_size)) +
  261
+				"cache must be at least 1MB.")
  262
+
  263
+		cache_size *= multiplier
  264
+
  265
+		try:
  266
+			from util import pycache
  267
+			self.cache = pycache
  268
+			pycache.set_cache_size(cache_size)
  269
+		except ImportError, e:
  270
+			print >> sys.stderr, "ERROR: Could not import module 'pycache': '%s'." % (e)
  271
+			self.cache = None
  272
+		except Exception, e:
  273
+			print >> sys.stderr, "ERROR creating cache in memory: '%s'." % (e)
  274
+			self.cache = None
  275
+
  276
+	def get(self, key):
  277
+		"""Retrieve the value for a given key, or None if no key exists."""
  278
+		if (self.cache != None):
  279
+			return self.cache.get(key)
  280
+		else:
  281
+			print >> sys.stderr, "pycache get() error: cache does not exist! create it before retrieving values."
  282
+			return None
  283
+
  284
+	def set(self, key, data, expiry=CacheBase.CACHE_EXPIRY):
  285
+		"""Save the value to a given key name."""
  286
+		if (self.cache != None):
  287
+			try:
  288
+				self.cache.set(key, data, expiry)
  289
+			except Exception, e:
  290
+				print >> sys.stderr, "pycache set() error: '%s'." % (e)
  291
+		else:
  292
+			print >> sys.stderr, "pycache set() error: cache does not exist! create it before setting values."
257  util/pycache.py
... ...
@@ -0,0 +1,257 @@
  1
+#   This file is part of the Perspectives Notary Server
  2
+#
  3
+#   Copyright (C) 2011 Dan Wendlandt
  4
+#
  5
+#   This program is free software: you can redistribute it and/or modify
  6
+#   it under the terms of the GNU General Public License as published by
  7
+#   the Free Software Foundation, version 3 of the License.
  8
+#
  9
+#   This program is distributed in the hope that it will be useful,
  10
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
  11
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12
+#   GNU General Public License for more details.
  13
+#
  14
+#   You should have received a copy of the GNU General Public License
  15
+#   along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
+
  17
+"""
  18
+Cache and retrieve data in key-value pairs using RAM only.
  19
+
  20
+When the cache reaches maximum size entries are discarded in
  21
+'least recently used' order.
  22
+
  23
+This module does not preemptively reserve memory from the OS;
  24
+additional memory is only acquired as needed.
  25
+Make sure you have enough memory to use the cache you request!
  26
+"""
  27
+
  28
+# Use a module so python can ensure there is only one cache regardless of threads.
  29
+# Note this doesn't allow inheritance; if we need that we will need to refactor.
  30
+
  31
+import heapq
  32
+import itertools
  33
+import sys
  34
+import threading
  35
+import time
  36
+
  37
+# Note: the maximum cache size applies only to stored data;
  38
+# the internal structures used for the implementation will cause pycache
  39
+# to use slightly more memory.
  40
+DEFAULT_CACHE_SIZE = 50 * 1024 * 1024 # bytes
  41
+
  42
+
  43
+class CacheEntry(object):
  44
+	"""Store data for a given entry in the cache."""
  45
+
  46
+	def __init__(self, key, data, expiry):
  47
+		"""Create new cache entry."""
  48
+
  49
+		if (expiry < 1):
  50
+			raise ValueError("CacheEntry expiry values must be positive")
  51
+
  52
+		now = int(time.time())
  53
+
  54
+		self.key = key
  55
+		self.data = data
  56
+		self.expiry = now + expiry
  57
+		self.memory_used = sys.getsizeof(data)
  58
+
  59
+		# count the key as having been requested just now, so it is not immediately removed.
  60
+		# this is usually correct, as the caller will likely have just retrieved or calculated
  61
+		# the data before calling us to store it.
  62
+		# this also prevents thrashing so new entries are not rapidly added and then removed from the heap.
  63
+		self.last_requested = now
  64
+
  65
+	def update_request_time(self):
  66
+		"""Update the most recent request time for this cache entry."""
  67
+		self.last_requested = int(time.time())
  68
+
  69
+	def has_expired(self):
  70
+		"""Returns true if this entry has expired; false otherwise."""
  71
+		if (self.expiry < int(time.time())):
  72
+			return True
  73
+		return False
  74
+
  75
+
  76
+class Heap(object):
  77
+	"""Store CacheEntries in a heap.
  78
+	Entries are stored in 'least recently used' order
  79
+	so we know what to remove when we run out of space."""
  80
+
  81
+	# This is a wrapper class to allow use of the heapq module in an Object-Oriented way,
  82
+	# and to contain the logic for our priority queue.
  83
+	# The heap does not store the cached data; it is only used to track the 'least recently used' order
  84
+	# so cache entries can be removed when we need space.
  85
+
  86
+	# This heap uses lazy deletion - entries are not deleted immediately, as we don't want
  87
+	# to spend time traversing and re-creating the heap each time.
  88
+	# Instead entries are marked for deletion and removed when they are encountered via popping.
  89
+
  90
+	# Performance Note:
  91
+	# We could add checks to recreate the heap list if old entries are taking up too much space,
  92
+	# but with keys expiring it should be fine for now.
  93
+	# We could also add a check to see if the counter has grown too large, but iterators use
  94
+	# an infinite stream, so it shouldn't be necessary.
  95
+
  96
+	def __init__(self):
  97
+		"""Create a new heap."""
  98
+		self.heap = []
  99
+		self.current_entries = {}
  100
+		self.counter = itertools.count()
  101
+
  102
+	def __len__(self):
  103
+		"""Return the number of items in the heap."""
  104
+		return len(self.heap)
  105
+
  106
+	def __del__(self):
  107
+		"""Delete the heap."""
  108
+		del self.heap
  109
+		del self.current_entries
  110
+
  111
+	def push(self, cache_entry):
  112
+		"""Add an entry onto the heap."""
  113
+		# use an iterator to break ties if multiple keys are added in the same second;
  114
+		# this ensures tuple comparison works in python 3.
  115
+		# credit for this idea goes to the python docs -
  116
+		# http://docs.python.org/2/library/heapq.html
  117
+		entry_id = next(self.counter)
  118
+
  119
+		heap_entry = [cache_entry.last_requested, entry_id, cache_entry.key]
  120
+		self.current_entries[cache_entry.key] = entry_id
  121
+		heapq.heappush(self.heap, heap_entry)
  122
+
  123
+	def update(self, cache_entry):
  124
+		"""Update the value of a heap entry."""
  125
+		# this is a convenience function to make it easier to understand what's happening.
  126
+		# entries are not actually updated in-place (that takes too long);
  127
+		# instead a new entry is created and the current one marked for lazy deletion later
  128
+		# (the entry is 'marked' for deletion by replacing the entry_id for that key in current_entries)
  129
+		self.push(cache_entry)
  130
+
  131
+	def pop(self):
  132
+		"""Remove the least recently used heap entry."""
  133
+		while self.heap:
  134
+			last_requested, entry_id, key = heapq.heappop(self.heap)
  135
+			if (key in self.current_entries and (self.current_entries[key] == entry_id)):
  136
+				del self.current_entries[key]
  137
+				return key
  138
+			# otherwise the element we just popped is either expired or an old junk entry;
  139
+			# discard it and continue.
  140
+		raise IndexError("Heap has no entries to pop")
  141
+
  142
+	def remove(self, cache_entry):
  143
+		"""Remove the entry from the heap."""
  144
+		# a convenience function: entries are not removed immediately but marked for lazy deletion.
  145
+		if cache_entry.key in self.current_entries:
  146
+			del self.current_entries[cache_entry.key]
  147
+		# else: don't worry - some other thread might have removed the entry just before us.
  148
+
  149
+
  150
+def __free_memory(mem_needed):
  151
+	"""Remove entries from the heap and cache until we have enough free memory."""
  152
+	global current_mem
  153
+	global max_mem
  154
+
  155
+	with mem_lock:
  156
+		while heap and (current_mem + mem_needed > max_mem):
  157
+			key = heap.pop()
  158
+			if key in cache:
  159
+				# naive implementation - we don't worry about discarding a non-expired item
  160
+				# before all expired items are gone.
  161
+				# we just want to clear *some* memory for the new item as fast as possible.
  162
+				# if this really hurts performance we could refactor.
  163
+				__delete_key(key)
  164
+			else:
  165
+				raise KeyError("The heap key '%s' does not exist in the cache and cannot be removed." % (key))
  166
+
  167
+
  168
+def __delete_key(key):
  169
+	"""Remove this entry from the cache."""
  170
+	global current_mem
  171
+
  172
+	with mem_lock:
  173
+		current_mem -= cache[key].memory_used
  174
+		del cache[key]
  175
+
  176
+
  177
+def set_cache_size(size):
  178
+	"""Set the maximum amount of RAM to use, in bytes."""
  179
+	size = int(size)
  180
+	if size > 0:
  181
+		with mem_lock:
  182
+			global max_mem
  183
+			max_mem = size
  184
+
  185
+
  186
+def set(key, data, expiry):
  187
+	"""Save the value to a given key."""
  188
+	global current_mem
  189
+	global max_mem
  190
+
  191
+	with set_lock:
  192
+		if key in set_threads:
  193
+			# some other thread is already updating the value for this key.
  194
+			# don't compete or waste time calculating a possibly duplicate value
  195
+			return
  196
+		else:
  197
+			set_threads[key] = True
  198
+
  199
+	try:
  200
+		entry = CacheEntry(key, data, expiry)
  201
+
  202
+		if (entry.memory_used > max_mem):
  203
+			print >> sys.stderr, "ERROR: cannot store data for '%s' - it's larger than the max cache size (%s bytes)\n" \
  204
+				% (key, max_mem)
  205
+			return
  206
+
  207
+		with mem_lock:
  208
+
  209
+			# add/replace the entry in the hash;
  210
+			# this tracks whether we have the key at all.
  211
+			if entry.key in cache:
  212
+				current_mem -= cache[key].memory_used # subtract the memory we gain back
  213
+
  214
+			if (current_mem + entry.memory_used > max_mem):
  215
+				__free_memory(entry.memory_used)
  216
+
  217
+			heap.push(entry)
  218
+			cache[key] = entry
  219
+			current_mem += entry.memory_used
  220
+
  221
+	finally:
  222
+		del set_threads[key]
  223
+
  224
+
  225
+def get(key):
  226
+	"""Retrieve the value for a given key, or None if no key exists."""
  227
+	if key not in cache:
  228
+		return None
  229
+
  230
+	if (cache[key].has_expired()):
  231
+		heap.remove(cache[key])
  232
+		__delete_key(key)
  233
+		return None
  234
+
  235
+	cache[key].update_request_time()
  236
+	heap.update(cache[key])
  237
+
  238
+	return cache[key].data
  239
+
  240
+
  241
+
  242
+# Use a dictionary to efficiently store/retrieve data
  243
+# and a heap to maintain a 'least recently used' order.
  244
+cache = {}
  245
+heap = Heap()
  246
+
  247
+current_mem = 0 # bytes
  248
+max_mem = DEFAULT_CACHE_SIZE
  249
+
  250
+
  251
+# we don't care if we get a slightly out of date value when retrieving,
  252
+# but prevent multiple set() calls from writing data for the same key at the same time.
  253
+set_threads = {}
  254
+set_lock = threading.Lock()
  255
+
  256
+# prevent multiple threads from altering memory counts at the same time
  257
+mem_lock = threading.RLock()

0 notes on commit eb98fab

Please sign in to comment.
Something went wrong with that request. Please try again.