Skip to content

Commit

Permalink
bots: Don't request data again if recently updated
Browse files Browse the repository at this point in the history
If cached data has been recently pulled (perhaps by another bot)
then don't request it again. The except is if we changed something
on GitHub (eg: via a POST request).

The assumption here is that there will be lag retrieving data from
GitHub anyway. So lets institutionalize that assumption and stop
posling GitHub so frequently.

Previously we treated all data in the Github cache as non-current.
That is we did a round trip to GitHub to validate that it was still
current. After this commit we treat recent data as current and
just use it directly.

Closes #6600
  • Loading branch information
stefwalter committed May 20, 2017
1 parent ae7baf1 commit 442d355
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 11 deletions.
8 changes: 7 additions & 1 deletion bots/github/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,15 +161,19 @@ def request(self, method, resource, data="", headers=None):

def get(self, resource):
headers = { }
cached = self.cache.read(self.qualify(resource))
qualified = self.qualify(resource)
cached = self.cache.read(qualified)
if cached:
if self.cache.current(qualified):
return cached
etag = cached['headers'].get("etag", None)
if etag:
headers['If-None-Match'] = etag
response = self.request("GET", resource, "", headers)
if response['status'] == 404:
return None
elif cached and response['status'] == 304: # Not modified
self.cache.write(resource, response)
return json.loads(cached['data'])
elif response['status'] < 200 or response['status'] >= 300:
sys.stderr.write("{0}\n{1}\n".format(resource, response['data']))
Expand All @@ -180,6 +184,7 @@ def get(self, resource):

def post(self, resource, data, accept=[]):
response = self.request("POST", resource, json.dumps(data), { "Content-Type": "application/json" })
self.cache.mark()
status = response['status']
if (status < 200 or status >= 300) and status not in accept:
sys.stderr.write("{0}\n{1}\n".format(resource, response['data']))
Expand All @@ -188,6 +193,7 @@ def post(self, resource, data, accept=[]):

def patch(self, resource, data, accept=[]):
response = self.request("PATCH", resource, json.dumps(data), { "Content-Type": "application/json" })
self.cache.mark()
status = response['status']
if (status < 200 or status >= 300) and status not in accept:
sys.stderr.write("{0}\n{1}\n".format(resource, response['data']))
Expand Down
43 changes: 33 additions & 10 deletions bots/github/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import json
import os
import stat
import sys
import tempfile
import time
import urllib
Expand All @@ -34,33 +33,57 @@
)

class Cache(object):
def __init__(self, directory):
def __init__(self, directory, lag=60):
self.directory = directory
if not os.path.exists(self.directory):
os.makedirs(self.directory)
self.prune()

# The lag tells us how long to assume cached data is "current"
self.lag = lag

# The mark tells us that stuff before this time is not "current"
self.marked = 0

# Prune old expired data from the cache directory
def prune(self):
if not os.path.exists(self.directory):
return
now = time.time()
for filename in os.listdir(self.directory):
path = os.path.join(self.directory, filename)
if os.path.isfile(path) and os.stat(path).st_mtime < now - 7 * 86400:
os.remove(path)

# Read a resource from the cache or return None
def read(self, resource):
path = os.path.join(self.directory, urllib.quote(resource, safe=''))
if not os.path.exists(path):
return None
with open(path, 'r') as fp:
try:
try:
with open(path, 'r') as fp:
return json.load(fp)
except ValueError:
return None
except (IOError, ValueError):
return None

# Write a resource to the cache in an atomic way
def write(self, resource, contents):
path = os.path.join(self.directory, urllib.quote(resource, safe=''))
if not os.path.exists(self.directory):
os.makedirs(self.directory)
(fd, temp) = tempfile.mkstemp(dir=self.directory)
with os.fdopen(fd, 'w') as fp:
json.dump(contents, fp)
os.chmod(temp, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
os.rename(temp, path)

# Tell the cache that stuff before this time is not "current"
def mark(self, mtime=None):
if not mtime:
mtime = time.time()
self.marked = mtime

# Check if a given resource in the cache is "current" or not
def current(self, resource):
path = os.path.join(self.directory, urllib.quote(resource, safe=''))
try:
mtime = os.path.getmtime(path)
return mtime > self.marked and mtime > (time.time() - self.lag)
except OSError:
return False
32 changes: 32 additions & 0 deletions bots/github/test-cache
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import os
import sys
import tempfile
import time
import unittest

import cache
Expand Down Expand Up @@ -54,5 +55,36 @@ class TestCache(unittest.TestCase):
result = c.read("pa+t\%h")
self.assertEqual(result, other)

def testCurrent(self):
c = cache.Cache(self.directory, lag=3)

c.write("resource2", { "value": 2 })
self.assertTrue(c.current("resource2"))

time.sleep(2)
self.assertTrue(c.current("resource2"))

time.sleep(2)
self.assertFalse(c.current("resource2"))

def testCurrentMark(self):
c = cache.Cache(self.directory, lag=3)

self.assertFalse(c.current("resource"))

c.write("resource", { "value": 1 })
self.assertTrue(c.current("resource"))

time.sleep(2)
self.assertTrue(c.current("resource"))

c.mark()
self.assertFalse(c.current("resource"))

def testCurrentZero(self):
c = cache.Cache(self.directory, lag=0)
c.write("resource", { "value": 1 })
self.assertFalse(c.current("resource"))

if __name__ == '__main__':
unittest.main()

0 comments on commit 442d355

Please sign in to comment.