Skip to content

Commit

Permalink
Merge pull request #355 from edgi-govdata-archiving/264-support-etag-…
Browse files Browse the repository at this point in the history
…headers

Support etag headers
  • Loading branch information
jsnshrmn committed Feb 14, 2019
2 parents 2c1273d + 77c4a2f commit 78dc02d
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 3 deletions.
35 changes: 32 additions & 3 deletions web_monitoring/diffing_server.py
Expand Up @@ -2,6 +2,7 @@
from docopt import docopt
import hashlib
import inspect
import functools
import os
import re
import sentry_sdk
Expand Down Expand Up @@ -115,8 +116,38 @@ def options(self):
class DiffHandler(BaseHandler):
# subclass must define `differs` attribute

# If query parameters repeat, take last one.
# Decode clean query parameters into unicode strings and cache the results.
@functools.lru_cache()
def decode_query_params(self):
query_params = {k: v[-1].decode() for k, v in
self.request.arguments.items()}
return query_params

# Compute our own ETag header values.
def compute_etag(self):
# We're not actually hashing content for this, since that is expensive.
validation_bytes = str(
web_monitoring.__version__
+ self.request.path
+ str(self.decode_query_params())
).encode('utf-8')

# Uses the "weak validation" directive since we don't guarantee that future
# responses for the same diff will be byte-for-byte identical.
etag = f'W/"{web_monitoring.utils.hash_content(validation_bytes)}"'
return etag

@tornado.gen.coroutine
def get(self, differ):

# Skip a whole bunch of work if possible.
self.set_etag_header()
if self.check_etag_header():
self.set_status(304)
self.finish()
return

# Find the diffing function registered with the name given by `differ`.
try:
func = self.differs[differ]
Expand All @@ -128,9 +159,7 @@ def get(self, differ):
f'the `/` endpoint.')
return

# If params repeat, take last one. Decode bytes into unicode strings.
query_params = {k: v[-1].decode() for k, v in
self.request.arguments.items()}
query_params = self.decode_query_params()
# The logic here is a bit tortured in order to allow one or both URLs
# to be local files, while still optimizing the common case of two
# remote URLs that we want to fetch in parallel.
Expand Down
32 changes: 32 additions & 0 deletions web_monitoring/tests/test_diffing_server_exc_handling.py
Expand Up @@ -52,6 +52,29 @@ def test_both_local(self):
self.assertEqual(response.code, 200)


class DiffingServerEtagTest(DiffingServerTestCase):
def test_etag_validation(self):
with tempfile.NamedTemporaryFile() as a:
with tempfile.NamedTemporaryFile() as b:
cold_response = self.fetch('/html_token?format=json&include=all&'
f'a=file://{a.name}&b=file://{b.name}')
self.assertEqual(cold_response.code, 200)

etag = cold_response.headers.get('Etag')

warm_response = self.fetch('/html_token?format=json&include=all&'
f'a=file://{a.name}&b=file://{b.name}',
headers={'If-None-Match': etag,
'Accept': 'application/json'})
self.assertEqual(warm_response.code, 304)

mismatch_response = self.fetch('/html_token?format=json&include=all&'
f'a=file://{a.name}&b=file://{b.name}',
headers={'If-None-Match': 'Stale Value',
'Accept': 'application/json'})
self.assertEqual(mismatch_response.code, 200)


class DiffingServerHealthCheckHandlingTest(DiffingServerTestCase):

def test_healthcheck(self):
Expand Down Expand Up @@ -106,42 +129,49 @@ def test_invalid_url_a_format(self):
'a=example.org&b=https://example.org')
self.json_check(response)
self.assertEqual(response.code, 400)
self.assertFalse(response.headers.get('Etag'))

def test_invalid_url_b_format(self):
response = self.fetch('/html_token?format=json&include=all&'
'a=https://example.org&b=example.org')
self.json_check(response)
self.assertEqual(response.code, 400)
self.assertFalse(response.headers.get('Etag'))

def test_invalid_diffing_method(self):
response = self.fetch('/non_existing?format=json&include=all&'
'a=example.org&b=https://example.org')
self.json_check(response)
self.assertEqual(response.code, 404)
self.assertFalse(response.headers.get('Etag'))

def test_missing_url_a(self):
response = self.fetch('/html_token?format=json&include=all&'
'b=https://example.org')
self.json_check(response)
self.assertEqual(response.code, 400)
self.assertFalse(response.headers.get('Etag'))

def test_missing_url_b(self):
response = self.fetch('/html_token?format=json&include=all&'
'a=https://example.org')
self.json_check(response)
self.assertEqual(response.code, 400)
self.assertFalse(response.headers.get('Etag'))

def test_not_reachable_url_a(self):
response = self.fetch('/html_token?format=json&include=all&'
'a=https://eeexample.org&b=https://example.org')
self.json_check(response)
self.assertEqual(response.code, 400)
self.assertFalse(response.headers.get('Etag'))

def test_not_reachable_url_b(self):
response = self.fetch('/html_token?format=json&include=all&'
'a=https://example.org&b=https://eeexample.org')
self.json_check(response)
self.assertEqual(response.code, 400)
self.assertFalse(response.headers.get('Etag'))

def test_missing_params_caller_func(self):
response = self.fetch('http://example.org/')
Expand All @@ -153,6 +183,7 @@ def test_a_is_404(self):
'&a=http://httpstat.us/404'
'&b=https://example.org')
self.assertEqual(response.code, 404)
self.assertFalse(response.headers.get('Etag'))
self.json_check(response)

@patch('web_monitoring.diffing_server.access_control_allow_origin_header', '*')
Expand Down Expand Up @@ -207,6 +238,7 @@ def test_fetch_undecodable_content(self):
f'b=file://{fixture_path("simple.pdf")}')
self.json_check(response)
assert response.code == 422
self.assertFalse(response.headers.get('Etag'))

def test_treats_unknown_encoding_as_ascii(self):
response = mock_tornado_request('unknown_encoding.html')
Expand Down

0 comments on commit 78dc02d

Please sign in to comment.