From 4f68c521eb242d219174d9cd92db9e2ffde9269b Mon Sep 17 00:00:00 2001
From: Gabriel
Date: Tue, 8 May 2012 22:26:53 +0200
Subject: [PATCH] Second side-effect of relaxed: normalise non-compliant spacing. Test this, both in strict and relaxed mode.

---
 rfc6266.py      | 22 +++++++++++++++++++---
 test_rfc6266.py | 12 ++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/rfc6266.py b/rfc6266.py
index f950571..15b2a61 100644
--- a/rfc6266.py
+++ b/rfc6266.py
@@ -199,11 +199,23 @@ def parse_headers(content_disposition, location=None, relaxed=False):
     # remove CR and LF even if they aren't part of a CRLF.
     # However http doesn't allow isolated CR and LF in headers outside
     # of LWS.
-    assert is_lws_safe(content_disposition)
 
     if relaxed:
+        # Relaxed has two effects (so far):
+        # the grammar allows a final ';' in the header;
+        # we do LWS-folding, and possibly normalise other broken
+        # whitespace, instead of rejecting non-lws-safe text.
+        # XXX Would prefer to accept only the quoted whitespace
+        # case, rather than normalising everything.
+        content_disposition = normalize_ws(content_disposition)
         parser = content_disposition_value_relaxed
     else:
+        # Turns out this is occasionally broken: two spaces inside
+        # a quoted_string's qdtext. Firefox and Chrome save the two spaces.
+        if not is_lws_safe(content_disposition):
+            raise ValueError(
+                content_disposition, 'Contains nonstandard whitespace')
+
         parser = content_disposition_value
 
     try:
@@ -366,7 +378,11 @@ def fits_inside_codec(text, codec):
 
 
 def is_lws_safe(text):
-    return ' '.join(text.split()) == text
+    return normalize_ws(text) == text
+
+
+def normalize_ws(text):
+    return ' '.join(text.split())
 
 
 def qd_quote(text):
@@ -378,7 +394,7 @@ def build_header(
 ):
     """Generate a Content-Disposition header for a given filename.
 
-    For legacy clients that don't understant the filename* parameter,
+    For legacy clients that don't understand the filename* parameter,
     a filename_compat value may be given.
     It should either be ascii-only (recommended) or iso-8859-1 only.
     In the later case it should be a character string
diff --git a/test_rfc6266.py b/test_rfc6266.py
index 9b8ffdc..a6b7d52 100644
--- a/test_rfc6266.py
+++ b/test_rfc6266.py
@@ -60,6 +60,13 @@ def test_strict():
     # Trailing ; means the header is rejected
     assert parse_headers('attachment;').disposition == 'inline'
     assert parse_headers('attachment; key=val;').disposition == 'inline'
+    try:
+        cd = parse_headers(
+            'attachment; filename="spa  ced";')
+    except ValueError:
+        assert True
+    else:
+        assert False, cd
 
 
 def test_relaxed():
@@ -67,6 +74,11 @@ def test_relaxed():
         'attachment;', relaxed=True).disposition == 'attachment'
     assert parse_headers(
         'attachment; key=val;', relaxed=True).disposition == 'attachment'
+    cd = parse_headers(
+        'attachment; filename="spa  ced";',
+        relaxed=True)
+    assert cd.filename_unsafe == u'spa ced'
+
 
 
 def test_roundtrip():