From 4f68c521eb242d219174d9cd92db9e2ffde9269b Mon Sep 17 00:00:00 2001
From: Gabriel <g2p.code@gmail.com>
Date: Tue, 8 May 2012 22:26:53 +0200
Subject: [PATCH] Second side-effect of relaxed: normalise non-compliant
 spacing.

Test this, both in strict and relaxed mode.
---
 rfc6266.py      | 22 +++++++++++++++++++---
 test_rfc6266.py | 12 ++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/rfc6266.py b/rfc6266.py
index f950571..15b2a61 100644
--- a/rfc6266.py
+++ b/rfc6266.py
@@ -199,11 +199,23 @@ def parse_headers(content_disposition, location=None, relaxed=False):
     # remove CR and LF even if they aren't part of a CRLF.
     # However http doesn't allow isolated CR and LF in headers outside
     # of LWS.
-    assert is_lws_safe(content_disposition)
 
     if relaxed:
+        # Relaxed has two effects (so far):
+        # - the grammar allows a final ';' in the header;
+        # - instead of rejecting non-lws-safe text, every run of
+        #   whitespace is collapsed to one space (LWS-folding included).
+        # XXX Would prefer to accept only the quoted whitespace
+        # case, rather than normalising everything.
+        content_disposition = normalize_ws(content_disposition)
         parser = content_disposition_value_relaxed
     else:
+        # Real-world headers occasionally fail this check, e.g. with two
+        # spaces inside a quoted_string's qdtext. Firefox and Chrome keep both.
+        if not is_lws_safe(content_disposition):
+            raise ValueError(
+                content_disposition, 'Contains nonstandard whitespace')
+
         parser = content_disposition_value
 
     try:
@@ -366,7 +378,11 @@ def fits_inside_codec(text, codec):
 
 
 def is_lws_safe(text):
-    return ' '.join(text.split()) == text
+    return normalize_ws(text) == text
+
+
+def normalize_ws(text):
+    return ' '.join(text.split())
 
 
 def qd_quote(text):
@@ -378,7 +394,7 @@ def build_header(
 ):
     """Generate a Content-Disposition header for a given filename.
 
-    For legacy clients that don't understant the filename* parameter,
+    For legacy clients that don't understand the filename* parameter,
     a filename_compat value may be given.
     It should either be ascii-only (recommended) or iso-8859-1 only.
     In the later case it should be a character string
diff --git a/test_rfc6266.py b/test_rfc6266.py
index 9b8ffdc..a6b7d52 100644
--- a/test_rfc6266.py
+++ b/test_rfc6266.py
@@ -60,6 +60,13 @@ def test_strict():
     # Trailing ; means the header is rejected
     assert parse_headers('attachment;').disposition == 'inline'
     assert parse_headers('attachment; key=val;').disposition == 'inline'
+    try:
+        cd = parse_headers(
+            'attachment; filename="spa  ced";')
+    except ValueError:
+        assert True
+    else:
+        assert False, cd
 
 
 def test_relaxed():
@@ -67,6 +74,11 @@ def test_relaxed():
         'attachment;', relaxed=True).disposition == 'attachment'
     assert parse_headers(
         'attachment; key=val;', relaxed=True).disposition == 'attachment'
+    cd = parse_headers(
+        'attachment; filename="spa  ced";',
+        relaxed=True)
+    assert cd.filename_unsafe == u'spa ced'
+
 
 
 def test_roundtrip():