Fixed #9886 -- Added a file-like interface to HttpRequest. Thanks to Ivan Sagalaev for the suggestion and patch.

git-svn-id: http://code.djangoproject.com/svn/django/trunk@14394 bcc190cf-cafb-0310-a4f2-bffc1f526a37
django · Oct 29, 2010 · 269e921 · 269e921
1 parent 3086b55
commit 269e921
Show file tree

Hide file tree

Showing 5 changed files with 236 additions and 95 deletions.
diff --git a/django/core/handlers/modpython.py b/django/core/handlers/modpython.py
@@ -42,6 +42,8 @@ def __init__(self, req):
             # naughty, but also pretty harmless.
             self.path_info = u'/'
         self._post_parse_error = False
+        self._stream = self._req
+        self._read_started = False
 
     def __repr__(self):
         # Since this is called as part of error handling, we need to be very
@@ -81,26 +83,6 @@ def is_secure(self):
             # mod_python < 3.2.10 doesn't have req.is_https().
             return self._req.subprocess_env.get('HTTPS', '').lower() in ('on', '1')
 
-    def _load_post_and_files(self):
-        "Populates self._post and self._files"
-        if self.method != 'POST':
-            self._post, self._files = http.QueryDict('', encoding=self._encoding), datastructures.MultiValueDict()
-            return
-
-        if 'content-type' in self._req.headers_in and self._req.headers_in['content-type'].startswith('multipart'):
-            self._raw_post_data = ''
-            try:
-                self._post, self._files = self.parse_file_upload(self.META, self._req)
-            except:
-                # See django.core.handlers.wsgi.WSGIHandler for an explanation
-                # of what's going on here.
-                self._post = http.QueryDict('')
-                self._files = datastructures.MultiValueDict()
-                self._post_parse_error = True
-                raise
-        else:
-            self._post, self._files = http.QueryDict(self.raw_post_data, encoding=self._encoding), datastructures.MultiValueDict()
-
     def _get_request(self):
         if not hasattr(self, '_request'):
             self._request = datastructures.MergeDict(self.POST, self.GET)
@@ -162,13 +144,6 @@ def _get_meta(self):
                 self._meta[key] = value
         return self._meta
 
-    def _get_raw_post_data(self):
-        try:
-            return self._raw_post_data
-        except AttributeError:
-            self._raw_post_data = self._req.read()
-            return self._raw_post_data
-
     def _get_method(self):
         return self.META['REQUEST_METHOD'].upper()
 
@@ -178,7 +153,6 @@ def _get_method(self):
     FILES = property(_get_files)
     META = property(_get_meta)
     REQUEST = property(_get_request)
-    raw_post_data = property(_get_raw_post_data)
     method = property(_get_method)
 
 class ModPythonHandler(BaseHandler):

diff --git a/django/core/handlers/wsgi.py b/django/core/handlers/wsgi.py
@@ -5,6 +5,7 @@
     from cStringIO import StringIO
 except ImportError:
     from StringIO import StringIO
+import socket
 
 from django import http
 from django.core import signals
@@ -62,20 +63,55 @@
     505: 'HTTP VERSION NOT SUPPORTED',
 }
 
-def safe_copyfileobj(fsrc, fdst, length=16*1024, size=0):
+class LimitedStream(object):
-    """
+    '''
-    A version of shutil.copyfileobj that will not read more than 'size' bytes.
+    LimitedStream wraps another stream in order to not allow reading from it
-    This makes it safe from clients sending more than CONTENT_LENGTH bytes of
+    past specified amount of bytes.
-    data in the body.
+    '''
-    """
+    def __init__(self, stream, limit, buf_size=64 * 1024 * 1024):
-    if not size:
+        self.stream = stream
-        return
+        self.remaining = limit
-    while size > 0:
+        self.buffer = ''
-        buf = fsrc.read(min(length, size))
+        self.buf_size = buf_size
-        if not buf:
+
-            break
+    def _read_limited(self, size=None):
-        fdst.write(buf)
+        if size is None or size > self.remaining:
-        size -= len(buf)
+            size = self.remaining
+        if size == 0:
+            return ''
+        result = self.stream.read(size)
+        self.remaining -= len(result)
+        return result
+
+    def read(self, size=None):
+        if size is None:
+            result = self.buffer + self._read_limited()
+            self.buffer = ''
+        elif size < len(self.buffer):
+            result = self.buffer[:size]
+            self.buffer = self.buffer[size:]
+        else: # size >= len(self.buffer)
+            result = self.buffer + self._read_limited(size - len(self.buffer))
+            self.buffer = ''
+        return result
+
+    def readline(self, size=None):
+        while '\n' not in self.buffer or \
+              (size is not None and len(self.buffer) < size):
+            if size:
+                chunk = self._read_limited(size - len(self.buffer))
+            else:
+                chunk = self._read_limited()
+            if not chunk:
+                break
+            self.buffer += chunk
+        sio = StringIO(self.buffer)
+        if size:
+            line = sio.readline(size)
+        else:
+            line = sio.readline()
+        self.buffer = sio.read()
+        return line
 
 class WSGIRequest(http.HttpRequest):
     def __init__(self, environ):
@@ -98,6 +134,24 @@ def __init__(self, environ):
         self.META['SCRIPT_NAME'] = script_name
         self.method = environ['REQUEST_METHOD'].upper()
         self._post_parse_error = False
+        if isinstance(self.environ['wsgi.input'], socket._fileobject):
+            # Under development server 'wsgi.input' is an instance of
+            # socket._fileobject which hangs indefinitely on reading bytes past
+            # available count. To prevent this it's wrapped in LimitedStream
+            # that doesn't read past Content-Length bytes.
+            #
+            # This is not done for other kinds of inputs (like flup's FastCGI
+            # streams) because they don't suffer from this problem and we can
+            # avoid using another wrapper with its own .read and .readline
+            # implementation.
+            try:
+                content_length = int(self.environ.get('CONTENT_LENGTH', 0))
+            except (ValueError, TypeError):
+                content_length = 0
+            self._stream = LimitedStream(self.environ['wsgi.input'], content_length)
+        else:
+            self._stream = self.environ['wsgi.input']
+        self._read_started = False
 
     def __repr__(self):
         # Since this is called as part of error handling, we need to be very
@@ -133,30 +187,6 @@ def is_secure(self):
         return 'wsgi.url_scheme' in self.environ \
             and self.environ['wsgi.url_scheme'] == 'https'
 
-    def _load_post_and_files(self):
-        # Populates self._post and self._files
-        if self.method == 'POST':
-            if self.environ.get('CONTENT_TYPE', '').startswith('multipart'):
-                self._raw_post_data = ''
-                try:
-                    self._post, self._files = self.parse_file_upload(self.META, self.environ['wsgi.input'])
-                except:
-                    # An error occured while parsing POST data.  Since when
-                    # formatting the error the request handler might access
-                    # self.POST, set self._post and self._file to prevent
-                    # attempts to parse POST data again.
-                    self._post = http.QueryDict('')
-                    self._files = datastructures.MultiValueDict()
-                    # Mark that an error occured.  This allows self.__repr__ to
-                    # be explicit about it instead of simply representing an
-                    # empty POST
-                    self._post_parse_error = True
-                    raise
-            else:
-                self._post, self._files = http.QueryDict(self.raw_post_data, encoding=self._encoding), datastructures.MultiValueDict()
-        else:
-            self._post, self._files = http.QueryDict('', encoding=self._encoding), datastructures.MultiValueDict()
-
     def _get_request(self):
         if not hasattr(self, '_request'):
             self._request = datastructures.MergeDict(self.POST, self.GET)
@@ -192,32 +222,11 @@ def _get_files(self):
             self._load_post_and_files()
         return self._files
 
-    def _get_raw_post_data(self):
-        try:
-            return self._raw_post_data
-        except AttributeError:
-            buf = StringIO()
-            try:
-                # CONTENT_LENGTH might be absent if POST doesn't have content at all (lighttpd)
-                content_length = int(self.environ.get('CONTENT_LENGTH', 0))
-            except (ValueError, TypeError):
-                # If CONTENT_LENGTH was empty string or not an integer, don't
-                # error out. We've also seen None passed in here (against all
-                # specs, but see ticket #8259), so we handle TypeError as well.
-                content_length = 0
-            if content_length > 0:
-                safe_copyfileobj(self.environ['wsgi.input'], buf,
-                        size=content_length)
-            self._raw_post_data = buf.getvalue()
-            buf.close()
-            return self._raw_post_data
-
     GET = property(_get_get, _set_get)
     POST = property(_get_post, _set_post)
     COOKIES = property(_get_cookies, _set_cookies)
     FILES = property(_get_files)
     REQUEST = property(_get_request)
-    raw_post_data = property(_get_raw_post_data)
 
 class WSGIHandler(base.BaseHandler):
     initLock = Lock()

diff --git a/django/http/__init__.py b/django/http/__init__.py
@@ -6,6 +6,10 @@
 from pprint import pformat
 from urllib import urlencode
 from urlparse import urljoin
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from StringIO import StringIO
 try:
     # The mod_python version is more efficient, so try importing it first.
     from mod_python.util import parse_qsl
@@ -132,6 +136,73 @@ def parse_file_upload(self, META, post_data):
         parser = MultiPartParser(META, post_data, self.upload_handlers, self.encoding)
         return parser.parse()
 
+    def _get_raw_post_data(self):
+        if not hasattr(self, '_raw_post_data'):
+            if self._read_started:
+                raise Exception("You cannot access raw_post_data after reading from request's data stream")
+            self._raw_post_data = self.read()
+            self._stream = StringIO(self._raw_post_data)
+        return self._raw_post_data
+    raw_post_data = property(_get_raw_post_data)
+
+    def _mark_post_parse_error(self):
+        self._post = QueryDict('')
+        self._files = MultiValueDict()
+        self._post_parse_error = True
+
+    def _load_post_and_files(self):
+        # Populates self._post and self._files
+        if self.method != 'POST':
+            self._post, self._files = QueryDict('', encoding=self._encoding), MultiValueDict()
+            return
+        if self._read_started:
+            self._mark_post_parse_error()
+            return
+
+        if self.META.get('CONTENT_TYPE', '').startswith('multipart'):
+            self._raw_post_data = ''
+            try:
+                self._post, self._files = self.parse_file_upload(self.META, self)
+            except:
+                # An error occurred while parsing POST data. Since the
+                # request handler might access self.POST when formatting
+                # the error, set self._post and self._files to prevent
+                # attempts to parse POST data again.
+                # Mark that an error occurred. This allows self.__repr__ to
+                # be explicit about it instead of simply representing an
+                # empty POST.
+                self._mark_post_parse_error()
+                raise
+        else:
+            self._post, self._files = QueryDict(self.raw_post_data, encoding=self._encoding), MultiValueDict()
+
+    ## File-like and iterator interface.
+    ##
+    ## Expects self._stream to be set to an appropriate source of bytes by
+    ## a corresponding request subclass (WSGIRequest or ModPythonRequest).
+    ## Also when request data has already been read by request.POST or
+    ## request.raw_post_data, self._stream points to a StringIO instance
+    ## containing that data.
+
+    def read(self, *args, **kwargs):
+        self._read_started = True
+        return self._stream.read(*args, **kwargs)
+
+    def readline(self, *args, **kwargs):
+        self._read_started = True
+        return self._stream.readline(*args, **kwargs)
+
+    def xreadlines(self):
+        while True:
+            buf = self.readline()
+            if not buf:
+                break
+            yield buf
+    __iter__ = xreadlines
+
+    def readlines(self):
+        return list(iter(self))
+
 class QueryDict(MultiValueDict):
     """
     A specialized MultiValueDict that takes a query string when initialized.
@@ -198,7 +269,7 @@ def __deepcopy__(self, memo):
         for key, value in dict.items(self):
             dict.__setitem__(result, copy.deepcopy(key, memo), copy.deepcopy(value, memo))
         return result
-    
+
     def setlist(self, key, list_):
         self._assert_mutable()
         key = str_to_unicode(key, self.encoding)
@@ -385,7 +456,7 @@ def set_cookie(self, key, value='', max_age=None, expires=None, path='/',
         """
         Sets a cookie.
 
-        ``expires`` can be a string in the correct format or a 
+        ``expires`` can be a string in the correct format or a
         ``datetime.datetime`` object in UTC. If ``expires`` is a datetime
         object then ``max_age`` will be calculated.
         """
@@ -407,7 +478,7 @@ def set_cookie(self, key, value='', max_age=None, expires=None, path='/',
             # IE requires expires, so set it if hasn't been already.
             if not expires:
                 self.cookies[key]['expires'] = cookie_date(time.time() +
-                                                           max_age) 
+                                                           max_age)
         if path is not None:
             self.cookies[key]['path'] = path
         if domain is not None:

diff --git a/docs/ref/request-response.txt b/docs/ref/request-response.txt
@@ -189,8 +189,14 @@ All attributes except ``session`` should be considered read-only.
 
 .. attribute:: HttpRequest.raw_post_data
 
-    The raw HTTP POST data. This is only useful for advanced processing. Use
+    The raw HTTP POST data as a byte string. This is useful for processing
-    ``POST`` instead.
+    data in formats other than conventional HTML forms: binary images,
+    XML payloads, etc. For processing form data, use ``HttpRequest.POST``.
+
+    .. versionadded:: 1.3
+
+    You can also read from an HttpRequest using a file-like interface. See
+    :meth:`HttpRequest.read()`.
 
 .. attribute:: HttpRequest.urlconf
 
@@ -249,6 +255,27 @@ Methods
    If you write your own XMLHttpRequest call (on the browser side), you'll
    have to set this header manually if you want ``is_ajax()`` to work.
 
+.. method:: HttpRequest.read(size=None)
+.. method:: HttpRequest.readline()
+.. method:: HttpRequest.readlines()
+.. method:: HttpRequest.xreadlines()
+.. method:: HttpRequest.__iter__()
+
+    .. versionadded:: 1.3
+
+    Methods implementing a file-like interface for reading from an
+    HttpRequest instance. This makes it possible to consume an incoming
+    request in a streaming fashion. A common use-case would be to process a
+    big XML payload with an iterative parser without constructing a whole
+    XML tree in memory.
+
+    Given this standard interface, an HttpRequest instance can be
+    passed directly to an XML parser such as ElementTree::
+
+        import xml.etree.ElementTree as ET
+        for element in ET.iterparse(request):
+            process(element)
+
 
 QueryDict objects
 -----------------