Fixed #18702 -- Removed chunked reads from QuerySet iteration
akaariai committed May 21, 2013
1 parent ea9a085 · commit 7067924
Showing 3 changed files with 56 additions and 156 deletions.
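In short, the old lazy protocol (a self._iter generator plus _fill_cache() topping up the result cache in chunks of ITER_CHUNK_SIZE rows) is replaced by a single _fetch_all() that runs the query once and converts every fetched row to a model instance up front. A hedged sketch of the user-visible difference, reusing the Number test model that appears in this commit's test changes:

    # Illustrative only; Number is a test model from tests/queries in this
    # commit.
    qs = Number.objects.all()
    for i, obj in enumerate(qs):
        if i > 10:
            break
    # Django 1.5: the loop above converted only the first chunk of 100 rows
    # (ITER_CHUNK_SIZE) to Number instances before breaking out.
    # After this commit: starting iteration calls _fetch_all(), so every
    # fetched row is already a Number instance by the loop's first pass.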
django/db/models/query.py (29 additions & 120 deletions)
@@ -20,11 +20,6 @@
 from django.utils import six
 from django.utils import timezone

-# Used to control how many objects are worked with at once in some cases (e.g.
-# when deleting objects).
-CHUNK_SIZE = 100
-ITER_CHUNK_SIZE = CHUNK_SIZE
-
 # The maximum number of items to display in a QuerySet.__repr__
 REPR_OUTPUT_SIZE = 20

@@ -41,7 +36,6 @@ def __init__(self, model=None, query=None, using=None):
         self._db = using
         self.query = query or sql.Query(self.model)
         self._result_cache = None
-        self._iter = None
         self._sticky_filter = False
         self._for_write = False
         self._prefetch_related_lookups = []
@@ -57,8 +51,8 @@ def __deepcopy__(self, memo):
         Deep copy of a QuerySet doesn't populate the cache
         """
         obj = self.__class__()
-        for k,v in self.__dict__.items():
-            if k in ('_iter','_result_cache'):
+        for k, v in self.__dict__.items():
+            if k == '_result_cache':
                 obj.__dict__[k] = None
             else:
                 obj.__dict__[k] = copy.deepcopy(v, memo)
@@ -69,10 +63,8 @@ def __getstate__(self):
         Allows the QuerySet to be pickled.
         """
         # Force the cache to be fully populated.
-        len(self)
-
+        self._fetch_all()
         obj_dict = self.__dict__.copy()
-        obj_dict['_iter'] = None
         return obj_dict

@@ -82,95 +74,31 @@ def __repr__(self):
         return repr(data)

     def __len__(self):
-        # Since __len__ is called quite frequently (for example, as part of
-        # list(qs), we make some effort here to be as efficient as possible
-        # whilst not messing up any existing iterators against the QuerySet.
-        if self._result_cache is None:
-            if self._iter:
-                self._result_cache = list(self._iter)
-            else:
-                self._result_cache = list(self.iterator())
-        elif self._iter:
-            self._result_cache.extend(self._iter)
-        if self._prefetch_related_lookups and not self._prefetch_done:
-            self._prefetch_related_objects()
+        self._fetch_all()
         return len(self._result_cache)

     def __iter__(self):
-        if self._prefetch_related_lookups and not self._prefetch_done:
-            # We need all the results in order to be able to do the prefetch
-            # in one go. To minimize code duplication, we use the __len__
-            # code path which also forces this, and also does the prefetch
-            len(self)
-
-        if self._result_cache is None:
-            self._iter = self.iterator()
-            self._result_cache = []
-        if self._iter:
-            return self._result_iter()
-        # Python's list iterator is better than our version when we're just
-        # iterating over the cache.
+        """
+        The queryset iterator protocol uses three nested iterators in the
+        default case:
+            1. sql.compiler:execute_sql()
+               - Returns 100 rows at a time (constants.GET_ITERATOR_CHUNK_SIZE)
+                 using cursor.fetchmany(). This part is responsible for
+                 doing some column masking, and returning the rows in chunks.
+            2. sql/compiler.results_iter()
+               - Returns one row at a time. At this point the rows are still
+                 just tuples. In some cases the return values are converted to
+                 Python values at this location (see resolve_columns(),
+                 resolve_aggregate()).
+            3. self.iterator()
+               - Responsible for turning the rows into model objects.
+        """
+        self._fetch_all()
         return iter(self._result_cache)

-    def _result_iter(self):
-        pos = 0
-        while 1:
-            upper = len(self._result_cache)
-            while pos < upper:
-                yield self._result_cache[pos]
-                pos = pos + 1
-            if not self._iter:
-                raise StopIteration
-            if len(self._result_cache) <= pos:
-                self._fill_cache()
-
-    def __bool__(self):
-        if self._prefetch_related_lookups and not self._prefetch_done:
-            # We need all the results in order to be able to do the prefetch
-            # in one go. To minimize code duplication, we use the __len__
-            # code path which also forces this, and also does the prefetch
-            len(self)
-
-        if self._result_cache is not None:
-            return bool(self._result_cache)
-        try:
-            next(iter(self))
-        except StopIteration:
-            return False
-        return True
-
-    def __nonzero__(self):      # Python 2 compatibility
-        return type(self).__bool__(self)
-
-    def __contains__(self, val):
-        # The 'in' operator works without this method, due to __iter__. This
-        # implementation exists only to shortcut the creation of Model
-        # instances, by bailing out early if we find a matching element.
-        pos = 0
-        if self._result_cache is not None:
-            if val in self._result_cache:
-                return True
-            elif self._iter is None:
-                # iterator is exhausted, so we have our answer
-                return False
-            # remember not to check these again:
-            pos = len(self._result_cache)
-        else:
-            # We need to start filling the result cache out. The following
-            # ensures that self._iter is not None and self._result_cache is
-            # not None
-            it = iter(self)
-
-        # Carry on, one result at a time.
-        while True:
-            if len(self._result_cache) <= pos:
-                self._fill_cache(num=1)
-            if self._iter is None:
-                # we ran out of items
-                return False
-            if self._result_cache[pos] == val:
-                return True
-            pos += 1
+    def __nonzero__(self):
+        self._fetch_all()
+        return bool(self._result_cache)

     def __getitem__(self, k):
         """
@@ -184,19 +112,6 @@ def __getitem__(self, k):
                "Negative indexing is not supported."

         if self._result_cache is not None:
-            if self._iter is not None:
-                # The result cache has only been partially populated, so we may
-                # need to fill it out a bit more.
-                if isinstance(k, slice):
-                    if k.stop is not None:
-                        # Some people insist on passing in strings here.
-                        bound = int(k.stop)
-                    else:
-                        bound = None
-                else:
-                    bound = k + 1
-                if len(self._result_cache) < bound:
-                    self._fill_cache(bound - len(self._result_cache))
             return self._result_cache[k]

         if isinstance(k, slice):
@@ -370,7 +285,7 @@ def count(self):
         If the QuerySet is already fully cached this simply returns the length
         of the cached results set to avoid multiple SELECT COUNT(*) calls.
         """
-        if self._result_cache is not None and not self._iter:
+        if self._result_cache is not None:
             return len(self._result_cache)

         return self.query.get_count(using=self.db)
@@ -933,17 +848,11 @@ def _clone(self, klass=None, setup=False, **kwargs):
             c._setup_query()
         return c

-    def _fill_cache(self, num=None):
-        """
-        Fills the result cache with 'num' more entries (or until the results
-        iterator is exhausted).
-        """
-        if self._iter:
-            try:
-                for i in range(num or ITER_CHUNK_SIZE):
-                    self._result_cache.append(next(self._iter))
-            except StopIteration:
-                self._iter = None
+    def _fetch_all(self):
+        if self._result_cache is None:
+            self._result_cache = list(self.iterator())
+        if self._prefetch_related_lookups and not self._prefetch_done:
+            self._prefetch_related_objects()

     def _next_is_sticky(self):
         """
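With _fill_cache() removed, _fetch_all() in the hunk above becomes the single code path behind __len__(), __iter__(), __nonzero__() and __getstate__(). A hedged sketch of the resulting invariant, using the Tag test model from this commit's tests:

    qs = Tag.objects.all()
    assert qs._result_cache is None      # still lazy: no query has run yet

    bool(qs)   # on Python 2 this hits __nonzero__; on Python 3, bool() falls
               # back to __len__ since __bool__ is gone; both call _fetch_all()
    assert qs._result_cache is not None
    # The cache now holds every row as a model instance, not just the first
    # chunk of 100 as in 1.5; count() reads the populated cache (see the
    # count() hunk above).
    assert len(qs._result_cache) == qs.count()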
docs/releases/1.6.txt (19 additions & 0 deletions)
@@ -524,6 +524,25 @@ non-standard behavior has been preserved but moved to the model form field layer
 and occurs only when the associated widget is
 :class:`~django.forms.SelectMultiple` or a subclass.

+QuerySet iteration
+~~~~~~~~~~~~~~~~~~
+
+``QuerySet`` iteration was changed to immediately convert all fetched rows
+to ``Model`` objects. In Django 1.5 and earlier, the fetched rows were
+converted to ``Model`` objects in chunks of 100.
+
+Existing code will work, but the number of rows converted to objects may
+change in certain use cases, such as partially looping over a queryset or
+any usage that ends up calling ``__bool__`` or ``__contains__``.
+
+Notably, most database backends already fetched all the rows in one go in
+1.5.
+
+It is still possible to convert the fetched rows to ``Model`` objects
+lazily by using the :meth:`~django.db.models.query.QuerySet.iterator()`
+method.
+
 Miscellaneous
 ~~~~~~~~~~~~~

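To make the iterator() note in the hunk above concrete, a hedged illustration of the two iteration styles (Entry and process() are placeholders, not names from this commit):

    # Default after this change: one query, every fetched row converted to
    # a model instance up front and cached on the QuerySet.
    for entry in Entry.objects.all():
        process(entry)

    # QuerySet.iterator() bypasses the result cache and converts rows to
    # model instances as they are read, keeping memory usage flat for large
    # result sets (subject to the backend row fetching noted above).
    for entry in Entry.objects.all().iterator():
        process(entry)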
tests/queries/tests.py (8 additions & 36 deletions)
@@ -9,7 +9,6 @@
 from django.core.exceptions import FieldError
 from django.db import DatabaseError, connection, connections, DEFAULT_DB_ALIAS
 from django.db.models import Count, F, Q
-from django.db.models.query import ITER_CHUNK_SIZE
 from django.db.models.sql.where import WhereNode, EverythingNode, NothingNode
 from django.db.models.sql.datastructures import EmptyResultSet
 from django.test import TestCase, skipUnlessDBFeature
@@ -1211,16 +1210,6 @@ def test_ticket12239(self):
             ordered=False
         )

-    def test_ticket7411(self):
-        # Saving to db must work even with partially read result set in
-        # another cursor.
-        for num in range(2 * ITER_CHUNK_SIZE + 1):
-            _ = Number.objects.create(num=num)
-
-        for i, obj in enumerate(Number.objects.all()):
-            obj.save()
-            if i > 10: break
-
     def test_ticket7759(self):
         # Count should work with a partially read result set.
         count = Number.objects.count()
@@ -1700,31 +1689,6 @@ def setUp(self):
         ann1.notes.add(n1)
         ann2 = Annotation.objects.create(name='a2', tag=t4)

-    # This next test used to cause really weird PostgreSQL behavior, but it was
-    # only apparent much later when the full test suite ran.
-    # - Yeah, it leaves global ITER_CHUNK_SIZE to 2 instead of 100...
-    #@unittest.expectedFailure
-    def test_slicing_and_cache_interaction(self):
-        # We can do slicing beyond what is currently in the result cache,
-        # too.
-
-        # We need to mess with the implementation internals a bit here to
-        # decrease the cache fill size so that we don't read all the results
-        # at once.
-        from django.db.models import query
-        query.ITER_CHUNK_SIZE = 2
-        qs = Tag.objects.all()
-
-        # Fill the cache with the first chunk.
-        self.assertTrue(bool(qs))
-        self.assertEqual(len(qs._result_cache), 2)
-
-        # Query beyond the end of the cache and check that it is filled out
-        # as required.
-        self.assertEqual(repr(qs[4]), '<Tag: t5>')
-        self.assertEqual(len(qs._result_cache), 5)
-
-        # But querying beyond the end of the result set will fail.
-        self.assertRaises(IndexError, lambda: qs[100])
-
     def test_parallel_iterators(self):
         # Test that parallel iterators work.
         qs = Tag.objects.all()
@@ -2533,6 +2497,14 @@ def test_empty_nodes(self):
         w = WhereNode(children=[empty_w, NothingNode()], connector='OR')
         self.assertRaises(EmptyResultSet, w.as_sql, qn, connection)

+
+class IteratorExceptionsTest(TestCase):
+    def test_iter_exceptions(self):
+        qs = ExtraInfo.objects.only('author')
+        with self.assertRaises(AttributeError):
+            list(qs)
+
+
 class NullJoinPromotionOrTest(TestCase):
     def setUp(self):
         self.d1 = ModelD.objects.create(name='foo')
