-
Notifications
You must be signed in to change notification settings - Fork 262
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Mjl index speedup #213
Mjl index speedup #213
Changes from 21 commits
b4920c9
e5cd2df
861f704
827800d
1773ed6
acda7e7
8279746
c4f230d
1dcc013
505406e
93f7d7c
c4a8a24
2d2e996
3b69009
f613926
51360c3
475e8a8
45f62e7
c933fb7
96c3883
9c138d6
9ec41a0
5086396
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
@@ -1,9 +1,13 @@ | ||||
from __future__ import unicode_literals | ||||
|
||||
from django.core.paginator import Paginator | ||||
from collections import deque | ||||
from copy import deepcopy | ||||
from functools import partial | ||||
|
||||
from django import VERSION as DJANGO_VERSION | ||||
from django.db import models | ||||
from django.utils.six import iteritems | ||||
from elasticsearch.helpers import bulk | ||||
from elasticsearch.helpers import bulk, parallel_bulk | ||||
from elasticsearch_dsl import Document as DSLDocument | ||||
|
||||
from .exceptions import ModelFieldNotMappedError | ||||
|
@@ -45,11 +49,12 @@ | |||
models.URLField: TextField, | ||||
} | ||||
|
||||
|
||||
class DocType(DSLDocument): | ||||
_prepared_fields = [] | ||||
def __init__(self, related_instance_to_ignore=None, **kwargs): | ||||
super(DocType, self).__init__(**kwargs) | ||||
self._related_instance_to_ignore = related_instance_to_ignore | ||||
self._prepared_fields = self.init_prepare() | ||||
|
||||
def __eq__(self, other): | ||||
return id(self) == id(other) | ||||
|
@@ -70,39 +75,57 @@ def get_queryset(self): | |||
""" | ||||
Return the queryset that should be indexed by this doc type. | ||||
""" | ||||
primary_key_field_name = self.django.model._meta.pk.name | ||||
return self.django.model._default_manager.all().order_by(primary_key_field_name) | ||||
return self.django.model._default_manager.all() | ||||
|
||||
def prepare(self, instance): | ||||
def get_indexing_queryset(self): | ||||
""" | ||||
Take a model instance, and turn it into a dict that can be serialized | ||||
based on the fields defined on this DocType subclass | ||||
Build queryset (iterator) for use by indexing. | ||||
""" | ||||
qs = self.get_queryset() | ||||
kwargs = {} | ||||
if DJANGO_VERSION >= (2,) and self.django.queryset_pagination: | ||||
kwargs = {'chunk_size': self.django.queryset_pagination} | ||||
return qs.iterator(**kwargs) | ||||
|
||||
def init_prepare(self): | ||||
""" | ||||
Initialise the data model preparers once here. Extracts the preparers | ||||
from the model and generate a list of callables to avoid doing that | ||||
work on every object instance over. | ||||
""" | ||||
data = {} | ||||
for name, field in iteritems(self._fields): | ||||
index_fields = getattr(self, '_fields', {}) | ||||
fields = [] | ||||
for name, field in iteritems(index_fields): | ||||
if not isinstance(field, DEDField): | ||||
continue | ||||
|
||||
if field._path == []: | ||||
if not field._path: | ||||
field._path = [name] | ||||
|
||||
prep_func = getattr(self, 'prepare_%s_with_related' % name, None) | ||||
if prep_func: | ||||
field_value = prep_func( | ||||
instance, | ||||
related_to_ignore=self._related_instance_to_ignore | ||||
) | ||||
fn = partial(prep_func, related_to_ignore=self._related_instance_to_ignore) | ||||
else: | ||||
prep_func = getattr(self, 'prepare_%s' % name, None) | ||||
if prep_func: | ||||
field_value = prep_func(instance) | ||||
fn = prep_func | ||||
else: | ||||
field_value = field.get_value_from_instance( | ||||
instance, self._related_instance_to_ignore | ||||
) | ||||
fn = partial(field.get_value_from_instance, field_value_to_ignore=self._related_instance_to_ignore) | ||||
|
||||
data[name] = field_value | ||||
fields.append((name, field, fn)) | ||||
|
||||
return fields | ||||
|
||||
def prepare(self, instance): | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we need to change the prepare function? |
||||
""" | ||||
Take a model instance, and turn it into a dict that can be serialized | ||||
based on the fields defined on this DocType subclass | ||||
""" | ||||
data = { | ||||
name: prep_func(instance) | ||||
for name, field, prep_func in self._prepared_fields | ||||
} | ||||
# print("-> %s" % data) | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you remove this?
Suggested change
|
||||
return data | ||||
|
||||
@classmethod | ||||
|
@@ -124,6 +147,17 @@ def to_field(cls, field_name, model_field): | |||
def bulk(self, actions, **kwargs): | ||||
return bulk(client=self._get_connection(), actions=actions, **kwargs) | ||||
|
||||
def parallel_bulk(self, actions, **kwargs): | ||||
if self.django.queryset_pagination and 'chunk_size' not in kwargs: | ||||
kwargs['chunk_size'] = self.django.queryset_pagination | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fix the indentation. |
||||
bulk_actions = parallel_bulk(client=self._get_connection(), actions=actions, **kwargs) | ||||
# As the `parallel_bulk` is lazy, we need to get it into `deque` to run it instantly | ||||
# See https://discuss.elastic.co/t/helpers-parallel-bulk-in-python-not-working/39498/2 | ||||
deque(bulk_actions, maxlen=0) | ||||
# Fake return value to emulate bulk() since we don't have a result yet, | ||||
# the result is currently not used upstream anyway. | ||||
return (1, []) | ||||
|
||||
def _prepare_action(self, object_instance, action): | ||||
return { | ||||
'_op_type': action, | ||||
|
@@ -135,18 +169,18 @@ def _prepare_action(self, object_instance, action): | |||
} | ||||
|
||||
def _get_actions(self, object_list, action): | ||||
if self.django.queryset_pagination is not None: | ||||
paginator = Paginator( | ||||
object_list, self.django.queryset_pagination | ||||
) | ||||
for page in paginator.page_range: | ||||
for object_instance in paginator.page(page).object_list: | ||||
yield self._prepare_action(object_instance, action) | ||||
for object_instance in object_list: | ||||
yield self._prepare_action(object_instance, action) | ||||
|
||||
def _bulk(self, *args, **kwargs): | ||||
"""Helper for switching between normal and parallel bulk operation""" | ||||
parallel = kwargs.pop('parallel', False) | ||||
if parallel: | ||||
return self.parallel_bulk(*args, **kwargs) | ||||
else: | ||||
for object_instance in object_list: | ||||
yield self._prepare_action(object_instance, action) | ||||
return self.bulk(*args, **kwargs) | ||||
|
||||
def update(self, thing, refresh=None, action='index', **kwargs): | ||||
def update(self, thing, refresh=None, action='index', parallel=False, **kwargs): | ||||
""" | ||||
Update each document in ES for a model, iterable of models or queryset | ||||
""" | ||||
|
@@ -160,8 +194,10 @@ def update(self, thing, refresh=None, action='index', **kwargs): | |||
else: | ||||
object_list = thing | ||||
|
||||
return self.bulk( | ||||
self._get_actions(object_list, action), **kwargs | ||||
return self._bulk( | ||||
self._get_actions(object_list, action), | ||||
parallel=parallel, | ||||
**kwargs | ||||
) | ||||
|
||||
|
||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we need to handle the use case of people who are using
Django < 2.
They should be able to paginate the queryset like before.