Skip to content
This repository has been archived by the owner on Feb 27, 2021. It is now read-only.

Commit

Permalink
improve elasticsearch configuration
Browse files Browse the repository at this point in the history
- no stemming on fields where this is irrelevant (in particular author
  names)
- MultiValueFields changed to contain integers when appropriate
- indexing and analyzing disabled where appropriate
  • Loading branch information
a3nm committed Jul 8, 2017
1 parent 94adb9f commit 391aba0
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 2 deletions.
8 changes: 6 additions & 2 deletions papers/search_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@

from .models import Paper

# from https://github.com/django-haystack/django-haystack/issues/204#issuecomment-544579
class IntegerMultiValueField(indexes.MultiValueField):
    """A ``MultiValueField`` variant that declares its values as integers.

    The only difference from the parent class is the ``field_type``
    attribute, which is set to ``'integer'``.
    """

    # NOTE(review): presumably the search backend reads field_type to pick
    # the mapping type for this field -- confirm against the haystack
    # backend in use.
    field_type = 'integer'

class PaperIndex(indexes.SearchIndex, indexes.Indexable):
text = indexes.CharField(document=True, model_attr='title')
pubdate = indexes.DateField(model_attr='pubdate')
Expand All @@ -17,10 +21,10 @@ class PaperIndex(indexes.SearchIndex, indexes.Indexable):
authors_last = indexes.MultiValueField()

#: IDs of researchers
researchers = indexes.MultiValueField()
researchers = IntegerMultiValueField()

#: IDs of institutions of researchers
institutions = indexes.MultiValueField()
institutions = IntegerMultiValueField()

#: ID of publisher
publisher = indexes.IntegerField(null=True)
Expand Down
19 changes: 19 additions & 0 deletions search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,25 @@ def build_search_kwargs(self, query_string, extra=None, *args, **kwargs):
kwargs.update(extra)
return kwargs

def build_schema(self, fields):
    """Build the schema via the parent backend, then adjust some analyzers.

    The parent's ``build_schema`` result is post-processed so that:

    * author-name fields use the ``standard`` analyzer instead of the
      default one, so that no stemming is applied to names;
    * status-like / document-type fields are flagged ``not_analyzed`` and
      lose their ``analyzer`` entry altogether.

    Fields whose mapping carries no ``analyzer`` key are left untouched.

    Inspired by
    https://github.com/django-haystack/django-haystack/issues/621#issuecomment-10833143

    :param fields: passed through unchanged to the parent ``build_schema``.
    :returns: a ``(content_field_name, mapping)`` pair, where ``mapping``
        is the parent's mapping with the adjustments above applied in place.
    """
    # Fields holding author names: stemming makes no sense on these.
    unstemmed_fields = ("authors_full", "authors_last")
    # Fields whose values should not go through any analyzer.
    unanalyzed_fields = (
        "availability", "oa_status", "combined_status", "doctype",
    )

    schema = super(SearchBackend, self).build_schema(fields)
    content_field_name, mapping = schema

    for name, options in mapping.items():
        if "analyzer" in options:
            if name in unstemmed_fields:
                # swap the default (snowball) analyzer for the standard
                # one, which does not stem
                options["analyzer"] = "standard"
            if name in unanalyzed_fields:
                # an analyzer setting is pointless once the field is
                # marked as not analyzed, so drop it
                options["index"] = "not_analyzed"
                del options["analyzer"]

    return content_field_name, mapping

def _process_results(self, raw_results, **kwargs):
results = super(SearchBackend, self)._process_results(
raw_results, **kwargs)
Expand Down

0 comments on commit 391aba0

Please sign in to comment.