Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
Browse files

improve elasticsearch configuration

- no stemming on fields where this is irrelevant (in particular author
  names)
- multivaluefields changed to contain integers when appropriate
- indexing and analyzing disabled where appropriate
  • Loading branch information
a3nm committed Jul 8, 2017
1 parent 94adb9f commit 391aba0cbe0b6622164648c0ee40b60ce47e4f0d
Showing with 25 additions and 2 deletions.
  1. +6 −2 papers/search_indexes.py
  2. +19 −0 search/__init__.py
@@ -3,6 +3,10 @@

from .models import Paper

# from https://github.com/django-haystack/django-haystack/issues/204#issuecomment-544579
class IntegerMultiValueField(indexes.MultiValueField):
    """Multi-value index field whose declared element type is integer.

    Identical to ``indexes.MultiValueField`` apart from the ``field_type``
    override below. Intended for list-of-ID fields such as researcher or
    institution identifiers.
    """

    # NOTE(review): presumably read by the search backend when it builds the
    # index schema -- confirm against the haystack version in use.
    field_type = 'integer'

class PaperIndex(indexes.SearchIndex, indexes.Indexable):
text = indexes.CharField(document=True, model_attr='title')
pubdate = indexes.DateField(model_attr='pubdate')
@@ -17,10 +21,10 @@ class PaperIndex(indexes.SearchIndex, indexes.Indexable):
authors_last = indexes.MultiValueField()

#: IDs of researchers
researchers = indexes.MultiValueField()
researchers = IntegerMultiValueField()

#: IDs of institutions of researchers
institutions = indexes.MultiValueField()
institutions = IntegerMultiValueField()

#: ID of publisher
publisher = indexes.IntegerField(null=True)
@@ -17,6 +17,25 @@ def build_search_kwargs(self, query_string, extra=None, *args, **kwargs):
kwargs.update(extra)
return kwargs

def build_schema(self, fields):
    """Build the index schema, then adjust the analyzer of selected fields.

    Delegates to the parent backend's ``build_schema`` and post-processes the
    returned mapping in place:

    * author-name fields get the ``standard`` analyzer instead of the
      default one, so that names are not stemmed;
    * enumeration-like status/type fields are marked ``not_analyzed`` and
      lose their ``analyzer`` entry.

    Fields whose mapping has no ``"analyzer"`` key are left untouched.
    Inspired by
    https://github.com/django-haystack/django-haystack/issues/621#issuecomment-10833143

    :param fields: field definitions, passed through unchanged to the parent.
    :returns: ``(content_field_name, mapping)`` as produced by the parent,
        with the adjustments above applied to ``mapping``.
    """
    content_field_name, mapping = super(SearchBackend, self).build_schema(fields)

    # Author names: stemming (snowball analyzer) is unwanted here.
    unstemmed_fields = ("authors_full", "authors_last")
    # Closed vocabularies: values should be matched as-is, never analyzed.
    verbatim_fields = ("availability", "oa_status", "combined_status", "doctype")

    for name, spec in mapping.items():
        if "analyzer" in spec:
            if name in unstemmed_fields:
                # The standard analyzer does not perform stemming.
                spec["analyzer"] = "standard"
            if name in verbatim_fields:
                # Turn analysis off entirely and drop the now-unused analyzer.
                spec["index"] = "not_analyzed"
                del spec["analyzer"]

    return content_field_name, mapping

def _process_results(self, raw_results, **kwargs):
results = super(SearchBackend, self)._process_results(
raw_results, **kwargs)

0 comments on commit 391aba0

Please sign in to comment.