Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Solr backend support for rich-content extraction

This allows indexes to use text extracted from binary files as well
as normal database content.

Note: requires a very recent pysolr - see https://github.com/acdha/pysolr/tree/rich-content-extraction
  • Loading branch information...
commit b612ab482ad436b5596f0a1846c30d540bddc760 1 parent eec3f78
@acdha acdha authored toastdriven committed
View
1  docs/index.rst
@@ -60,6 +60,7 @@ you may want to include in your application.
autocomplete
boost
multiple_index
+ rich_content_extraction
Reference
View
68 docs/rich_content_extraction.rst
@@ -0,0 +1,68 @@
+.. _ref-rich_content_extraction:
+
+=======================
+Rich Content Extraction
+=======================
+
+For some projects it is desirable to index text content which is stored in
+structured files such as PDFs, Microsoft Office documents, images, etc.
+Currently only Solr's `ExtractingRequestHandler`_ is directly supported by
+Haystack but the approach below could be used with any backend which supports
+this feature.
+
+.. _`ExtractingRequestHandler`: http://wiki.apache.org/solr/ExtractingRequestHandler
+
+Extracting Content
+==================
+
+:meth:`SearchBackend.extract_file_contents` accepts a file or file-like object
+and returns a dictionary containing two keys: ``metadata`` and ``contents``. The
+``contents`` value will be a string containing all of the text which the backend
+managed to extract from the file contents. ``metadata`` will always be a
+dictionary but the keys and values will vary based on the underlying extraction
+engine and the type of file provided.
+
+Indexing Extracted Content
+==========================
+
+Generally you will want to include the extracted text in your main document
+field along with everything else specified in your search template. This example
+shows how to override a hypothetical ``FileIndex``'s ``prepare`` method to
+include the extracted content along with information retrieved from the database::
+
+ def prepare(self, obj):
+ data = super(FileIndex, self).prepare(obj)
+
+ # This could also be a regular Python open() call, a StringIO instance
+ # or the result of opening a URL. Note that due to a library limitation
+ # file_obj must have a .name attribute even if you need to set one
+ # manually before calling extract_file_contents:
+ file_obj = obj.the_file.open()
+
+ extracted_data = self.backend.extract_file_contents(file_obj)
+
+ # Now we'll finally perform the template processing to render the
+ # text field with *all* of our metadata visible for templating:
+ t = loader.select_template(('search/indexes/myapp/file_text.txt', ))
+ data['text'] = t.render(Context({'object': obj,
+ 'extracted': extracted_data}))
+
+ return data
+
+This allows you to insert the extracted text at the appropriate place in your
+template, modified or intermixed with database content as appropriate:
+
+.. code-block:: html+django
+
+ {{ object.title }}
+ {{ object.owner.name }}
+
+ …
+
+ {% for k, v in extracted.metadata.items %}
+ {% for val in v %}
+ {{ k }}: {{ val|safe }}
+ {% endfor %}
+ {% endfor %}
+
+ {{ extracted.contents|striptags|safe }}
View
11 docs/searchbackend_api.rst
@@ -70,6 +70,17 @@ results the search backend found.
This method MUST be implemented by each backend, as it will be highly
specific to each one.
+``extract_file_contents``
+-------------------------
+
+.. method:: SearchBackend.extract_file_contents(self, file_obj)
+
+Perform text extraction on the provided file or file-like object. Returns either
+None or a dictionary containing the keys ``contents`` and ``metadata``. The
+``contents`` field will always contain the extracted text content returned by
+the underlying search engine but ``metadata`` may vary considerably based on
+the backend and the input file.
+
``prep_value``
--------------
View
17 haystack/backends/__init__.py
@@ -135,6 +135,23 @@ def more_like_this(self, model_instance, additional_query_string=None, result_cl
"""
raise NotImplementedError("Subclasses must provide a way to fetch similar record via the 'more_like_this' method if supported by the backend.")
def extract_file_contents(self, file_obj):
    """
    Hook to allow backends which support rich-content types such as PDF,
    Word, etc. extraction to process the provided file object and return
    the contents for indexing.

    :param file_obj: a file or file-like object to extract text from.

    Returns None if metadata cannot be extracted; otherwise returns a
    dictionary containing at least two keys:

    :contents:
        Extracted full-text content, if applicable
    :metadata:
        key:value pairs of text strings
    """
    # Fixed: the original message told subclasses to implement an
    # 'extract' method, but the hook callers actually invoke is
    # 'extract_file_contents' -- the old text sent implementers to a
    # method name that does not exist.
    raise NotImplementedError("Subclasses must provide a way to extract metadata via the 'extract_file_contents' method if supported by the backend.")
+
def build_schema(self, fields):
"""
Takes a dictionary of fields and returns schema information.
View
32 haystack/backends/solr_backend.py
@@ -377,6 +377,38 @@ def build_schema(self, fields):
return (content_field_name, schema_fields)
def extract_file_contents(self, file_obj):
    """Extract text and metadata from a structured file (PDF, MS Word, etc.)

    Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
    See the Solr wiki for details:

        http://wiki.apache.org/solr/ExtractingRequestHandler

    The ExtractingRequestHandler normally replaces Haystack's indexing
    pipeline entirely, with several unfortunate restrictions: one file per
    request, no ability to modify the extracted data before it lands in
    the index, etc. To keep things flexible we call it in extract-only
    mode, so Solr hands the extracted data back without indexing it and
    the caller can feed it through Haystack's normal templating process.

    Returns None when extraction fails; on success returns a dictionary
    containing at least two keys:

    :contents:
        Extracted full-text content, if applicable
    :metadata:
        key:value pairs of text strings
    """
    try:
        extracted = self.conn.extract(file_obj)
    except StandardError as exc:
        # Best-effort: log (with traceback and the offending file) and
        # signal failure to the caller rather than aborting indexing.
        self.log.warning(u"Unable to extract file contents: %s", exc,
                         exc_info=True, extra={"data": {"file": file_obj}})
        return None

    return extracted
+
class SolrSearchQuery(BaseSearchQuery):
def matching_all_fragment(self):
View
BIN  tests/content_extraction/test.pdf
Binary file not shown
View
21 tests/solr_tests/tests/solr_backend.py
@@ -2,6 +2,8 @@
import datetime
from decimal import Decimal
import logging
+import os
+
import pysolr
from django.conf import settings
from django.test import TestCase
@@ -1208,3 +1210,22 @@ def test_boost(self):
'core.afourthmockmodel.2',
'core.afourthmockmodel.4'
])
+
+
class LiveSolrContentExtractionTestCase(TestCase):
    """Exercises the Solr backend's rich-content extraction support.

    Requires a live Solr instance configured with the
    ExtractingRequestHandler and a pysolr build exposing ``extract()``.
    """
    def setUp(self):
        super(LiveSolrContentExtractionTestCase, self).setUp()

        # Use whatever backend the default connection is configured with.
        self.sb = connections['default'].get_backend()

    def test_content_extraction(self):
        pdf_path = os.path.join(os.path.dirname(__file__),
                                "..", "..", "content_extraction", "test.pdf")

        # Fixed: the original never closed the file handle, leaking a
        # descriptor on every test run. try/finally (rather than `with`)
        # keeps compatibility with the older Pythons this project targets.
        f = open(pdf_path, "rb")
        try:
            data = self.sb.extract_file_contents(f)
        finally:
            f.close()

        self.assertTrue("haystack" in data['contents'])
        self.assertEqual(data['metadata']['Content-Type'], [u'application/pdf'])
        self.assertTrue(any(i for i in data['metadata']['Keywords']
                            if 'SolrCell' in i))
+
Please sign in to comment.
Something went wrong with that request. Please try again.