From c5ca938c990194ed53f1e61a3631a16848a64775 Mon Sep 17 00:00:00 2001 From: Johannes Charra Date: Thu, 16 Apr 2015 15:12:06 +0200 Subject: [PATCH 01/31] append asterisk to each search term --- src/collective/solr/skins/solr_site_search/livesearch_reply.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) mode change 100644 => 100755 src/collective/solr/skins/solr_site_search/livesearch_reply.py diff --git a/src/collective/solr/skins/solr_site_search/livesearch_reply.py b/src/collective/solr/skins/solr_site_search/livesearch_reply.py old mode 100644 new mode 100755 index 5d5666a4d..1bd468471 --- a/src/collective/solr/skins/solr_site_search/livesearch_reply.py +++ b/src/collective/solr/skins/solr_site_search/livesearch_reply.py @@ -62,8 +62,7 @@ def quote_bad_chars(s): for char in ('?', '-', '+', '*', multispace): q = q.replace(char, ' ') r = q.split() -r = " AND ".join(r) -r = quote_bad_chars(r)+'*' +r = " AND ".join([quote_bad_chars(x) + "*" for x in r]) searchterms = url_quote_plus(r) site_encoding = context.plone_utils.getSiteEncoding() From f4b8a9081659a3e911f3e20de0e82770d2480231 Mon Sep 17 00:00:00 2001 From: Johannes Charra Date: Thu, 16 Apr 2015 15:14:10 +0200 Subject: [PATCH 02/31] undo incorrect lowercasing of boolean operators --- src/collective/solr/utils.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/collective/solr/utils.py b/src/collective/solr/utils.py index 241ed9f1f..aa5e67798 100644 --- a/src/collective/solr/utils.py +++ b/src/collective/solr/utils.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from string import maketrans from re import compile, UNICODE @@ -67,8 +66,6 @@ def prepareData(data): simpleTerm = compile(r'^[\w\d]+$', UNICODE) - - def isSimpleTerm(term): if isinstance(term, str): term = unicode(term, 'utf-8', 'ignore') @@ -82,8 +79,6 @@ def isSimpleTerm(term): operators = compile(r'(.*)\s+(AND|OR|NOT)\s+', UNICODE) simpleCharacters = compile(r'^[\w\d\?\*\s]+$', UNICODE) is_digit = 
compile('\d', UNICODE) - - def isSimpleSearch(term): term = term.strip() if isinstance(term, str): @@ -135,8 +130,6 @@ def splitSimpleSearch(term): wildCard = compile(r'^[\w\d\s*?]*[*?]+[\w\d\s*?]*$', UNICODE) - - def isWildCard(term): if isinstance(term, str): term = unicode(term, 'utf-8', 'ignore') @@ -151,7 +144,18 @@ def prepare_wildcard(value): # unidecode will produce the same results if not isinstance(value, unicode): value = unicode(value, 'utf-8', 'ignore') - return str(unidecode(value).lower()) + + value = str(unidecode(value).lower()) + + # keywords like "AND" and "OR" must not be lowercased, otherwise Solr + # will interpret them as search terms. + # Re-capitalizing them in the lowercased value might incorrectly capitalize + # actual search terms, but this erroneous behaviour is both difficult to provoke + # and probably harmless in most cases. + value = value.replace(" and ", " AND ") + value = value.replace(" or ", " OR ") + + return value def findObjects(origin): From 4750bbe9961a009c1f65268fb5d73c9ed8fe0498 Mon Sep 17 00:00:00 2001 From: Johannes Charra Date: Fri, 17 Apr 2015 22:03:45 +0200 Subject: [PATCH 03/31] improvement of prepare_wildcard + tests --- src/collective/solr/tests/test_utils.py | 17 +++++++++++++++ src/collective/solr/utils.py | 28 +++++++++++++++---------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/collective/solr/tests/test_utils.py b/src/collective/solr/tests/test_utils.py index d125009c8..521c990f6 100644 --- a/src/collective/solr/tests/test_utils.py +++ b/src/collective/solr/tests/test_utils.py @@ -8,6 +8,7 @@ from collective.solr.utils import isWildCard from collective.solr.utils import padResults from collective.solr.utils import prepareData +from collective.solr.utils import prepare_wildcard from collective.solr.utils import setupTranslationMap from collective.solr.utils import splitSimpleSearch from unittest import TestCase @@ -135,6 +136,22 @@ def testIsWildCard(self): # distinguish them properly 
(yet) self.assertFalse(isWildCard('foo#?')) + def testPrepareWildcard(self): + self.assertEqual(prepare_wildcard("Foo"), "foo") + self.assertEqual(prepare_wildcard("and"), "and") + self.assertEqual(prepare_wildcard("or"), "or") + self.assertEqual(prepare_wildcard("not"), "not") + self.assertEqual(prepare_wildcard("Foo and bar"), "foo and bar") + self.assertEqual(prepare_wildcard("Foo AND Bar"), "foo AND bar") + self.assertEqual(prepare_wildcard("FOO AND NOT BAR"), + "foo AND NOT bar") + self.assertEqual(prepare_wildcard("Foo OR Bar"), + "foo OR bar") + self.assertEqual(prepare_wildcard("FOO OR NOT BAR"), + "foo OR NOT bar") + self.assertEqual(prepare_wildcard("FOO AND BAR OR FOO AND NOT BAR"), + "foo AND bar OR foo AND NOT bar") + class TranslationTests(TestCase): diff --git a/src/collective/solr/utils.py b/src/collective/solr/utils.py index aa5e67798..b98cbe6bd 100644 --- a/src/collective/solr/utils.py +++ b/src/collective/solr/utils.py @@ -66,6 +66,8 @@ def prepareData(data): simpleTerm = compile(r'^[\w\d]+$', UNICODE) + + def isSimpleTerm(term): if isinstance(term, str): term = unicode(term, 'utf-8', 'ignore') @@ -79,6 +81,8 @@ def isSimpleTerm(term): operators = compile(r'(.*)\s+(AND|OR|NOT)\s+', UNICODE) simpleCharacters = compile(r'^[\w\d\?\*\s]+$', UNICODE) is_digit = compile('\d', UNICODE) + + def isSimpleSearch(term): term = term.strip() if isinstance(term, str): @@ -130,6 +134,8 @@ def splitSimpleSearch(term): wildCard = compile(r'^[\w\d\s*?]*[*?]+[\w\d\s*?]*$', UNICODE) + + def isWildCard(term): if isinstance(term, str): term = unicode(term, 'utf-8', 'ignore') @@ -145,17 +151,17 @@ def prepare_wildcard(value): if not isinstance(value, unicode): value = unicode(value, 'utf-8', 'ignore') - value = str(unidecode(value).lower()) - - # keywords like "AND" and "OR" must not be lowercased, otherwise Solr - # will interpret them as search terms. 
- # Re-capitalizing them in the lowercased value might incorrectly capitalize - # actual search terms, but this erroneous behaviour is both difficult to provoke - # and probably harmless in most cases. - value = value.replace(" and ", " AND ") - value = value.replace(" or ", " OR ") - - return value + value = str(unidecode(value)) + + # boolean operators must not be lowercased, otherwise Solr will interpret + # them as search terms. So we split the search term into tokens and + # lowercase only the non-operator parts. + parts = [] + for item in value.split(): + parts.append(item.lower() + if item not in ("AND", "OR", "NOT") + else item) + return " ".join(parts) def findObjects(origin): From 5b682a6604cda9c780dca79b33c431d1e52159d4 Mon Sep 17 00:00:00 2001 From: Gil Forcada Date: Mon, 21 Mar 2016 10:26:03 +0100 Subject: [PATCH 04/31] Use semantic line breaks --- docs/base/dependencies.rst | 9 ++- docs/base/index.rst | 2 - docs/base/indexing.rst | 44 +++++++++---- docs/base/searching.rst | 61 +++++++++++------ docs/development/TODO.rst | 4 +- docs/features/atomic_updates.rst | 18 +++-- docs/features/binary.rst | 12 ++-- docs/features/exclude.rst | 6 +- docs/features/facets.rst | 2 +- docs/features/index.rst | 11 ---- docs/features/languages.rst | 7 +- docs/features/suggestions.rst | 6 +- docs/features/wildcard.rst | 5 +- docs/introduction.rst | 7 +- docs/status.rst | 7 +- docs/usage/autocomplete.rst | 110 +++++++++++++++---------------- docs/usage/config.rst | 5 -- docs/usage/install.rst | 4 +- docs/usage/monitoring.rst | 36 +++++----- docs/usage/replication.rst | 46 ++++++------- docs/usage/setup.rst | 7 -- docs/usage/solrcloud.rst | 4 +- 22 files changed, 233 insertions(+), 180 deletions(-) diff --git a/docs/base/dependencies.rst b/docs/base/dependencies.rst index a8307c52c..0d189c7c3 100644 --- a/docs/base/dependencies.rst +++ b/docs/base/dependencies.rst @@ -2,6 +2,9 @@ Dependencies ------------ Currently we depend on `collective.indexing` as a means to hook into 
the normal catalog machinery of Plone to detect content changes. -`c.indexing` before version two had some persistent data structures that frequently caused problems when removing the add-on. These problems have been fixed in version two. -Unfortunately `c.indexing` still has to hook the catalog machinery in various evil ways, as the machinery lacks the required hooks for its use-case. -Going forward it is expected for `c.indexing` to be merged into the underlying `ZCatalog` implementation, at which point `collective.solr` can use those hooks directly. +`c.indexing` before version two had some persistent data structures that frequently caused problems when removing the add-on. +These problems have been fixed in version two. +Unfortunately `c.indexing` still has to hook the catalog machinery in various evil ways, +as the machinery lacks the required hooks for its use-case. +Going forward it is expected for `c.indexing` to be merged into the underlying `ZCatalog` implementation, +at which point `collective.solr` can use those hooks directly. diff --git a/docs/base/index.rst b/docs/base/index.rst index e9714d17d..3d416f8ef 100644 --- a/docs/base/index.rst +++ b/docs/base/index.rst @@ -1,8 +1,6 @@ Base Information how Solr and the Integration of Solr and Plone work ==================================================================== - - Architecture ------------ diff --git a/docs/base/indexing.rst b/docs/base/indexing.rst index c119d3faf..718f703e7 100644 --- a/docs/base/indexing.rst +++ b/docs/base/indexing.rst @@ -3,37 +3,55 @@ Indexing Solr is not transactional aware or supports any kind of rollback or undo. We therefor only sent data to Solr at the end of any successful request. -This is done via collective.indexing, a transaction manager and an end request transaction hook. +This is done via collective.indexing, +a transaction manager and an end request transaction hook. 
This means you won't see any changes done to content inside a request when doing Solr searches later on in the same request. Inside tests you need to either commit real transactions or otherwise flush the Solr connection. -There's no transaction concept, so one request doing a search might get some results in its beginning, than a different request might add new information to Solr. +There's no transaction concept, +so one request doing a search might get some results in its beginning, +then a different request might add new information to Solr. If the first request is still running and does the same search again it might get different results taking the changes from the second request into account. Solr is not a real time search engine. -While there's work under way to make Solr capable of delivering real time results, there's currently always a certain delay up to some minutes from the time data is sent to Solr to when it is available in searches. +While there's work under way to make Solr capable of delivering real time results, +there's currently always a certain delay up to some minutes from the time data is sent to Solr to when it is available in searches. Search results are returned in Solr by distinct search threads. These search threads hold a great number of caches which are crucial for Solr to perform. -When index or unindex operations are sent to Solr, it will keep those in memory until a commit is executed on its own search index. +When index or unindex operations are sent to Solr, +it will keep those in memory until a commit is executed on its own search index. When a commit occurs, all search threads and thus all caches are thrown away and new threads are created reflecting the data after the commit. -While there's a certain amount of cache data that is copied to the new search threads, this data has to be validated against the new index which takes some time. 
+While there's a certain amount of cache data that is copied to the new search threads, +this data has to be validated against the new index which takes some time. The `useColdSearcher` and `maxWarmingSearchers` options of the Solr recipe relate to this aspect. While cache data is copied over and validated for a new search thread, the searcher is `warming up`. If the warming up is not yet completed the searcher is considered to be `cold`. -In order to get real good performance out of Solr, we need to minimize the number of commits against the Solr index. +In order to get real good performance out of Solr, +we need to minimize the number of commits against the Solr index. We can achieve this by turning off `auto-commit` and instead use `commitWithin`. So we don't sent a `commit` to Solr at the end of each index/unindex request on the Plone side. Instead we tell Solr to commit the data to its index at most after a certain time interval. Values of 15 minutes to 1 minute work well for this interval. -The larger you can make this interval, the better the performance of Solr will be, at the cost of search results lagging behind a bit. -In this setup we also need to configure the `autoCommitMaxTime` option of the Solr server, as `commitWithin` only works for index but not unindex operations. +The larger you can make this interval, +the better the performance of Solr will be, +at the cost of search results lagging behind a bit. +In this setup we also need to configure the `autoCommitMaxTime` option of the Solr server, +as `commitWithin` only works for index but not unindex operations. Otherwise a large number of unindex operations without any index operations occurring could not be reflected in the index for a long time. -As a result of all the above, the Solr index and the Plone site will always have slightly diverging contents. -If you use Solr to do searches you need to be aware of this, as you might get results for objects that no longer exist. 
+As a result of all the above, +the Solr index and the Plone site will always have slightly diverging contents. +If you use Solr to do searches you need to be aware of this, +as you might get results for objects that no longer exist. So any `brain/getObject` call on the Plone side needs to have error handling code around it as the object might not be there anymore and traversing to it can throw an exception. -When adding new or deleting old content or changing the workflow state of it, you will also not see those actions reflected in searches right away, but only after a delay of at most the `commitWithin` interval. -After a `commitWithin` operation is sent to Solr, any other operations happening during that time window will be executed after the first interval is over. -So with a 15 minute interval, if document A is indexed at 5:15, B at 5:20 and C at 5:35, both A & B will be committed at 5:30 and C at 5:50. +When adding new or deleting old content or changing the workflow state of it, +you will also not see those actions reflected in searches right away, +but only after a delay of at most the `commitWithin` interval. +After a `commitWithin` operation is sent to Solr, +any other operations happening during that time window will be executed after the first interval is over. +So with a 15 minute interval, +if document A is indexed at 5:15, +B at 5:20 and C at 5:35, +both A & B will be committed at 5:30 and C at 5:50. diff --git a/docs/base/searching.rst b/docs/base/searching.rst index 3f2ef4a59..1736d3eea 100644 --- a/docs/base/searching.rst +++ b/docs/base/searching.rst @@ -2,10 +2,13 @@ Searching ********* Information retrieval is a complex science. -We try to give a very brief explanation here, refer to the literature and documentation of Lucene/Solr for much more detailed information. +We try to give a very brief explanation here, +refer to the literature and documentation of Lucene/Solr for much more detailed information. 
-If you do searches in normal Plone, you have a search term and query the SearchableText index with it. -The SearchableText is a simple concatenation of all searchable fields, by default title, description and the body text. +If you do searches in normal Plone, +you have a search term and query the SearchableText index with it. +The SearchableText is a simple concatenation of all searchable fields, +by default title, description and the body text. The default ZCTextIndex in Plone uses a simplified version of the Okapi BM25 algorithm described in papers in 1998. It uses two metrics to score documents: @@ -15,40 +18,60 @@ It uses two metrics to score documents: Terms only occurring in a few documents are scored higher than those occurring in many documents. It calculates the sum of all scores, for every term common to the query and any document. -So for a query with two terms, a document is likely to score higher if it contains both terms, except if one of them is a very common term and the other document contains the non-common term more often. +So for a query with two terms, +a document is likely to score higher if it contains both terms, +except if one of them is a very common term and the other document contains the non-common term more often. -The similarity function used in Solr/Lucene uses a different algorithm, based on a combination of a boolean and vector space model, but taking the same underlying metrics into account. +The similarity function used in Solr/Lucene uses a different algorithm, +based on a combination of a boolean and vector space model, +but taking the same underlying metrics into account. In addition to the term frequency and inverse document frequency Solr respects some more metrics: - length normalization: The number of all terms in a field. Shorter fields contribute higher scores compared to long fields. 
- boost values: There's a variety of boost values that can be applied, both index-time document boost values as well as boost values per search field or search term -In its pre 2.0 versions, collective.solr used a naive approach and mirrored the approach taken by ZCTextIndex. +In its pre 2.0 versions, +collective.solr used a naive approach and mirrored the approach taken by ZCTextIndex. So it sent each search query as one query and matched it against the full SearchableText field inside Solr. By doing that Solr basically used the same algorithm as ZCTextIndex as it only had one field to match with the entire text in it. -The only difference was the use of the length normalization, so shorter documents ranked higher than those with longer texts. -This actually caused search quality to be worse, as you'd frequently find folders, links or otherwise rather empty documents. +The only difference was the use of the length normalization, +so shorter documents ranked higher than those with longer texts. +This actually caused search quality to be worse, +as you'd frequently find folders, links or otherwise rather empty documents. The Okapi BM25 implementation in ZCTextIndex deliberately ignores the document length for that reason. -In order to get good or better search quality from Solr, we have to query it in a different way. -Instead of concatenating all fields into one big text, we need to preserve the individual fields and use their intrinsic importance. +In order to get good or better search quality from Solr, +we have to query it in a different way. +Instead of concatenating all fields into one big text, +we need to preserve the individual fields and use their intrinsic importance. We get the main benefit be realizing that matches on the title and description are more important than matches on the body text or other fields in a document. -collective.solr 2.0+ does exactly that by introducing a `search-pattern` to be used for text searches. 
In its default form it causes each query to work against the title, description and full searchable text fields and boosts the title by a high and the description by a medium value. -The length normalization already provides an improvement for these fields, as the title is likely short, the description a bit longer and the full text even longer. +collective.solr 2.0+ does exactly that by introducing a `search-pattern` to be used for text searches. +In its default form it causes each query to work against the title, +description and full searchable text fields and boosts the title by a high and the description by a medium value. +The length normalization already provides an improvement for these fields, +as the title is likely short, +the description a bit longer and the full text even longer. By using explicit boost values the effect gets to be more pronounced. If you do custom searches or want to include more fields into the full text search you need to keep the above in mind. Simply setting the `searchable` attribute on the schema of a field to `True` will only include it in the big searchable text stream. -If you for example include a field containing tags, the simple tag names will likely 'drown' in the full body text. +If you for example include a field containing tags, +the simple tag names will likely 'drown' in the full body text. You might want to instead change the search pattern to include the field and potentially put a boost value on it - though it will be more important as it's likely to be extremely short. Similarly extracting the full text of binary files and simply appending them into the search stream might not be the best approach. You should rather index those in a separate field and then maybe use a boost value of less than one to make the field less important. -Given two documents with the same content, one as a normal page and one as a binary file, you'll likely want to find the page first, as it's faster to access and read than the file. 
- +Given two documents with the same content, +one as a normal page and one as a binary file, +you'll likely want to find the page first, +as it's faster to access and read than the file. There's a good number of other improvements you can do using query time and index time boost values. -To provide index time boost values, you can provide a skin script called `solr_boost_index_values` which gets the object to be indexed and the data sent to Solr as arguments and returns a dictionary of field names to boost values for each document. -The safest is to return a boost value for the empty string, which results in a document boost value. -Field level boost values don't work with all searches, especially wildcard searches as done by most simple web searches. -The index time boost allows you to implement policies like boosting certain content types over others, taking into account ratings or number of comments as a measure of user feedback or anything else that can be derived from each content item. +To provide index time boost values, +you can provide a skin script called `solr_boost_index_values` which gets the object to be indexed and the data sent to Solr as arguments and returns a dictionary of field names to boost values for each document. +The safest is to return a boost value for the empty string, +which results in a document boost value. +Field level boost values don't work with all searches, +especially wildcard searches as done by most simple web searches. +The index time boost allows you to implement policies like boosting certain content types over others, +taking into account ratings or number of comments as a measure of user feedback or anything else that can be derived from each content item. 
diff --git a/docs/development/TODO.rst b/docs/development/TODO.rst index 3a55649fc..0df6ae358 100644 --- a/docs/development/TODO.rst +++ b/docs/development/TODO.rst @@ -14,8 +14,8 @@ TODOs: * evaluate http://www.gnuenterprise.org/~jcater/solr.py as a replacement (also see http://tinyurl.com/2zcogf) * evaluate sunburnet as a replacement https://pypi.python.org/pypi/sunburnt -* evaluat mysolr as backend https://pypi.python.org/pypi/mysolr +* evaluate mysolr as backend https://pypi.python.org/pypi/mysolr * implement LocalParams to have a nicer facet view http://wiki.apache.org/solr/SimpleFacetParameters#Multi-Select_Faceting_and_LocalParams -* Use current search view and get rid of anicient search override +* Use current search view and get rid of ancient search override * Implement a push only and read only mode * Play nice with eea.facetednavigation diff --git a/docs/features/atomic_updates.rst b/docs/features/atomic_updates.rst index fa905a02c..bc5a2b670 100644 --- a/docs/features/atomic_updates.rst +++ b/docs/features/atomic_updates.rst @@ -1,13 +1,21 @@ Partial indexing documents (AtomicUpdates) ****************************************** -This means whenever possible, only the necessary / specified attributes get updated in Solr, and more importantly, re-indexed by Plone's indexers. +This means whenever possible, +only the necessary/specified attributes get updated in Solr, +and more importantly, +re-indexed by Plone's indexers. -With collective.recipe.solr a new configuration is introduced, called `updateLog`. ``updateLog`` is enabled by default and allows atomic updates. In detail it adds a new field ``_version_`` to the schema and also adds "" to your solr config. +With collective.recipe.solr a new configuration is introduced, +called ``updateLog``. +``updateLog`` is enabled by default and allows atomic updates. +In detail it adds a new field ``_version_`` to the schema and also adds "" to your solr config. 
-Further all your indexes configured in solr.cfg needs the stored:true attribute (Except the ``default`` field). See http://wiki.apache.org/solr/Atomic_Updates for details. - -Also note, that the AtomicUpdate feature is no compatible with the "Index time boost" feature. You have to decide, whether using atomic updates, or boosting on index time. You can enable/disable atomic updates thru the collective.solr control panel. Atomic updates are enabled by default. +Further all your indexes configured in solr.cfg need the ``stored:true`` attribute (except the ``default`` field). See http://wiki.apache.org/solr/Atomic_Updates for details. +Also note, that the AtomicUpdate feature is not compatible with the "Index time boost" feature. +You have to decide, whether using atomic updates, or boosting on index time. +You can enable/disable atomic updates through the collective.solr control panel. +Atomic updates are enabled by default. diff --git a/docs/features/binary.rst b/docs/features/binary.rst index 53e09373a..2a17a29f4 100644 --- a/docs/features/binary.rst +++ b/docs/features/binary.rst @@ -1,14 +1,18 @@ Indexing binary documents ************************* -At this point collective.solr uses Plone's default capabilities to index binary documents via `portal_transforms` and installing command line tools like `wv2` or `pdftotext`. +At this point collective.solr uses Plone's default capabilities to index binary documents. +It does so via `portal_transforms` and installing command line tools like `wv2` or `pdftotext`. Work is under way to expose and use the `Apache Tika`_ Solr integration available via the `update/extract` handler. -Once finished this will speed up indexing of binary documents considerably, as the extraction will happen out-of-process on the Solr server side. +Once finished this will speed up indexing of binary documents considerably, +as the extraction will happen out-of-process on the Solr server side. 
`Apache Tika`_ also supports a much larger list of formats than can be supported by adding external command line tools. -There is room for more improvements in this area, as c.solr will still send the binary data to Solr as part of the end-user request/transaction. -To further optimize this, Solr index operations can be stored in a task queue as provided by `plone.app.async` or solutions build on top of `Celery`. +There is room for more improvements in this area, +as collective.solr will still send the binary data to Solr as part of the end-user request/transaction. +To further optimize this, +Solr index operations can be stored in a task queue as provided by `plone.app.async` or solutions built on top of `Celery`. This is currently outside the scope of `collective.solr`. .. _`Apache Tika`: http://tika.apache.org/ diff --git a/docs/features/exclude.rst b/docs/features/exclude.rst index 114e98f06..f5060f02c 100644 --- a/docs/features/exclude.rst +++ b/docs/features/exclude.rst @@ -3,7 +3,8 @@ Exclude from search and elevation By default this add-on introduces two new fields to the default content types or any custom type derived from ATContentTypes. -The `showinsearch` boolean field lets you hide specific content items from the search results, by setting the value to `false`. +The `showinsearch` boolean field lets you hide specific content items from the search results, +by setting the value to `false`. The `searchwords` lines field allows you to specify multiple phrases per content item. A phrase is specified per line. @@ -11,4 +12,5 @@ User searches containing any of these phrases will show the content item as the This technique is also known as `elevation`. Both of these features depend on the default `search-pattern` to include the required parts as included in the default configuration. -The `searchwords` approach to elevation doesn't depend on the Solr elevation feature, as that would require maintaining a xml file as part of the Solr server configuration. 
+The `searchwords` approach to elevation doesn't depend on the Solr elevation feature, +as that would require maintaining a xml file as part of the Solr server configuration. diff --git a/docs/features/facets.rst b/docs/features/facets.rst index 7973c9a03..bee9cf6b0 100644 --- a/docs/features/facets.rst +++ b/docs/features/facets.rst @@ -7,4 +7,4 @@ The provided search form is currently more of an example and not used in many re You likely want to override it with a custom implementation for your specific site. Starting with Plone 4.2, Plone will contain a modernized search form whose UI supports faceting more naturally. -At some point `c.solr` will extend this new search form rather than providing its own. +At some point `collective.solr` will extend this new search form rather than providing its own. diff --git a/docs/features/index.rst b/docs/features/index.rst index 1ca85b2f2..d460967e4 100644 --- a/docs/features/index.rst +++ b/docs/features/index.rst @@ -6,26 +6,15 @@ Once installed and configured, this add-on introduces a number of end-user featu Solr Features ------------- - - Features of Solr Integration into Plone --------------------------------------- - Search Enhancements ******************* - - - ZCTextIndex Replacement *********************** - - - - - Old full description of Features -------------------------------- diff --git a/docs/features/languages.rst b/docs/features/languages.rst index 6c0b4d322..84f586c62 100644 --- a/docs/features/languages.rst +++ b/docs/features/languages.rst @@ -8,9 +8,12 @@ The default text analysis uses libraries based on ICU standards to fold and norm Accented characters are folder into their unaccented base form and many other characters are normalized. This normalization is similar to what Plone does when generating url identifiers from titles. -These changes are applied both to the indexed text and the user provided search query, so in general there's a large number of matches at the expense of specificity. 
+These changes are applied both to the indexed text and the user provided search query, +so in general there's a large number of matches at the expense of specificity. -Non-alphabetic characters like hyphens, dots and colons are interpreted as word boundaries, while case changes and alphanumeric combinations are left intact; for example `WiFi` or `IPv4` will only be lower-cased but not split. +Non-alphabetic characters like hyphens, dots and colons are interpreted as word boundaries, +while case changes and alphanumeric combinations are left intact; +for example `WiFi` or `IPv4` will only be lower-cased but not split. For any specific site, you likely know the supported content languages and could further tune the text analysis. A common example is the use of stemming, to generate base words for terms. diff --git a/docs/features/suggestions.rst b/docs/features/suggestions.rst index 8a0f6578e..ac737c435 100644 --- a/docs/features/suggestions.rst +++ b/docs/features/suggestions.rst @@ -1,7 +1,9 @@ Spelling checking / suggestions ******************************* -Solr supports spell checking - or rather suggestions, as it doesn't contain a formal dictionary but bases suggestions on the indexed corpus. +Solr supports spell checking - or rather suggestions, +as it doesn't contain a formal dictionary but bases suggestions on the indexed corpus. The idea is to present the user with alternative search terms for any query that is likely to produce more or better results. -Currently this is not yet exposed in the `collective.solr` API's even though the Solr server as set up by the buildout recipe already contains the required configuration for this. +Currently this is not yet exposed in the `collective.solr` API's. +Solr server, as set up by the buildout recipe, already contains the required configuration for this. 
diff --git a/docs/features/wildcard.rst b/docs/features/wildcard.rst index 3d4787596..f0d316856 100644 --- a/docs/features/wildcard.rst +++ b/docs/features/wildcard.rst @@ -3,9 +3,12 @@ Wildcard searches Wildcard search support in Solr is rather poor. Unfortunately Plone's live search uses this by default, so we have to support it. -When doing wildcard searches, Solr ignores any of the tokenizer and analyzer settings of the field at query time. +When doing wildcard searches, +Solr ignores any of the tokenizer and analyzer settings of the field at query time. + This often leads to a mismatch of the indexed data as modified by those settings and the query term. In order to work around this, we try to reproduce the essential parts of these analyzers on the `collective.solr` side. The most common changes are lower-casing characters and folding non-ascii characters to ascii as done by the `ICUFoldingFilterFactory`. + Currently these two changes are hard-wired and applied to all fields of type `solr.TextField`. If you have different field settings you might need to overwrite `collective.solr.utils.prepare_wildcard`. diff --git a/docs/introduction.rst b/docs/introduction.rst index 5547a937a..1deb86485 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -6,9 +6,12 @@ Introduction Apache Solr is based on Lucene and is *the* enterprise open source search engine. It powers the search of sites like Twitter, the Apple and iTunes Stores, Wikipedia, Netflix and many more. -Solr does not only scale to any level of content, but provides rich search functionality, like faceting, geospatial search, suggestions, spelling corrections, indexing of binary formats and a whole variety of powerful tools to configure custom search solutions. 
+Solr does not only scale to any level of content, but provides rich search functionality, +like faceting, geospatial search, suggestions, spelling corrections, +indexing of binary formats and a whole variety of powerful tools to configure custom search solutions. It has integrated clustering and load-balancing to provide a high level of robustness. -``collective.solr`` comes with a default configuration and setup of Solr that makes it extremely easy to get started, yet provides a vastly superior search quality compared to Plone's integrated text search based on ``ZCTextIndex``. +``collective.solr`` comes with a default configuration and setup of Solr that makes it extremely easy to get started, +yet provides a vastly superior search quality compared to Plone's integrated text search based on ``ZCTextIndex``. .. include:: indexes.rst diff --git a/docs/status.rst b/docs/status.rst index c989fa82e..4e2fd060d 100644 --- a/docs/status.rst +++ b/docs/status.rst @@ -3,8 +3,11 @@ Current Status The code is used in production in many sites and considered stable. This add-on can be installed in a `Plone`_ 4.1 (or later) site to enable indexing operations as well as searching (site and live search) using `Solr`_. -Doing so will not only significantly improve search quality and performance - especially for a large number of indexed objects, but also reduce the memory footprint of your `Plone`_ instance by allowing you to remove the ``SearchableText``, ``Description`` and ``Title`` indexes from the catalog. -In large sites with 100000 content objects and more, searches using ``ZCTextIndex`` often taken 10 seconds or more and require a good deal of memory from ZODB caches. +Doing so will not only significantly improve search quality and performance - +especially for a large number of indexed objects, +but also reduce the memory footprint of your `Plone`_ instance by allowing you to remove the ``SearchableText``, ``Description`` and ``Title`` indexes from the catalog. 
+In large sites with 100000 content objects and more, +searches using ``ZCTextIndex`` often taken 10 seconds or more and require a good deal of memory from ZODB caches. Solr will typically answer these requests in 10ms to 50ms at which point network latency and the rendering speed of Plone's page templates are a more dominant factor. .. _`Solr`: http://lucene.apache.org/solr/ diff --git a/docs/usage/autocomplete.rst b/docs/usage/autocomplete.rst index 5cd157406..ca078ef76 100644 --- a/docs/usage/autocomplete.rst +++ b/docs/usage/autocomplete.rst @@ -5,67 +5,67 @@ http://wiki.apache.org/solr/Suggester Simple autocomplete configuration using the "Title" field (buildout.cfg):: - additional-solrconfig = - - - suggest - org.apache.solr.spelling.suggest.Suggester - org.apache.solr.spelling.suggest.fst.WFSTLookupFactory - Title - 0.005 - true - - + additional-solrconfig = + + + suggest + org.apache.solr.spelling.suggest.Suggester + org.apache.solr.spelling.suggest.fst.WFSTLookupFactory + Title + 0.005 + true + + - - - true - suggest - 10 - true - - - suggest - - + + + true + suggest + 10 + true + + + suggest + + More complex example with custom field/filters:: - index += - name:title_autocomplete type:text_auto indexed:true stored:true + index += + name:title_autocomplete type:text_auto indexed:true stored:true - additional-solrconfig = - - - suggest - org.apache.solr.spelling.suggest.Suggester - org.apache.solr.spelling.suggest.fst.WFSTLookupFactory - title_autocomplete - 0.005 - true - - + additional-solrconfig = + + + suggest + org.apache.solr.spelling.suggest.Suggester + org.apache.solr.spelling.suggest.fst.WFSTLookupFactory + title_autocomplete + 0.005 + true + + - - - true - suggest - 10 - true - - - suggest - - + + + true + suggest + 10 + true + + + suggest + + - extra-field-types = - - - - - - + extra-field-types = + + + + + + - additional-schema-config = - + additional-schema-config = + diff --git a/docs/usage/config.rst b/docs/usage/config.rst index 
010f8de07..610178268 100644 --- a/docs/usage/config.rst +++ b/docs/usage/config.rst @@ -4,11 +4,9 @@ Configuring collective.solr Solr-Connection Configuration ***************************** - ZCML Configuration (prefered) ............................. - The connections settings for Solr can be configured in ZCML and thus in buildout. This makes it easier when copying databases between multiple Zope instances with different Solr servers. Example:: @@ -21,9 +19,6 @@ Example:: TTW Configuration ................. - - - TTW Configuration of Solr-Settings ********************************** diff --git a/docs/usage/install.rst b/docs/usage/install.rst index f09f02922..004583069 100644 --- a/docs/usage/install.rst +++ b/docs/usage/install.rst @@ -33,7 +33,9 @@ Creating the initial index can take some considerable time. A typical indexing rate for a Plone site running of a local disk is 20 index operations per second. While Solr scales to orders of magnitude more than that, the limiting factor is database access time in Plone. -If you have an existing site with a large volume of content, you can create an initial Solr index on a staging server or development machine, then rsync it over to the live machine, enable Solr and call `@@solr-maintenance/sync`. +If you have an existing site with a large volume of content, +you can create an initial Solr index on a staging server or development machine, +then rsync it over to the live machine, enable Solr and call `@@solr-maintenance/sync`. The sync will usually take just a couple of minutes for catching up with changes in the live database. You can also use this approach when making changes to the index structure or changing the settings of existing fields. diff --git a/docs/usage/monitoring.rst b/docs/usage/monitoring.rst index 84fdf1f52..1ae10d561 100644 --- a/docs/usage/monitoring.rst +++ b/docs/usage/monitoring.rst @@ -4,45 +4,49 @@ Monitoring Java has a general monitoring framework called JMX. 
You can use this to get a huge number of details about the Java process in general and Solr in particular. Some hints are at http://wiki.apache.org/solr/SolrJmx. -The default `collective.recipe.solrinstance` config uses ``, so we can use command line arguments to configure it. +The default `collective.recipe.solrinstance` config uses ``, +so we can use command line arguments to configure it. Our example `buildout/solr.cfg` includes all the relevant values in its `java_opts` variable. -To view all the available metrics, start Solr and then the `jconsole` command included in the Java SDK and connect to the local process named `start.jar`. +To view all the available metrics, +start Solr and then the `jconsole` command included in the Java SDK and connect to the local process named `start.jar`. Solr specific information is available from the MBeans tab under the `solr` section. For example you'll find `avgTimePerRequest` within `search/org.apache.solr.handler.component.SearchHandler` under `Attributes`. If you want to integrate with munin, you can install the JMX plugin at: http://exchange.munin-monitoring.org/plugins/jmx/details -Follow its install instructions and tweak the included examples to query the information you want to track. To track the average time per search request, add a file called `solr_avg_query_time.conf` into `/usr/share/munin/plugins` with the following contents:: +Follow its install instructions and tweak the included examples to query the information you want to track. 
+To track the average time per search request, +add a file called `solr_avg_query_time.conf` into `/usr/share/munin/plugins` with the following contents:: - graph_title Average Query Time - graph_vlabel ms - graph_category Solr + graph_title Average Query Time + graph_vlabel ms + graph_category Solr - solr_average_query_time.label time per request - solr_average_query_time.jmxObjectName solr/:type=search,id=org.apache.solr.handler.component.SearchHandler - solr_average_query_time.jmxAttributeName avgTimePerRequest + solr_average_query_time.label time per request + solr_average_query_time.jmxObjectName solr/:type=search,id=org.apache.solr.handler.component.SearchHandler + solr_average_query_time.jmxAttributeName avgTimePerRequest Then add a symlink to add the plugin:: - $ ln -s /usr/share/munin/plugins/jmx_ /etc/munin/plugins/jmx_solr_avg_query_time + $ ln -s /usr/share/munin/plugins/jmx_ /etc/munin/plugins/jmx_solr_avg_query_time Point the jmx plugin to the Solr process, by opening `/etc/munin/plugin-conf.d/munin-node.conf` and adding something like:: - [jmx_*] - env.jmxurl service:jmx:rmi:///jndi/rmi://127.0.0.1:8984/jmxrmi + [jmx_*] + env.jmxurl service:jmx:rmi:///jndi/rmi://127.0.0.1:8984/jmxrmi The host and port need to match those passed via `java_opts` to Solr. To check if the plugins are working do:: - $ export jmxurl="service:jmx:rmi:///jndi/rmi://127.0.0.1:8984/jmxrmi" - $ cd /etc/munin/plugins + $ export jmxurl="service:jmx:rmi:///jndi/rmi://127.0.0.1:8984/jmxrmi" + $ cd /etc/munin/plugins And call the plugin you configured directly, like for example:: - $ ./solr_avg_query_time - solr_average_query_time.value NaN + $ ./solr_avg_query_time + solr_average_query_time.value NaN We include a number of useful configurations inside the package, in the `collective/solr/munin_config` directory. You can copy all of them into the `/usr/share/munin/plugins` directory and create the symlinks for all of them. 
diff --git a/docs/usage/replication.rst b/docs/usage/replication.rst index 35c90a99d..719db98b5 100644 --- a/docs/usage/replication.rst +++ b/docs/usage/replication.rst @@ -1,40 +1,40 @@ Replication *********** - At this point Solr doesn't yet allow for a full fault tolerance setup. -You can read more about the `Solr Cloud`__ effort which aims to provide this. +You can read more about the `Solr Cloud`_ effort which aims to provide this. -But we can setup a simple master/slave replication using Solr's built-in `Solr Replication`__ support, which is a first step in the right direction. +But we can setup a simple master/slave replication using Solr's built-in `Solr Replication`_ support, +which is a first step in the right direction. - .. __: http://wiki.apache.org/solr/SolrCloud - .. __: http://wiki.apache.org/solr/SolrReplication + .. _Solr Cloud: http://wiki.apache.org/solr/SolrCloud + .. _Solr Replication: http://wiki.apache.org/solr/SolrReplication In order to use this, you can setup a Solr master server and give it some extra config:: - [solr-instance] - additional-solrconfig = - - - commit - startup - optimize - - + [solr-instance] + additional-solrconfig = + + + commit + startup + optimize + + Then you can point one or multiple slave servers to the master. Assuming the master runs on `solr-master.domain.com` at port `8983`, we could write:: - [solr-instance] - additional-solrconfig = - - - http://solr-master.domain.com:8983/solr/replication - 00:00:30 - - + [solr-instance] + additional-solrconfig = + + + http://solr-master.domain.com:8983/solr/replication + 00:00:30 + + A poll interval of 30 seconds should be fast enough without creating too much overhead. At this point `collective.solr` does not yet have support for connecting to multiple servers and using the slaves as a fallback for querying. -As there's no master-master setup yet, fault tolerance for index changes cannot be provided. 
\ No newline at end of file +As there's no master-master setup yet, fault tolerance for index changes cannot be provided. diff --git a/docs/usage/setup.rst b/docs/usage/setup.rst index 5b74896e4..cef18d654 100644 --- a/docs/usage/setup.rst +++ b/docs/usage/setup.rst @@ -4,18 +4,11 @@ Setup Solr Solr Schema *********** - Solr Field Types ................ - - - - .. include:: autocomplete.rst - - Solr Base Schema for Plone ************************** diff --git a/docs/usage/solrcloud.rst b/docs/usage/solrcloud.rst index e3f4353f2..d3f07a39f 100644 --- a/docs/usage/solrcloud.rst +++ b/docs/usage/solrcloud.rst @@ -1,6 +1,6 @@ SolrCloud ********* -You can read more about the `Solr Cloud`__ effort which aims to provide this. +You can read more about the `Solr Cloud`_ effort which aims to provide this. - .. __: http://wiki.apache.org/solr/SolrCloud + .. _Solr Cloud: http://wiki.apache.org/solr/SolrCloud From 915073a4de900df40d75d3603c7a00f4e6339ba4 Mon Sep 17 00:00:00 2001 From: Gil Forcada Date: Fri, 1 Apr 2016 16:36:55 +0200 Subject: [PATCH 05/31] Fix field-list export It was appending them to facets setting. 
--- src/collective/solr/exportimport.py | 8 ++++---- src/collective/solr/tests/test_exportimport.py | 6 +++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/collective/solr/exportimport.py b/src/collective/solr/exportimport.py index 5b10569cd..d74a23234 100644 --- a/src/collective/solr/exportimport.py +++ b/src/collective/solr/exportimport.py @@ -219,6 +219,10 @@ def _extractProperties(self): ) field_list = self._doc.createElement('field-list') append(field_list) + for name in self.context.field_list: + param = self._doc.createElement('parameter') + param.setAttribute('name', name) + field_list.appendChild(param) append( create( 'levenshtein_distance', @@ -228,10 +232,6 @@ def _extractProperties(self): append(create('atomic_updates', str(bool(self.context.atomic_updates)))) - for name in self.context.field_list: - param = self._doc.createElement('parameter') - param.setAttribute('name', name) - facets.appendChild(param) return node diff --git a/src/collective/solr/tests/test_exportimport.py b/src/collective/solr/tests/test_exportimport.py index 519ebdf8a..8943ee9d8 100644 --- a/src/collective/solr/tests/test_exportimport.py +++ b/src/collective/solr/tests/test_exportimport.py @@ -33,6 +33,7 @@ def setUp(self): config.effective_steps = 900 config.exclude_user = True config.levenshtein_distance = 0.2 + config.field_list = ('review_state', 'Title', ) config.atomic_updates = False def testImportStep(self): @@ -128,7 +129,10 @@ def testImportDoesntChangeActivationState(self): - + + + + From d9960c8b702f87c0d88e88259503c81ef10849b6 Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Wed, 13 Apr 2016 15:05:21 +0200 Subject: [PATCH 06/31] Add a few more information to the changelog about the breaking changes. 
--- CHANGES.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 3223855b6..cb565cae4 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,7 +4,8 @@ Changelog 4.1.1 (unreleased) ------------------ -Note: This release requires you to to update your Solr config and do a full reindex. +Note: This release requires you to to update your Solr config and do a full reindex. Make sure you add "updateLog = true" to your "solr-instance" +buildout section. See https://github.com/collective/collective.solr/blob/master/solr.cfg for a working example. - Ported atomic updates from ftw.solr. This requires you to update your solr config, load the new solr config and From 896d4c5db046937775b98eaf313af48a629c9e7b Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Wed, 13 Apr 2016 15:09:49 +0200 Subject: [PATCH 07/31] Preparing release 5.0 --- CHANGES.rst | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index cb565cae4..15a14d21b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,8 +1,8 @@ Changelog ========= -4.1.1 (unreleased) ------------------- +5.0 (2016-04-13) +---------------- Note: This release requires you to to update your Solr config and do a full reindex. Make sure you add "updateLog = true" to your "solr-instance" buildout section. See https://github.com/collective/collective.solr/blob/master/solr.cfg for a working example. 
diff --git a/setup.py b/setup.py index b1344f08e..cd24a3666 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup from setuptools import find_packages -version = '4.1.1.dev0' +version = '5.0' long_description = \ open('README.rst').read() + '\n' + \ open('CHANGES.rst').read() + \ From f448e871e5ae96a03d5405230135f06bc0fe25ec Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Wed, 13 Apr 2016 15:11:08 +0200 Subject: [PATCH 08/31] Back to development: 5.0.1 --- CHANGES.rst | 6 ++++++ setup.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 15a14d21b..49ded59f8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changelog ========= +5.0.1 (unreleased) +------------------ + +- Nothing changed yet. + + 5.0 (2016-04-13) ---------------- diff --git a/setup.py b/setup.py index cd24a3666..67aba4718 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup from setuptools import find_packages -version = '5.0' +version = '5.0.1.dev0' long_description = \ open('README.rst').read() + '\n' + \ open('CHANGES.rst').read() + \ From b54946b4f976955d4da3ef857afc3098e7733d9b Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Wed, 13 Apr 2016 15:29:14 +0200 Subject: [PATCH 09/31] Add check-readme script. --- base.cfg | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/base.cfg b/base.cfg index 6c1d72c20..af4b187c2 100644 --- a/base.cfg +++ b/base.cfg @@ -8,6 +8,7 @@ parts += code-analysis releaser sphinxbuilder + check-readme develop = . 
@@ -66,3 +67,11 @@ eggs = zest.releaser recipe = collective.recipe.sphinxbuilder source = ${buildout:directory}/docs build = ${buildout:directory}/docs + +[check-readme] +recipe = collective.recipe.template +input = inline: + #!/bin/sh + python ${buildout:directory}/setup.py check -r -s +output = ${buildout:directory}/bin/check-readme +mode = 755 From f1c3e3846c26e65a3423ec230920941d40318701 Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Wed, 13 Apr 2016 15:33:48 +0200 Subject: [PATCH 10/31] Fix link to documentation. --- README.rst | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 3809b3dc3..bf184ca83 100644 --- a/README.rst +++ b/README.rst @@ -62,17 +62,13 @@ Search Enhancements * Exclude from search * Elevation -ZCTextIndex Replacement -*********************** - - Detailed Documentation ====================== -A full Documentation of the Solr integration of Plone could be found in `docs.plone.org collective.solr`_. +A full Documentation of the Solr integration of Plone could be found on `collectivesolr.readthedocs.org`_. -.. _`docs.plone.org collective.solr`: http://docs.plone.org/external/collective.solr/docs/index.html +.. _`collectivesolr.readthedocs.org`: http://collectivesolr.readthedocs.org/en/latest/ Installation & Configuration From a5992810f5855b7b81218dc65d5887f97589b289 Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Wed, 13 Apr 2016 15:38:28 +0200 Subject: [PATCH 11/31] Attempt to fix ReST problem with pypi page. 
--- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 67aba4718..0d4f591cd 100644 --- a/setup.py +++ b/setup.py @@ -5,8 +5,8 @@ version = '5.0.1.dev0' long_description = \ open('README.rst').read() + '\n' + \ - open('CHANGES.rst').read() + \ - open(os.path.join('docs', 'credits.rst')).read() + \ + open('CHANGES.rst').read() + '\n' + \ + open(os.path.join('docs', 'credits.rst')).read() + '\n' + \ open(os.path.join('docs', 'contributors.rst')).read(), setup( From f1f388548f15d93289068623b95d4bef553939a3 Mon Sep 17 00:00:00 2001 From: Gil Forcada Codinachs Date: Thu, 14 Apr 2016 16:13:38 +0200 Subject: [PATCH 12/31] Allow range minmax ZCatalog accepts ranges defined as ``min:max`` as well as ``minmax``. --- src/collective/solr/mangler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/collective/solr/mangler.py b/src/collective/solr/mangler.py index 194daaf74..679344cb6 100644 --- a/src/collective/solr/mangler.py +++ b/src/collective/solr/mangler.py @@ -14,6 +14,7 @@ 'min': '[%s TO *]', 'max': '[* TO %s]', 'min:max': '[%s TO %s]', + 'minmax': '[%s TO %s]', } sort_aliases = { From 595d516ebcf5b775fc97b9d5d642204eb5da5b31 Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Fri, 15 Apr 2016 08:32:09 +0200 Subject: [PATCH 13/31] Upgrade to Plone 4.3.8. --- plone-4.3.x.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plone-4.3.x.cfg b/plone-4.3.x.cfg index 518736a4b..25b186727 100644 --- a/plone-4.3.x.cfg +++ b/plone-4.3.x.cfg @@ -1,2 +1,2 @@ [buildout] -extends = http://dist.plone.org/release/4.3.6/versions.cfg +extends = http://dist.plone.org/release/4.3.8/versions.cfg From 62cdcfd95657633d768c379e9117a82b25d83bd9 Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Fri, 15 Apr 2016 08:58:13 +0200 Subject: [PATCH 14/31] Upgrade to Plone 4.3.9. 
--- plone-4.3.x.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plone-4.3.x.cfg b/plone-4.3.x.cfg index 25b186727..c59a8fb17 100644 --- a/plone-4.3.x.cfg +++ b/plone-4.3.x.cfg @@ -1,2 +1,2 @@ [buildout] -extends = http://dist.plone.org/release/4.3.8/versions.cfg +extends = http://dist.plone.org/release/4.3.9/versions.cfg From e7e9cbcc47f3e6d75d5bcde3f594bb3e834c4a32 Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Mon, 18 Apr 2016 19:40:28 +0200 Subject: [PATCH 15/31] Provide full buildout conf in README. --- README.rst | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index bf184ca83..ad9772050 100644 --- a/README.rst +++ b/README.rst @@ -82,16 +82,26 @@ Download the latest default Solr configuration from github:: .. note: Please do not extend your buildout directly with those files since they are likely to change over time. Always fetch the files via wget to have a stable local copy. In general you should never rely on extending buildout config files from servers that aren't under your control. Extend your buildout to use those files and make sure collective.solr is added -to the eggs in your instance section:: +to the eggs in your instance section. 
Your full buildout file should look +something like this:: [buildout] + parts += instance extends = - solr.cfg - solr-4.10.x.cfg + http://dist.plone.org/release/4.3.8/versions.cfg + solr.cfg + solr-4.10.x.cfg [instance] - eggs += collective.solr - + recipe = plone.recipe.zope2instance + http-address = 8080 + user = admin:admin + eggs = + Plone + collective.solr + + [versions] + collective.recipe.solrinstance = 5.3.2 After saving this to let's say ``buildout.cfg`` the buildout can be run and the `Solr`_ server and `Plone`_ instance started:: From 6a4efdfee26c61e6993dc6039f5c5ea094e1f049 Mon Sep 17 00:00:00 2001 From: Tom Gross Date: Tue, 31 May 2016 20:46:34 +0200 Subject: [PATCH 16/31] Fix indexing of new files - fixes #120) --- CHANGES.rst | 3 ++- src/collective/solr/indexer.py | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 49ded59f8..17534e842 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,7 +4,8 @@ Changelog 5.0.1 (unreleased) ------------------ -- Nothing changed yet. 
+- Fix BlobError occuring when indexing new files (fixes #120) + [tomgross] 5.0 (2016-04-13) diff --git a/src/collective/solr/indexer.py b/src/collective/solr/indexer.py index 08a1101b2..a27d9098c 100644 --- a/src/collective/solr/indexer.py +++ b/src/collective/solr/indexer.py @@ -9,6 +9,7 @@ from zope.component import queryAdapter, adapts from zope.interface import implements from zope.interface import Interface +from ZODB.interfaces import BlobError from ZODB.POSException import ConflictError from Products.CMFCore.utils import getToolByName from Products.CMFCore.CMFCatalogAware import CMFCatalogAware @@ -118,8 +119,12 @@ class BinaryAdder(DefaultAdder): def getpath(self): field = self.context.getPrimaryField() blob = field.get(self.context).blob - return blob.committed() or blob._p_blob_committed or \ - blob._p_blob_uncommitted + try: + path = blob.committed() + except BlobError: + path = blob._p_blob_committed or blob._p_blob_uncommitted + logger.debug('Indexing BLOB from path %s', path) + return path def __call__(self, conn, **data): if 'ZOPETESTCASE' in os.environ: From 8a1e092d01bd096d476789c26ff60f64ed2e5cf3 Mon Sep 17 00:00:00 2001 From: Tom Gross Date: Wed, 1 Jun 2016 06:15:57 +0000 Subject: [PATCH 17/31] Make extracting text from binary content and indexing 2 steps (#65) --- CHANGES.rst | 1 + setup.py | 1 + src/collective/solr/indexer.py | 27 ++++++++------------------- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 17534e842..e5c7904e3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -5,6 +5,7 @@ Changelog ------------------ - Fix BlobError occuring when indexing new files (fixes #120) +- Make extracting text from binary content and indexing 2 steps (#65) [tomgross] diff --git a/setup.py b/setup.py index 0d4f591cd..bcccb3104 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,7 @@ 'argparse', # we need to support Python 2.6 (Plone 4.x) 'collective.indexing >= 2.0a2', 'collective.js.showmore', + 'lxml', 
'plone.app.content', 'plone.app.controlpanel', 'plone.app.layout', diff --git a/src/collective/solr/indexer.py b/src/collective/solr/indexer.py index a27d9098c..778c90a98 100644 --- a/src/collective/solr/indexer.py +++ b/src/collective/solr/indexer.py @@ -2,6 +2,7 @@ import os from logging import getLogger +from lxml import etree from Acquisition import aq_get from DateTime import DateTime from datetime import date, datetime @@ -129,37 +130,25 @@ def getpath(self): def __call__(self, conn, **data): if 'ZOPETESTCASE' in os.environ: return super(BinaryAdder, self).__call__(conn, **data) - ignore = ('SearchableText', 'created', 'Type', 'links', - 'description', 'Date') + postdata = {} - for key, val in data.iteritems(): - if key in ignore: - continue - if isinstance(val, list) or isinstance(val, tuple): - newvalue = [] - for item in val: - if isinstance(item, unicode): - item = item.encode('utf-8') - newvalue.append(item) - else: - newvalue = val - postdata['literal.%s' % key] = newvalue postdata['stream.file'] = self.getpath() postdata['stream.contentType'] = data.get( 'content_type', 'application/octet-stream' ) - postdata['fmap.content'] = 'SearchableText' postdata['extractFormat'] = 'text' + postdata['extractOnly'] = 'true' url = '%s/update/extract' % conn.solrBase - try: - conn.doPost(url, urlencode(postdata, doseq=True), conn.formheaders) - conn.flush() + response = conn.doPost(url, urlencode(postdata, doseq=True), conn.formheaders) + root = etree.parse(response) + data['SearchableText'] = root.find('.//str').text.strip() except SolrConnectionException, e: logger.warn('Error %s @ %s', e, data['path_string']) - conn.reset() + data['SearchableText'] = '' + super(BinaryAdder, self).__call__(conn, **data) def boost_values(obj, data): From 70ae2055ca5255b5607fd4a9c3306a54ab839cb5 Mon Sep 17 00:00:00 2001 From: Tom Gross Date: Wed, 1 Jun 2016 13:08:19 +0000 Subject: [PATCH 18/31] remove obsolete condition to exclude code from tests --- src/collective/solr/indexer.py | 
5 ----- 1 file changed, 5 deletions(-) diff --git a/src/collective/solr/indexer.py b/src/collective/solr/indexer.py index 778c90a98..1773ced4b 100644 --- a/src/collective/solr/indexer.py +++ b/src/collective/solr/indexer.py @@ -1,6 +1,4 @@ # -*- coding: utf-8 -*- -import os - from logging import getLogger from lxml import etree from Acquisition import aq_get @@ -128,9 +126,6 @@ def getpath(self): return path def __call__(self, conn, **data): - if 'ZOPETESTCASE' in os.environ: - return super(BinaryAdder, self).__call__(conn, **data) - postdata = {} postdata['stream.file'] = self.getpath() postdata['stream.contentType'] = data.get( From bb41a9067ad302fef10d8463cd64f9a8cb56b63e Mon Sep 17 00:00:00 2001 From: Tom Gross Date: Wed, 1 Jun 2016 13:09:32 +0000 Subject: [PATCH 19/31] pep8 --- src/collective/solr/indexer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/collective/solr/indexer.py b/src/collective/solr/indexer.py index 1773ced4b..0793b1835 100644 --- a/src/collective/solr/indexer.py +++ b/src/collective/solr/indexer.py @@ -137,7 +137,8 @@ def __call__(self, conn, **data): url = '%s/update/extract' % conn.solrBase try: - response = conn.doPost(url, urlencode(postdata, doseq=True), conn.formheaders) + response = conn.doPost( + url, urlencode(postdata, doseq=True), conn.formheaders) root = etree.parse(response) data['SearchableText'] = root.find('.//str').text.strip() except SolrConnectionException, e: From 053ab265ff7ca6b0a67ac0f0a5a181cff6741301 Mon Sep 17 00:00:00 2001 From: Tom Gross Date: Thu, 2 Jun 2016 06:53:29 +0000 Subject: [PATCH 20/31] register search utility as local utility --- src/collective/solr/profiles/default/componentregistry.xml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/collective/solr/profiles/default/componentregistry.xml b/src/collective/solr/profiles/default/componentregistry.xml index 15ea83d41..5a37c1a1c 100644 --- a/src/collective/solr/profiles/default/componentregistry.xml +++ 
b/src/collective/solr/profiles/default/componentregistry.xml @@ -12,5 +12,8 @@ interface="collective.solr.interfaces.ISolrIndexQueueProcessor" name="solr" factory="collective.solr.indexer.SolrIndexProcessor"/> + From 340732d0daaed2dbcafa52e04cf886137ff3aa4f Mon Sep 17 00:00:00 2001 From: Tom Gross Date: Thu, 2 Jun 2016 14:12:46 +0000 Subject: [PATCH 21/31] fallback to catalog search, if no search utility is found --- src/collective/solr/dispatcher.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/collective/solr/dispatcher.py b/src/collective/solr/dispatcher.py index 1c8dfd317..fc3f96d35 100644 --- a/src/collective/solr/dispatcher.py +++ b/src/collective/solr/dispatcher.py @@ -13,6 +13,7 @@ from collective.solr.utils import isActive from collective.solr.utils import padResults from copy import deepcopy +from logging import getLogger from zope.component import queryMultiAdapter from zope.component import queryUtility from zope.component.hooks import getSite @@ -23,6 +24,9 @@ patchLazy() # ...as well as ZCatalog's Lazy class +logger = getLogger('collective.solr.dispatcher') + + class SearchDispatcher(object): """ adapter for potentially dispatching a given query to an alternative search backend (instead of the portal catalog) """ @@ -48,6 +52,9 @@ def solrSearchResults(request=None, **keywords): parameters with portal catalog semantics """ site = getSite() search = queryUtility(ISearch, context=site) + if search is None: + logger.warn('No search utility found in site %s', site) + raise FallBackException config = queryUtility(ISolrConnectionConfig, context=site) if request is None: From 3e582f61a27ae5b452395db61b178ed34ba57b48 Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Sat, 4 Jun 2016 20:30:23 +0200 Subject: [PATCH 22/31] Add missing changelog entries. 
--- CHANGES.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index e5c7904e3..8afdac7f5 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -5,9 +5,17 @@ Changelog ------------------ - Fix BlobError occuring when indexing new files (fixes #120) + [tomgross] + - Make extracting text from binary content and indexing 2 steps (#65) [tomgross] +- Make suggest search work when entering multiple search terms. + [jcharra] + +- Fix field-list export. + [gforcada] + 5.0 (2016-04-13) ---------------- From b8f03d2ca799125222a0646a9737dca54ffd43c5 Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Sat, 4 Jun 2016 20:39:20 +0200 Subject: [PATCH 23/31] Preparing release 5.0.1 --- CHANGES.rst | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 8afdac7f5..a20563473 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,7 @@ Changelog ========= -5.0.1 (unreleased) +5.0.1 (2016-06-04) ------------------ - Fix BlobError occuring when indexing new files (fixes #120) diff --git a/setup.py b/setup.py index bcccb3104..274758dde 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup from setuptools import find_packages -version = '5.0.1.dev0' +version = '5.0.1' long_description = \ open('README.rst').read() + '\n' + \ open('CHANGES.rst').read() + '\n' + \ From fc9217c15e5951bf21cc14e15e496e77a958db96 Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Sat, 4 Jun 2016 20:44:01 +0200 Subject: [PATCH 24/31] Back to development: 5.0.2 --- CHANGES.rst | 6 ++++++ setup.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index a20563473..a48902ce1 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changelog ========= +5.0.2 (unreleased) +------------------ + +- Nothing changed yet. 
+ + 5.0.1 (2016-06-04) ------------------ diff --git a/setup.py b/setup.py index 274758dde..6796a197e 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup from setuptools import find_packages -version = '5.0.1' +version = '5.0.2.dev0' long_description = \ open('README.rst').read() + '\n' + \ open('CHANGES.rst').read() + '\n' + \ From 01fded18b66d4e9410b4628ce43ef2329e3165be Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Sat, 4 Jun 2016 20:55:40 +0200 Subject: [PATCH 25/31] Fix README formatting. --- CHANGES.rst | 3 ++- setup.py | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index a48902ce1..e00013f38 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,7 +4,8 @@ Changelog 5.0.2 (unreleased) ------------------ -- Nothing changed yet. +- Fix README formatting. + [timo] 5.0.1 (2016-06-04) diff --git a/setup.py b/setup.py index 6796a197e..680b39c19 100644 --- a/setup.py +++ b/setup.py @@ -3,11 +3,12 @@ from setuptools import find_packages version = '5.0.2.dev0' -long_description = \ - open('README.rst').read() + '\n' + \ - open('CHANGES.rst').read() + '\n' + \ - open(os.path.join('docs', 'credits.rst')).read() + '\n' + \ +long_description = ( + open('README.rst').read() + '\n' + + open('CHANGES.rst').read() + '\n' + + open(os.path.join('docs', 'credits.rst')).read() + '\n' + open(os.path.join('docs', 'contributors.rst')).read(), +) setup( name='collective.solr', From 4405f2fa2d98a6a36e420382b0ee034f37a9d696 Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Sat, 4 Jun 2016 21:03:06 +0200 Subject: [PATCH 26/31] Preparing release 5.0.2 --- CHANGES.rst | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index e00013f38..1e1ac86ee 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,7 @@ Changelog ========= -5.0.2 (unreleased) +5.0.2 (2016-06-04) ------------------ - Fix README formatting. 
diff --git a/setup.py b/setup.py index 680b39c19..5521dcd55 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup from setuptools import find_packages -version = '5.0.2.dev0' +version = '5.0.2' long_description = ( open('README.rst').read() + '\n' + open('CHANGES.rst').read() + '\n' + From 76ee74a7bfec1e12abdc716bf6ac103ec6679b7a Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Sat, 4 Jun 2016 21:14:24 +0200 Subject: [PATCH 27/31] Back to development: 5.0.3 --- CHANGES.rst | 6 ++++++ setup.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 1e1ac86ee..2d552aea5 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changelog ========= +5.0.3 (unreleased) +------------------ + +- Nothing changed yet. + + 5.0.2 (2016-06-04) ------------------ diff --git a/setup.py b/setup.py index 5521dcd55..ec30963d0 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup from setuptools import find_packages -version = '5.0.2' +version = '5.0.3.dev0' long_description = ( open('README.rst').read() + '\n' + open('CHANGES.rst').read() + '\n' + From 8e5323eeb3317e5d9bd83e708a2d4b21f644ceae Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Sun, 5 Jun 2016 09:54:44 +0200 Subject: [PATCH 28/31] Simplify long_description. 
--- setup.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index ec30963d0..624b941e4 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,12 @@ -import os.path from setuptools import setup from setuptools import find_packages version = '5.0.3.dev0' -long_description = ( - open('README.rst').read() + '\n' + - open('CHANGES.rst').read() + '\n' + - open(os.path.join('docs', 'credits.rst')).read() + '\n' + - open(os.path.join('docs', 'contributors.rst')).read(), -) + +long_description = '\n\n'.join([ + open('README.rst').read(), + open('CHANGES.rst').read(), +]) setup( name='collective.solr', From 745c03ec7a32c431f741bf7815d22113a42b55aa Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Sun, 5 Jun 2016 09:56:18 +0200 Subject: [PATCH 29/31] Add changelog entry for pypi fix. --- CHANGES.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 2d552aea5..6a8a0868a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,7 +4,8 @@ Changelog 5.0.3 (unreleased) ------------------ -- Nothing changed yet. +- Fix Pypi page. + [timo] 5.0.2 (2016-06-04) From e64232c24369a82b199b727325fe33d3848da0b9 Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Sun, 5 Jun 2016 09:58:29 +0200 Subject: [PATCH 30/31] Preparing release 5.0.3 --- CHANGES.rst | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 6a8a0868a..9e18b83bb 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,7 @@ Changelog ========= -5.0.3 (unreleased) +5.0.3 (2016-06-05) ------------------ - Fix Pypi page. 
diff --git a/setup.py b/setup.py index 624b941e4..8673a341c 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup from setuptools import find_packages -version = '5.0.3.dev0' +version = '5.0.3' long_description = '\n\n'.join([ open('README.rst').read(), From 11edb108ee21e794ea54f5e4830da7ea2df40dfa Mon Sep 17 00:00:00 2001 From: Timo Stollenwerk Date: Sun, 5 Jun 2016 10:07:56 +0200 Subject: [PATCH 31/31] Back to development: 5.0.4 --- CHANGES.rst | 6 ++++++ setup.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 9e18b83bb..ea2cfe163 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changelog ========= +5.0.4 (unreleased) +------------------ + +- Nothing changed yet. + + 5.0.3 (2016-06-05) ------------------ diff --git a/setup.py b/setup.py index 8673a341c..4ec4dd079 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup from setuptools import find_packages -version = '5.0.3' +version = '5.0.4.dev0' long_description = '\n\n'.join([ open('README.rst').read(),