Merge pull request #1081 from untergeek/feature/1044

Add pattern feature for count filter
elastic · Oct 12, 2017 · ce58168 · ce58168
2 parents 3d70de7 + 2e88c11
commit ce58168
Show file tree

Hide file tree

Showing 11 changed files with 284 additions and 79 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -13,7 +13,7 @@ env:
   - ES_VERSION=5.3.3
   - ES_VERSION=5.4.3
   - ES_VERSION=5.5.2
-  - ES_VERSION=5.6.2
+  - ES_VERSION=5.6.3
 
 os: linux
 

diff --git a/curator/defaults/filter_elements.py b/curator/defaults/filter_elements.py
@@ -61,6 +61,11 @@ def max_num_segments(**kwargs):
         Required('max_num_segments'): All(Coerce(int), Range(min=1))
     }
 
+def pattern(**kwargs):
+    return {
+        Optional('pattern'): Any(str, unicode)
+    }
+
 def range_from(**kwargs):
     return { Required('range_from'): Coerce(int) }
 

diff --git a/curator/defaults/filtertypes.py b/curator/defaults/filtertypes.py
@@ -71,6 +71,7 @@ def count(action, config):
     retval = [
         filter_elements.count(),
         filter_elements.use_age(),
+        filter_elements.pattern(),
         filter_elements.reverse(),
         filter_elements.exclude(exclude=True),
     ]

diff --git a/curator/defaults/settings.py b/curator/defaults/settings.py
@@ -111,6 +111,7 @@ def structural_filter_elements():
         Optional('key'): Any(str, unicode),
         Optional('kind'): Any(str, unicode),
         Optional('max_num_segments'): Coerce(int),
+        Optional('pattern'): Any(str, unicode),
         Optional('reverse'): Any(int, str, unicode, bool, None),
         Optional('range_from'): Coerce(int),
         Optional('range_to'): Coerce(int),

diff --git a/curator/indexlist.py b/curator/indexlist.py
@@ -1,6 +1,7 @@
 from datetime import timedelta, datetime, date
 import time
 import re
+import itertools
 import logging
 import elasticsearch
 from .defaults import settings
@@ -766,7 +767,7 @@ def filter_by_alias(self, aliases=None, exclude=False):
                 self.__excludify(condition, exclude, index, msg)
 
     def filter_by_count(
-        self, count=None, reverse=True, use_age=False,
+        self, count=None, reverse=True, use_age=False, pattern=None,
         source='creation_date', timestring=None, field=None,
         stats_result='min_value', exclude=True):
         """
@@ -791,6 +792,16 @@ def filter_by_count(
         :arg reverse: The filtering direction. (default: `True`).
         :arg use_age: Sort indices by age.  ``source`` is required in this
             case.
+        :arg pattern: Select indices to count from a regular expression 
+            pattern.  This pattern must have one and only one capture group.
+            This can allow a single ``count`` filter instance to operate against
+            any number of matching patterns, and keep ``count`` of each index
+            in that group.  For example, given a ``pattern`` of ``'^(.*)-\d{6}$'``,
+            it will match both ``rollover-000001`` and ``index-999990``, but not 
+            ``logstash-2017.10.12``.  Following the same example, if my cluster
+            also had ``rollover-000002`` through ``rollover-000010`` and
+            ``index-888888`` through ``index-999999``, it will process both
+            groups of indices, and include or exclude the ``count`` of each.
         :arg source: Source of index age. Can be one of ``name``,
             ``creation_date``, or ``field_stats``. Default: ``creation_date``
         :arg timestring: An strftime string to match the datestamp in an index
@@ -811,35 +822,72 @@ def filter_by_count(
 
         # Create a copy-by-value working list
         working_list = self.working_list()
-
-        if use_age:
-            if source != 'name':
-                self.loggit.warn(
-                    'Cannot get age information from closed indices unless '
-                    'source="name".  Omitting any closed indices.'
+        if pattern:
+            try:
+                r = re.compile(pattern)
+                if r.groups < 1:
+                    raise ConfigurationError('No regular expression group found in {0}'.format(pattern))
+                elif r.groups > 1:
+                    raise ConfigurationError('More than 1 regular expression group found in {0}'.format(pattern))
+                # Prune indices not matching the regular expression the object (and filtered_indices)
+                # We do not want to act on them by accident.
+                prune_these = list(filter(lambda x: r.match(x) is None, working_list))
+                filtered_indices = working_list
+                for index in prune_these:
+                    msg = (
+                        '{0} does not match regular expression {1}.'.format(
+                            index, pattern
+                        )
+                    )
+                    condition = True
+                    exclude = True
+                    self.__excludify(condition, exclude, index, msg)
+                    # also remove it from filtered_indices
+                    filtered_indices.remove(index)
+                # Presort these filtered_indices using the lambda
+                presorted = sorted(filtered_indices, key=lambda x: r.match(x).group(1))
+            except Exception as e:
+                raise ActionError('Unable to process pattern: "{0}". Error: {1}'.format(pattern, e))
+            # Initialize groups here
+            groups = []
+            # We have to pull keys k this way, but we don't need to keep them
+            # We only need g for groups
+            for k, g in itertools.groupby(presorted, key=lambda x: r.match(x).group(1)):
+                groups.append(list(g))
+        else:
+            # Since pattern will create a list of lists, and we iterate over that,
+            # we need to put our single list inside a list
+            groups = [ working_list ]
+        for group in groups:
+            if use_age:
+                if source != 'name':
+                    self.loggit.warn(
+                        'Cannot get age information from closed indices unless '
+                        'source="name".  Omitting any closed indices.'
+                    )
+                    self.filter_closed()
+                self._calculate_ages(
+                    source=source, timestring=timestring, field=field,
+                    stats_result=stats_result
                 )
-                self.filter_closed()
-            self._calculate_ages(
-                source=source, timestring=timestring, field=field,
-                stats_result=stats_result
-            )
-            # Using default value of reverse=True in self._sort_by_age()
-            sorted_indices = self._sort_by_age(working_list, reverse=reverse)
+                # Using default value of reverse=True in self._sort_by_age()
+                sorted_indices = self._sort_by_age(group, reverse=reverse)
 
-        else:
-            # Default to sorting by index name
-            sorted_indices = sorted(working_list, reverse=reverse)
+            else:
+                # Default to sorting by index name
+                sorted_indices = sorted(group, reverse=reverse)
 
-        idx = 1
-        for index in sorted_indices:
-            msg = (
-                '{0} is {1} of specified count of {2}.'.format(
-                    index, idx, count
+
+            idx = 1
+            for index in sorted_indices:
+                msg = (
+                    '{0} is {1} of specified count of {2}.'.format(
+                        index, idx, count
+                    )
                 )
-            )
-            condition = True if idx <= count else False
-            self.__excludify(condition, exclude, index, msg)
-            idx += 1
+                condition = True if idx <= count else False
+                self.__excludify(condition, exclude, index, msg)
+                idx += 1
 
     def filter_period(
         self, source='name', range_from=None, range_to=None, timestring=None,

diff --git a/docs/Changelog.rst b/docs/Changelog.rst
@@ -14,6 +14,8 @@ Changelog
     Requested in #1045. (untergeek)
   * Add a ``restore`` function to ``curator_cli`` singleton. Mentioned in
     #851 (alexef)
+  * Add ``pattern`` to the ``count`` filter.  This is particularly useful
+    when working with rollover indices.  Requested in #1044 (untergeek)
 
 **Bug Fixes**
 

diff --git a/docs/asciidoc/filter_elements.asciidoc b/docs/asciidoc/filter_elements.asciidoc
@@ -415,6 +415,43 @@ will be raised, and execution will halt.
 
 
 
+[[fe_pattern]]
+== pattern
+
+NOTE: This setting is only used with the <<filtertype_count,count>> filtertype
+
+[source,yaml]
+-------------
+- filtertype: count
+  count: 1
+  pattern: '^(.*)-\d{6}$'
+  reverse: true
+-------------
+
+This particular example will match indices following the basic rollover pattern
+of `indexname-######`, and keep the highest numbered index for each group.
+
+For example, given indices `a-000001`, `a-000002`, `a-000003`, `b-000006`,
+and `b-000007`, the indices that would be matched are `a-000003` and `b-000007`.
+Indices that do not match the regular expression in `pattern` will be
+automatically excluded.
+
+This is particularly useful with indices created and managed using the
+{ref}/indices-rollover-index.html[Rollover API], as you can select only the
+active indices with the above example (<<fe_exclude,`exclude`>> defaults to `False`).
+Setting <<fe_exclude,`exclude`>> to `True` with the above example will _remove_
+the active rollover indices, leaving only those which have been rolled-over.
+
+While this is perhaps most useful for the aforementioned scenario, it can
+also be used with age-based indices.
+
+Items will remain in the actionable list depending on the value of
+<<fe_exclude,exclude>>, and <<fe_reverse,reverse>>.
+
+There is no default value. The value must either be left empty or include exactly
+one capture group, defined by parentheses.  If a value is provided and it does not
+contain exactly one capture group, an exception will be raised and execution will halt.
+
 [[fe_range_from]]
 == range_from
 

diff --git a/docs/asciidoc/filters.asciidoc b/docs/asciidoc/filters.asciidoc
@@ -305,6 +305,33 @@ All of the age-related settings from the <<filtertype_age,`age`>> filter are
 supported, and the same restrictions apply with regard to filtering indices vs.
 snapshots.
 
+=== Pattern-based sorting
+
+[source,yaml]
+-------------
+- filtertype: count
+  count: 1
+  pattern: '^(.*)-\d{6}$'
+  reverse: true
+-------------
+
+This particular example will match indices following the basic rollover pattern
+of `indexname-######`, and keep the highest numbered index for each group.
+
+For example, given indices `a-000001`, `a-000002`, `a-000003`, `b-000006`,
+and `b-000007`, the indices that would be matched are `a-000003` and `b-000007`.
+Indices that do not match the regular expression in `pattern` will be
+automatically excluded.
+
+This is particularly useful with indices created and managed using the
+{ref}/indices-rollover-index.html[Rollover API], as you can select only the
+active indices with the above example (<<fe_exclude,`exclude`>> defaults to `False`).
+Setting <<fe_exclude,`exclude`>> to `True` with the above example will _remove_
+the active rollover indices, leaving only those which have been rolled-over.
+
+While this is perhaps most useful for the aforementioned scenario, it can
+also be used with age-based indices.
+
 === Reversing sorting
 
 Using the default configuration, <<fe_reverse,`reverse`>> is `True`.  Given
@@ -398,6 +425,7 @@ removed from the actionable list, leaving `index-2017.03.03`,
 
 * <<fe_reverse,reverse>>
 * <<fe_use_age,use_age>>
+* <<fe_pattern,pattern>>
 * <<fe_source,source>> (required if `use_age` is `True`)
 * <<fe_timestring,timestring>> (required if `source` is `name`)
 * <<fe_exclude,exclude>> (default is `False`)

diff --git a/test/integration/test_count_pattern.py b/test/integration/test_count_pattern.py
@@ -0,0 +1,99 @@
+import elasticsearch
+import curator
+import os
+import json
+import string, random, tempfile
+import time
+from click import testing as clicktest
+from mock import patch, Mock
+import unittest
+from . import CuratorTestCase
+from . import testvars as testvars
+
+import logging
+logger = logging.getLogger(__name__)
+
+host, port = os.environ.get('TEST_ES_SERVER', 'localhost:9200').split(':')
+port = int(port) if port else 9200
+# '      - filtertype: {0}\n'
+# '        source: {1}\n'
+# '        direction: {2}\n'
+# '        timestring: {3}\n'
+# '        unit: {4}\n'
+# '        unit_count: {5}\n'
+# '        field: {6}\n'
+# '        stats_result: {7}\n'
+# '        epoch: {8}\n')
+
+global_client = elasticsearch.Elasticsearch(host=host, port=port)
+
+delete_count_pattern = ('---\n'
+'actions:\n'
+'  1:\n'
+'    description: "Delete indices as filtered"\n'
+'    action: delete_indices\n'
+'    options:\n'
+'      continue_if_exception: False\n'
+'      disable_action: False\n'
+'    filters:\n'
+'      - filtertype: count\n'
+'        pattern: {0}\n'
+'        use_age: {1}\n'
+'        source: {2}\n'
+'        timestring: {3}\n'
+'        reverse: {4}\n'
+'        count: {5}\n')
+
+class TestCLICountPattern(CuratorTestCase):
+    def test_match_proper_indices(self):
+        for i in range(1, 4):
+            self.create_index('a-{0}'.format(i))
+        for i in range(4, 7):
+            self.create_index('b-{0}'.format(i))
+        for i in range(5, 9):
+            self.create_index('c-{0}'.format(i))
+        self.create_index('not_a_match')
+        self.write_config(
+            self.args['configfile'], testvars.client_config.format(host, port))
+        self.write_config(
+            self.args['actionfile'],
+            delete_count_pattern.format(
+                '\'^(a|b|c)-\d$\'', 'false', 'name', '\'%Y.%m.%d\'', 'true', 1
+            )
+        )
+        test = clicktest.CliRunner()
+        result = test.invoke(
+            curator.cli,
+            [
+                '--config', self.args['configfile'],
+                self.args['actionfile']
+            ],
+        )
+        indices = sorted(list(self.client.indices.get('_all')))
+        self.assertEquals(['a-3', 'b-6', 'c-8', 'not_a_match'], indices)
+    def test_match_proper_indices_by_age(self):
+        self.create_index('a-2017.10.01')
+        self.create_index('a-2017.10.02')
+        self.create_index('a-2017.10.03')
+        self.create_index('b-2017.09.01')
+        self.create_index('b-2017.09.02')
+        self.create_index('b-2017.09.03')
+        self.create_index('not_a_match')
+        self.write_config(
+            self.args['configfile'], testvars.client_config.format(host, port))
+        self.write_config(
+            self.args['actionfile'],
+            delete_count_pattern.format(
+                '\'^(a|b)-\d{4}\.\d{2}\.\d{2}$\'', 'true', 'name', '\'%Y.%m.%d\'', 'true', 1
+            )
+        )
+        test = clicktest.CliRunner()
+        result = test.invoke(
+            curator.cli,
+            [
+                '--config', self.args['configfile'],
+                self.args['actionfile']
+            ],
+        )
+        indices = sorted(list(self.client.indices.get('_all')))
+        self.assertEquals(['a-2017.10.03', 'b-2017.09.03', 'not_a_match'], indices)
diff --git a/test/integration/testvars.py b/test/integration/testvars.py
@@ -410,6 +410,7 @@
                 '        unit_count: {5}\n'
                 '        unit_count_pattern: {6}\n')
 
+
 delete_period_proto = ('---\n'
 'actions:\n'
 '  1:\n'