From efcda40adf5fcba8b71840a22dac601bedf342b4 Mon Sep 17 00:00:00 2001 From: Makman2 Date: Mon, 2 Mar 2015 19:02:17 +0100 Subject: [PATCH] parsing: Add unescaped_search_in_between function Searches for a string enclosed between a begin- and end-sequence. Handles escaped sequences. This commit includes a full test set. Partially fixes https://github.com/coala-analyzer/coala/issues/160 --- coalib/parsing/StringProcessing.py | 69 +++++++++++ coalib/tests/parsing/StringProcessingTest.py | 123 +++++++++++++++++++ 2 files changed, 192 insertions(+) diff --git a/coalib/parsing/StringProcessing.py b/coalib/parsing/StringProcessing.py index d43d0af593..45dde12104 100644 --- a/coalib/parsing/StringProcessing.py +++ b/coalib/parsing/StringProcessing.py @@ -240,3 +240,72 @@ def search_in_between(begin, for elem in matches: yield elem.group(compiled_begin_pattern.groups + 1) + +def unescaped_search_in_between(begin, + end, + string, + max_matches = 0, + remove_empty_matches = False): + """ + Searches for a string enclosed between a specified begin- and end-sequence. + Also enclosed \n are put into the result. + Handles escaped begin- and end-sequences (and so only patterns that are + unescaped). + This function is a generator. + CAUTION: Using the escaped character '\' in the begin- or end-sequences + the function can return strange results. The backslash can + interfere with the escaping regex-sequence used internally to + match the enclosed string. + + :param begin: The begin-sequence where to start matching. + Providing regexes (and not only fixed strings) + is allowed. + :param end: The end-sequence where to end matching. + Providing regexes (and not only fixed strings) + is allowed. + :param string: The string where to search in. + :param max_matches Defines the maximum number of matches. If 0 or + less is provided, the number of splits is not + limited. + :param remove_empty_matches: Defines whether empty entries should + be removed from the result. + :return: An iterator returning the matched strings. + """ + # Compilation of the begin sequence is needed to get the number of + # capturing groups in it. + compiled_begin_pattern = re.compile(begin) + + # Regex explanation: + # 1. (?