Skip to content

Commit

Permalink
parsing: Add unescaped_search_in_between function
Browse files Browse the repository at this point in the history
Searches for a string enclosed between a begin- and end-sequence.
Handles escaped sequences.

This commit includes a full test set.

Partially fixes #160
  • Loading branch information
Makman2 authored and sils committed Mar 17, 2015
1 parent d34beb3 commit efcda40
Show file tree
Hide file tree
Showing 2 changed files with 192 additions and 0 deletions.
69 changes: 69 additions & 0 deletions coalib/parsing/StringProcessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,72 @@ def search_in_between(begin,
for elem in matches:
yield elem.group(compiled_begin_pattern.groups + 1)


def unescaped_search_in_between(begin,
                                end,
                                string,
                                max_matches=0,
                                remove_empty_matches=False):
    """
    Yields every substring of 'string' that is enclosed between an unescaped
    begin-sequence and the next unescaped end-sequence.

    Newlines enclosed between the two sequences are part of the result. A
    begin- or end-sequence counts as escaped when it is directly preceded by
    an odd number of backslashes; such occurrences are not treated as
    delimiters. An even-length run of backslashes standing directly before
    the end-sequence is appended to the yielded string.

    This function is a generator.

    CAUTION: Using the backslash character inside the begin- or end-sequence
             can lead to strange results, because it can interfere with the
             escaping regex used internally to match the enclosed string.

    :param begin:                The begin-sequence where to start matching.
                                 Providing regexes (and not only fixed
                                 strings) is allowed.
    :param end:                  The end-sequence where to end matching.
                                 Providing regexes (and not only fixed
                                 strings) is allowed.
    :param string:               The string where to search in.
    :param max_matches:          Defines the maximum number of matches. If 0
                                 or less is provided, the number of matches
                                 is not limited.
    :param remove_empty_matches: Defines whether empty entries should be
                                 removed from the result.
    :return:                     An iterator returning the matched strings.
    """
    # 'begin' may define capturing groups of its own. Compiling it tells us
    # how many, so the two groups added below can be addressed by index.
    content_group = re.compile(begin).groups + 1
    trailing_escapes_group = content_group + 1

    # Opening part: "(?<!\\)(?:\\\\)*" only lets the following sequence match
    # when it is preceded by an even number of backslashes (i.e. it is not
    # escaped); 'begin' itself is wrapped in a non-capturing group.
    opening = r"(?<!\\)(?:\\\\)*(?:" + begin + r")"

    # Closing part: the same unescaping prefix, but this time the run of
    # double backslashes is captured so it can be handed back to the caller,
    # followed by 'end' in a non-capturing group.
    closing = r"(?<!\\)((?:\\\\)*)(?:" + end + r")"

    # The enclosed text is matched lazily, so the first unescaped
    # end-sequence after 'begin' terminates the match.
    pattern = opening + r"(.*?)" + closing

    # DOTALL lets "." span line breaks.
    found = re.finditer(pattern, string, re.DOTALL)

    if remove_empty_matches:
        found = trim_empty_matches(found,
                                   [content_group, trailing_escapes_group])

    for match in limit(found, max_matches):
        yield (match.group(content_group) +
               match.group(trailing_escapes_group))

123 changes: 123 additions & 0 deletions coalib/tests/parsing/StringProcessingTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from coalib.parsing.StringProcessing import split
from coalib.parsing.StringProcessing import unescaped_split
from coalib.parsing.StringProcessing import search_in_between
from coalib.parsing.StringProcessing import unescaped_search_in_between


class StringProcessingTest(unittest.TestCase):
Expand Down Expand Up @@ -67,6 +68,7 @@ def setUp(self):
self.setUp_split()
self.setUp_unescaped_split()
self.setUp_search_in_between()
self.setUp_unescaped_search_in_between()

def setUp_search_for(self):
# Match either "out1" or "out2".
Expand Down Expand Up @@ -265,6 +267,55 @@ def setUp_search_in_between(self):
[],
[r"a"]]

def setUp_unescaped_search_in_between(self):
    """
    Prepares the patterns and expected results used by the
    unescaped_search_in_between() tests.

    Reads self.bs and stores every fixture as an attribute on self.
    """
    # --- Basic test: a single quote is both begin- and end-sequence. ---
    self.test_unescaped_search_in_between_pattern = "'"

    # One single-element result per "escape" test case.
    escape_cases = [
        [r"escaped-escape: \\ "],
        [r"escaped-quote: \' "],
        [r"escaped-anything: \X "],
        [r"two escaped escapes: \\\\ "],
        [r"escaped-quote at end: \'"],
        [r"escaped-escape at end: " + 2 * self.bs],
    ]
    # Seven consecutive cases yield the same two strings.
    two_string_cases = [[r"str1", r"str2"] for _ in range(7)]
    three_string_case = [[r"str1", r"str2", r"str3"]]
    # The last four cases yield nothing.
    no_match_cases = [[] for _ in range(4)]

    self.test_unescaped_search_in_between_expected_results = (
        escape_cases + two_string_cases + three_string_case + no_match_cases)

    # --- max_match test: reuses pattern and results of the basic test. ---
    self.test_unescaped_search_in_between_max_match_pattern = (
        self.test_unescaped_search_in_between_pattern)
    self.test_unescaped_search_in_between_max_match_expected_m_results = (
        self.test_unescaped_search_in_between_expected_results)

    # --- Regex pattern test: one expected list per entry. ---
    self.test_unescaped_search_in_between_regex_pattern_expected = [
        [r""],
        [r"c"],
        [r"c", r"bc\+'**'"],
        [r"\13q4ujsabbc"],
        [r"\\13q4ujsabbc\+'**'ac", r"."],
        5 * [r""] + [r"c\+'**'", r"", r"", r"-"],
        [r"cba###\\13q4ujs"],
        [],
    ]

    # --- remove_empty_matches test: a semicolon is the delimiter. ---
    self.test_unescaped_search_in_between_auto_trim_pattern = ";"

    auto_trim_expected = [
        [],
        [r"\\\\\;\\#", r"+ios"],
    ]
    auto_trim_expected += [[r"2", r"4", r"6"] for _ in range(2)]
    auto_trim_expected += [[] for _ in range(4)]
    auto_trim_expected.append([r"a"])
    self.test_unescaped_search_in_between_auto_trim_expected_results = (
        auto_trim_expected)

def assertSearchForResultEqual(self,
pattern,
test_strings,
Expand Down Expand Up @@ -553,6 +604,78 @@ def test_search_in_between_auto_trim(self):
self.assertIteratorElementsEqual(iter(expected_results[i]),
return_value)

# Test the basic unescaped_search_in_between() functionality.
def test_unescaped_search_in_between(self):
    # The same sequence serves as begin- and end-delimiter.
    delimiter = self.test_unescaped_search_in_between_pattern
    expectations = self.test_unescaped_search_in_between_expected_results

    # There must be exactly one expectation per test string, otherwise the
    # pairing below would silently drop cases.
    self.assertEqual(len(expectations), len(self.test_strings))

    for expected, test_string in zip(expectations, self.test_strings):
        found = unescaped_search_in_between(delimiter,
                                            delimiter,
                                            test_string)
        self.assertIteratorElementsEqual(iter(expected), found)

# Test the unescaped_search_in_between() while varying the max_match
# parameter.
def test_unescaped_search_in_between_max_match(self):
    # The same sequence serves as begin- and end-delimiter.
    delimiter = self.test_unescaped_search_in_between_max_match_pattern
    master_results = (
        self.test_unescaped_search_in_between_max_match_expected_m_results)

    for match_limit in (1, 2, 3, 4, 5, 67):
        # With a limit of n only the first n unlimited results may show up.
        truncated_results = [full_result[:match_limit]
                             for full_result in master_results]

        # One expectation per test string, so the pairing drops nothing.
        self.assertEqual(len(truncated_results), len(self.test_strings))

        for expected, test_string in zip(truncated_results,
                                         self.test_strings):
            found = unescaped_search_in_between(delimiter,
                                                delimiter,
                                                test_string,
                                                match_limit)
            self.assertIteratorElementsEqual(iter(expected), found)

# Test the unescaped_search_in_between() function with different regex
# patterns.
def test_unescaped_search_in_between_regex_pattern(self):
    expectations = (
        self.test_unescaped_search_in_between_regex_pattern_expected)

    # One expectation per regex pattern, so the pairing drops nothing.
    self.assertEqual(len(expectations), len(self.multi_patterns))

    for expected, pattern in zip(expectations, self.multi_patterns):
        # Each pattern acts as begin- and end-sequence at the same time.
        found = unescaped_search_in_between(pattern,
                                            pattern,
                                            self.multi_pattern_test_string)
        self.assertIteratorElementsEqual(iter(expected), found)

# Test the unescaped_search_in_between() function for its
# remove_empty_matches feature.
def test_unescaped_search_in_between_auto_trim(self):
    # The same sequence serves as begin- and end-delimiter.
    delimiter = self.test_unescaped_search_in_between_auto_trim_pattern
    expectations = (
        self.test_unescaped_search_in_between_auto_trim_expected_results)

    # One expectation per test string, so the pairing drops nothing.
    self.assertEqual(len(expectations), len(self.auto_trim_test_strings))

    for expected, test_string in zip(expectations,
                                     self.auto_trim_test_strings):
        # No match limit, but empty matches are filtered out.
        found = unescaped_search_in_between(delimiter,
                                            delimiter,
                                            test_string,
                                            max_matches=0,
                                            remove_empty_matches=True)
        self.assertIteratorElementsEqual(iter(expected), found)


# When this module is executed directly, run its tests with verbose
# (per-test) output.
if __name__ == '__main__':
    unittest.main(verbosity=2)
Expand Down

0 comments on commit efcda40

Please sign in to comment.