Started to work on the regex reverse-engineering phase.

git-svn-id: http://code.djangoproject.com/svn/django/trunk@7850 bcc190cf-cafb-0310-a4f2-bffc1f526a37
commit c2f753a8bae369e39234ea50b23c228fee3de70c (1 parent: c88e55b)
@malcolmt authored
Showing with 123 additions and 0 deletions.
  1. +123 −0 django/utils/regex_helper.py
123 django/utils/regex_helper.py
@@ -0,0 +1,123 @@
+"""
+Functions for reversing a regular expression (used in reverse URL resolving).
+
+This is not, and is not intended to be, a complete reg-exp decompiler. It
+should be good enough for almost all sane URLs.
+"""
+
+import re
+from bisect import bisect
+
+GROUP_CLASS = re.compile(r'''\((?:
+ (?P<positional>[^?])| # Unnamed (positional) capturing group.
+ \?(?:
+ P<(?P<named>[\w]+)>(?P<contents>.*)| # Named capturing group.
+ P=(?P<repeat>.+)| # Repeat of a previous named group.
+ (?P<grouping>:)| # Non-capturing grouping parens.
+ (?P<comment>\#)| # Comment group
+ (?P<illegal>.) # Anything else (which will be an error)
+ )
+ ).*\)''', re.VERBOSE)
+
+def normalize(pattern):
+ """
+ Given a reg-exp pattern, normalizes it to a list of forms that suffice for
+ reverse matching. This does the following:
+
+ (1) For any repeating sections, keeps the minimum number of occurrences
+ permitted (this means zero for optional groups).
+ (2) If an optional group includes parameters, include one occurrence of
+ that group (along with the zero occurrence case from step (1)).
+ (3) Select the first (essentially an arbitrary) element from any character
+ class. Select an arbitrary character for any unordered class (e.g. '.' or
+ '\w') in the pattern.
+ (4) Take the first alternative in any '|' division, unless other
+ alternatives would involve different parameters.
+ (5) Ignore comments. Error on all other non-capturing (?...) forms (e.g.
+ look-ahead and look-behind matches).
+
+ Returns a list of tuples, each tuple containing (a) a pattern, (b) the
+ number of parameters, (c) the names of the parameters. Any unnamed
+ parameters are called '_0', '_1', etc.
+ """
+ # Do a linear scan to work out the special features of this pattern. The
+ # idea is that we scan once here and collect all the information we need to
+ # make future decisions.
+ groups = [] # (start, end)
+ quantifiers = [] # start pos
+ ranges = [] # (start, end)
+ eols = [] # pos
+ disjunctions = [] # pos
+ unclosed_groups = []
+ unclosed_ranges = []
+ escaped = False
+ quantify = False
+ in_range = False
+ for pos, c in enumerate(pattern):
+ if in_range and c != ']' or (c == ']' and
+ unclosed_ranges[-1] == pos - 1):
+ continue
+ elif c == '[':
+ unclosed_ranges.append(pos)
+ elif c == ']':
+ ranges.append((unclosed_ranges.pop(), pos + 1))
+ in_range = False
+ elif c == '.':
+ # Treat this as a one-character long range:
+ ranges.append((pos, pos + 1))
+ elif escaped or c == '\\':
+ escaped = not escaped
+ elif c == '(':
+ unclosed_groups.append(pos)
+ elif c == ')':
+ groups.append((unclosed_groups.pop(), pos + 1))
+ elif quantify and c == '?':
+ quantify = False
+ elif c in '?*+{':
+ quantifiers.append(pos)
+ quantify = True
+ elif c == '$':
+ eols.append(pos)
+ elif c == '|':
+ disjunctions.append(pos)
+
+ # Now classify each of the parenthetical groups to work out which ones take
+ # parameters. Only the outer-most of a set of nested capturing groups is
+ # important.
+ groups.sort()
+ params = []
+ comments = []
+ last_end = 0
+ for start, end in groups:
+ if start < last_end:
+ # Skip over inner nested capturing groups.
+ continue
+ m = GROUP_CLASS.match(pattern, start)
+ if m.group('positional'):
+ params.append((start, end, '_%d' % len(params), start + 1))
+ elif m.group('named'):
+ params.append((start, end, m.group('named'), m.start('contents')))
+ elif m.group('repeat'):
+ params.append((start, end, m.group('repeat'), start + 1))
+ elif m.group('illegal'):
+ raise ValueError('The pattern construct %r is not valid here.'
+ % pattern[start:end])
+ elif m.group('comment'):
+ comments.extend([start, end])
+ else:
+ # This is a non-capturing set, so nesting prohibitions don't apply
+ # to any inner groups.
+ continue
+ last_end = end
+
+ # XXX: Got to here!
+ results = []
+ end = groups[0][0]
+ # The first bit, before the first group starts.
+ if end == 0:
+ # FIXME: don't want to handle this case just yet.
+ raise Exception
+
+ quant_end = bisect(quantifiers, end)
+ range_end = bisect(ranges, end)
+ dis_end = bisect(disjunctions, end)
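
The GROUP_CLASS pattern above is what classifies each parenthesised construct into exactly one of the named alternatives. A minimal sketch of that classification, assuming the module in this commit is importable as django.utils.regex_helper; the sample patterns are made up for illustration:

from django.utils.regex_helper import GROUP_CLASS

samples = [
    r'([0-9]+)',          # unnamed capturing group   -> 'positional'
    r'(?P<slug>[-\w]+)',  # named capturing group     -> 'named'
    r'(?P=slug)',         # repeat of a named group   -> 'repeat'
    r'(?:ab|cd)',         # non-capturing parentheses -> 'grouping'
    r'(?#a comment)',     # comment group             -> 'comment'
    r'(?=lookahead)',     # anything else             -> 'illegal'
]
for source in samples:
    match = GROUP_CLASS.match(source)
    kind = [name for name in ('positional', 'named', 'repeat',
                              'grouping', 'comment', 'illegal')
            if match.group(name)]
    print('%-22s -> %s' % (source, kind[0]))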
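
Nothing in this commit produces the return value promised by the docstring yet. Purely as a hand-worked illustration of that format, and assuming the reversed pattern would use Python string-formatting placeholders for the parameters (an assumption on my part, not something this commit specifies), a typical URL pattern might normalize to something like:

url_pattern = r'^articles/(?P<year>\d{4})/(?P<month>\d{2})/$'

# Worked out by hand from rules (1)-(5) in the docstring: one occurrence of
# each named group kept, each group reduced to a named placeholder.
# Hypothetical output -- the function in this commit does not get this far.
expected = [('articles/%(year)s/%(month)s/', 2, ['year', 'month'])]
print(expected)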
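
The first loop in normalize() is a single pass over the pattern that records positions of interest while a few boolean flags track context (escaping, character classes, pending quantifiers). A cut-down standalone sketch of that scanning idiom, tracking only escapes and group parentheses; the helper name and sample pattern are mine, not the commit's:

def find_groups(pattern):
    groups, unclosed, escaped = [], [], False
    for pos, c in enumerate(pattern):
        if escaped or c == '\\':
            # A backslash escapes the next character; a second backslash in a
            # row (an escaped backslash) switches the flag off again.
            escaped = not escaped
        elif c == '(':
            unclosed.append(pos)
        elif c == ')':
            groups.append((unclosed.pop(), pos + 1))
    return sorted(groups)

# The escaped parentheses are skipped; only the real group's span is recorded.
print(find_groups(r'\(foo\)(bar)'))   # -> [(7, 12)]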
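
The second loop only keeps the outer-most capturing groups: once the spans are sorted by start position, any group that begins before the previously kept group has ended must be nested inside it, so it is skipped via last_end. A small sketch of that filter with hypothetical spans:

groups = [(5, 10), (0, 20), (12, 18), (25, 30)]   # hypothetical (start, end) spans

outer, last_end = [], 0
for start, end in sorted(groups):
    if start < last_end:
        continue              # starts inside the group we just kept -> nested
    outer.append((start, end))
    last_end = end

print(outer)   # -> [(0, 20), (25, 30)]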
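
The bisect() calls at the end are bookkeeping over the sorted position lists built by the scan: bisecting with the start of the first group counts how many recorded items occur before that group opens. A tiny sketch with made-up positions:

from bisect import bisect

quantifiers = [3, 9, 17]   # hypothetical positions of '?', '*', '+' or '{'
first_group_start = 10     # what groups[0][0] would hold

# Two of the recorded quantifiers sit before the first group opens.
print(bisect(quantifiers, first_group_start))   # -> 2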