Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Started to work on the regex reverse-engineering phase.

git-svn-id: http://code.djangoproject.com/svn/django/trunk@7850 bcc190cf-cafb-0310-a4f2-bffc1f526a37
  • Loading branch information...
commit c2f753a8bae369e39234ea50b23c228fee3de70c 1 parent c88e55b
Malcolm Tredinnick authored

Showing 1 changed file with 123 additions and 0 deletions. Show diff stats Hide diff stats

  1. 123  django/utils/regex_helper.py
123  django/utils/regex_helper.py
... ...
@@ -0,0 +1,123 @@
  1
+"""
  2
+Functions for reversing a regular expression (used in reverse URL resolving).
  3
+
  4
+This is not, and is not intended to be, a complete reg-exp decompiler. It
  5
+should be good enough for almost all sane URLs.
  6
+"""
  7
+
  8
+import re
  9
+from bisect import bisect
  10
+
  11
+GROUP_CLASS = re.compile(r'''\((?:
  12
+        (?P<positional>[^?])|       # Unnamed (positional) capturing group.
  13
+        \?(?:
  14
+            P<(?P<named>[\w]+)>(?P<contents>.*)|    # Named capturing group.
  15
+            P=(?P<repeat>.+)|       # Repeat of a previous named group.
  16
+            (?P<grouping>:)|        # Non-capturing grouping parens.
  17
+            (?P<comment>\#)|        # Comment group
  18
+            (?P<illegal>.)          # Anything else (which will be an error)
  19
+        )
  20
+    ).*\)''', re.VERBOSE)
  21
+
  22
+def normalize(pattern):
  23
+    """
  24
+    Given a reg-exp pattern, normalizes it to a list of forms that suffice for
  25
+    reverse matching. This does the following:
  26
+
  27
+    (1) For any repeating sections, keeps the minimum number of occurrences
  28
+    permitted (this means zero for optional groups).
  29
+    (2) If an optional group includes parameters, include one occurrence of
  30
+    that group (along with the zero occurrence case from step (1)).
  31
+    (3) Select the first (essentially an arbitrary) element from any character
  32
+    class. Select an arbitrary character for any unordered class (e.g. '.' or
  33
+    '\w') in the pattern.
  34
+    (4) Take the first alternative in any '|' division, unless other
  35
+    alternatives would involve different parameters.
  36
+    (5) Ignore comments. Error on all other non-capturing (?...) forms (e.g.
  37
+    look-ahead and look-behind matches).
  38
+
  39
+    Returns a list of tuples, each tuple containing (a) a pattern, (b) the
  40
+    number of parameters, (c) the names of the parameters. Any unnamed
  41
+    parameters are called '_0', '_1', etc.
  42
+    """
  43
+    # Do a linear scan to work out the special features of this pattern. The
  44
+    # idea is that we scan once here and collect all the information we need to
  45
+    # make future decisions.
  46
+    groups = []             # (start, end)
  47
+    quantifiers = []        # start pos
  48
+    ranges = []             # (start, end)
  49
+    eols = []               # pos
  50
+    disjunctions = []       # pos
  51
+    unclosed_groups = []
  52
+    unclosed_ranges = []
  53
+    escaped = False
  54
+    quantify = False
  55
+    in_range = False
  56
+    for pos, c in enumerate(pattern):
  57
+        if in_range and c != ']' or (c == ']' and
  58
+                unclosed_ranges[-1] == pos - 1):
  59
+            continue
  60
+        elif c == '[':
  61
+            unclosed_ranges.append(pos)
  62
+        elif c == ']':
  63
+            ranges.append((unclosed_ranges.pop(), pos + 1))
  64
+            in_range = False
  65
+        elif c == '.':
  66
+            # Treat '.' as a range that is one character long:
  67
+            ranges.append((pos, pos + 1))
  68
+        elif escaped or c == '\\':
  69
+            escaped = not escaped
  70
+        elif c == '(':
  71
+            unclosed_groups.append(pos)
  72
+        elif c == ')':
  73
+            groups.append((unclosed_groups.pop(), pos + 1))
  74
+        elif quantify and c == '?':
  75
+            quantify = False
  76
+        elif c in '?*+{':
  77
+            quantifiers.append(pos)
  78
+            quantify = True
  79
+        elif c == '$':
  80
+            eols.append(pos)
  81
+        elif c == '|':
  82
+            disjunctions.append(pos)
  83
+
  84
+    # Now classify each of the parenthetical groups to work out which ones take
  85
+    # parameters. Only the outer-most of a set of nested capturing groups is
  86
+    # important.
  87
+    groups.sort()
  88
+    params = []
  89
+    comments = []
  90
+    last_end = 0
  91
+    for start, end in groups:
  92
+        if start < last_end:
  93
+            # Skip over inner nested capturing groups.
  94
+            continue
  95
+        m = GROUP_CLASS.match(pattern, start)
  96
+        if m.group('positional'):
  97
+            params.append((start, end, '_%d' % len(params), start + 1))
  98
+        elif m.group('named'):
  99
+            params.append((start, end, m.group('named'), m.start('contents')))
  100
+        elif m.group('repeat'):
  101
+            params.append((start, end, m.group('repeat'), start + 1))
  102
+        elif m.group('illegal'):
  103
+            raise ValueError('The pattern construct %r is not valid here.'
  104
+                    % pattern[start:end])
  105
+        elif m.group('comment'):
  106
+            comments.extend([start, end])
  107
+        else:
  108
+            # This is a non-capturing set, so nesting prohibitions don't apply
  109
+            # to any inner groups.
  110
+            continue
  111
+        last_end = end
  112
+
  113
+    # XXX: Work in progress: this is as far as the implementation has got; the code below is incomplete.
  114
+    results = []
  115
+    end = groups[0][0]
  116
+    # The first bit, before the first group starts.
  117
+    if end == 0:
  118
+        # FIXME: Patterns that begin with a group (nothing before it) aren't handled just yet.
  119
+        raise Exception
  120
+
  121
+    quant_end = bisect(quantifiers, end)
  122
+    range_end = bisect(ranges, end)
  123
+    dis_end = bisect(disjunctions, end)

0 notes on commit c2f753a

Please sign in to comment.
Something went wrong with that request. Please try again.