Browse files

better selection of variable names

replace non-id characters with unicode names and add hash for long strings
  • Loading branch information...
1 parent cbaddd2 commit dd95489a24fe8dce2b570b23b6c571d5135b5e08 @denik committed Aug 3, 2011
Showing with 61 additions and 3 deletions.
  1. +61 −3 Cython/Compiler/Code.py
View
64 Cython/Compiler/Code.py
@@ -344,7 +344,7 @@ def __init__(self, cname, type):
replace_identifier=object, find_alphanums=object)
possible_unicode_identifier = re.compile(ur"(?![0-9])\w+$", re.U).match
possible_bytes_identifier = re.compile(r"(?![0-9])\w+$".encode('ASCII')).match
-replace_identifier = re.compile(r'[^a-zA-Z0-9_]+').sub
+replace_identifier = re.compile(r'[^a-zA-Z0-9_]+').subn
find_alphanums = re.compile('([a-zA-Z0-9]+)').findall
class StringConst(object):
@@ -707,10 +707,47 @@ def new_int_const_cname(self, value, longness):
cname = cname.replace('-', 'neg_').replace('.','_')
return cname
- def new_const_cname(self, prefix='', value=''):
+ def new_const_cname(self, prefix='', value='', limit=32):
+ from hashlib import md5
@scoder
scoder added a note Aug 5, 2011

hashlib isn't available in Py2.4

@denik
Owner
denik added a note Aug 6, 2011

Fixed here: 005b3ee

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ from base64 import b64encode
if hasattr(value, 'decode'):
value = value.decode('ASCII', 'ignore')
- value = replace_identifier('_', value)[:32].strip('_')
+ orig_value = value
+ need_hash = False
+
+ def repl(m):
+ chars = []
+ for c in m.group():
+ chars.append(short_unicode_name(c))
+ if chars:
+ result = '_'.join(chars)
+ if m.start() > 0:
+ result = '_' + result
+ if m.end() < len(value):
+ result = result + '_'
+ return result
+ return '_'
+
+ if len(value) > limit:
+ need_hash = True
+
+ value, n = replace_identifier(repl, value[:limit])
+ if len(value) >= limit:
+ need_hash = True
+ if n:
+ value = '_' + value[:limit - 1]
+
+ if need_hash:
+ digest = b64encode(md5(orig_value).digest()).replace('+', '=').replace('/', '=').replace('=', '')
@robertwb
robertwb added a note Aug 3, 2011

hexdigest()

fixed here: 005b3ee
also simplified

@denik
Owner
denik added a note Aug 3, 2011

Yeah, I guess that would work too :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ length = len(digest)
+
+ for length in xrange(4, len(digest)):
+ if (value + '_' + digest[:length]) not in self.const_cname_counters:
+ value = value[:limit - length - 1] + '_' + digest[:length]
@robertwb
robertwb added a note Aug 3, 2011

If we need a lot of the hash, I'm OK with not shortening the value portion for readability. (Same below.) As long as the expected length is small and maximum length is not unbounded then I think that's good.

@denik
Owner
denik added a note Aug 3, 2011

OK, so what if we just get rid of the for-loop and use 4 or 5 characters of the hash? I think it will work for all cases. The counter below will stay for the exceptional case where it doesn't.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ break
+ else:
+ value = value[:limit - 1 - len(digest)] + '_' + digest
+
c = self.const_cname_counters
c[value] = c.setdefault(value, 0) + 1
if c[value] == 1:
@@ -922,6 +959,27 @@ def use_utility_code(self, utility_code):
utility_code.put_code(self)
# Abbreviations for a few common Unicode character names.  Keys are matched
# after any " SIGN" substring has been removed from the official name.
_UNICODE_NAME_SHORTCUTS = {
    'SPACE': 'SP',
    'HYPHEN-MINUS': 'HYPHEN',
    'PERCENT': 'PCNT',
}


def short_unicode_name(char, shortcut=None):
    """Return a short mnemonic for a single character.

    The mnemonic is derived from the character's Unicode name:

    * every " SIGN" substring is dropped ("PLUS SIGN" -> "PLUS");
    * the result is looked up in ``shortcut`` ("PERCENT" -> "PCNT");
    * hyphens become spaces, and a multi-word name is reduced to the
      initials of its words ("LEFT PARENTHESIS" -> "LP").

    Characters without a Unicode name fall back to their ``repr()`` with
    the string prefix, quotes and leading backslashes stripped; if nothing
    is left, ``'_'`` is returned.

    ``char`` must be a single character accepted by ``unicodedata.name``.
    ``shortcut`` is an optional mapping of names to abbreviations; ``None``
    (the default) selects the built-in table.  An explicitly passed mapping,
    including an empty one, is used as given.

    Relies on the module-level ``replace_identifier`` helper, which is
    called as ``replace_identifier(repl, string)`` and must return a
    ``(new_string, count)`` pair.
    """
    import unicodedata

    # A None sentinel instead of a dict literal in the signature, so no
    # mutable default object is shared between calls.
    if shortcut is None:
        shortcut = _UNICODE_NAME_SHORTCUTS

    name = unicodedata.name(char, '').replace(' SIGN', '')
    if name:
        name = shortcut.get(name, name)
        name = name.replace('-', ' ')
        if ' ' in name:
            # "LEFT PARENTHESIS" => "LP"
            name = ''.join(word[:1] for word in name.split(' '))
            name, _ = replace_identifier('_', name)
        return name

    # No Unicode name is available: derive something from the repr, e.g.
    # u'\n' -> "n".  lstrip('u') removes the Python 2 unicode-literal prefix.
    name = repr(char).lstrip('u').strip("'").strip('"').lstrip('\\')
    if name:
        name, _ = replace_identifier('_', name)
    else:
        name = '_'
    return name
+
+
def funccontext_property(name):
try:
import operator

0 comments on commit dd95489

Please sign in to comment.