Skip to content

Commit

Permalink
better selection of variable names
Browse files Browse the repository at this point in the history
replace non-id characters with unicode names and add hash for long strings
  • Loading branch information
denik committed Aug 2, 2011
1 parent cbaddd2 commit dd95489
Showing 1 changed file with 61 additions and 3 deletions.
64 changes: 61 additions & 3 deletions Cython/Compiler/Code.py
Expand Up @@ -344,7 +344,7 @@ def __init__(self, cname, type):
replace_identifier=object, find_alphanums=object)
# Matches a unicode string made up solely of word characters (Unicode-aware
# because of re.U) that does not begin with an ASCII digit.
possible_unicode_identifier = re.compile(ur"(?![0-9])\w+$", re.U).match
# Same check for byte strings: the pattern is encoded to ASCII bytes first.
possible_bytes_identifier = re.compile(r"(?![0-9])\w+$".encode('ASCII')).match
# NOTE(review): the next two lines are the removed and the added side of the
# diff hunk. The second assignment wins, so replace_identifier is `subn` and
# returns a (new_string, number_of_substitutions) tuple rather than a string.
# It replaces each run of characters outside [a-zA-Z0-9_].
replace_identifier = re.compile(r'[^a-zA-Z0-9_]+').sub
replace_identifier = re.compile(r'[^a-zA-Z0-9_]+').subn
# Returns the list of all runs of ASCII letters and digits found in a string.
find_alphanums = re.compile('([a-zA-Z0-9]+)').findall

class StringConst(object):
Expand Down Expand Up @@ -707,10 +707,47 @@ def new_int_const_cname(self, value, longness):
cname = cname.replace('-', 'neg_').replace('.','_')
return cname

def new_const_cname(self, prefix='', value=''):
def new_const_cname(self, prefix='', value='', limit=32):
from hashlib import md5

This comment has been minimized.

Copy link
@scoder

scoder Aug 5, 2011

hashlib isn't available in Py2.4

This comment has been minimized.

Copy link
@denik

denik Aug 6, 2011

Author Owner

Fixed here: 005b3ee

from base64 import b64encode
if hasattr(value, 'decode'):
value = value.decode('ASCII', 'ignore')
value = replace_identifier('_', value)[:32].strip('_')
orig_value = value
need_hash = False

def repl(m):
chars = []
for c in m.group():
chars.append(short_unicode_name(c))
if chars:
result = '_'.join(chars)
if m.start() > 0:
result = '_' + result
if m.end() < len(value):
result = result + '_'
return result
return '_'

if len(value) > limit:
need_hash = True

value, n = replace_identifier(repl, value[:limit])
if len(value) >= limit:
need_hash = True
if n:
value = '_' + value[:limit - 1]

if need_hash:
digest = b64encode(md5(orig_value).digest()).replace('+', '=').replace('/', '=').replace('=', '')

This comment has been minimized.

Copy link
@robertwb

robertwb Aug 3, 2011

hexdigest()

fixed here: 005b3ee
also simplified

This comment has been minimized.

Copy link
@denik

denik Aug 3, 2011

Author Owner

Yeah, I guess that would work too :)

length = len(digest)

for length in xrange(4, len(digest)):
if (value + '_' + digest[:length]) not in self.const_cname_counters:
value = value[:limit - length - 1] + '_' + digest[:length]

This comment has been minimized.

Copy link
@robertwb

robertwb Aug 3, 2011

If we need a lot of the hash, I'm OK with not shortening the value portion for readability. (Same below.) As long as the expected length is small and maximum length is not unbounded then I think that's good.

This comment has been minimized.

Copy link
@denik

denik Aug 3, 2011

Author Owner

OK, so what if we just get rid of the for-loop and use 4 or 5 characters of hash? I think it will work for all cases. Counter below will stay for the exceptional case when it won't.

break
else:
value = value[:limit - 1 - len(digest)] + '_' + digest

c = self.const_cname_counters
c[value] = c.setdefault(value, 0) + 1
if c[value] == 1:
Expand Down Expand Up @@ -922,6 +959,27 @@ def use_utility_code(self, utility_code):
utility_code.put_code(self)


def short_unicode_name(char, shortcut={'SPACE': 'SP',
                                       'HYPHEN-MINUS': 'HYPHEN',
                                       'PERCENT': 'PCNT'}):
    """Return a short, identifier-safe tag describing a single character.

    The tag is derived from the character's Unicode name with any
    " SIGN" substring removed.  Names listed in ``shortcut`` are replaced
    by their abbreviation, hyphens are treated as word separators, and a
    multi-word name is collapsed to the initials of its words
    (e.g. "LEFT PARENTHESIS" => "LP").

    Characters without a Unicode name fall back to their ``repr()`` text,
    stripped of a leading ``u`` prefix, the surrounding quotes and leading
    backslashes.  If nothing is left, ``'_'`` is returned.

    In every other case the result is passed through the module-level
    ``replace_identifier`` so that runs of characters outside
    ``[a-zA-Z0-9_]`` become a single underscore.

    ``shortcut`` is only read, never modified, so the shared default
    dictionary is safe.
    """
    import unicodedata

    unicode_name = unicodedata.name(char, '').replace(' SIGN', '')

    if not unicode_name:
        # Unnamed character (e.g. a control code): use the escaped repr text.
        escaped = repr(char).lstrip('u').strip("'").strip('"').lstrip('\\')
        if not escaped:
            return '_'
        return replace_identifier('_', escaped)[0]

    words = shortcut.get(unicode_name, unicode_name).replace('-', ' ')
    if ' ' in words:
        # "LEFT PARENTHESIS" => "LP"
        words = ''.join([part[:1] for part in words.split(' ')])
    # NOTE(review): the source lost its indentation; this assumes the
    # sanitising step applies to every named character, not only to
    # multi-word names.  The two readings agree for the default shortcuts.
    return replace_identifier('_', words)[0]


def funccontext_property(name):
try:
import operator
Expand Down

0 comments on commit dd95489

Please sign in to comment.