Permalink
Browse files

Make emitted strings only use u() when they have to

  • Loading branch information...
1 parent c23878b commit 3ff89a806c87c18cd9abcec97eea000460238037 @daviddrysdale committed Nov 27, 2011
@@ -8,30 +8,43 @@
class DumpLocale {
private static final char SINGLE_QUOTE = 39;
+ private static final char BACKSLASH = 92;
private static final char[] hexChar = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};
/* Print a Unicode name suitably escaped.
 * The name is written to System.out as a single-quoted literal; it is wrapped
 * in u('...') only when at least one character had to be written as a
 * backslash-u hex escape. */
private static void printName(String name) {
- System.out.print("u('");
- // Need to escape unicode data
+ // Need to escape Unicode data if we find it.
+ boolean seenUnicode = false; // set once any character is written as a hex escape
+ StringBuilder sb = new StringBuilder(); // body is buffered so the u() decision can be made after the loop
for (int ii=0; ii<name.length(); ii++) {
char c = name.charAt(ii);
if ((c >= 32) && (c < 127)) { // printable ASCII range
if (c == SINGLE_QUOTE) {
- System.out.print("\\'");
+ sb.append("\\'");
+ } else if (c == BACKSLASH) {
+ sb.append("\\\\");
} else {
- System.out.print(c);
+ sb.append(c);
}
} else {
- // non-ASCII
- System.out.print("\\u");
- System.out.print(hexChar[(c >> 12) & 0xF]);
- System.out.print(hexChar[(c >> 8) & 0xF]);
- System.out.print(hexChar[(c >> 4) & 0xF]);
- System.out.print(hexChar[c & 0xF]);
+ // Non-ASCII (control characters below 32 and DEL also land here). Assume nothing outside of the BMP
+ seenUnicode = true;
+ sb.append("\\u"); // NOTE(review): charAt() yields UTF-16 code units, so a supplementary character would presumably be written as two separate escapes - confirm
+ sb.append(hexChar[(c >> 12) & 0xF]); // four hex digits, most significant nibble first
+ sb.append(hexChar[(c >> 8) & 0xF]);
+ sb.append(hexChar[(c >> 4) & 0xF]);
+ sb.append(hexChar[c & 0xF]);
}
}
- System.out.print("')");
+ if (seenUnicode) { // at least one escape was written: emit u('...')
+ System.out.print("u('");
+ System.out.print(sb.toString());
+ System.out.print("')");
+ } else { // plain printable ASCII: emit a bare '...' literal
+ System.out.print("'");
+ System.out.print(sb.toString());
+ System.out.print("'");
+ }
}
private static void printProperty(String propName) {
@@ -116,7 +116,7 @@ def _stable_dict_repr(strdict):
"""Return a repr() for a dict keyed by a string, in sorted key order.

Each entry is rendered as key: value, with the value passed through rpr();
entries are joined with ", " and wrapped in braces.
"""
lines = []
for key in sorted(strdict.keys()):
- lines.append("%s: %s" % (rpr(key), rpr(strdict[key])))
+ lines.append("'%s': %s" % (key, rpr(strdict[key])))  # NOTE(review): key is placed between quotes without escaping; assumes keys never contain a single quote or backslash - confirm
return "{%s}" % ", ".join(lines)
@@ -63,7 +63,6 @@
# Boilerplate header for individual region data files
_REGION_METADATA_PROLOG = '''"""Auto-generated file, do not edit by hand. %(region)s metadata"""
-from %(module)s.util import u
from %(module)s.phonemetadata import NumberFormat, PhoneNumberDesc, PhoneMetadata
'''
@@ -67,6 +67,7 @@
else:
raise
+
def _may_fall_back_to_english(lang):
# Don't fall back to English if the requested language is among the following:
# - Chinese
@@ -131,7 +131,7 @@ def __unicode__(self):
if self.national_prefix_formatting_rule is not None:
result += u(", national_prefix_formatting_rule=%s") % rpr(self.national_prefix_formatting_rule)
if self.national_prefix_optional_when_formatting:
- result += u(", national_prefix_optional_when_formatting=%s") % rpr(self.national_prefix_optional_when_formatting)
+ result += u(", national_prefix_optional_when_formatting=%s") % str(self.national_prefix_optional_when_formatting)
if self.domestic_carrier_code_formatting_rule is not None:
result += u(", domestic_carrier_code_formatting_rule=%s") % rpr(self.domestic_carrier_code_formatting_rule)
result += u(")")
@@ -185,11 +185,11 @@ def __ne__(self, other):
def __repr__(self):  # renders the six fields as a PhoneNumber(...) keyword-argument string
return (("PhoneNumber(country_code=%s, national_number=%s, extension=%s, " +
"italian_leading_zero=%s, country_code_source=%s, preferred_domestic_carrier_code=%s)") %
- (rpr(self.country_code),
- rpr(self.national_number),
+ (self.country_code,  # formatted directly by %s, not through rpr()
+ self.national_number,
rpr(self.extension),  # presumably a string or None, so it still goes through rpr() - confirm
- rpr(self.italian_leading_zero),
- rpr(self.country_code_source),
+ self.italian_leading_zero,
+ self.country_code_source,
rpr(self.preferred_domestic_carrier_code)))  # presumably a string or None as well - confirm
def __unicode__(self):
@@ -24,12 +24,10 @@
import builtins
print3 = builtins.__dict__['print']
+ unicod = str
u = str
uchr = chr
to_long = int
- # TODO create a Py3k repr-equivalent that produces something
- # parsable in Python 2 (with the assistance of this module)
- rpr = repr
def prnt(*args, **kwargs):
sep = kwargs.get('sep', ' ')
@@ -41,6 +39,8 @@ class UnicodeMixin(object):
__str__ = lambda x: x.__unicode__()
else: # pragma no cover
+ unicod = unicode
+
import unicodedata
import re
# \N{name} = character named name in the Unicode database
@@ -59,18 +59,6 @@ def u(s):
uchr = unichr
to_long = long
- _U_SQ_RE = re.compile("^u('[^']*')")
- _U_DQ_RE = re.compile('^u("[^"]*")')
- _X_LATIN1_RE = re.compile(r"(?P<x>\\x)(?P<hexval>[0-9a-fA-Z]{2})")
- def rpr(obj):
- s = repr(obj)
- # Assume any \xYY sequences are taking advantage of Python 2's default
- # Latin-1 string encoding
- s = re.sub(_X_LATIN1_RE, '\\u00\g<hexval>', s)
- s = re.sub(_U_SQ_RE, r'u(\1)', s)
- s = re.sub(_U_DQ_RE, r'u(\1)', s)
- return s
-
def prnt(*args, **kwargs):
sep = kwargs.get('sep', ' ')
end = kwargs.get('end', '\n')
@@ -83,6 +71,39 @@ class UnicodeMixin(object): # pragma no cover
__str__ = lambda x: unicode(x).encode('utf-8')
+def rpr(s):
+ """Create a representation of a Unicode string that can be used in both
+ Python 2 and Python 3k, allowing for use of the u() function.
+
+ None yields the text 'None'. Otherwise s is walked one character at a
+ time: code points 32..126 are copied as-is, except that a single quote or
+ a backslash gets a backslash put in front of it; every other character is
+ written as a backslash-u escape with four hex digits (backslash-U with
+ eight hex digits above 0xFFFF). The text is wrapped in single quotes, and
+ additionally in u(...) if at least one such escape was written.
+ """
+ if s is None:
+ return 'None'
+ seen_unicode = False  # becomes True once any character is written as an escape
+ results = []
+ for cc in s:  # NOTE(review): presumably s is always a text string; a non-iterable value such as an int would fail here - confirm callers
+ ccn = ord(cc)
+ if ccn >= 32 and ccn < 127:  # printable ASCII
+ if cc == "'": # escape single quote
+ results.append('\\')
+ results.append(cc)
+ elif cc == "\\": # escape backslash
+ results.append('\\')
+ results.append(cc)
+ else:
+ results.append(cc)
+ else:
+ seen_unicode = True  # control characters and DEL land here too, not only non-ASCII
+ if ccn <= 0xFFFF:  # fits in four hex digits
+ results.append('\\u')
+ results.append("%04x" % ccn)
+ else:
+ results.append('\\U')
+ results.append("%08x" % ccn)
+ result = "'" + "".join(results) + "'"
+ if seen_unicode:
+ return "u(" + result + ")"
+ else:
+ return result
+
+
if __name__ == '__main__': # pragma no cover
import doctest
doctest.testmod()
@@ -574,8 +574,8 @@ def testFormatWithPreferredCarrierCode(self):
self.assertEqual('Country Code: 54 National Number: 91234125678 '
'Leading Zero: False Preferred Domestic Carrier Code: 19',
str(arNumber))
- self.assertEqual("PhoneNumber(country_code=54, national_number=91234125678%s, extension=None, "
- "italian_leading_zero=False, country_code_source=None, preferred_domestic_carrier_code='19')" % _LS,
+ self.assertEqual("PhoneNumber(country_code=54, national_number=91234125678, extension=None, "
+ "italian_leading_zero=False, country_code_source=None, preferred_domestic_carrier_code='19')",
repr(arNumber))
# When the preferred_domestic_carrier_code is present (even when it
# contains an empty string), use it instead of the default carrier
@@ -2003,7 +2003,7 @@ def testMetadataAsString(self):
metadata = PhoneMetadata.region_metadata["AU"]
self.assertEqual('\\' + 'd',
metadata.number_format[0].pattern[1:3])
- self.assertEqual(r"""NumberFormat(pattern='(\\d{4})(\\d{3})(\\d{3})', format=u('\\1 \\2 \\3'), leading_digits_pattern=['1'], national_prefix_formatting_rule=u('\\1'))""",
+ self.assertEqual(r"""NumberFormat(pattern='(\\d{4})(\\d{3})(\\d{3})', format='\\1 \\2 \\3', leading_digits_pattern=['1'], national_prefix_formatting_rule='\\1')""",
str(metadata.number_format[0]))
self.assertEqual(repr(metadata.number_format[0]),
str(metadata.number_format[0]))
@@ -2072,8 +2072,8 @@ def testMetadataAsString(self):
preferred_international_prefix='0011',
national_prefix='0',
national_prefix_for_parsing='0',
- number_format=[NumberFormat(pattern='(\\d{4})(\\d{3})(\\d{3})', format=u('\\1 \\2 \\3'), leading_digits_pattern=['1'], national_prefix_formatting_rule=u('\\1')),
- NumberFormat(pattern='(\\d{1})(\\d{4})(\\d{4})', format=u('\\1 \\2 \\3'), leading_digits_pattern=['[2-478]'], national_prefix_formatting_rule=u('0\\1'))])""",
+ number_format=[NumberFormat(pattern='(\\d{4})(\\d{3})(\\d{3})', format='\\1 \\2 \\3', leading_digits_pattern=['1'], national_prefix_formatting_rule='\\1'),
+ NumberFormat(pattern='(\\d{1})(\\d{4})(\\d{4})', format='\\1 \\2 \\3', leading_digits_pattern=['[2-478]'], national_prefix_formatting_rule='0\\1')])""",
str(metadata))
def testMetadataEval(self):

0 comments on commit 3ff89a8

Please sign in to comment.