diff --git a/.travis.yml b/.travis.yml index 1b4df81..da4481b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,3 +13,8 @@ script: - make coverage after_success: - coveralls +branches: + only: + - master + - develop + - /^v\d+\.\d+(\.\d+)?(-\S*)?$/ diff --git a/src/spines/_vendored/boltons/strutils.py b/src/spines/_vendored/boltons/strutils.py new file mode 100644 index 0000000..d98f5b1 --- /dev/null +++ b/src/spines/_vendored/boltons/strutils.py @@ -0,0 +1,1094 @@ +# -*- coding: utf-8 -*- +"""So much practical programming involves string manipulation, which +Python readily accommodates. Still, there are dozens of basic and +common capabilities missing from the standard library, several of them +provided by ``strutils``. +""" + +from __future__ import print_function + +import re +import sys +import uuid +import zlib +import string +import unicodedata +import collections +from gzip import GzipFile + +try: + from cStringIO import cStringIO as StringIO +except ImportError: + from io import BytesIO as StringIO + +try: + from collections.abc import Mapping +except ImportError: + from collections import Mapping + +try: + unicode, str, bytes, basestring = unicode, str, str, basestring + from HTMLParser import HTMLParser + import htmlentitydefs +except NameError: # basestring not defined in Python 3 + unicode, str, bytes, basestring = str, bytes, bytes, (str, bytes) + unichr = chr + from html.parser import HTMLParser + from html import entities as htmlentitydefs + + +__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws', + 'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize', + 'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi', + 'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes', + 'iter_splitlines', 'indent', 'escape_shell_args', + 'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list'] + + +_punct_ws_str = string.punctuation + string.whitespace +_punct_re = re.compile('[' + _punct_ws_str + ']+') 
+_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))') + + +def camel2under(camel_string): + """Converts a camelcased string to underscores. Useful for turning a + class name into a function name. + + >>> camel2under('BasicParseTest') + 'basic_parse_test' + """ + return _camel2under_re.sub(r'_\1', camel_string).lower() + + +def under2camel(under_string): + """Converts an underscored string to camelcased. Useful for turning a + function name into a class name. + + >>> under2camel('complex_tokenizer') + 'ComplexTokenizer' + """ + return ''.join(w.capitalize() or '_' for w in under_string.split('_')) + + +def slugify(text, delim='_', lower=True, ascii=False): + """ + A basic function that turns text full of scary characters + (i.e., punctuation and whitespace), into a relatively safe + lowercased string separated only by the delimiter specified + by *delim*, which defaults to ``_``. + + The *ascii* convenience flag will :func:`asciify` the slug if + you require ascii-only slugs. + + >>> slugify('First post! Hi!!!!~1 ') + 'first_post_hi_1' + + >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \ + b'kurt_goedel_s_pretty_cool' + True + + """ + ret = delim.join(split_punct_ws(text)) or delim if text else '' + if ascii: + ret = asciify(ret) + if lower: + ret = ret.lower() + return ret + + +def split_punct_ws(text): + """While :meth:`str.split` will split on whitespace, + :func:`split_punct_ws` will split on punctuation and + whitespace. This used internally by :func:`slugify`, above. + + >>> split_punct_ws('First post! Hi!!!!~1 ') + ['First', 'post', 'Hi', '1'] + """ + return [w for w in _punct_re.split(text) if w] + + +def unit_len(sized_iterable, unit_noun='item'): # TODO: len_units()/unitize()? + """Returns a plain-English description of an iterable's + :func:`len()`, conditionally pluralized with :func:`cardinalize`, + detailed below. 
+ + >>> print(unit_len(range(10), 'number')) + 10 numbers + >>> print(unit_len('aeiou', 'vowel')) + 5 vowels + >>> print(unit_len([], 'worry')) + No worries + """ + count = len(sized_iterable) + units = cardinalize(unit_noun, count) + if count: + return u'%s %s' % (count, units) + return u'No %s' % (units,) + + +_ORDINAL_MAP = {'1': 'st', + '2': 'nd', + '3': 'rd'} # 'th' is the default + + +def ordinalize(number, ext_only=False): + """Turns *number* into its cardinal form, i.e., 1st, 2nd, + 3rd, 4th, etc. If the last character isn't a digit, it returns the + string value unchanged. + + Args: + number (int or str): Number to be cardinalized. + ext_only (bool): Whether to return only the suffix. Default ``False``. + + >>> print(ordinalize(1)) + 1st + >>> print(ordinalize(3694839230)) + 3694839230th + >>> print(ordinalize('hi')) + hi + >>> print(ordinalize(1515)) + 1515th + """ + numstr, ext = unicode(number), '' + if numstr and numstr[-1] in string.digits: + try: + # first check for teens + if numstr[-2] == '1': + ext = 'th' + else: + # all other cases + ext = _ORDINAL_MAP.get(numstr[-1], 'th') + except IndexError: + # single digit numbers (will reach here based on [-2] above) + ext = _ORDINAL_MAP.get(numstr[-1], 'th') + if ext_only: + return ext + else: + return numstr + ext + + +def cardinalize(unit_noun, count): + """Conditionally pluralizes a singular word *unit_noun* if + *count* is not one, preserving case when possible. + + >>> vowels = 'aeiou' + >>> print(len(vowels), cardinalize('vowel', len(vowels))) + 5 vowels + >>> print(3, cardinalize('Wish', 3)) + 3 Wishes + """ + if count == 1: + return unit_noun + return pluralize(unit_noun) + + +def singularize(word): + """Semi-intelligently converts an English plural *word* to its + singular form, preserving case pattern. 
+ + >>> singularize('chances') + 'chance' + >>> singularize('Activities') + 'Activity' + >>> singularize('Glasses') + 'Glass' + >>> singularize('FEET') + 'FOOT' + + """ + orig_word, word = word, word.strip().lower() + if not word or word in _IRR_S2P: + return orig_word + + irr_singular = _IRR_P2S.get(word) + if irr_singular: + singular = irr_singular + elif not word.endswith('s'): + return orig_word + elif len(word) == 2: + singular = word[:-1] # or just return word? + elif word.endswith('ies') and word[-4:-3] not in 'aeiou': + singular = word[:-3] + 'y' + elif word.endswith('es') and word[-3] == 's': + singular = word[:-2] + else: + singular = word[:-1] + return _match_case(orig_word, singular) + + +def pluralize(word): + """Semi-intelligently converts an English *word* from singular form to + plural, preserving case pattern. + + >>> pluralize('friend') + 'friends' + >>> pluralize('enemy') + 'enemies' + >>> pluralize('Sheep') + 'Sheep' + """ + orig_word, word = word, word.strip().lower() + if not word or word in _IRR_P2S: + return orig_word + irr_plural = _IRR_S2P.get(word) + if irr_plural: + plural = irr_plural + elif word.endswith('y') and word[-2:-1] not in 'aeiou': + plural = word[:-1] + 'ies' + elif word[-1] == 's' or word.endswith('ch') or word.endswith('sh'): + plural = word if word.endswith('es') else word + 'es' + else: + plural = word + 's' + return _match_case(orig_word, plural) + + +def _match_case(master, disciple): + if not master.strip(): + return disciple + if master.lower() == master: + return disciple.lower() + elif master.upper() == master: + return disciple.upper() + elif master.title() == master: + return disciple.title() + return disciple + + +# Singular to plural map of irregular pluralizations +_IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae', + 'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae', + 'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli', + 'bacterium': 'bacteria', 'basis': 
'bases', 'beau': 'beaux', + 'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti', + 'calf': 'calves', 'child': 'children', 'corps': 'corps', + 'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria', + 'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer', + 'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves', + 'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses', + 'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata', + 'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci', + 'foot': 'feet', 'formula': 'formulas', + 'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese', + 'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami', + 'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices', + 'knife': 'knives', 'leaf': 'leaves', 'life': 'lives', + 'loaf': 'loaves', 'louse': 'lice', 'man': 'men', + 'matrix': 'matrices', 'means': 'means', 'medium': 'media', + 'memorandum': 'memoranda', 'millennium': 'millennia', 'moose': 'moose', + 'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae', + 'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases', + 'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova', + 'ox': 'oxen', 'paralysis': 'paralyses', 'parenthesis': 'parentheses', + 'person': 'people', 'phenomenon': 'phenomena', 'potato': 'potatoes', + 'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors', + 'self': 'selves', 'series': 'series', 'sheep': 'sheep', + 'shelf': 'shelves', 'species': 'species', 'stimulus': 'stimuli', + 'stratum': 'strata', 'syllabus': 'syllabi', 'symposium': 'symposia', + 'synopsis': 'synopses', 'synthesis': 'syntheses', 'tableau': 'tableaux', + 'that': 'those', 'thesis': 'theses', 'thief': 'thieves', + 'this': 'these', 'tomato': 'tomatoes', 'tooth': 'teeth', + 'torpedo': 'torpedoes', 'vertebra': 'vertebrae', 'veto': 'vetoes', + 'vita': 'vitae', 'watch': 'watches', 'wife': 'wives', + 'wolf': 'wolves', 'woman': 'women'} + + + # Reverse 
index of the above +_IRR_P2S = dict([(v, k) for k, v in _IRR_S2P.items()]) + +HASHTAG_RE = re.compile(r"(?:^|\s)[##]{1}(\w+)", re.UNICODE) + + +def find_hashtags(string): + """Finds and returns all hashtags in a string, with the hashmark + removed. Supports full-width hashmarks for Asian languages and + does not false-positive on URL anchors. + + >>> find_hashtags('#atag http://asite/#ananchor') + ['atag'] + + ``find_hashtags`` also works with unicode hashtags. + """ + + # the following works, doctest just struggles with it + # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo") + # [u'\u80af\u5fb7\u57fa'] + return HASHTAG_RE.findall(string) + + +def a10n(string): + """That thing where "internationalization" becomes "i18n", what's it + called? Abbreviation? Oh wait, no: ``a10n``. (It's actually a form + of `numeronym`_.) + + >>> a10n('abbreviation') + 'a10n' + >>> a10n('internationalization') + 'i18n' + >>> a10n('') + '' + + .. _numeronym: http://en.wikipedia.org/wiki/Numeronym + """ + if len(string) < 3: + return string + return '%s%s%s' % (string[0], len(string[1:-1]), string[-1]) + + +ANSI_ESCAPE_BEGIN = '\x1b[' +ANSI_TERMINATORS = ('H', 'f', 'A', 'B', 'C', 'D', 'R', 's', 'u', 'J', + 'K', 'h', 'l', 'p', 'm') + + +def strip_ansi(text): + """Strips ANSI escape codes from *text*. Useful for the occasional + time when a log or redirected output accidentally captures console + color codes and the like. + + >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m\xdc') + 'art' + + The test above is an excerpt from ANSI art on + `sixteencolors.net`_. This function does not interpret or render + ANSI art, but you can do so with `ansi2img`_ or `escapes.js`_. + + .. _sixteencolors.net: http://sixteencolors.net + .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img + .. 
_escapes.js: https://github.com/atdt/escapes.js + """ + # TODO: move to cliutils.py + nansi, keep, i, text_len = [], True, 0, len(text) + while i < text_len: + if not keep and text[i] in ANSI_TERMINATORS: + keep = True + elif keep: + keep_end_i = text.find(ANSI_ESCAPE_BEGIN, i) + if keep_end_i < 0: + break + else: + nansi.append(text[i:keep_end_i]) + i, keep = keep_end_i, False + i += 1 + if not nansi: + return text + return type(text)().join(nansi) # attempted unicode + str support + + +def asciify(text, ignore=False): + """Converts a unicode or bytestring, *text*, into a bytestring with + just ascii characters. Performs basic deaccenting for all you + Europhiles out there. + + Also, a gentle reminder that this is a **utility**, primarily meant + for slugification. Whenever possible, make your application work + **with** unicode, not against it. + + Args: + text (str or unicode): The string to be asciified. + ignore (bool): Configures final encoding to ignore remaining + unasciified unicode instead of replacing it. + + >>> asciify('Beyoncé') == b'Beyonce' + True + """ + try: + try: + return text.encode('ascii') + except UnicodeDecodeError: + # this usually means you passed in a non-unicode string + text = text.decode('utf-8') + return text.encode('ascii') + except UnicodeEncodeError: + mode = 'replace' + if ignore: + mode = 'ignore' + transd = unicodedata.normalize('NFKD', text.translate(DEACCENT_MAP)) + ret = transd.encode('ascii', mode) + return ret + + +def is_ascii(text): + """Check if a unicode or bytestring, *text*, is composed of ascii + characters only. Raises :exc:`ValueError` if argument is not text. + + Args: + text (str or unicode): The string to be checked. 
+ + >>> is_ascii('Beyoncé') + False + >>> is_ascii('Beyonce') + True + """ + if isinstance(text, unicode): + try: + text.encode('ascii') + except UnicodeEncodeError: + return False + elif isinstance(text, bytes): + try: + text.decode('ascii') + except UnicodeDecodeError: + return False + else: + raise ValueError('expected text or bytes, not %r' % type(text)) + return True + + +class DeaccenterDict(dict): + "A small caching dictionary for deaccenting." + def __missing__(self, key): + ch = self.get(key) + if ch is not None: + return ch + try: + de = unicodedata.decomposition(unichr(key)) + p1, _, p2 = de.rpartition(' ') + if int(p2, 16) == 0x308: + ch = self.get(key) + else: + ch = int(p1, 16) + except (IndexError, ValueError): + ch = self.get(key, key) + self[key] = ch + return ch + + try: + from collections import defaultdict + except ImportError: + # no defaultdict means that __missing__ isn't supported in + # this version of python, so we define __getitem__ + def __getitem__(self, key): + try: + return super(DeaccenterDict, self).__getitem__(key) + except KeyError: + return self.__missing__(key) + else: + del defaultdict + + +# http://chmullig.com/2009/12/python-unicode-ascii-ifier/ +# For something more complete, investigate the unidecode +# or isounidecode packages, which are capable of performing +# crude transliteration. 
+_BASE_DEACCENT_MAP = { + 0xc6: u"AE", # Æ LATIN CAPITAL LETTER AE + 0xd0: u"D", # Ð LATIN CAPITAL LETTER ETH + 0xd8: u"OE", # Ø LATIN CAPITAL LETTER O WITH STROKE + 0xde: u"Th", # Þ LATIN CAPITAL LETTER THORN + 0xc4: u'Ae', # Ä LATIN CAPITAL LETTER A WITH DIAERESIS + 0xd6: u'Oe', # Ö LATIN CAPITAL LETTER O WITH DIAERESIS + 0xdc: u'Ue', # Ü LATIN CAPITAL LETTER U WITH DIAERESIS + 0xc0: u"A", # À LATIN CAPITAL LETTER A WITH GRAVE + 0xc1: u"A", # Á LATIN CAPITAL LETTER A WITH ACUTE + 0xc3: u"A", # Ã LATIN CAPITAL LETTER A WITH TILDE + 0xc7: u"C", # Ç LATIN CAPITAL LETTER C WITH CEDILLA + 0xc8: u"E", # È LATIN CAPITAL LETTER E WITH GRAVE + 0xc9: u"E", # É LATIN CAPITAL LETTER E WITH ACUTE + 0xca: u"E", # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX + 0xcc: u"I", # Ì LATIN CAPITAL LETTER I WITH GRAVE + 0xcd: u"I", # Í LATIN CAPITAL LETTER I WITH ACUTE + 0xd2: u"O", # Ò LATIN CAPITAL LETTER O WITH GRAVE + 0xd3: u"O", # Ó LATIN CAPITAL LETTER O WITH ACUTE + 0xd5: u"O", # Õ LATIN CAPITAL LETTER O WITH TILDE + 0xd9: u"U", # Ù LATIN CAPITAL LETTER U WITH GRAVE + 0xda: u"U", # Ú LATIN CAPITAL LETTER U WITH ACUTE + 0xdf: u"ss", # ß LATIN SMALL LETTER SHARP S + 0xe6: u"ae", # æ LATIN SMALL LETTER AE + 0xf0: u"d", # ð LATIN SMALL LETTER ETH + 0xf8: u"oe", # ø LATIN SMALL LETTER O WITH STROKE + 0xfe: u"th", # þ LATIN SMALL LETTER THORN, + 0xe4: u'ae', # ä LATIN SMALL LETTER A WITH DIAERESIS + 0xf6: u'oe', # ö LATIN SMALL LETTER O WITH DIAERESIS + 0xfc: u'ue', # ü LATIN SMALL LETTER U WITH DIAERESIS + 0xe0: u"a", # à LATIN SMALL LETTER A WITH GRAVE + 0xe1: u"a", # á LATIN SMALL LETTER A WITH ACUTE + 0xe3: u"a", # ã LATIN SMALL LETTER A WITH TILDE + 0xe7: u"c", # ç LATIN SMALL LETTER C WITH CEDILLA + 0xe8: u"e", # è LATIN SMALL LETTER E WITH GRAVE + 0xe9: u"e", # é LATIN SMALL LETTER E WITH ACUTE + 0xea: u"e", # ê LATIN SMALL LETTER E WITH CIRCUMFLEX + 0xec: u"i", # ì LATIN SMALL LETTER I WITH GRAVE + 0xed: u"i", # í LATIN SMALL LETTER I WITH ACUTE + 0xf2: u"o", # ò LATIN SMALL LETTER 
O WITH GRAVE + 0xf3: u"o", # ó LATIN SMALL LETTER O WITH ACUTE + 0xf5: u"o", # õ LATIN SMALL LETTER O WITH TILDE + 0xf9: u"u", # ù LATIN SMALL LETTER U WITH GRAVE + 0xfa: u"u", # ú LATIN SMALL LETTER U WITH ACUTE + 0x2018: u"'", # ‘ LEFT SINGLE QUOTATION MARK + 0x2019: u"'", # ’ RIGHT SINGLE QUOTATION MARK + 0x201c: u'"', # “ LEFT DOUBLE QUOTATION MARK + 0x201d: u'"', # ” RIGHT DOUBLE QUOTATION MARK + } + + +DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP) + + +_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y') +_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)] +_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:])) + + +def bytes2human(nbytes, ndigits=0): + """Turns an integer value of *nbytes* into a human readable format. Set + *ndigits* to control how many digits after the decimal point + should be shown (default ``0``). + + >>> bytes2human(128991) + '126K' + >>> bytes2human(100001221) + '95M' + >>> bytes2human(0, 2) + '0.00B' + """ + abs_bytes = abs(nbytes) + for (size, symbol), (next_size, next_symbol) in _SIZE_RANGES: + if abs_bytes <= next_size: + break + hnbytes = float(nbytes) / size + return '{hnbytes:.{ndigits}f}{symbol}'.format(hnbytes=hnbytes, + ndigits=ndigits, + symbol=symbol) + + +class HTMLTextExtractor(HTMLParser): + def __init__(self): + self.reset() + self.strict = False + self.convert_charrefs = True + self.result = [] + + def handle_data(self, d): + self.result.append(d) + + def handle_charref(self, number): + if number[0] == u'x' or number[0] == u'X': + codepoint = int(number[1:], 16) + else: + codepoint = int(number) + self.result.append(unichr(codepoint)) + + def handle_entityref(self, name): + try: + codepoint = htmlentitydefs.name2codepoint[name] + except KeyError: + self.result.append(u'&' + name + u';') + else: + self.result.append(unichr(codepoint)) + + def get_text(self): + return u''.join(self.result) + + +def html2text(html): + """Strips tags from HTML text, returning markup-free text. 
Also, does + a best effort replacement of entities like " " + + >>> r = html2text(u'Test &(\u0394ημώ)') + >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)' + True + """ + # based on answers to http://stackoverflow.com/questions/753052/ + s = HTMLTextExtractor() + s.feed(html) + return s.get_text() + + +_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00' +_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00' + + +def gunzip_bytes(bytestring): + """The :mod:`gzip` module is great if you have a file or file-like + object, but what if you just have bytes. StringIO is one + possibility, but it's often faster, easier, and simpler to just + use this one-liner. Use this tried-and-true utility function to + decompress gzip from bytes. + + >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b'' + True + >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!' + True + """ + return zlib.decompress(bytestring, 16 + zlib.MAX_WBITS) + + +def gzip_bytes(bytestring, level=6): + """Turn some bytes into some compressed bytes. + + >>> len(gzip_bytes(b'a' * 10000)) + 46 + + Args: + bytestring (bytes): Bytes to be compressed + level (int): An integer, 1-9, controlling the + speed/compression. 1 is fastest, least compressed, 9 is + slowest, but most compressed. + + Note that all levels of gzip are pretty fast these days, though + it's not really a competitor in compression, at any level. + """ + out = StringIO() + f = GzipFile(fileobj=out, mode='wb', compresslevel=level) + f.write(bytestring) + f.close() + return out.getvalue() + + + +_line_ending_re = re.compile(r'(\r\n|\n|\x0b|\f|\r|\x85|\x2028|\x2029)', + re.UNICODE) + + +def iter_splitlines(text): + r"""Like :meth:`str.splitlines`, but returns an iterator of lines + instead of a list. Also similar to :meth:`file.next`, as that also + lazily reads and yields lines from a file. 
+ + This function works with a variety of line endings, but as always, + be careful when mixing line endings within a file. + + >>> list(iter_splitlines('\nhi\nbye\n')) + ['', 'hi', 'bye', ''] + >>> list(iter_splitlines('\r\nhi\rbye\r\n')) + ['', 'hi', 'bye', ''] + >>> list(iter_splitlines('')) + [] + """ + prev_end, len_text = 0, len(text) + # print('last: %r' % last_idx) + # start, end = None, None + for match in _line_ending_re.finditer(text): + start, end = match.start(1), match.end(1) + # print(start, end) + if prev_end <= start: + yield text[prev_end:start] + if end == len_text: + yield '' + prev_end = end + tail = text[prev_end:] + if tail: + yield tail + return + + +def indent(text, margin, newline='\n', key=bool): + """The missing counterpart to the built-in :func:`textwrap.dedent`. + + Args: + text (str): The text to indent. + margin (str): The string to prepend to each line. + newline (str): The newline used to rejoin the lines (default: ``\\n``) + key (callable): Called on each line to determine whether to + indent it. Default: :class:`bool`, to ensure that empty lines do + not get whitespace added. + """ + indented_lines = [(margin + line if key(line) else line) + for line in iter_splitlines(text)] + return newline.join(indented_lines) + + +def is_uuid(obj, version=4): + """Check the argument is either a valid UUID object or string. + + Args: + obj (object): The test target. Strings and UUID objects supported. + version (int): The target UUID version, set to 0 to skip version check. 
+ + >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea') + True + >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9') + False + >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1) + True + """ + if not isinstance(obj, uuid.UUID): + try: + obj = uuid.UUID(obj) + except (TypeError, ValueError, AttributeError): + return False + if version and obj.version != int(version): + return False + return True + + +def escape_shell_args(args, sep=' ', style=None): + """Returns an escaped version of each string in *args*, according to + *style*. + + Args: + args (list): A list of arguments to escape and join together + sep (str): The separator used to join the escaped arguments. + style (str): The style of escaping to use. Can be one of + ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc., + respectively. If *style* is ``None``, then it is picked + according to the system platform. + + See :func:`args2cmd` and :func:`args2sh` for details and example + output for each style. + """ + if not style: + style = 'cmd' if sys.platform == 'win32' else 'sh' + + if style == 'sh': + return args2sh(args, sep=sep) + elif style == 'cmd': + return args2cmd(args, sep=sep) + + raise ValueError("style expected one of 'cmd' or 'sh', not %r" % style) + + +_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search + + +def args2sh(args, sep=' '): + """Return a shell-escaped string version of *args*, separated by + *sep*, based on the rules of sh, bash, and other shells in the + Linux/BSD/MacOS ecosystem. + + >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd'])) + aa '[bb]' 'cc'"'"'cc' 'dd"dd' + + As you can see, arguments with no special characters are not + escaped, arguments with special characters are quoted with single + quotes, and single quotes themselves are quoted with double + quotes. Double quotes are handled like any other special + character. + + Based on code from the :mod:`pipes`/:mod:`shlex` modules. 
Also + note that :mod:`shlex` and :mod:`argparse` have functions to split + and parse strings escaped in this manner. + """ + ret_list = [] + + for arg in args: + if not arg: + ret_list.append("''") + continue + if _find_sh_unsafe(arg) is None: + ret_list.append(arg) + continue + # use single quotes, and put single quotes into double quotes + # the string $'b is then quoted as '$'"'"'b' + ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'") + + return ' '.join(ret_list) + + +def args2cmd(args, sep=' '): + r"""Return a shell-escaped string version of *args*, separated by + *sep*, using the same rules as the Microsoft C runtime. + + >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd'])) + aa [bb] cc'cc dd\"dd + + As you can see, escaping is through backslashing and not quoting, + and double quotes are the only special character. See the comment + in the code for more details. Based on internal code from the + :mod:`subprocess` module. + + """ + # technique description from subprocess below + """ + 1) Arguments are delimited by white space, which is either a + space or a tab. + + 2) A string surrounded by double quotation marks is + interpreted as a single argument, regardless of white space + contained within. A quoted string can be embedded in an + argument. + + 3) A double quotation mark preceded by a backslash is + interpreted as a literal double quotation mark. + + 4) Backslashes are interpreted literally, unless they + immediately precede a double quotation mark. + + 5) If backslashes immediately precede a double quotation mark, + every pair of backslashes is interpreted as a literal + backslash. If the number of backslashes is odd, the last + backslash escapes the next double quotation mark as + described in rule 3. 
+ + See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx + or search http://msdn.microsoft.com for + "Parsing C++ Command-Line Arguments" + """ + result = [] + needquote = False + for arg in args: + bs_buf = [] + + # Add a space to separate this argument from the others + if result: + result.append(' ') + + needquote = (" " in arg) or ("\t" in arg) or not arg + if needquote: + result.append('"') + + for c in arg: + if c == '\\': + # Don't know if we need to double yet. + bs_buf.append(c) + elif c == '"': + # Double backslashes. + result.append('\\' * len(bs_buf)*2) + bs_buf = [] + result.append('\\"') + else: + # Normal char + if bs_buf: + result.extend(bs_buf) + bs_buf = [] + result.append(c) + + # Add remaining backslashes, if any. + if bs_buf: + result.extend(bs_buf) + + if needquote: + result.extend(bs_buf) + result.append('"') + + return ''.join(result) + + +def parse_int_list(range_string, delim=',', range_delim='-'): + """Returns a sorted list of positive integers based on + *range_string*. Reverse of :func:`format_int_list`. + + Args: + range_string (str): String of comma separated positive + integers or ranges (e.g. '1,2,4-6,8'). Typical of a custom + page range string used in printer dialogs. + delim (char): Defaults to ','. Separates integers and + contiguous ranges of integers. + range_delim (char): Defaults to '-'. Indicates a contiguous + range of integers. + + >>> parse_int_list('1,3,5-8,10-11,15') + [1, 3, 5, 6, 7, 8, 10, 11, 15] + + """ + output = [] + + for x in range_string.strip().split(delim): + + # Range + if range_delim in x: + range_limits = list(map(int, x.split(range_delim))) + output += list(range(min(range_limits), max(range_limits)+1)) + + # Empty String + elif not x: + continue + + # Integer + else: + output.append(int(x)) + + return sorted(output) + + +def format_int_list(int_list, delim=',', range_delim='-', delim_space=False): + """Returns a sorted range string from a list of positive integers + (*int_list*). 
Contiguous ranges of integers are collapsed to min + and max values. Reverse of :func:`parse_int_list`. + + Args: + int_list (list): List of positive integers to be converted + into a range string (e.g. [1,2,4,5,6,8]). + delim (char): Defaults to ','. Separates integers and + contiguous ranges of integers. + range_delim (char): Defaults to '-'. Indicates a contiguous + range of integers. + delim_space (bool): Defaults to ``False``. If ``True``, adds a + space after all *delim* characters. + + >>> format_int_list([1,3,5,6,7,8,10,11,15]) + '1,3,5-8,10-11,15' + + """ + output = [] + contig_range = collections.deque() + + for x in sorted(int_list): + + # Handle current (and first) value. + if len(contig_range) < 1: + contig_range.append(x) + + # Handle current value, given multiple previous values are contiguous. + elif len(contig_range) > 1: + delta = x - contig_range[-1] + + # Current value is contiguous. + if delta == 1: + contig_range.append(x) + + # Current value is non-contiguous. + elif delta > 1: + range_substr = '{0:d}{1}{2:d}'.format(min(contig_range), + range_delim, + max(contig_range)) + output.append(range_substr) + contig_range.clear() + contig_range.append(x) + + # Current value repeated. + else: + continue + + # Handle current value, given no previous contiguous integers + else: + delta = x - contig_range[0] + + # Current value is contiguous. + if delta == 1: + contig_range.append(x) + + # Current value is non-contiguous. + elif delta > 1: + output.append('{0:d}'.format(contig_range.popleft())) + contig_range.append(x) + + # Current value repeated. + else: + continue + + # Handle the last value. + else: + + # Last value is non-contiguous. + if len(contig_range) == 1: + output.append('{0:d}'.format(contig_range.popleft())) + contig_range.clear() + + # Last value is part of contiguous range. 
+ elif len(contig_range) > 1: + range_substr = '{0:d}{1}{2:d}'.format(min(contig_range), + range_delim, + max(contig_range)) + output.append(range_substr) + contig_range.clear() + + if delim_space: + output_str = (delim+' ').join(output) + else: + output_str = delim.join(output) + + return output_str + + +class MultiReplace(object): + """ + MultiReplace is a tool for doing multiple find/replace actions in one pass. + + Given a mapping of values to be replaced it allows for all of the matching + values to be replaced in a single pass which can save a lot of performance + on very large strings. In addition to simple replace, it also allows for + replacing based on regular expressions. + + Keyword Arguments: + + :type regex: bool + :param regex: Treat search keys as regular expressions [Default: False] + :type flags: int + :param flags: flags to pass to the regex engine during compile + + Dictionary Usage:: + + from lrmslib import stringutils + s = stringutils.MultiReplace({ + 'foo': 'zoo', + 'cat': 'hat', + 'bat': 'kraken' + }) + new = s.sub('The foo bar cat ate a bat') + new == 'The zoo bar hat ate a kraken' + + Iterable Usage:: + + from lrmslib import stringutils + s = stringutils.MultiReplace([ + ('foo', 'zoo'), + ('cat', 'hat'), + ('bat', 'kraken)' + ]) + new = s.sub('The foo bar cat ate a bat') + new == 'The zoo bar hat ate a kraken' + + + The constructor can be passed a dictionary or other mapping as well as + an iterable of tuples. If given an iterable, the substitution will be run + in the order the replacement values are specified in the iterable. This is + also true if it is given an OrderedDict. 
If given a dictionary then the + order will be non-deterministic:: + + >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar') + 'bar bar bar' + >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'}) + >>> m.sub('foo bar baz') + 'baz bar bar' + + This is because the order of replacement can matter if you're inserting + something that might be replaced by a later substitution. Pay attention and + if you need to rely on order then consider using a list of tuples instead + of a dictionary. + """ + + def __init__(self, sub_map, **kwargs): + """Compile any regular expressions that have been passed.""" + options = { + 'regex': False, + 'flags': 0, + } + options.update(kwargs) + self.group_map = {} + regex_values = [] + + if isinstance(sub_map, Mapping): + sub_map = sub_map.items() + + for idx, vals in enumerate(sub_map): + group_name = 'group{0}'.format(idx) + if isinstance(vals[0], basestring): + # If we're not treating input strings like a regex, escape it + if not options['regex']: + exp = re.escape(vals[0]) + else: + exp = vals[0] + else: + exp = vals[0].pattern + + regex_values.append('(?P<{0}>{1})'.format( + group_name, + exp + )) + self.group_map[group_name] = vals[1] + + self.combined_pattern = re.compile( + '|'.join(regex_values), + flags=options['flags'] + ) + + def _get_value(self, match): + """Given a match object find replacement value.""" + group_dict = match.groupdict() + key = [x for x in group_dict if group_dict[x]][0] + return self.group_map[key] + + def sub(self, text): + """ + Run substitutions on the input text. + + Given an input string, run all substitutions given in the + constructor. 
+ """ + return self.combined_pattern.sub(self._get_value, text) + + +def multi_replace(text, sub_map, **kwargs): + """Shortcut function to invoke multi-replace in a single command.""" + m = MultiReplace(sub_map, **kwargs) + return m.sub(text) diff --git a/src/spines/utils/string.py b/src/spines/utils/string.py index 7b07502..0598f54 100644 --- a/src/spines/utils/string.py +++ b/src/spines/utils/string.py @@ -15,6 +15,13 @@ from .._vendored import autopep8 as _v_autopep8 +# +# Constants +# + +INDENT_TAB = ' ' + + # # Functions # diff --git a/src/spines/versioning/base.py b/src/spines/versioning/base.py index f883bb5..71c849b 100644 --- a/src/spines/versioning/base.py +++ b/src/spines/versioning/base.py @@ -85,3 +85,7 @@ def _get_hash(self, obj) -> [bytes, None]: def _get_bytes(self, obj) -> bytes: """Gets the relevant bytes for a single object""" pass + + def _get_source(self, obj) -> str: + """Gets the source code for the given object""" + return diff --git a/src/spines/versioning/core.py b/src/spines/versioning/core.py index 80899e2..a5e3a7f 100644 --- a/src/spines/versioning/core.py +++ b/src/spines/versioning/core.py @@ -5,9 +5,13 @@ # # Imports # +from collections import OrderedDict +from textwrap import indent from types import FunctionType from typing import Any +from typing import Callable from typing import Dict +from typing import Mapping from typing import Tuple from typing import Type @@ -17,11 +21,16 @@ T_OrderedDict = Dict from ..utils.function import get_hash_bytes +from ..utils.function import get_source as get_source_fn from ..utils.function import get_specification +from ..utils.object import get_doc_string from ..utils.object import get_new_functions from ..utils.object import get_new_properties from ..utils.object import get_overridden_functions from ..utils.object import get_overridden_properties +from ..utils.property import get_source as get_source_prop +from ..utils.string import INDENT_TAB +from ..utils.string import format_code_style from 
.base import Signature @@ -100,6 +109,10 @@ def _get_bytes(self, obj: FunctionType) -> bytes: """Gets the bytes to hash for a Function""" return get_hash_bytes(obj) + def _get_source(self, obj: FunctionType) -> str: + """Gets the source code for the function given""" + return get_source_fn(obj) + class PropertySignature(Signature): """ @@ -151,6 +164,10 @@ def _get_bytes(self, obj: [type, object]) -> bytes: ret += self._fdel.hash_bytes return ret + def _get_source(self, obj: property) -> str: + """Gets the source code for the property given""" + return get_source_prop(obj) + class ClassSignature(Signature): """ @@ -194,6 +211,53 @@ def _get_bytes(self, obj: Type[__base_cls__]) -> bytes: ret += t_bytes return ret + def _get_source(self, obj) -> str: + """Gets the source code for the given class""" + if not isinstance(obj, type): + obj = obj.__class__ + + docstr = get_doc_string(obj) + prop_source = self._get_signature_dict_sources( + obj, self._properties, get_source_prop + ) + func_source = self._get_signature_dict_sources( + obj, self._functions, get_source_fn + ) + + ret = "class {class_name}({base_classes}):\n".format( + class_name=obj.__name__, + base_classes=', '.join([x.__name__ for x in obj.__bases__]) + ) + if docstr: + ret += indent('"""%s"""\n' % docstr, INDENT_TAB) + ret += '\n' + if '__init__' in func_source.keys(): + ret += indent(func_source.pop('__init__') + '\n', INDENT_TAB) + for prop, src in prop_source.items(): + ret += indent(src + '\n', INDENT_TAB) + for fn, src in func_source.items(): + ret += indent(src + '\n', INDENT_TAB) + + return format_code_style(ret) + + @classmethod + def _get_signature_dict_sources( + cls, + obj, + signature_dict: Mapping[str, Type[Signature]], + source_func: Callable + ) -> T_OrderedDict[str, str]: + """Helper to get sources from a dictionary of Signatures""" + ret = OrderedDict() + for k in sorted(signature_dict.keys()): + v = signature_dict[k] + v_src = getattr(v, 'source', None) + if v_src: + ret[k] = v_src + 
else: + ret[k] = source_func(obj.__dict__[k]) + return ret + @classmethod def _get_functions(cls, obj: type) -> Dict[str, Type[FunctionSignature]]: """Gets the relevant functions and their signatures""" diff --git a/src/spines/versioning/mixins.py b/src/spines/versioning/mixins.py index 1f6fe2b..26955d7 100644 --- a/src/spines/versioning/mixins.py +++ b/src/spines/versioning/mixins.py @@ -8,21 +8,14 @@ from abc import ABC from abc import abstractmethod from hashlib import blake2s -from textwrap import indent from typing import Sequence from typing import Type import parver from .. import __version__ -from ..utils.function import get_source from ..utils.object import get_doc_string -from ..utils.string import format_code_style from ..utils.string import slugify -from ..utils.property import get_source as get_source_prop -from .core import ClassSignature -from .core import FunctionSignature -from .core import PropertySignature # @@ -126,9 +119,10 @@ def dependencies(self) -> Sequence[str]: """:obj:`tuple` of :obj:`str`: Dependencies of the object""" return self._dependencies + @abstractmethod def _get_dependencies(self, obj: [type, object]) -> Sequence[str]: """Gets the dependencies of the signed object""" - return tuple() + pass class SourceMixin(ABC): @@ -145,58 +139,3 @@ def __init__(self, obj: [type, object]) -> None: def source(self) -> [str, None]: """:obj:`str`: Source code for the signature's object.""" return self._source - - def _get_source(self, obj: [object, type]) -> [str, None]: - """Gets the source code for the given object""" - ret = None - if any(issubclass(x, FunctionSignature) - for x in self.__class__.__bases__): - ret = get_source(obj) - elif any(issubclass(x, PropertySignature) - for x in self.__class__.__bases__): - ret = get_source_prop(obj) - elif any(issubclass(x, ClassSignature) - for x in self.__class__.__bases__): - if not isinstance(obj, type): - obj = obj.__class__ - - ret = "class {class_name}({base_classes}):\n".format( - 
class_name=obj.__name__, - base_classes=', '.join([x.__name__ for x in obj.__bases__]) - ) - docstr = get_doc_string(obj) - if docstr: - ret += indent('"""%s"""\n' % docstr, ' ') - ret += '\n' - - fn_source = {} - for k, v in self._functions.items(): - if hasattr(v, 'source'): - fn_source[k] = v.source - else: - fn_source[k] = get_source(getattr(obj, k)) - - prop_source = {} - for k, v in self._properties.items(): - if hasattr(v, 'source'): - prop_source[k] = v.source - else: - prop_source[k] = get_source_prop(obj.__dict__[k]) - - if '__init__' in fn_source.keys(): - ret += indent(fn_source['__init__'] + '\n', ' ') - - prop_ordering = sorted(x for x in prop_source.keys()) - for prop in prop_ordering: - src = prop_source[prop] - ret += indent(src + '\n', ' ') - - fn_ordering = sorted(x for x in fn_source.keys() - if x != '__init___') - for fn in fn_ordering: - src = fn_source[fn] - ret += indent(src + '\n', ' ') - - ret = format_code_style(ret) - - return ret